Example #1
0
def extract_pharmcat_pgx_regions(tabix_executable_path, input_vcf, output_dir,
                                 input_ref_pgx_vcf):
    '''
    Extract the PGx regions listed in input_ref_pgx_vcf from input_vcf.

    The matching variants are written, with 'chr'-prefixed chromosome names,
    to '<output_dir>/<prefix of input_vcf>.pgx_regions.vcf.gz', and that file
    is then handed to tabix_index_vcf.

    :param tabix_executable_path: forwarded unchanged to tabix_index_vcf
    :param input_vcf: path of the VCF to extract variants from; it is queried
        by region, so it presumably has to be bgzipped and indexed (confirm)
    :param output_dir: directory in which the output VCF is created
    :param input_ref_pgx_vcf: path of the reference PGx position VCF
    :return: path of the output VCF
    '''

    print(
        'Modify chromosome names.\nExtract PGx regions based on the input reference PGx position file.'
    )
    path_output = os.path.join(
        output_dir,
        obtain_vcf_file_prefix(input_vcf) + '.pgx_regions.vcf.gz')

    input_vcf_cyvcf2 = VCF(input_vcf)
    try:
        # get pgx regions in each chromosome; the reference file is only read
        # through allel, so no cyvcf2 handle is opened on it
        input_ref_pgx_pos_pandas = allel.vcf_to_dataframe(input_ref_pgx_vcf)
        # NOTE: the int cast only accepts numeric chromosome names; a
        # reference contig such as 'chrX' would raise here.
        input_ref_pgx_pos_pandas['CHROM'] = input_ref_pgx_pos_pandas[
            'CHROM'].replace({
                'chr': ''
            }, regex=True).astype(str).astype(int)
        ref_pgx_regions = input_ref_pgx_pos_pandas.groupby(
            ['CHROM'])['POS'].agg(get_vcf_pos_min_max).reset_index()
        # one '<CHROM>:<aggregated POS>' query string per chromosome (the
        # right-hand part is whatever get_vcf_pos_min_max returns)
        ref_pgx_regions = ref_pgx_regions.apply(
            lambda row: ':'.join(row.values.astype(str)), axis=1)

        # fix chr names
        if any(name.startswith('chr') for name in input_vcf_cyvcf2.seqnames):
            # chromosomes have leading 'chr' characters in the original VCF,
            # so the query regions need the same prefix
            ref_pgx_regions = ref_pgx_regions.replace({'^': 'chr'}, regex=True)
        else:
            # chromosomes do not have leading 'chr' characters in the original
            # VCF: add 'chr'-prefixed contig names to the header so that the
            # renamed records written below are declared in the output
            for single_chr in input_vcf_cyvcf2.seqnames:
                input_vcf_cyvcf2.add_to_header('##contig=<ID=chr' +
                                               single_chr + '>')

        # write to a VCF output file; the header is taken from the input VCF
        numeric_chrom = re.compile(r'^([0-9]+)')
        output_vcf_cyvcf2 = Writer(path_output, input_vcf_cyvcf2, mode="wz")
        try:
            for single_region in ref_pgx_regions:
                for single_variant in input_vcf_cyvcf2(single_region):
                    # names already starting with 'chr' are left untouched
                    single_variant.CHROM = numeric_chrom.sub(
                        r'chr\1', single_variant.CHROM)
                    output_vcf_cyvcf2.write_record(single_variant)
        finally:
            # close the writer even if a query or write fails, so the
            # handle is not leaked
            output_vcf_cyvcf2.close()
    finally:
        input_vcf_cyvcf2.close()

    tabix_index_vcf(tabix_executable_path, path_output)

    return path_output
Example #2
0
def annotate(vcf, roh, bed, v14, quality_threshold, flag_upd_at_fraction,
             output, verbose):
    """Markup VCF file using rho-calls. Use BED file to mark all variants in AZ 
    windows. Alternatively, use a bcftools v>=1.4 file with RG entries to mark 
    all vars. With the --no-v14 flag, use an older bcftools v<=1.2 style roh TSV
    to mark only selected AZ variants. Roh is broken in bcftools v1.3 
    - do not use."""
    # NOTE(review): the click.echo calls below suggest this is a click command,
    # in which case the docstring above doubles as the --help text; it is
    # therefore left verbatim and the parameters are described here instead.
    #
    # vcf                  -- VCF to annotate; opened with VCF(vcf)
    # roh                  -- roh file, passed to run_annotate_var (v14 falsy)
    #                         or run_annotate_rg (v14 truthy)
    # bed                  -- BED file, passed to run_annotate
    # v14                  -- selects between the two roh handlers
    # quality_threshold,
    # flag_upd_at_fraction,
    # output               -- forwarded unchanged to the selected handler
    # verbose              -- verbosity count, capped at 3 before the LEVELS
    #                         lookup
    loglevel = LEVELS.get(min(verbose, 3))
    configure_stream(level=loglevel)

    proband_vcf = VCF(vcf)

    ## This is for logging the command line string ##
    # Only the declared parameters (args) are recorded. Iterating over every
    # frame local would also write loglevel and the repr of the opened VCF
    # object into the header. Falsy values (None, False, 0, '') are skipped.
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        name + '=' + str(values[name]) for name in args if values[name]
    ]

    logger.info("Running rhocall annotate  {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    ## add additional tags to VCF header
    proband_vcf.add_to_header('##rhocall_version={0}'.format(__version__))
    proband_vcf.add_to_header("##rhocall_arguments={0}".format(
        ', '.join(argument_list)))

    if roh and not bed and not v14:
        run_annotate_var(proband_vcf=proband_vcf,
                         roh=roh,
                         quality_threshold=quality_threshold,
                         flag_upd_at_fraction=flag_upd_at_fraction,
                         output=output)
    elif roh and v14 and not bed:
        run_annotate_rg(proband_vcf=proband_vcf,
                        bcfroh=roh,
                        quality_threshold=quality_threshold,
                        flag_upd_at_fraction=flag_upd_at_fraction,
                        output=output)
    elif bed and not roh:
        run_annotate(proband_vcf=proband_vcf,
                     bed=bed,
                     quality_threshold=quality_threshold,
                     flag_upd_at_fraction=flag_upd_at_fraction,
                     output=output)
    elif bed and roh:
        click.echo("""Cannot use both BED and ROH at once. Please apply 
                    them sequentially instead.""")
    else:
        # Neither input was supplied; the "both at once" message would be
        # misleading here.
        click.echo("Nothing to annotate with. Please supply either a BED "
                   "or a ROH file.")
Example #3
0
    raise ValueError(f"{fai} does not exist.")

# Minimal VCF text: three meta lines, the column header (whose last column is
# the sample name) and a single placeholder record, each newline-terminated.
TEMPLATE_VCF = "\n".join(
    [
        "##fileformat=VCFv4.2",
        '##FILTER=<ID=PASS,Description="All filters passed">',
        "##fileDate=" + date.today().strftime("%Y%m%d"),
        "\t".join(
            [
                "#CHROM",
                "POS",
                "ID",
                "REF",
                "ALT",
                "QUAL",
                "FILTER",
                "INFO",
                "FORMAT",
                f"{sample}",
            ]
        ),
        "\t".join(["chr1", "1", ".", "A", "T", ".", "PASS", ".", "GT", "0|0"]),
        "",  # yields the trailing newline after the record line
    ]
)

# The template is written under joboutdir and immediately re-opened with VCF
# so that header entries can be appended to it.
TEMPLATE_VCF_FILE = joboutdir / "template.vcf"
TEMPLATE_VCF_FILE.write_text(TEMPLATE_VCF)
vcf = VCF(TEMPLATE_VCF_FILE)

# Record where the file comes from.
vcf.add_to_header("##source=biopipen.ns.bed.Bed2Vcf")

# The reference line is only emitted when a genome value was given.
if genome:
    vcf.add_to_header("##reference={}".format(genome))

# Declare the END INFO field.
vcf.add_info_to_header(
    dict(
        ID="END",
        Number="1",
        Type="Integer",
        Description="End position of the variant described in this record",
    )
)

vcf.add_format_to_header(
# Report how many genes are held in `genes` (built before this point).
logger.success("Extracted {} genes from the panel".format(len(genes)))

# Build the interval collection for those genes from the annotation file.
logger.info("Extracting intervals for genes from GFF...")
with open(snakemake.input.annotation) as istream:
    ivtree = extract_intervals_for_genes_from_gff(genes, istream, padding)
logger.success("Intervals extracted for {} genes".format(len(ivtree)))

logger.info(
    "Extracting those VCF records that fall within the gene intervals "
    "and altering their CHROM and POS accordingly..."
)
vcf_reader = VCF(snakemake.input.vcf)

# Add one contig header line per interval: the ID is the first element of
# the interval's data and the length is the interval's span (end - begin).
logger.debug("Adding genes to header...")
for iv in ivtree:
    vcf_reader.add_to_header(
        "##contig=<ID={},length={}>".format(iv.data[0], iv.end - iv.begin)
    )
logger.debug("Genes added to header")

with TemporaryDirectory() as tmpdirname:
    tmpvcf = str(Path(tmpdirname) / "tmp.vcf")
    vcf_writer = Writer(tmpvcf, tmpl=vcf_reader)

    for record in vcf_reader:
        if apply_filters and record.FILTER is not None:
            continue

        gt = Genotype.from_arr(record.genotypes[0])
        if only_alt and not gt.is_hom_alt():
            continue

        ivs = ivtree[record.start]