Beispiel #1
0
def _af_filter(data, in_file, out_file):
    """Soft-filter variants with AF below min_allele_fraction (appends "MinAF" to FILTER).

    The threshold is read from ``data["config"]`` under
    ``("algorithm", "min_allele_fraction")`` as a percentage (10 when absent)
    and converted to a fraction. A record is tagged when every FORMAT/AF value
    of the sample named ``data['description']`` is below that fraction. All
    records are written, tagged or not.

    Nothing is rewritten when the uncompressed output (``out_file`` with its
    extension replaced by ``.vcf``) or its ``.gz`` sibling already exists.

    Returns whatever ``vcfutils.bgzip_and_index`` returns for the
    uncompressed output path.
    """
    min_freq = float(
        utils.get_in(data["config"],
                     ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.info(
        "Filtering MuTect2 calls with allele fraction threshold of %s" %
        min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(
            ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_filter_to_header({
                'ID':
                'MinAF',
                'Description':
                'Allele frequency is lower than %s%% ' % (min_freq * 100) +
                ('(configured in bcbio as min_allele_fraction)'
                 if utils.get_in(data["config"],
                                 ("algorithm", "min_allele_fraction")) else
                 '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)'
                 )
            })
            w = Writer(tx_out_file, vcf)
            try:
                # A header without any sample column cannot be indexed by
                # sample name; skip the AF test in that case instead of
                # failing on the lookup. A header that has samples but not
                # the expected one still raises ValueError.
                samples = vcf.samples
                if samples:
                    tumor_index = samples.index(data['description'])
                else:
                    tumor_index = None
                for rec in vcf:
                    if tumor_index is not None and np.all(
                            rec.format('AF')[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                    w.write_record(rec)
            finally:
                # Close the writer even when a record fails to process.
                w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def mark_missing_sites(vcffile, region, missing_threshold, soft_filter):
    """Annotate sites of `region` with their missingness and stream them to stdout.

    A MISSING filter line and a MISSINGPCT info line are added to the header.
    Each site is passed through compute_missingness, variant_missing_criteria
    and update_variant. Sites whose verdict is "pass" are always written;
    sites whose verdict is "fail" are written only when `soft_filter` is
    truthy. A summary line is printed to stderr at the end.
    """
    reader = VCF(vcffile)
    missing_filter_header = {
        'ID': 'MISSING',
        'Description':
        'failed variant site missingness threshold ({} %)'.format(
            missing_threshold),
    }
    missing_pct_header = {
        'ID': 'MISSINGPCT',
        'Description': 'site missingness percentage',
        'Type': 'Float',
        'Number': '1',
    }
    reader.add_filter_to_header(missing_filter_header)
    reader.add_info_to_header(missing_pct_header)

    # '-' is the output target handed to the writer
    writer = Writer('-', reader)
    seen = 0
    passed = 0

    for site in reader(region):
        seen += 1
        missing_pct, _missing, _total = compute_missingness(site)
        verdict = variant_missing_criteria(missing_threshold, missing_pct)
        site = update_variant(site, verdict, missing_pct)

        is_pass = verdict == "pass"
        if is_pass:
            passed += 1
        if is_pass or (verdict == "fail" and soft_filter):
            writer.write_record(site)

    writer.close()
    summary = "After filtering, passed {} out of a possible {} Sites ({})"
    print(summary.format(passed, seen, 'pass'), file=sys.stderr)
def augment_vcf(vcf_in_file, vcf_out_file, bed_files, decimals):
    """Parse a VCF file, augment every record and write the result.

    The input header is extended by ``modify_header``; the BED files are
    parsed once by ``parse_bed_files``. When the input has at least one
    sample, each record is passed through ``add_fields_to_variant`` before
    being written; otherwise records are copied unchanged.
    """
    # Read in the input VCF file
    vcf_in = VCF(vcf_in_file)

    # Add rows to the header for each new field
    vcf_in = modify_header(vcf_in, bed_files)

    # Set up a writer based on the tweaked input VCF file
    vcf_out = Writer(vcf_out_file, vcf_in)

    try:
        # Parse BED files
        bed = parse_bed_files(bed_files)

        # The sample list belongs to the file, not to a record, so the
        # check is done once instead of once per record.
        has_samples = len(vcf_in.samples) > 0

        # Iterate over every variant record
        for variant in vcf_in:
            if has_samples:
                variant = add_fields_to_variant(variant, bed, decimals)
            # Output the (possibly augmented) variant
            vcf_out.write_record(variant)
    finally:
        # Close input and output VCF files, also when a record fails
        vcf_in.close()
        vcf_out.close()
Beispiel #4
0
def writeVCF(vcf, inIDs, out):
    """Copy the records of `vcf` whose "<ID>:<allele>" key is listed in `inIDs`.

    For every record on a chromosome whose name parses as an integer, two
    keys are built: ``ID:REF`` and ``ID:`` followed by the concatenated ALT
    alleles. If one of them is in `inIDs` (REF key checked first), the
    record's ID is replaced by that key and the record is written to `out`.
    A key is matched at most once; reading stops as soon as every requested
    key has been found.

    `inIDs` itself is not modified.
    """
    vcf_in = VCF(vcf)
    vcf_out = Writer(out, vcf_in)
    # A set gives constant-time membership tests and exact-match removal.
    # The previous list filter used `inID not in id1`, which is a substring
    # test on the key string and could drop unrelated IDs.
    remaining = set(inIDs)
    try:
        for rec in vcf_in:
            # Skip chromosomes whose name is not an integer
            try:
                int(rec.CHROM)
            except ValueError:
                continue
            id1 = str(rec.ID) + ":" + str(rec.REF)
            id2 = str(rec.ID) + ":" + ''.join(rec.ALT)
            if id1 in remaining:
                rec.ID = id1
                vcf_out.write_record(rec)
                remaining.discard(id1)
            elif id2 in remaining:
                rec.ID = id2
                vcf_out.write_record(rec)
                remaining.discard(id2)
            if not remaining:
                break
    finally:
        # Close explicitly so the output is finalized on every exit path
        vcf_out.close()
        vcf_in.close()
Beispiel #5
0
def merge(in_vcf, cadd_tsv):
    """Annotate `in_vcf` with CADD data from `cadd_tsv` and stream it to '-'.

    The info header lines from annotation_info_headers() are added to the
    reader, every record is passed through update_variant together with the
    CADD annotations, and the keys returned for the records are collected.
    Afterwards ensure_cadd_completed_successfully is called with the
    collected keys and the frozen set of annotation keys.
    """
    extra_info_headers = annotation_info_headers()

    log("Collecting the CADD annotation information")
    annotations = create_CADD_annotation_dictionary(cadd_tsv)

    log("Processing the build37 vcf")
    reader = VCF(in_vcf)
    for header_line in extra_info_headers:
        reader.add_info_to_header(header_line)

    writer = Writer('-', reader)

    seen_keys = set()
    for record in reader:
        record, record_key = update_variant(record, annotations)
        seen_keys.add(record_key)
        writer.write_record(record)
    writer.close()

    log("Checking whether CADD completed correctly")
    annotation_keys = frozenset(list(annotations.keys()))
    ensure_cadd_completed_successfully(in_vcf, cadd_tsv, seen_keys,
                                       annotation_keys)

    log("All Done!")
Beispiel #6
0
def main(fname_in, fname_out, ambiguous_base_coverage_threshold):
    """Rewrite each record's genotype from the per-allele INFO/AD values.

    ambiguous_base_coverage_threshold:
        minimum share of the summed AD values an allele needs for its index
        to be included in the genotype written for the record
    """
    reader = VCF(fname_in)
    writer = Writer(fname_out, reader)

    for record in reader:
        alleles = [record.REF] + record.ALT
        depths = record.INFO.get("AD")

        depth_sum = sum(depths)
        assert len(alleles) == len(depths)

        # index 0 is the reference allele; the allele string itself is not used
        selected = []
        for allele_index, (_allele, depth) in enumerate(zip(alleles, depths)):
            if depth / depth_sum >= ambiguous_base_coverage_threshold:
                selected.append(allele_index)

        # one genotype entry: the selected allele indices followed by False
        record.genotypes = [selected + [False]]

        writer.write_record(record)

    writer.close()
    reader.close()
Beispiel #7
0
def filter_vcf(vcf,
               output,
               minlength=0,
               truncate_svlen=float("inf"),
               suffix=""):
    """Drop short structural variants and truncate over-long ones.

    Records whose length (as returned by ``get_svlen``) is below `minlength`
    are skipped. Records longer than `truncate_svlen` are rewritten with
    SVLEN=1, END=start+1 and the TRUNCATED flag set before being written.
    When `output` is falsy, the output path is `vcf` with ".vcf" replaced by
    "_<suffix>.vcf". Counts of truncated and filtered records are reported
    on stderr when non-zero.
    """
    vcf_in = VCF(vcf)
    if not output:
        output = vcf.replace(".vcf", "_{}.vcf".format(suffix))
    vcf_in.add_info_to_header({
        'ID': 'TRUNCATED',
        'Description': "SVLEN truncated",
        'Type': 'Flag',
        'Number': '0'
    })
    vcf_out = Writer(output, vcf_in)
    records_truncated = 0
    records_filtered = 0
    try:
        for v in vcf_in:
            svlen = get_svlen(v)
            if svlen < minlength:
                records_filtered += 1
                continue
            if svlen > truncate_svlen:
                v.INFO['SVLEN'] = 1
                v.INFO['END'] = v.start + 1
                v.INFO['TRUNCATED'] = True
                records_truncated += 1
            vcf_out.write_record(v)
    finally:
        # Close explicitly so the output file is finalized on every exit path
        vcf_out.close()
        vcf_in.close()
    if records_truncated != 0:
        sys.stderr.write("Truncated {} records where SVLEN > {}\n".format(
            records_truncated, int(truncate_svlen)))
    if records_filtered != 0:
        sys.stderr.write("Filtered {} records where SVLEN < {}\n".format(
            records_filtered, int(minlength)))
Beispiel #8
0
def _af_filter(data, in_file, out_file):
    """Tag low allele-fraction calls with the "MinAF" soft filter.

    The threshold comes from ("algorithm", "min_allele_fraction") in
    data["config"], read as a percentage (10 when absent). Every record is
    written; a record is tagged when all FORMAT/AF values of the tumor sample
    are below the threshold. Nothing is rewritten when the uncompressed
    output or its ".gz" sibling already exists. Returns the result of
    vcfutils.bgzip_and_index for the uncompressed output path.
    """
    config = data["config"]
    min_freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug("Filtering MuTect2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]

    already_done = utils.file_exists(ungz_out_file) or utils.file_exists(ungz_out_file + ".gz")
    if not already_done:
        with file_transaction(data, ungz_out_file) as tx_out_file:
            reader = VCF(in_file)
            if utils.get_in(data["config"], ("algorithm", "min_allele_fraction")):
                origin = '(configured in bcbio as min_allele_fraction)'
            else:
                origin = '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)'
            reader.add_filter_to_header({
                'ID': 'MinAF',
                'Description': ('Allele frequency is lower than %s%% ' % (min_freq*100)) + origin})
            writer = Writer(tx_out_file, reader)
            # GATK 3.x can produce VCFs without sample names for empty VCFs
            try:
                tumor_index = reader.samples.index(dd.get_sample_name(data))
            except ValueError:
                tumor_index = None
            if tumor_index is None:
                # No tumor column to test: copy records through untouched
                for record in reader:
                    writer.write_record(record)
            else:
                for record in reader:
                    if np.all(record.format('AF')[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(record, 'MinAF')
                    writer.write_record(record)
            writer.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
Beispiel #9
0
def test_add_info_to_header():
    """A new INFO field added to the reader header survives a write/read round trip."""
    v = VCF(VCF_PATH)
    v.add_info_to_header({
        'ID': 'abcdefg',
        'Description': 'abcdefg',
        'Type': 'Character',
        'Number': '1'
    })
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    # mkstemp creates the file atomically; mktemp only returns a name and is
    # deprecated because another process can claim that name first.
    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)
    w = Writer(f, v)
    rec = next(v)

    rec.INFO["abcdefg"] = "XXX"
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    ret = v.INFO["abcdefg"]
    # the value may come back as bytes; normalize before comparing
    if isinstance(ret, bytes):
        ret = ret.decode()
    assert ret == "XXX", (dict(v.INFO), v.INFO["abcdefg"])
Beispiel #10
0
def test_add_flag():
    """A Flag INFO field reads back as True when set and is absent when cleared."""
    vcf = VCF(VCF_PATH)
    vcf.add_info_to_header({
        'ID': 'myflag',
        'Description': 'myflag',
        'Type': 'Flag',
        'Number': '0'
    })
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec = next(vcf)

    rec.INFO["myflag"] = True
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.INFO["myflag"] is True, dict(v.INFO)

    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec.INFO["myflag"] = False
    w.write_record(rec)
    # Close before reading back, as in the first half of the test, so the
    # record is on disk when the file is reopened.
    w.close()
    v = next(VCF(f))
    assert_raises(KeyError, v.INFO.__getitem__, "myflag")
def annotate_allelic_balance(vcffile, region):
    """Add HetAB / HetHomAltAB annotations to biallelic sites and stream to '-'.

    Both INFO fields are declared in the header. Every site of `region` is
    written; only sites for which is_biallelic() is true are first updated
    with the values from compute_allelic_balances(). A count of annotated
    versus total sites is logged at the end.
    """
    reader = VCF(vcffile)

    info_headers = (
        {
            'ID': 'HetAB',
            'Description': 'heterozygous genotype allele balance',
            'Type': 'Float',
            'Number': '1',
        },
        {
            'ID': 'HetHomAltAB',
            'Description': 'heterozygous + homozygous ALT genotype allele balance',
            'Type': 'Float',
            'Number': '1',
        },
    )
    for info_header in info_headers:
        reader.add_info_to_header(info_header)

    writer = Writer('-', reader)
    seen = 0
    annotated = 0

    for site in reader(region):
        seen += 1
        if is_biallelic(site):
            annotated += 1
            balances = compute_allelic_balances(site)
            (hetab, het_hom_alt_ab) = balances
            site = update_variant(site, hetab, het_hom_alt_ab)
        writer.write_record(site)

    writer.close()
    log("Annotated {} out of a possible {} sites".format(annotated, seen))
Beispiel #12
0
def write_pass_vcf(annotated_vcf):
    """Write the records of `annotated_vcf` that carry no FILTER value.

    The output path is `annotated_vcf` with a trailing ".annotated.vcf.gz"
    replaced by ".annotated.pass.vcf". Records whose FILTER is None (or the
    string 'None') are written, all others are counted as rejected. The
    output is then compressed with ``bgzip -f`` and indexed with ``tabix``.
    When no record passes, the output is regenerated from the header lines
    of the input before compression.

    NOTE(review): if `annotated_vcf` does not end in ".annotated.vcf.gz" the
    substitution leaves the path unchanged, so input and output would be the
    same file - confirm callers always pass the expected name.
    """
    out_vcf = re.sub(r'\.annotated\.vcf\.gz$', '.annotated.pass.vcf',
                     annotated_vcf)
    vcf = VCF(annotated_vcf)
    w = Writer(out_vcf, vcf)

    num_rejected = 0
    num_pass = 0
    for rec in vcf:
        if rec.FILTER is None or rec.FILTER == 'None':
            w.write_record(rec)
            num_pass += 1
        else:
            num_rejected += 1

    vcf.close()
    w.close()

    logger.info('Number of non-PASS/REJECTED variant calls: ' +
                str(num_rejected))
    logger.info('Number of PASSed variant calls: ' + str(num_pass))
    if num_pass == 0:
        logger.warning(
            'There are zero variants with a \'PASS\' filter in the VCF file')
        # Pipe the decompressed input through egrep to keep the header lines.
        # Without the '|' the words "egrep" and "^#" were handed to bgzip as
        # additional file names.
        os.system('bgzip -dc ' + str(annotated_vcf) + ' | egrep \'^#\' > ' +
                  str(out_vcf))
    os.system('bgzip -f ' + str(out_vcf))
    os.system('tabix -f -p vcf ' + str(out_vcf) + '.gz')

    return
Beispiel #13
0
def main():
    """Keep records whose REF/ALT length difference exceeds 49 and tag them.

    SVLEN is the length of the longest ALT allele minus the REF length;
    SVTYPE is "INS" for a positive SVLEN and "DEL" otherwise. Records that do
    not exceed the size cut-off are not written.
    """
    args = get_args()
    vcf_in = VCF(args.vcf)
    vcf_in.add_info_to_header({
        'ID': 'SVLEN',
        'Description': 'length of sv',
        'Type': 'Integer',
        'Number': '1'
    })
    vcf_in.add_info_to_header({
        'ID': 'SVTYPE',
        'Description': 'type of sv - just DEL or INS based on SVLEN',
        'Type': 'String',
        'Number': '1'
    })
    vcf_out = Writer(args.output, vcf_in)
    for v in vcf_in:
        # A record without any ALT allele has no length to compare; max() on
        # the empty list would raise ValueError, so such records are skipped.
        if not v.ALT:
            continue
        # Computed once and reused for the cut-off, SVLEN and SVTYPE
        svlen = max(len(alt) for alt in v.ALT) - len(v.REF)
        if abs(svlen) > 49:
            v.INFO["SVLEN"] = svlen
            v.INFO["SVTYPE"] = "INS" if svlen > 0 else "DEL"
            vcf_out.write_record(v)
    vcf_in.close()
    vcf_out.close()
Beispiel #14
0
def test_write_missing_contig():
    """Records of seg.vcf.gz can be given new genotypes and written out."""
    source = VCF('{}/seg.vcf.gz'.format(HERE))
    sink = Writer('/dev/null', source)
    for record in source:
        # replace the genotypes with a single [1, 1, False] entry
        record.genotypes = [[1, 1, False]]
        sink.write_record(record)
    sink.close()
Beispiel #15
0
def use_cyvcf2(vcf_file, vcf_out=None):
    """Append the MSI_FAIL filter to records flagged by proc_fields.

    Records go to `vcf_out` when it is given; otherwise the raw header and
    the string form of every record are written to stdout.

    Timings noted by the original author:
        File out:       2:17.51
        stdout + bgzip: 2:50.35
    """
    from cyvcf2 import VCF, Writer

    reader = VCF(vcf_file)
    reader.add_filter_to_header({
        'ID': 'MSI_FAIL',
        'Description': 'Possible homopolymer artefact'
    })

    writer = None
    if vcf_out:
        writer = Writer(vcf_out, reader)
    else:
        sys.stdout.write(reader.raw_header)

    for record in reader:
        first_alt = record.ALT[0]
        first_af = record.format('AF')[0][0]
        if proc_fields(record.REF, first_alt, first_af, record.INFO['MSI']):
            # keep whatever filters are already present and add ours
            existing = record.FILTER.split(';') if record.FILTER else []
            record.FILTER = ';'.join(existing + ['MSI_FAIL'])
        if writer:
            writer.write_record(record)
        else:
            sys.stdout.write(str(record))

    if writer:
        writer.close()
Beispiel #16
0
def test_add_flag():
    """Round-trip a Flag INFO field: set it, read it back, then clear it."""
    vcf = VCF(VCF_PATH)
    vcf.add_info_to_header({'ID': 'myflag', 'Description': 'myflag',
        'Type':'Flag', 'Number': '0'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    # next(obj) works on Python 2 and 3; the .next() method is Python 2 only
    rec = next(vcf)

    rec.INFO["myflag"] = True
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    # NOTE(review): this expects a set flag to read back as None; confirm
    # against the cyvcf2 version this test targets.
    assert v.INFO["myflag"] is None, dict(v.INFO)

    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec.INFO["myflag"] = False
    w.write_record(rec)
    # Close before reading back, as in the first half of the test, so the
    # record is on disk when the file is reopened.
    w.close()
    v = next(VCF(f))
    assert_raises(KeyError, v.INFO.__getitem__, "myflag")
Beispiel #17
0
def test_write_missing_contig():
    """Writing re-genotyped records from seg.vcf.gz to /dev/null completes."""
    vcf_path = '{}/seg.vcf.gz'.format(HERE)
    reader = VCF(vcf_path)
    writer = Writer('/dev/null', reader)
    new_genotypes = [[1,1,False]]
    for variant in reader:
        variant.genotypes = new_genotypes
        writer.write_record(variant)
    writer.close()
def prepare_benign_training_sets(vcf, output, intersection_clinvar_hgmd,
                                 intersection_circularity, db,
                                 max_variants=3000):
    """Write single-base substitutions of `vcf` that are absent from ClinVar/HGMD.

    `vcf` is an already opened reader; it is iterated and also used as the
    template for the writer on `output`. Only records with a one-base REF and
    a one-base first ALT are considered. A record whose
    "CHROM_POS_REF_ALT" id is not in `intersection_clinvar_hgmd` is written;
    if the id is also not in `intersection_circularity`, the record is first
    labelled with True_Label=-1 and Source=`db`.

    max_variants: number of input records to read before stopping (3000 by
        default, the value that used to be hard-coded). ``None`` never equals
        the record counter, so it disables the cap.

    Returns a dict with the keys 'Database', 'Circularity_filtering' and
    'High_confidence'.
    """
    o = Writer(output, vcf)
    stats_dict = collections.defaultdict()
    stats_dict['Database'] = db
    stats_dict['Circularity_filtering'] = 0
    stats_dict['High_confidence'] = 0

    try:
        for counter, variant in enumerate(
                tqdm(
                    vcf,
                    desc=
                    'Removing overlapping variants : [gnomAD] ∩ [ClinVar, HGMD, Training_sets]'
                )):
            if counter == max_variants:
                break
            if len(variant.REF) == 1 and len(variant.ALT[0]) == 1:

                id_var = str(variant.CHROM) + '_' + str(
                    variant.POS) + '_' + str(variant.REF) + '_' + str(
                        variant.ALT[0])
                if id_var not in intersection_clinvar_hgmd:
                    stats_dict['High_confidence'] += 1

                    if id_var not in intersection_circularity:
                        stats_dict['Circularity_filtering'] += 1
                        variant.INFO['True_Label'] = -1
                        variant.INFO['Source'] = db
                    # written whether or not the circularity label was added
                    o.write_record(variant)
    finally:
        # Close explicitly so the output file is finalized on every exit path
        o.close()
    return stats_dict
Beispiel #19
0
def output_pharmcat_ready_vcf(input_vcf, output_dir, output_prefix):
    '''
    Write one single-sample VCF per sample found in input_vcf.

    The sample named 'PharmCAT' is removed from the sample list first
    (list.remove raises ValueError when it is not present). For every
    remaining sample the input is reopened restricted to that sample and all
    of its records are copied to
    "<output_dir>/<output_prefix>.<sample>.vcf".
    '''

    probe = VCF(input_vcf)
    sample_names = probe.samples
    sample_names.remove('PharmCAT')
    probe.close()

    # one output file per remaining sample
    for sample_name in sample_names:
        print('Generating a PharmCAT-ready VCF for ' + sample_name)
        sample_reader = VCF(input_vcf, samples=sample_name)

        target_name = output_prefix + '.' + sample_name + '.vcf'
        target_path = os.path.join(output_dir, target_name)

        # the writer takes its header from the single-sample reader
        sample_writer = Writer(target_path, sample_reader, mode='w')
        for record in sample_reader:
            sample_writer.write_record(record)

        sample_writer.close()
        sample_reader.close()
Beispiel #20
0
def processVCF(invcf, remm, dann, out):
    """Annotate every record of `invcf` with ReMM and DANN values and write to `out`.

    `remm` and `dann` are tabix-indexed files queried per record over
    (CHROM, start, end). ReMM takes column 3 of a row whose column 2 equals
    the record position; DANN takes column 5 (rounded to 3 decimals) of the
    first row that also matches REF and the first ALT. "." is stored when no
    usable value is found or the lookup raises ValueError.

    NOTE(review): only the DANN header line is added here; presumably the
    input already declares ReMM - confirm.
    """
    vcf_data = VCF(invcf, gts012=True)
    tbx_remm = pysam.TabixFile(remm)
    tbx_dann = pysam.TabixFile(dann)
    vcf_data.add_info_to_header({
        'ID': 'DANN',
        'Description':
        'A deep neural network aimed to recognize pathogenic variants by annotating genetic variants, especially in noncoding regions.',
        'Type': 'String',
        'Number': '.'
    })
    w = Writer(out, vcf_data)
    try:
        for record in vcf_data:
            try:
                for row in tbx_remm.fetch(record.CHROM, record.start,
                                          record.end):
                    fields = str(row).split()
                    if int(fields[1]) == record.POS:
                        record.INFO["ReMM"] = fields[2]
                # .get() instead of indexing: when no row matched the key may
                # be absent, and the resulting KeyError was not handled by
                # the ValueError clause below.
                if not record.INFO.get("ReMM"):
                    record.INFO["ReMM"] = "."
            except ValueError:
                record.INFO["ReMM"] = "."

            try:
                for row in tbx_dann.fetch(record.CHROM, record.start,
                                          record.end):
                    fields = row.split()
                    if (int(fields[1]) == record.POS
                            and fields[2] == record.REF
                            and fields[3] == record.ALT[0]):
                        record.INFO["DANN"] = round(float(fields[4]), 3)
                        break
                    else:
                        # placeholder until (and unless) a later row matches
                        record.INFO["DANN"] = "."
            except ValueError:
                record.INFO["DANN"] = "."

            w.write_record(record)
    finally:
        # Release the writer and all three input handles on every exit path
        w.close()
        tbx_remm.close()
        tbx_dann.close()
        vcf_data.close()
Beispiel #21
0
def extract_pharmcat_pgx_regions(tabix_executable_path, input_vcf, output_dir,
                                 input_ref_pgx_vcf):
    '''
    Copy the records of input_vcf that fall in the regions derived from
    input_ref_pgx_vcf into a compressed VCF and index it.

    One region per chromosome is built from the reference positions via
    get_vcf_pos_min_max. Every written record has 'chr' put in front of a
    chromosome name that starts with digits. The output path
    ("<output_dir>/<prefix>.pgx_regions.vcf.gz") is returned after
    tabix_index_vcf has been called on it.
    '''

    print(
        'Modify chromosome names.\nExtract PGx regions based on the input reference PGx position file.'
    )
    output_name = obtain_vcf_file_prefix(input_vcf) + '.pgx_regions.vcf.gz'
    path_output = os.path.join(output_dir, output_name)

    source_reader = VCF(input_vcf)
    # opened and closed below; no records are read through this handle
    reference_reader = VCF(input_ref_pgx_vcf)

    # reference positions grouped per chromosome (names reduced to integers)
    reference_table = allel.vcf_to_dataframe(input_ref_pgx_vcf)
    stripped_chrom = reference_table['CHROM'].replace({'chr': ''}, regex=True)
    reference_table['CHROM'] = stripped_chrom.astype(str).astype(int)
    grouped_positions = reference_table.groupby(['CHROM'])['POS']
    per_chrom = grouped_positions.agg(get_vcf_pos_min_max).reset_index()

    def to_region(row):
        # join the row's values with ':' to form the region string
        return ':'.join(row.values.astype(str))

    chr_prefix = re.compile("^chr")
    source_has_chr = any(
        chr_prefix.match(name) for name in source_reader.seqnames)
    if source_has_chr:
        # source contigs already carry 'chr': prefix the region strings too
        regions = per_chrom.apply(to_region, axis=1).replace({'^': 'chr'},
                                                             regex=True)
    else:
        # source contigs lack 'chr': declare the prefixed names in the header
        for contig in source_reader.seqnames:
            source_reader.add_to_header('##contig=<ID=chr' + contig + '>')
        regions = per_chrom.apply(to_region, axis=1)

    # header comes from the (possibly extended) source reader
    writer = Writer(path_output, source_reader, mode="wz")
    for region in regions:
        for record in source_reader(region):
            record.CHROM = re.sub(r'^([0-9]+)', r'chr\1', record.CHROM)
            writer.write_record(record)

    source_reader.close()
    reference_reader.close()
    writer.close()

    tabix_index_vcf(tabix_executable_path, path_output)

    return path_output
def main(invcf: str = typer.Argument(..., help="输入的vcf文件"),
         outvcf: str = typer.Argument(..., help="输出的vcf文件"),
         mindepth: int = typer.Option(10, help="最低reads覆盖率"),
         het_altrange: Tuple[float, float] = typer.Option((0.2, 0.8),
                                                          help="杂合位点的alt频率范围"),
         homref_maxaltrate: float = typer.Option(
             0, help="纯合ref型GT,最大alt reads比例不超过这个"),
         homalt_minaltrate: float = typer.Option(
             1, help="纯合alt型GT,最小alt reads比例不低于这个")):
    """
    Mask genotypes that meet any of the following:
    heterozygous sites whose alt-read frequency is outside the 20% to 80% range;
    homozygous sites whose read support is not 100%;
    sites covered by fewer than 10 reads.
    """
    vcf = VCF(invcf)
    w = Writer(outvcf, vcf)
    for v in vcf:
        indicies_mask = filter_samples(v, mindepth, het_altrange,
                                       homref_maxaltrate, homalt_minaltrate)
        if indicies_mask:
            # Read the genotype list once, edit that list, and assign it
            # back. Previously the edits were made on one `v.genotypes` read
            # and a second, separate read was assigned, so the mask was kept
            # only if both reads returned the same list object.
            genotypes = v.genotypes
            for index in indicies_mask:
                genotypes[index] = [-1] * v.ploidy + [False]
            v.genotypes = genotypes
        w.write_record(v)
    w.close()
    vcf.close()
Beispiel #23
0
    def seperate_vcffile(self):
        """Split each VCF found under from_directory into a per-sample gzip file.

        For every input file the chromosome name of its first record is used
        in the output name. Output goes to
        "<target_directory>/<sample>/<chrom>-<sample>[<n>].vcf.gz", where <n>
        is appended when the uncompressed name already exists. Records whose
        genotype is reference-only are left out. Because of the
        unconditional ``break`` at the end of the sample loop, only the first
        sample of each file is processed.
        """
        for file in self.search_vcf_file(self.from_directory):
            vcf_read = VCF(file)
            samples = vcf_read.samples

            # chromosome name of the first record, "" when there is none
            first_record = next(iter(vcf_read), None)
            chromosome_num = "" if first_record is None else first_record.CHROM

            for sample in samples:
                start = time.time()
                try:
                    if not os.path.isdir(self.target_directory):
                        os.makedirs(self.target_directory)
                    sample_dir = self.target_directory + "/" + sample
                    if not os.path.isdir(sample_dir):
                        os.makedirs(sample_dir)
                except OSError as e:
                    print("Failed to create directory!!!!!", e)
                    raise

                # pick a file name that does not exist yet
                stem = chromosome_num + "-" + sample
                filepath = os.path.join(sample_dir, stem + ".vcf")
                index = 0
                while os.path.exists(filepath):
                    index += 1
                    filepath = os.path.join(sample_dir,
                                            stem + str(index) + ".vcf")

                out_read_vcf = VCF(file, samples=[sample])
                write_file = Writer(filepath, out_read_vcf)

                on_y = chromosome_num == "Y"
                for variant in out_read_vcf:
                    genotype = variant.genotypes[0]
                    if on_y:
                        # on "Y" only the first allele is looked at
                        keep = genotype[0] != 0
                    else:
                        keep = not (genotype[0] == 0 and genotype[1] == 0)
                    if keep:
                        write_file.write_record(variant)

                write_file.close()
                out_read_vcf.close()

                # compress the plain file, then drop the uncompressed copy
                with open(filepath, "rb") as f_in, \
                        gzip.open(filepath + ".gz", "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)
                os.remove(filepath)

                sec = time.time() - start
                print(sample + " write end...",
                      time.strftime("%H:%M:%S", time.gmtime(sec)))
                break
            vcf_read.close()
Beispiel #24
0
def write_truncate_vcf(path_in: str, path_out: str, trunc: int) -> int:
    """Copy at most `trunc` records from `path_in` to `path_out`.

    Returns the number of records written. Previously the last loop index
    was returned, which was one less than the number written when the input
    held fewer than `trunc` records, and was unbound (raising
    UnboundLocalError) for an input without records.
    """
    # One reader serves both as the writer's header template and as the
    # record source.
    reader = VCF(path_in, threads=nb_cores)
    w = Writer(path_out, reader)
    written = 0
    try:
        for v in reader:
            if written == trunc:
                break
            w.write_record(v)
            written += 1
    finally:
        # Close explicitly so the output file is finalized on every exit path
        w.close()
        reader.close()
    return written
Beispiel #25
0
def main():
    """Copy the records whose INFO/SVLEN is greater than 49 to args.output."""
    args = get_args()
    reader = VCF(args.vcf)
    writer = Writer(args.output, reader)
    for record in reader:
        sv_length = record.INFO["SVLEN"]
        if not sv_length > 49:
            continue
        writer.write_record(record)
    reader.close()
    writer.close()
Beispiel #26
0
def main():
    """Write all records to args.output, flipping the sign of a non-negative SVLEN on DEL records."""
    args = get_args()
    vcf = VCF(args.vcf)
    w = Writer(args.output, vcf)
    try:
        for v in vcf:
            if v.INFO["SVTYPE"] == "DEL" and not v.INFO["SVLEN"] < 0:
                v.INFO["SVLEN"] = -v.INFO["SVLEN"]
            w.write_record(v)
    finally:
        # Close explicitly so the output file is finalized on every exit path
        w.close()
        vcf.close()
Beispiel #27
0
def main():
    """Split args.vcf into one file per INFO/SVTYPE value.

    All records are read and grouped by SVTYPE first; each group is then
    written to "<input without '.vcf'>_<SVTYPE without '/'>.vcf".

    NOTE: a record without SVTYPE is grouped under None, and building that
    group's file name raises AttributeError (None has no ``replace``).
    """
    args = get_args()
    # renamed from `vars`, which shadowed the builtin of the same name
    by_svtype = defaultdict(list)
    vcf = VCF(args.vcf)
    output = args.vcf.replace('.vcf', '')
    for v in vcf:
        by_svtype[v.INFO.get('SVTYPE')].append(v)
    for k, varlist in by_svtype.items():
        w = Writer(output + '_' + k.replace('/', '') + '.vcf', vcf)
        try:
            for v in varlist:
                w.write_record(v)
        finally:
            # Each per-type writer is closed before the next one is opened
            w.close()
Beispiel #28
0
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populate FORMAT/AF and soft-filter paired calls with AF<min_allele_fraction

    Strelka2 does not report AF directly; it is computed here as
    alt_counts/dp from fields that are present:
    somatic (paired is truthy)
      snps:    alt_counts = first column of FORMAT/<ALT>U      dp = first column of FORMAT/DP
      indels:  alt_counts = first column of FORMAT/TIR         dp = first column of FORMAT/DP
    germline (records that have FORMAT/AD)
      alt_counts = AD columns after the first                  dp = row sum over all AD columns
    Records without AD in the germline case are written without AF.

    Non-finite ratios are stored as 0. The "MinAF" filter is added only in
    the paired case, when every AF value of the tumor sample is below the
    threshold. Returns the result of vcfutils.bgzip_and_index for the
    uncompressed output path.
    """
    data = items[0] if not paired else paired.tumor_data
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    output_present = utils.file_exists(ungz_out_file) or utils.file_exists(ungz_out_file + ".gz")
    if not output_present:
        with file_transaction(data, ungz_out_file) as tx_out_file:
            reader = VCF(in_file)
            af_format_header = {
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'}
            reader.add_format_to_header(af_format_header)
            if utils.get_in(data["config"], ("algorithm", "min_allele_fraction")):
                origin = '(configured in bcbio as min_allele_fraction)'
            else:
                origin = '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)'
            reader.add_filter_to_header({
                'ID': 'MinAF',
                'Description': ('Allele frequency is lower than %s%% ' % (min_freq*100)) + origin})
            writer = Writer(tx_out_file, reader)
            tumor_index = reader.samples.index(data['description'])
            for record in reader:
                alt_counts = None
                depth = None
                if paired:  # somatic
                    # first column of the tiered count field
                    count_field = record.ALT[0] + 'U' if record.is_snp else 'TIR'
                    alt_counts = record.format(count_field)[:,0]
                    depth = record.format('DP')[:,0]
                elif record.format("AD") is not None:  # germline with AD
                    alt_counts = record.format('AD')[:,1:]  # AD=REF,ALT1,ALT2,...
                    depth = np.sum(record.format('AD')[:,0:], axis=1)
                # otherwise: germline gVCF record, nothing to compute
                if depth is not None:
                    # division by zero is silenced; non-finite results become .0
                    with np.errstate(divide='ignore', invalid='ignore'):
                        af = np.true_divide(alt_counts, depth)
                        af[~np.isfinite(af)] = .0
                    record.set_format('AF', af)
                    if paired and np.all(af[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(record, 'MinAF')
                writer.write_record(record)
            writer.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
Beispiel #29
0
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populate FORMAT/AF and soft-filter somatic variants with AF<min_allele_fraction

    No record is dropped: every record is written to the output. For paired (somatic) input,
    records whose tumor AF is below the threshold get "MinAF" appended to FILTER.

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP                {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP                TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(AD)           AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(AD)           AD = ref_count,alt_counts

    Args:
        paired: object exposing ``tumor_data`` for somatic calling; falsy for germline calling.
        items: sample data dictionaries; ``items[0]`` supplies the configuration when ``paired`` is falsy.
        in_file: path of the VCF to annotate.
        out_file: requested output path; its extension is replaced with ``.vcf`` for the
            uncompressed intermediate file.

    Returns:
        Whatever ``vcfutils.bgzip_and_index`` returns for the uncompressed output file.
    """
    data = paired.tumor_data if paired else items[0]
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.info("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            try:
                tumor_index = vcf.samples.index(data['description'])
                for rec in vcf:
                    if paired:  # somatic
                        if rec.is_snp:
                            alt_counts = rec.format(rec.ALT[0] + 'U')[:, 0]  # {ALT}U=tier1_depth,tier2_depth
                        else:  # indels
                            alt_counts = rec.format('TIR')[:, 0]  # TIR=tier1_depth,tier2_depth
                        dp = rec.format('DP')[:, 0]
                    else:
                        ad = rec.format('AD')  # AD=REF,ALT1,ALT2,... ; None for germline gVCF records
                        if ad is not None:
                            alt_counts = ad[:, 1:]
                            # keepdims gives dp the shape (n_samples, 1) so the division below is done
                            # per sample row; a flat (n_samples,) vector would be broadcast across the
                            # ALT axis and fail or give wrong values with more than one sample.
                            dp = np.sum(ad, axis=1, keepdims=True)
                        else:
                            alt_counts, dp = (None, None)
                    if dp is not None:
                        with np.errstate(divide='ignore', invalid='ignore'):  # ignore division by zero and put AF=.0
                            af = np.true_divide(alt_counts, dp)
                            af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                        rec.set_format('AF', af)
                        if paired and np.all(af[tumor_index] < min_freq):
                            vcfutils.cyvcf_add_filter(rec, 'MinAF')
                    w.write_record(rec)
            finally:
                # Close the writer even if a record fails, so the handle is not leaked.
                w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
Beispiel #30
0
    def to_vcf(self, path):
        """
        Write every variant of this query result to a VCF file.

        The output header is taken from ``self.vcf``; each item yielded by
        iterating ``self`` contributes its ``source`` record.

        Args:
          path: path of the file.
        """
        from cyvcf2 import Writer
        writer = Writer(path, self.vcf)
        try:
            for v in self:
                writer.write_record(v.source)
        finally:
            # Always close the writer so the output is flushed and the handle released.
            writer.close()
Beispiel #31
0
def canonicalize_vcf(input: PathType, output: PathType) -> None:
    """Canonicalize the fields in a VCF file by writing all INFO fields in the order that they appear in the header.

    Args:
        input: path of the VCF to read.
        output: path of the VCF to write.
    """

    with open_vcf(input) as vcf:

        info_field_names = _info_fields(vcf.raw_header)

        w = Writer(str(output), vcf)
        try:
            for v in vcf:
                w.write_record(_reorder_info_fields(w, v, info_field_names))
        finally:
            # Close the writer even when a record fails, so the handle is not leaked.
            w.close()
Beispiel #32
0
def main():
    """Replace the ALT of deletions whose first ALT equals REF with the symbolic ``<DEL>`` allele.

    Reads ``args.vcf``, writes every record to ``args.output`` and prints the
    number of records that were rewritten.
    """
    args = get_args()
    vcf = VCF(args.vcf)
    output = Writer(args.output, vcf)
    incorrect = 0
    try:
        for v in vcf:
            # Records without any ALT allele or without an SVTYPE annotation are
            # passed through unchanged instead of raising IndexError/KeyError.
            if v.ALT and v.REF == v.ALT[0] and v.INFO.get("SVTYPE") == "DEL":
                v.ALT = "<DEL>"
                incorrect += 1
            output.write_record(v)
        print("Fixed {} positions".format(incorrect))
    finally:
        # Release both handles even if processing a record fails.
        output.close()
        vcf.close()
Beispiel #33
0
def main(min_allele_balance, max_allele_balance, allele_balance_tag,
         variant_sample_depth_tag, min_depth, exclude_filters, exclude_fields,
         vcf):
    """Re-filter the variants of a VCF and stream the result to standard output.

    All parameters except ``vcf`` are forwarded unchanged to ``Filter``;
    ``vcf`` is the path of the VCF to read. The FILTER and INFO header lines
    supplied by the filter are added to the reader's header before writing.
    """
    reader = VCF(vcf)
    refilter = Filter(min_allele_balance, max_allele_balance,
                      allele_balance_tag, variant_sample_depth_tag, min_depth,
                      exclude_filters, exclude_fields)
    reader.add_filter_to_header(refilter.filtered_header())
    reader.add_info_to_header(refilter.rescued_header())
    writer = Writer('-', reader)

    try:
        for variant in reader:
            refilter(variant)  # Modifies variant filter status in place
            writer.write_record(variant)
    finally:
        # Close explicitly so the output is flushed and both handles are released.
        writer.close()
        reader.close()
Beispiel #34
0
def main():
    """Overwrite REF alleles that disagree with the reference genome.

    Every record of ``args.vcf`` is written to ``args.output``; records whose
    REF differs from the nucleotide looked up in ``args.genome`` have REF
    replaced first. The number of replaced records is printed at the end.
    """
    args = get_args()
    reference = Fasta(args.genome)
    reader = VCF(args.vcf)
    writer = Writer(args.output, reader)
    n_fixed = 0
    for record in reader:
        expected = get_reference_nucleotide(record.CHROM, record.start, reference)
        if record.REF != expected:
            record.REF = expected
            n_fixed += 1
        writer.write_record(record)
    print("Fixed {} positions".format(n_fixed))
    writer.close()
    reader.close()
Beispiel #35
0
def test_issue44():
    """Genotypes re-assigned through the setter must survive a write/read round trip."""
    out_path = '__o.vcf'
    reader = VCF('{}/issue_44.vcf'.format(HERE))
    writer = Writer(out_path, reader)
    for variant in reader:
        # Assign the genotypes back unchanged to exercise the setter.
        genotypes = variant.genotypes
        variant.genotypes = genotypes
        writer.write_record(variant)
    writer.close()
    #           "./."            "."          ".|."           "0|0"
    expected = [[-1, -1, False], [-1, False], [-1, -1, True], [0, 0, True]]
    print("", file=sys.stderr)
    for idx, variant in enumerate(VCF(out_path)):
        want = expected[idx]
        assert variant.genotypes == [want], (variant.genotypes, want)
    os.unlink(out_path)
Beispiel #36
0
def test_add_filter_to_header():
    """A FILTER declared on the reader's header can be set on a record and read back after writing."""
    v = VCF(VCF_PATH)
    # NOTE that we have to add the filter to the header of the reader,
    # not the writer because the record will be associated with the reader
    v.add_filter_to_header({'ID': 'abcdefg', 'Description': 'abcdefg'})

    # mkstemp creates the file itself; mktemp only returns a name and is
    # deprecated because another process can claim that name first.
    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)
    w = Writer(f, v)
    rec = next(v)

    rec.FILTER = ["abcdefg"]
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.FILTER == "abcdefg", v.FILTER
Beispiel #37
0
def test_add_info_to_header():
    """An INFO field declared on the reader's header can be set on a record and read back after writing."""
    v = VCF(VCF_PATH)
    v.add_info_to_header({'ID': 'abcdefg', 'Description': 'abcdefg',
        'Type':'Character', 'Number': '1'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    # mkstemp creates the file itself; mktemp only returns a name and is
    # deprecated because another process can claim that name first.
    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)
    w = Writer(f, v)
    rec = next(v)

    rec.INFO["abcdefg"] = "XXX"
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.INFO["abcdefg"] == "XXX", dict(v.INFO)
Beispiel #38
0
def test_writer():
    """FILTER values set on a record (one, several, PASS) are written out and read back as expected."""

    v = VCF(VCF_PATH)
    # mkstemp creates the file itself; mktemp only returns a name and is
    # deprecated because another process can claim that name first.
    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)

    o = Writer(f, v)
    rec = next(v)
    rec.INFO["AC"] = "3"
    rec.FILTER = ["LowQual"]
    o.write_record(rec)

    rec.FILTER = ["LowQual", "VQSRTrancheSNP99.90to100.00"]
    o.write_record(rec)

    rec.FILTER = "PASS"
    o.write_record(rec)

    o.close()

    # The same record is written three times, once per FILTER setting above.
    expected = ["LowQual".encode(), "LowQual;VQSRTrancheSNP99.90to100.00".encode(), None]

    for i, variant in enumerate(VCF(f)):
        assert variant.FILTER == expected[i], (variant.FILTER, expected[i])
Beispiel #39
0
def run(inheritance_model, ped, vcf, min_depth, min_gq, min_kindreds, severity):
    """Write variants that fit an inheritance model in enough families, gene by gene.

    Consecutive variants are grouped by the gene of their most severe
    annotated effect. For each group, every family is tested against the
    inheritance model; if at least ``min_kindreds`` distinct families match
    somewhere in the group, each matching variant is written to standard
    output with ``INFO/inheritance`` set to ``<gene>:<family ids>``.

    Args:
        inheritance_model: name of the ``EvalFamily`` method to call
            (e.g. ``auto_rec``, ``auto_dom``, ``de_novo``).
        ped: pedigree input handed to ``Family.from_ped``.
        vcf: path of the annotated VCF to read.
        min_depth: forwarded to the inheritance-model method.
        min_gq: forwarded to the inheritance-model method.
        min_kindreds: minimum number of distinct matching families per gene.
        severity: container of accepted ``impact_severity`` values, or None
            to accept every effect.
    """
    from cyvcf2 import VCF, Writer
    vcf = VCF(vcf, samples="-")

    # Map each annotation INFO key present in the header to the list of
    # sub-field names parsed from its Description line.
    annos = {}
    if "ANN" in vcf:
        desc = vcf["ANN"]["Description"]
        # Store the parsed field names (as for EFF/CSQ), not the raw description.
        annos["ANN"] = [x.strip("\"'") for x in re.split(r"\s*\|\s*", desc.split(":", 1)[1].strip('" '))]
    if "EFF" in vcf:
        desc = vcf["EFF"]["Description"]
        annos["EFF"] = [x.strip(" [])'(\"") for x in re.split(r"\||\(", desc.split(":", 1)[1].strip())]
    if "CSQ" in vcf:
        desc = vcf["CSQ"]["Description"]
        annos["CSQ"] = [x.strip(" [])'(\"") for x in re.split(r"\||\(", desc.split(":", 1)[1].strip())]

    vcf.update(id="inheritance", type="String", number="1", description="inheritance stuffs")
    out = Writer("-", vcf)

    vcf_order = dict((n, i) for i, n in (enumerate(vcf.samples)))
    fams = Family.from_ped(ped, order=vcf_order)
    for fam_id in fams:
        # Pair each family evaluator with the VCF sample indexes of its subjects.
        fams[fam_id] = (EvalFamily(fams[fam_id]), [s._i for s in fams[fam_id].subjects])

    def get_gene(variant):
        """Return the gene of the most severe qualifying effect, or None."""
        for anno in annos:
            raw = variant.INFO.get(anno)
            if raw is None:
                # The header declares this annotation but this record lacks it.
                continue
            effs = (Effect.new(anno, c, annos[anno]) for c in raw.split(","))
            # limit to requested severity
            if severity is not None:
                effs = [e for e in effs if e.impact_severity in severity]
            for eff in sorted(effs, reverse=True):
                if eff.gene:
                    return eff.gene

    try:
        # TODO: more flexible groupby
        for gene, variants in it.groupby(vcf, get_gene):

            matching_fams = defaultdict(list)
            saved_vars = []
            uniq_fams = set()

            for i, variant in enumerate(variants):
                saved_vars.append(variant)

                for family_id, (fam, idxs) in fams.items():
                    fam.gt_types = variant.gt_types[idxs]
                    fam.gt_depths = variant.gt_depths[idxs]
                    fam.gt_quals = variant.gt_quals[idxs]
                    # this dispatches to fam.auto_rec/auto_dom/de_novo/, etc. by the string
                    # in inheritance model
                    res = getattr(fam, inheritance_model)(min_depth=min_depth, min_gq=min_gq)

                    # matched the inheritance model.
                    if res:  # can add custom logic here, e.g. and v.call_rate > 0.9:
                        matching_fams[i].append(family_id)
                        uniq_fams.add(family_id)

            # At least one family must match, even when min_kindreds is 0 or negative.
            n_fams = len(uniq_fams)
            if n_fams > 0 and n_fams >= min_kindreds:

                # TODO (comp_het): idxs = matching_fams.keys()
                # run idxs[1:] vs idxs[:-1] for variants
                for i, family_ids in sorted(matching_fams.items()):
                    variant = saved_vars[i]
                    # Sort the family ids so the output is deterministic.
                    variant.INFO["inheritance"] = "%s:%s" % (gene, ",".join(sorted(set(family_ids))))

                    out.write_record(variant)
    finally:
        # Close explicitly so the output is flushed and the handle released.
        out.close()
Beispiel #40
0
from cyvcf2 import VCF, Writer
import re
import sys

# Matches a "," or "|" separator; raw string avoids the invalid "\|" escape.
patt = re.compile(r',|\|')

def clinvar(v):
    """Return True when the variant's CLNSIG INFO value is exactly the string "5"."""
    # NOTE: multi-valued entries such as "5|2" or "4,5" are not matched.
    significance = v.INFO.get("CLNSIG")
    return significance == "5"

def aaf(v, max_aaf):
    """Return True if the variant's ``max_aaf_all`` is at most ``max_aaf``.

    Variants without a ``max_aaf_all`` INFO value always pass.
    """
    observed = v.INFO.get("max_aaf_all")
    if observed is None:
        return True
    return float(observed) <= float(max_aaf)

# Command line: <vcf_path> <max_aaf>
vcf_path, max_aaf = sys.argv[1], float(sys.argv[2])

# Stream the input to standard output, keeping only records that pass both
# the clinvar() and aaf() checks.
viter = VCF(vcf_path)
w = Writer("-", viter)
pos = lambda v: (v.CHROM, v.start, v.end)
for v in viter:
    if not (clinvar(v) and aaf(v, max_aaf)):
        continue
    w.write_record(v)
w.close()