Example #1
0
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populate FORMAT/AF and flag somatic variants with AF < min_allele_fraction.

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP                {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP                TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(AD)           AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(AD)           AD = ref_count,alt_counts

    Every input record is written to the output. In paired (somatic) mode a record whose
    tumor AF is below the threshold additionally gets the ``MinAF`` filter; in germline mode
    records are only annotated. Germline records without FORMAT/AD are written unchanged.

    The threshold is read from ``algorithm -> min_allele_fraction`` in the sample config as a
    percentage (default 10) and converted to a fraction.

    :param paired: paired tumor/normal object exposing ``tumor_data``, or a falsy value for germline calling
    :param items: sample data dictionaries; only ``items[0]`` is used, and only when ``paired`` is falsy
    :param in_file: Strelka2 VCF to read
    :param out_file: requested output path; the uncompressed name is derived from it
    :return: whatever ``vcfutils.bgzip_and_index`` returns for the uncompressed output path
    """
    data = paired.tumor_data if paired else items[0]
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s", min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                # NOTE(review): the text says TIR/DPI for somatic indels, but the code below divides by FORMAT/DP.
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            try:
                tumor_index = vcf.samples.index(data['description'])
                for rec in vcf:
                    if paired:  # somatic?
                        if rec.is_snp:  # snps?
                            alt_counts = rec.format(rec.ALT[0] + 'U')[:, 0]  # {ALT}U=tier1_depth,tier2_depth
                        else:  # indels
                            alt_counts = rec.format('TIR')[:, 0]  # TIR=tier1_depth,tier2_depth
                        dp = rec.format('DP')[:, 0]
                    else:
                        ad = rec.format('AD')  # AD=REF,ALT1,ALT2,...; fetched once per record
                        if ad is not None:  # germline?
                            alt_counts = ad[:, 1:]
                            # Keep dp as a column (n_samples, 1) so that it divides the
                            # (n_samples, n_alt) alt counts row by row; a flat (n_samples,)
                            # vector only broadcasts correctly when there is one sample.
                            dp = np.sum(ad, axis=1, keepdims=True)
                        else:  # germline gVCF record
                            alt_counts, dp = (None, None)
                    if dp is not None:
                        with np.errstate(divide='ignore', invalid='ignore'):  # ignore division by zero and put AF=.0
                            af = np.true_divide(alt_counts, dp)
                            af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                        rec.set_format('AF', af)
                        if paired and np.all(af[tumor_index] < min_freq):
                            vcfutils.cyvcf_add_filter(rec, 'MinAF')
                    w.write_record(rec)
            finally:
                # Close the writer even if a record fails, so the handle is not leaked.
                w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
Example #2
0
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populate FORMAT/AF and flag somatic variants with AF < min_allele_fraction.

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP                {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP                TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(AD)           AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(AD)           AD = ref_count,alt_counts

    Every input record is written to the output. In paired (somatic) mode a record whose
    tumor AF is below the threshold additionally gets the ``MinAF`` filter; in germline mode
    records are only annotated. Germline records without FORMAT/AD are written unchanged.

    The threshold is read from ``algorithm -> min_allele_fraction`` in the sample config as a
    percentage (default 10) and converted to a fraction.

    :param paired: paired tumor/normal object exposing ``tumor_data``, or a falsy value for germline calling
    :param items: sample data dictionaries; only ``items[0]`` is used, and only when ``paired`` is falsy
    :param in_file: Strelka2 VCF to read
    :param out_file: requested output path; the uncompressed name is derived from it
    :return: whatever ``vcfutils.bgzip_and_index`` returns for the uncompressed output path
    """
    data = paired.tumor_data if paired else items[0]
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.info("Filtering Strelka2 calls with allele fraction threshold of %s", min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                # NOTE(review): the text says TIR/DPI for somatic indels, but the code below divides by FORMAT/DP.
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            try:
                tumor_index = vcf.samples.index(data['description'])
                for rec in vcf:
                    if paired:  # somatic?
                        if rec.is_snp:  # snps?
                            alt_counts = rec.format(rec.ALT[0] + 'U')[:, 0]  # {ALT}U=tier1_depth,tier2_depth
                        else:  # indels
                            alt_counts = rec.format('TIR')[:, 0]  # TIR=tier1_depth,tier2_depth
                        dp = rec.format('DP')[:, 0]
                    else:
                        ad = rec.format('AD')  # AD=REF,ALT1,ALT2,...; fetched once per record
                        if ad is not None:  # germline?
                            alt_counts = ad[:, 1:]
                            # Keep dp as a column (n_samples, 1) so that it divides the
                            # (n_samples, n_alt) alt counts row by row; a flat (n_samples,)
                            # vector only broadcasts correctly when there is one sample.
                            dp = np.sum(ad, axis=1, keepdims=True)
                        else:  # germline gVCF record
                            alt_counts, dp = (None, None)
                    if dp is not None:
                        with np.errstate(divide='ignore', invalid='ignore'):  # ignore division by zero and put AF=.0
                            af = np.true_divide(alt_counts, dp)
                            af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                        rec.set_format('AF', af)
                        if paired and np.all(af[tumor_index] < min_freq):
                            vcfutils.cyvcf_add_filter(rec, 'MinAF')
                    w.write_record(rec)
            finally:
                # Close the writer even if a record fails, so the handle is not leaked.
                w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
Example #3
0
def test_set_format_int_b():
    """Set an int64 ``PI`` FORMAT array on the first record and compare what get_gt_str reports."""
    reader = VCF('{}/test-format-string.vcf'.format(HERE))
    pi_header = dict(ID="PI", Number=1, Type="Integer", Description="Int example")
    status = reader.add_format_to_header(pi_header)
    assert status == 0

    first_record = next(reader)
    per_sample = np.array([855, 11], dtype=np.int64)
    first_record.set_format("PI", per_sample)

    observed = fmap(float, get_gt_str(first_record, "PI"))
    assert allclose(observed, [855, 11])
Example #4
0
def test_set_format_int3():
    "test that we can handle multiple (in this case 3) values per sample"
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="P3", Number=3, Type="Integer", Description="Int example")) == 0
    v = next(vcf)
    # The np.int alias was removed in NumPy 1.24; it was the builtin int, so dtype=int is equivalent.
    exp = np.array([[1, 11, 111], [2, 22, 222]], dtype=int)
    v.set_format("P3", exp)
    res = get_gt_str(v, "P3")
    assert res == ["1,11,111", "2,22,222"], (res, str(v))

    # The values read back through the record must match what was written.
    assert np.allclose(v.format("P3"), exp)
Example #5
0
def test_set_format_int3():
    "test that we can handle multiple (in this case 3) values per sample"
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="P3", Number=3, Type="Integer", Description="Int example")) == 0
    v = next(vcf)
    # The np.int alias was removed in NumPy 1.24; it was the builtin int, so dtype=int is equivalent.
    exp = np.array([[1, 11, 111], [2, 22, 222]], dtype=int)
    v.set_format("P3", exp)
    res = get_gt_str(v, "P3")
    assert res == ["1,11,111", "2,22,222"], (res, str(v))

    # The values read back through the record must match what was written.
    assert np.allclose(v.format("P3"), exp)
Example #6
0
def test_set_format_float():
    """Set a Float ``PS`` FORMAT field from default-float, float64 and float32 arrays in turn."""
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="PS", Number=1, Type="Float", Description="PS example")) == 0
    v = next(vcf)
    # The np.float alias was removed in NumPy 1.24; it was the builtin float, so dtype=float is equivalent.
    v.set_format("PS", np.array([0.555, 1.111], dtype=float))
    assert allclose(fmap(float, get_gt_str(v, "PS")), np.array([0.555, 1.111]))

    v.set_format("PS", np.array([8.555, 11.111], dtype=np.float64))
    assert allclose(fmap(float, get_gt_str(v, "PS")), [8.555, 11.111])

    v.set_format("PS", np.array([9998.555, 99911.111], dtype=np.float32))
    obs = fmap(float, get_gt_str(v, "PS"))
    assert allclose(obs, [9998.555, 99911.111]), obs
Example #7
0
def test_set_format_int():
    """Set an Integer ``PI`` FORMAT field from default-int, int64 and int32 arrays in turn."""
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="PI", Number=1, Type="Integer", Description="Int example")) == 0
    v = next(vcf)
    # The np.int alias was removed in NumPy 1.24; it was the builtin int, so dtype=int is equivalent.
    v.set_format("PI", np.array([5, 1], dtype=int))
    assert allclose(fmap(float, get_gt_str(v, "PI")), [5, 1])

    v.set_format("PI", np.array([855, 11], dtype=np.int64))
    assert allclose(fmap(float, get_gt_str(v, "PI")), [855, 11])

    v.set_format("PI", np.array([9998, 99911], dtype=np.int32))
    obs = fmap(float, get_gt_str(v, "PI"))
    assert allclose(obs, [9998, 99911]), obs
Example #8
0
def test_set_format_float():
    """Set a Float ``PS`` FORMAT field from default-float, float64 and float32 arrays in turn."""
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="PS", Number=1, Type="Float", Description="PS example")) == 0
    v = next(vcf)
    # The np.float alias was removed in NumPy 1.24; it was the builtin float, so dtype=float is equivalent.
    v.set_format("PS", np.array([0.555, 1.111], dtype=float))
    assert allclose(fmap(float, get_gt_str(v, "PS")), np.array([0.555, 1.111]))

    v.set_format("PS", np.array([8.555, 11.111], dtype=np.float64))
    assert allclose(fmap(float, get_gt_str(v, "PS")), [8.555, 11.111])

    v.set_format("PS", np.array([9998.555, 99911.111], dtype=np.float32))
    obs = fmap(float, get_gt_str(v, "PS"))
    assert allclose(obs, [9998.555, 99911.111]), obs
Example #9
0
# Record the reference genome in the header, but only when one was supplied.
if genome:
    vcf.add_to_header(f"##reference={genome}")

# Declare INFO/END (single integer per record).
vcf.add_info_to_header(
    {
        "ID": "END",
        "Number": "1",
        "Type": "Integer",
        "Description": "End position of the variant described in this record"
    }
)

# Declare FORMAT/GT (single string per sample).
vcf.add_format_to_header(
    {
        "ID": "GT",
        "Number": "1",
        "Type": "String",
        "Description": "Genotype",
    }
)

# Add one ##contig header line per row of the tab-separated file at `fai`
# (first column = contig name, second column = length; further columns are ignored).
# The contig names are also collected in `contigs`.
contigs = set()
with open(fai) as f:
    for line in f:
        contig, length, *_ = line.strip().split("\t")
        contigs.add(contig)
        vcf.add_to_header(f"##contig=<ID={contig},length={length}>")

# Append any additional raw header lines provided in `headers`.
for header in headers:
    vcf.add_to_header(header)
def generate_vcf(gnomad_vcf, outfile, pop, format_fields):
    """Write a multi-sample VCF with simulated genotypes derived from gnomAD allele counts.

    For every record of ``gnomad_vcf`` the hom-alt count (``nhomalt``) and allele count
    (``AC``) are read from INFO (population-specific ``_<pop>`` fields unless ``pop`` is
    ``"All"``), turned into per-individual genotypes by ``simulate_genotypes`` and written
    as text with a ``GT:AD:DP:GQ:PL`` FORMAT column. Only the first ALT allele is written.

    :param gnomad_vcf: path to the gnomAD VCF to read
    :param outfile: path of the plain-text VCF to write
    :param pop: population suffix of the INFO fields to use, or ``"All"`` for the global fields
    :param format_fields: passed through to ``generate_putative_GQ_DP``
    :return: number of individuals, as reported by ``get_number_individuals``
    """
    logging.info("Processing gnomAD file")
    nind = get_number_individuals(gnomad_vcf, pop)
    gt_dp, gt_qual = generate_putative_GQ_DP(format_fields, nind)
    vcf_data = VCF(gnomad_vcf, gts012=True)
    try:
        with open(outfile, 'w') as out:
            vcf_data.add_format_to_header({
                'ID': 'GT',
                'Description': 'Genotype',
                'Type': 'String',
                'Number': 1
            })
            vcf_data.add_format_to_header({
                'ID': 'AD',
                'Description':
                'Allelic depths for the ref and alt alleles in the order listed',
                'Type': 'Integer',
                # AD is written below as "ref,alt", i.e. one value per allele including REF.
                'Number': 'R'
            })
            vcf_data.add_format_to_header({
                'ID': 'DP',
                'Description': 'Approximate read depth',
                'Type': 'Integer',
                'Number': 1
            })
            vcf_data.add_format_to_header({
                'ID': 'GQ',
                'Description': 'Genotyp Quality',
                'Type': 'Integer',
                'Number': 1
            })
            vcf_data.add_format_to_header({
                'ID': 'PL',
                'Description':
                'normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification',
                'Type': 'Integer',
                'Number': "G"
            })

            # NOTE(review): range(1, nind) yields nind - 1 sample names, while the
            # per-record arrays below are sized nind; confirm against
            # simulate_genotypes and the #CHROM line of the input header.
            individuals = ["ind_" + str(i) for i in range(1, nind)]
            sample_columns = '\t'.join(individuals)
            final_header = [
                line + '\t' + sample_columns if line.startswith("#CHROM") else line
                for line in vcf_data.raw_header.split("\n")
                if line  # skip empty lines left by the split
            ]
            out.write('\n'.join(final_header) + "\n")
            info_fields = [
                field["ID"] for field in vcf_data.header_iter()
                if field["HeaderType"] == "INFO"
            ]

            for record in vcf_data:
                info = record.INFO
                suffix = "" if pop == "All" else "_" + pop
                nhomalt = info["nhomalt" + suffix]
                nheterozygous = info["AC" + suffix] - (nhomalt * 2)

                record_combined_gt = simulate_genotypes(nind, nhomalt,
                                                        nheterozygous)
                # Defaults describe a confident hom-ref call: PL = 0,1500,1500.
                # (np.int was removed in NumPy 1.24; the builtin int is the same dtype.)
                gt_phred_ll_homref = np.zeros((nind, ), dtype=int)
                gt_phred_ll_het = np.full(shape=nind, fill_value=1500, dtype=int)
                gt_phred_ll_homalt = np.full(shape=nind, fill_value=1500, dtype=int)
                gt_alt_depth = np.zeros((nind, ), dtype=int)
                gt_ref_depth = copy.deepcopy(gt_dp)
                fmt = []
                for i, gt in enumerate(record_combined_gt):
                    if gt == "0/1":  # if het
                        gt_phred_ll_het[i] = 0
                        gt_phred_ll_homref[i] = 1500
                        gt_alt_depth[i] = gt_ref_depth[i] = 50
                    elif gt == "1/1":  # if homalt
                        # The called genotype gets likelihood 0, mirroring the het case;
                        # previously all three PL values ended up as 1500.
                        gt_phred_ll_homalt[i] = 0
                        gt_phred_ll_homref[i] = 1500
                        gt_alt_depth[i] = 100
                        gt_ref_depth[i] = 0

                    fmt.append("{}:{},{}:{}:{}:{},{},{}".format(
                        gt, gt_ref_depth[i], gt_alt_depth[i], gt_dp[i], gt_qual[i],
                        gt_phred_ll_homref[i], gt_phred_ll_het[i],
                        gt_phred_ll_homalt[i]))

                # Re-serialise the INFO fields present on this record; absent ones are skipped.
                str_info = []
                for field_id in info_fields:
                    try:
                        str_info.append(field_id + "=" + str(info[field_id]))
                    except KeyError:
                        continue

                # Stringify QUAL only when present so a missing value becomes '.'
                # instead of the literal text "None".
                qual = record.QUAL
                fixed_columns = [
                    record.CHROM,
                    str(record.POS), record.ID, record.REF, record.ALT[0],
                    None if qual is None else str(qual),
                    record.FILTER, ';'.join(str_info),
                    "GT:AD:DP:GQ:PL"
                ]
                write_record = ['.' if v is None else v for v in fixed_columns]

                out.write('\t'.join(write_record + fmt) + "\n")
    finally:
        # Release the reader even when writing fails part-way through.
        vcf_data.close()
    return nind
Example #11
0
def process_file(data: VCF, groups: list, f: int, fileout: list) -> None:
    """
    Computes and rewrites genotypes of all individuals for all samples from input files
    :param data: cyvcf2 object reader pointing on a VCF-file
    :param groups: samples identifiers split in pools
    :param f: integer, index of the file to process in the list
    :param fileout: VCF-files with simulated pooled or randomly missing genotypes
    """
    #TODO: clean/refactor execution comments like processed file name
    #TODO: refactor processing lis tof files into single file processing + remove MSS param
    # data: VCF, groups: list, simul: str, fileout: str
    print('Simulation type: ', 'simul')
    print('file out: ', os.path.join(os.getcwd(), fileout[f]))  # prm.PATH_OUT[simul]
    if prm.GTGL == 'GL' and prm.unknown_gl == 'adaptative':
        # Declare FORMAT/GL, write only the header with cyvcf2, then append records as raw bytes.
        dic_header = {'ID': 'GL',
                      'Number': 'G',
                      'Type': 'Float',
                      'Description': 'three log10-scaled likelihoods for RR,RA,AA genotypes'}
        data.add_format_to_header(dic_header)
        whead = Writer(fileout[f], data)
        #TODO: whead = Writer(prm.PATH_OUT[simul], data)
        whead.write_header()
        whead.close()
        # Load adaptive GL values for missing data
        df = pd.read_csv(os.path.join(prm.WD, 'adaptive_gls.csv'),
                         header=None,
                         names=['rowsrr', 'rowsra', 'rowsaa', 'colsrr', 'colsra', 'colsaa',
                                'n', 'm',
                                'rr', 'ra', 'aa']
                         )
        # Key: the eight integer columns; value: the [rr, ra, aa] triple of that row.
        df2dict = dict(((int(rwrr), int(rwra), int(rwaa), int(clrr), int(clra), int(claa),
                         int(n), int(m)),
                        [rr, ra, aa]) for rwrr, rwra, rwaa, clrr, clra, claa,
                                          n, m,
                                          rr, ra, aa in df.itertuples(index=False, name=None))

        sig = allfqc.SigmoidInterpolator(os.path.join(prm.PATH_GT_FILES, prm.RAW['gz'].replace('gl', 'gt')),
                                         os.path.join(prm.PATH_GT_FILES, prm.POOLED['gz'].replace('gl', 'gt')))
        params = sig.get_sigmoid_params()
        interp = sig.interpolate_derivative()

        # Open the append handle only once the setup above has succeeded, so a
        # failure while loading the GL table cannot leave it open.
        w = open(fileout[f], 'ab')
        #TODO:  w = open(prm.PATH_OUT[simul], 'ab')

    else:  # prm.GTGL == 'GT' or fixed GL
        w = Writer(fileout[f], data)
        #TODO: w = Writer(prm.PATH_OUT[simul], data)
        w.set_threads(4)
        df2dict = None
        sig = None
        params = None
        interp = None

    tm = time.time()
    try:
        # for n, variant in enumerate(data('20:59973567-59973568')):
        for n, variant in enumerate(data):
            process_line(groups, f, w, variant, df2dict, sig, params, interp)
            if n % 1000 == 0:
                print('{} variants processed in {:06.2f} sec'.format(n+1, time.time()-tm).ljust(80, '.'))
            # if n+1 == 1000:
            #     break
    finally:
        # Always release the output handle, even if a variant fails to process.
        w.close()

    # GL converted from GT, missing GLs will be filled with [0.33, 0.33, 0.33]
    if prm.GTGL == 'GL' and prm.unknown_gl != 'adaptative':
        alltls.file_likelihood_converter(os.path.join(prm.PATH_GT_FILES,
                                                      fileout[f].replace('.gl', '.gt')) + '.gz',  # prm.PATH_OUT[simul]
                                         fileout[f])  # prm.PATH_OUT[simul]