def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populate FORMAT/AF and flag somatic variants with AF < min_allele_fraction.

    Strelka2 doesn't report exact AF for a variant, however it can be
    calculated as alt_counts/dp from existing fields:

    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP        {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP        TIR[0] = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(AD)   AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(AD)   AD = ref_count,alt_counts

    Every record is written to the output. In paired (somatic) mode, records
    whose tumor AF is below the threshold get the ``MinAF`` filter added; they
    are not removed. Records without ``AD`` in germline mode (gVCF blocks) are
    written unchanged.

    :param paired: object exposing ``tumor_data`` for somatic calling, or a
        falsy value for germline calling
    :param items: sequence of sample data; ``items[0]`` is used when ``paired``
        is falsy
    :param in_file: path of the VCF to annotate
    :param out_file: requested output path; the uncompressed ``.vcf`` name is
        derived from it
    :return: whatever ``vcfutils.bgzip_and_index`` returns for the written file
    """
    data = paired.tumor_data if paired else items[0]
    # min_allele_fraction is configured as a percentage; default is 10%.
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s", min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            try:
                tumor_index = vcf.samples.index(data['description'])
                for rec in vcf:
                    if paired:  # somatic
                        if rec.is_snp:
                            # {ALT}U = tier1_depth,tier2_depth; use tier1
                            alt_counts = rec.format(rec.ALT[0] + 'U')[:, 0]
                        else:
                            # TIR = tier1_depth,tier2_depth; use tier1
                            alt_counts = rec.format('TIR')[:, 0]
                        dp = rec.format('DP')[:, 0]
                    else:  # germline
                        ad = rec.format('AD')  # AD=REF,ALT1,ALT2,...
                        if ad is not None:
                            alt_counts = ad[:, 1:]
                            # keepdims gives shape (n_samples, 1) so that the
                            # division below broadcasts per sample (row-wise)
                            # instead of misaligning depths with ALT columns.
                            dp = np.sum(ad, axis=1, keepdims=True)
                        else:  # germline gVCF record
                            alt_counts, dp = None, None
                    if dp is not None:
                        # ignore division by zero and put AF=.0
                        with np.errstate(divide='ignore', invalid='ignore'):
                            af = np.true_divide(alt_counts, dp)
                            af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                        rec.set_format('AF', af)
                        if paired and np.all(af[tumor_index] < min_freq):
                            vcfutils.cyvcf_add_filter(rec, 'MinAF')
                    w.write_record(rec)
            finally:
                # Release the htslib handles even if a record fails to process.
                w.close()
                vcf.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populate FORMAT/AF and flag somatic variants with AF < min_allele_fraction.

    Strelka2 doesn't report exact AF for a variant, however it can be
    calculated as alt_counts/dp from existing fields:

    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP        {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP        TIR[0] = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(AD)   AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(AD)   AD = ref_count,alt_counts

    Every record is written to the output. In paired (somatic) mode, records
    whose tumor AF is below the threshold get the ``MinAF`` filter added; they
    are not removed. Records without ``AD`` in germline mode (gVCF blocks) are
    written unchanged.

    :param paired: object exposing ``tumor_data`` for somatic calling, or a
        falsy value for germline calling
    :param items: sequence of sample data; ``items[0]`` is used when ``paired``
        is falsy
    :param in_file: path of the VCF to annotate
    :param out_file: requested output path; the uncompressed ``.vcf`` name is
        derived from it
    :return: whatever ``vcfutils.bgzip_and_index`` returns for the written file
    """
    data = paired.tumor_data if paired else items[0]
    # min_allele_fraction is configured as a percentage; default is 10%.
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.info("Filtering Strelka2 calls with allele fraction threshold of %s", min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            try:
                tumor_index = vcf.samples.index(data['description'])
                for rec in vcf:
                    if paired:  # somatic
                        if rec.is_snp:
                            # {ALT}U = tier1_depth,tier2_depth; use tier1
                            alt_counts = rec.format(rec.ALT[0] + 'U')[:, 0]
                        else:
                            # TIR = tier1_depth,tier2_depth; use tier1
                            alt_counts = rec.format('TIR')[:, 0]
                        dp = rec.format('DP')[:, 0]
                    else:  # germline
                        ad = rec.format('AD')  # AD=REF,ALT1,ALT2,...
                        if ad is not None:
                            alt_counts = ad[:, 1:]
                            # keepdims gives shape (n_samples, 1) so that the
                            # division below broadcasts per sample (row-wise)
                            # instead of misaligning depths with ALT columns.
                            dp = np.sum(ad, axis=1, keepdims=True)
                        else:  # germline gVCF record
                            alt_counts, dp = None, None
                    if dp is not None:
                        # ignore division by zero and put AF=.0
                        with np.errstate(divide='ignore', invalid='ignore'):
                            af = np.true_divide(alt_counts, dp)
                            af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                        rec.set_format('AF', af)
                        if paired and np.all(af[tumor_index] < min_freq):
                            vcfutils.cyvcf_add_filter(rec, 'MinAF')
                    w.write_record(rec)
            finally:
                # Release the htslib handles even if a record fails to process.
                w.close()
                vcf.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def test_set_format_int_b():
    """An int64 array written to a new Integer FORMAT field reads back unchanged."""
    reader = VCF('{}/test-format-string.vcf'.format(HERE))
    pi_header = {
        "ID": "PI",
        "Number": 1,
        "Type": "Integer",
        "Description": "Int example",
    }
    status = reader.add_format_to_header(pi_header)
    assert status == 0

    first_variant = next(reader)
    written = np.array([855, 11], dtype=np.int64)
    first_variant.set_format("PI", written)

    # Compare the per-sample values returned by get_gt_str, parsed as floats.
    observed = fmap(float, get_gt_str(first_variant, "PI"))
    assert allclose(observed, [855, 11])
def test_set_format_int3():
    "test that we can handle multiple (in this case 3) values per sample"
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="P3", Number=3,
        Type="Integer", Description="Int example")) == 0
    v = next(vcf)
    # np.int was only an alias of the builtin int and no longer exists in
    # NumPy >= 1.24; the builtin gives the identical dtype.
    exp = np.array([[1, 11, 111], [2, 22, 222]], dtype=int)
    v.set_format("P3", exp)
    res = get_gt_str(v, "P3")
    # One comma-joined string per sample is expected back.
    assert res == ["1,11,111", "2,22,222"], (res, str(v))

    assert np.allclose(v.format("P3"), exp)
def test_set_format_float():
    """Float FORMAT values set from default, float64 and float32 arrays read back equal."""
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="PS", Number=1,
        Type="Float", Description="PS example")) == 0
    v = next(vcf)

    # np.float was only an alias of the builtin float and no longer exists in
    # NumPy >= 1.24; the builtin gives the identical dtype.
    v.set_format("PS", np.array([0.555, 1.111], dtype=float))
    assert allclose(fmap(float, get_gt_str(v, "PS")), np.array([0.555, 1.111]))

    v.set_format("PS", np.array([8.555, 11.111], dtype=np.float64))
    assert allclose(fmap(float, get_gt_str(v, "PS")), [8.555, 11.111])

    v.set_format("PS", np.array([9998.555, 99911.111], dtype=np.float32))
    obs = fmap(float, get_gt_str(v, "PS"))
    assert allclose(obs, [9998.555, 99911.111]), obs
def test_set_format_int():
    """Integer FORMAT values set from default, int64 and int32 arrays read back equal."""
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="PI", Number=1,
        Type="Integer", Description="Int example")) == 0
    v = next(vcf)

    # np.int was only an alias of the builtin int and no longer exists in
    # NumPy >= 1.24; the builtin gives the identical dtype.
    v.set_format("PI", np.array([5, 1], dtype=int))
    assert allclose(fmap(float, get_gt_str(v, "PI")), [5, 1])

    v.set_format("PI", np.array([855, 11], dtype=np.int64))
    assert allclose(fmap(float, get_gt_str(v, "PI")), [855, 11])

    v.set_format("PI", np.array([9998, 99911], dtype=np.int32))
    obs = fmap(float, get_gt_str(v, "PI"))
    assert allclose(obs, [9998, 99911]), obs
# Record the reference genome in the header only when one was given.
if genome:
    vcf.add_to_header(f"##reference={genome}")

# Declare the INFO/END and FORMAT/GT fields.
end_info_header = {
    "ID": "END",
    "Number": "1",
    "Type": "Integer",
    "Description": "End position of the variant described in this record"
}
vcf.add_info_to_header(end_info_header)

gt_format_header = {
    "ID": "GT",
    "Number": "1",
    "Type": "String",
    "Description": "Genotype",
}
vcf.add_format_to_header(gt_format_header)

# Add one ##contig line per row of the tab-separated index file; only the
# first two columns (name, length) are used. Contig names are also collected.
contigs = set()
with open(fai) as fai_handle:
    for fai_row in fai_handle:
        columns = fai_row.strip().split("\t")
        contig, length, *_ = columns
        contigs.add(contig)
        vcf.add_to_header(f"##contig=<ID={contig},length={length}>")

# Append the caller-supplied raw header lines last.
for header in headers:
    vcf.add_to_header(header)
def generate_vcf(gnomad_vcf, outfile, pop, format_fields):
    """Write a VCF with simulated per-individual genotypes for a gnomAD file.

    For every record of ``gnomad_vcf`` the number of hom-alt and heterozygous
    carriers is derived from the INFO fields (``nhomalt``/``AC``, or their
    ``_<pop>`` variants), genotypes are produced by ``simulate_genotypes``,
    and a ``GT:AD:DP:GQ:PL`` column is written per individual. Only the first
    ALT allele of each record is written.

    :param gnomad_vcf: path of the input gnomAD VCF
    :param outfile: path of the plain-text VCF to write
    :param pop: ``"All"`` to use the global counts, otherwise the suffix of the
        population-specific INFO keys
    :param format_fields: forwarded to ``generate_putative_GQ_DP``
    :return: the number of individuals reported by ``get_number_individuals``
    """
    logging.info("Processing gnomAD file")
    nind = get_number_individuals(gnomad_vcf, pop)
    gt_dp, gt_qual = generate_putative_GQ_DP(format_fields, nind)
    vcf_data = VCF(gnomad_vcf, gts012=True)
    try:
        with open(outfile, 'w') as out:
            vcf_data.add_format_to_header({
                'ID': 'GT',
                'Description': 'Genotype',
                'Type': 'String',
                'Number': 1
            })
            vcf_data.add_format_to_header({
                'ID': 'AD',
                'Description': 'Allelic depths for the ref and alt alleles in the order listed',
                'Type': 'Integer',
                # One value per allele (ref + alt) is written below, so the
                # field must be declared as R rather than a single value.
                'Number': 'R'
            })
            vcf_data.add_format_to_header({
                'ID': 'DP',
                'Description': 'Approximate read depth',
                'Type': 'Integer',
                'Number': 1
            })
            vcf_data.add_format_to_header({
                'ID': 'GQ',
                'Description': 'Genotyp Quality',
                'Type': 'Integer',
                'Number': 1
            })
            vcf_data.add_format_to_header({
                'ID': 'PL',
                'Description': 'normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification',
                'Type': 'Integer',
                'Number': "G"
            })

            # NOTE(review): range(1, nind) yields nind - 1 sample names while
            # the per-record loop below emits one column per simulated
            # genotype, and no FORMAT column name is added to the #CHROM line.
            # Confirm against simulate_genotypes / the input header that the
            # header and record column counts agree.
            individuals = ["ind_" + str(i) for i in range(1, nind, 1)]
            header = filter(None, vcf_data.raw_header.split("\n"))
            final_header = [
                line + '\t' + '\t'.join(individuals)
                if line.startswith("#CHROM") else line for line in header
            ]
            out.write('\n'.join(final_header) + "\n")

            info_fields = [
                field["ID"] for field in vcf_data.header_iter()
                if field["HeaderType"] == "INFO"
            ]

            for record in vcf_data:
                info = record.INFO
                if pop == "All":
                    nhomalt = info["nhomalt"]
                    nheterozygous = info["AC"] - (nhomalt * 2)
                else:
                    nhomalt = info["nhomalt" + "_" + pop]
                    nheterozygous = info["AC" + "_" + pop] - (nhomalt * 2)

                record_combined_gt = simulate_genotypes(nind, nhomalt,
                                                        nheterozygous)

                # Defaults describe a hom-ref call: PL = 0,1500,1500 and all
                # depth on the reference allele. np.int no longer exists in
                # NumPy >= 1.24; the builtin int is the identical dtype.
                gt_phred_ll_homref = np.zeros((nind, ), dtype=int)
                gt_phred_ll_het = np.full(shape=nind, fill_value=1500, dtype=int)
                gt_phred_ll_homalt = np.full(shape=nind, fill_value=1500, dtype=int)
                gt_alt_depth = np.zeros((nind, ), dtype=int)
                gt_ref_depth = copy.deepcopy(gt_dp)

                fmt = []
                for i, gt in enumerate(record_combined_gt):
                    if gt == "0/1":  # het
                        gt_phred_ll_het[i] = 0
                        gt_phred_ll_homref[i] = 1500
                        gt_alt_depth[i] = gt_ref_depth[i] = 50
                    elif gt == "1/1":  # hom-alt
                        # The called genotype carries PL 0; previously all
                        # three likelihoods were left at 1500 for hom-alt.
                        gt_phred_ll_homalt[i] = 0
                        gt_phred_ll_homref[i] = 1500
                        gt_alt_depth[i] = 100
                        gt_ref_depth[i] = 0

                    fmt.append("{}:{},{}:{}:{}:{},{},{}".format(
                        gt, gt_ref_depth[i], gt_alt_depth[i], gt_dp[i],
                        gt_qual[i], gt_phred_ll_homref[i], gt_phred_ll_het[i],
                        gt_phred_ll_homalt[i]))

                # Re-serialise the INFO keys declared in the header, skipping
                # those absent from this record.
                str_info = []
                for key in info_fields:
                    try:
                        value = info[key]
                    except KeyError:
                        continue
                    str_info.append(key + "=" + str(value))

                # Keep a missing QUAL as None so it becomes '.' below instead
                # of the literal text "None".
                qual = None if record.QUAL is None else str(record.QUAL)
                write_record = [
                    '.' if v is None else v for v in [
                        record.CHROM,
                        str(record.POS), record.ID, record.REF, record.ALT[0],
                        qual, record.FILTER, ';'.join(str_info),
                        "GT:AD:DP:GQ:PL"
                    ]
                ]
                out.write('\t'.join(write_record + fmt) + "\n")
    finally:
        # Close the reader even when writing fails part-way through.
        vcf_data.close()
    return nind
def process_file(data: VCF, groups: list, f: int, fileout: list) -> None:
    """
    Computes and rewrites genotypes of all individuals for all samples from input files

    Every variant read from ``data`` is handed to ``process_line`` together
    with the output handle. The handle is a binary append-mode file when
    adaptive genotype likelihoods are used, and a cyvcf2 ``Writer`` otherwise.

    :param data: cyvcf2 object reader pointing on a VCF-file
    :param groups: samples identifiers split in pools
    :param f: integer, index of the file to process in the list
    :param fileout: VCF-files with simulated pooled or randomly missing genotypes
    """
    # TODO: clean/refactor execution comments like processed file name
    # TODO: refactor processing list of files into single file processing + remove MSS param
    # TODO: write to prm.PATH_OUT[simul] instead of fileout[f]
    out_path = fileout[f]
    print('Simulation type: ', 'simul')
    print('file out: ', os.path.join(os.getcwd(), out_path))

    # Only the adaptive-GL mode needs the lookup table and the interpolator.
    df2dict = None
    sig = None
    params = None
    interp = None

    if prm.GTGL == 'GL' and prm.unknown_gl == 'adaptative':
        gl_format = {
            'ID': 'GL',
            'Number': 'G',
            'Type': 'Float',
            'Description': 'three log10-scaled likelihoods for RR,RA,AA genotypes',
        }
        data.add_format_to_header(gl_format)

        # Write the header through cyvcf2, then reopen the same file in
        # binary append mode for the records.
        header_writer = Writer(out_path, data)
        header_writer.write_header()
        header_writer.close()
        w = open(out_path, 'ab')

        # Load adaptive GL values for missing data
        column_names = ['rowsrr', 'rowsra', 'rowsaa',
                        'colsrr', 'colsra', 'colsaa',
                        'n', 'm',
                        'rr', 'ra', 'aa']
        gl_table = pd.read_csv(os.path.join(prm.WD, 'adaptive_gls.csv'),
                               header=None,
                               names=column_names)
        # Key: the eight integer columns; value: the three likelihoods.
        df2dict = {
            (int(rwrr), int(rwra), int(rwaa),
             int(clrr), int(clra), int(claa),
             int(n), int(m)): [rr, ra, aa]
            for rwrr, rwra, rwaa, clrr, clra, claa, n, m, rr, ra, aa
            in gl_table.itertuples(index=False, name=None)
        }

        raw_gt_path = os.path.join(prm.PATH_GT_FILES, prm.RAW['gz'].replace('gl', 'gt'))
        pooled_gt_path = os.path.join(prm.PATH_GT_FILES, prm.POOLED['gz'].replace('gl', 'gt'))
        sig = allfqc.SigmoidInterpolator(raw_gt_path, pooled_gt_path)
        params = sig.get_sigmoid_params()
        interp = sig.interpolate_derivative()
    else:
        # prm.GTGL == 'GT' or fixed GL
        w = Writer(out_path, data)
        w.set_threads(4)

    started = time.time()
    for n, variant in enumerate(data):
        process_line(groups, f, w, variant, df2dict, sig, params, interp)
        if n % 1000 == 0:
            elapsed = time.time() - started
            progress = '{} variants processed in {:06.2f} sec'.format(n+1, elapsed)
            print(progress.ljust(80, '.'))
    w.close()

    # GL converted from GT, missing GLs will be filled with [0.33, 0.33, 0.33]
    if prm.GTGL == 'GL' and prm.unknown_gl != 'adaptative':
        gt_source = os.path.join(prm.PATH_GT_FILES,
                                 fileout[f].replace('.gl', '.gt')) + '.gz'
        alltls.file_likelihood_converter(gt_source, fileout[f])