def sv(args):
    """Write SV records that are supported in exactly one sample.

    Reads the VCF at ``args.i`` and the sample map at ``args.m``; writes to
    ``args.o`` (``/dev/stdout`` when ``args.o`` is falsy).  For every record,
    a sample qualifies when its first ``SU`` value is at least 5, its first
    ``AB`` value is at least 0.15, and no other sample has a first ``AB``
    value above 0.  The record is then annotated with the qualifying
    sample's metadata and written.
    """
    # gts012 sets the uncalled GTs to 3
    vcf_in = cyvcf2.VCF(args.i, gts012=True)
    sample_names = vcf_in.samples
    sample_map, _animal_map = load_sample_map(args.m)

    # New INFO tags, declared in the order they should appear in the header.
    info_tags = (
        ("UNIQ", "String", "Sample(s) with unique somatic variant"),
        ("UAB", "Float", "Allele Balance in UNIQ sample"),
        ("TISSUE", "String", "Source Tissue Type"),
        ("CASE", "String", "Control or SCNT?"),
        ("EXPT", "String", "Experiment"),
        ("ANIMAL", "String", "ID of origin animal"),
    )
    for tag_id, tag_type, tag_desc in info_tags:
        vcf_in.update(tag_id, tag_type, 1, tag_desc)

    vcf_out = cyvcf2.Writer(args.o if args.o else "/dev/stdout", vcf_in)

    min_su = 5
    n_samples = len(sample_names)
    for record in vcf_in:
        support = record.format('SU')
        balance = record.format('AB')
        for idx, name in enumerate(sample_names):
            # Written as a negated conjunction so NaN values are skipped.
            if not (support[idx][0] >= min_su and balance[idx][0] >= 0.15):
                continue
            seen_elsewhere = any(
                balance[other][0] > 0.0
                for other in range(n_samples)
                if other != idx
            )
            if seen_elsewhere:
                continue
            meta = sample_map[name]
            record.INFO['TISSUE'] = meta['Source']
            record.INFO['CASE'] = meta['Case']
            record.INFO['EXPT'] = meta['Experiment']
            record.INFO['UNIQ'] = name
            record.INFO['UAB'] = str(numpy.around(balance[idx][0], 3))
            record.INFO['ANIMAL'] = meta['Animal']
            vcf_out.write_record(record)
    vcf_out.close()
def filter_to_pass_and_reject(in_file, paired, out_dir=None):
    """Keep only strict PASS/REJECT calls: somatic plus candidate germline.

    Records carrying any filter other than PASS, "." or REJECT are dropped
    unless flagged germline in INFO.  REJECT/germline records are kept only
    when the LOH check succeeds, in which case they are re-labelled PASS and
    tagged with DB.  Everything else (somatic) is written unchanged.
    Returns the path of the bgzipped output.
    """
    from bcbio.heterogeneity import bubbletree

    out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if utils.file_uptodate(out_file, in_file):
        return out_file

    ignorable = ("PASS", ".", "REJECT")
    with file_transaction(paired.tumor_data, out_file) as tx_out_file:
        max_depth = bubbletree.max_normal_germline_depth(
            in_file, bubbletree.PARAMS, paired)
        tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
        with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
            reader = _add_db_to_header(reader)
            with contextlib.closing(
                    cyvcf2.Writer(tx_out_plain, reader)) as writer:
                for rec in reader:
                    raw_filter = rec.FILTER
                    filters = raw_filter.split(";") if raw_filter else []
                    has_other = any(f not in ignorable for f in filters)
                    # Low-quality, non-germline call: drop.
                    if has_other and not bubbletree.is_info_germline(rec):
                        continue
                    # Somatic, always include
                    if ("REJECT" not in filters
                            and not bubbletree.is_info_germline(rec)):
                        writer.write_record(rec)
                        continue
                    # Germline, check if we should include based on
                    # frequencies
                    stats = bubbletree._is_possible_loh(
                        rec, reader, bubbletree.PARAMS, paired,
                        use_status=True, max_normal_depth=max_depth)
                    if stats:
                        rec.FILTER = "PASS"
                        rec.INFO["DB"] = True
                        writer.write_record(rec)
        vcfutils.bgzip_and_index(tx_out_plain, paired.tumor_data["config"])
    return out_file
def unphase(inVcf, outVcf):
    """Randomly swap the two alleles of each genotype and write a new VCF.

    The input is first read with scikit-allel only to check that all sites
    are biallelic and to count the records; that count sizes the random
    flip matrix.  The file is then streamed with cyvcf2 and, for every
    record, the samples selected by the random matrix have genotype
    entries 0 and 1 exchanged.

    Args:
        inVcf: path of the VCF to read.
        outVcf: path of the VCF to write.

    Raises:
        AssertionError: if any site has more or fewer than one ALT allele.
    """
    # read the vcf with scikit-allel, just to get number of snps
    print("[GET_NR_SNPS]")
    print(f"Reading: {inVcf}")
    startTime = time.perf_counter()
    callset = allel.read_vcf(inVcf)
    print(f"Took {(time.perf_counter() - startTime):.2f} seconds.")

    # no tri-allelic?
    assert (sum(callset["variants/ALT"][:, 2] != '') == 0)
    assert (sum(callset["variants/ALT"][:, 1] != '') == 0)
    assert (sum(callset["variants/ALT"][:, 0] == '') == 0)

    snpsInFile = callset["calldata/GT"].shape[0]
    print(snpsInFile)
    print("[DONE]")

    print("[UNPHASE]")
    print(f"File to unphase: {inVcf}")
    print(f"Unphased output written to: {outVcf}")

    vcfIFS = cyvcf2.VCF(inVcf)
    try:
        # one random bit per (individual, record): 1 means "flip"
        numIndividuals = len(vcfIFS.samples)
        randomness = numpy.random.randint(
            2, size=(numIndividuals, snpsInFile))

        # create a new vcf Writer using the input vcf as a template.
        vcfOFS = cyvcf2.Writer(outVcf, vcfIFS)
        try:
            for count, v in enumerate(vcfIFS, start=1):
                # Fetch the genotype list ONCE, edit that list, and assign
                # it back.  Swapping through repeated `v.genotypes[...]`
                # accesses only works if the property hands back the same
                # list object every time, which is not something to rely on.
                gts = v.genotypes
                for idx in numpy.flatnonzero(randomness[:, count - 1] == 1):
                    gt = gts[idx]
                    gt[0], gt[1] = gt[1], gt[0]
                v.genotypes = gts

                vcfOFS.write_record(v)
                if count % 100000 == 0:
                    print(count)
        finally:
            vcfOFS.close()
    finally:
        vcfIFS.close()

    print("[DONE]")
def main():
    """Annotate each variant with panel-of-normals statistics.

    Adds the INFO fields PON_VAF, PON_DEPTH and PON_VC (values returned by
    ``calculate_vaf`` for the variant position across the normal BAMs) and
    writes the records to ``<output_dir>/<name>.pon.vcf``.
    """
    vcf, normal_bams, output_dir, reference = argument_parser()
    vcf_handle = cyvcf2.VCF(vcf)

    new_info_fields = (
        ('PON_VAF', 'VAF in Panel of Normals'),
        ('PON_DEPTH', 'Total depth in Panel of Normals'),
        ('PON_VC', 'Total variant read count in Panel of Normals'),
    )
    for field_id, description in new_info_fields:
        vcf_handle.add_info_to_header({
            'ID': field_id,
            'Description': description,
            'Type': 'Float',
            'Number': '1'
        })

    output_vcf = os.path.join(
        output_dir, re.sub(r'.vcf$', '.pon.vcf', os.path.basename(vcf)))
    output_handle = cyvcf2.Writer(output_vcf, vcf_handle)
    try:
        for variant in vcf_handle:
            variant_position = f'{variant.CHROM}:{variant.POS}-{variant.POS}'
            pon_vafs, total_depths, mismatches = calculate_vaf(
                normal_bams, variant_position, reference)
            # NOTE(review): values are stored as strings although the header
            # declares Float - confirm what calculate_vaf returns.
            variant.INFO['PON_VAF'] = str(pon_vafs)
            variant.INFO['PON_DEPTH'] = str(total_depths)
            variant.INFO['PON_VC'] = str(mismatches)
            output_handle.write_record(variant)
    finally:
        # Close explicitly so the output is flushed deterministically
        # instead of whenever the objects happen to be finalized.
        output_handle.close()
        vcf_handle.close()
def _remove_prioritization(in_file, data, out_dir=None):
    """Remove tumor-only prioritization and return non-filtered calls.

    Writes ``<in_file base>-germline.vcf`` (optionally relocated into
    ``out_dir``) with every record passed through
    ``_update_prioritization_filters``.  Nothing is done when either the
    plain or the ``.gz`` output is already up to date.  Returns the output
    path.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file) and not utils.file_uptodate(
            out_file + ".gz", in_file):
        with file_transaction(data, out_file) as tx_out_file:
            # Close the reader as well as the writer once we are done.
            with contextlib.closing(cyvcf2.VCF(str(in_file))) as reader:
                # The header must be extended before the writer copies it.
                reader.add_filter_to_header({
                    'ID': 'Somatic',
                    'Description': 'Variant called as Somatic'
                })
                with contextlib.closing(
                        cyvcf2.Writer(tx_out_file, reader)) as writer:
                    for rec in reader:
                        rec = _update_prioritization_filters(rec)
                        writer.write_record(rec)
    return out_file
def _flip_allele(allele):
    """Swap allele index 0 <-> 1, leaving negative (missing) values alone."""
    return allele if allele < 0 else int(not allele)


def variants(args):
    """subroutine for variants subcommand

    Streams ``args.vcf`` to stdout, keeping biallelic SNPs for which the
    ancestor provides a mutation type.  Each kept record gets a
    ``mutation_type`` INFO value (``<ancestral kmer>><derived kmer>``), is
    polarized when its ALT allele is the ancestral one (AC/AF and the
    genotypes are flipped), and has REF/ALT rewritten to the ancestral and
    derived alleles.

    Raises:
        ValueError: on a ploidy other than 1 or 2, or when the ancestral
            allele matches neither REF nor ALT.
    """
    ancestor = setup_ancestor(args)

    vcf = cyvcf2.VCF(args.vcf)
    # NOTE(review): the value is a multi-character string; 'String' may be
    # the more accurate header Type - left unchanged to keep output stable.
    vcf.add_info_to_header({
        'ID': 'mutation_type',
        'Description': f'ancestral {args.k}-mer mutation type',
        'Type': 'Character',
        'Number': '1'
    })
    vcf_writer = cyvcf2.Writer('-', vcf)
    vcf_writer.write_header()

    # this line required to exit on a SIGTERM in a pipe, e.g. from head
    # (installed once up front; the handler does not change per record)
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    try:
        for variant in vcf:
            # biallelic snps only
            if not (variant.is_snp and len(variant.ALT) == 1):
                continue
            # mutation type as ancestral kmer and derived kmer
            anc_kmer, der_kmer = ancestor.mutation_type(
                variant.CHROM, variant.start, variant.REF, variant.ALT[0])
            if anc_kmer is None or der_kmer is None:
                continue
            mutation_type = f'{anc_kmer}>{der_kmer}'
            variant.INFO['mutation_type'] = mutation_type
            # ancestral allele
            AA = ancestor[variant.CHROM][variant.start].seq
            # polarize genotypes (and associated INFO) if alternative
            # allele is ancestral
            if variant.ALT[0] == AA:
                variant.INFO['AC'] = variant.INFO['AN'] - variant.INFO['AC']
                variant.INFO['AF'] = variant.INFO['AC'] / variant.INFO['AN']
                # cyvcf2 docs say we need to reassign genotypes like this
                # for the change to propagate (can't just update indexwise).
                # Missing alleles are kept missing rather than turned into
                # a called 0.
                if variant.ploidy == 2:
                    # diploid
                    variant.genotypes = [
                        [_flip_allele(gt[0]), _flip_allele(gt[1]), gt[2]]
                        for gt in variant.genotypes
                    ]
                elif variant.ploidy == 1:
                    # haploid
                    variant.genotypes = [
                        [_flip_allele(gt[0]), gt[1]]
                        for gt in variant.genotypes
                    ]
                else:
                    raise ValueError(f"invalid ploidy {variant.ploidy}")
            elif not variant.REF == AA:
                raise ValueError(f'ancestral allele {AA} is not equal to '
                                 f'reference {variant.REF} or alternative '
                                 f'{variant.ALT[0]}')
            # set REF to ancestral allele and ALT to derived allele
            variant.REF = anc_kmer[ancestor.target]
            variant.ALT = der_kmer[ancestor.target]
            vcf_writer.write_record(variant)
    finally:
        vcf_writer.close()
        vcf.close()
def main():
    """Write the variants whose PON_VAF is below the requested threshold.

    Output goes to ``<output_dir>/<name>.filtered.vcf``.
    """
    # Local renamed so the builtin `input` is not shadowed.
    input_vcf, vaf_threshold, output_dir = argument_parser()
    outputfile = os.path.join(
        output_dir,
        re.sub('.vcf$', '.filtered.vcf', os.path.basename(input_vcf)))
    vcf_handle = cyvcf2.VCF(input_vcf)
    print(vcf_handle)
    writer = cyvcf2.Writer(outputfile, vcf_handle)
    try:
        # Iterate the handle the writer was built from instead of opening
        # the file a second time (the second handle was never closed).
        for variant in vcf_handle:
            if variant.INFO['PON_VAF'] < vaf_threshold:
                writer.write_record(variant)
    finally:
        writer.close()
        vcf_handle.close()
def main():
    """Extract variants matching the trinucleotide/alt-base selection.

    A variant is written to ``<output_dir>/<name>.sig9.vcf`` when
    ``get_trinucleotide`` reports a context of TTT, TTA or CTT together
    with an alternative base of G.
    """
    input_vcf, reference, output_dir = argument_parser()
    output_vcf = os.path.join(
        output_dir,
        re.sub('.vcf$', '.sig9.vcf', os.path.basename(input_vcf)))
    vcf_handle = cyvcf2.VCF(input_vcf)
    output_vcf_handle = cyvcf2.Writer(output_vcf, vcf_handle)
    wanted_contexts = ('TTT', 'TTA', 'CTT')
    try:
        # Reuse the template handle rather than opening the input twice.
        for variant in vcf_handle:
            var_position = Position(variant.CHROM, variant.POS, variant.POS)
            refbase, altbase, var_trinucleotide = get_trinucleotide(
                var_position, variant.REF, variant.ALT[0], reference)
            if var_trinucleotide in wanted_contexts and altbase == 'G':
                output_vcf_handle.write_record(variant)
    finally:
        # Explicit close so the output is flushed deterministically.
        output_vcf_handle.close()
        vcf_handle.close()
def _extract_germline(in_file, data):
    """Extract germline calls non-somatic, non-filtered calls.

    Writes ``<in_file base>-germline.vcf`` with every record passed through
    ``_update_germline_filters``; skipped when the plain or ``.gz`` output
    is already up to date.  Returns the output path.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file) and not utils.file_uptodate(
            out_file + ".gz", in_file):
        with file_transaction(data, out_file) as tx_out_file:
            # Close the reader too, not only the writer.
            with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
                # Extend the header before the writer copies it.
                reader.add_filter_to_header({
                    'ID': 'Somatic',
                    'Description': 'Variant called as Somatic'
                })
                with contextlib.closing(
                        cyvcf2.Writer(tx_out_file, reader)) as writer:
                    for rec in reader:
                        writer.write_record(_update_germline_filters(rec))
    return out_file
def filter_to_pass_and_reject(in_file, data, out_dir=None):
    """Keep only records whose filters are limited to PASS, "." or REJECT.

    The result is written next to ``in_file`` (or into ``out_dir``) as
    ``*-prfilter.vcf.gz`` and indexed.  Returns the output path.
    """
    out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if utils.file_uptodate(out_file, in_file):
        return out_file

    allowed = ("PASS", ".", "REJECT")
    with file_transaction(data, out_file) as tx_out_file:
        tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
        with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
            with contextlib.closing(
                    cyvcf2.Writer(tx_out_plain, reader)) as writer:
                for rec in reader:
                    raw_filter = rec.FILTER
                    names = raw_filter.split(";") if raw_filter else []
                    # Any filter outside the allowed set drops the record.
                    if all(name in allowed for name in names):
                        writer.write_record(rec)
        vcfutils.bgzip_and_index(tx_out_plain, data["config"])
    return out_file
for email in scored_variants[key]['email']: #latest_timestamp = datetime.datetime.min latest_timestamp = 0 answer = '' #find latest answer for each user for entry in scored_variants[key]['email'][email]: if entry[1] > latest_timestamp: answer = entry[0] latest_timestamp = entry[1] scored_variants[key]['score_fields'][answer] += 1 scored_variants[key]['score_fields']['scorer_count'] += 1 vcf = cyvcf2.VCF(os.path.expanduser(args.vcf)) vcf.add_info_to_header({"ID": "SVPD", "Description": "Details of SV-plaudit scorer count and scores in the format COUNT|SCORE1,SCORE2,SCOREN. Answers the question: `" + question + "` Available answers were as follows: `" + "`; `".join(answers) + "`", "Type":'Character', 'Number':'1'}) vcf.add_info_to_header({"ID": "SVP", "Description": "SV-plaudit curation score, the " + args.operation + " of scores for that entry where the values of the following curation answers: `" + "`; `".join(answers) + "` are " + ",".join(args.number_map), "Type":'Float', 'Number':'1'}) writer = cyvcf2.Writer(args.annotated_outfile, vcf) for variant in vcf: if variant.INFO.get('END'): key = variant.INFO.get('SVTYPE') + '_' + \ variant.CHROM + '_' + \ str(variant.POS) + '-' + \ str(variant.INFO.get('END')) if key in scored_variants: vcf_annotation = str(scored_variants[key]['score_fields']['scorer_count']) + "|" for answer in answers: vcf_annotation += str(scored_variants[key]['score_fields'][answer]) + "," vcf_annotation = vcf_annotation[:-1] if args.operation: score_counts = vcf_annotation.split("|")[1].split(",") score_values = []
# Load Data if args.sample is not None: vcf = cyvcf2.VCF(args.vcf, samples=args.sample) else: vcf = cyvcf2.VCF(args.vcf) # Sample name if len(vcf.samples) > 1: sys.stderr.write("Error: " + str(len(vcf.samples)) + " sample detected. This version is designed for a single sample !") sys.exit(-1) # Ouptuts if args.export: wx = cyvcf2.Writer(re.sub(r'\.vcf$|\.vcf.gz$|\.bcf', '_export.vcf', os.path.basename(args.vcf)), vcf) if args.debug: vcf.add_info_to_header({'ID': 'TMB_FILTERS', 'Description': 'Detected filters for TMB calculation', 'Type': 'Character', 'Number': '1'}) wd = cyvcf2.Writer(re.sub(r'\.vcf$|\.vcf.gz$|\.bcf', '_debug.vcf', os.path.basename(args.vcf)), vcf) # Load config dbFlags = loadConfig(args.dbConfig) callerFlags = loadConfig(args.varConfig) # Genome size if args.effGenomeSize is None: if args.bed is not None: effGS = getEffGenomeSizeFromBed(args.bed) else:
return (rec) if __name__ == "__main__": args = argsParse() ## Loading Data vcf = cyvcf2.VCF(args.vcf) rec = loadRec(args.rec) ## rec file vcf.add_info_to_header({ 'ID': 'RUNREC', 'Description': 'Run recurrence', 'Type': 'Character', 'Number': '1' }) w = cyvcf2.Writer(args.out, vcf) for variant in vcf: k = str(variant.CHROM) + ":" + str(variant.start + 1) + "-" + str( variant.end) if k in rec: variant.INFO["RUNREC"] = rec[k] w.write_record(variant) w.close() vcf.close()
def snp(args):
    """Write SNP/indel records that are unique to a single sample.

    Inputs come from ``args``: ``r`` (reference FASTA), ``i`` (input VCF),
    ``m`` (sample map), ``o`` (output path, ``/dev/stdout`` when falsy) and
    the optional ``vaf`` / ``mvaf`` minimum allele-balance overrides.

    A sample is considered to carry the variant when its depth lies strictly
    between 10 and 250, its AAG/RR likelihood ratio is at least 1e10 and its
    genotype type equals the expected one (1, or 2 for a male sample on
    X/Y).  The variant is unique when no other sample exceeds the maximum
    allele balance or falls below the required RR/AAG ratio, and every
    same-animal sample has usable depth.  Unique records are annotated,
    optionally tagged with the LowVAF / MGP filters, and written.
    """
    genome = pyfaidx.Fasta(args.r)
    #should look specifically at X chrom VAFs in males
    VAF = 0.30
    MVAF = 0.95
    if args.vaf:
        VAF = float(args.vaf)
    if args.mvaf:
        MVAF = float(args.mvaf)

    sample_map, animal_map = load_sample_map(args.m)
    #gts012 sets the uncalled GTs to 3
    reader = cyvcf2.VCF(args.i, gts012=True)
    #get list of samples
    samples = reader.samples

    #add the new INFO tags
    reader.update("UNIQ", "String", 1, "Sample(s) with unique somatic variant")
    reader.update("UAB", "Float", 1, "Allele Balance in UNIQ sample")
    reader.update("TYPE", "String", 1, "Varant Type (SNPS: TS/TV) (INDELS: INS/DEL)")
    reader.update("TISSUE", "String", 1, "Source Tissue Type")
    reader.update("CASE", "String", 1, "Control or SCNT?")
    reader.update("EXPT", "String", 1, "Experiment")
    reader.update("CONTEXT", "String", 1, "Trinucleotide Context")
    reader.update("ANIMAL", "String", 1, "ID of origin animal")
    reader.update("UDP", "Integer", 1, "Depth at uniq site")
    reader.update("AAGR", "Float", 1, "AAG/RR Ratio")
    reader.update("UGQ", "Float", 1, "Uniq Genotype Quality")
    reader.add_filter_to_header({"ID":"LowVAF", "Description":"Somatic VAF below threshold"})
    reader.add_filter_to_header({"ID":"MGP", "Description":"Variant present in MGP"})

    if not args.o:
        writer = cyvcf2.Writer("/dev/stdout", reader)
    else:
        writer = cyvcf2.Writer(args.o, reader)

    AAG_RR_MIN = numpy.power(10.,10.)
    RR_AAG_MIN = numpy.power(10.,5.)
    min_depth = 10
    max_depth = 250
    allosomes = set(["X", "Y"])

    try:
        #iterate over vars
        for var in reader:
            unique = True
            # set max alt VAF for snp or indel
            if var.is_snp:
                MAX_VAF = 0.05
            elif var.is_indel:
                MAX_VAF = 0.00
            else:
                # newline added so consecutive messages do not run together
                sys.stderr.write("Skipping Variant: Not SNP/Indel\n")
                continue

            #get RR and AAG genotype likelihoods
            RR_PLs = unphred(var.gt_phred_ll_homref)
            AAG_PLs = unphred(var.gt_phred_ll_het)
            #get AAG/RR and RR/AAG likelihood ratios
            AAG_RR_ratios = numpy.true_divide(AAG_PLs, RR_PLs)
            RR_AAG_ratios = numpy.true_divide(RR_PLs, AAG_PLs)
            #get genotypes, depths, and alt allele depths
            GTs = var.gt_types
            DEPTHS = var.gt_depths
            ALT_DEPTHS = var.gt_alt_depths
            QUALS = var.gt_quals
            #get allele balances
            ABs = numpy.true_divide(ALT_DEPTHS, DEPTHS)

            for i, sample in enumerate(samples):
                # dont waste time in uneeded loops
                if not unique:
                    break
                AAG = 1
                MIN_VAF = VAF
                #change AAG to 1/1 and min vaf to male allosome min [0.95]
                if sample_map[sample]['Sex'] == "M" and var.CHROM in allosomes:
                    AAG = 2
                    MIN_VAF = MVAF

                #criteria for presence in given sample
                if not (max_depth > DEPTHS[i] > min_depth
                        and AAG_RR_ratios[i] >= AAG_RR_MIN
                        and GTs[i] == AAG):
                    continue
                VAF_FILT = bool(ABs[i] < MIN_VAF)

                for j in range(len(samples)):
                    if i == j:
                        continue
                    #default RR/AAG min is 1.
                    ratio_min = 1
                    #if same animal
                    if same_animal(sample_map, samples, i, j):
                        if not max_depth > DEPTHS[j] > min_depth:
                            unique = False
                            break
                        #if different case, set ratio_min to control RR_AAG min
                        #thus, SCNT lines for the same animal are treated as
                        #the control for controls
                        if sample_map[sample]['Case'] != sample_map[samples[j]]['Case']:
                            ratio_min = RR_AAG_MIN
                    #criteria for failing or presence in other samples
                    if (ABs[j] > MAX_VAF or RR_AAG_ratios[j] < ratio_min):
                        unique = False
                        break

                if not unique:
                    continue

                #get trinucleotide context (VCF coords are 1-based)
                if var.is_snp:
                    context = genome[str(var.CHROM)][var.POS-2:var.POS+1]
                    #ts or tv?
                    tstv = 'Tv'
                    if var.is_transition:
                        tstv = 'Tr'
                    var.INFO['CONTEXT'] = context.seq
                    var.INFO['TYPE'] = tstv
                var.INFO['TISSUE'] = sample_map[sample]['Source']
                var.INFO['CASE'] = sample_map[sample]['Case']
                var.INFO['EXPT'] = sample_map[sample]['Experiment']
                var.INFO['UNIQ'] = sample
                var.INFO['UAB'] = str(numpy.around(ABs[i], 3))
                var.INFO['UDP'] = str(DEPTHS[i])
                var.INFO['AAGR'] = str(AAG_RR_ratios[i])
                var.INFO['UGQ'] = str(QUALS[i])
                var.INFO['ANIMAL'] = sample_map[sample]['Animal']

                filters = []
                f = var.FILTER
                if f:
                    filters = f.split(";")
                if VAF_FILT:
                    filters.append("LowVAF")
                # INFO lookup raises KeyError when the MGP tag is absent
                try:
                    var.INFO["MGP"]
                    filters.append("MGP")
                except KeyError:
                    pass
                if filters:
                    var.FILTER = filters
                #write record
                writer.write_record(var)
    finally:
        # flush what was written even if a record fails mid-stream
        writer.close()
def snp_fnr(args):
    """Count germline-like variants per sample and report false-negative rates.

    Reads the VCF ``args.i`` and sample map ``args.m``; writes annotated
    records to ``args.o`` (``/dev/stdout`` when falsy) and a tab-separated
    report to ``args.c``.

    For each animal, a variant is counted for the control when any of the
    animal's samples has genotype type 1.  It is counted for an SCNT sample
    when both that sample and the control have depth within [10, 250], the
    sample's AAG/RR ratio is at least 1e10, its allele balance is at least
    0.30 and its genotype type is 1.  The per-animal FNR is the mean over
    SCNT samples of ``1 - called / control_count``; it is reported as
    ``nan`` when the control count is zero.
    """
    #only concerned with autosome VAF cutoff
    VAF = 0.30
    #gts012 sets the uncalled GTs to 3
    reader = cyvcf2.VCF(args.i, gts012=True)
    reader.update("ANIMAL", "String", ".", "Animals with this GSS var")
    reader.update("PRESENT", "String", ".", "SCNT lines detecting this GSS var")

    #open writer
    if not args.o:
        writer = cyvcf2.Writer("/dev/stdout", reader)
    else:
        writer = cyvcf2.Writer(args.o, reader)
    counts_out = open(args.c, 'w')

    try:
        #get list of samples
        samples = reader.samples
        #sample to index map
        stoi = {s: idx for idx, s in enumerate(samples)}
        #load sample map and get animal sample groups
        sample_map, animal_map = load_sample_map(args.m)

        AAG_RR_MIN = numpy.power(10.,10.)
        min_depth = 10
        max_depth = 250
        counter = Counter()
        HIcounter = Counter()

        #iterate over vars
        for var in reader:
            #false by default
            gss = False
            PASS = False
            animals = []
            present = []

            if not (var.is_snp or var.is_indel):
                # newline added so consecutive messages do not run together
                sys.stderr.write("Skipping Variant: Not SNP/Indel\n")
                continue

            #get RR and AAG genotype likelihoods
            RR_PLs = unphred(var.gt_phred_ll_homref)
            AAG_PLs = unphred(var.gt_phred_ll_het)
            #get AAG/RR ratios
            AAG_RR_ratios = numpy.true_divide(AAG_PLs, RR_PLs)
            #get genotypes, depths, and alt allele depths
            GTs = var.gt_types
            DEPTHS = var.gt_depths
            ALT_DEPTHS = var.gt_alt_depths
            #get allele balances
            ABs = numpy.true_divide(ALT_DEPTHS, DEPTHS)

            if not var.FILTER:
                PASS = True

            for animal, group in animal_map.items():
                #get sample name of control
                control = group['Control'][0]
                SCNTs = group['SCNT']
                ALL = [control] + SCNTs
                #if at least one sample was called het,
                # var is present in mouse
                if not any(GTs[stoi[x]] == 1 for x in ALL):
                    continue
                counter[control] += 1
                if PASS:
                    HIcounter[control] += 1
                gss = True
                control_depth = (DEPTHS[stoi[control]] >= min_depth
                                 and DEPTHS[stoi[control]] <= max_depth)
                # NOTE(review): recorded for every animal carrying the
                # variant, regardless of control depth - confirm intent.
                animals.append(animal)
                for sample in SCNTs:
                    i = stoi[sample]
                    if (DEPTHS[i] >= min_depth and DEPTHS[i] <= max_depth
                            and control_depth
                            and AAG_RR_ratios[i] >= AAG_RR_MIN
                            and ABs[i] >= VAF
                            and GTs[i] == 1):
                        counter[sample] += 1
                        present.append(sample)
                        if PASS:
                            HIcounter[sample] += 1

            if gss:
                var.INFO['ANIMAL'] = ",".join(animals)
                var.INFO['PRESENT'] = ",".join(present)
                writer.write_record(var)

        counts_out.write("#COUNTS\tSample\tCase\tCount\tHQCount\n")
        for sample in sorted(counter.keys()):
            outstr = "\t".join(["#COUNT", sample, sample_map[sample]['Case'],
                                str(counter[sample]), str(HIcounter[sample])])
            counts_out.write(outstr+"\n")

        counts_out.write("#FNR\tAnimal\tFNR\tHQFNR\n")
        for animal, group in sorted(animal_map.items()):
            #get sample name of control
            control = group['Control'][0]
            SCNTs = group['SCNT']
            n_control = counter[control]
            rates = []
            hrates = []
            for sample in SCNTs:
                if n_control:
                    rate = 1.0-(counter[sample]/float(n_control))
                    hrate = 1.0-(HIcounter[sample]/float(n_control))
                else:
                    # no variant counted for this control: the rate is
                    # undefined, report nan instead of dividing by zero
                    rate = hrate = float('nan')
                rates.append(rate)
                hrates.append(hrate)
            a_rate = numpy.mean(rates)
            a_hrate = numpy.mean(hrates)
            counts_out.write("\t".join(["#FNR", animal, str(a_rate), str(a_hrate)])+"\n")
    finally:
        # Close both outputs explicitly, as the sibling sv/snp/mei
        # subcommands do for their writers.
        writer.close()
        counts_out.close()
def mei(args):
    """Write mobile-element insertions that are unique to a single sample.

    Reads the VCF ``args.i`` and sample map ``args.m``; writes to ``args.o``
    (``/dev/stdout`` when falsy).  Only unfiltered records whose INFO ``LP``
    and ``RP`` values both exceed 3 are considered.  A sample carries the
    insertion when its genotype type equals the expected one (1, or 2 for a
    male sample on X/Y); the record is unique when every other sample has
    genotype type 0 and a hom-ref likelihood of at least 0.60.
    """
    #gts012 sets the uncalled GTs to 3
    reader = cyvcf2.VCF(args.i, gts012=True)
    #get list of samples
    samples = reader.samples
    #read sample map
    sample_map, animal_map = load_sample_map(args.m)

    #add the new INFO tags
    reader.update("UNIQ", "String", 1, "Sample(s) with unique somatic variant")
    reader.update("TISSUE", "String", 1, "Source Tissue Type")
    reader.update("CASE", "String", 1, "Control or SCNT?")
    reader.update("EXPT", "String", 1, "Experiment")
    reader.update("ANIMAL", "String", 1, "ID of origin animal")

    if not args.o:
        writer = cyvcf2.Writer("/dev/stdout", reader)
    else:
        writer = cyvcf2.Writer(args.o, reader)

    allosomes = set(["X", "Y"])
    min_su = 3

    try:
        for var in reader:
            LP = var.INFO['LP']
            RP = var.INFO['RP']
            if var.FILTER or not (LP > min_su and RP > min_su):
                continue

            unique = True
            GTs = var.gt_types
            #strange behaviour... MELT PLs read as the input value *10.
            #divide by 10 to correct.
            #should have MELT return positive integers rather than
            #negative floats.
            # (Only the hom-ref likelihoods are used below; the het
            # likelihoods and the likelihood ratios were never read.)
            RR_PLs = unphred(numpy.divide(var.gt_phred_ll_homref, 10.))

            for i, sample in enumerate(samples):
                if not unique:
                    break
                AAG = 1
                #change AAG to 1/1 for male allosomes
                if sample_map[sample]['Sex'] == "M" and var.CHROM in allosomes:
                    AAG = 2
                #heterozygote
                if GTs[i] != AAG:
                    continue
                for j in range(len(samples)):
                    if j != i and (GTs[j] != 0 or RR_PLs[j] < 0.60):
                        unique = False
                        break
                if unique:
                    #set new info fields
                    var.INFO['TISSUE'] = sample_map[sample]['Source']
                    var.INFO['CASE'] = sample_map[sample]['Case']
                    var.INFO['EXPT'] = sample_map[sample]['Experiment']
                    var.INFO['UNIQ'] = sample
                    var.INFO['ANIMAL'] = sample_map[sample]['Animal']
                    writer.write_record(var)
    finally:
        # flush what was written even if a record fails mid-stream
        writer.close()
def main(ARGS=None):
    """Screen a VCF and write the surviving, optionally annotated, records.

    Args:
        ARGS: argument list to parse; defaults to ``sys.argv[1:]``.

    Each variant read from ``args.in_vcf`` (restricted to ``args.intervals``
    when given, either a single interval or a file with one per line) is
    skipped when its first ALT is ``*``, when it fails the qualifying-impact
    screen, or when it fails the variant conditions file.  When
    ``args.max_impact`` is set, per-variant max-impact / max / min CSQ
    values are added as INFO fields.  Survivors go to ``args.out_vcf`` and
    two summary counts are printed.
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    # convert certain comma delim str args to lists
    args.qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = misc.str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = misc.str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = misc.str_none_split(args.min_csq_scores, ",")

    # read cnds files
    var_cnds = None
    if args.variant_cnds is not None:
        var_cnds = VcfCnds(args.variant_cnds)

    # init cyvcf2 VCF obj, get info subfields, header for output
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    if args.annotation_subfield == "ANN":
        cyvcf2_vcf.get_csq_keys(spliton="Functional annotations: ",
                                delim="|",
                                chars_del=[" ", "'", '"'],
                                ann_id=args.annotation_subfield)
    else:
        cyvcf2_vcf.get_csq_keys(spliton="Format: ", delim="|",
                                ann_id=args.annotation_subfield)
    # The returned header list is not used here; the call is kept in case
    # it has side effects on cyvcf2_vcf.
    cyvcf2_vcf.header_to_list(
        gt_varnames=GT_VARNAMES,
        max_impact=args.max_impact,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores,
        delim="\t")

    # since we're writing to a VCF, if any new INFO items written, need to
    # add to header to reflect this.
    if args.max_impact_csqs is not None:
        for csq_name in args.max_impact_csqs:
            csq_name_ext = csq_name + "_maximpact"
            vcf.add_info_to_header({'ID': csq_name_ext,
                                    'Description':'max '+csq_name+' to go along '+\
                                                  'with transcripts with max IMPACT',
                                    'Type':'Character', 'Number':'1'})
    if args.max_csq_scores is not None:
        for csq_name in args.max_csq_scores:
            csq_name_ext = csq_name + "_max"
            vcf.add_info_to_header({'ID': csq_name_ext,
                                    'Description':'max value for '+csq_name + \
                                                  'along assessed transcripts '+\
                                                  'in CSQ field.',
                                    'Type':'Float', 'Number':'1'})
    if args.min_csq_scores is not None:
        for csq_name in args.min_csq_scores:
            csq_name_ext = csq_name + "_min"
            vcf.add_info_to_header({'ID': csq_name_ext,
                                    'Description':'min value for '+csq_name + \
                                                  'along assessed transcripts '+\
                                                  'in CSQ field.',
                                    'Type':'Float', 'Number':'1'})

    # init VCF writer object
    w = cyvcf2.Writer(args.out_vcf, vcf)

    prev_chrom = None
    n_var = 0
    n_var_keep = 0

    # if intervals provided, make sure to parse over those, else whole vcf
    if args.intervals is not None:
        if os.path.isfile(args.intervals):
            # context manager so the intervals file is closed after reading
            with open(args.intervals, "r") as intervals_fh:
                intervals = [x.rstrip() for x in intervals_fh]
        else:
            intervals = [args.intervals]
    else:
        intervals = [""]

    try:
        # iterate through all variants, screening each one
        for vcf_variant in cyvcf2_vcf.iterator(intervals):
            n_var += 1

            # create new Cyvcf2Variant instance
            cyvcf2_variant = Cyvcf2Variant(vcf_variant)
            if vcf_variant.CHROM != prev_chrom:
                print("Extracting variants from chrom " + vcf_variant.CHROM)
                prev_chrom = vcf_variant.CHROM

            # assume single allele per site, exclude sites with call as '*'
            alt = vcf_variant.ALT[0]
            if alt == '*':
                continue

            ## if no qualifying impact str found in CSQ, skip
            if args.qual_impacts is not None:
                res = cyvcf2_variant.qual_impacts_screen(
                    args.qual_impacts,
                    csq_subfield=args.annotation_subfield)
                if res == False:
                    continue

            ## if desired, derive max impact annots from var, along with
            ## other user defined max or min scores in CSQ for variant
            if args.max_impact == True:
                cyvcf2_variant.get_annot_txs(
                    cyvcf2_vcf.csq_keys,
                    csq_subfield=args.annotation_subfield)
                if args.annotation_subfield == "ANN":
                    impact_subfield = "Annotation_Impact"
                else:
                    impact_subfield = "IMPACT"
                res = cyvcf2_variant.maxmin_csqs(
                    csq_subfield=args.annotation_subfield,
                    impact_subfield=impact_subfield,
                    max_impact_csqs=args.max_impact_csqs,
                    max_csq_scores=args.max_csq_scores,
                    min_csq_scores=args.min_csq_scores)
                (csqs_maximpact_list, max_csq_scores, min_csq_scores) = res

                # if corresponding values defined, add to vcf record.
                # Done only in this branch: the value lists exist only
                # after maxmin_csqs has run.
                if args.max_impact_csqs is not None:
                    for i in range(len(args.max_impact_csqs)):
                        max_impact_csq_name = args.max_impact_csqs[i] + "_maximpact"
                        max_impact_csq = csqs_maximpact_list[i]
                        vcf_variant.INFO[max_impact_csq_name] = max_impact_csq
                if args.min_csq_scores is not None:
                    for i in range(len(args.min_csq_scores)):
                        min_csq_score_name = args.min_csq_scores[i] + "_min"
                        min_csq_score = float(min_csq_scores[i])
                        vcf_variant.INFO[min_csq_score_name] = min_csq_score
                if args.max_csq_scores is not None:
                    for i in range(len(args.max_csq_scores)):
                        max_csq_score_name = args.max_csq_scores[i] + "_max"
                        max_csq_score = float(max_csq_scores[i])
                        vcf_variant.INFO[max_csq_score_name] = max_csq_score

            ## filter on variant cnds file, when one was provided
            if (var_cnds is not None
                    and var_cnds.test_variant(vcf_variant) == False):
                continue

            ## if variant survives filters, retain record
            w.write_record(vcf_variant)
            n_var_keep += 1
    finally:
        w.close()
        vcf.close()

    ## print basic stats on number of input variants, number of
    ## variants to keep
    print("Number of variants in parent VCF : " + str(n_var))
    print("Number of variants retained post-filtration : " + str(n_var_keep))
    return
def __call__(self, predictions, records, line_ids=None):
    """Annotate a batch of VCF records with predictions and write them.

    Args:
        predictions: mapping from prediction name to a table whose rows
            align with ``records`` (``.columns``, ``.shape`` and ``.iloc``
            are used).
        records: the VCF records of this batch.
        line_ids: optional array-like with one id per record, written to
            the ``<prefix>:rID`` INFO field (an empty string otherwise).

    Returns:
        None.  An empty ``predictions`` mapping is a no-op.

    Raises:
        Exception: if prediction columns differ between methods, if the
            prediction names differ from those of the first batch, or if
            row / id counts do not match the number of records.

    On the first non-empty call the INFO tags are added to the reader's
    header and the writer is created; later calls only validate
    consistency with that first batch.
    """
    # First iteration: the output file has to be created and the headers
    # defined
    import cyvcf2
    if len(predictions) == 0:
        return None

    metdata_id_infotag = self.info_tag_prefix + ":rID"

    if self.prediction_labels is None:
        # setup the header
        self.prediction_labels = list(predictions.keys())
        for k in predictions:
            col_labels_here = predictions[k].columns.tolist()
            # Make sure that the column are consistent across different
            # prediction methods
            if self.column_labels is None:
                self.column_labels = col_labels_here
            elif list(self.column_labels) != list(col_labels_here):
                # dict views are not subscriptable on Python 3, so the
                # first label comes from the list built above
                raise Exception(
                    "Prediction columns are not identical for methods %s and %s"
                    % (self.prediction_labels[0], k))
            # Add the tag to the vcf file
            info_tag = {
                "ID": self.info_tag_prefix + ":%s" % k.upper(),
                "Number": None,
                "Type": "String",
                "Description":
                "%s SNV effect prediction. Prediction from model outputs: %s"
                % (k.upper(), "|".join(self.column_labels))
            }
            self.vcf_reader.add_info_to_header(info_tag)
        # Add a tag in which the line_id = ranges_id will be written
        info_tag = {
            "ID": metdata_id_infotag,
            "Number": None,
            "Type": "String",
            "Description":
            "Range or region id taken from metadata, generated by the DataLoader."
        }
        self.vcf_reader.add_info_to_header(info_tag)
        # Now we can also create the vcf writer
        self.vcf_writer = cyvcf2.Writer(self.out_vcf_fpath, self.vcf_reader)
    else:
        if (len(predictions) != len(self.prediction_labels)) or not all(
                k in predictions for k in self.prediction_labels):
            raise Exception("Predictions are not consistent across batches")
        for k in predictions:
            col_labels_here = predictions[k].columns.tolist()
            if list(self.column_labels) != list(col_labels_here):
                raise Exception(
                    "Prediction columns are not identical for methods %s and %s"
                    % (self.prediction_labels[0], k))

    # sanity check that the number of records matches the prediction rows:
    for k in predictions:
        if predictions[k].shape[0] != len(records):
            raise Exception(
                "number of records does not match number the prediction rows for prediction %s."
                % str(k))

    if line_ids is not None:
        if line_ids.shape[0] != len(records):
            raise Exception(
                "number of line_ids does not match number of VCF records")

    # Actually write the vcf entries.
    for pred_line, record in enumerate(records):
        if self.standardise_var_id and self.vcf_id_generator is not None:
            record.ID = self.vcf_id_generator(record)
        for k in predictions:
            # In case there is a prediction for this line, annotate the vcf...
            preds = predictions[k].iloc[pred_line, :]
            info_tag = self.info_tag_prefix + ":{0}".format(k.upper())
            record.INFO[info_tag] = "|".join([str(pred) for pred in preds])
        line_id = ""
        if line_ids is not None:
            line_id = line_ids[pred_line]
        record.INFO[metdata_id_infotag] = line_id
        self.vcf_writer.write_record(record)
def extend_vcf_annotations(query_vcf, pcgr_db_dir, logger, pon_annotation, regulatory_annotation, cpsr, debug):
    """
    Read a VEP/vcfanno-annotated VCF and write a copy whose INFO column is
    extended with additional tags, then compress and index the result.

    The tags to declare in the header come from the 'infotags' file in
    pcgr_db_dir ('cpsr_infotags.tsv' when cpsr is True, 'pcgr_infotags.tsv'
    otherwise). For every record carrying a CSQ tag the function adds:
    1. Elements of the picked VEP consequence block (for CPSR, the block
       chosen by annoutils.get_correct_cpg_transcript)
    2. VEP_ALL_CSQ, when the CSQ parser reports it
    3. REGULATORY_ANNOTATION, when regulatory_annotation == 1
    4. Variant effect predictions, when the record has a DBNSFP tag

    Records without a CSQ tag are not written; they are counted and (up to)
    the first 100 are printed. The output '<query>.annotated.vcf' is bgzipped
    and tabix-indexed, and annoutils.write_pass_vcf is run on it. The 'debug'
    argument is accepted but not used here.
    """
    ## read VEP and PCGR tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_dir, 'pcgr_infotags.tsv'))
    if cpsr is True:
        vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_dir, 'cpsr_infotags.tsv'))
    pcgr_onco_xref_map = annoutils.read_genexref_namemap(
        os.path.join(pcgr_db_dir, 'pcgr_onco_xref', 'pcgr_onco_xref_namemap.tsv'))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$','.annotated.vcf',query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    vcf = cyvcf2.VCF(query_vcf)

    ## tag prefixes that are left out of the header for the requested annotation mode
    if pon_annotation == 0 and regulatory_annotation == 0:
        skipped_prefixes = ('PANEL_OF_NORMALS', 'REGULATORY_')
    elif pon_annotation == 1 and regulatory_annotation == 0:
        skipped_prefixes = ('REGULATORY_',)
    elif pon_annotation == 0 and regulatory_annotation == 1:
        skipped_prefixes = ('PANEL_OF_NORMALS',)
    else:
        skipped_prefixes = ()

    for tag in sorted(vcf_infotags_meta):
        ## startswith() with an empty tuple never matches, i.e. nothing is skipped
        if tag.startswith(skipped_prefixes):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({'ID': tag,
                                'Description': str(tag_meta['description']),
                                'Type': str(tag_meta['type']),
                                'Number': str(tag_meta['number'])})

    w = cyvcf2.Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0

    ## map header identifier -> declared type (used to write Flag tags as booleans)
    vcf_info_element_types = {}
    for header_line in vcf.header_iter():
        header_element = header_line.info()
        if all(key in header_element for key in ('ID', 'HeaderType', 'Type')):
            vcf_info_element_types[str(header_element['ID'])] = str(header_element['Type'])

    vars_no_csq = []
    for rec in vcf:
        ## per-chromosome progress logging
        rec_chrom = str(rec.CHROM)
        if current_chrom is None:
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0
        elif rec_chrom != current_chrom:
            logger.info(f"Completed summary of functional annotations for {num_chromosome_records_processed} variants on chr{current_chrom}")
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0

        ## records without VEP consequences are reported, not written
        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            vars_no_csq.append(f"g.{rec.CHROM}:{pos}{rec.REF}>{alt_allele}")
            continue

        num_chromosome_records_processed += 1
        pcgr_onco_xref = annoutils.make_transcript_xref_map(rec, pcgr_onco_xref_map, xref_tag = "PCGR_ONCO_XREF")

        if regulatory_annotation == 1:
            csq_record_results_all = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only = False, csq_identifier = 'CSQ')
            if 'vep_block' in csq_record_results_all:
                rec.INFO['REGULATORY_ANNOTATION'] = annoutils.map_regulatory_variant_annotations(
                    csq_record_results_all['vep_block'])

        csq_record_results_pick = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only = True, csq_identifier = 'CSQ')
        if 'vep_all_csq' in csq_record_results_pick:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_record_results_pick['vep_all_csq'])

        if 'vep_block' in csq_record_results_pick:
            vep_csq_records = csq_record_results_pick['vep_block']
            block_idx = annoutils.get_correct_cpg_transcript(vep_csq_records) if cpsr is True else 0
            picked_block = vep_csq_records[block_idx]
            for field in picked_block:
                if field not in vcf_info_element_types:
                    continue
                value = picked_block[field]
                if vcf_info_element_types[field] == "Flag" and value == "1":
                    rec.INFO[field] = True
                elif value is not None:
                    rec.INFO[field] = value

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)

    if vars_no_csq:
        logger.warning(f"There were {len(vars_no_csq)} records with no CSQ tag from VEP (was --vep_no_intergenic flag set?). Skipping them and showing (up to) the first 100:")
        print('----')
        print(', '.join(vars_no_csq[:100]))
        print('----')

    w.close()
    if current_chrom is not None:
        logger.info(f"Completed summary of functional annotations for {num_chromosome_records_processed} variants on chr{current_chrom}")
    vcf.close()

    ## compress + index the annotated VCF, then derive the PASS-only VCF
    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        check_subprocess(logger, f'bgzip -f {out_vcf}', debug=False)
        check_subprocess(logger, f'tabix -f -p vcf {out_vcf}.gz', debug=False)
        annotated_vcf = f'{out_vcf}.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        error_message('No remaining PASS variants found in query VCF - exiting and skipping STEP 4', logger)
pdp3_vaf = calculate_vaf(variant.gt_alt_depths[2], variant.gt_depths[2]) pdp4_vaf = calculate_vaf(variant.gt_alt_depths[3], variant.gt_depths[3]) ssc1_vaf = calculate_vaf(variant.gt_alt_depths[4], variant.gt_depths[4]) ssc2_vaf = calculate_vaf(variant.gt_alt_depths[5], variant.gt_depths[5]) ssc3_vaf = calculate_vaf(variant.gt_alt_depths[6], variant.gt_depths[6]) ssc4_vaf = calculate_vaf(variant.gt_alt_depths[7], variant.gt_depths[7]) ssc5_vaf = calculate_vaf(variant.gt_alt_depths[8], variant.gt_depths[8]) pf1_vaf = calculate_vaf(variant.gt_alt_depths[9], variant.gt_depths[9]) return pdp1_vaf, pdp2_vaf, pdp3_vaf, pdp4_vaf, ssc1_vaf, ssc2_vaf, ssc3_vaf, ssc4_vaf, ssc5_vaf, pf1_vaf vcf_handle = cyvcf2.VCF( '/home/users/cjyoon/Projects/rheum/data_processing/01c_freebayes/everyone.freebayes.decomposed.norm.vep.centelexcl.vcf.gz' ) writer = cyvcf2.Writer( '/home/users/cjyoon/Projects/rheum/data_processing/01c_freebayes/everyone.freebayes.decomposed.norm.vep.centelexcl.denovo_v2.vcf', vcf_handle) for variant in vcf_handle: pdp1, pdp2, pdp3, pdp4, ssc1, ssc2, ssc3, ssc4, ssc5, pf1 = variant.genotypes pdp1_geno = variant_type(pdp1[0:2]) pdp2_geno = variant_type(pdp2[0:2]) pdp3_geno = variant_type(pdp3[0:2]) pdp4_geno = variant_type(pdp4[0:2]) ssc1_geno = variant_type(ssc1[0:2]) ssc2_geno = variant_type(ssc2[0:2]) ssc3_geno = variant_type(ssc3[0:2]) ssc4_geno = variant_type(ssc4[0:2]) ssc5_geno = variant_type(ssc5[0:2]) pf1_geno = variant_type(pf1[0:2]) pdp1_vaf, pdp2_vaf, pdp3_vaf, pdp4_vaf, ssc1_vaf, ssc2_vaf, ssc3_vaf, ssc4_vaf, ssc5_vaf, pf1_vaf = sample_vafs(
def _count_at_least(values, threshold):
    """Return how many entries of a per-sample FORMAT array are >= threshold.

    ``values`` is None when the FORMAT field is absent from the record; that
    is counted as zero passing samples.
    """
    if values is None:
        return 0
    return int(np.count_nonzero(values >= threshold))


def _has_min_support(gt_type, ref_depth, alt_depth, min_support):
    """Tell whether one sample's genotype call is backed by enough reads.

    Expects genotype codes as produced with ``gts012=True``
    (0 = hom-ref, 1 = het, 2 = hom-alt); any other code never passes.
    """
    if gt_type == 0:
        return ref_depth >= min_support
    if gt_type == 1:
        return ref_depth >= min_support and alt_depth >= min_support
    if gt_type == 2:
        return alt_depth >= min_support
    return False


def main():
    """Filter a VCF by site quality, depth, variant type and per-sample support.

    Reads ``args.vcf``, records the command line in the header and writes to
    ``args.output_vcf`` every variant that passes all of:

    * ``QUAL >= args.QUAL``
    * ``INFO/DP >= args.min_samples * args.sample_depth``
    * ``INFO/type`` matches ``args.var_type`` ("ALL" disables the check,
      "INDEL" accepts "ins" and "del")
    * at least ``args.min_samples`` samples with ``DP >= args.sample_depth``
    * at least ``args.min_samples`` samples with
      ``GQ >= args.genotype_quality``
    * when ``args.min_support > 0``: at least ``args.min_samples`` samples
      whose called allele(s) have that many supporting reads

    Records whose QUAL, INFO/DP or FORMAT DP/GQ are missing cannot satisfy
    the corresponding threshold and are dropped.
    """
    args = parse_args()

    # gts012=True makes gt_types use 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN,
    # which is the coding the min_support check relies on. With the default,
    # 2 means UNKNOWN and hom-alt samples would never be counted.
    vcf = cyvcf2.VCF(args.vcf, gts012=True)
    try:
        vcf.add_to_header(
            "##Filter_vcf_CMD=python Filter_vcf.py "
            "--vcf {} "
            "--output_vcf {} "
            "--variant_caller {} "
            "--min_samples {} "
            "--QUAL {} "
            "--sample_depth {} "
            "--min_support {} "
            "--genotype_quality {} "
            "--type {}".format(
                str(args.vcf), str(args.output_vcf),
                str(args.variant_caller), str(args.min_samples),
                str(args.QUAL), str(args.sample_depth),
                str(args.min_support), str(args.genotype_quality),
                str(args.var_type)))

        out_vcf = cyvcf2.Writer(args.output_vcf, vcf)
        try:
            for variant in vcf:
                # Site-level quality; a missing QUAL is returned as None.
                qual = variant.QUAL
                if qual is None or qual < args.QUAL:
                    continue

                # Site-level total depth.
                site_depth = variant.INFO.get("DP")
                if site_depth is None or site_depth < args.min_samples * args.sample_depth:
                    continue

                # Variant type.
                if args.var_type != "ALL":
                    var_type = variant.INFO.get("type")
                    if args.var_type == "INDEL":
                        if var_type not in ("ins", "del"):
                            continue
                    elif var_type != args.var_type.lower():
                        continue

                # Enough samples with sufficient depth / genotype quality.
                if _count_at_least(variant.format('DP'), args.sample_depth) < args.min_samples:
                    continue
                if _count_at_least(variant.format('GQ'), args.genotype_quality) < args.min_samples:
                    continue

                # Enough samples whose genotype is backed by reads.
                if args.min_support > 0:
                    passing = sum(
                        1
                        for gt_type, ref_depth, alt_depth in zip(
                            variant.gt_types,
                            variant.gt_ref_depths,
                            variant.gt_alt_depths)
                        if _has_min_support(gt_type, ref_depth, alt_depth,
                                            args.min_support))
                    if passing < args.min_samples:
                        continue

                out_vcf.write_record(variant)
        finally:
            # Flush and close the output even if filtering fails midway.
            out_vcf.close()
    finally:
        vcf.close()