def test_add_flag():
    """Round-trip a Flag INFO field through a Writer, first set and then cleared."""
    vcf = VCF(VCF_PATH)
    vcf.add_info_to_header({'ID': 'myflag', 'Description': 'myflag',
                            'Type': 'Flag', 'Number': '0'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    # The next() builtin works on Python 2 and 3; the .next() spelling does not.
    rec = next(vcf)

    rec.INFO["myflag"] = True
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.INFO["myflag"] is None, dict(v.INFO)

    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec.INFO["myflag"] = False
    w.write_record(rec)
    # Close the writer before reading the file back so the record is flushed,
    # exactly as done for the first file above.
    w.close()

    v = next(VCF(f))
    assert_raises(KeyError, v.INFO.__getitem__, "myflag")
def test_ibd():
    """Check that ibd() on three selected samples returns three entries."""
    wanted = ['101976-101976', '100920-100920', '100231-100231']
    reader = VCF(VCF_PATH, gts012=True, samples=wanted)
    by_pair = reader.ibd()
    assert len(by_pair) == 3, (len(by_pair))

    # Results are looked up by a pair of byte-string sample names.
    first_pair = (b'101976-101976', b'100920-100920')
    segments = by_pair[first_pair]
    assert len(segments) > 0
def test_relatedness():
    """The first item produced by relatedness() exposes the IBS fields."""
    reader = VCF(VCF_PATH, gts012=True)
    results = iter(reader.relatedness(gap=0, linkage_max=2))
    first = next(results)
    for field in ("ibs0", "ibs2", "ibs2*"):
        assert field in first
def __init__(self, filename, reference=None):
    """Open `filename` and cache header-derived attributes on the instance.

    Exits with an error message when `filename` is neither an existing file
    nor "-". When `reference` is given it is stored together with the
    resolved reference genome file.
    """
    is_dash = filename == "-"
    if not os.path.isfile(filename) and not is_dash:
        exit(message("Error: " + filename + " does not exist"))

    self.filename = filename
    if reference:
        self.reference = reference
        self.reference_file = resolve_reference_genome(reference)

    cyvcf2.__init__(self, self.filename)

    # Number of samples
    self.n = len(self.samples)

    # Simple "##key=value" header lines, in header order
    meta_pattern = re.compile(r'''^##(?P<key>[^<#]+?)=(?P<val>[^<#]+$)''', re.M)
    self.metadata = OrderedDict(meta_pattern.findall(self.raw_header))

    # Contig name -> length, in header order
    contig_names = re.compile("##contig=<ID=(.*?),").findall(self.raw_header)
    contig_lengths = map(
        int,
        re.compile("##contig.*length=([^,>]*?)>").findall(self.raw_header))
    self.contigs = OrderedDict(zip(contig_names, contig_lengths))

    def _header_records(kind):
        # Header records whose type equals `kind`
        return [rec for rec in self.header_iter() if rec.type == kind]

    self.info_set = _header_records("INFO")
    self.filter_set = _header_records("FILTER")
    self.format_set = _header_records("FORMAT")

    self.header = copy(self.raw_header)
def get_variant_type(variant_source):
    """Try to find out what type of variants that exists in a variant source

    The leading variants of the source are inspected (the scan stops once the
    loop index exceeds 1000). If any of them is a SNP the source is classified
    as 'snv', otherwise it is classified as 'sv'.

    Args:
        variant_source (str): Path to variant source

    Returns:
        variant_type (str): 'sv' or 'snv'
    """
    file_type = get_file_type(variant_source)
    variant_type = 'sv'
    # NOTE(review): `variants` is only bound for the 'vcf' and 'gemini' file
    # types; any other file type fails at the loop below with an unbound name.
    if file_type == 'vcf':
        variants = VCF(variant_source)
    elif file_type == 'gemini':
        variants = GeminiQuery(variant_source)
        gemini_query = "SELECT * from variants"
        variants.run(gemini_query)

    for i, variant in enumerate(variants):
        if file_type == 'vcf':
            is_snv = variant.is_snp
        else:
            # Only 'gemini' can reach this branch (see the note above).
            is_snv = variant['type'] == 'snp'

        if is_snv:
            # One SNP settles the answer, so there is no need to keep reading.
            variant_type = 'snv'
            break
        if i > 1000:
            break

    return variant_type
def _af_filter(data, in_file, out_file):
    """Tag calls whose tumor AF is below min_allele_fraction with "MinAF".

    The threshold is read from the algorithm configuration as a percentage
    (10 when not configured). Records are soft-filtered, never dropped, and
    the result is returned bgzipped and indexed. Nothing is rewritten when
    the uncompressed or compressed output already exists.
    """
    af_key = ("algorithm", "min_allele_fraction")
    min_freq = float(utils.get_in(data["config"], af_key, 10)) / 100.0
    logger.debug("Filtering MuTect2 calls with allele fraction threshold of %s" % min_freq)

    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    already_written = (utils.file_exists(ungz_out_file)
                       or utils.file_exists(ungz_out_file + ".gz"))
    if not already_written:
        with file_transaction(data, ungz_out_file) as tx_out_file:
            reader = VCF(in_file)

            description = 'Allele frequency is lower than %s%% ' % (min_freq*100)
            if utils.get_in(data["config"], af_key):
                description += '(configured in bcbio as min_allele_fraction)'
            else:
                description += '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)'
            reader.add_filter_to_header({'ID': 'MinAF', 'Description': description})

            writer = Writer(tx_out_file, reader)

            # GATK 3.x can produce VCFs without sample names for empty VCFs
            try:
                tumor_index = reader.samples.index(dd.get_sample_name(data))
            except ValueError:
                tumor_index = None

            for record in reader:
                below_threshold = (
                    tumor_index is not None
                    and np.all(record.format('AF')[tumor_index] < min_freq))
                if below_threshold:
                    vcfutils.cyvcf_add_filter(record, 'MinAF')
                writer.write_record(record)
            writer.close()

    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def test_hrec():
    """Every non-GENERIC header record describes itself with an ID."""
    reader = VCF(VCF_PATH)
    for record in reader.header_iter():
        fields = record.info()
        if fields['HeaderType'] == 'GENERIC':
            continue
        assert 'ID' in fields
def test_set_samples():
    """Restricting to one sample shrinks both the sample list and genotypes."""
    reader = VCF(VCF_PATH)
    assert len(reader.samples) == 189, len(reader.samples)

    third_sample = reader.samples[2]
    reader.set_samples([third_sample])
    assert len(reader.samples) == 1

    variant = next(reader)
    assert len(variant.gt_types) == 1
def test_header_stuff():
    """Count the FORMAT and INFO records exposed by header_iter()."""
    vcf = VCF('{}/test.vcf.gz'.format(HERE))

    seen_formats, seen_infos = 0, 0
    for h in vcf.header_iter():
        i = h.info(extra=True)
        assert isinstance(i, dict)
        # Booleans add as 0/1, so these act as per-type counters.
        seen_formats += i['HeaderType'] == 'FORMAT'
        seen_infos += i['HeaderType'] == 'INFO'

    assert seen_formats == 9, seen_formats
    assert seen_infos == 73, seen_infos
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populating FORMAT/AF, and dropping variants with AF<min_allele_fraction

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP                {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP                TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(alt_counts)   AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(alt_counts)   AD = ref_count,alt_counts

    Every record is written out with FORMAT/AF set whenever a depth could be
    derived. In paired (somatic) mode, records whose tumor AF is entirely
    below the threshold are additionally tagged with the "MinAF" filter.
    The threshold comes from min_allele_fraction in the algorithm config, as
    a percentage (10 when not configured).

    Returns the result of bgzipping and indexing the output VCF.
    """
    data = paired.tumor_data if paired else items[0]
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            tumor_index = vcf.samples.index(data['description'])
            for rec in vcf:
                alt_counts, dp = (None, None)
                if paired:  # somatic?
                    if rec.is_snp:  # snps?
                        alt_counts = rec.format(rec.ALT[0] + 'U')[:, 0]  # {ALT}U=tier1_depth,tier2_depth
                    else:  # indels
                        alt_counts = rec.format('TIR')[:, 0]  # TIR=tier1_depth,tier2_depth
                    dp = rec.format('DP')[:, 0]
                else:
                    # Fetch AD once per record instead of three times.
                    ad = rec.format("AD")
                    if ad is not None:  # germline?
                        alt_counts = ad[:, 1:]  # AD=REF,ALT1,ALT2,...
                        # keepdims gives dp shape (n_samples, 1), so the division
                        # below is done per sample row. A flat (n_samples,)
                        # vector would be broadcast along the ALT axis instead.
                        dp = np.sum(ad, axis=1, keepdims=True)
                    # otherwise: germline gVCF record, nothing to annotate
                if dp is not None:
                    with np.errstate(divide='ignore', invalid='ignore'):  # ignore division by zero and put AF=.0
                        af = np.true_divide(alt_counts, dp)
                        af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                    rec.set_format('AF', af)
                    if paired and np.all(af[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def test_set_format_int3():
    "test that we can handle multiple (in this case 3) values per sample"
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="P3", Number=3, Type="Integer", Description="Int example")) == 0
    v = next(vcf)

    # The builtin int is used as the dtype: the `np.int` alias was deprecated
    # and later removed from NumPy, and it only ever meant the builtin int.
    exp = np.array([[1, 11, 111], [2, 22, 222]], dtype=int)
    v.set_format("P3", exp)

    res = get_gt_str(v, "P3")
    assert res == ["1,11,111", "2,22,222"], (res, str(v))
    assert np.allclose(v.format("P3"), exp)
def test_set_format_int():
    """Integer FORMAT values survive set_format for several integer dtypes."""
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="PI", Number=1, Type="Integer", Description="Int example")) == 0
    v = next(vcf)

    # The builtin int replaces the removed `np.int` alias (same dtype).
    v.set_format("PI", np.array([5, 1], dtype=int))
    assert allclose(fmap(float, get_gt_str(v, "PI")), [5, 1])

    v.set_format("PI", np.array([855, 11], dtype=np.int64))
    assert allclose(fmap(float, get_gt_str(v, "PI")), [855, 11])

    v.set_format("PI", np.array([9998, 99911], dtype=np.int32))
    obs = fmap(float, get_gt_str(v, "PI"))
    assert allclose(obs, [9998, 99911]), obs
def test_set_format_float():
    """Float FORMAT values survive set_format for several float dtypes."""
    vcf = VCF('{}/test-format-string.vcf'.format(HERE))
    assert vcf.add_format_to_header(dict(ID="PS", Number=1, Type="Float", Description="PS example")) == 0
    v = next(vcf)

    # The builtin float replaces the removed `np.float` alias (same dtype).
    v.set_format("PS", np.array([0.555, 1.111], dtype=float))
    assert allclose(fmap(float, get_gt_str(v, "PS")), np.array([0.555, 1.111]))

    v.set_format("PS", np.array([8.555, 11.111], dtype=np.float64))
    assert allclose(fmap(float, get_gt_str(v, "PS")), [8.555, 11.111])

    v.set_format("PS", np.array([9998.555, 99911.111], dtype=np.float32))
    obs = fmap(float, get_gt_str(v, "PS"))
    assert allclose(obs, [9998.555, 99911.111]), obs
def test_add_filter_to_header():
    """A filter added to the reader's header can be set on a record and read back."""
    v = VCF(VCF_PATH)
    # NOTE that we have to add the filter to the header of the reader,
    # not the writer because the record will be associated with the reader
    v.add_filter_to_header({'ID': 'abcdefg', 'Description': 'abcdefg'})

    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, v)
    # The next() builtin works on Python 2 and 3; the .next() spelling does not.
    rec = next(v)

    rec.FILTER = ["abcdefg"]
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.FILTER == "abcdefg", v.FILTER
def validate(self, vcf_path, plot=False, king=False):
    """Print sample pairs whose pedigree relation disagrees with the VCF.

    With `king` set, the work is delegated to `run_king`. Otherwise pairwise
    relatedness is computed from the VCF, and a tab-separated line is printed
    for every pair whose pedigree relation is not one of the relations
    accepted for its relatedness value.

    Args:
        vcf_path: path of the VCF to check against this pedigree.
        plot: when true, show the relatedness figure and save it to 't.png'.
        king: when true, use `run_king` instead of the cyvcf2 calculation.
    """
    if king:
        from .king import run_king
        run_king(vcf_path, self)
        return

    from cyvcf2 import VCF
    vcf = VCF(vcf_path, gts012=True, lazy=True)
    rels = list(vcf.relatedness(min_af=0.02, n_variants=39000, gap=10000, linkage_max=1.5))
    if plot:
        fig = vcf.plot_relatedness(rels[:])
        fig.show()
        fig.savefig('t.png')

    print("sample_1\tsample_2\tped_relation\tvcf_relation\trel\tIBS")
    for rel in rels:
        sample_a, sample_b = rel['pair']
        ped_rel = self.relation(sample_a, sample_b)
        if ped_rel is None:
            continue
        out_line = "%s\t%s\t%s\t%s\t%.2f\t%.3f" % (sample_a, sample_b,
                                                   ped_rel,
                                                   "|".join(rel['tags']),
                                                   rel['rel'], rel['ibs'])
        relatedness = rel['rel']

        # The first matching range decides which pedigree relations are
        # accepted; the order matters because the ranges overlap.
        if relatedness < 0.04:  # likely unrelated
            accepted = ('related level 2', 'unrelated')
        elif relatedness < 0.15:
            accepted = ('unrelated', 'related level 2', 'distant relations')
        elif 0.26 < relatedness < 0.78:
            accepted = ('parent-child', 'full siblings')
        elif 0.15 < relatedness < 0.3:
            accepted = ('related level 2', 'unrelated')
        elif relatedness > 0.78:
            # This branch must test the numeric relatedness; comparing the
            # pedigree label (a string) against 0.78 is a type error.
            accepted = ('identical twins', 'self')
        else:
            # Values exactly on a boundary match no range and are not reported.
            continue

        if ped_rel not in accepted:
            print(out_line)
def test_add_info_to_header():
    """An INFO field added to the reader's header can be set and read back."""
    v = VCF(VCF_PATH)
    v.add_info_to_header({'ID': 'abcdefg', 'Description': 'abcdefg',
                          'Type': 'Character', 'Number': '1'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, v)
    # The next() builtin works on Python 2 and 3; the .next() spelling does not.
    rec = next(v)

    rec.INFO["abcdefg"] = "XXX"
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.INFO["abcdefg"] == "XXX", dict(v.INFO)
def test_writer():
    """Write one record three times with different FILTER values and read them back."""
    v = VCF(VCF_PATH)
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)

    o = Writer(f, v)
    # The next() builtin works on Python 2 and 3; the .next() spelling does not.
    rec = next(v)
    rec.INFO["AC"] = "3"

    rec.FILTER = ["LowQual"]
    o.write_record(rec)

    rec.FILTER = ["LowQual", "VQSRTrancheSNP99.90to100.00"]
    o.write_record(rec)

    rec.FILTER = "PASS"
    o.write_record(rec)

    o.close()

    # A FILTER of "PASS" is expected to read back as None.
    expected = ["LowQual", "LowQual;VQSRTrancheSNP99.90to100.00", None]

    for i, variant in enumerate(VCF(f)):
        assert variant.FILTER == expected[i], (variant.FILTER, expected[i])
def gvcf2coverage(threshold, merge, distance):
    """Print covered intervals of a single-sample gVCF read from '-'.

    Every entry whose FORMAT/DP (first sample, first value; 0 when DP is
    absent) reaches `threshold` contributes its CHROM, start, end and ploidy.
    Output is tab-separated on standard output.

    Args:
        threshold: minimum depth for an entry to be reported.
        merge: when false, each qualifying entry is printed as is. When true,
            consecutive entries are merged into one window as long as the
            chromosome and ploidy are unchanged and the gap to the window end
            is at most `distance`.
        distance: largest gap between the window end and the next start that
            still allows merging.

    Raises:
        AssertionError: if the VCF does not have exactly one sample, or has
            no sequence names.
    """
    vcf = VCF(fname='-', gts012=False, lazy=False, strict_gt=False)

    assert len(vcf.samples) == 1
    assert len(vcf.seqnames) > 0

    # True until the first qualifying entry has opened a window.
    first = True

    for entry in vcf:
        dp = entry.format('DP')
        depth = 0 if dp is None else dp[0][0]

        # Entries below the threshold neither print nor affect the window.
        if depth < threshold:
            continue

        chrom = entry.CHROM
        start = entry.start
        end = entry.end
        ploidy = entry.ploidy

        # When we don't merge, just print here and proceed
        if not merge:
            print(chrom, start, end, ploidy, sep="\t")
            continue

        if first:
            window_start = start
            window_end = end
            window_chrom = chrom
            window_ploidy = ploidy
            first = False
            continue

        jump = (window_chrom != chrom
                or window_ploidy != ploidy
                or window_end + distance < start)

        if jump:
            # Close the current window and open a new one at this entry.
            print(window_chrom, window_start, window_end, window_ploidy, sep="\t")
            window_start = start
            window_end = end
            window_chrom = chrom
            window_ploidy = ploidy
        else:
            window_start = min(window_start, start)
            window_end = max(window_end, end)

    # Whatever window is still open has not been printed yet. That is true
    # whether the last entry extended a window or opened a new one after a
    # jump, so the only condition is that at least one window was opened.
    if merge and not first:
        print(window_chrom, window_start, window_end, window_ploidy, sep="\t")
def query_bed_region(region, vcf_path, fasta, kmer_size, singleton_path, af_path, an_path, ac_path, model_dir):
    """Compare observed and expected allele statistics for one region.

    The region's sequence is fetched from `fasta`, widened by half a k-mer on
    each side, and complemented when the region's strand is a dash according
    to `is_dash`. Expected values come from a `QueryWindow`; observed values
    come from the VCF records overlapping the region. A formatted summary
    line is printed and the tab-separated record is returned.

    @param region: region object providing chrom, start, stop, strand and str_name()
    @param vcf_path: path of the VCF holding the observed variants
    @param fasta: path of the reference FASTA
    @param kmer_size: k-mer length used by the model
    @param singleton_path: passed through to QueryWindow
    @param af_path: passed through to QueryWindow
    @param an_path: passed through to QueryWindow
    @param ac_path: passed through to QueryWindow
    @param model_dir: passed through to QueryWindow
    @return: one newline-terminated, tab-separated line: the region name
        followed by NumSNVs, Singletons, AC, AN, AF, ExpectedSingletons,
        ExpectedAC, ExpectedAN and ExpectedAF. All nine values are 0 when the
        lookup raises KeyError or FetchError.
    """
    # TODO: Add binning somehow (either keep equal size or equal number of bins
    vcf = VCF(vcf_path)
    fasta = Fasta(fasta)
    window = QueryWindow(kmer_size, singleton_path=singleton_path, af_path=af_path, an_path=an_path, ac_path=ac_path,
                         model_dir=model_dir)
    # The first kmer actually begins centered around first nucleotide in sequence so
    # start position is shifted upstream by half the kmer length
    # end position is shifted downstream by the same
    shift = kmer_size // 2
    try:
        fetched = fasta.get_seq(region.chrom, region.start - shift, region.stop + shift)
        if region.strand is not None and is_dash(region.strand):
            fetched = fetched.complement
        sequence = fetched.seq.upper()

        exp = window.calculate_expected(sequence)  # this does account for strandedness
        AF, AC, AN, singletons, count = count_regional_alleles(vcf(str(region)))
        fields = (
            count,                 # 'NumSNVs'
            singletons,            # 'Singletons'
            AC,                    # 'AC'
            AN,                    # 'AN'
            AF,                    # 'AF'
            exp.get('singleton'),  # 'ExpectedSingletons'
            exp.get('AC'),         # 'ExpectedAC'
            exp.get('AN'),         # 'ExpectedAN'
            exp.get('AF'),         # 'ExpectedAF'
        )
    except (KeyError, FetchError):
        # Regions that cannot be looked up are reported with all-zero values.
        fields = (0,) * 9

    field_strs = tuple(str(value) for value in fields)
    regname = region.str_name().split('\t')
    name_strs = (str(regname[0]), str(regname[1]), str(regname[2]), str(regname[3]), str(regname[4]))
    print(
        '{: <8} {: <12} {: <12} {: <20} {: <8} {: <10} {: <12} {: <10} {: <10} {: <24} {: <22} {: <20} {: <20} {: <20}'.format(
            *(name_strs + field_strs)),
        flush=True)
    return '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ((region.str_name(),) + field_strs)
def test_relatedness():
    """The relatedness() result contains the "ibs0" and "rel" entries."""
    reader = VCF(VCF_PATH, gts012=True)
    result = reader.relatedness(gap=0, linkage_max=2)
    assert "ibs0" in result, result
    assert "rel" in result
def test_raw_header():
    """raw_header starts with the fileformat line and has 185 lines."""
    reader = VCF(VCF_PATH)
    header_lines = reader.raw_header.strip().split("\n")

    first_line = header_lines[0]
    assert first_line == "##fileformat=VCFv4.1", first_line
    assert len(header_lines) == 185, len(header_lines)
def test_read_flag():
    """INFO reports in_exac_flag exactly when the record text mentions it."""
    for variant in VCF(VCF_PATH):
        mentioned = "in_exac_flag" in str(variant)
        reported = variant.INFO.get('in_exac_flag', False)
        assert mentioned == reported
def test_init():
    """A VCF opened from the test path is truthy."""
    reader = VCF(VCF_PATH)
    assert reader
"""Filter variants with QUAL above or below cutoff""" ret = variant.QUAL >= cutoff return nonrev and ret for name, filt in filters.items(): if name in BUILTIN_FILTERS: if not isinstance(filt, tuple): filt = (filt, ) filters[name] = lambda variant: BUILTIN_FILTERS[name](variant, *filt) filters[name].__doc__ = BUILTIN_FILTERS[name].__doc__ else: filters[name] = eval(filt) filters[name].__doc__ = filter_descs.get(name, filt) invcf = VCF(infile) for name, filt in filters.items(): invcf.add_filter_to_header({ 'ID': name, 'Description': filt.__doc__, }) if outfile.endswith(".gz"): outvcf = Writer(outfile, invcf, "wz") else: outvcf = Writer(outfile, invcf) for variant in invcf: for name, filt in filters.items(): if not filt(variant): if not variant.FILTER:
########################################################################################################################
# Run
# Convert a VCF into per-chromosome mixqtl outputs: whenever the chromosome
# changes, the current outputs are closed and a new set is opened.
parser = argparse.ArgumentParser(description="Convert a vcf to mixqtl formats")
parser.add_argument("-vcf")
parser.add_argument("-output_prefix")
args = parser.parse_args()

folder = os.path.split(args.output_prefix)[0]
# A prefix without a directory part gives an empty folder name, which
# os.makedirs rejects; there is nothing to create in that case.
if folder and not os.path.exists(folder):
    os.makedirs(folder)

variant_annotation, hap_1, hap_2 = activate_output(args.output_prefix, current_chromosome, None)

vcf = VCF(args.vcf)
print("processing")
for i, variant in enumerate(vcf):
    chromosome = variant.CHROM
    if chromosome != current_chromosome:
        deactivate_output(variant_annotation, hap_1, hap_2)
        current_chromosome = chromosome
        variant_annotation, hap_1, hap_2 = activate_output(
            args.output_prefix, current_chromosome, vcf.samples)

    _write(variant_annotation, [
        variant.ID, variant.CHROM,
        str(variant.POS), variant.REF, variant.ALT[0]
    ])
def transcript_for_alt(transcripts, alt):
    """Return the transcript flagged PICK == '1' for `alt`, else the first one."""
    picked = next(
        (candidate for candidate in transcripts[alt] if candidate['PICK'] == '1'),
        None)
    if picked is not None:
        return picked
    return transcripts[alt][0]


def decode_hex(string):
    """Decode a regex match of percent-prefixed hex digits into UTF-8 text."""
    matched_text = string.group(0)
    digits = matched_text.replace('%', '')
    raw_bytes = binascii.unhexlify(digits)
    return raw_bytes.decode('utf-8')


# Command line: <script> <tsv> <vcf> <comma-separated VEP fields> <output dir>
(script, tsv_filename, vcf_filename, vep_fields, output_dir) = sys.argv
vep_fields_list = vep_fields.split(',')

vcf_file = VCF(vcf_filename)
csq_fields = parse_csq_header(vcf_file)

# vep[chromosome][position] -> per-site dictionary, created on first sight
vep = {}
for variant in vcf_file:
    chr = str(variant.CHROM)
    pos = str(variant.POS)
    ref = str(variant.REF)
    alts = variant.ALT

    vep.setdefault(chr, {}).setdefault(pos, {})
def process_vcf(self, inputfile):
    """
    Main function for parsing VCF

    Reads `inputfile` with cyvcf2, keeps biallelic SNPs whose FILTER is
    None, classifies each by mutation category plus sequence motif from the
    reference fasta, and accumulates genotype values into a matrix with one
    column per entry of `self.subtypes_dict`.

    Rows are the VCF samples, or the sample groups when both
    `self.args.samplefile` and `self.args.groupvar` are set.

    Returns a namedtuple `Out(M, samples)`, or only the matrix `M` when
    `self.par` is truthy.
    """
    # initialize reference genome
    fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)

    # initialize vcf reader
    if self.args.samplefile:
        keep_samples = parseSampleFile(self.args.samplefile)
        vcf_reader = VCF(inputfile, mode='rb', gts012=True, lazy=True, samples=keep_samples)
    else:
        vcf_reader = VCF(inputfile, mode='rb', gts012=True, lazy=True)

    # number of flanking bases on each side of the site within the motif
    nbp = (self.args.length - 1) // 2

    # index samples
    if (self.args.samplefile and self.args.groupvar):
        all_samples = vcf_reader.samples
        sg_dict = indexGroups(self.args.samplefile, self.args.groupvar)
        # matrix rows are the sorted, de-duplicated group names
        samples = sorted(list(set(sg_dict.values())))
        # get boolean vector of samples that are in sample file
        samples_keep_match = np.isin(all_samples, list(sg_dict.keys()))
        # get indices of matching samples
        samples_keep_idx = np.where(samples_keep_match)
        # get list of individual sample ids to keep
        samples_keep = sorted(list(set(sg_dict.keys())))
        util_log.debug("%s samples will be pooled into %s groups: %s", len(all_samples), len(samples), ",".join(samples))
    else:
        samples = vcf_reader.samples

    # sample (or group) name -> row index; not read again within this method
    samples_dict = {}
    for i, sample in enumerate(samples):
        samples_dict[sample] = i

    # Query records in VCF and build matrix
    M = np.zeros((len(samples), len(self.subtypes_dict)))
    numsites_keep = 0
    numsites_skip = 0
    # name of the chromosome whose sequence is currently loaded
    chrseq = '0'
    # "none", "add" or "delete": how to reconcile 'chr' prefixes with the fasta
    chr_check = "none"

    for record in vcf_reader:
        # Filter by SNP status, # alt alleles, and FILTER column
        if (not record.is_snp or len(record.ALT) != 1 or record.FILTER is not None):
            numsites_skip += 1
            continue

        # Filter by allele count
        # (chained comparison: skips only when maxac > 0 and AC exceeds it)
        if record.INFO['AC'] > self.args.maxac > 0:
            numsites_skip += 1
            continue

        row_chr = record.CHROM

        # check chromosome formatting matches between MAF and fasta files
        # (re-evaluated for every record until the first site has been kept)
        if numsites_keep == 0:
            if "chr1" in fasta_reader and "chr" not in row_chr:
                chr_check = "add"
                util_log.debug("formatting mismatch: 'chr' only in fasta file")
            elif "chr1" not in fasta_reader and "chr" in row_chr:
                chr_check = "delete"
                util_log.debug("formatting mismatch: 'chr' only in MAF file")
            else:
                util_log.debug("chromosome formatting matches")

        if chr_check == "add":
            row_chr = "chr" + row_chr
        elif chr_check == "delete":
            row_chr = row_chr.replace('chr', '')

        # load the chromosome sequence only when the chromosome changes
        if row_chr != chrseq:
            sequence = fasta_reader[row_chr]
            chrseq = row_chr

        # check and update chromosome sequence
        # if record.CHROM != chrseq:
        #     sequence = fasta_reader[record.CHROM]
        #     chrseq = record.CHROM

        # window of nbp bases on either side of the site
        lseq = sequence[record.POS - (nbp + 1):record.POS + nbp].seq

        mu_type = record.REF + str(record.ALT[0])
        category = getCategory(mu_type)
        motif_a = getMotif(lseq)
        subtype = str(category + "." + motif_a)

        if subtype not in self.subtypes_dict:
            numsites_skip += 1
            continue

        # column index of this subtype in M
        st = self.subtypes_dict[subtype]

        # currently only works with singletons--
        if (self.args.samplefile and self.args.groupvar):
            gt_new = record.gt_types
            # presumably 3 marks a missing genotype under gts012=True;
            # confirm against the cyvcf2 documentation
            if (self.args.impute and 3 in gt_new):
                gt_complete = gt_new[gt_new != 3]
                freq = sum(gt_complete) / len(gt_complete)
                gt_new[gt_new == 3] = freq
            else:
                gt_new[gt_new == 3] = 0

            # if not any("/" in b for b in record.gt_bases):
            if self.args.haploid:
                gt_new = np.divide(gt_new, 2.)

            # get array of genotypes only for samples in samplefile
            gt_sub = gt_new[samples_keep_idx]

            if gt_sub.sum() == 0:
                numsites_skip += 1
                continue

            # initialize dict of group allele counts = 0
            sg_counts = {k: 0 for k in sorted(list(set(sg_dict.values())))}

            # initialize dict of allele counts per sample
            # NOTE(review): samples_keep is sorted while gt_sub follows VCF
            # sample order; verify that the two orders agree.
            d2 = dict(zip(samples_keep, gt_sub))

            # iterate per-sample counts and update per-group counts
            for key, value in d2.items():
                sg_counts[sg_dict[key]] += value

            # add to matrix
            M[:, st] = M[:, st] + list(sg_counts.values())
            numsites_keep += 1

        else:
            gt_new = record.gt_types
            if (self.args.impute and 3 in gt_new):
                gt_complete = gt_new[gt_new != 3]
                freq = sum(gt_complete) / len(gt_complete)
                gt_new[gt_new == 3] = freq
            else:
                gt_new[gt_new == 3] = 0

            # if not any("/" in b for b in record.gt_bases):
            if self.args.haploid:
                gt_new = np.divide(gt_new, 2.)

            M[:, st] = M[:, st] + gt_new
            numsites_keep += 1
            # util_log.debug(gt_new)

        # progress message every 100000 kept sites
        if numsites_keep % 100000 != 0:
            continue
        util_log.debug("%s : %s sites counted", inputfile, numsites_keep)

    util_log.debug("%s : %s sites counted", inputfile, numsites_keep)
    util_log.debug("%s : %s sites skipped", inputfile, numsites_skip)

    out = collections.namedtuple('Out', ['M', 'samples'])(M, samples)

    # when self.par is truthy only the matrix is returned
    if self.par:
        out = M

    return out
action="store_true", help= "this adds special weighting to the X chromosome if you want to run the full model", default=False) args = parser.parse_args() cpg = args.cpg synonymous = args.synonymous nosingletons = args.nosingletons rfile = args.file varflag = args.varflag chromosomes = args.chromosomes exclude = args.exclude xweighted = args.xweighted gnomad = VCF('data/gnomad-vep-vt.vcf.gz') kcsq = gnomad["CSQ"]["Description"].split(":")[1].strip(' "').split("|") ys, genes = [], [] def syn_density(pairs, d, gnomad, kcsq, nosingletons, varflag): syn = 0 prevvar = None if varflag: if 'VARTRUE' in d[ 'varflag']: # don't need syn for a 0 bp region, i.e., variant, so give it the lowest possible, 0 return syn for pair in pairs: if varflag: r0 = str(int(pair[0]) + 1)
functional = args.functional
variants = args.variants


def isfunctional(csqs):
    """Return True when any comma-separated consequence in `csqs` is functional.

    Only the first '|'-separated field of each consequence is inspected. It
    counts as functional when it contains one of the listed effect terms, or
    when it contains a splice donor/acceptor term together with
    'coding_sequence'.
    """
    functional_terms = ('stop_gained', 'stop_lost', 'start_lost',
                        'initiator_codon', 'rare_amino_acid', 'missense',
                        'protein_altering', 'frameshift',
                        'inframe_insertion', 'inframe_deletion')
    for csq in csqs.split(","):
        eff = csq.strip("|").split("|", 2)[0]
        if any(term in eff for term in functional_terms):
            return True
        is_splice = 'splice_donor' in eff or 'splice_acceptor' in eff
        if is_splice and 'coding_sequence' in eff:
            return True
    return False


# Echo the header unchanged, then the records that pass the filters.
vcf = VCF(variants)
# sys.stdout.write emits the header without an extra newline and, unlike the
# Python 2 print statement, is valid syntax on both Python 2 and Python 3.
import sys
sys.stdout.write(vcf.raw_header)
for v in vcf:
    if functional:
        csq = v.INFO.get("BCSQ") or v.INFO.get("CSQ")
        if csq is None or not isfunctional(csq):
            continue
    if v.INFO.get("_exclude"):
        continue
    print(str(v).strip())
import sys

from cyvcf2 import VCF

# Print "CHROM:POS" for every site of the VCF given as the first argument
# that passes all of the INFO-based checks below.
kg_vcf = sys.argv[1]

pops = ["EUR", "AFR", "AMR", "SAS"]

good = 0
for v in VCF(kg_vcf):
    info = v.INFO
    if 'OLD_MULTIALLELIC' in info:
        continue
    if info['VT'] != 'SNP':
        continue
    if info['NS'] < 2500:
        continue
    if info['AF'] < 0.04:
        continue
    if info['AF'] > 0.95:
        continue
    # Only sites carrying the EX_TARGET key are kept.
    try:
        info['EX_TARGET']
    except KeyError:
        continue
    if not all(info[p + "_AF"] > 0.04 for p in pops):
        continue
    good += 1
    # A parenthesised single argument prints identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print("%s:%d" % (v.CHROM, v.POS))
def get_header(vcf, vep_field, vep_separator): index_dict = dict() if vep_field: for h in vcf.header_iter(): try: if h.info()['ID'] == vep_field: csq_header = h.info()['Description'].split(vep_separator) for elem in csq_header: index_dict[elem] = csq_header.index(elem) except: pass return index_dict vcf = VCF(sys.argv[1]) vcf.add_info_to_header({ 'ID': 'True_Label', 'Description': 'True_Label of the variation', 'Type': 'String', 'Number': '1', }) output = sys.argv[2] o = Writer(output, vcf) vep_field = sys.argv[3] vep_separator = sys.argv[4] index_dict = get_header(vcf, vep_field, vep_separator) for record in tqdm(vcf):
joined_freq.to_csv("final_file.csv") return joined_freq def impose_distance_requirement(sorted_vars, dist_bw_variants): # TODO print('No distance requirment imposed. Onwards!') return if __name__ == "__main__": filename = sys.argv[1] variant_positions = defaultdict(Variant) kmer_len = 3 if is_vcf(filename): # import vcf file for variant in VCF(filename): if is_quality_variant(variant): # join is required because 'ALT' is returned as a list variant_positions[variant.POS] = Variant(variant.REF, "".join(variant.ALT), variant.POS) saved_csv_name = "chr22_variant_singletons.csv" generate_csv_from_variants(variant_positions, outfile=saved_csv_name) variant_singletons = import_variants(saved_csv_name) else: variant_singletons = import_variants(filename) print("Variants imported and saved.") if len(sys.argv) > 1: # impose user supplied minimum distance between variants try: kmer_len = int(sys.argv[2]) # dist_bw_variants = int(sys.argv[2]) # if dist_bw_variants < 1: # raise ValueError("Minimum distance must be positive integer!")
print("Cannot find input file ", args.inf) sys.exit(1) vcf = cyvcf2.VCF(args.inf) # create a new vcf Writer using the input vcf as a template. w = Writer(f, vcf) # Create other output output = args.inf + ".stats" df = pd.DataFrame(columns=vcf.samples) #creates a new dataframe that's empty v = -1 for variant in VCF(args.inf): # or VCF('some.bcf') v = v + 1 alt = [item.encode('utf-8') for item in variant.ALT] #print(variant.REF, alt) # e.g. REF='A', ALT=['C', 'T'] # Somehow assessing the number of alternative alleles if (len(alt) == 1): pass # Multiple alternative alleles elif (len(alt) > 1): #print("Long",str(variant)) pass # No alernative variant - possibly a SN deletion elif (len(alt) < 1): #print("Null",str(variant)) pass
def test_format_field():
    """Every record exposes FORMAT as a list."""
    for variant in VCF(VCF_PATH):
        assert isinstance(variant.FORMAT, list)
def vcf2tsv(query_vcf, out_tsv, skip_info_data, skip_genotype_data, keep_rejected_calls, compress, print_data_type_header):
    """Convert the VCF `query_vcf` into the tab-separated file `out_tsv`.

    The output starts with a version comment line, optionally a line of
    column data types, then a column header line. Each VCF record becomes one
    row, or one row per sample when genotype data is included and the VCF has
    samples.

    Args:
        query_vcf: path of the VCF to read.
        out_tsv: path of the TSV file to write.
        skip_info_data: when False, INFO tags are written as columns.
        skip_genotype_data: when False, FORMAT tags and GT are written as
            columns, one row per sample.
        keep_rejected_calls: when falsy, records whose FILTER string does not
            contain 'PASS' are skipped, as are per-sample rows whose genotype
            is './.' or '.'.
        compress: when True, the finished file is gzipped via `gzip -f`.
        print_data_type_header: when True, a '#'-prefixed line with column
            data types is written before the column header.
    """
    vcf = VCF(query_vcf, gts012=True)
    # NOTE(review): the file is closed explicitly at the end of the function;
    # an exception raised before that point leaves it open.
    out = open(out_tsv, 'w')

    fixed_columns_header = [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER'
    ]
    fixed_columns_header_type = [
        'String', 'Integer', 'String', 'String', 'String', 'Float', 'String'
    ]
    samples = vcf.samples
    info_columns_header = []
    format_columns_header = []
    sample_columns_header = []
    # INFO/FORMAT tag ID -> declared VCF type
    column_types = {}
    # set to 1 when the header declares a FORMAT/GT tag
    gt_present_header = 0

    if len(samples) > 0:
        sample_columns_header.append('VCF_SAMPLE_ID')

    # Collect INFO and FORMAT tag names and types from the header
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element.keys() and 'HeaderType' in header_element.keys():
            if header_element['HeaderType'] == 'INFO' or header_element['HeaderType'] == 'FORMAT':
                column_types[header_element['ID']] = header_element['Type']
            if header_element['HeaderType'] == 'INFO':
                if skip_info_data is False:
                    info_columns_header.append(header_element['ID'])
            if header_element['HeaderType'] == 'FORMAT':
                if len(sample_columns_header) > 0 and skip_genotype_data is False:
                    if header_element['ID'] != 'GT':
                        format_columns_header.append(header_element['ID'])
                    else:
                        gt_present_header = 1

    # Column order: fixed columns, sorted INFO tags, sample id, sorted FORMAT
    # tags, then GT
    header_tags = fixed_columns_header
    if skip_info_data is False:
        header_tags = fixed_columns_header + sorted(info_columns_header)
        if len(sample_columns_header) > 0:
            if skip_genotype_data is False:
                header_tags = fixed_columns_header + sorted(info_columns_header) + sample_columns_header + sorted(format_columns_header) + ['GT']
            else:
                header_tags = fixed_columns_header + sorted(info_columns_header)
    else:
        if len(sample_columns_header) > 0:
            if skip_genotype_data is False:
                header_tags = fixed_columns_header + sample_columns_header + sorted(format_columns_header) + ['GT']
            else:
                header_tags = fixed_columns_header
    header_line = '\t'.join(header_tags)

    out.write('#https://github.com/sigven/vcf2tsv version=' + str(version) + '\n')
    if print_data_type_header is True:
        # Only tags with a known type contribute; the fixed columns use the
        # hard-coded type list above.
        header_types = []
        for h in header_tags:
            if h in column_types:
                header_types.append(str(column_types[h]))
        header_line_type = '\t'.join(fixed_columns_header_type + header_types)
        out.write('#' + str(header_line_type) + '\n')
        out.write(str(header_line) + '\n')
    else:
        out.write(str(header_line) + '\n')

    for rec in vcf:
        rec_id = '.'
        rec_qual = '.'
        rec_filter = '.'
        alt = ",".join(str(n) for n in rec.ALT)
        if not rec.ID is None:
            rec_id = str(rec.ID)
        if not rec.QUAL is None:
            rec_qual = str("{0:.2f}".format(rec.QUAL))
        rec_filter = str(rec.FILTER)
        # a FILTER of None is reported as PASS
        if rec.FILTER is None:
            rec_filter = 'PASS'

        # 1-based position derived from the record's start
        pos = int(rec.start) + 1
        fixed_fields_string = str(rec.CHROM) + '\t' + str(pos) + '\t' + str(rec_id) + '\t' + str(rec.REF) + '\t' + str(alt) + '\t' + str(rec_qual) + '\t' + str(rec_filter)

        if not 'PASS' in rec_filter and not keep_rejected_calls:
            continue

        variant_info = rec.INFO
        # INFO values for this record, in sorted tag order
        vcf_info_data = []
        if skip_info_data is False:
            for info_field in sorted(info_columns_header):
                if column_types[info_field] == 'Flag':
                    # flags become the strings 'True' / 'False'
                    if variant_info.get(info_field) is None:
                        vcf_info_data.append('False')
                    else:
                        vcf_info_data.append('True')
                elif column_types[info_field] == 'Float' or column_types[info_field] == 'Integer' or column_types[info_field] == 'String' or column_types[info_field] == 'Character':
                    # multi-valued tags are joined with commas
                    if type(variant_info.get(info_field)) is list or type(variant_info.get(info_field)) is tuple:
                        vcf_info_data.append(",".join(str(n) for n in variant_info.get(info_field)))
                    else:
                        if variant_info.get(info_field) is None:
                            vcf_info_data.append('.')
                        else:
                            if column_types[info_field] == 'Float':
                                if not isinstance(variant_info.get(info_field), float):
                                    # declared Float but parsed as something
                                    # else: warn and write a missing value
                                    print('vcf2tsv.py WARNING:\tINFO tag ' + str(info_field) + ' is defined in the VCF header as type \'Float\', yet parsed as other type:' + str(type(variant_info.get(info_field))))
                                    if not ',' in str(alt):
                                        print('Warning: Multiple values in INFO tag for single ALT allele (VCF multiallelic sites not decomposed properly?):' + str(fixed_fields_string) + '\t' + str(info_field) + '=' + str(variant_info.get(info_field)))
                                    vcf_info_data.append('.')
                                else:
                                    val = str("{0:.7f}".format(variant_info.get(info_field)))
                                    vcf_info_data.append(val)
                            else:
                                if column_types[info_field] == 'String' or column_types[info_field] == 'Character':
                                    if isinstance(variant_info.get(info_field), str):
                                        #print(str(info_field) + '\t' + variant_info.get(info_field).encode('ascii','ignore').rstrip().decode('ascii'))
                                        # non-ASCII characters are dropped
                                        vcf_info_data.append(variant_info.get(info_field).encode('ascii', 'ignore').decode('ascii'))
                                    else:
                                        vcf_info_data.append('.')
                                        if column_types[info_field] == 'String':
                                            print('vcf2tsv.py WARNING:\tINFO tag ' + str(info_field) + ' is defined in the VCF header as type \'String\', yet parsed as other type:' + str(type(variant_info.get(info_field))))
                                        if column_types[info_field] == 'Character':
                                            print('vcf2tsv.py WARNING:\tINFO tag ' + str(info_field) + ' is defined in the VCF header as type \'Character\', yet parsed as other type:' + str(type(variant_info.get(info_field))))
                                else:
                                    # remaining declared type: Integer
                                    if isinstance(variant_info.get(info_field), int):
                                        vcf_info_data.append(str(variant_info.get(info_field)))
                                    else:
                                        print('vcf2tsv.py WARNING:\tINFO tag ' + str(info_field) + ' is defined in the VCF header as type \'Integer\', yet parsed as other type:' + str(type(variant_info.get(info_field))))
                                        # NOTE(review): the pattern is a
                                        # non-raw string literal; the value is
                                        # assumed to be a str here - confirm.
                                        vcf_info_data.append(re.sub('\(|\)', '', variant_info.get(info_field).encode('ascii', 'ignore').decode('ascii')))

        #print(str(vcf_info_data))
        #dictionary, with sample names as keys, values being genotype data (dictionary with format tags as keys)
        vcf_sample_genotype_data = {}
        if len(samples) > 0 and skip_genotype_data is False:
            gt_cyvcf = rec.gt_types
            i = 0
            while i < len(samples):
                vcf_sample_genotype_data[samples[i]] = {}
                # gt_types 0/1/2 map to 0/0, 0/1, 1/1; anything else stays './.'
                gt = './.'
                if gt_present_header == 1:
                    if gt_cyvcf[i] == 0:
                        gt = '0/0'
                    if gt_cyvcf[i] == 1:
                        gt = '0/1'
                    if gt_cyvcf[i] == 2:
                        gt = '1/1'
                vcf_sample_genotype_data[samples[i]]['GT'] = gt
                i = i + 1

        for format_tag in sorted(format_columns_header):
            if len(samples) > 0 and skip_genotype_data is False:
                sample_dat = rec.format(format_tag)
                if sample_dat is None:
                    # tag absent from this record: '.' for every sample
                    k = 0
                    while k < len(samples):
                        if samples[k] in vcf_sample_genotype_data:
                            vcf_sample_genotype_data[samples[k]][format_tag] = '.'
                        k = k + 1
                    continue
                dim = sample_dat.shape
                j = 0
                ## sample-wise
                while j < dim[0]:
                    if sample_dat[j].size > 1:
                        # several values for this sample: comma-join them
                        d = ','.join(str(e) for e in np.ndarray.tolist(sample_dat[j]))
                        if samples[j] in vcf_sample_genotype_data:
                            vcf_sample_genotype_data[samples[j]][format_tag] = d
                    else:
                        # single value: only String and Integer tags are
                        # rendered, other types stay '.'
                        d = '.'
                        if column_types[format_tag] == 'String':
                            d = str(sample_dat[j])
                        if column_types[format_tag] == 'Integer':
                            d = str(sample_dat[j][0])
                        if samples[j] in vcf_sample_genotype_data:
                            vcf_sample_genotype_data[samples[j]][format_tag] = d
                    j = j + 1

        #print(str(vcf_sample_genotype_data))
        tsv_elements = []
        tsv_elements.append(fixed_fields_string)
        if skip_info_data is False:
            if skip_genotype_data is False:
                if len(sample_columns_header) > 0:
                    tsv_elements.append("\t".join(str(n) for n in vcf_info_data))
                    ## one line per sample variant
                    for s in sorted(vcf_sample_genotype_data.keys()):
                        sample = s
                        line_elements = []
                        line_elements.extend(tsv_elements)
                        line_elements.append(sample)
                        gt_tag = '.'
                        # FORMAT values in sorted tag order; GT goes last
                        for tag in sorted(vcf_sample_genotype_data[sample].keys()):
                            if tag != 'GT':
                                line_elements.append(vcf_sample_genotype_data[sample][tag].encode('ascii', 'ignore').decode('ascii'))
                            else:
                                gt_tag = vcf_sample_genotype_data[sample][tag].encode('ascii', 'ignore').decode('ascii')
                        line_elements.append(gt_tag)
                        # rows without a called genotype are written only
                        # when rejected calls are kept
                        if gt_tag == './.' or gt_tag == '.':
                            if keep_rejected_calls:
                                out.write('\t'.join(line_elements) + '\n')
                        else:
                            out.write("\t".join(str(n) for n in line_elements) + '\n')
                else:
                    tsv_elements.append("\t".join(str(n) for n in vcf_info_data))
                    line_elements = []
                    line_elements.extend(tsv_elements)
                    out.write('\t'.join(line_elements) + '\n')
            else:
                tsv_elements.append("\t".join(str(n) for n in vcf_info_data))
                line_elements = []
                line_elements.extend(tsv_elements)
                out.write('\t'.join(line_elements) + '\n')
        else:
            if skip_genotype_data is False:
                if len(sample_columns_header) > 0:
                    ## one line per sample variant
                    for s in sorted(vcf_sample_genotype_data.keys()):
                        sample = s
                        line_elements = []
                        line_elements.extend(tsv_elements)
                        line_elements.append(sample)
                        gt_tag = '.'
                        for tag in sorted(vcf_sample_genotype_data[sample].keys()):
                            if tag != 'GT':
                                line_elements.append(vcf_sample_genotype_data[sample][tag])
                            else:
                                gt_tag = vcf_sample_genotype_data[sample][tag]
                        line_elements.append(gt_tag)
                        if gt_tag == './.' or gt_tag == '.':
                            if keep_rejected_calls:
                                out.write('\t'.join(line_elements) + '\n')
                        else:
                            out.write('\t'.join(line_elements) + '\n')
            else:
                # neither INFO nor genotype data: fixed columns only
                line_elements = []
                line_elements.extend(tsv_elements)
                line_elements = tsv_elements
                out.write('\t'.join(line_elements) + '\n')

    out.close()

    if compress is True:
        command = 'gzip -f ' + str(out_tsv)
        check_subprocess(command)
def test_snpeff_header():
    """The SnpEffVersion header entry can be looked up and is not empty."""
    reader = VCF(VCF_PATH2)
    entry = reader['SnpEffVersion']
    assert entry != {}, entry
    assert 'SnpEffVersion' in entry
def verify_pcgr_input(pcgr_directory, input_vcf, input_cna, tumor_dp_tag,
                      tumor_af_tag, normal_dp_tag, normal_af_tag,
                      call_conf_tag):
    """
    Function that reads the input files to PCGR (VCF file and Tab-separated values file with copy number segments) and performs the following checks:
    1. Check that VCF file is properly formatted (according to EBIvariation/vcf-validator - VCF v4.2)
    2. Check that no INFO annotation tags in the query VCF coincides with those generated by PCGR
    3. Check that provided columns for tumor/normal coverage and allelic depths are found in VCF
    4. Check that if VCF have variants with multiple alternative alleles (e.g. 'A,T') run vt decompose
    5. Check that copy number segment file has required columns and correct data types (and range)
    6. Any genotype data from VCF input file is stripped, and the resulting VCF file is sorted and indexed (bgzip + tabix)

    Args:
        pcgr_directory: forwarded to check_existing_vcf_info_tags.
        input_vcf: path to the query VCF (.vcf or .vcf.gz); the literal
            string 'None' skips all VCF processing.
        input_cna: path to the copy-number segment file; the literal string
            'None' skips the segment check.
        tumor_dp_tag, tumor_af_tag, normal_dp_tag, normal_af_tag,
        call_conf_tag: forwarded unchanged to check_ad_dp_tags.

    Returns:
        -1 if VCF validation fails or check_existing_vcf_info_tags returns -1;
        otherwise the return value of is_valid_cna_segment_file when
        input_cna is not 'None'; otherwise 0.
    """
    logger = pcgrutils.getlogger('pcgr-check-input')
    # Output names are derived from the input basename; all outputs are
    # written below the hard-coded /workdir/output/ directory.
    input_vcf_pcgr_ready = '/workdir/output/' + re.sub(
        r'(\.vcf$|\.vcf\.gz$)', '.pcgr_ready.tmp.vcf',
        os.path.basename(input_vcf))
    input_vcf_pcgr_ready_decomposed = '/workdir/output/' + re.sub(
        r'(\.vcf$|\.vcf\.gz$)', '.pcgr_ready.vcf',
        os.path.basename(input_vcf))

    if not input_vcf == 'None':
        logger.info('Validating VCF file with EBIvariation/vcf-validator')
        vcf_validation_output_file = '/workdir/output/' + re.sub(
            r'(\.vcf$|\.vcf\.gz$)', '.vcf_validator_output',
            os.path.basename(input_vcf))
        # NOTE(review): every shell command in this function is built by
        # string concatenation with unquoted paths and run via os.system;
        # paths containing spaces or shell metacharacters will break (or be
        # interpreted by the shell), and exit statuses are not checked.
        command_v42 = 'vcf_validator --input ' + str(
            input_vcf) + ' --version v4.2 > ' + str(vcf_validation_output_file)
        if input_vcf.endswith('.gz'):
            # Compressed input is streamed to the validator through bgzip.
            command_v42 = 'bgzip -dc ' + str(
                input_vcf) + ' | vcf_validator --version v4.2 > ' + str(
                    vcf_validation_output_file)
        os.system(command_v42)
        validation_results = is_valid_vcf(vcf_validation_output_file)
        if not validation_results['validation_status']:
            error_string_42 = '\n'.join(validation_results['error_messages'])
            validation_status = 'VCF file is NOT valid according to v4.2 specification'
            logger.error(validation_status + ':\n' + str(error_string_42))
            return -1
        else:
            validation_status = 'VCF file ' + str(
                input_vcf) + ' is valid according to v4.2 specification'
            logger.info(validation_status)
            tag_check = check_existing_vcf_info_tags(input_vcf, pcgr_directory,
                                                     logger)
            if tag_check == -1:
                return -1
            else:
                logger.info(
                    'No query VCF INFO tags coincide with PCGR INFO tags')

        # Always true here: the invalid case returned above.
        if validation_results['validation_status']:
            multiallelic_alt = 0
            vcf = VCF(input_vcf)
            # NOTE(review): the return value of check_ad_dp_tags is ignored
            # here -- confirm whether a failure should abort like tag_check.
            check_ad_dp_tags(vcf, tumor_dp_tag, tumor_af_tag, normal_dp_tag,
                             normal_af_tag, call_conf_tag, logger)
            # Scan all records to find out whether decomposition is needed.
            for rec in vcf:
                POS = rec.start + 1  # rec.start is 0-based; report 1-based
                alt = ",".join(str(n) for n in rec.ALT)
                if len(rec.ALT) > 1:
                    logger.warning("Multiallelic site detected:" +
                                   str(rec.CHROM) + '\t' + str(POS) + '\t' +
                                   str(rec.REF) + '\t' + str(alt))
                    multiallelic_alt = 1

            # Build a sample-free VCF (columns 1-8 only) in five appends:
            #   1. '##' meta lines, 2. the '#CHROM' line cut to 8 columns,
            #   3. records on numeric chromosomes (numeric sort),
            #   4. records on chromosomes starting with X/Y/M,
            #   5. all remaining records.
            # A leading 'chr' prefix is stripped from record lines.
            command_vcf_sample_free1 = 'egrep \'^##\' ' + str(
                input_vcf) + ' > ' + str(input_vcf_pcgr_ready)
            command_vcf_sample_free2 = 'egrep \'^#CHROM\' ' + str(
                input_vcf) + ' | cut -f1-8 >> ' + str(input_vcf_pcgr_ready)
            command_vcf_sample_free3 = 'egrep -v \'^#\' ' + str(
                input_vcf
            ) + ' | sed \'s/^chr//\' | cut -f1-8 | egrep \'^[0-9]\' | sort -k1,1n -k2,2n -k3,3 -k4,4 >> ' + str(
                input_vcf_pcgr_ready)
            command_vcf_sample_free4 = 'egrep -v \'^#\' ' + str(
                input_vcf
            ) + ' | sed \'s/^chr//\' | cut -f1-8 | egrep -v \'^[0-9]\' | egrep \'^[XYM]\' | sort -k1,1 -k2,2n -k3,3 -k4,4 >> ' + str(
                input_vcf_pcgr_ready)
            command_vcf_sample_free5 = 'egrep -v \'^#\' ' + str(
                input_vcf
            ) + ' | sed \'s/^chr//\' | cut -f1-8 | egrep -v \'^[0-9]\' | egrep -v \'^[XYM]\' | sort -k1,1 -k2,2n -k3,3 -k4,4 >> ' + str(
                input_vcf_pcgr_ready)
            if input_vcf.endswith('.gz'):
                # Same five commands, reading the compressed input via bgzip.
                command_vcf_sample_free1 = 'bgzip -dc ' + str(
                    input_vcf) + ' | egrep \'^##\' > ' + str(
                        input_vcf_pcgr_ready)
                command_vcf_sample_free2 = 'bgzip -dc ' + str(
                    input_vcf) + ' | egrep \'^#CHROM\' | cut -f1-8 >> ' + str(
                        input_vcf_pcgr_ready)
                command_vcf_sample_free3 = 'bgzip -dc ' + str(
                    input_vcf
                ) + ' | egrep -v \'^#\' | sed \'s/^chr//\' | cut -f1-8 | egrep \'^[0-9]\' | sort -k1,1n -k2,2n -k3,3 -k4,4 >> ' + str(
                    input_vcf_pcgr_ready)
                command_vcf_sample_free4 = 'bgzip -dc ' + str(
                    input_vcf
                ) + ' | egrep -v \'^#\' | sed \'s/^chr//\' | cut -f1-8 | egrep -v \'^[0-9]\' | egrep \'^[XYM]\' | sort -k1,1 -k2,2n -k3,3 -k4,4 >> ' + str(
                    input_vcf_pcgr_ready)
                command_vcf_sample_free5 = 'bgzip -dc ' + str(
                    input_vcf
                ) + ' | egrep -v \'^#\' | sed \'s/^chr//\' | cut -f1-8 | egrep -v \'^[0-9]\' | egrep -v \'^[XYM]\' | sort -k1,1 -k2,2n -k3,3 -k4,4 >> ' + str(
                    input_vcf_pcgr_ready)

            # Order matters: command 1 truncates the file, 2-5 append to it.
            os.system(command_vcf_sample_free1)
            os.system(command_vcf_sample_free2)
            os.system(command_vcf_sample_free3)
            os.system(command_vcf_sample_free4)
            os.system(command_vcf_sample_free5)

            if multiallelic_alt == 1:
                logger.info(
                    'Decomposing multi-allelic sites in input VCF file using \'vt decompose\''
                )
                command_decompose = 'vt decompose -s ' + str(
                    input_vcf_pcgr_ready) + ' > ' + str(
                        input_vcf_pcgr_ready_decomposed
                    ) + ' 2> /workdir/output/decompose.log'
                os.system(command_decompose)
            else:
                # Nothing to decompose: the temporary file is the final file.
                command_copy = 'cp ' + str(input_vcf_pcgr_ready) + ' ' + str(
                    input_vcf_pcgr_ready_decomposed)
                os.system(command_copy)
            # Compress and index the final VCF, then remove intermediates.
            os.system('bgzip -f ' + str(input_vcf_pcgr_ready_decomposed))
            os.system('tabix -p vcf ' + str(input_vcf_pcgr_ready_decomposed) +
                      '.gz')
            os.system('rm -f ' + str(input_vcf_pcgr_ready) +
                      ' /workdir/output/decompose.log')

    if not input_cna == 'None':
        ret = is_valid_cna_segment_file(input_cna, logger)
        return ret

    return 0
def test_iterate():
    """Iterating over the reader yields exactly 115 records."""
    for n_seen, _record in enumerate(VCF(VCF_PATH), start=1):
        continue
    assert n_seen == 115, n_seen
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populating FORMAT/AF, and dropping variants with AF<min_allele_fraction

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP                {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP                TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(AD)           AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(AD)           AD = ref_count,alt_counts

    Args:
        paired: paired tumor/normal description (its ``tumor_data`` is used),
            or a falsy value for germline calling.
        items: list of sample data dicts; ``items[0]`` is used when not paired.
        in_file: input VCF path.
        out_file: requested output path; the uncompressed ``.vcf`` sibling is
            written and then handed to ``vcfutils.bgzip_and_index``.

    Returns:
        Whatever ``vcfutils.bgzip_and_index`` returns for the annotated file.
    """
    data = paired.tumor_data if paired else items[0]
    # min_allele_fraction is configured as a percentage; default 10%.
    min_freq = float(
        utils.get_in(data["config"], ("algorithm", "min_allele_fraction"),
                     10)) / 100.0
    logger.debug(
        "Filtering Strelka2 calls with allele fraction threshold of %s" %
        min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(
            ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description':
                'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'
            })
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description':
                'Allele frequency is lower than %s%% ' % (min_freq * 100) +
                ('(configured in bcbio as min_allele_fraction)'
                 if utils.get_in(data["config"],
                                 ("algorithm", "min_allele_fraction")) else
                 '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)'
                 )
            })
            w = Writer(tx_out_file, vcf)
            try:
                tumor_index = vcf.samples.index(data['description'])
                for rec in vcf:
                    if paired:  # somatic?
                        if rec.is_snp:  # snps?
                            # {ALT}U=tier1_depth,tier2_depth
                            alt_counts = rec.format(rec.ALT[0] + 'U')[:, 0]
                        else:  # indels
                            # TIR=tier1_depth,tier2_depth
                            alt_counts = rec.format('TIR')[:, 0]
                        dp = rec.format('DP')[:, 0]
                    else:
                        ad = rec.format('AD')
                        if ad is not None:  # germline?
                            alt_counts = ad[:, 1:]  # AD=REF,ALT1,ALT2,...
                            # Keep the per-sample depth as a column vector so
                            # it divides each sample's row of alt counts; a
                            # flat (n_samples,) vector would broadcast along
                            # the ALT axis instead and give wrong values (or
                            # fail) with more than one sample.
                            dp = np.sum(ad, axis=1, keepdims=True)
                        else:  # germline gVCF record
                            alt_counts, dp = (None, None)
                    if dp is not None:
                        # ignore division by zero and put AF=.0
                        with np.errstate(divide='ignore', invalid='ignore'):
                            af = np.true_divide(alt_counts, dp)
                        af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                        rec.set_format('AF', af)
                        if paired and np.all(af[tumor_index] < min_freq):
                            vcfutils.cyvcf_add_filter(rec, 'MinAF')
                    w.write_record(rec)
            finally:
                # Always release the writer, even if a record fails.
                w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
from cyvcf2 import VCF
import sys
import numpy as np
import gzip
import csv
import glob
import os
import fnmatch
from collections import defaultdict, Counter

# Open the VCF given as the first command-line argument. A comment elsewhere
# in this file states that with gts012=True, gt_types is encoded as
# 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN.
vcf = VCF(sys.argv[1], gts012=True)

samps = list(vcf.samples)
# Per-sample accumulators, one slot per sample in VCF order.
s_counts_snp = np.zeros((len(vcf.samples)))
s_counts_indel = np.zeros((len(vcf.samples)))

# Lookup tables between sample name and column index.
idx2samp = dict(zip(range(len(vcf.samples)), vcf.samples))
samp2idx = dict(zip(vcf.samples, range(len(vcf.samples))))

ii = np.arange(len(vcf.samples))
### TODO: make sure there are intervals for the ends of chromosomes!
for i, v in enumerate(vcf):
    # Progress report every 10,000 variants.
    if i % 10000 == 0:
        print('done with {} variants, on {}:{}'.format(i, v.CHROM, v.POS))
        print(np.sum(s_counts_snp))
    # Only sites where call_rate is exactly 1 are considered.
    if v.call_rate != 1.:
        continue
    gts = v.gt_types
    # NOTE(review): with call_rate == 1 enforced above, no genotype should be
    # 3 (UNKNOWN), so unk_gts is presumably always empty -- confirm.
    unk_gts = np.where(gts == 3)
    unique, counts = np.unique(gts, return_counts=True)
vars['novelsv'] = []

# Read the known-gene list: a tab-separated file whose first column holds a
# gene name, optionally followed by an "_exon_<n>" suffix that is removed.
knowngenelist = set()
# Compiled once (not per row); raw string avoids the invalid "\d" escape.
p = re.compile(r"_exon_\d+")
with open(genelist, 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        if not row:
            # Blank lines produce an empty row; skip instead of IndexError.
            continue
        l = p.sub("", row[0])
        knowngenelist.add(l)

# get gene-level variants
genevcf = VCF(genevcf_file)
genecsq_fields = parse_csq_header(genevcf)
for variant in genevcf:
    csq = variant.INFO.get('CSQ')
    if csq is None:
        # The record carries no CSQ annotation; abort the whole run.
        sys.exit("No VEP fields")
    transcripts = list(
        parse_csq_entries(csq.split(','), genecsq_fields).items())[0][
            1]  # just get the first allele in the list.
    genes = get_csq_entries_bygene(transcripts)
    vartype = ''
if __name__ == '__main__': args = docopt(__doc__, version='1.0') #print(args) if (args['--format']): ShowFormat() sys.exit(-1) MISS_THRESHOLD = float(args['-m']) # API and example # https://brentp.github.io/cyvcf2/ # https://brentp.github.io/cyvcf2/docstrings.html#api invcf = VCF( '/dev/stdin', lazy=True, gts012=True ) # if gts012=True, then gt_types will be 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN. # invcf = VCF('test.vcf.gz', lazy=True) # adjust the header to contain the new field # the keys 'ID', 'Description', 'Type', and 'Number' are required. invcf.add_filter_to_header({ 'ID': 'VCFSiteMissingFilter.py', 'Description': 'Exclude the site with missing rate higher than > ' + str(MISS_THRESHOLD) }) # create a new vcf Writer using the input vcf as a template.
def test_empty_info():
    """Converting INFO to a dict must not fail for any record."""
    reader = VCF(VCF_PHASE_PATH)
    for record in reader:
        dict(record.INFO)
def main():
    """Parse the command line and write a VCF as a tab-separated table.

    Reads the VCF named by --vcf with cyvcf2 and writes to --out a header
    line followed by the per-variant rows produced by process_format_wide or
    process_format_long. Progress messages are printed to stdout.

    Side effects: sets the module globals ``args`` and ``gtbase``.
    """
    # Step 01: define argument variables
    parser = argparse.ArgumentParser()
    parser.add_argument("--vcf", help="Sorted VCF file as input", required=True)
    parser.add_argument(
        "--out",
        help="Name of the output file that contains simplified VCF as table.",
        required=True)
    parser.add_argument(
        "--samples",
        help="SAMPLE of interest; write as comma separated names, "
        "for e.g: 'sampleA,sampleB' or 'all'.",
        default='all')
    # NOTE(review): the help example uses 'CHR', but the lookup further down
    # is all_header.index(name) on a list that contains 'CHROM'.
    parser.add_argument(
        "--pre_header",
        help=
        "Comma separated pre-header fields before the 'INFO' field in the input VCF file. "
        "Write as comma separated fields, for e.g: 'CHR,POS,ID' or 'all'. "
        "Default: 'all'. ",
        default='all')
    parser.add_argument(
        "--infos",
        help="INFO tags that are of interest; write as comma separated tags; "
        "for e.g: 'AC,AF,AN' or 'all'.",
        default='all')
    parser.add_argument(
        "--formats",
        help="FORMAT tags that are of interest; for e.g: 'GT,PG,PI' or 'all'.",
        default='all')
    parser.add_argument("--keep_header",
                        default='no',
                        help="Keep the HEADER data in the output file."
                        "Options: 'yes' or 'no' ")
    parser.add_argument("--mode",
                        help="Structure of the output table."
                        "Options: wide(0), long(1). Default: 0 .",
                        required=False,
                        default=0)
    parser.add_argument("--gtbase",
                        help="write the GT field as IUPAC base code."
                        "Options: no(0), yes(1). Default: 0 .",
                        required=False,
                        default=0)

    global args  # creating a global argument variable
    args = parser.parse_args()

    global gtbase

    # Step 02: Now, pipe the "input arguments" to a variable
    pre_header = args.pre_header
    info_of_interest = args.infos
    sample_of_interest = args.samples
    format_of_interest = args.formats
    keep_header = args.keep_header

    # Only '1' or 'long' selects long mode; every other value (including the
    # integer default 0) falls through to 'wide'.
    if args.mode == '1' or args.mode == 'long':
        mode = 'long'
    else:
        mode = 'wide'

    # Only '1' or 'yes' enables gtbase; anything else means 'no'.
    if args.gtbase == '1' or args.gtbase == 'yes':
        gtbase = 'yes'
    else:
        gtbase = 'no'

    # Step 03: Read vcf file using cyvcf2 and start mining the data
    start_time01 = time.time()

    with open(args.out, 'w') as write_block:
        vcf_file = VCF(args.vcf)
        sample_ids = vcf_file.samples
        print('- %i samples found.' % len(sample_ids))
        print()

        # mining header
        # add argument to keep or discard header while writing output file
        header = vcf_file.raw_header.split('\n')
        if keep_header == 'yes':
            write_block.write(vcf_file.raw_header)
            print()

        # Step 03-A: now, write the appropriate front part of the header
        all_header = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']
        if pre_header != 'all':
            output_header = pre_header.split(',')
        else:
            # NOTE(review): output_header aliases all_header here, so the
            # in-place extensions below also grow all_header. all_header is
            # only indexed again when pre_header != 'all', so this is
            # harmless today but fragile.
            output_header = all_header

        # Step 03-B: mine fields that are of interest - for rear part of the header and for computing values from variants.
        # all data are returned as list from the defined function
        # the output data are: sample names, infos and formats of interest which can be used in downstream analyses
        my_samples, my_infos, my_formats = process_fields_of_interest(
            header, sample_of_interest, info_of_interest, format_of_interest,
            sample_ids)

        # create empty dictionary - to store Key, Values from FORMAT field of each sample
        format_ids_values = collections.OrderedDict()
        for ks in my_formats:
            format_ids_values[ks] = None

        # Column index of every requested sample in the VCF sample order.
        my_sample_idx = [sample_ids.index(x)
                         for x in my_samples]  # we can move this elsewhere

        ## Step 03-C: add tags from the "INFO" of interest to the header
        output_header += my_infos

        ## Step 03-D: Decide between "long" vs. "wide" way of representing FORMAT field values
        # Now, simplify the TAGS from the FORMAT field by assigning it to each SAMPLE names

        # add the tags from the "FORMAT" field as it is. Each sample is represented in different line
        if mode == 'long':
            output_header.append('SAMPLE')
            output_header.extend(my_formats)

            # write the final header to the output file
            print('\t'.join(output_header), file=write_block)

        elif mode == 'wide':
            # add the tags from the "FORMAT" field as suffix to the sample of interest (on output header)
            for name in my_samples:
                for tags in my_formats:
                    output_header.append(name + '_' + tags)

            # write the final header to the output file
            print('\t'.join(output_header), file=write_block)

        print()
        print('- Output header of the simplified VCF file:')
        print(output_header)
        print()

        chr_on_process = ''
        ''' Step 04: now, start parsing the VCF file using cyVCF2 and add the data for each header fields'''
        print('Reading the input vcf file ... ')
        print()

        for variant in vcf_file:
            line_to_write = ''  # create new empty variable

            contig = str(variant.CHROM)

            # Announce each contig once, when it is first encountered.
            if chr_on_process != contig:
                print('Contig %s is being processed ... ' % str(contig))
                print()
                chr_on_process = contig

            # Step 04-A : Write the values for the pre-fields of the header (i.e CHR upto FILTER)
            # If user desires less number of fields that is also achievable
            # The first seven tab-separated columns of the record's text form.
            chr_to_filter = str(variant).split('\t')[0:7]

            # to store the data when not all pre-headers are of interest
            pre_header_dict = collections.OrderedDict()

            if pre_header == 'all':
                line_to_write += '\t'.join(chr_to_filter)

            elif pre_header != 'all':
                # Pick the requested columns by their position in all_header.
                for idx, heads in enumerate(pre_header.split(',')):
                    pre_header_dict[heads] = chr_to_filter[all_header.index(
                        heads)]

                line_to_write += '\t'.join(pre_header_dict.values())

            # Step 04-B: compute values for the INFO tags of interest
            infos_to_write = process_info(my_infos, variant)
            line_to_write += '\t' + infos_to_write

            # Step 04-C: compute values for the FORMAT fields of interest for each SAMPLE names of interest
            # so, we need to use both format_fields and sample_names together
            # and pass it to a defined function
            # Both helpers receive write_block together with the row prefix
            # built so far; nothing else is written to the file in this loop.
            if mode == 'wide':
                process_format_wide(variant, my_sample_idx, format_ids_values,
                                    write_block, line_to_write)

            elif mode == 'long':
                process_format_long(variant, my_sample_idx, format_ids_values,
                                    write_block, line_to_write, sample_ids)

        print('Elapsed time: ', time.time() - start_time01)
        print()
def test_samples():
    """The test VCF declares 189 samples."""
    sample_names = VCF(VCF_PATH).samples
    assert len(sample_names) == 189
def vcf2pd(vcf_in):
    '''
    Read a VCF file and return its content as a pandas DataFrame.

    Columns are the seven fixed fields (CHROM..FILTER), one column per INFO
    id declared in the header, and one ``<sample>_<FORMAT id>`` column per
    sample and FORMAT id. If the header carries ``##tumor_sample=`` /
    ``##normal_sample=`` lines, the named samples are labelled ``TUMOR`` /
    ``NORMAL`` in the column names.

    :param vcf_in: VCF file (.vcf/.vcf.gz/.bcf)
    :return: pandas dataframe; when the file has no records, an empty frame
             with only the seven fixed columns
    '''
    vcf = VCF(vcf_in, gts012=True)

    # Building a list of row dicts and converting once at the end is the
    # fastest way to assemble the frame.
    lRows = []
    lInfo = []  # INFO ids declared in the header
    lFormat = []  # FORMAT ids declared in the header
    samples = vcf.samples
    n_samples = len(samples)

    # Collect the INFO and FORMAT ids from the header.
    for h in vcf.header_iter():
        if (h['HeaderType'] == 'INFO'):
            lInfo.append(h['ID'])
        if (h['HeaderType'] == 'FORMAT'):
            lFormat.append(h['ID'])

    # Look for tumor/normal sample declarations (present e.g. in Mutect2
    # output) and relabel the matching samples. Each pattern is searched once.
    raw_header = vcf.raw_header
    for pattern, label in (('##tumor_sample=.*', 'TUMOR'),
                           ('##normal_sample=.*', 'NORMAL')):
        found = re.search(pattern, raw_header)
        if found is not None:
            samples[samples.index(found.group().split('=')[1])] = label

    for v in vcf:
        # Store the fixed fields.
        dVariant = {
            'CHROM': v.CHROM,
            'POS': v.POS,
            'ID': v.ID,
            'REF': v.REF,
            'ALT': ','.join(v.ALT),
            'QUAL': v.QUAL,
            'FILTER': v.FILTER
        }
        # cyvcf2 reports a passing FILTER as None; turn it back into 'PASS'.
        if not dVariant['FILTER']:
            dVariant['FILTER'] = 'PASS'

        # INFO fields
        for i in lInfo:
            dVariant[i] = v.INFO.get(i)

        # FORMAT fields. The per-record arrays are fetched once per field
        # instead of once (or more) per sample.
        if n_samples:
            for f in lFormat:
                if f == 'GT':
                    # Use gt_types rather than format('GT'):
                    # 0 = hom_ref, 1 = het, 2 = hom_alt, 3 = unknown.
                    gt_types = v.gt_types
                    for i in range(n_samples):
                        dVariant[samples[i] + '_' + f] = gt_types[i]
                    continue

                values = v.format(f)
                if values is None:  # field absent from this record
                    continue
                for i in range(n_samples):
                    value = values[i]
                    key = samples[i] + '_' + f
                    if isinstance(value, str):
                        # Keep strings whole; joining would split them into
                        # comma-separated letters.
                        dVariant[key] = str(value)
                    elif np.isnan(value).any():
                        dVariant[key] = None
                    else:
                        dVariant[key] = ','.join(list(map(str, value)))

        lRows.append(dVariant)

    cols = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']
    cols.extend(lInfo)
    cols.extend([s + '_' + f for s in samples for f in lFormat])

    if lRows:
        df = pd.DataFrame(lRows, columns=cols)
    else:
        df = pd.DataFrame(
            columns=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER'])

    return df
def test_next():
    """next() on a reader returns a Variant instance."""
    reader = VCF(VCF_PATH)
    first_record = next(reader)
    assert isinstance(first_record, Variant)
def run(inheritance_model, ped, vcf, min_depth, min_gq, min_kindreds, severity):
    """Write variants matching an inheritance model, grouped by gene, to stdout.

    Consecutive variants that map to the same gene are evaluated per family;
    when enough distinct families match, every matching variant is written
    with an ``inheritance`` INFO value of ``<gene>:<family ids>``.

    Args:
        inheritance_model: name of the EvalFamily method to call for each
            family (looked up with getattr).
        ped: pedigree input passed to ``Family.from_ped``.
        vcf: path of the VCF to read.
        min_depth, min_gq: forwarded to the inheritance-model method.
        min_kindreds: minimum number of distinct matching families required
            for a gene's variants to be written.
        severity: collection of accepted ``impact_severity`` values, or None
            to accept all.
    """
    from cyvcf2 import VCF, Writer
    vcf = VCF(vcf, samples="-")

    # Field names of each annotation format, parsed from the Description of
    # its header entry.
    annos = {}
    if "ANN" in vcf:
        desc = vcf["ANN"]["Description"]
        parts = [x.strip("\"'") for x in
                 re.split(r"\s*\|\s*", desc.split(":", 1)[1].strip('" '))]
        # Store the parsed field names (as the EFF/CSQ branches do), not the
        # raw description string.
        annos["ANN"] = parts
    if "EFF" in vcf:
        desc = vcf["EFF"]["Description"]
        parts = [x.strip(" [])'(\"") for x in
                 re.split(r"\||\(", desc.split(":", 1)[1].strip())]
        annos["EFF"] = parts
    if "CSQ" in vcf:
        desc = vcf["CSQ"]["Description"]
        parts = [x.strip(" [])'(\"") for x in
                 re.split(r"\||\(", desc.split(":", 1)[1].strip())]
        annos["CSQ"] = parts

    vcf.update(id="inheritance", type="String", number="1",
               description="inheritance stuffs")
    out = Writer("-", vcf)

    vcf_order = dict((n, i) for i, n in (enumerate(vcf.samples)))
    fams = Family.from_ped(ped, order=vcf_order)
    # Pair each family evaluator with the sample indexes of its subjects.
    for fam_id in fams:
        fams[fam_id] = (EvalFamily(fams[fam_id]),
                        [s._i for s in fams[fam_id].subjects])

    def get_gene(variant):
        """Return the gene of the highest-sorting effect, or None."""
        for anno in annos:
            consequences = variant.INFO[anno].split(",")
            effs = (Effect.new(anno, c, annos[anno]) for c in consequences)
            # limit to requested severity
            if severity is not None:
                effs = [e for e in effs if e.impact_severity in severity]
            effs = sorted(effs, reverse=True)
            for eff in effs:
                if eff.gene:
                    return eff.gene

    try:
        # TODO: more flexible groupby
        # groupby only merges *adjacent* variants with the same gene.
        for gene, variants in it.groupby(vcf, get_gene):
            matching_fams = defaultdict(list)
            saved_vars = []
            uniq_fams = set()
            for i, variant in enumerate(variants):
                saved_vars.append(variant)
                for family_id, (fam, idxs) in fams.items():
                    fam.gt_types = variant.gt_types[idxs]
                    fam.gt_depths = variant.gt_depths[idxs]
                    fam.gt_quals = variant.gt_quals[idxs]

                    # this dispatches to fam.auto_rec/auto_dom/de_novo/, etc.
                    # by the string in inheritance model
                    res = getattr(fam, inheritance_model)(min_depth=min_depth,
                                                          min_gq=min_gq)

                    # matched the inheritance model.
                    if res:
                        # can add custom logic here, e.g. and v.call_rate > 0.9:
                        matching_fams[i].append(family_id)
                        uniq_fams.add(family_id)

            # At least one family, and at least min_kindreds distinct ones.
            n_fams = len(uniq_fams)
            if n_fams > 0 and n_fams >= min_kindreds:
                if inheritance_model == 'comp_het':
                    # TODO: idxs = matching_fams.keys()
                    # run idxs[1:] vs idxs[:-1] for variants
                    pass
                for i, family_ids in sorted(matching_fams.items()):
                    variant = saved_vars[i]
                    # Sorted so the output is reproducible between runs.
                    variant.INFO["inheritance"] = "%s:%s" % (
                        gene, ",".join(sorted(set(family_ids))))
                    out.write_record(variant)
    finally:
        # Close explicitly so the writer's output is flushed.
        out.close()
def get_region_vcf(
    self,
    case_obj,
    chrom=None,
    start=None,
    end=None,
    gene_obj=None,
    variant_type="clinical",
    category="snv",
    rank_threshold=None,
):
    """Produce a reduced vcf with variants from the specified coordinates

    This is used for the alignment viewer.

    Args:
        case_obj(dict): A case from the scout database
        variant_type(str): 'clinical' or 'research'. Default: 'clinical'
        category(str): 'snv', 'sv', 'str' or 'cancer' for clinical;
            'snv' or 'sv' for research. Default: 'snv'
        rank_threshold(float): Accepted for interface compatibility; it is
            not used to filter the variants written by this method.
        chrom(str): Load variants from a certain chromosome
        start(int): Specify the start position
        end(int): Specify the end position
        gene_obj(dict): A gene object from the database; when given, its
            chromosome/start/end override chrom/start/end

    Returns:
        file_name(str): Path to the temporary file

    Raises:
        FileNotFoundError: if the case has no VCF for the requested
            type/category, the VCF cannot be opened, or the region cannot be
            fetched (typically a missing index).
    """
    # (variant_type, category) -> key in case_obj["vcf_files"]
    vcf_keys = {
        ("clinical", "snv"): "vcf_snv",
        ("clinical", "sv"): "vcf_sv",
        ("clinical", "str"): "vcf_str",
        ("clinical", "cancer"): "vcf_cancer",
        ("research", "snv"): "vcf_snv_research",
        ("research", "sv"): "vcf_sv_research",
    }
    vcf_key = vcf_keys.get((variant_type, category))
    variant_file = case_obj["vcf_files"].get(vcf_key) if vcf_key else None

    if not variant_file:
        raise FileNotFoundError("VCF file does not seem to exist")

    try:
        vcf_obj = VCF(variant_file)
    except Exception as err:
        raise FileNotFoundError(
            "Could not access {}. The file is missing or malformed".format(
                variant_file)) from err

    if gene_obj:
        chrom = gene_obj["chromosome"]
        start = gene_obj["start"]
        end = gene_obj["end"]

    # An empty region string requests the whole file.
    region = ""
    if chrom:
        if start and end:
            region = "{0}:{1}-{2}".format(chrom, start, end)
        else:
            region = "{0}".format(chrom)

    # delete=False: the caller receives the path and owns the file.
    temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
    file_name = str(pathlib.Path(temp.name))
    try:
        with temp:
            for header_line in vcf_obj.raw_header.split("\n"):
                if len(header_line) > 3:
                    temp.write(header_line + "\n")
            try:
                for variant in vcf_obj(region):
                    temp.write(str(variant))
            except Exception as err:
                raise FileNotFoundError(
                    "Could not find index for {}".format(variant_file)) from err
    except Exception:
        # Do not leave a partially written temporary file behind when no
        # path is returned to the caller; cleanup is best effort.
        try:
            pathlib.Path(file_name).unlink()
        except OSError:
            pass
        raise

    return file_name