def _af_filter(data, in_file, out_file):
    """Soft-filter variants with AF below min_allele_fraction.

    Records whose tumor-sample FORMAT/AF values are all below the threshold
    get "MinAF" appended to FILTER; every record is still written.

    Args:
        data: sample dictionary; reads ``data["config"]`` and
            ``data["description"]`` (used as the tumor sample name).
        in_file: path of the VCF to read.
        out_file: requested output path; its extension is replaced by ``.vcf``
            for the uncompressed intermediate.

    Returns:
        Whatever ``vcfutils.bgzip_and_index`` returns for the intermediate.
    """
    min_freq = float(
        utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.info(
        "Filtering MuTect2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(
            ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description':
                'Allele frequency is lower than %s%% ' % (min_freq * 100) +
                ('(configured in bcbio as min_allele_fraction)'
                 if utils.get_in(data["config"],
                                 ("algorithm", "min_allele_fraction")) else
                 '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)'
                 )
            })
            w = Writer(tx_out_file, vcf)
            # A VCF without the expected sample column (e.g. an empty call
            # set written without sample names) must not abort the run:
            # pass records through unfiltered instead of raising ValueError.
            try:
                tumor_index = vcf.samples.index(data['description'])
            except ValueError:
                tumor_index = None
            for rec in vcf:
                if tumor_index is not None and np.all(
                        rec.format('AF')[tumor_index] < min_freq):
                    vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def mark_missing_sites(vcffile, region, missing_threshold, soft_filter):
    """Annotate sites in ``region`` with missingness and stream them to stdout.

    Each site is scored with ``compute_missingness``, judged by
    ``variant_missing_criteria`` and rewritten by ``update_variant``. Sites
    with a "pass" verdict are always written; "fail" sites are written only
    when ``soft_filter`` is truthy. A summary line goes to stderr.
    """
    reader = VCF(vcffile)
    reader.add_filter_to_header({
        'ID': 'MISSING',
        'Description':
        'failed variant site missingness threshold ({} %)'.format(
            missing_threshold),
    })
    reader.add_info_to_header({
        'ID': 'MISSINGPCT',
        'Description': 'site missingness percentage',
        'Type': 'Float',
        'Number': '1',
    })
    writer = Writer('-', reader)

    n_total = 0
    n_passed = 0
    for site in reader(region):
        n_total += 1
        # Only the percentage is used; the two counts are unpacked and ignored.
        missing_pct, _n_missing, _n_samples = compute_missingness(site)
        verdict = variant_missing_criteria(missing_threshold, missing_pct)
        site = update_variant(site, verdict, missing_pct)
        passed = verdict == "pass"
        if passed:
            n_passed += 1
        if passed or (verdict == "fail" and soft_filter):
            writer.write_record(site)
    writer.close()

    summary = "After filtering, passed {} out of a possible {} Sites ({})"
    print(summary.format(n_passed, n_total, 'pass'), file=sys.stderr)
def augment_vcf(vcf_in_file, vcf_out_file, bed_files, decimals):
    """Parse a VCF, augment each variant from BED files, and write it out.

    Args:
        vcf_in_file: path of the input VCF.
        vcf_out_file: path of the output VCF.
        bed_files: BED inputs, passed to ``modify_header`` and
            ``parse_bed_files``.
        decimals: forwarded unchanged to ``add_fields_to_variant``.
    """
    vcf_in = VCF(vcf_in_file)
    # New header rows must be added to the reader before the writer copies it.
    vcf_in = modify_header(vcf_in, bed_files)
    vcf_out = Writer(vcf_out_file, vcf_in)
    try:
        bed = parse_bed_files(bed_files)
        # The sample list does not change during iteration, so test it once
        # instead of once per record.
        has_samples = len(vcf_in.samples) > 0
        for variant in vcf_in:
            if has_samples:
                variant = add_fields_to_variant(variant, bed, decimals)
            vcf_out.write_record(variant)
    finally:
        # Release both handles even if augmentation fails part-way.
        vcf_in.close()
        vcf_out.close()
def writeVCF(vcf, inIDs, out):
    """Write the records whose "<ID>:<REF>" or "<ID>:<ALT>" key is requested.

    Only records whose CHROM parses as an integer are considered. A matching
    record has its ID replaced by the matched key before it is written, and
    that key is then retired. Iteration stops once every key is consumed.

    Args:
        vcf: path of the input VCF.
        inIDs: iterable of keys to extract; the caller's object is not mutated.
        out: path of the output VCF.
    """
    vcf_in = VCF(vcf)
    vcf_out = Writer(out, vcf_in)
    # A set gives O(1) membership and exact-match removal. The previous list
    # rebuild used a substring test (``inID not in id1``), which could also
    # drop unrelated keys that happened to be substrings of the match.
    remaining = set(inIDs)
    try:
        for rec in vcf_in:
            try:
                int(rec.CHROM)
            except ValueError:
                # Non-numeric contigs (X, Y, MT, scaffolds, ...) are skipped.
                continue
            id1 = str(rec.ID) + ":" + str(rec.REF)
            id2 = str(rec.ID) + ":" + ''.join(rec.ALT)
            # The REF-based key takes precedence over the ALT-based key.
            for key in (id1, id2):
                if key in remaining:
                    rec.ID = key
                    vcf_out.write_record(rec)
                    remaining.discard(key)
                    break
            if not remaining:
                break
    finally:
        # Close explicitly so the output is flushed to disk.
        vcf_out.close()
        vcf_in.close()
def merge(in_vcf, cadd_tsv):
    """Stream ``in_vcf`` to stdout with CADD annotations merged in.

    The INFO headers from ``annotation_info_headers`` are registered on the
    reader, each record is passed through ``update_variant``, and the set of
    keys seen is finally checked against the CADD table by
    ``ensure_cadd_completed_successfully``.
    """
    info_headers = annotation_info_headers()

    log("Collecting the CADD annotation information")
    cadd_annotations = create_CADD_annotation_dictionary(cadd_tsv)

    log("Processing the build37 vcf")
    reader = VCF(in_vcf)
    for header in info_headers:
        reader.add_info_to_header(header)
    writer = Writer('-', reader)

    seen_keys = set()
    for record in reader:
        record, record_key = update_variant(record, cadd_annotations)
        seen_keys.add(record_key)
        writer.write_record(record)
    writer.close()

    log("Checking whether CADD completed correctly")
    cadd_keys = frozenset(cadd_annotations.keys())
    ensure_cadd_completed_successfully(in_vcf, cadd_tsv, seen_keys, cadd_keys)
    log("All Done!")
def main(fname_in, fname_out, ambiguous_base_coverage_threshold):
    """Rewrite each record's genotype from per-allele coverage in INFO/AD.

    Every allele (REF first, then each ALT) whose share of the summed AD
    coverage reaches ``ambiguous_base_coverage_threshold`` contributes its
    index to a single unphased genotype written to ``fname_out``.
    """
    reader = VCF(fname_in)
    writer = Writer(fname_out, reader)

    for record in reader:
        alleles = [record.REF] + record.ALT
        depths = record.INFO.get("AD")
        depth_sum = sum(depths)

        assert len(alleles) == len(depths)

        # Index 0 is the reference allele; the base itself is not needed.
        kept = []
        for allele_index, (_base, depth) in enumerate(zip(alleles, depths)):
            if depth / depth_sum >= ambiguous_base_coverage_threshold:
                kept.append(allele_index)

        record.genotypes = [kept + [False]]
        writer.write_record(record)

    writer.close()
    reader.close()
def filter_vcf(vcf, output, minlength=0, truncate_svlen=float("inf"), suffix=""):
    """Drop short SVs and truncate very long ones.

    Records with ``get_svlen(v) < minlength`` are dropped. Records longer
    than ``truncate_svlen`` get SVLEN=1, END=start+1 and the TRUNCATED flag.
    Counts of truncated and filtered records are reported on stderr.

    Args:
        vcf: path of the input VCF.
        output: output path; when falsy it is derived from ``vcf`` by
            replacing ".vcf" with "_<suffix>.vcf".
        minlength: minimum SV length to keep.
        truncate_svlen: SV length above which a record is truncated.
        suffix: used only to derive ``output``.
    """
    vcf_in = VCF(vcf)
    if not output:
        output = vcf.replace(".vcf", "_{}.vcf".format(suffix))
    vcf_in.add_info_to_header({
        'ID': 'TRUNCATED',
        'Description': "SVLEN truncated",
        'Type': 'Flag',
        'Number': '0'
    })
    vcf_out = Writer(output, vcf_in)
    records_truncated = 0
    records_filtered = 0
    try:
        for v in vcf_in:
            svlen = get_svlen(v)
            if svlen < minlength:
                records_filtered += 1
                continue
            if svlen > truncate_svlen:
                v.INFO['SVLEN'] = 1
                v.INFO['END'] = v.start + 1
                v.INFO['TRUNCATED'] = True
                records_truncated += 1
            vcf_out.write_record(v)
    finally:
        # The writer was never closed before, so buffered output could be
        # lost; close both handles explicitly.
        vcf_out.close()
        vcf_in.close()
    if records_truncated != 0:
        sys.stderr.write("Truncated {} records where SVLEN > {}\n".format(
            records_truncated, int(truncate_svlen)))
    if records_filtered != 0:
        sys.stderr.write("Filtered {} records where SVLEN < {}\n".format(
            records_filtered, int(minlength)))
def _af_filter(data, in_file, out_file):
    """Flag low allele-fraction records with a "MinAF" soft filter.

    The threshold is ``algorithm.min_allele_fraction`` from the config
    (percent, default 10). A record is flagged when all FORMAT/AF values of
    the tumor sample are below it; every record is written either way. The
    uncompressed result is handed to ``vcfutils.bgzip_and_index``.
    """
    af_key = ("algorithm", "min_allele_fraction")
    min_freq = float(utils.get_in(data["config"], af_key, 10)) / 100.0
    logger.debug("Filtering MuTect2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]

    already_done = (utils.file_exists(ungz_out_file)
                    or utils.file_exists(ungz_out_file + ".gz"))
    if not already_done:
        with file_transaction(data, ungz_out_file) as tx_out_file:
            reader = VCF(in_file)
            if utils.get_in(data["config"], af_key):
                origin = '(configured in bcbio as min_allele_fraction)'
            else:
                origin = '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)'
            reader.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + origin})
            writer = Writer(tx_out_file, reader)
            # GATK 3.x can produce VCFs without sample names for empty VCFs
            try:
                tumor_index = reader.samples.index(dd.get_sample_name(data))
            except ValueError:
                tumor_index = None
            for rec in reader:
                below_threshold = (tumor_index is not None
                                   and np.all(rec.format('AF')[tumor_index] < min_freq))
                if below_threshold:
                    vcfutils.cyvcf_add_filter(rec, 'MinAF')
                writer.write_record(rec)
            writer.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def test_add_info_to_header():
    """An INFO field added to the reader's header round-trips through Writer."""
    v = VCF(VCF_PATH)
    v.add_info_to_header({
        'ID': 'abcdefg',
        'Description': 'abcdefg',
        'Type': 'Character',
        'Number': '1'
    })
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    # mkstemp creates the file atomically; mktemp is deprecated and racy.
    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)
    w = Writer(f, v)
    rec = next(v)

    rec.INFO["abcdefg"] = "XXX"
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    ret = v.INFO["abcdefg"]
    # The value may come back as bytes; normalise before comparing.
    if isinstance(ret, bytes):
        ret = ret.decode()
    assert ret == "XXX", (dict(v.INFO), v.INFO["abcdefg"])
def test_add_flag():
    """A Flag INFO field reads back as True when set and is absent when cleared."""
    vcf = VCF(VCF_PATH)
    vcf.add_info_to_header({
        'ID': 'myflag',
        'Description': 'myflag',
        'Type': 'Flag',
        'Number': '0'
    })
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    # mkstemp creates the file atomically; mktemp is deprecated and racy.
    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec = next(vcf)

    rec.INFO["myflag"] = True
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.INFO["myflag"] is True, dict(v.INFO)

    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec.INFO["myflag"] = False
    w.write_record(rec)
    # Close before reading back so the record is flushed to disk.
    w.close()
    v = next(VCF(f))
    assert_raises(KeyError, v.INFO.__getitem__, "myflag")
def annotate_allelic_balance(vcffile, region):
    """Stream ``region`` to stdout, adding allele-balance INFO to biallelic sites.

    Biallelic sites (per ``is_biallelic``) get HetAB and HetHomAltAB from
    ``compute_allelic_balances`` via ``update_variant``; all other sites are
    written unchanged. A summary is sent to ``log``.
    """
    reader = VCF(vcffile)
    reader.add_info_to_header({
        'ID': 'HetAB',
        'Description': 'heterozygous genotype allele balance',
        'Type': 'Float',
        'Number': '1',
    })
    reader.add_info_to_header({
        'ID': 'HetHomAltAB',
        'Description':
        'heterozygous + homozygous ALT genotype allele balance',
        'Type': 'Float',
        'Number': '1',
    })
    writer = Writer('-', reader)

    n_total = 0
    n_annotated = 0
    for site in reader(region):
        n_total += 1
        if is_biallelic(site):
            n_annotated += 1
            hetab, het_hom_alt_ab = compute_allelic_balances(site)
            site = update_variant(site, hetab, het_hom_alt_ab)
        writer.write_record(site)
    writer.close()

    log("Annotated {} out of a possible {} sites".format(n_annotated, n_total))
def write_pass_vcf(annotated_vcf):
    """Write the PASS-only subset of ``annotated_vcf``, bgzipped and indexed.

    The output name is the input name with ``.annotated.vcf.gz`` replaced by
    ``.annotated.pass.vcf`` (then compressed to ``.gz`` by bgzip). Records
    whose FILTER is ``None`` or the string 'None' count as PASS. When nothing
    passes, a header-only file is written.

    Args:
        annotated_vcf: path of the annotated input VCF.
    """
    import subprocess

    out_vcf = re.sub(r'\.annotated\.vcf\.gz$', '.annotated.pass.vcf',
                     annotated_vcf)
    vcf = VCF(annotated_vcf)
    # Capture the header now; it is needed if no record passes.
    header_text = vcf.raw_header
    w = Writer(out_vcf, vcf)

    num_rejected = 0
    num_pass = 0
    for rec in vcf:
        if rec.FILTER is None or rec.FILTER == 'None':
            w.write_record(rec)
            num_pass += 1
        else:
            num_rejected += 1

    vcf.close()
    w.close()

    logger.info('Number of non-PASS/REJECTED variant calls: ' +
                str(num_rejected))
    logger.info('Number of PASSed variant calls: ' + str(num_pass))
    if num_pass == 0:
        logger.warning(
            'There are zero variants with a \'PASS\' filter in the VCF file')
        # The previous shell pipeline lacked the '|' before egrep, so it
        # never produced the header. Write it directly instead.
        with open(out_vcf, 'w') as handle:
            handle.write(header_text)

    # Argument lists avoid shell interpretation of the file names.
    commands = (
        ['bgzip', '-f', str(out_vcf)],
        ['tabix', '-f', '-p', 'vcf', str(out_vcf) + '.gz'],
    )
    for command in commands:
        # Stay best-effort like os.system: report failures, do not raise.
        try:
            status = subprocess.call(command)
        except OSError as err:
            logger.warning('Could not run ' + command[0] + ': ' + str(err))
            continue
        if status != 0:
            logger.warning(command[0] + ' exited with status ' + str(status))

    return
def main():
    """Annotate indels longer than 49 bp with INFO/SVLEN and INFO/SVTYPE.

    SVLEN is the longest ALT length minus the REF length; SVTYPE is "INS"
    when SVLEN is positive and "DEL" otherwise. All records are written to
    ``args.output``; records without any ALT allele pass through unchanged.
    """
    args = get_args()
    vcf_in = VCF(args.vcf)
    vcf_in.add_info_to_header({
        'ID': 'SVLEN',
        'Description': 'length of sv',
        'Type': 'Integer',
        'Number': '1'
    })
    vcf_in.add_info_to_header({
        'ID': 'SVTYPE',
        'Description': 'type of sv - just DEL or INS based on SVLEN',
        'Type': 'String',
        'Number': '1'
    })
    vcf_out = Writer(args.output, vcf_in)
    for v in vcf_in:
        # max() of an empty ALT list would raise ValueError.
        if v.ALT:
            svlen = max(len(alt) for alt in v.ALT) - len(v.REF)
            if abs(svlen) > 49:
                v.INFO["SVLEN"] = svlen
                v.INFO["SVTYPE"] = "INS" if svlen > 0 else "DEL"
        vcf_out.write_record(v)
    vcf_in.close()
    vcf_out.close()
def test_write_missing_contig():
    """Writing records from seg.vcf.gz after resetting genotypes must not fail."""
    source_path = '{}/seg.vcf.gz'.format(HERE)
    reader = VCF(source_path)
    sink = Writer('/dev/null', reader)
    for record in reader:
        # Overwrite with a single unphased 1/1 call before writing.
        record.genotypes = [[1, 1, False]]
        sink.write_record(record)
    sink.close()
def use_cyvcf2(vcf_file, vcf_out=None):
    """Add an MSI_FAIL soft filter to records flagged by ``proc_fields``.

    Output goes to ``vcf_out`` when given, otherwise to stdout (raw header
    followed by each record's string form).

    Timings noted by the original author: file out 2:17.51,
    stdout + bgzip 2:50.35.
    """
    from cyvcf2 import VCF, Writer

    reader = VCF(vcf_file)
    reader.add_filter_to_header({
        'ID': 'MSI_FAIL',
        'Description': 'Possible homopolymer artefact',
    })

    writer = Writer(vcf_out, reader) if vcf_out else None
    if writer is None:
        sys.stdout.write(reader.raw_header)

    for rec in reader:
        first_sample_af = rec.format('AF')[0][0]
        if proc_fields(rec.REF, rec.ALT[0], first_sample_af, rec.INFO['MSI']):
            current = rec.FILTER
            labels = current.split(';') if current else []
            labels.append('MSI_FAIL')
            rec.FILTER = ';'.join(labels)
        if writer is None:
            sys.stdout.write(str(rec))
        else:
            writer.write_record(rec)

    if writer is not None:
        writer.close()
def test_add_flag():
    """A Flag INFO field is present after being set and absent after clearing."""
    vcf = VCF(VCF_PATH)
    vcf.add_info_to_header({'ID': 'myflag', 'Description': 'myflag',
                            'Type': 'Flag', 'Number': '0'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    # mkstemp creates the file atomically; mktemp is deprecated and racy.
    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    # The next() builtin works on both Python 2 and 3; the .next() method
    # exists only on Python 2 iterators.
    rec = next(vcf)

    rec.INFO["myflag"] = True
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    # NOTE(review): the value returned for a set flag looks library-version
    # dependent (another copy of this test expects True) - confirm.
    assert v.INFO["myflag"] is None, dict(v.INFO)

    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec.INFO["myflag"] = False
    w.write_record(rec)
    # Close before reading back so the record is flushed to disk.
    w.close()
    v = next(VCF(f))
    assert_raises(KeyError, v.INFO.__getitem__, "myflag")
def test_write_missing_contig():
    """Records from seg.vcf.gz can be rewritten with new genotypes."""
    reader = VCF('{}/seg.vcf.gz'.format(HERE))
    null_writer = Writer('/dev/null', reader)
    replacement = [1, 1, False]  # one unphased 1/1 call
    for rec in reader:
        rec.genotypes = [list(replacement)]
        null_writer.write_record(rec)
    null_writer.close()
def prepare_benign_training_sets(vcf, output, intersection_clinvar_hgmd,
                                 intersection_circularity, db,
                                 max_variants=3000):
    """Write SNVs absent from both exclusion sets and count what was kept.

    Only single-base REF/ALT records are considered, keyed as
    "CHROM_POS_REF_ALT". A record absent from ``intersection_clinvar_hgmd``
    counts as 'High_confidence'; if it is also absent from
    ``intersection_circularity`` it counts as 'Circularity_filtering', gets
    INFO True_Label=-1 and Source=db, and is written.

    Args:
        vcf: an open, iterable VCF reader used as the Writer template.
        output: path of the output VCF.
        intersection_clinvar_hgmd: container of keys to exclude.
        intersection_circularity: container of keys to exclude.
        db: database label stored in the stats and in INFO/Source.
        max_variants: stop after this many input records (default 3000, the
            previously hard-coded cap); ``None`` removes the limit.

    Returns:
        A mapping with 'Database', 'Circularity_filtering' and
        'High_confidence' entries.
    """
    o = Writer(output, vcf)
    # defaultdict() without a factory behaves like a plain dict; the type is
    # kept so callers see the same return type as before.
    stats_dict = collections.defaultdict()
    stats_dict['Database'] = db
    stats_dict['Circularity_filtering'] = 0
    stats_dict['High_confidence'] = 0
    progress = tqdm(
        vcf,
        desc='Removing overlapping variants : [gnomAD] ∩ [ClinVar, HGMD, Training_sets]')
    try:
        for counter, variant in enumerate(progress):
            if max_variants is not None and counter == max_variants:
                break
            if len(variant.REF) == 1 and len(variant.ALT[0]) == 1:
                id_var = '_'.join((str(variant.CHROM), str(variant.POS),
                                   str(variant.REF), str(variant.ALT[0])))
                if id_var not in intersection_clinvar_hgmd:
                    stats_dict['High_confidence'] += 1
                    if id_var not in intersection_circularity:
                        stats_dict['Circularity_filtering'] += 1
                        variant.INFO['True_Label'] = -1
                        variant.INFO['Source'] = db
                        o.write_record(variant)
    finally:
        # The writer was never closed before; close it so output is flushed.
        o.close()
    return stats_dict
def output_pharmcat_ready_vcf(input_vcf, output_dir, output_prefix):
    '''
    Write one single-sample VCF per sample found in input_vcf.

    The sample named 'PharmCAT' is removed from the list first. Each other
    sample is written to "<output_dir>/<output_prefix>.<sample>.vcf",
    containing every record of input_vcf restricted to that sample.
    '''
    probe = VCF(input_vcf)
    sample_names = probe.samples
    sample_names.remove('PharmCAT')
    probe.close()

    # output each single sample to a separate VCF
    for sample_name in sample_names:
        print('Generating a PharmCAT-ready VCF for ' + sample_name)
        reader = VCF(input_vcf, samples=sample_name)

        target = os.path.join(
            output_dir, output_prefix + '.' + sample_name + '.vcf')
        # the header is taken from the sample-restricted reader
        writer = Writer(target, reader, mode='w')
        for record in reader:
            writer.write_record(record)

        writer.close()
        reader.close()
def processVCF(invcf, remm, dann, out):
    """Annotate each record with ReMM and DANN values from tabix files.

    For ReMM, the third column of the last fetched row whose second column
    equals the record's POS is stored. For DANN, the first row matching POS,
    REF and the first ALT supplies the fifth column, rounded to 3 decimals.
    When nothing matches, or the lookup raises ValueError, "." is stored.

    Args:
        invcf: path of the input VCF.
        remm: path of the tabix-indexed ReMM table.
        dann: path of the tabix-indexed DANN table.
        out: path of the output VCF.
    """
    vcf_data = VCF(invcf, gts012=True)
    tbx_remm = pysam.TabixFile(remm)
    tbx_dann = pysam.TabixFile(dann)
    # ReMM is written below, so it must be declared unless the input header
    # already declares it.
    if "ReMM" not in vcf_data:
        vcf_data.add_info_to_header({
            'ID': 'ReMM',
            'Description': 'Regulatory Mendelian Mutation (ReMM) score.',
            'Type': 'String',
            'Number': '.'
        })
    vcf_data.add_info_to_header({
        'ID': 'DANN',
        'Description':
        'A deep neural network aimed to recognize pathogenic variants by annotating genetic variants, especially in noncoding regions.',
        'Type': 'String',
        'Number': '.'
    })
    w = Writer(out, vcf_data)
    try:
        for record in vcf_data:
            try:
                for row in tbx_remm.fetch(record.CHROM, record.start,
                                          record.end):
                    fields = str(row).split()
                    if int(fields[1]) == record.POS:
                        record.INFO["ReMM"] = fields[2]
                # .get() avoids the KeyError that indexing raised whenever
                # no row matched, which the ValueError handler did not catch.
                if not record.INFO.get("ReMM"):
                    record.INFO["ReMM"] = "."
            except ValueError:
                record.INFO["ReMM"] = "."
            try:
                for row in tbx_dann.fetch(record.CHROM, record.start,
                                          record.end):
                    fields = row.split()
                    if (int(fields[1]) == record.POS
                            and fields[2] == record.REF
                            and fields[3] == record.ALT[0]):
                        record.INFO["DANN"] = round(float(fields[4]), 3)
                        break
                else:
                    # Loop finished without a match.
                    record.INFO["DANN"] = "."
            except ValueError:
                record.INFO["DANN"] = "."
            w.write_record(record)
    finally:
        # None of these handles was closed before; flush and release them.
        w.close()
        tbx_remm.close()
        tbx_dann.close()
        vcf_data.close()
def extract_pharmcat_pgx_regions(tabix_executable_path, input_vcf, output_dir, input_ref_pgx_vcf):
    '''
    Extract the regions spanned by input_ref_pgx_vcf from input_vcf.

    Per chromosome, the region is whatever get_vcf_pos_min_max returns for
    the reference positions. Matching records are written, with numeric
    chromosome names prefixed by 'chr', to
    "<output_dir>/<prefix>.pgx_regions.vcf.gz", which is then indexed with
    tabix_index_vcf. Returns the output path.
    '''
    print(
        'Modify chromosome names.\nExtract PGx regions based on the input reference PGx position file.'
    )
    path_output = os.path.join(
        output_dir, obtain_vcf_file_prefix(input_vcf) + '.pgx_regions.vcf.gz')

    reader = VCF(input_vcf)
    ref_reader = VCF(input_ref_pgx_vcf)

    # one region per chromosome of the reference position file
    ref_table = allel.vcf_to_dataframe(input_ref_pgx_vcf)
    stripped = ref_table['CHROM'].replace({'chr': ''}, regex=True)
    ref_table['CHROM'] = stripped.astype(str).astype(int)
    grouped = ref_table.groupby(['CHROM'])['POS']
    region_table = grouped.agg(get_vcf_pos_min_max).reset_index()
    region_strings = region_table.apply(
        lambda row: ':'.join(row.values.astype(str)), axis=1)

    chr_name_match = re.compile("^chr")
    input_has_chr_prefix = any(
        chr_name_match.match(name) for name in reader.seqnames)
    if input_has_chr_prefix:
        # regions must carry the same 'chr' prefix as the input contigs
        region_strings = region_strings.replace({'^': 'chr'}, regex=True)
    else:
        # declare 'chr'-prefixed contigs, because records are renamed below
        for contig in reader.seqnames:
            reader.add_to_header('##contig=<ID=chr' + contig + '>')

    # the writer copies the (possibly extended) reader header
    writer = Writer(path_output, reader, mode="wz")
    for region in region_strings:
        for record in reader(region):
            record.CHROM = re.sub(r'^([0-9]+)', r'chr\1', record.CHROM)
            writer.write_record(record)

    reader.close()
    ref_reader.close()
    writer.close()

    tabix_index_vcf(tabix_executable_path, path_output)

    return path_output
def main(invcf: str = typer.Argument(..., help="输入的vcf文件"),
         outvcf: str = typer.Argument(..., help="输出的vcf文件"),
         mindepth: int = typer.Option(10, help="最低reads覆盖率"),
         het_altrange: Tuple[float, float] = typer.Option((0.2, 0.8), help="杂合位点的alt频率范围"),
         homref_maxaltrate: float = typer.Option(
             0, help="纯合ref型GT,最大alt reads比例不超过这个"),
         homalt_minaltrate: float = typer.Option(
             1, help="纯合alt型GT,最小alt reads比例不低于这个")):
    """
    Mask genotypes that meet any of the following:
    heterozygous sites whose alt-read frequency is outside the 20%-80% range;
    homozygous sites whose supporting-read fraction is not 100%;
    sites covered by fewer than 10 reads.
    """
    vcf = VCF(invcf)
    w = Writer(outvcf, vcf)
    for v in vcf:
        indicies_mask = filter_samples(v, mindepth, het_altrange,
                                       homref_maxaltrate, homalt_minaltrate)
        if indicies_mask:
            # Edit one local copy and assign it back once. Indexing
            # v.genotypes directly edited a temporary list, so the mask
            # was lost when v.genotypes was read again for reassignment.
            genotypes = v.genotypes
            for index in indicies_mask:
                # Missing alleles (-1) for every ploidy slot, unphased.
                genotypes[index] = [-1] * v.ploidy + [False]
            v.genotypes = genotypes
        w.write_record(v)
    w.close()
    vcf.close()
def seperate_vcffile(self):
    """Split VCFs found under ``self.from_directory`` into per-sample files.

    Each output is named after the CHROM of the file's first record and the
    sample, written under "<target_directory>/<sample>/" and gzipped. On
    chromosome "Y" a record is kept when the first allele is not 0;
    otherwise it is kept unless the first two genotype entries both equal 0.

    NOTE(review): the source layout was ambiguous; this keeps the reading in
    which the sample loop stops after the first sample - confirm.
    """
    for vcf_path in self.search_vcf_file(self.from_directory):
        vcf_read = VCF(vcf_path)
        samples = vcf_read.samples

        # Peek at the first record to learn the chromosome name.
        chromosome_num = ""
        for first_record in vcf_read:
            chromosome_num = first_record.CHROM
            break

        for sample in samples:
            start = time.time()
            sample_dir = self.target_directory + "/" + sample
            try:
                if not os.path.isdir(self.target_directory):
                    os.makedirs(os.path.join(self.target_directory))
                if not os.path.isdir(sample_dir):
                    os.makedirs(os.path.join(sample_dir))
            except OSError as e:
                print("Failed to create directory!!!!!", e)
                raise

            # Pick a name that does not clobber an existing file.
            stem = chromosome_num + "-" + sample
            filepath = os.path.join(sample_dir, stem + ".vcf")
            index = 0
            while os.path.exists(filepath):
                index += 1
                filepath = os.path.join(sample_dir,
                                        stem + str(index) + ".vcf")

            out_read_vcf = VCF(vcf_path, samples=[sample])
            write_file = Writer(filepath, out_read_vcf)
            for variant in out_read_vcf:
                call = variant.genotypes[0]
                if chromosome_num == "Y":
                    keep = call[0] != 0
                else:
                    keep = call[0] != 0 or call[1] != 0
                if keep:
                    write_file.write_record(variant)
            write_file.close()
            out_read_vcf.close()

            # Compress the finished file, then drop the plain copy.
            with open(filepath, "rb") as f_in, \
                    gzip.open(filepath + ".gz", "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.remove(filepath)

            sec = time.time() - start
            print(sample + " write end...",
                  time.strftime("%H:%M:%S", time.gmtime(sec)))
            break
        vcf_read.close()
def write_truncate_vcf(path_in: str, path_out: str, trunc: int) -> int:
    """Copy at most ``trunc`` records from ``path_in`` to ``path_out``.

    Args:
        path_in: path of the input VCF.
        path_out: path of the output VCF.
        trunc: maximum number of records to write.

    Returns:
        The number of records written. Previously this was the last loop
        index: ``trunc`` when truncated, but one less than the record count
        otherwise, and unbound (UnboundLocalError) for an empty input.
    """
    # One reader serves as both header template and record source; the input
    # used to be opened twice.
    reader = VCF(path_in, threads=nb_cores)
    w = Writer(path_out, reader)
    written = 0
    try:
        for v in reader:
            if written == trunc:
                break
            w.write_record(v)
            written += 1
    finally:
        # Close explicitly so the output is flushed.
        w.close()
        reader.close()
    return written
def main():
    """Copy records whose INFO/SVLEN exceeds 49 from args.vcf to args.output."""
    args = get_args()
    reader = VCF(args.vcf)
    writer = Writer(args.output, reader)
    long_records = (rec for rec in reader if rec.INFO["SVLEN"] > 49)
    for rec in long_records:
        writer.write_record(rec)
    reader.close()
    writer.close()
def main():
    """Make INFO/SVLEN negative for every DEL record, then write all records.

    Reads ``args.vcf`` and writes ``args.output``; records whose SVTYPE is
    not "DEL" are written unchanged.
    """
    args = get_args()
    vcf = VCF(args.vcf)
    w = Writer(args.output, vcf)
    try:
        for v in vcf:
            if v.INFO["SVTYPE"] == "DEL":
                svlen = v.INFO["SVLEN"]
                if not svlen < 0:
                    v.INFO["SVLEN"] = -svlen
            w.write_record(v)
    finally:
        # The writer was never closed before, so output could be left
        # unflushed.
        w.close()
        vcf.close()
def main():
    """Split ``args.vcf`` into one VCF per INFO/SVTYPE value.

    Output files are named "<input without '.vcf'>_<SVTYPE>.vcf", with '/'
    removed from the type. Records without SVTYPE go to "..._unknown.vcf".
    """
    args = get_args()
    vcf = VCF(args.vcf)
    output = args.vcf.replace('.vcf', '')
    # One writer per output label, created lazily. Streaming avoids holding
    # every record in memory, and keying by label keeps two types that map
    # to the same file name from opening it twice.
    writers = {}
    try:
        for v in vcf:
            svtype = v.INFO.get('SVTYPE')
            # A missing SVTYPE used to crash on None.replace().
            label = 'unknown' if svtype is None else svtype.replace('/', '')
            w = writers.get(label)
            if w is None:
                w = Writer(output + '_' + label + '.vcf', vcf)
                writers[label] = w
            w.write_record(v)
    finally:
        # Writers were never closed before; flush them all.
        for w in writers.values():
            w.close()
        vcf.close()
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populate FORMAT/AF and soft-filter records below min_allele_fraction.

    Strelka2 does not report AF, so it is computed as alt_counts / dp:

      somatic (``paired`` is truthy)
        snps:   alt_counts = first value of FORMAT/<ALT>U, dp = FORMAT/DP
        indels: alt_counts = first value of FORMAT/TIR,    dp = FORMAT/DP
      germline
        alt_counts = AD[1:], dp = sum(AD); records without AD (gVCF blocks)
        are written without an AF.

    Non-finite ratios are replaced by 0. In paired mode a record whose tumor
    AF values are all below the threshold gets the "MinAF" filter; every
    record is written either way.
    """
    data = paired.tumor_data if paired else items[0]
    af_key = ("algorithm", "min_allele_fraction")
    min_freq = float(utils.get_in(data["config"], af_key, 10)) / 100.0
    logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]

    outputs_exist = (utils.file_exists(ungz_out_file)
                     or utils.file_exists(ungz_out_file + ".gz"))
    if not outputs_exist:
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            if utils.get_in(data["config"], af_key):
                origin = '(configured in bcbio as min_allele_fraction)'
            else:
                origin = '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)'
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + origin})
            writer = Writer(tx_out_file, vcf)
            tumor_index = vcf.samples.index(data['description'])
            for rec in vcf:
                alt_counts, dp = None, None
                if paired:
                    # somatic: <ALT>U for snps, TIR for indels; first value
                    # of each sample's entry is used
                    count_field = rec.ALT[0] + 'U' if rec.is_snp else 'TIR'
                    alt_counts = rec.format(count_field)[:,0]
                    dp = rec.format('DP')[:,0]
                else:
                    ad = rec.format("AD")
                    if ad is not None:
                        # germline: AD=REF,ALT1,ALT2,...
                        alt_counts = ad[:,1:]
                        dp = np.sum(ad[:,0:], axis=1)
                if dp is not None:
                    # ignore division by zero; such entries become AF=.0
                    with np.errstate(divide='ignore', invalid='ignore'):
                        af = np.true_divide(alt_counts, dp)
                        af[~np.isfinite(af)] = .0
                    rec.set_format('AF', af)
                    if paired and np.all(af[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                writer.write_record(rec)
            writer.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Add FORMAT/AF to Strelka2 calls and soft-filter low-AF records.

    AF is derived as alt_counts / dp from existing fields:

      somatic (``paired`` is truthy)
        snps:   alt_counts = first value of FORMAT/<ALT>U, dp = FORMAT/DP
        indels: alt_counts = first value of FORMAT/TIR,    dp = FORMAT/DP
      germline
        alt_counts = AD[1:], dp = sum(AD); records lacking AD (gVCF blocks)
        get no AF.

    Non-finite ratios become 0. In paired mode, records whose tumor AF
    values are all below min_allele_fraction get "MinAF" added to FILTER.
    All records are written.
    """
    data = paired.tumor_data if paired else items[0]
    threshold_path = ("algorithm", "min_allele_fraction")
    min_freq = float(utils.get_in(data["config"], threshold_path, 10)) / 100.0
    logger.info("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]

    have_output = (utils.file_exists(ungz_out_file)
                   or utils.file_exists(ungz_out_file + ".gz"))
    if not have_output:
        with file_transaction(data, ungz_out_file) as tx_out_file:
            reader = VCF(in_file)
            reader.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            if utils.get_in(data["config"], threshold_path):
                source_note = '(configured in bcbio as min_allele_fraction)'
            else:
                source_note = '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)'
            reader.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + source_note})
            out = Writer(tx_out_file, reader)
            tumor_index = reader.samples.index(data['description'])
            for rec in reader:
                alt_counts = None
                dp = None
                if paired:
                    if rec.is_snp:
                        # <ALT>U: first value per sample
                        alt_counts = rec.format(rec.ALT[0] + 'U')[:,0]
                    else:
                        # TIR: first value per sample
                        alt_counts = rec.format('TIR')[:,0]
                    dp = rec.format('DP')[:,0]
                else:
                    allele_depths = rec.format("AD")
                    if allele_depths is not None:
                        # AD=REF,ALT1,ALT2,...
                        alt_counts = allele_depths[:,1:]
                        dp = np.sum(allele_depths[:,0:], axis=1)
                if dp is not None:
                    # division by zero is tolerated and mapped to AF=.0
                    with np.errstate(divide='ignore', invalid='ignore'):
                        af = np.true_divide(alt_counts, dp)
                        af[~np.isfinite(af)] = .0
                    rec.set_format('AF', af)
                    if paired and np.all(af[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                out.write_record(rec)
            out.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def to_vcf(self, path):
    """
    Write the query result as a VCF file.

    The header comes from ``self.vcf``; for each item yielded by iterating
    ``self``, its ``source`` record is written.

    Args:
        path: path of the file.
    """
    from cyvcf2 import Writer

    writer = Writer(path, self.vcf)
    try:
        for v in self:
            writer.write_record(v.source)
    finally:
        # The writer was never closed before; close it so output is flushed.
        writer.close()
def canonicalize_vcf(input: PathType, output: PathType) -> None:
    """Rewrite a VCF so INFO fields of every record follow header order.

    The order comes from ``_info_fields`` applied to the raw header; each
    record is passed through ``_reorder_info_fields`` before being written
    to ``output``.
    """
    with open_vcf(input) as reader:
        ordered_names = _info_fields(reader.raw_header)

        writer = Writer(str(output), reader)
        for record in reader:
            reordered = _reorder_info_fields(writer, record, ordered_names)
            writer.write_record(reordered)
        writer.close()
def main():
    """Replace ALT with "<DEL>" on DEL records whose first ALT equals REF.

    Reads ``args.vcf``, writes every record to ``args.output`` and prints
    how many records were changed.
    """
    args = get_args()
    reader = VCF(args.vcf)
    writer = Writer(args.output, reader)
    fixed = 0
    for record in reader:
        alt_equals_ref = record.REF == record.ALT[0]
        if alt_equals_ref and record.INFO["SVTYPE"] == "DEL":
            record.ALT = "<DEL>"
            fixed += 1
        writer.write_record(record)
    print("Fixed {} positions".format(fixed))
    writer.close()
    reader.close()
def main(min_allele_balance, max_allele_balance, allele_balance_tag,
         variant_sample_depth_tag, min_depth, exclude_filters, exclude_fields,
         vcf):
    """Re-evaluate filter status of each record in ``vcf`` and stream to stdout.

    A ``Filter`` is built from the threshold arguments. Its FILTER and INFO
    header entries are registered on the reader, then it is applied to each
    record in place before the record is written.
    """
    reader = VCF(vcf)
    refilter = Filter(min_allele_balance, max_allele_balance,
                      allele_balance_tag, variant_sample_depth_tag, min_depth,
                      exclude_filters, exclude_fields)
    reader.add_filter_to_header(refilter.filtered_header())
    reader.add_info_to_header(refilter.rescued_header())
    writer = Writer('-', reader)
    try:
        for variant in reader:
            refilter(variant)  # Modifies variant filter status in place
            writer.write_record(variant)
    finally:
        # The writer was never closed before; close it so output is flushed.
        writer.close()
        reader.close()
def main():
    """Overwrite REF with the genome base wherever the two disagree.

    Uses ``get_reference_nucleotide(CHROM, start, genome)`` as the expected
    base, writes every record to ``args.output`` and prints the number of
    corrected records.
    """
    args = get_args()
    genome = Fasta(args.genome)
    reader = VCF(args.vcf)
    writer = Writer(args.output, reader)
    corrected = 0
    for record in reader:
        expected_base = get_reference_nucleotide(record.CHROM, record.start,
                                                 genome)
        if record.REF != expected_base:
            record.REF = expected_base
            corrected += 1
        writer.write_record(record)
    print("Fixed {} positions".format(corrected))
    writer.close()
    reader.close()
def test_issue44():
    """Genotypes survive a read / re-assign / write / read round trip."""
    out_path = '__o.vcf'
    reader = VCF('{}/issue_44.vcf'.format(HERE))
    writer = Writer(out_path, reader)
    for record in reader:
        # Re-assign the genotypes exactly as they were read.
        current = record.genotypes
        record.genotypes = current
        writer.write_record(record)
    writer.close()

    # "./."  "."  ".|."  "0|0"
    expected = [
        [-1, -1, False],
        [-1, False],
        [-1, -1, True],
        [0, 0, True],
    ]
    print("", file=sys.stderr)
    for idx, record in enumerate(VCF(out_path)):
        wanted = expected[idx]
        assert record.genotypes == [wanted], (record.genotypes, wanted)
    os.unlink(out_path)
def test_add_filter_to_header():
    """A FILTER added to the reader's header round-trips through Writer."""
    v = VCF(VCF_PATH)
    # NOTE that we have to add the filter to the header of the reader,
    # not the writer because the record will be associated with the reader
    v.add_filter_to_header({'ID': 'abcdefg', 'Description': 'abcdefg'})

    # mkstemp creates the file atomically; mktemp is deprecated and racy.
    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)
    w = Writer(f, v)
    # The next() builtin works on both Python 2 and 3; the .next() method
    # exists only on Python 2 iterators.
    rec = next(v)

    rec.FILTER = ["abcdefg"]
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.FILTER == "abcdefg", v.FILTER
def test_add_info_to_header():
    """An INFO field added to the reader's header round-trips through Writer."""
    v = VCF(VCF_PATH)
    v.add_info_to_header({'ID': 'abcdefg', 'Description': 'abcdefg',
                          'Type': 'Character', 'Number': '1'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    # mkstemp creates the file atomically; mktemp is deprecated and racy.
    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)
    w = Writer(f, v)
    # The next() builtin works on both Python 2 and 3; the .next() method
    # exists only on Python 2 iterators.
    rec = next(v)

    rec.INFO["abcdefg"] = "XXX"
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    ret = v.INFO["abcdefg"]
    # The value may come back as bytes; normalise before comparing.
    if isinstance(ret, bytes):
        ret = ret.decode()
    assert ret == "XXX", dict(v.INFO)
def test_writer():
    """FILTER values assigned as a list or as "PASS" are written as expected."""
    v = VCF(VCF_PATH)
    # mkstemp creates the file atomically; mktemp is deprecated and racy.
    fd, f = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, f)

    o = Writer(f, v)
    rec = next(v)
    rec.INFO["AC"] = "3"

    # The same record is written three times with different FILTER values.
    rec.FILTER = ["LowQual"]
    o.write_record(rec)

    rec.FILTER = ["LowQual", "VQSRTrancheSNP99.90to100.00"]
    o.write_record(rec)

    rec.FILTER = "PASS"
    o.write_record(rec)

    o.close()

    # PASS is expected to read back as None.
    expected = ["LowQual".encode(),
                "LowQual;VQSRTrancheSNP99.90to100.00".encode(),
                None]

    for i, variant in enumerate(VCF(f)):
        assert variant.FILTER == expected[i], (variant.FILTER, expected[i])
def run(inheritance_model, ped, vcf, min_depth, min_gq, min_kindreds, severity):
    """Write variants matching an inheritance model in enough families.

    Consecutive variants are grouped by gene (first gene found among the
    ANN/EFF/CSQ effects). For each group, every family from ``ped`` is
    tested with the ``EvalFamily`` method named by ``inheritance_model``.
    When at least ``min_kindreds`` distinct families match, each matching
    variant is written to stdout with INFO/inheritance set to
    "<gene>:<comma-separated family ids>".

    Args:
        inheritance_model: name of the EvalFamily method to call.
        ped: pedigree input for ``Family.from_ped``.
        vcf: path of the input VCF.
        min_depth: forwarded to the inheritance test.
        min_gq: forwarded to the inheritance test.
        min_kindreds: minimum number of distinct matching families.
        severity: optional collection of impact severities to keep.
    """
    from cyvcf2 import VCF, Writer

    vcf = VCF(vcf, samples="-")

    annos = {}
    if "ANN" in vcf:
        desc = vcf["ANN"]["Description"]
        parts = [x.strip("\"'") for x in re.split(r"\s*\|\s*", desc.split(":", 1)[1].strip('" '))]
        # NOTE(review): unlike EFF/CSQ below, the raw description (not
        # `parts`) is stored here - confirm that is what Effect.new expects.
        annos["ANN"] = desc
    if "EFF" in vcf:
        desc = vcf["EFF"]["Description"]
        parts = [x.strip(" [])'(\"") for x in re.split(r"\||\(", desc.split(":", 1)[1].strip())]
        annos["EFF"] = parts
    if "CSQ" in vcf:
        desc = vcf["CSQ"]["Description"]
        parts = [x.strip(" [])'(\"") for x in re.split(r"\||\(", desc.split(":", 1)[1].strip())]
        annos["CSQ"] = parts

    vcf.update(id="inheritance", type="String", number="1", description="inheritance stuffs")
    out = Writer("-", vcf)

    vcf_order = dict((n, i) for i, n in (enumerate(vcf.samples)))
    fams = Family.from_ped(ped, order=vcf_order)
    # Pair each evaluator with the sample indexes of its subjects.
    for fam_id in fams:
        fams[fam_id] = (EvalFamily(fams[fam_id]), [s._i for s in fams[fam_id].subjects])

    def get_gene(variant):
        """Return the first gene among the variant's effects, or None."""
        for anno in annos:
            consequences = variant.INFO[anno].split(",")
            effs = (Effect.new(anno, c, annos[anno]) for c in consequences)
            # limit to requested severity
            if severity is not None:
                effs = [e for e in effs if e.impact_severity in severity]
            effs = sorted(effs, reverse=True)
            for eff in effs:
                if eff.gene:
                    return eff.gene

    try:
        # TODO: more flexible groupby
        for gene, variants in it.groupby(vcf, get_gene):
            matching_fams = defaultdict(list)
            saved_vars = []
            uniq_fams = []
            for i, variant in enumerate(variants):
                saved_vars.append(variant)
                for family_id, (fam, idxs) in fams.items():
                    fam.gt_types = variant.gt_types[idxs]
                    fam.gt_depths = variant.gt_depths[idxs]
                    fam.gt_quals = variant.gt_quals[idxs]
                    # this dispatches to fam.auto_rec/auto_dom/de_novo/, etc.
                    # by the string in inheritance model
                    res = getattr(fam, inheritance_model)(min_depth=min_depth, min_gq=min_gq)
                    # matched the inheritance model.
                    if res:
                        # can add custom logic here, e.g. and v.call_rate > 0.9:
                        matching_fams[i].append(family_id)
                        uniq_fams.append(family_id)

            if 0 < len(set(uniq_fams)) >= min_kindreds:
                if inheritance_model == 'comp_het':
                    # TODO: idxs = matching_fams.keys()
                    # run idxs[1:] vs idxs[:-1] for variants
                    pass
                for i, family_ids in sorted(matching_fams.items()):
                    variant = saved_vars[i]
                    # Sorted so the output is stable across runs.
                    variant.INFO["inheritance"] = "%s:%s" % (gene, ",".join(sorted(set(family_ids))))
                    out.write_record(variant)
    finally:
        # The writer was never closed before; close it so stdout is flushed.
        out.close()
from cyvcf2 import VCF, Writer
import re
import sys

# Separators for multi-valued CLNSIG; kept for callers, currently unused.
patt = re.compile(r',|\|')


def clinvar(v):
    """Return True when INFO/CLNSIG is exactly "5"."""
    return v.INFO.get("CLNSIG") == "5"


def aaf(v, max_aaf):
    """Return True when INFO/max_aaf_all is absent or not above ``max_aaf``."""
    observed = v.INFO.get("max_aaf_all")
    if observed is None:
        return True
    return float(observed) <= float(max_aaf)


# Usage: <script> <vcf_path> <max_aaf>; matching records go to stdout.
vcf_path = sys.argv[1]
max_aaf = float(sys.argv[2])

viter = VCF(vcf_path)
w = Writer("-", viter)

pos = lambda v: (v.CHROM, v.start, v.end)

for v in viter:
    if clinvar(v) and aaf(v, max_aaf):
        w.write_record(v)

w.close()