def main(bedfile, output_file, genome_file):
    """
    Construct a VCF file from scratch using the linkedSV and pysam modules.

    The bedpe input file describes the variants. A temporary VCF produced by
    ``vcf_from_scratch`` supplies the base header, which is extended with the
    INFO/FORMAT fields and the sample before the records are written.

    :param bedfile: bedpe file handed to ``reformat_bedpe2vcfrecords``.
    :param output_file: path of the VCF to write (also given to
        ``vcf_from_scratch``).
    :param genome_file: file handed to ``get_contigs``.
    """
    contigs = get_contigs(genome_file)
    tmp_vcf = vcf_from_scratch(output_file, contigs)
    ashkenazim_son = "HG002"

    # Context managers guarantee that both handles are closed (and the output
    # flushed) before the temporary file is deleted.
    with VariantFile(tmp_vcf) as vcf_in:
        header = vcf_in.header
        header.info.add('END', number=1, type='Integer',
                        description="End position of the variant "
                                    "described in this record")
        header.info.add('SVTYPE', number=1, type='String',
                        description="Type of structural variant")
        header.info.add('SVMETHOD', number=1, type='String',
                        description="SV detection method")
        header.info.add(
            'NUM_FRAGMENT_SUPPORT', number=1, type='Integer',
            description="Number of fragments supporting the variant")
        header.info.add(
            'NUM_READ_PAIR', number=1, type='Integer',
            description="Number of read pairs supporting the variant")
        header.info.add('ENDPOINT1', number=1, type='String', description="?")
        header.info.add('ENDPOINT2', number=1, type='String', description="?")
        header.info.add(
            'BARCODES', number=1, type='String',
            description="List of molecules barcodes supporting the variant")
        header.formats.add('GT', number=1, type='String',
                           description="Genotype")
        header.add_sample(ashkenazim_son)

        records = reformat_bedpe2vcfrecords(bedfile, header)
        with VariantFile(output_file, 'w', header=header) as vcf_out:
            for rec in records:
                vcf_out.write(rec)

    os.remove(tmp_vcf)
def file_process(fname):
    """
    Rewrite selected header definitions of one VCF and stream it to stdout.

    For every ``key`` listed in the module-level ``tbl_dict[cat]`` the existing
    header entry is removed and re-added with the Number/Type/Description
    taken from ``good_boy.header``. All records are then written to ``"-"``.
    Any failure is reported on stderr instead of being raised.

    :param fname: path of the VCF; a trailing newline is stripped.
    """
    # Bound before the try block so the error handler can always report it.
    cpath = fname
    try:
        cpath = fname.rstrip('\n')
        sys.stderr.write("Processing " + cpath + "\n")
        sys.stderr.flush()
        with VariantFile(cpath) as in_vcf:
            for cat in tbl_dict:
                for key in tbl_dict[cat]:
                    getattr(in_vcf.header, cat)[key].remove_header()
                    reference = getattr(good_boy.header, cat)[key]
                    in_vcf.header.add_meta(
                        cat_dict[cat],
                        items=[('ID', key),
                               ('Number', reference.number),
                               ('Type', reference.type),
                               ('Description', reference.description)])
            out_vcf = VariantFile("-", 'w', header=in_vcf.header)
            try:
                for rec in in_vcf.fetch():
                    out_vcf.write(rec)
            finally:
                # Close the writer even when a record fails to be written.
                out_vcf.close()
    except Exception as e:
        # Deliberate best-effort: report and carry on with the next file.
        sys.stderr.write(str(e) + "\n failed to process " + str(cpath) + "\n")
def run_process(opts, inputvcf):
    """
    Add a per-ALT ``AF`` FORMAT value, computed from ``AD``, to every record.

    For ALT allele ``n`` the fraction is ``AD[n + 1] / sum(AD)``; for a
    biallelic record this equals ``AD[1] / (AD[0] + AD[1])``. A fraction is
    reported as missing when the depth is zero or the needed ``AD`` entry is
    absent. Only the first sample is processed.

    :param opts: object whose ``output`` attribute is the output path; a falsy
        value writes to ``'-'``.
    :param inputvcf: path of the input VCF.
    """
    outputvcf = opts.output

    with VariantFile(inputvcf) as vcf_in:
        # Add FORMAT to header
        vcf_in.header.formats.add("AF", "A", "Float",
                                  "Allele fractions of alternate alleles")

        with VariantFile(outputvcf if outputvcf else '-', 'w',
                         header=vcf_in.header) as vcf_out:
            for record in vcf_in.fetch():
                alts = record.alts or ()  # records without ALT have alts=None
                if alts:
                    sample = record.samples[0]
                    depths = sample['AD']
                    has_missing = any(d is None for d in depths)
                    total = 0 if has_missing else sum(depths)

                    vaf_list = []
                    for n in range(len(alts)):
                        # Index n + 1: AD lists the reference depth first.
                        if total and n + 1 < len(depths):
                            vaf_list.append(float(depths[n + 1]) / float(total))
                        else:
                            vaf_list.append(None)

                    # Leave AF unset when nothing could be computed.
                    if any(v is not None for v in vaf_list):
                        sample["AF"] = tuple(vaf_list)

                vcf_out.write(record)
def filter_variants(vcf, read_ratio, depth, output):
    """
    Soft filter all variants with suspicious read ratio and insufficient
    read-depth.

    Two FILTER entries, ``AR{read_ratio}`` and ``DP{depth}``, are declared in
    the header. Records whose INFO ``DP`` is below ``depth`` get the DP
    filter. Otherwise records with exactly two ``AD`` values get the AR filter
    when ``alt / (ref + alt)`` is below ``read_ratio`` and ``PASS`` when it is
    not. Records with any other number of ``AD`` values are written unchanged
    (no multiallelic split is attempted).

    :param vcf: path of the input VCF.
    :param read_ratio: minimum accepted alt read fraction.
    :param depth: minimum accepted INFO ``DP``.
    :param output: path of the output VCF.
    """
    ar_filter = f"AR{read_ratio}"
    # Built from ``depth`` so the name always matches the header declaration;
    # a literal "DP100" is undeclared for any other depth.
    dp_filter = f"DP{depth}"

    with VariantFile(vcf) as vcf_in:
        new_header = vcf_in.header
        new_header.filters.add(ar_filter, None, None,
                               f"Ratio of ref/alt reads lower than {read_ratio}")
        new_header.filters.add(dp_filter, None, None,
                               f"DP is lower than {depth}x")

        with VariantFile(output, "w", header=new_header) as vcf_out:
            for record in vcf_in.fetch():
                ad = record.samples[0]["AD"]
                if record.info["DP"] < depth:
                    record.filter.add(dp_filter)
                elif len(ad) == 2:
                    n_ref, n_alt = ad
                    if n_alt / (n_ref + n_alt) < read_ratio:
                        record.filter.add(ar_filter)
                    else:
                        record.filter.add("PASS")
                vcf_out.write(record)
def _dump_rebased_vcf(records: List[VariantRecord], disco_paths: DiscoverPaths):
    """
    Write ``records`` to ``disco_paths.final_vcf``.

    The header is copied from the VCF at ``disco_paths.discov_vcf``. Both
    handles are closed on exit so the output is fully flushed.

    :param records: records to write, in order.
    :param disco_paths: object providing the ``discov_vcf`` and ``final_vcf``
        paths.
    """
    with VariantFile(disco_paths.discov_vcf) as template_vcf:
        with VariantFile(disco_paths.final_vcf, "w",
                         header=template_vcf.header) as output_vcf:
            for record in records:
                output_vcf.write(record)
class VCFWriter:
    """Write called variants to ``<output_dir><filename>.vcf`` through pysam."""

    def __init__(self, reference_file_path, contigs, sample_name, output_dir, filename):
        """Open the output VCF with a header built from the reference FASTA."""
        self.fasta_handler = PEPPER_HP.FASTA_handler(reference_file_path)
        self.contigs = contigs
        header = self.get_vcf_header(sample_name, contigs)
        vcf_path = output_dir + filename + '.vcf'
        self.vcf_file = VariantFile(vcf_path, 'w', header=header)

    def write_vcf_records(self, called_variant):
        """Write one record for a 6-tuple
        ``(contig, ref_start, ref_end, ref_seq, alleles, genotype)``.

        QUAL and GQ are fixed at 60 and the filter is always ``PASS``.
        """
        contig, ref_start, ref_end, ref_seq, alt_alleles, genotype = called_variant
        # The reference sequence goes first, followed by the alternate alleles.
        all_alleles = (ref_seq,) + tuple(alt_alleles)
        record = self.vcf_file.new_record(
            contig=str(contig),
            start=ref_start,
            stop=ref_end,
            id='.',
            qual=60,
            filter='PASS',
            alleles=all_alleles,
            GT=genotype,
            GQ=60,
        )
        self.vcf_file.write(record)

    def get_vcf_header(self, sample_name, contigs):
        """Build the header: FILTER and FORMAT lines, the contigs of the
        reference that are also in ``contigs``, and ``sample_name``."""
        header = VariantHeader()

        filter_lines = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filter_lines:
            header.add_meta(key='FILTER',
                            items=[('ID', filter_id),
                                   ('Description', description)])

        format_lines = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in format_lines:
            header.add_meta(key='FORMAT',
                            items=[('ID', format_id),
                                   ('Number', 1),
                                   ('Type', value_type),
                                   ('Description', description)])

        for name in self.fasta_handler.get_chromosome_names():
            if name in contigs:
                length = self.fasta_handler.get_chromosome_sequence_length(name)
                header.contigs.add(name, length=length)

        header.add_sample(sample_name)
        return header
class VCFWriter:
    """Write called variants to a timestamped VCF named after the BAM file."""

    def __init__(self, bam_file_path, sample_name, output_dir):
        """Open ``<output_dir><bam name>_<timestamp>.vcf`` for writing."""
        self.bam_handler = BamHandler(bam_file_path)
        # File name without directories and without anything after the first dot.
        base_name = bam_file_path.rstrip().split('/')[-1]
        bam_file_name = base_name.split('.')[0]
        header = self.get_vcf_header(sample_name)
        timestamp = time.strftime("%m%d%Y_%H%M%S")
        vcf_path = output_dir + bam_file_name + '_' + timestamp + '.vcf'
        self.vcf_file = VariantFile(vcf_path, 'w', header=header)

    def write_vcf_records(self, called_variants):
        """Write one record per variant; QUAL and GQ are fixed at 60 and the
        filter is always ``PASS``."""
        for variant in called_variants:
            # Reference allele first, then the alternate alleles.
            all_alleles = (variant.ref,) + tuple(variant.alternate_alleles)
            record = self.vcf_file.new_record(
                contig=str(variant.chromosome_name),
                start=variant.pos_start,
                stop=variant.pos_end,
                id='.',
                qual=60,
                filter='PASS',
                alleles=all_alleles,
                GT=variant.genotype,
                GQ=60,
            )
            self.vcf_file.write(record)

    def get_vcf_header(self, sample_name):
        """Build the header: FILTER and FORMAT lines, one contig line per
        sequence returned by the BAM handler, and ``sample_name``."""
        header = VariantHeader()

        filter_lines = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filter_lines:
            header.add_meta(key='FILTER',
                            items=[('ID', filter_id),
                                   ('Description', description)])

        format_lines = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in format_lines:
            header.add_meta(key='FORMAT',
                            items=[('ID', format_id),
                                   ('Number', 1),
                                   ('Type', value_type),
                                   ('Description', description)])

        for sequence in self.bam_handler.get_header_sq():
            contig_items = [('ID', sequence['SN']), ('length', sequence['LN'])]
            header.add_meta(key='contig', items=contig_items)

        header.add_sample(sample_name)
        return header
def convert_vcffile(filename, outfile_name, source):
    """
    Convert every record of a VCF with ``convert_to_bnd`` and write the result.

    When ``source`` is ``"svaba"`` each record is first passed through
    ``convert_indel``. Both files are closed on exit, also on error.

    :param filename: path of the input VCF.
    :param outfile_name: path of the output VCF (header copied from the input).
    :param source: name of the caller, forwarded to ``convert_to_bnd``.
    """
    from_svaba = source == "svaba"
    with VariantFile(filename) as vcf_in:
        with VariantFile(outfile_name, "w", header=vcf_in.header) as vcf_out:
            for rec in vcf_in.fetch():
                if from_svaba:
                    rec = convert_indel(rec)
                for conv_rec in convert_to_bnd(rec, source):
                    vcf_out.write(conv_rec)
def main(argv):
    """
    Keep only sites where some sample has a confident non-reference call.

    A site is written to ``<input>.filtered.vcf.gz`` when, for at least one
    sample, the largest ``GP`` value after the first (i.e. het or non-ref
    homozygous) is greater than 0.9. Samples without ``GP`` are ignored.

    :param argv: argument list; ``argv[0]`` is the input VCF/BCF path.
    """
    in_path = argv[0]
    with VariantFile(in_path) as bcf_in:  # auto-detect input format
        with VariantFile(in_path + '.filtered.vcf.gz', 'w',
                         header=bcf_in.header) as bcf_out:
            for site in bcf_in.fetch():
                for _name, rec in site.samples.items():
                    gp = rec.get('GP')
                    # Missing GP (None) or missing entries must not crash max().
                    non_ref = [p for p in (gp or ())[1:] if p is not None]
                    if non_ref and max(non_ref) > 0.9:
                        bcf_out.write(site)
                        break  # one confident sample is enough to keep the site
def create_sample_format_from_info_lofreq(sample, input_name, output_name, skip_gt=False):
    """
    Copy INFO values of each record into FORMAT fields of a new sample.

    The FORMAT fields AF, AD, DP, DP4 and GT are declared, ``sample`` is added
    to the header and every record is re-created with those fields filled
    from the record's INFO (AD, AF, DP, DP4). Both files are closed on exit.

    :param sample: name of the sample column to add.
    :param input_name: path of the input VCF.
    :param output_name: path of the output VCF.
    :param skip_gt: accepted for compatibility; not used by this function.
    """
    with VariantFile(input_name, 'r') as input_vcf:
        input_vcf.header.formats.add("AF", number=1, type='Float',
                                     description="Allele Frequency")
        input_vcf.header.formats.add(
            "AD", number=".", type='String',
            description=
            "Allelic sample depths for the ref and alt alleles in the order listed"
        )
        input_vcf.header.formats.add(
            "DP", number=1, type='Integer',
            description=
            "Approximate read depth (reads with MQ=255 or with bad mates are filtered)"
        )
        input_vcf.header.formats.add(
            "DP4", number=4, type='Integer',
            description=
            "Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"
        )
        input_vcf.header.formats.add("GT", number=".", type="String",
                                     description="Genotype")
        input_vcf.header.add_sample(sample)

        with VariantFile(output_name, 'w', header=input_vcf.header) as output_vcf:
            for record in input_vcf:
                fields = {
                    "AF": record.info["AF"],
                    "DP4": record.info["DP4"],
                    "DP": record.info["DP"],
                    "AD": record.info["AD"],
                    # NOTE(review): GT is given allele strings (alt, ref), not
                    # allele indices - confirm this is what pysam expects.
                    "GT": (record.alleles[1], record.alleles[0]),
                }
                new_record = output_vcf.new_record(
                    record.chrom, record.start, record.stop, record.alleles,
                    record.id, record.qual, record.filter, record.info,
                    [fields])
                output_vcf.write(new_record)
def decompose_multiallelic_record(in_vcf, out_vcf):
    """
    Break records with multiple ALT alleles into multiple records.

    Records with 8 or more ALT alleles are dropped (workaround for a segfault,
    see https://github.com/leukgen/click_mergevcfs/issues/2). The uncompressed
    output is written next to ``out_vcf`` (without a trailing ``.gz``) and
    then compressed with ``bgzip -f``.

    :param in_vcf: path of the input VCF.
    :param out_vcf: path of the output; a trailing ``.gz`` suffix is removed
        to obtain the name of the uncompressed file.
    """
    # str.strip(".gz") would remove any of the characters '.', 'g', 'z' from
    # BOTH ends of the path; only the suffix must go.
    raw_out = out_vcf[:-len(".gz")] if out_vcf.endswith(".gz") else out_vcf

    with VariantFile(in_vcf, "r") as i_vcf:
        with VariantFile(raw_out, "w", header=i_vcf.header) as o_vcf:
            for record in i_vcf:
                # Only mutect puts multiple ALTs in one record.
                alts = record.alts or ()  # alts is None when there is no ALT
                number_events = len(alts)

                # Temporary fix due to segfault (see docstring).
                if number_events >= 8:
                    continue
                if number_events <= 1:
                    o_vcf.write(record)
                    continue

                click.echo("file={},pos={}".format(in_vcf, record.pos))
                sample_names = list(record.samples)
                for i, alt in enumerate(alts):
                    new_rec = record.copy()
                    new_rec.alts = (alt,)
                    for g in sample_names:
                        source = record.samples[g]
                        target = new_rec.samples[g]
                        present = set(source)
                        # Multiallelic sites have GT like 0/1/2, which causes
                        # errors later; overwrite with ./.
                        target["GT"] = (None, None)
                        # none_if_tuple_out_of_idx is used because these
                        # fields sometimes come back as a tuple of (None,).
                        if "AD" in present:
                            target["AD"] = (
                                source["AD"][0],
                                none_if_tuple_out_of_idx(t=source["AD"],
                                                         index=i + 1),
                            )
                        if "AF" in present:
                            target["AF"] = none_if_tuple_out_of_idx(
                                t=source["AF"], index=i)
                        for key in ("F1R2", "F2R1"):
                            if key in present:
                                target[key] = (
                                    source[key][0],
                                    none_if_tuple_out_of_idx(t=source[key],
                                                             index=i + 1),
                                )
                    o_vcf.write(new_rec)

    subprocess.check_call(["bgzip", "-f", raw_out])
def vcf_merge_vcfs(in_vcf1, in_vcf2, happy_vcf, output_vcf):
    """
    Merge two vcf files.

    All records of VCF 1 are kept. Records of VCF 2 are added unless their
    position is a true positive (``BD == 'TP'`` for any sample) in the hap.py
    file or a record of VCF 1 exists at the same contig and position. The
    result is sorted by (contig, position) and written with VCF 1's header.

    :param in_vcf1: Input VCF file 1
    :param in_vcf2: Input VCF file 2
    :param happy_vcf: Hap.py input file
    :param output_vcf: Output VCF file
    :return: None
    """
    # contig -> set of true-positive positions (sets give O(1) membership)
    true_positive_positions = defaultdict(set)
    with VariantFile(happy_vcf) as happy_vcf_file:
        for rec in happy_vcf_file.fetch():
            for sample in rec.samples:
                if rec.samples[sample]['BD'] == 'TP':
                    true_positive_positions[rec.contig].add(rec.pos)

    with VariantFile(in_vcf1) as vcf1_vcf_file, \
            VariantFile(in_vcf2) as vcf2_vcf_file:
        merged_records = []
        vcf1_sites = set()

        # for VCF1 we add all of the records
        for rec in vcf1_vcf_file.fetch():
            vcf1_sites.add((rec.contig, rec.pos))
            merged_records.append((rec.contig, rec.pos, rec))

        # for VCF2 we add records that are not true positives
        for rec in vcf2_vcf_file.fetch():
            if rec.pos in true_positive_positions.get(rec.contig, ()):
                continue
            if (rec.contig, rec.pos) not in vcf1_sites:
                merged_records.append((rec.contig, rec.pos, rec))

        # sort the records by contig name, then position
        merged_records.sort(key=operator.itemgetter(0, 1))

        with VariantFile(output_vcf, 'w',
                         header=vcf1_vcf_file.header) as vcf_out:
            for _contig, _pos, rec in merged_records:
                vcf_out.write(rec)

    # process completed
    sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') +
                     "] INFO: PROCESS FINISHED " + "\n")
    sys.stderr.flush()
def add_contigs_to_header(input_name, output_name, contig_file, assembly):
    """
    Copy a VCF while adding contig definitions to its header.

    One contig line is added per key of ``InfoImporter(contig_file)``, using
    that entry's ``'length'`` and the given ``assembly``. Both files are
    closed on exit.

    :param input_name: path of the input VCF.
    :param output_name: path of the output VCF.
    :param contig_file: file handed to ``InfoImporter``.
    :param assembly: assembly name stored on every contig line.
    """
    from src.lib.data.files.reference import InfoImporter

    info = InfoImporter(contig_file)
    with VariantFile(input_name, 'r') as input_vcf:
        for key in info:
            input_vcf.header.contigs.add(key,
                                         length=info[key]['length'],
                                         assembly=assembly)
        with VariantFile(output_name, 'w',
                         header=input_vcf.header) as output_vcf:
            for record in input_vcf:
                output_vcf.write(record)
def get_filtered_phased_het_trio_variants(trio_vcf, trio_filtered_het_phased_vcf, sample_name):
    """
    Write the PASS, phased, heterozygous variants of one sample.

    The input is subset to ``sample_name``. A record is written (with ``PS``
    set to 1) when its first FILTER is ``PASS``, the genotype is phased and
    its two alleles differ. Records without any FILTER are skipped.

    :param trio_vcf: path of the input VCF.
    :param trio_filtered_het_phased_vcf: path of the output VCF.
    :param sample_name: sample to keep.
    :return: always 0.
    """
    with VariantFile(trio_vcf) as vcf_in:
        vcf_in.subset_samples([sample_name])
        with VariantFile(trio_filtered_het_phased_vcf, 'w',
                         header=vcf_in.header) as vcf_out:
            for rec in vcf_in.fetch():
                filters = list(rec.filter.keys())
                # An unset FILTER column yields no keys; indexing [0] would
                # raise IndexError.
                if not filters or filters[0] != 'PASS':
                    continue
                rec_sample = rec.samples[0]
                genotype = rec_sample['GT']
                if rec_sample.phased and genotype[0] != genotype[1]:
                    rec_sample.update({'PS': 1})
                    vcf_out.write(rec)
    return 0
def filter_somatic(in_vcf_path, out_vcf_path):
    """
    Write the records for which ``is_somatic`` is true to a new VCF.

    Records whose write raises ``OSError`` are skipped and counted; the count
    is printed at the end. Both files are closed even if an error occurs.

    :param in_vcf_path: path of the input VCF.
    :param out_vcf_path: path of the output VCF.
    """
    num_skipped_records = 0
    with VariantFile(in_vcf_path) as in_vcf:
        with VariantFile(out_vcf_path, 'w', header=in_vcf.header) as out_vcf:
            for rec in in_vcf:
                if not is_somatic(rec):
                    continue
                try:
                    out_vcf.write(rec)
                except OSError:
                    num_skipped_records += 1
    print("Skipped " + str(num_skipped_records) + " bad records")
def main():
    """
    Remove outlier samples from the snakemake input VCF.

    The outlier table (``snakemake.input.outliers``) is grouped into
    ``svtype -> [sample, ...]`` and handed to ``remove_outliers`` together
    with the open input VCF; the yielded records are written to
    ``snakemake.output[0]``. Both VCF handles are closed on exit.
    """
    outlier_table = pd.read_table(snakemake.input.outliers)
    outliers = defaultdict(list)
    for _idx, row in outlier_table.iterrows():
        outliers[row['svtype']].append(row['sample'])

    with VariantFile(snakemake.input.vcf) as vcf:
        with VariantFile(snakemake.output[0], mode='w',
                         header=vcf.header) as filtered:
            for record in remove_outliers(vcf, outliers):
                filtered.write(record)
def filter_bcf_file(self, bcf_file):
    """
    Write the records of ``bcf_file`` that fall inside the target region.

    The output is ``<bcf_file minus its last 4 characters>.target.vcf``. A
    record is written when its contig equals ``self.contig_id`` and its
    position lies within ``[self.contig_start, self.contig_end]``.
    """
    source = VariantFile(bcf_file, 'rb')
    target = VariantFile("%s.target.vcf" % bcf_file[:-4], 'w',
                         header=source.header)
    for rec in source.fetch():
        if rec.contig != self.contig_id:
            continue
        if self.contig_start == False and self.contig_end == False:
            # NOTE(review): with no start/end configured nothing is written
            # for the contig - confirm this is intended.
            continue
        if self.contig_start <= rec.pos <= self.contig_end:
            target.write(rec)
    source.close()
    target.close()
def subset_by_callers(in_file, callers):
    """
    Extract the records called by at least one of ``callers``.

    A record is kept when the dash-separated INFO ``set`` value shares a name
    with ``callers``. Nothing is rewritten if the output (plain or ``.gz``)
    already exists. The caller list and the number of kept records are
    printed.

    :param in_file: path of the input VCF.
    :param callers: iterable of caller names.
    :return: result of ``vcfutils.bgzip_and_index`` on the output file.
    """
    out_file = "%s-%s.vcf" % (in_file.replace(".vcf", "").replace(".gz", ""),
                              "_".join(callers))
    if not os.path.exists(out_file) and not os.path.exists(out_file + ".gz"):
        want_callers = set(callers)
        count = 0
        # The writer must be closed (flushed) before the file is compressed.
        with VariantFile(in_file) as reader:
            with VariantFile(out_file, "w", header=reader.header) as writer:
                for rec in reader:
                    cur_callers = set(rec.info["set"].split("-"))
                    if cur_callers & want_callers:
                        count += 1
                        writer.write(rec)
        # Same text as the Python 2 statement `print callers, count`.
        print("%s %s" % (callers, count))
    return vcfutils.bgzip_and_index(out_file, {})
def subset_by_callers(in_file, callers):
    """
    Extract the records called by at least one of ``callers``.

    A record is kept when the dash-separated INFO ``set`` value shares a name
    with ``callers``. Nothing is rewritten if the output (plain or ``.gz``)
    already exists. The caller list and the number of kept records are
    printed.

    :param in_file: path of the input VCF.
    :param callers: iterable of caller names.
    :return: result of ``vcfutils.bgzip_and_index`` on the output file.
    """
    base_name = in_file.replace(".vcf", "").replace(".gz", "")
    out_file = "%s-%s.vcf" % (base_name, "_".join(callers))
    already_done = (os.path.exists(out_file)
                    or os.path.exists(out_file + ".gz"))
    if not already_done:
        want_callers = set(callers)
        count = 0
        # Close (flush) the writer before the file is handed to bgzip.
        with VariantFile(in_file) as reader:
            with VariantFile(out_file, "w", header=reader.header) as writer:
                for rec in reader:
                    cur_callers = set(rec.info["set"].split("-"))
                    if cur_callers & want_callers:
                        count += 1
                        writer.write(rec)
        # Same text as the Python 2 statement `print callers, count`.
        print("%s %s" % (callers, count))
    return vcfutils.bgzip_and_index(out_file, {})
def add_AD_field_using_DP4(input_name, output_name):
    """
    Add an INFO ``AD`` value derived from ``DP4`` to every record.

    ``AD`` is declared as a String field and set to ``"<ref>,<alt>"`` where
    ref and alt are the sums of the forward and reverse ``DP4`` counts. Both
    files are closed on exit.

    :param input_name: path of the input VCF.
    :param output_name: path of the output VCF.
    """
    with VariantFile(input_name, 'r') as input_vcf:
        input_vcf.header.info.add(
            "AD", number=".", type='String',
            description=
            "Allelic depths for the ref and alt alleles in the order listed")
        with VariantFile(output_name, 'w',
                         header=input_vcf.header) as output_vcf:
            for record in input_vcf:
                ref_pos, ref_neg, var_pos, var_neg = record.info['DP4']
                new_record = record.copy()
                new_record.info["AD"] = "{},{}".format(ref_pos + ref_neg,
                                                       var_pos + var_neg)
                output_vcf.write(new_record)
def main(bedfile, output_file, genome_file):
    """
    Construct a VCF file from scratch using the linkedSV bedpe input file
    that describes the variants.

    A temporary VCF produced by ``vcf_from_scratch`` supplies the base header,
    which is extended with INFO/FILTER/FORMAT fields and the sample before
    the records are written.

    :param bedfile: bedpe file handed to ``reformat_bedpe2vcfrecords``.
    :param output_file: path of the VCF to write (also given to
        ``vcf_from_scratch``).
    :param genome_file: file handed to ``get_contigs``.
    """
    contigs = get_contigs(genome_file)
    tmp_vcf = vcf_from_scratch(output_file, contigs)
    ashkenazim_son = "HG002"

    # Context managers guarantee that both handles are closed (and the output
    # flushed) before the temporary file is deleted.
    with VariantFile(tmp_vcf) as vcf_in:
        header = vcf_in.header
        header.info.add('END', number=1, type='Integer',
                        description="End position of the variant "
                                    "described in this record")
        header.info.add('SVLEN', number=1, type='Integer',
                        description="Length of the variant "
                                    "described in this record")
        header.info.add('SVTYPE', number=1, type='String',
                        description="Type of structural variant")
        header.info.add('SVMETHOD', number=1, type='String',
                        description="SV detection method")
        header.filters.add('FAIL', number=None, type=None,
                           description="Fail to pass filtering")
        header.formats.add('GT', number=1, type='String',
                           description="Genotype")
        header.add_sample(ashkenazim_son)

        records = reformat_bedpe2vcfrecords(bedfile, header)
        with VariantFile(output_file, 'w', header=header) as vcf_out:
            for rec in records:
                vcf_out.write(rec)

    os.remove(tmp_vcf)
def prepare_octopus_vcf_for_rtg(octopus_vcf, tumour_sample, out_vcf_name):
    """
    Rewrite the tumour genotype of every record as a two-allele genotype.

    Octopus reports non-diploid genotypes for somatic variants. The new
    genotype is the first allele of the old one plus the last non-reference
    allele found in it. Records whose write raises ``OSError`` are skipped.
    The output is closed before ``index`` is called on it.

    :param octopus_vcf: path of the input VCF.
    :param tumour_sample: name of the sample whose GT is rewritten.
    :param out_vcf_name: path of the output VCF.
    :raises StopIteration: if a genotype has no non-reference allele.
    """
    n_failed = 0
    with VariantFile(octopus_vcf) as in_vcf:
        with VariantFile(out_vcf_name, 'w', header=in_vcf.header) as out_vcf:
            for record in in_vcf:
                tumour = record.samples[tumour_sample]
                old_gt = tumour['GT']
                assert (len(old_gt) > 1)
                # Scan from the end: take the last called non-reference allele.
                somatic_allele = next(a for a in reversed(list(old_gt))
                                      if a is not None and a > 0)
                tumour['GT'] = (old_gt[0], somatic_allele)
                try:
                    out_vcf.write(record)
                except OSError:
                    n_failed += 1
    index(out_vcf_name)
def dtoxog_maf_to_vcf(input_maf: str, reference_fa: str, output_vcf: str) -> None:
    """
    Convert a dToxoG MAF into a minimal VCF holding only the dtoxo failures.

    :param input_maf: The annotated dtoxog MAF output file.
    :param reference_fa: Reference fasta used to make seqdict header.
    :param output_vcf: The output minimal VCF with only failed dtoxog records
        BGzip and tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("dtoxog_maf_to_vcf")
    logger.info("Transforms dToxoG MAF to minimal VCF of dtoxo failures")

    tag = "oxog"
    n_seen = 0
    n_written = 0

    # Header and writer
    vcf_header = generate_header(reference_fa, tag)
    mode = get_pysam_outmode(output_vcf)
    writer = VariantFile(output_vcf, mode=mode, header=vcf_header)

    # Only records flagged with oxoGCut == "1" are converted and written.
    try:
        with open(input_maf, "rt") as maf_handle:
            for maf_record in maf_generator(maf_handle):
                n_seen += 1
                if maf_record["oxoGCut"] != "1":
                    continue
                writer.write(build_new_record(maf_record, writer, tag))
                n_written += 1
    finally:
        writer.close()

    if mode == "wz":
        logger.info("Creating tabix index...")
        tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Wrote {}".format(n_seen, n_written))
def add_PASSED_field(in_vcf, out_vcf):
    """
    Add PASSED_{caller} fields.

    Add flags (e.g. PASSED_caveman) under INFO for PASS variants, to reduce
    the ambiguity of confident variants in the merged vcf. The uncompressed
    output is written to ``out_vcf`` without a trailing ``.gz`` and then
    compressed with ``bgzip -f``.

    :param in_vcf: path of the input VCF; also given to ``get_caller``.
    :param out_vcf: path of the output; a trailing ``.gz`` suffix is removed
        to obtain the name of the uncompressed file.
    """
    # see logic of merging INFO fields
    # https://github.com/vcftools/vcftools/blob/490848f7865abbb4b436ca09381ea7912a363fe3/src/perl/vcf-merge
    caller = get_caller(in_vcf)
    passed_key = "PASSED_{}".format(caller)

    # str.strip(".gz") would remove any of the characters '.', 'g', 'z' from
    # BOTH ends of the path; only the suffix must go.
    raw_out = out_vcf[:-len(".gz")] if out_vcf.endswith(".gz") else out_vcf

    with VariantFile(in_vcf, "rb") as i_vcf:
        new_header = i_vcf.header.copy()
        try:
            new_header.info.add(
                passed_key, ".", "Flag",
                "this variants passed which caller(s)",
            )
            i_vcf.header.info.add(
                passed_key, ".", "Flag",
                "this variants passed which caller(s)",
            )
        except ValueError:
            # Deliberately ignored; presumably raised when the field is
            # already declared - TODO confirm.
            pass

        with VariantFile(raw_out, "w", header=new_header) as o_vcf:
            for record in i_vcf:
                new_rec = record.copy()
                filters = list(record.filter)
                if filters and filters[0] == "PASS":
                    new_rec.info[passed_key] = 1
                o_vcf.write(new_rec)

    subprocess.check_call(["bgzip", "-f", raw_out])
def main():
    """
    Command-line entry point: remove outlier samples from a VCF.

    The outlier table is grouped into ``svtype -> [sample, ...]`` and handed
    to ``remove_outliers`` together with the open input VCF; the yielded
    records are written to the output VCF. Both VCF handles are closed on
    exit.
    """
    parser = argparse.ArgumentParser("find_outliers.py")
    # Each argument previously carried the same copy-pasted help text.
    parser.add_argument("input", type=str, help="input VCF file")
    parser.add_argument("output", type=str,
                        help="output VCF file with outliers removed")
    parser.add_argument("outliers", type=str,
                        help="table of outliers with 'svtype' and 'sample' "
                             "columns")
    args = parser.parse_args()

    outlier_table = pd.read_table(args.outliers)
    outliers = defaultdict(list)
    for _idx, row in outlier_table.iterrows():
        outliers[row['svtype']].append(row['sample'])

    with VariantFile(args.input) as vcf:
        with VariantFile(args.output, mode='w', header=vcf.header) as filtered:
            for record in remove_outliers(vcf, outliers):
                filtered.write(record)
def run_process(opts, inputvcf):
    """
    Add an ``HGVS_p`` INFO value built from the ``ANN`` annotations.

    For every ``ANN`` entry the 11th ``|``-separated field is passed to
    ``convert_hgvsp``; empty results become ``'.'``. The values are joined
    with commas. Records without ``ANN`` are written unchanged. The
    ``variant_type`` INFO field is declared but not filled here.

    :param opts: object whose ``output`` attribute is the output path; a falsy
        value writes to ``'-'``.
    :param inputvcf: path of the input VCF.
    """
    outputvcf = opts.output

    with VariantFile(inputvcf) as vcf_in:
        # Add INFO to header
        vcf_in.header.info.add(
            "HGVS_p", ".", "String",
            "HGVS.p Information (Single Character Amino Acid)")
        vcf_in.header.info.add("variant_type", ".", "String",
                               "Variant Type for Tiering System")

        with VariantFile(outputvcf if outputvcf else '-', 'w',
                         header=vcf_in.header) as vcf_out:
            for record in vcf_in.fetch():
                if "ANN" in record.info:
                    new_hgvsp = []
                    for annstring in record.info["ANN"]:
                        ann = annstring.split("|")
                        converted = convert_hgvsp(ann[10])
                        if converted == "" or converted is None:
                            converted = '.'
                        new_hgvsp.append(converted)
                    record.info["HGVS_p"] = ",".join(new_hgvsp)
                vcf_out.write(record)
def write_rephased_tenx_vcf(tenx_vcf, tenx_records, tenx_phase_sets, threshold, workdir):
    """
    Write a new 10X VCF, switching genotypes depending on the log-ratios.

    Only phase sets marked ``rephased`` are considered. When the set's
    ``log2ratio`` is at least ``threshold`` its records are written with
    ``PS`` set to 1; when it is at most ``-threshold`` the two GT alleles are
    additionally swapped and the genotype marked as phased. Other sets are
    skipped. The files are closed before the path is returned.

    :param tenx_vcf: path of the original 10X VCF (``.vcf`` or ``.vcf.gz``);
        supplies the header and the output base name.
    :param tenx_records: mapping ``"<chrom>:<pos>"`` -> record.
    :param tenx_phase_sets: mapping of phase-set id -> phase-set object.
    :param threshold: log2-ratio threshold.
    :param workdir: directory of the output file.
    :return: path of the written VCF, or None for an unsupported extension.
    """
    basename = os.path.basename(tenx_vcf)
    if basename.endswith('.vcf'):
        offset = -4
    elif basename.endswith('.vcf.gz'):
        offset = -7
    else:
        return None

    tenx_rephased_vcf = (workdir + '/' + basename[:offset] +
                         '.filtered.het.rephased.vcf')

    with VariantFile(tenx_vcf) as vcf_in:
        with VariantFile(tenx_rephased_vcf, 'w',
                         header=vcf_in.header) as vcf_out:
            for ps_id in tenx_phase_sets:
                phase_set = tenx_phase_sets[ps_id]
                if not phase_set.rephased:
                    continue
                if phase_set.log2ratio >= threshold:
                    swap_alleles = False
                elif phase_set.log2ratio <= -threshold:
                    swap_alleles = True
                else:
                    continue

                chrom = phase_set.chrom
                for pos in phase_set.positions:
                    record = tenx_records[chrom + ':' + str(pos)]
                    sample = record.samples[0]
                    sample['PS'] = 1
                    if swap_alleles:
                        genotype = sample['GT']
                        sample['GT'] = (genotype[1], genotype[0])
                        sample.phased = True
                    vcf_out.write(record)

    return tenx_rephased_vcf
def _contig_sort_key(name):
    """Return a key that orders numeric contigs numerically and puts all
    other contigs (e.g. chrX) after them, by name."""
    stripped = name.replace("chr", "")
    try:
        return (0, int(stripped), "")
    except ValueError:
        return (1, 0, stripped)


def _copy_pindel_record(vcf_out, source):
    """Re-create a pindel record against the output header, adding the
    ``AF`` FORMAT value and the ``DP`` INFO value derived from ``AD``."""
    tmp = vcf_out.new_record()
    tmp.chrom = source.chrom
    tmp.pos = source.pos
    tmp.ref = source.ref
    tmp.alts = source.alts
    for key in source.info.keys():
        tmp.info[key] = source.info[key]
    for key in source.format.keys():
        tmp.samples[0][key] = source.samples[0][key]
    allele_depths = tmp.samples[0]["AD"]
    depth = allele_depths[0] + allele_depths[1]
    if depth:  # leave AF unset rather than dividing by zero
        tmp.samples[0]["AF"] = float(allele_depths[1]) / float(depth)
    tmp.info["DP"] = depth
    return tmp


def run_process(opts, mutect2_vcf, pindel_vcf):
    """
    Merge pindel calls into a mutect2 VCF.

    INFO definitions missing from the mutect2 header are copied from the
    pindel header. The pindel records are interleaved with the mutect2
    records by (contig, position), both inputs being assumed sorted. A pindel
    record with the same contig, position and ALT alleles as a mutect2 record
    is dropped in favour of the mutect2 one. Pindel records located after the
    last mutect2 record are written at the end.

    :param opts: object whose ``output`` attribute is the output path; a falsy
        value writes to ``'-'``.
    :param mutect2_vcf: path of the mutect2 VCF.
    :param pindel_vcf: path of the pindel VCF.
    """
    outputvcf = opts.output

    with VariantFile(mutect2_vcf) as mutect2, \
            VariantFile(pindel_vcf) as pindel:
        # Add pindel INFO definitions to the new header
        new_header = mutect2.header
        known_info = set(new_header.info.keys())
        for meta in pindel.header.info.values():
            if meta.name not in known_info:
                new_header.info.add(meta.name, meta.number, meta.type,
                                    meta.description)

        with VariantFile(outputvcf if outputvcf else '-', 'w',
                         header=new_header) as vcf_out:
            pindel_records = [_copy_pindel_record(vcf_out, p)
                              for p in pindel.fetch()]
            total = len(pindel_records)
            cursor = 0  # first pindel record not yet consumed
            held = []   # same-site pindel calls with different ALT alleles

            for record in mutect2.fetch():
                site = (_contig_sort_key(record.chrom), record.pos)
                carry = []

                for cand in held:
                    cand_site = (_contig_sort_key(cand.chrom), cand.pos)
                    if cand_site < site:
                        vcf_out.write(cand)
                    elif cand_site > site or cand.alts != record.alts:
                        carry.append(cand)
                    # else: duplicate of the mutect2 call - dropped

                # The list is never mutated while it is being walked.
                while cursor < total:
                    cand = pindel_records[cursor]
                    cand_site = (_contig_sort_key(cand.chrom), cand.pos)
                    if cand_site > site:
                        break
                    cursor += 1
                    if cand_site < site:
                        vcf_out.write(cand)
                    elif cand.alts != record.alts:
                        carry.append(cand)
                    # else: duplicate of the mutect2 call - dropped

                held = carry
                vcf_out.write(record)

            # Pindel calls at or after the last mutect2 record.
            for cand in held:
                vcf_out.write(cand)
            for cand in pindel_records[cursor:]:
                vcf_out.write(cand)
def run_process(opts, inputvcf):
    """
    Annotate each record with the population databases whose MAF is at least
    ``opts.popfreq``.

    The INFO keys in the module-level ``freq_check_list`` are checked (first
    element when the value is a list/tuple), followed by the three
    ``esp6500_MAF`` percentages (ALL, AA, EA). Missing or unparsable values
    are ignored. Four INFO values are set: the ``|``-joined list of matching
    keys (``'.'`` when empty), their count, and Y/N flags for East Asian
    ("EAS") and Korean ("KRGDB"/"KoEXID") databases.

    :param opts: object with ``output`` (output path; falsy writes to ``'-'``)
        and ``popfreq`` (MAF threshold, convertible to float).
    :param inputvcf: path of the input VCF.
    """
    outputvcf = opts.output
    popfreq = float(opts.popfreq)
    # Errors expected from absent, empty or non-numeric annotation values.
    skippable = (KeyError, IndexError, TypeError, ValueError)
    esp_fields = ((2, 'esp6500_MAF_ALL'),
                  (1, 'esp6500_MAF_AA'),
                  (0, 'esp6500_MAF_EA'))

    with VariantFile(inputvcf) as vcf_in:
        # Add INFO to header
        vcf_in.header.info.add("ngb_popmaf_snp_db_cnt", ".", "Integer",
                               "Population Database Count above setting MAF")
        vcf_in.header.info.add("ngb_popmaf_snp_db_list", ".", "String",
                               "Population Database List above setting MAF")
        vcf_in.header.info.add("ngb_popmaf_snp_db_eastasian", ".", "String",
                               "East Asian Exist Flag above setting MAF")
        vcf_in.header.info.add("ngb_popmaf_snp_db_korean", ".", "String",
                               "Korean Exist Flag above setting MAF")

        with VariantFile(outputvcf if outputvcf else '-', 'w',
                         header=vcf_in.header) as vcf_out:
            for record in vcf_in.fetch():
                record_data = OrderedDict()

                # Check population MAF
                for key in freq_check_list:
                    try:
                        value = record.info[key]
                        if isinstance(value, (list, tuple)):
                            value = value[0]
                        value2 = float(value)
                    except skippable:
                        continue
                    if value2 >= popfreq:
                        record_data[key] = value2

                # Check ESP6500 (values are percentages)
                try:
                    value_list = record.info['esp6500_MAF']
                    for index, label in esp_fields:
                        maf = float(value_list[index]) / 100
                        if maf >= popfreq:
                            record_data[label] = maf
                except skippable:
                    pass

                filtered_db_list = '|'.join(record_data) or '.'
                record.info['ngb_popmaf_snp_db_list'] = filtered_db_list
                record.info['ngb_popmaf_snp_db_cnt'] = len(record_data)
                record.info['ngb_popmaf_snp_db_eastasian'] = (
                    'Y' if "EAS" in filtered_db_list else 'N')
                is_korean = ("KRGDB" in filtered_db_list
                             or "KoEXID" in filtered_db_list)
                record.info['ngb_popmaf_snp_db_korean'] = (
                    'Y' if is_korean else 'N')

                vcf_out.write(record)
def main(argv):
    """
    Command-line entry point of ``svtools standardize``.

    The header of the packaged template VCF is extended with the samples of
    the input VCF and a FORMAT/source tag for the source algorithm. Records
    produced by the standardizer are written when they are called in some
    sample or ``--include-reference-sites`` is given; with ``--prefix`` they
    are renamed ``<prefix>_<n>``. All VCF handles are closed on exit.

    :param argv: command-line arguments (without the program name). Help is
        printed and the process exits with status 1 when empty.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtools standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source', help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p', '--prefix', help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites', action='store_true',
                        default=False, help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer', help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    template_path = pkg_resources.resource_filename(
        'svtools', 'data/standard_template.vcf')

    with VariantFile(template_path) as template, \
            VariantFile(args.vcf) as vcf:
        # Template header includes all necessary FILTER, INFO, and FORMAT
        # fields; just add the samples from the VCF being standardized.
        header = template.header
        for sample in vcf.header.samples:
            header.add_sample(sample)

        # Tag source in header
        meta = '##FORMAT=<ID={0},Number=1,Type=Integer,Description="Called by {1}"'
        meta = meta.format(args.source, args.source.capitalize())
        header.add_line(meta)
        header.add_line('##source={0}'.format(args.source))

        with VariantFile(args.fout, mode='w', header=header) as fout:
            standardizer = VCFStandardizer.create(args.source, vcf, fout)
            idx = 1
            for record in standardizer.standardize_vcf():
                if any_called(record) or args.include_reference_sites:
                    if args.prefix is not None:
                        record.id = '{0}_{1}'.format(args.prefix, idx)
                        idx += 1
                    fout.write(record)
def main(argv):
    """
    Command-line entry point of ``svtk standardize``.

    The output header comes from a packaged template: the contig-free
    template plus one contig line per entry of ``--contigs`` when given,
    otherwise the GRCh37 template. A FORMAT/source tag for the source
    algorithm is added and every record produced by the standardizer is
    written. All file handles are closed on exit.

    :param argv: command-line arguments (without the program name). Help is
        printed and the process exits with status 1 when empty.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source', help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p', '--prefix', help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites', action='store_true',
                        default=False, help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer', help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')
    parser.add_argument('--contigs', type=argparse.FileType('r'),
                        help='Reference fasta index (.fai). If provided, '
                        'contigs in index will be used in VCF header. '
                        'Otherwise all GRCh37 contigs will be used in header. '
                        'Variants on contigs not in provided list will be '
                        'removed.')
    parser.add_argument('--min-size', type=int, default=50,
                        help='Minimum SV size to report [50].')
    parser.add_argument('--call-null-sites', action='store_true',
                        default=False,
                        help='Call sites with null genotypes (./.). Generally '
                        'useful when an algorithm has been run on a single '
                        'sample and has only reported variant sites.')
    parser.add_argument('--sample-names', type=str, default=None,
                        help='Comma-delimited list of sample names to use in '
                        'header [use existing].')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Contig-free template when contigs are supplied, GRCh37 by default
    if args.contigs:
        template_name = 'data/no_contigs_template.vcf'
    else:
        template_name = 'data/GRCh37_template.vcf'
    template_path = pkg_resources.resource_filename('svtk', template_name)

    with VariantFile(template_path) as template, \
            VariantFile(args.vcf) as vcf:
        header = template.header

        # Add contigs to header if provided
        if args.contigs:
            contig_line = '##contig=<ID={contig},length={length}>'
            with args.contigs as contig_index:
                for line in contig_index:
                    fields = line.split()
                    if len(fields) < 2:
                        continue  # blank or truncated index line
                    header.add_line(contig_line.format(contig=fields[0],
                                                       length=fields[1]))

        # Parse new sample names if provided
        if args.sample_names:
            sample_names_list = args.sample_names.split(',')
        else:
            sample_names_list = vcf.header.samples

        # Tag source in header
        meta = '##FORMAT=<ID={0},Number=1,Type=Integer,Description="Called by {1}"'
        meta = meta.format(args.source, args.source.capitalize())
        header.add_line(meta)
        header.add_line('##source={0}'.format(args.source))

        with VariantFile(args.fout, mode='w', header=header) as fout:
            standardizer = VCFStandardizer.create(
                args.source, vcf, fout, sample_names_list, args.prefix,
                args.min_size, args.include_reference_sites,
                args.call_null_sites)
            for record in standardizer.standardize_vcf():
                fout.write(record)