def main(variant_file, bam_file, samplename, loglevel, filter_hom):
    """Add read-count evidence for a new sample to simple substitutions.

    Each record of ``variant_file`` that is a single-base substitution with
    exactly one ALT allele receives an extra call for ``samplename`` holding
    the pileup depth (DP), reference count (RO) and alternate count (AO)
    observed in ``bam_file``; its genotype is left as ``./.``.  The annotated
    records are written to ``/dev/stdout``.

    NOTE(review): the collapsed original does not show whether records that
    are not simple substitutions were written too.  This version writes only
    annotated records, because any other record would lack the new sample
    column -- confirm against the intended behaviour.

    Args:
        variant_file: Path of the input VCF.
        bam_file: Path of the BAM file to pile up.
        samplename: Name of the sample column appended to the VCF.
        loglevel: Forwarded to ``setup_logging``.
        filter_hom: When true, skip records where any existing call has the
            genotype ``0/0`` or ``1/1``.

    Returns:
        0 once every record has been processed.
    """
    setup_logging(loglevel)
    bamfile = pysam.AlignmentFile(bam_file, "rb")
    try:
        with open(variant_file, 'r') as vcf_in, \
                open('/dev/stdout', 'w') as vcf_out:
            vcf_reader = vcf.Reader(vcf_in)
            vcf_reader.samples.append(samplename)
            vcf_writer = vcf.Writer(vcf_out, vcf_reader)

            for variant in vcf_reader:
                calls = [call.data.GT for call in variant.samples]
                if filter_hom and ('0/0' in calls or '1/1' in calls):
                    continue

                # only work on simple substitutions
                if not (len(variant.REF) == 1 and len(variant.ALT) == 1
                        and len(variant.ALT[0]) == 1):
                    continue

                # pysam regions are 0-based and half-open while VCF POS is
                # 1-based, so the variant base is [POS - 1, POS).  Querying
                # [POS, POS + 1) would miss reads that end on the variant.
                pile = bamfile.pileup(variant.CHROM, variant.POS - 1,
                                      variant.POS)
                bases = []
                for pileupcolumn in pile:
                    # pileup() yields every column covered by the
                    # overlapping reads; keep only the variant's column.
                    if pileupcolumn.pos + 1 != variant.POS:
                        continue
                    for pileupread in pileupcolumn.pileups:
                        if pileupread.is_del or pileupread.is_refskip:
                            continue
                        bases.append(pileupread.alignment.query_sequence[
                            pileupread.query_position])
                bases.sort()
                logging.debug("pileup at {}:{} {}/{} = {}".format(
                    variant.CHROM, variant.POS, variant.REF, variant.ALT,
                    "".join(bases)))

                # Lazy genotype object: one field per FORMAT key, every
                # field defaulting to the VCF missing value '.'.
                Genotype = namedtuple('Genotype', variant.FORMAT.split(":"))
                Genotype.__new__.__defaults__ = ('.', ) * len(
                    Genotype._fields)

                dp = len(bases)
                ro = sum(1 for base in bases if base == variant.REF)
                ao = sum(1 for base in bases if base == variant.ALT[0])
                newgt = Genotype(AO=ao, RO=ro, DP=dp, GT="./.")
                newcall = Call(site=variant, sample=samplename, data=newgt)
                variant.samples.append(newcall)
                vcf_writer.write_record(variant)
    finally:
        bamfile.close()
    return 0
def __init__(self, out_file, file_type='VCF4.1', template_file=None,
             template_reader=None, new_source=None, new_info_fields=[],
             new_format_fields=[], new_filters=[]):
    """Create a ``vcf.Writer`` whose header is extended with extra fields.

    The header template is ``template_reader`` when given, otherwise a
    reader built from ``template_file``, otherwise an empty stand-in object
    carrying only empty ``metadata``/``infos``/``formats``/``filters``/
    ``alts``/``contigs`` mappings.

    Args:
        out_file: Stream handed to ``vcf.Writer``.
        file_type: Only ``'VCF4.1'`` is accepted.
        template_file: Source for ``vcf.Reader`` when no reader is given.
        template_reader: Existing reader used as the header template.
        new_source: Value appended to the ``source`` metadata list.
        new_info_fields: 6-tuples ``(id, len, type, desc, _, _)``; the last
            two items are ignored.
        new_format_fields: 4-tuples ``(id, len, type, desc)``.
        new_filters: 2-tuples ``(id, desc)``.

    Raises:
        NotSupportedException: If ``file_type`` is not ``'VCF4.1'``.
    """
    self.file_type = file_type
    if self.file_type != 'VCF4.1':
        raise NotSupportedException('File type unsupported: ' + file_type)

    if template_reader is None:
        if template_file is not None:
            template_reader = vcf.Reader(template_file)
        else:
            # No template at all: fall back to a bare attribute holder.
            template_reader = namedtuple(
                'template',
                ['metadata', 'infos', 'formats', 'filters', 'alts',
                 'contigs'])
            template_reader.metadata = OrderedDict()
            template_reader.infos = OrderedDict()
            template_reader.formats = OrderedDict()
            template_reader.filters = OrderedDict()
            template_reader.alts = OrderedDict()
            template_reader.contigs = OrderedDict()

    # Add new source to metadata of header
    if new_source is not None:
        template_reader.metadata.setdefault("source", []).append(new_source)

    # Add new info fields to header
    for info_id, info_len, info_type, info_desc, _, _ in new_info_fields:
        template_reader.infos[info_id] = vcf.parser._Info(
            info_id, info_len, info_type, info_desc, None, None)

    # Add new format fields to header
    for fmt_id, fmt_len, fmt_type, fmt_desc in new_format_fields:
        template_reader.formats[fmt_id] = vcf.parser._Format(
            fmt_id, fmt_len, fmt_type, fmt_desc)

    # Add new filters to header
    for filter_id, filter_desc in new_filters:
        template_reader.filters[filter_id] = vcf.parser._Filter(
            filter_id, filter_desc)

    self.writer = vcf.Writer(out_file, template_reader, lineterminator='\n')
def writer():
    """Interactively build a toy VCF from the records of a reference VCF.

    For each record of the hard-coded reference file the record and a few of
    its properties are printed, the user is prompted for new POS, REF and
    ALT values, and the edited record is written to the hard-coded output
    file.  Entering anything other than an empty line at the "continue"
    prompt stops the loop.  Finally the written file is read back and every
    record is printed.
    """
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # file path to the reference file
    data_file_path = "/home/ubuntu/GSoC-Strain_Diffrential/data/original.vcf.gz"
    # NOTE(review): this path contains a literal backslash before the space
    # (shell-style escaping); open() does not need it -- confirm that the
    # directory is really named that way.
    write_file_path = "/home/ubuntu/GSoC-Strain_Diffrential/data/Toy\\ Examples/test_toy3.vcf"

    # Binary mode lets PyVCF wrap the handle in a gzip reader for the .gz
    # reference file.
    with open(data_file_path, 'rb') as reference_fh, \
            open(write_file_path, 'w') as output_fh:
        vcf_reader = vcf.Reader(reference_fh)
        vcf_writer = vcf.Writer(output_fh, vcf_reader)
        count = 1
        for record in vcf_reader:
            print("\nEnter the values for input" + str(count) + ":\n")
            print(record)
            print(record.ALT)
            print(record.is_snp)
            print(record.is_indel)
            print(record.is_deletion)
            record.POS = read_line("Enter POS:")
            record.REF = read_line("Enter REF for POS " + record.POS + " :")
            # ALT must be a list of alleles: the writer joins its items with
            # commas, so a bare string would be split into single characters.
            record.ALT = read_line(
                "Enter ALT for POS " + record.POS + " :").split(",")
            # writing the value to permanent storage.
            vcf_writer.write_record(record)
            if read_line("Press enter if you want to continue else press any other key:") != "":
                break
            count = count + 1

    # Leaving the with-block closed (and flushed) the output; read it back.
    with open(write_file_path) as written_fh:
        for record in vcf.Reader(written_fh):
            print(record)
def splitVcfByChromosome(source, output_folder, create_subfolders=False):
    """ Write one vcf file per chromosome found in a source vcf file.

        All records are grouped in memory by their CHROM value; only
        chromosomes whose name matches ``chr[0-9MT]{1,3}$`` are written.
        Parameters
        ----------
            source: string [PATH]
                The vcf file to split.
            output_folder: string [PATH]
                Folder receiving the ``<basename>.<chromosome>.vcf`` files.
            create_subfolders: bool; default False
                If 'True', each chromosome will be saved to a separate folder.
    """
    basename, ext = os.path.splitext(os.path.basename(source))
    _match_chroms = re.compile("chr[0-9MT]{1,3}$")

    with open(source, 'r') as input_vcf_file:
        reader = vcf.Reader(input_vcf_file)
        # Start with the contigs declared in the header, then bucket the
        # records by chromosome.
        chromosomes = {contig: list() for contig in reader.contigs}
        for record in reader:
            chromosomes.setdefault(record.CHROM, list()).append(record)

        for chromosome, record_list in chromosomes.items():
            match = _match_chroms.search(chromosome)
            if not match:
                continue
            print(chromosome, match)

            output_basename = "{}.{}.vcf".format(basename, chromosome)
            print(output_basename)

            chromosome_folder = output_folder
            if create_subfolders:
                chromosome_folder = os.path.join(output_folder, chromosome)
            output_filename = os.path.join(chromosome_folder, output_basename)
            filetools.checkDir(chromosome_folder, True)

            with open(output_filename, 'w') as output_vcf:
                writer = vcf.Writer(output_vcf, reader)
                for record in record_list:
                    writer.write_record(record)
def repeat_merging(f_in, f_out):
    """Collapse repeated variant rows of a vcf file and write the result.

    Records are keyed by ``chr<CHROM>:<POS>:<REF>><first ALT>``.  The first
    record seen for a key is kept; INFO values of later records with the
    same key are merged into it.  Both file objects are closed before
    returning, also when an error occurs.

    Args:
        f_in: Open, readable vcf file object.
        f_out: Open, writable file object receiving the merged vcf.
    """
    try:
        vcf_reader = vcf.Reader(f_in, strict_whitespace=True)
        variant_dict = {}
        num_repeats = 0
        for record in vcf_reader:
            genome_coor = "chr{0}:{1}:{2}>{3}".format(
                record.CHROM, str(record.POS), record.REF, record.ALT[0])
            if genome_coor not in variant_dict:
                variant_dict[genome_coor] = deepcopy(record)
                continue

            num_repeats += 1
            merged_info = variant_dict[genome_coor].INFO
            for key, value in record.INFO.items():
                if key not in merged_info:
                    merged_info[key] = deepcopy(value)
                    continue

                # Compare and merge as lists so scalars and lists mix freely.
                new_value = deepcopy(value)
                old_value = deepcopy(merged_info[key])
                if not isinstance(new_value, list):
                    new_value = [new_value]
                if not isinstance(old_value, list):
                    old_value = [old_value]
                if new_value == old_value:
                    continue

                if key == "individuals":
                    # LOVD individuals field values are all meaningful even
                    # if repeated, e.g. if two LOVD submissions for the same
                    # variant each have one individual associated with them,
                    # "1,1" is a more sensible value for the variant than
                    # "1" since 2 individuals are associated.
                    merged_value = list(new_value + old_value)
                else:
                    merged_value = list(set(new_value + old_value))
                merged_info[key] = deepcopy(merged_value)

        # One pre-formatted string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print("number of repeat records:  {0} \n".format(num_repeats))

        vcf_writer = vcf.Writer(f_out, vcf_reader)
        for record in variant_dict.values():
            vcf_writer.write_record(record)
    finally:
        f_in.close()
        f_out.close()
def test_null_mono(self):
    """Round-trip ``null_genotype_mono.vcf`` and re-parse the written text.

    Null qualities were once written as blank, which made the subsequent
    parse fail.
    """
    fixture = os.path.join(os.path.dirname(__file__),
                           'null_genotype_mono.vcf')
    print(os.path.abspath(fixture))

    p = vcf.Reader(fh('null_genotype_mono.vcf'))
    assert p.samples

    out = StringIO()
    writer = vcf.Writer(out, p)
    for parsed in p:
        writer.write_record(parsed)

    out.seek(0)
    print(out.getvalue())

    # The written text must parse again and still carry sample calls.
    p2 = vcf.Reader(out)
    rec = p2.next()
    assert rec.samples
def __init__(self, outstream, template, lineterminator='\n'):
    """Build a ``vcf.Writer`` whose header carries the GBStools fields.

    Args:
        outstream: Stream handed to ``vcf.Writer``.
        template: Object providing ``filename`` (re-read as the header
            template), ``disp`` (indexed with ``'slope'`` and
            ``'intercept'``) and ``family`` (selects ``PEDINFO`` over
            ``INFO`` for the INFO definitions).
        lineterminator: Passed through to ``vcf.Writer``.
    """
    filename = template.filename
    disp = template.disp
    self.template = vcf.Reader(filename=filename)

    info_defs = PEDINFO if template.family else INFO
    for info_def in info_defs:
        self.template.infos[info_def.id] = info_def
    for format_def in FORMAT:
        self.template.formats[format_def.id] = format_def

    # Record how the analysis was parameterised in the header metadata.
    analysis = ("input_file=%s " % filename
                + "disp_slope=%f " % disp['slope']
                + "disp_intercept=%f" % disp['intercept'])
    self.template.metadata['GBStools'] = [analysis]
    self.writer = vcf.Writer(outstream, self.template, lineterminator)
def testWrite(self):
    """Round-trip ``example-4.1-bnd.vcf`` and check the meta lines.

    The ``##PEDIGREE`` line must be written in its ``<key="value",...>``
    form and every ``##SAMPLE`` line must start with ``##SAMPLE=<`` rather
    than with the text of a Python dictionary.
    """
    reader = vcf.Reader(fh('example-4.1-bnd.vcf'))
    out = StringIO()
    writer = vcf.Writer(out, reader)

    records = list(reader)
    for record in records:
        writer.write_record(record)
    out.seek(0)
    out_str = out.getvalue()
    for line in out_str.split("\n"):
        if line.startswith("##PEDIGREE"):
            # assertEqual replaces the deprecated alias assertEquals, which
            # no longer exists on Python 3.12+.
            self.assertEqual(
                line, '##PEDIGREE=<Derived="Tumor",Original="Germline">')
        if line.startswith("##SAMPLE"):
            assert line.startswith('##SAMPLE=<'), \
                "Found dictionary in meta line: {0}".format(line)
def GetWriter(reader, filters):
    """
    Get VCF Writer with the appropriate metadata

    The header of ``reader`` is copied to a temporary file, one ``##FILTER``
    line per entry of ``filters`` and a call-level ``FT`` FORMAT line are
    appended, and the result is parsed again to serve as the template of a
    writer on ``sys.stdout``.

    Args:
        reader: ``vcf.Reader`` providing ``_header_lines``,
            ``_column_headers`` and ``samples``.
        filters: Mapping of filter name to a mapping with the keys
            ``"Value"`` and ``"Description"``.

    Returns:
        A ``vcf.Writer`` writing to ``sys.stdout``.
    """
    import shutil

    tmpdir = tempfile.mkdtemp(prefix="lobstr.")
    try:
        tmpfile = os.path.join(tmpdir, "header.vcf")
        with open(tmpfile, "w") as f:
            for line in reader._header_lines:
                f.write(line.strip() + "\n")
            for ft in filters.keys():
                name = ft + str(filters[ft]["Value"])
                desc = filters[ft]["Description"]
                f.write("##FILTER=<ID=%s,Description=\"%s\">\n" % (name, desc))
            f.write("##FORMAT=<ID=FT,Number=1,Type=String,Description=\"Call-level filter.\">\n")
            f.write("#" + "\t".join(reader._column_headers + reader.samples) + "\n")
        # Text mode: the parser works on str lines.  The header is parsed
        # while the template reader is constructed, so the handle can be
        # closed as soon as the writer exists.
        with open(tmpfile, "r") as header_fh:
            writer = vcf.Writer(sys.stdout, vcf.Reader(header_fh))
    finally:
        # The temporary header is no longer needed; do not leak the folder.
        shutil.rmtree(tmpdir, ignore_errors=True)
    return writer
def copyVcf(source, destination):
    """Copy a vcf file, dropping records with a '/' in REF or the first ALT.

    When the source path contains ``'Varscan'`` the ``DP4`` FORMAT
    definition is rewritten to ``num=4`` and ``type='Integer'`` before the
    header is written.  Records whose writing raises ``ValueError`` are
    printed instead of copied.

    Returns:
        ``destination``, unchanged.
    """
    with open(source, 'r') as input_file:
        reader = vcf.Reader(input_file)
        if 'Varscan' in source:
            reader.formats['DP4'] = reader.formats['DP4']._replace(
                num=4, type='Integer')

        with open(destination, 'w') as output_file:
            writer = vcf.Writer(output_file, reader)
            for record in reader:
                if '/' in str(record.ALT[0]) or '/' in record.REF:
                    continue
                try:
                    writer.write_record(record)
                except ValueError:
                    print(record)
    return destination
def main():
    """Copy validation-status items from a validation vcf into INFO.

    The header definition and the per-variant items come from
    ``get_info_field``.  Items listed in ``--skipif`` are not carried over;
    every input record is written, annotated or not.

    Returns:
        0 on completion.
    """
    parser = argparse.ArgumentParser(
        description='Fix dbsnp VP calls and add OXOG filter')
    parser.add_argument('validationvcf', help="Validation vcf file")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Merged and annotated VCF file (default: stdin)")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Specify output file (default:stdout)")
    parser.add_argument(
        '-f', '--fieldname', default="Validation_status",
        help="Specify INFO field name to use (default: Validation_status")
    parser.add_argument(
        '-s', '--skipif', default="LOWDEPTH",
        help="Comma-delimted list of items which won't get carried over (default: LOWDEPTH)")
    args = parser.parse_args()

    skips = args.skipif.split(',')
    header, infos = get_info_field(args.validationvcf, args.fieldname)

    reader = vcf.Reader(args.input)
    # Only declare the INFO field when there is something to annotate.
    if len(infos) > 0:
        reader.infos[args.fieldname] = header
    writer = vcf.Writer(args.output, reader)

    for record in reader:
        assert len(record.ALT) == 1
        variant = variant_tuple(record, record.ALT[0])
        if variant in infos:
            kept = [item for item in infos[variant] if item not in skips]
            if kept:
                record.INFO[args.fieldname] = ','.join(kept)
        writer.write_record(record)

    return 0
def filter_somatic_variants(in_file, out_file):
    """Write the unfiltered SOMATIC records of ``in_file`` to ``out_file``.

    A record is kept when its FILTER is ``None`` or empty and its INFO
    contains the key ``SOMATIC``.

    Args:
        in_file: Path of the input vcf.
        out_file: Path of the vcf to create.
    """
    reader = vcf.Reader(filename=in_file)
    # Text mode: the writer emits str, which a binary handle rejects on
    # Python 3.
    with open(out_file, 'w') as out_fh:
        writer = vcf.Writer(out_fh, reader)
        for record in reader:
            pass_filter = not record.FILTER
            if pass_filter and ('SOMATIC' in record.INFO):
                writer.write_record(record)
def the_thread(block, output_dir):
    """Copy the records with QUAL above 20 from one vcf to a new vcf.

    Args:
        block: Pair ``(index, input_filename)``.
        output_dir: Forwarded to ``Blister.Output``, which supplies the name
            of the output file.
    """
    index, input_filename = block
    output_filename = Blister.Output(input_filename, output_dir, "QUAL20",
                                     "vcf", rewrite=True, index=index)

    with Blister.Timestamp("Filtering", filename_1=input_filename,
                           filename_2=output_filename, index=index):
        # Context managers guarantee the output is flushed and both handles
        # are released even if a record fails to parse.
        with open(input_filename, 'r') as vcf_in, \
                open(output_filename, 'w') as vcf_out:
            vcf_reader = vcf.Reader(vcf_in)
            vcf_writer = vcf.Writer(vcf_out, template=vcf_reader)
            for record in vcf_reader:
                # A missing QUAL is None and cannot be compared with a
                # number on Python 3; such records are not kept.
                if record.QUAL is not None and record.QUAL > 20:
                    vcf_writer.write_record(record)
def generate_s(metaFile, tree, l, sv_cn_idx_dict, r, seg_cn_idx_dict,
               seg_bgn_idx_dict, seg_end_idx_dict, F, U, C, c_p, c_m, a, h,
               mate_dict, outputFolder):
    """Write one ``sample<k>.vcf`` per row of ``U`` into ``outputFolder``.

    The header comes from ``metaFile`` with its ``filedate`` set to today.
    Each sample file receives the CNV records (built by ``generate_cnv``)
    followed by the SV records (built by ``generate_sv``), both ordered by
    chromosome and then by index value.

    ``tree``, ``seg_bgn_idx_dict``, ``seg_end_idx_dict`` and ``C`` are
    accepted but not used here.

    Args:
        metaFile: Path of the vcf providing the header template.
        l, r: Forwarded to ``get_sv_rec_id`` / ``get_cnv_rec_id``.
        sv_cn_idx_dict: ``chrom -> {(pos, isLeft): index}``.
        seg_cn_idx_dict: ``chrom -> {(pos, end): index}``.
        F: Indexed as ``F[sample][sv index]`` for the ``cnadj`` value.
        U, c_p, c_m, a, h: Operands of the ``np.dot`` products below.
        mate_dict: ``(chrom, pos, isLeft) -> (mate_chrom, mate_pos,
            mate_isLeft)``.
        outputFolder: Folder receiving the per-sample files.
    """
    # The header is parsed while the reader is constructed, so the metadata
    # file can be closed right away.
    with open(metaFile, 'r') as meta_fh:
        vcf_reader = vcf.Reader(meta_fh)
    # set date to current date
    vcf_reader.metadata['filedate'][0] = datetime.datetime.now().date(
    ).strftime('%Y%m%d')

    f_p = np.dot(U, c_p)
    f_m = np.dot(U, c_m)
    mixed_a = np.dot(U, a)  # m * l
    mixed_h = np.dot(U, h)  # m * l

    alt_type, gt_cnv = 'CNV', '1|1'  # constants for all cnv records
    # constants for all sv records
    alt_ori, alt_cS, alt_wMA, gt_sv = True, str(), True, '1|0'

    for i in range(len(U)):
        sample_idx = i + 1
        temp_file = outputFolder + '/sample' + str(sample_idx) + '.vcf'
        # One handle per sample: close it before moving to the next file.
        with open(temp_file, 'w') as sample_fh:
            temp_writer = vcf.Writer(sample_fh, vcf_reader)

            for chrom in sorted(seg_cn_idx_dict.keys()):
                for (key, val) in sorted(seg_cn_idx_dict[chrom].items(),
                                         key=lambda x: x[1]):
                    pos = key[0]
                    rec_id = get_cnv_rec_id(val, r)
                    info_end = key[1]
                    cn = [f_p[i][val], f_m[i][val]]
                    temp_writer.write_record(
                        generate_cnv(chrom, pos, rec_id, alt_type, info_end,
                                     gt_cnv, cn))

            for chrom in sorted(sv_cn_idx_dict.keys()):
                for (key, val) in sorted(sv_cn_idx_dict[chrom].items(),
                                         key=lambda x: x[1]):
                    pos, isLeft = key[0], key[1]
                    rec_id = get_sv_rec_id(val, l)
                    (mate_chrom, mate_pos,
                     mate_isLeft) = mate_dict[(chrom, pos, isLeft)]
                    mate_id = sv_cn_idx_dict[mate_chrom][(mate_pos,
                                                          mate_isLeft)]
                    alt_chr, alt_pos = mate_chrom, mate_pos
                    cnadj = F[i][val]
                    bdp = int(round(mixed_a[i][val]))
                    dp = int(round(mixed_h[i][val]))
                    info_mateid = get_sv_rec_id(mate_id, l)
                    alt_rO = not (mate_isLeft == True)
                    temp_writer.write_record(
                        generate_sv(chrom, pos, rec_id, alt_chr, alt_pos,
                                    alt_ori, alt_rO, alt_cS, alt_wMA,
                                    info_mateid, gt_sv, cnadj, bdp, dp))
def filterHomo(input_vcffil, output_vcffile, P1, P2):
    '''Keep the single-ALT records accepted by ``judgeGT``.

    For each record with exactly one ALT allele the GT values of the
    samples named in ``P1`` and ``P2`` are collected pairwise (``zip`` stops
    at the shorter list) and the record is written when
    ``judgeGT(P1GT, P2GT)`` is true.

    e.g. P1=[L14-1, L14-2, L14-3] P2 = [L17-1, L17-2, L17-3]
    '''
    # Context managers close both files even when a record fails.
    with open(input_vcffil, 'r') as inputvcf, \
            open(output_vcffile, 'w') as outputvcf:
        invcf = vcf.Reader(inputvcf)
        outvcf = vcf.Writer(outputvcf, invcf)
        for i in invcf:
            if len(i.ALT) != 1:
                continue
            P1GT, P2GT = [], []
            for m, n in zip(P1, P2):
                P1GT.append(i.genotype(m)['GT'])
                P2GT.append(i.genotype(n)['GT'])
            if judgeGT(P1GT, P2GT):
                outvcf.write_record(i)
def write_vcf(self, vcf_path):
    """
    Write VCF file.

    One record is built for every entry of ``self.sequences`` and written
    with ``self.reader`` as the header template.

    *Keyword arguments:*

        - vcf_path -- VCF file

    Raises ``Exception`` when ``self.reader`` is not set.
    """
    if not self.reader:
        raise Exception("No data available")

    # The with-block flushes and closes the output file once all records
    # are written, or if one of them fails.
    with open(vcf_path, 'w') as vcf_out:
        writer = vcf.Writer(vcf_out, self.reader)
        for v in self.sequences:
            record = vcf.model._Record(v.chrom, v.pos, v.id, v.ref, v.alt,
                                       v.qual, [], v.info, v.format, [],
                                       v.samples)
            writer.write_record(record)
def test_writer(self):
    """FORMAT should not be written if not present in the template and no
    extra tab character should be printed if there are no FORMAT fields."""
    reader = vcf.Reader(fh('1kg.sites.vcf', 'r'))
    out = StringIO()
    writer = vcf.Writer(out, reader, lineterminator='\n')

    for site in reader:
        writer.write_record(site)
    out.seek(0)

    # Meta lines ('##...') are skipped; the column header and every data
    # line are checked.
    checked = [row for row in out.getvalue().split('\n')
               if not row.startswith('##')]
    for row in checked:
        if row.startswith('#CHROM'):
            assert 'FORMAT' not in row
        assert not row.endswith('\t')
def convert_snpeff_info_fields(vcf_input_fh, vcf_output_fh):
    """Expand the single snpEff EFF field into separate INFO_EFF_* fields.

    The VCF is read from the input stream, each record is passed through
    ``populate_record_eff``, and the modified VCF is written to the output
    stream with one extra INFO header line per entry of ``SNPEFF_FIELDS``.

    The snpeff field starts out as a long string, consisting of many fields
    each separated by pipes. Effects information is added to the INFO field
    using an 'EFF' tag. There can be multiple effects separated by comma.
    The format for each effect is:

    Effect ( Effect_Impact | Codon_Change | Amino_Acid_change | Gene_Name
        | Gene_BioType | Coding | Transcript | Rank [ | ERRORS | WARNINGS ] )

    Details for each field are here:
    http://snpeff.sourceforge.net/SnpEff_manual.html
    """
    vcf_reader = vcf.Reader(vcf_input_fh)

    # Generate extra header rows.
    # TODO: This method is internal to pyVCF, so if they change it,
    # this will break. Maybe we should copy their code?
    metadata_parser = vcf.parser._vcf_metadata_parser()
    for template_values in SNPEFF_FIELDS.values():
        # Render the header line for this field and register it both as a
        # raw header line and as a parsed Info object.
        header_line = SNPEFF_INFO_TEMPLATE.substitute(template_values)
        vcf_reader._header_lines.append(header_line)
        info_id, info_obj = metadata_parser.read_info(header_line)
        vcf_reader.infos[info_id] = info_obj

    # Write the old records with the new EFF INFO fields
    vcf_writer = vcf.Writer(vcf_output_fh, vcf_reader)
    for original in vcf_reader:
        vcf_writer.write_record(populate_record_eff(original))
def main():
    """Annotate a merged vcf with the VAFs reported by individual callers.

    One lookup is built per caller file with ``populate_dict``.  For every
    record, the non-``None`` VAFs found under the key
    ``(CHROM, POS, REF, str(first ALT))`` are rounded with ``round_three``
    and stored as ``INFO['VAFs']``; their rounded median is stored as
    ``INFO['medianVAF']``.  Every record is written, annotated or not.

    Returns:
        0 on completion.
    """
    parser = argparse.ArgumentParser(
        description='Annotate merged vcf with VAF information where available')
    # nargs='?' makes the positional optional; without it argparse always
    # requires the argument and the stdin default can never apply.
    parser.add_argument('mergedvcf', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin, help="Merged VCF file")
    parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="Specify output file (default:stdout)")
    parser.add_argument('-b', '--broad', type=str, help="Broad file")
    parser.add_argument('-d', '--dkfz', type=str, help="DKFZ file")
    parser.add_argument('-s', '--sanger', type=str, help="Sanger file")
    parser.add_argument('-m', '--muse', type=str, help="Muse file")
    parser.add_argument('-i', '--indel', action='store_true',
                        help="Variant type == indel (default:snv_mnv)")
    args = parser.parse_args()

    snvs = not args.indel
    dicts = [
        populate_dict(args.broad, broad=True, SNV=snvs),
        populate_dict(args.dkfz, dkfz=True, SNV=snvs),
        populate_dict(args.sanger, sanger=True, SNV=snvs),
        populate_dict(args.muse, muse=True, SNV=snvs),
    ]

    vcf_reader = vcf.Reader(args.mergedvcf)
    vcf_writer = vcf.Writer(args.output, vcf_reader)
    for variant in vcf_reader:
        key = variant.CHROM, variant.POS, variant.REF, str(variant.ALT[0])
        vafs = [
            vaf_dict[key] for vaf_dict in dicts
            if key in vaf_dict and vaf_dict[key] is not None
        ]
        if vafs:
            variant.INFO['VAFs'] = [round_three(vaf) for vaf in vafs]
            variant.INFO['medianVAF'] = round_three(numpy.median(vafs))
        vcf_writer.write_record(variant)

    return 0
def tab_to_vcf(input_file, output_file, reference_file):
    """
    Convert tab-delimited file to VCF.

    Support for the fixed VCF fields: #CHROM, POS, ID, REF, ALT, QUAL,
    FILTER, INFO

    PyVCF's _Record class requires the following arguments:
    CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sample_indexes

    The header is taken from ``TEMPLATE_VCF_FILE``; ``reference_file`` is
    opened with ``FastaHack`` and handed to ``gatk_indel_to_vcf`` for
    GATK-style indels.
    """
    reference_dict = FastaHack(reference_file)

    with open(input_file, "r") as input_fh, \
            open(TEMPLATE_VCF_FILE, "r") as template_fh:
        rows = csv.DictReader(input_fh, delimiter="\t")
        vcf_reader = vcf.Reader(template_fh)

        with open(output_file, "w") as output_fh:
            vcf_writer = vcf.Writer(output_fh, vcf_reader,
                                    lineterminator='\n')
            for row in rows:
                # Missing columns become the VCF missing value ".".
                fields = [row.get(tab_field, ".")
                          for _, tab_field in VCF_TO_FIELDS]

                # Convert position to an integer.
                fields[POSITION_INDEX] = int(fields[POSITION_INDEX])

                # Convert indels from GATK to VCF format.
                alt = fields[ALT_INDEX]
                if alt.startswith(("+", "-")) and "/" not in alt:
                    fields = gatk_indel_to_vcf(fields, reference_dict)

                # Convert alternate allele scalar to a list.
                fields[ALT_INDEX] = [fields[ALT_INDEX]]

                # Add empty entries for INFO, FORMAT, and sample_indexes.
                fields.extend([{}, ".", []])

                vcf_writer.write_record(_Record(*fields))
def construct_vcf_dict(vcf_file_path, vcf_file_output, hg38_centromere_dict):
    """Copy a vcf while dropping the records inside the given regions.

    A record is removed when its POS lies within any ``[start, end]`` pair
    (both ends inclusive) listed for its chromosome; every kept record is
    printed and written to ``vcf_file_output``.

    Args:
        vcf_file_path: Path of the input vcf.
        vcf_file_output: Path of the filtered vcf to create.
        hg38_centromere_dict: ``chrom -> {"start": [...], "end": [...]}``
            with parallel lists; a chromosome missing from the mapping
            raises ``KeyError``.

    Returns:
        An empty list -- nothing is ever appended to it here; it is kept
        for compatibility with existing callers.
    """
    variants_list = []
    # Context managers make sure the filtered output is flushed and both
    # handles are released.
    with open(vcf_file_path, "r") as vcf_in, \
            open(vcf_file_output, 'w') as vcf_out:
        vcffile = vcf.Reader(vcf_in)
        vcf_writer = vcf.Writer(vcf_out, vcffile)
        for rec in vcffile:
            regions = hg38_centromere_dict[rec.CHROM]
            pos = int(rec.POS)
            remove = any(
                start <= pos <= end
                for start, end in zip(regions["start"], regions["end"]))
            if not remove:
                print(rec.CHROM, rec.POS, rec.REF, rec.ALT)
                print("to keep")
                vcf_writer.write_record(rec)
    return variants_list
def merge_chrs_into_one_vcf(self, file1, file2):
    '''
    Creates one VCF containing all variants of chr21 and chr22

    The records of ``file1`` followed by those of ``file2`` are written to
    ``merged_file.vcf`` in the current directory, using the header of
    ``file1``.

    :return: None
    '''
    print("Merging chr21_new.vcf with chr22_new.vcf")

    # "r" is the mode for open(); passed as the second argument of
    # vcf.Reader it would be taken as the reader's filename instead.
    with open(file1, "r") as first_fh, open(file2, "r") as second_fh, \
            open("merged_file.vcf", "w") as merged_fh:
        vcf_file1 = vcf.Reader(first_fh)
        vcf_file2 = vcf.Reader(second_fh)
        vcf_writer = vcf.Writer(merged_fh, vcf_file1)
        for reader in (vcf_file1, vcf_file2):
            for record in reader:
                vcf_writer.write_record(record)

    print("Merge successful. File 'merged_file.vcf' created.")
def one_variant_transform(f_in, f_out):
    """Split rows with several ALT alleles into one row per allele.

    Rows with a single ALT are written unchanged.  For the others a copy of
    the row is written for every ALT; INFO values that are lists with one
    item per ALT are reduced to the item belonging to that allele.
    """
    vcf_reader = vcf.Reader(f_in, strict_whitespace=True)
    vcf_writer = vcf.Writer(f_out, vcf_reader)

    for record in vcf_reader:
        alt_count = len(record.ALT)
        if alt_count == 1:
            vcf_writer.write_record(record)
            continue

        for idx, alt in enumerate(record.ALT):
            single = deepcopy(record)
            single.ALT = [deepcopy(alt)]
            for key, value in record.INFO.items():
                # Only per-allele lists are narrowed; everything else is
                # inherited as-is from the copied record.
                if type(value) == list and len(value) == alt_count:
                    single.INFO[key] = [deepcopy(value[idx])]
            vcf_writer.write_record(single)
def main():
    """Add the filters reported by a filter vcf to matching records.

    The failures come from ``get_bias_failures``.  For every record whose
    ``variant_tuple`` is among them, the reported value is assigned to an
    empty FILTER or concatenated onto a non-empty one.  Every input record
    is written.

    Returns:
        0 on completion.
    """
    parser = argparse.ArgumentParser(
        description='Fix dbsnp VP calls and add OXOG filter')
    parser.add_argument('filtervcf', help="Filter vcf file")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Merged and annotated VCF file (default: stdin)")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Specify output file (default:stdout)")
    parser.add_argument(
        '-f', '--filtername',
        help="Specify filter name to use (default: use filter field from VCF file)")
    parser.add_argument('-d', '--filterdesc', default="",
                        help="Specify description of filter")
    args = parser.parse_args()

    reader = vcf.Reader(args.input)
    # Declare the named filter in the header before the writer copies it.
    if args.filtername is not None:
        reader.filters[args.filtername] = vcf.parser._Filter(
            id=args.filtername, desc=args.filterdesc)
    writer = vcf.Writer(args.output, reader)

    failuredict = get_bias_failures(args.filtervcf, args.filtername)
    for record in reader:
        assert len(record.ALT) == 1
        variant = variant_tuple(record, record.ALT[0])
        if variant in failuredict:
            failed = failuredict[variant]
            if record.FILTER:
                record.FILTER = record.FILTER + failed
            else:
                record.FILTER = failed
        writer.write_record(record)

    return 0
def __main__():
    """Write the heterozygous SNPs of a VarScan vcf to a new vcf.

    A record counts as heterozygous when ``max(AD, RD) / DP`` of its single
    sample is at most 0.75.  Such records are printed and written to
    ``heterozygousSNPs.vcf`` in the current directory.

    Raises:
        SystemExit: With a message when the input cannot be found, the
            output cannot be opened, a record does not have exactly one
            sample, or the AD/RD/DP tags are missing; without a message on
            normal completion.
    """
    parser = argparse.ArgumentParser(
        description='Make a heterozygosity plot from a vcf file')
    parser.add_argument('--vcfFile',
                        help='VCF file generated by VarScan',
                        required=True)
    args = parser.parse_args()

    try:
        vcfReader = vcf.Reader(filename=args.vcfFile)
    except FileNotFoundError:
        raise SystemExit(
            'File {} cannot be found. Please check and try again\n'.format(
                args.vcfFile))

    # Only a failure to open the file is reported as such; other errors
    # are no longer hidden behind a bare except.
    try:
        out_fh = open('heterozygousSNPs.vcf', 'w')
    except OSError:
        raise SystemExit(
            'File heterozygousSNPs.vcf cannot be opened for writing.\n')

    # The with-block closes the output on every exit path, including the
    # SystemExit raised for malformed input.
    with out_fh:
        vcfWriter = vcf.Writer(out_fh, vcfReader)
        for record in vcfReader:
            # at the point where we run this in the workflow, we should only
            # ever have one sample
            if len(record.samples) != 1:
                raise SystemExit(
                    'VCF file {} has more than one sample. Please check you are using the correct VCF file\n'
                    .format(args.vcfFile))
            if 'AD' not in record.FORMAT or 'RD' not in record.FORMAT or 'DP' not in record.FORMAT:
                raise SystemExit(
                    'Record format is missing AD, RD or DP tags. Tags available in this file are "{0}". Please check that {1} was generated with VarScan\n'
                    .format(record.FORMAT, args.vcfFile))
            sample = record.samples[0]
            depth = sample['DP']
            if not depth:
                # No coverage: the ratio is undefined, so the record cannot
                # be classified and is skipped.
                continue
            ratio = max(sample['AD'], sample['RD']) / depth
            # this is our cutoff for being heterozygous (as used by YMAP)
            if ratio <= 0.75:
                print(record)
                vcfWriter.write_record(record)
    raise SystemExit()
def induce_mutations(inFile, outFile, delta):
    """Copy a vcf while randomly replacing some sample genotypes.

    Records whose first FORMAT key is not ``GT`` are reported and left out
    of the output.  For each sample call that ``already_mutated`` does not
    flag, a draw of ``uniform(0, 9)`` below ``delta`` triggers a mutation:
    the GT becomes ``<alt><sep>0``, ``0<sep><alt>`` or ``<alt><sep><alt>``,
    where each ``<alt>`` is a random ALT index and ``<sep>`` is ``|`` when
    the second character of the original GT is ``|``, otherwise ``/``.

    Args:
        inFile: Path of the input vcf.
        outFile: Path of the mutated vcf to create.
        delta: Threshold compared with the ``uniform(0, 9)`` draw.
    """
    # Context managers flush the output and release both handles.
    with open(inFile) as vcf_in, open(outFile, 'w') as vcf_out:
        vcf_reader = vcf.Reader(vcf_in)
        vcf_writer = vcf.Writer(vcf_out, vcf_reader)
        for record in vcf_reader:
            rec_toWrite = copy.deepcopy(record)
            print(record.num_called)

            f_keys = record.FORMAT.split(":")
            if f_keys[0] != "GT":
                print("Error with FORMAT column at POSITION=" +
                      str(record.POS) + ": \'GT\' is missing.\n")
                continue

            # The call layout is the same for all samples of a record.
            call_data = collections.namedtuple('CallData', f_keys)
            n_alt = len(record.ALT)

            for sm, call in enumerate(record.samples):
                gt_read = str(record.genotype(call.sample)["GT"])
                f_vals = [call.data[vx] for vx in range(len(f_keys))]

                if not already_mutated(gt_read) and uniform(0, 9) < delta:
                    mutation_type = randint(0, 2)
                    sep = "|" if gt_read[1] == "|" else "/"
                    if mutation_type == 0:
                        left, right = str(randint(1, n_alt)), "0"
                    elif mutation_type == 1:
                        left, right = "0", str(randint(1, n_alt))
                    else:
                        left = str(randint(1, n_alt))
                        right = str(randint(1, n_alt))
                    f_vals[0] = left + sep + right

                rec_toWrite.samples[sm].data = call_data._make(f_vals)
            vcf_writer.write_record(rec_toWrite)
def main(args):
    """Write the records of a vcf that pass read-support and type filters.

    A record is written to ``sys.stdout`` only when every ALT allele has at
    least ``args.minreads`` supporting reads in ``basecount`` and none of
    the selected restrictions rules it out.

    Args:
        args: Parsed options providing ``vcffile``, ``bamfile``, ``vtype``
            (``None``, ``'SNV'``, ``'INDEL'`` or ``'SV'``), ``minreads``,
            ``passonly``, ``failonly``, ``somaticonly`` and
            ``germlineonly``.
    """
    invcf = vcf.Reader(filename=args.vcffile)
    outvcf = vcf.Writer(sys.stdout, invcf)

    vtype = None
    if args.vtype is not None:
        assert args.vtype in ('SNV', 'INDEL', 'SV')
        vtype = args.vtype

    bam = pysam.Samfile(args.bamfile, 'rb')
    try:
        minreads = int(args.minreads)
        for rec in invcf:
            output = True

            bc = basecount(bam, rec.CHROM, rec.POS)
            for alt in rec.ALT:
                # The counts are looked up by the allele's string form, so
                # membership must be tested with that same key.
                allele = str(alt)
                if allele in bc:
                    if bc[allele] < minreads:
                        output = False
                else:
                    output = False

            if vtype == 'SNV' and (not rec.is_snp
                                   or rec.INFO.get('VT') == 'LOH'):
                output = False
            if vtype == 'INDEL' and not rec.is_indel:
                output = False
            if vtype == 'SV' and not rec.is_sv:
                output = False

            if args.passonly and rec.FILTER:
                output = False
            if args.failonly and not rec.FILTER:
                output = False

            if args.somaticonly and not is_somatic(rec):
                output = False
            if args.germlineonly and is_somatic(rec):
                output = False

            if output:
                outvcf.write_record(rec)
    finally:
        # Release the BAM handle even if a record fails.
        bam.close()
def write_header(self, sample_id, filters, reference):
    """
    Write the VCF file header with the standard SNP Pipeline data elements.

    The header is assembled in memory from the ``VCF_*`` templates, parsed
    by pyVcf, and used as the template of the writer stored in
    ``self.pyvcf_writer``.  The FORMAT ids are also kept in
    ``self.format_str`` and as the fields of ``self.VcfCallData``.

    Parameters
    ----------
    sample_id : str
        Sample ID which will be written to the header line.
    filters : list of tuple(str, str)
        List of names and descriptions of filters which will be combined
        and written to the header filter lines.
    reference : str
        Reference name which will be written to the header reference line.
    """
    # Collect the header pieces in the order they must appear
    header_chunks = [
        VCF_VERSION,
        datetime.datetime.strftime(datetime.datetime.now(), VCF_DATE),
        VCF_SOURCE,
        VCF_INFO,
        VCF_FILTER % ("PASS", "All filters passed"),
    ]
    for name, description in filters:
        header_chunks.append(VCF_FILTER % (name, description))
    header_chunks.append(VCF_FORMAT)
    header_chunks.append(VCF_REFERENCE % reference)
    header_chunks.append(VCF_HDR_LINE % sample_id)

    # Write the template header to an in-memory buffer, then rewind it to
    # prepare for reading
    in_memory_file = StringIO()
    in_memory_file.name = "header.vcf"
    for chunk in header_chunks:
        in_memory_file.write(chunk)
    in_memory_file.seek(0)

    # Feed the template to pyVcf and write the header to our vcf file
    vcf_template = vcf.Reader(in_memory_file)
    self.pyvcf_writer = vcf.Writer(self.file_handle, template=vcf_template)

    # Extract the format ids from the header. They are the same for all
    # positions, so only do this once.
    tokens = [
        line.replace("##FORMAT=<ID=", "").split(',')[0]
        for line in VCF_FORMAT.split('\n')
        if len(line) > 0
    ]
    self.format_str = ':'.join(tokens)
    # this creates a new class called VcfCallData
    self.VcfCallData = collections.namedtuple('VcfCallData', tokens)
def write_diff(diff_entries, template):
    """Print the three groups of a vcf comparison to ``sys.stdout``.

    Args:
        diff_entries: Triple ``(unmatched input rows, unmatched truth rows,
            differing rows)``.
        template: Header template for the ``vcf.Writer``.

    Each non-empty group is preceded by its ``##`` heading line.
    """
    (unmatched_i_rows, unmatched_t_rows, differing_rows) = diff_entries
    writer = vcf.Writer(sys.stdout, template)

    sections = (
        ("## Unmatched vcf row(s) in Input File", unmatched_i_rows),
        ("## Unmatched vcf row(s) in Truth File", unmatched_t_rows),
        ("## Matched row(s) which differ in Input and Truth Files",
         differing_rows),
    )
    for heading, rows in sections:
        if not rows:
            continue
        print(heading)
        for row in rows:
            writer.write_record(row)
def _binom_pvalue(successes, trials, prob):
    """Return the two-sided binomial test p-value on old and new SciPy."""
    if hasattr(scipy.stats, 'binomtest'):
        return scipy.stats.binomtest(successes, trials, prob).pvalue
    # SciPy releases that predate binomtest only offer binom_test.
    return scipy.stats.binom_test(successes, trials, prob)


def main():
    """
    Driver program - Read in a VCF file and normal/tumour read counts
    for each base at each position, and output read counts at each call

    Records without ``NormalReads`` / ``NormalEvidenceReads`` INFO entries,
    records carrying the ``LOWDEPTH`` filter and records with a normal depth
    of zero are skipped.  ``INFO['HET']`` and ``INFO['HOM']`` are set to
    ``1 - p`` whenever the corresponding binomial p-value exceeds
    ``1 - alpha``.  The header of the first input file is used for the
    output.

    NOTE(review): the collapsed original does not show whether
    ``write_record`` was nested under one of the p-value checks; this
    version writes every record that reaches the tests -- confirm.

    Returns 0 on completion.
    """
    parser = argparse.ArgumentParser(description='Search validation data for germline homs/hets')
    parser.add_argument('vcffile', nargs='+', help='Vcf file(s) to check')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
                        help='Output VCF (default: stdout)')
    parser.add_argument('-e', '--errorrate', type=float, default=0.02, help='Error rate')
    parser.add_argument('-a', '--alpha', type=float, default=0.05, help='prob threshold for calling hets/homs')
    args = parser.parse_args()

    vcf_reader = vcf.Reader(filename=args.vcffile[0])
    vcf_writer = vcf.Writer(args.output, vcf_reader)
    threshold = 1. - args.alpha

    for filename in args.vcffile:
        vcf_reader = vcf.Reader(filename=filename)
        for record in vcf_reader:
            if 'NormalReads' not in record.INFO or \
               'NormalEvidenceReads' not in record.INFO:
                continue

            # FILTER is None when the column is '.', so guard the lookup.
            if record.FILTER and 'LOWDEPTH' in record.FILTER:
                continue

            normdepth = int(record.INFO['NormalReads'][0])
            if normdepth == 0:
                # No normal reads: the implied VAF is undefined.
                continue
            normevidence = sum(int(nr) for nr in record.INFO['NormalEvidenceReads'])

            impl_vaf = normevidence * 1. / normdepth
            p_het = _binom_pvalue(normevidence, normdepth, max(args.errorrate, impl_vaf))
            p_hom = _binom_pvalue(normevidence, normdepth, max(.75, impl_vaf))

            if p_het > threshold:
                record.INFO['HET'] = 1. - p_het
            if p_hom > threshold:
                record.INFO['HOM'] = 1. - p_hom

            vcf_writer.write_record(record)

    return 0