def __init__(self, filename):
    self.vcf_reader = vcfpy.Reader.from_path(filename)
    self.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([
            ('ID', 'TFX_HGVSBC_G'),
            ('Number', '.'),
            ('Type', 'String'),
            ('Description', 'HGVS genomic reference, as produced by pyhgvs.')]))
    self.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([
            ('ID', 'TFX_HGVSBC_C'),
            ('Number', '.'),
            ('Type', 'String'),
            ('Description', 'HGVS coding reference, as produced by pyhgvs.')]))
    self.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([
            ('ID', 'TFX_HGVSBC_P1'),
            ('Number', '.'),
            ('Type', 'String'),
            ('Description', 'HGVS 1-letter protein reference, as produced by pyhgvs.')]))
    self.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([
            ('ID', 'TFX_HGVSBC_P3'),
            ('Number', '.'),
            ('Type', 'String'),
            ('Description', 'HGVS 3-letter protein reference, as produced by pyhgvs.')]))
def test_header_pedigree_header_line():
    line1 = header.PedigreeHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "child"), ("Father", "father")]))
    line2 = header.PedigreeHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "child"), ("Father", "father")]))
    line3 = header.PedigreeHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "father")]))
    assert line1 == line2
    assert line1 != line3
    if sys.version_info < (3, 6):
        assert str(line1) == (
            "PedigreeHeaderLine('PEDIGREE', '<ID=child,Father=father>', "
            "OrderedDict([('ID', 'child'), ('Father', 'father')]))")
        assert repr(line1) == (
            "PedigreeHeaderLine('PEDIGREE', '<ID=child,Father=father>', "
            "OrderedDict([('ID', 'child'), ('Father', 'father')]))")
    else:
        assert str(line1) == (
            "PedigreeHeaderLine('PEDIGREE', '<ID=child,Father=father>', {'ID': 'child', 'Father': 'father'})"
        )
        assert repr(line1) == (
            "PedigreeHeaderLine('PEDIGREE', '<ID=child,Father=father>', {'ID': 'child', 'Father': 'father'})"
        )
    assert line1.value == "<ID=child,Father=father>"
    assert line1.serialize() == "##PEDIGREE=<ID=child,Father=father>"
    with pytest.raises(TypeError):
        hash(line1)
def test_header_filter_header_line():
    line1 = header.FilterHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "PASS"), ("Description", "All filters passed")]))
    line2 = header.FilterHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "PASS"), ("Description", "All filters passed")]))
    line3 = header.FilterHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "q30"), ("Description", "Phred score <30")]))
    assert line1 == line2
    assert line1 != line3
    if sys.version_info < (3, 6):
        assert str(line1) == (
            "FilterHeaderLine('FILTER', '<ID=PASS,Description=\"All filters passed\">', "
            "OrderedDict([('ID', 'PASS'), ('Description', 'All filters passed')]))")
        assert repr(line1) == (
            "FilterHeaderLine('FILTER', '<ID=PASS,Description=\"All filters passed\">', "
            "OrderedDict([('ID', 'PASS'), ('Description', 'All filters passed')]))")
    else:
        assert str(line1) == (
            "FilterHeaderLine('FILTER', '<ID=PASS,Description=\"All filters passed\">', "
            "{'ID': 'PASS', 'Description': 'All filters passed'})")
        assert repr(line1) == (
            "FilterHeaderLine('FILTER', '<ID=PASS,Description=\"All filters passed\">', "
            "{'ID': 'PASS', 'Description': 'All filters passed'})")
    assert line1.value == '<ID=PASS,Description="All filters passed">'
    assert line1.serialize() == '##FILTER=<ID=PASS,Description="All filters passed">'
    with pytest.raises(TypeError):
        hash(line1)
def test_header_sample_header_line():
    line1 = header.SampleHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "sample1")]))
    line2 = header.SampleHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "sample1")]))
    line3 = header.SampleHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "sample2")]))
    assert line1 == line2
    assert line1 != line3
    if sys.version_info < (3, 6):
        assert str(line1) == (
            "SampleHeaderLine('SAMPLE', '<ID=sample1>', OrderedDict([('ID', 'sample1')]))")
        assert repr(line1) == (
            "SampleHeaderLine('SAMPLE', '<ID=sample1>', OrderedDict([('ID', 'sample1')]))")
    else:
        assert str(line1) == (
            "SampleHeaderLine('SAMPLE', '<ID=sample1>', {'ID': 'sample1'})")
        assert repr(line1) == (
            "SampleHeaderLine('SAMPLE', '<ID=sample1>', {'ID': 'sample1'})")
    assert line1.value == "<ID=sample1>"
    assert line1.serialize() == "##SAMPLE=<ID=sample1>"
    with pytest.raises(TypeError):
        hash(line1)
def test_header_info_header_line():
    line1 = header.InfoHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "SVTYPE"), ("Number", 1), ("Type", "String")]))
    line2 = header.InfoHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "SVTYPE"), ("Number", 1), ("Type", "String")]))
    line3 = header.InfoHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "END"), ("Number", 1), ("Type", "Integer")]))
    assert line1 == line2
    assert line1 != line3
    if sys.version_info < (3, 6):
        assert str(line1) == (
            "InfoHeaderLine('INFO', '<ID=SVTYPE,Number=1,Type=String>', "
            "OrderedDict([('ID', 'SVTYPE'), ('Number', 1), ('Type', 'String')]))")
        assert repr(line1) == (
            "InfoHeaderLine('INFO', '<ID=SVTYPE,Number=1,Type=String>', "
            "OrderedDict([('ID', 'SVTYPE'), ('Number', 1), ('Type', 'String')]))")
    else:
        assert str(line1) == (
            "InfoHeaderLine('INFO', '<ID=SVTYPE,Number=1,Type=String>', "
            "{'ID': 'SVTYPE', 'Number': 1, 'Type': 'String'})")
        assert repr(line1) == (
            "InfoHeaderLine('INFO', '<ID=SVTYPE,Number=1,Type=String>', "
            "{'ID': 'SVTYPE', 'Number': 1, 'Type': 'String'})")
    assert line1.value == "<ID=SVTYPE,Number=1,Type=String>"
    assert line1.serialize() == "##INFO=<ID=SVTYPE,Number=1,Type=String>"
    with pytest.raises(TypeError):
        hash(line1)
def test_header_format_header_line():
    line1 = header.FormatHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "AD"), ("Number", "R"), ("Type", "Integer")]))
    line2 = header.FormatHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "AD"), ("Number", "R"), ("Type", "Integer")]))
    line3 = header.FormatHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "DP"), ("Number", 1), ("Type", "Integer")]))
    assert line1 == line2
    assert line1 != line3
    if sys.version_info < (3, 6):
        assert str(line1) == (
            "FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer>', "
            "OrderedDict([('ID', 'AD'), ('Number', 'R'), ('Type', 'Integer')]))")
        assert repr(line1) == (
            "FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer>', "
            "OrderedDict([('ID', 'AD'), ('Number', 'R'), ('Type', 'Integer')]))")
    else:
        assert str(line1) == (
            "FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer>', "
            "{'ID': 'AD', 'Number': 'R', 'Type': 'Integer'})")
        assert repr(line1) == (
            "FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer>', "
            "{'ID': 'AD', 'Number': 'R', 'Type': 'Integer'})")
    assert line1.value == "<ID=AD,Number=R,Type=Integer>"
    assert line1.serialize() == "##FORMAT=<ID=AD,Number=R,Type=Integer>"
    with pytest.raises(TypeError):
        hash(line1)
def test_header_alt_allele_header_line():
    line1 = header.AltAlleleHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "DEL"), ("Description", "deletion")]))
    line2 = header.AltAlleleHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "DEL"), ("Description", "deletion")]))
    line3 = header.AltAlleleHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "DUP"), ("Description", "duplication")]))
    assert line1 == line2
    assert line1 != line3
    if sys.version_info < (3, 6):
        assert str(line1) == (
            """AltAlleleHeaderLine('ALT', '<ID=DEL,Description="deletion">', """
            """OrderedDict([('ID', 'DEL'), ('Description', 'deletion')]))""")
        assert repr(line1) == (
            """AltAlleleHeaderLine('ALT', '<ID=DEL,Description="deletion">', """
            """OrderedDict([('ID', 'DEL'), ('Description', 'deletion')]))""")
    else:
        assert str(line1) == (
            "AltAlleleHeaderLine('ALT', '<ID=DEL,Description=\"deletion\">', "
            "{'ID': 'DEL', 'Description': 'deletion'})")
        assert repr(line1) == (
            "AltAlleleHeaderLine('ALT', '<ID=DEL,Description=\"deletion\">', "
            "{'ID': 'DEL', 'Description': 'deletion'})")
    assert line1.value == '<ID=DEL,Description="deletion">'
    assert line1.serialize() == '##ALT=<ID=DEL,Description="deletion">'
    with pytest.raises(TypeError):
        hash(line1)
def test_header_contig_header_line():
    line1 = header.ContigHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "1"), ("length", 234)]))
    line2 = header.ContigHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "1"), ("length", 234)]))
    line3 = header.ContigHeaderLine.from_mapping(
        vcfpy.OrderedDict([("ID", "2"), ("length", 123)]))
    assert line1 == line2
    assert line1 != line3
    if sys.version_info < (3, 6):
        assert str(line1) == (
            "ContigHeaderLine('contig', '<ID=1,length=234>', OrderedDict([('ID', '1'), ('length', 234)]))")
        assert repr(line1) == (
            "ContigHeaderLine('contig', '<ID=1,length=234>', OrderedDict([('ID', '1'), ('length', 234)]))")
    else:
        assert str(line1) == (
            "ContigHeaderLine('contig', '<ID=1,length=234>', {'ID': '1', 'length': 234})")
        assert repr(line1) == (
            "ContigHeaderLine('contig', '<ID=1,length=234>', {'ID': '1', 'length': 234})")
    assert line1.value == "<ID=1,length=234>"
    assert line1.serialize() == "##contig=<ID=1,length=234>"
    with pytest.raises(TypeError):
        hash(line1)
def run(vi, vo, mmc, mmf, mmfs, mff):
    """Filter VCF based on the given parameters."""
    mff = 0.99  # note: this overrides the value passed in
    if not vi:
        exit("VCF file path not given. Exiting . . . . . .")
    if not path.isfile(vi):
        exit("Given file path doesn't exist or it is not a file. Exiting . . . . .")
    try:
        reader = vcfpy.Reader.from_path(vi)
        reader.header.add_filter_line(
            vcfpy.OrderedDict([('ID', 'min_alt_count'),
                               ('Description', f'Minimum {mmc} from each strand')]))
        reader.header.add_filter_line(
            vcfpy.OrderedDict([('ID', 'min_alt_frac'),
                               ('Description', f'Minimum {mmf} of total DP4')]))
        reader.header.add_filter_line(
            vcfpy.OrderedDict([('ID', 'min_alt_fication_frac'),
                               ('Description', f'Minimum {mff} of total DP4')]))
        writer = (vcfpy.Writer.from_path('/dev/stdout', reader.header)
                  if not vo else vcfpy.Writer.from_path(vo, reader.header))
        for record in reader:
            # DP4 = [ref fwd, ref rev, alt fwd, alt rev]
            base_sum = sum(record.INFO["DP4"])
            alt_fwd = record.INFO["DP4"][2]
            alt_rev = record.INFO["DP4"][3]
            alt_sum = alt_fwd + alt_rev
            if (alt_fwd >= mmc
                    and alt_fwd >= mmfs * alt_sum
                    and alt_rev >= mmc
                    and alt_rev >= mmfs * alt_sum
                    and alt_sum >= mmf * base_sum):
                if alt_sum >= mff * base_sum:
                    record.INFO["DP4"][0] = 0
                    record.INFO["DP4"][1] = 0
                    record.INFO["AF"] = 1.0
                writer.write_record(record)
    except Exception:
        exit("Please check the given file. Is it in proper format? Exiting . . . . .")
def test_header_without_lines():
    lines = [header.HeaderLine("foo", "bar"), header.HeaderLine("foo2", "bar2")]
    samples = header.SamplesInfos(["one", "two", "three"])
    hdr = header.Header(lines, samples)
    hdr.add_filter_line(vcfpy.OrderedDict([("ID", "PASS")]))
    hdr.add_filter_line(vcfpy.OrderedDict([("ID", "q30")]))
    assert len(hdr.lines) == 4

    hdr2 = header.header_without_lines(hdr, [("foo", "bar"), ("FILTER", "q30")])
    assert len(hdr2.lines) == 2
    assert hdr2.samples == hdr.samples
def test_add_format_line(vcf_header):
    # check header before adding
    assert len(vcf_header.lines) == 18
    # add header line
    VALUE = '<ID=GTa,Number=1,Type=String,Description="Genotype">'
    line = header.FormatHeaderLine(
        "FORMAT",
        VALUE,
        vcfpy.OrderedDict([("ID", "GTa"), ("Number", 1), ("Type", "String"),
                           ("Description", "Genotype")]),
    )
    vcf_header.add_line(line)
    # check header after adding
    assert len(vcf_header.lines) == 19
    assert "GTa" in vcf_header._indices["FORMAT"]
    assert vcf_header._indices["FORMAT"]["GTa"] is vcf_header.lines[-1]
    # check resulting added header line
    assert vcf_header.lines[-1].key == "FORMAT"
    assert vcf_header.lines[-1].value == VALUE
    assert len(vcf_header.lines[-1].mapping) == 4
    assert vcf_header.lines[-1].mapping["ID"] == "GTa"
    assert vcf_header.lines[-1].mapping["Number"] == 1
    assert vcf_header.lines[-1].mapping["Type"] == "String"
    assert vcf_header.lines[-1].mapping["Description"] == "Genotype"
def test_add_contig_line_shortcut(vcf_header):
    # check header before adding
    assert len(vcf_header.lines) == 18
    assert "20a" not in vcf_header._indices["contig"]
    # add header line
    mapping = vcfpy.OrderedDict([
        ("ID", "20a"),
        ("length", 62435964),
        ("assembly", "B36"),
        ("md5", "f126cdf8a6e0c7f379d618ff66beb2da"),
        ("species", "Homo sapiens"),
        ("taxonomy", "x"),
    ])
    vcf_header.add_contig_line(mapping)
    # check header after adding
    assert len(vcf_header.lines) == 19
    assert "20a" in vcf_header._indices["contig"]
    assert vcf_header._indices["contig"]["20a"] is vcf_header.lines[-1]
    # check resulting added header line
    assert vcf_header.lines[-1].key == "contig"
    VALUE = ("<ID=20a,length=62435964,assembly=B36,"
             "md5=f126cdf8a6e0c7f379d618ff66beb2da,"
             'species="Homo sapiens",taxonomy=x>')
    assert vcf_header.lines[-1].value == VALUE
    assert len(vcf_header.lines[-1].mapping) == 6
    assert vcf_header.lines[-1].mapping["ID"] == "20a"
    assert vcf_header.lines[-1].mapping["length"] == 62435964
    assert vcf_header.lines[-1].mapping["assembly"] == "B36"
    assert vcf_header.lines[-1].mapping["md5"] == "f126cdf8a6e0c7f379d618ff66beb2da"
    assert vcf_header.lines[-1].mapping["species"] == "Homo sapiens"
    assert vcf_header.lines[-1].mapping["taxonomy"] == "x"
def test_add_info_line_shortcut(vcf_header):
    # check header before adding
    assert len(vcf_header.lines) == 18
    # add header line
    VALUE = '<ID=DPa,Number=1,Type=Integer,Description="Total Depth">'
    mapping = vcfpy.OrderedDict([("ID", "DPa"), ("Number", 1), ("Type", "Integer"),
                                 ("Description", "Total Depth")])
    vcf_header.add_info_line(mapping)
    # check header after adding
    assert len(vcf_header.lines) == 19
    assert "DPa" in vcf_header._indices["INFO"]
    assert vcf_header._indices["INFO"]["DPa"] is vcf_header.lines[-1]
    # check resulting added header line
    assert vcf_header.lines[-1].key == "INFO"
    assert vcf_header.lines[-1].value == VALUE
    assert len(vcf_header.lines[-1].mapping) == 4
    assert vcf_header.lines[-1].mapping["ID"] == "DPa"
    assert vcf_header.lines[-1].mapping["Number"] == 1
    assert vcf_header.lines[-1].mapping["Type"] == "Integer"
    assert vcf_header.lines[-1].mapping["Description"] == "Total Depth"
def merge_contig_header(self):
    for contig in self.readers[0].header.get_lines('contig'):
        self.merge_header.add_contig_line(
            vcfpy.OrderedDict([('ID', contig.id), ('length', contig.length)]))
def add_caller_filter_header(self):
    """Add caller info as FILTER to writer header."""
    for reader, caller in zip(self.readers, self.callers):
        reader.header.add_filter_line(
            vcfpy.OrderedDict([('ID', caller),
                               ('Description', 'Variant caller label.')]))
def merge_filter_header(self):
    exclude = ['PASS']
    exclude.extend(self.callers)
    seen = []
    for reader, caller in zip(self.readers, self.callers):
        for filter in reader.header.get_lines('FILTER'):
            if filter.id not in seen:
                seen.append(filter.id)
                if filter.id not in exclude:
                    self.merge_header.add_filter_line(
                        vcfpy.OrderedDict(
                            [('ID', '{}_{}'.format(caller, filter.id)),
                             ('Description',
                              '{} {}'.format(caller, filter.description))]))
                else:
                    self.merge_header.add_filter_line(
                        vcfpy.OrderedDict([('ID', filter.id),
                                           ('Description', filter.description)]))
def merge_info_header(self):
    for reader, caller in zip(self.readers, self.callers):
        for info in reader.header.get_lines('INFO'):
            self.merge_header.add_info_line(
                vcfpy.OrderedDict(
                    [('ID', '{}_{}'.format(caller, info.id)),
                     ('Number', info.number),
                     ('Type', info.type),
                     ('Description', '{} {}'.format(caller, info.description))]))
def merge_format_header(self):
    for reader, caller in zip(self.readers, self.callers):
        for format in reader.header.get_lines('FORMAT'):
            self.merge_header.add_format_line(
                vcfpy.OrderedDict(
                    [('ID', '{}_{}'.format(caller, format.id)),
                     ('Number', format.number),
                     ('Type', format.type),
                     ('Description', '{} {}'.format(caller, format.description))]))
def test_header_has_header_line_positive():
    lines = [
        header.FormatHeaderLine.from_mapping(
            vcfpy.OrderedDict([("ID", "DP"), ("Number", "R"), ("Type", "Integer")])),
        header.InfoHeaderLine.from_mapping(
            vcfpy.OrderedDict([("ID", "AD"), ("Number", "R"), ("Type", "Integer")])),
        header.FilterHeaderLine.from_mapping(
            vcfpy.OrderedDict([("ID", "PASS"), ("Description", "All filters passed")])),
        header.ContigHeaderLine.from_mapping(
            vcfpy.OrderedDict([("ID", "1"), ("length", 234)])),
    ]
    samples = header.SamplesInfos(["one", "two", "three"])
    hdr = header.Header(lines, samples)

    assert hdr.has_header_line("FORMAT", "DP")
    assert hdr.has_header_line("INFO", "AD")
    assert hdr.has_header_line("FILTER", "PASS")
    assert hdr.has_header_line("contig", "1")
def collect_all_vcf(
    dirs: str,
    vcf_filename: str = "phased.partial.vcf",
    output: str = "IsoSeq_IsoPhase.vcf",
) -> None:
    no_snp_found_filename = Path(f"{Path(vcf_filename).stem}.NO_SNPS_FOUND")
    snps_by_chrom = defaultdict(lambda: [])
    reader = None
    for d in dirs:
        filename = Path(d, vcf_filename)
        if not filename.exists():
            if not no_snp_found_filename.exists():
                logger.info(f"VCF file {filename} does not exist. Skipping.")
            continue
        with open(filename) as rf:
            reader = vcfpy.Reader(rf)
            for r in reader:
                c = Counter()  # genotype -> count
                for x in r.samples:
                    if x.data.GT.count("|") == 0:
                        c[x.data.GT] += x.data.HQ
                    else:
                        for i, gt in enumerate(x.data.GT.split("|")):
                            c[gt] += x.data.HQ[i]
                c_keys = c.keys()
                genotype = "|".join(str(k) for k in c_keys)
                counts = ",".join(str(c[k]) for k in c_keys)
                r.samples = [
                    vcfpy.Call(
                        r,
                        "SAMPLE",
                        vcfpy.OrderedDict([("GT", genotype), ("HQ", counts)]),
                    )
                ]
                snps_by_chrom[r.CHROM].append((r.POS, r))

    keys = list(snps_by_chrom.keys())
    keys.sort()

    if reader is not None:
        reader.samples = ["SAMPLE"]
        with open(output, "w") as f:
            f = vcfpy.Writer(f, reader)
            for k in keys:
                v = snps_by_chrom[k]
                v.sort(key=lambda x: x[0])
                for _, rec in v:
                    f.write_record(rec)
        print("Output written to:", output)
def main():
    # read from stdin
    reader = vcfpy.Reader.from_path('/dev/stdin')
    # add 'nhomalt' to header
    reader.header.add_info_line(
        vcfpy.OrderedDict([
            ('ID', 'nhomalt'),
            ('Number', 'A'),
            ('Type', 'Integer'),
            ('Description',
             'The number of individuals that are called homozygous for the alternate allele.')]))
    # write to stdout with modified header
    with vcfpy.Writer.from_path('/dev/stdout', reader.header) as writer:
        for record in reader:
            record.INFO['nhomalt'] = [nhomalt(record)]
            writer.write_record(record)
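# The nhomalt() helper used in main() above is not shown in the snippet. Below is a
# minimal sketch of one possible implementation; it is an assumption, not the original
# code, and for simplicity it only counts genotypes homozygous for the first ALT allele.
def nhomalt(record):
    """Count calls whose genotype is homozygous for the (first) alternate allele."""
    count = 0
    for call in record.calls:
        gt = call.data.get("GT")
        if not gt:
            continue
        alleles = gt.replace("|", "/").split("/")
        if len(alleles) == 2 and alleles[0] == alleles[1] == "1":
            count += 1
    return count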
def build_rec(calls=None, format_extras=None):
    calls = calls or []
    format_extras = format_extras or []
    alt1 = record.Substitution(vcfpy.SNV, "T")
    alt2 = record.Substitution(vcfpy.SNV, "A")
    return record.Record(
        "2",
        100,
        [],
        "C",
        [alt1, alt2],
        None,
        [],
        vcfpy.OrderedDict(),
        ["GT"] + format_extras,
        calls,
    )
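# A hedged usage sketch for build_rec() above; the sample name "sample-1" and the
# single heterozygous call are illustrative and not taken from the original test suite.
call = vcfpy.Call("sample-1", vcfpy.OrderedDict([("GT", "0/1")]))
rec = build_rec(calls=[call])
assert rec.calls[0].sample == "sample-1"
assert rec.FORMAT == ["GT"]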
def generate_sv_record(records, comparison_result, sample_names):
    """
    This method generates a single SV record after a call has been made over a set of input records
    :param records: the input records involved in the SV call
    :param comparison_result:
    :param sample_names:
    :return:
    """
    # Build a map to easily find the records by the sample name. It can be multi-valued.
    sample_names_to_records = group_by(records, lambda record: get_sample_name(record))

    # Generate calls for each sample in this group
    calls = [
        get_sample_call(sample_name, sample_names_to_records.get(sample_name, None))
        for sample_name in sample_names
    ]

    first_record_of_the_group = records[0]
    chrom = first_record_of_the_group.CHROM
    id_of_new_record = generate_id(chrom, comparison_result.initial_position)

    info = vcfpy.OrderedDict()
    info["SVTYPE"] = comparison_result.svtype
    info["END"] = comparison_result.final_position
    if comparison_result.insseq is not None:
        info["INSSEQ"] = comparison_result.insseq

    return vcfpy.Record(
        # by construction, all the grouped records have the same CHROM, POS, and REF
        CHROM=chrom,
        POS=comparison_result.initial_position,
        ID=[id_of_new_record],
        REF=first_record_of_the_group.REF,
        ALT=[vcfpy.Substitution(type_=comparison_result.svtype,
                                value='<{}>'.format(comparison_result.svtype))],
        QUAL=maximum_qual(records),
        FILTER=["PASS"],
        INFO=info,
        FORMAT=["GT", "TRANCHE2", "VAF"],
        calls=calls)
def test_add_filter_line_shortcut(vcf_header):
    # check header before adding
    assert len(vcf_header.lines) == 18
    # add header line
    mapping = vcfpy.OrderedDict([("ID", "q10a"), ("Description", "Quality below 10")])
    vcf_header.add_filter_line(mapping)
    # check header after adding
    assert len(vcf_header.lines) == 19
    assert "q10a" in vcf_header._indices["FILTER"]
    assert vcf_header._indices["FILTER"]["q10a"] is vcf_header.lines[-1]
    # check resulting added header line
    assert vcf_header.lines[-1].key == "FILTER"
    VALUE = '<ID=q10a,Description="Quality below 10">'
    assert vcf_header.lines[-1].value == VALUE
    assert len(vcf_header.lines[-1].mapping) == 2
    assert vcf_header.lines[-1].mapping["ID"] == "q10a"
    assert vcf_header.lines[-1].mapping["Description"] == "Quality below 10"
def select_info_header(self):
    for info_field in self.info_fields:
        for caller in self.caller_priority:
            id = '{}_{}'.format(caller, info_field)
            if id in self.reader.header.info_ids():
                info = self.reader.header.get_info_field_info(id)
                self.write_header.add_info_line(
                    vcfpy.OrderedDict([
                        ('ID', info.id.split('_', 1)[1]),
                        ('Number', info.number),
                        ('Type', info.type),
                        ('Description', info.description.split(' ', 1)[1])]))
                break
            else:
                print('{} not found for {} in INFO column.'.format(info_field, caller))
    not_found = list(set(list(self.info_fields)) - set(self.write_header.info_ids()))
    if len(not_found) > 0:
        raise Exception(', '.join(not_found) + ' INFO field(s) not found in VCF')
def modify_outheader(outheader):
    info_lines = {
        'CALLER': vcfpy.OrderedDict([('ID', 'CALLER'), ('Number', '1'), ('Type', 'String'),
                                     ('Description', 'Variant Call method')]),
        'SS': vcfpy.OrderedDict([('ID', 'SS'), ('Number', '1'), ('Type', 'String'),
                                 ('Description', 'Somatic Status from respective Call Method')]),
        'OFS': vcfpy.OrderedDict([('ID', 'OFS'), ('Number', '.'), ('Type', 'String'),
                                  ('Description', 'Original FILTER State')]),
    }
    format_lines = {
        'GT': vcfpy.OrderedDict([('ID', 'GT'), ('Number', '1'), ('Type', 'String'),
                                 ('Description', 'Genotype')]),
        'AD': vcfpy.OrderedDict([('ID', 'AD'), ('Number', 'R'), ('Type', 'Integer'),
                                 ('Description', 'Alt Allele Depth')]),
        'AAF': vcfpy.OrderedDict([('ID', 'AAF'), ('Number', '1'), ('Type', 'Float'),
                                  ('Description', 'Alt Allele Frequency')]),
    }
    for i in info_lines.keys():
        if i not in outheader.info_ids():
            outheader.add_info_line(info_lines[i])
        else:
            outheader.get_info_field_info(i).mapping.update(info_lines[i])
    for f in format_lines.keys():
        if f not in outheader.format_ids():
            outheader.add_format_line(format_lines[f])
        else:
            outheader.get_format_field_info(f).mapping.update(format_lines[f])
    return outheader
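# A minimal usage sketch for modify_outheader(), assuming an existing "input.vcf";
# the file names are placeholders. The reader's header is patched in place and then
# reused for the writer.
reader = vcfpy.Reader.from_path("input.vcf")
writer = vcfpy.Writer.from_path("output.vcf", modify_outheader(reader.header))
for rec in reader:
    writer.write_record(rec)
writer.close()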
def main():
    args = supply_args()
    myvcf = VcfReader(args.infile)
    vrnts = myvcf.get_vrnts()
    coll = CollectMetrics()

    # Set VCF header.
    # 'AAP': hgvs_aap,
    # 'BASEP': hgvs_basep,
    # 'EXON': exon,
    # 'HGNC': hgnc,
    # 'HGVSC': hgvs_c,
    # 'HGVSP1': hgvs_p,
    # 'HGVSP3': hgvs_three,
    # 'SOURCE': self.tfx_type,
    # 'SPLICE': None,
    # 'TXC': tx,
    # 'VFX': self.veff,
    # 'PVT': None
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_AAP'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'Amino acid start position.')]))
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_BASEP'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'Coding sequence start position.')]))
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_EXON'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'Exon number associated with given transcript.')]))
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_HGNC'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'HGNC gene symbol.')]))
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_HGVSC'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'HGVS cdot nomenclature.')]))
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_HGVSP1'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'HGVS pdot nomenclature, single letter amino acids.')]))
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_HGVSP3'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'HGVS pdot nomenclature, three letter amino acids.')]))
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_SOURCE'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'Annotation source.')]))
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_SPLICE'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'Splice site annotation.')]))
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_TXC'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'Transcript identifier.')]))
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_VFX'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'Variant effect annotation.')]))
    myvcf.vcf_reader.header.add_info_line(
        vcfpy.OrderedDict([('ID', 'TFX_PVT'), ('Number', '.'), ('Type', 'String'),
                           ('Description', 'Variant type or location annotation.')]))

    if args.evf:
        my_evf = AnnovarExonVrntFunc(args.evf).read_annovar()
        coll.metrics_push(my_evf)
    if args.vf:
        my_vf = AnnovarVrntFunc(args.vf).read_annovar()
        coll.metrics_push(my_vf)
    if args.ccds_evf:
        my_ccds_evf = AnnovarExonVrntFunc(args.ccds_evf).read_annovar()
        coll.metrics_push(my_ccds_evf)
    if args.ccds_vf:
        my_ccds_vf = AnnovarVrntFunc(args.ccds_vf).read_annovar()
        coll.metrics_push(my_ccds_vf)

    to_write = {}
    for coord, txs in coll.metrics.items():
        comb = defaultdict(list)
        dict_items = map(methodcaller('items'), txs.values())
        for k, v in chain.from_iterable(dict_items):
            comb[k].append(v)
        to_write[coord] = comb
        # Look for any instances where the number of TFX fields does not match.
        num_vals = len(set([len(x) for x in comb.values()]))
        assert num_vals == 1 or not num_vals, "%s" % comb

    writer = VcfWriter(args.outfile, myvcf.vcf_reader)
    writer.write_metrics(vrnts, to_write)
def get_header(sample_name_to_header, chromosome_set):
    """
    Returns the header of the output VCF file
    :param sample_name_to_header: a dictionary from the sample names to the headers
    :param chromosome_set: the set of chromosomes selected for analysis
    :return: a vcfpy.Header
    """
    header = vcfpy.Header()
    header.add_line(vcfpy.HeaderLine(key="fileformat", value="VCFv4.2"))

    # CONTIG headers
    first_sample_header = next(iter(sample_name_to_header.values()))
    for input_header_line in first_sample_header.lines:
        if isinstance(input_header_line, vcfpy.ContigHeaderLine):
            if chromosome_set is None or input_header_line.mapping["ID"] in chromosome_set:
                header.add_line(input_header_line)

    # INFO fields
    header.add_info_line(
        vcfpy.OrderedDict(ID="END", Number=1, Type="Integer",
                          Description="Stop position of the interval"))
    header.add_info_line(
        vcfpy.OrderedDict(ID="SVTYPE", Number=1, Type="String",
                          Description="Type of structural variant"))
    header.add_info_line(
        vcfpy.OrderedDict(
            ID="INSSEQ", Number=1, Type="String",
            Description="Insertion sequence of structural variant, not including "
                        "sequence marked as duplication"))
    header.add_info_line(
        vcfpy.OrderedDict(
            ID="TRANCHE2", Number=1, Type="String",
            Description="Quality category of GRIDSS structural variant calls determined "
                        "using FILTER,SRQ,AS,RAS. Values are LOW INTERMEDIATE HIGH"))
    header.add_info_line(
        vcfpy.OrderedDict(
            ID="BNDVAF", Number=1, Type="Float",
            Description="VAF of this gridss-called BND calculated as "
                        "(SR+RP+IC+AS)/(REF+SR+RP+IC+AS)"))

    # FORMAT fields
    header.add_format_line(
        vcfpy.OrderedDict(ID="GT", Number=1, Type="String", Description="Genotype"))
    header.add_format_line(
        vcfpy.OrderedDict(
            ID="TRANCHE2", Number=1, Type="String",
            Description="Quality category of GRIDSS structural variant calls determined "
                        "using FILTER,SRQ,AS,RAS. Values are LOW INTERMEDIATE HIGH"))
    header.add_format_line(
        vcfpy.OrderedDict(
            ID="BNDVAF", Number=1, Type="Float",
            Description="VAF of this gridss-called BND calculated as "
                        "(SR+RP+IC+AS)/(REFPAIR+SR+RP+IC+AS)"))
    header.add_format_line(
        vcfpy.OrderedDict(
            ID="VAF", Number=1, Type="Float",
            Description="VAF of this SV call, derived from BNDVAF values of BND calls "
                        "used to call this SV"))
    header.add_format_line(
        vcfpy.OrderedDict(
            ID="INSSEQ", Number=1, Type="String",
            Description="Insertion sequence of structural variant, not including "
                        "sequence marked as duplication"))

    # Samples, sorted to ensure determinism
    sample_names = sample_name_to_header.keys()
    header.samples = vcfpy.SamplesInfos(sorted(sample_names))
    return header
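# A hedged usage sketch for get_header(); file names and sample names below are
# placeholders, and chromosome_set=None keeps every contig from the first header.
readers = {
    "sampleA": vcfpy.Reader.from_path("sampleA.vcf"),
    "sampleB": vcfpy.Reader.from_path("sampleB.vcf"),
}
sample_name_to_header = {name: rdr.header for name, rdr in readers.items()}
merged_header = get_header(sample_name_to_header, chromosome_set=None)
writer = vcfpy.Writer.from_path("merged.vcf", merged_header)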
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import vcfpy

# Open input, add FILTER header, and open output file
reader = vcfpy.Reader.from_path("input.vcf")
reader.header.add_filter_line(
    vcfpy.OrderedDict([("ID", "DP10"), ("Description", "total DP < 10")]))
writer = vcfpy.Writer.from_path("/dev/stdout", reader.header)

# Add "DP10" filter to records having less than 10 reads
for record in reader:
    ad = sum(c.data.get("DP", 0) for c in record.calls)
    if ad < 10:
        record.add_filter("DP10")
    writer.write_record(record)
# Variant calling score (QUAL) field
dico_vcf[var_id]["features"][caller_name]["QUAL"] = record.QUAL
# Filter field
dico_vcf[var_id]["features"][caller_name]["FILTER"] = record.FILTER
# Genotype (GT) field
dico_vcf[var_id]["features"][caller_name]["GT"] = record.calls[0].data.get('GT').replace("|", "/")
# Read Depth (DP) field
dico_vcf[var_id]["features"][caller_name]["DP"] = record.calls[0].data.get('DP')
# Allele Frequency (AF) field
if caller_name in ["strelka", "deepvariant"]:
    # for strelka and deepvariant, AD for ref and alt is in FORMAT
    dico_vcf[var_id]["features"][caller_name]["AF"] = round(
        float(record.calls[0].data['AD'][1]) /
        (float(record.calls[0].data['AD'][0]) + float(record.calls[0].data['AD'][1])), 2)
else:
    dico_vcf[var_id]["features"][caller_name]["AF"] = round(float(record.INFO['AF'][0]), 2)

#***** CREATE new vcf header *****#
# Callers list
new_header.add_line(vcfpy.HeaderLine("Nk_calls", "|".join(lst_caller_name)))
# Filters list
for filter_id in dico_filter_line:
    new_header.add_filter_line(
        vcfpy.OrderedDict([('ID', filter_id), ('Description', dico_filter_line[filter_id])]))
new_header.add_filter_line(
    vcfpy.OrderedDict([('ID', "FILTER"), ('Description', "All callers filtered")]))
# INFO
dico_line_info_CALLNB = collections.OrderedDict([
    ("ID", "CALLNB"), ("Number", "A"), ("Type", "Integer"),
    ("Description", "Number of PASS calls")])
new_header.add_info_line(dico_line_info_CALLNB)
dico_line_info_CALLAF = collections.OrderedDict([
    ("ID", "CALLAF"), ("Number", "A"), ("Type", "String"),
    ("Description", "Allele frequency per caller")])
new_header.add_info_line(dico_line_info_CALLAF)
dico_line_info_CALLFILTER = collections.OrderedDict([
    ("ID", "CALLFILTER"), ("Number", "A"), ("Type", "String"),
    ("Description", "Filters per caller")])
new_header.add_info_line(dico_line_info_CALLFILTER)
dico_line_info_CALLQUAL = collections.OrderedDict([
    ("ID", "CALLQUAL"), ("Number", "A"), ("Type", "String"),
    ("Description", "Variant quality per caller")])
new_header.add_info_line(dico_line_info_CALLQUAL)
# FORMAT
dico_line_info_GT = collections.OrderedDict([
    ("ID", "GT"), ("Number", "1"), ("Type", "String"), ("Description", "Genotype")])
new_header.add_format_line(dico_line_info_GT)
dico_line_info_DP = collections.OrderedDict([
    ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
    ("Description", "Read depth (median if multiple calls)")])
new_header.add_format_line(dico_line_info_DP)