def add_vcf_header_info(self, vcf_template):
    """
    Register the VCF INFO headers for the annotated values provided.

    This is only a base implementation: override it in your own class so the
    declared headers match the annotations added by the annotate_record
    method.

    :param vcf_template: vcf reader object; its ``infos`` mapping is updated
        in place
    :return: None
    """
    # (id, number, type, description) for each INFO field, in the order in
    # which they are registered.
    header_specs = (
        ('variant_id', 1, 'Integer', 'Saphetor variant identifier'),
        ('gene', '.', 'String', 'Genes related to this variant'),
        ('gnomad_exomes_AF', '.', 'Float',
         'GnomAD exomes allele frequency value'),
        ('gnomad_genomes_AF', '.', 'Float',
         'GnomAD genomes allele frequency value'),
    )
    for field_id, number, value_type, description in header_specs:
        vcf_template.infos[field_id] = _Info(
            field_id, number, value_type, description, None, None)
def test_create_processed_variant_annotation_alt_allele_num(self):
    """Checks annotation-to-ALT matching when ALLELE_NUM is used.

    The variant has two alternates ('T' and 'CT') and six CSQ annotations
    that all name 'T' as their allele. With `use_allele_num=True` only the
    annotations whose ALLELE_NUM is 1 or 2 are expected to be attached (to
    the first and second alternate respectively); the remaining four are
    expected to be counted as ALLELE_NUM_INCORRECT.
    """
    csq_info = parser._Info(
        id=None, num='.', type=None,
        desc='some desc Allele|Consequence|IMPACT|ALLELE_NUM',
        source=None, version=None)
    header_fields = vcf_header_io.VcfHeader(infos={'CSQ': csq_info})
    variant = vcfio.Variant(
        reference_name='19', start=11, end=12, reference_bases='C',
        # The following represent a SNV and an insertion, resp.
        alternate_bases=['T', 'CT'],
        names=['rs1'], quality=2,
        filters=['PASS'],
        # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
        # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
        # But because there is ALLELE_NUM there should be no ambiguity.
        # The last four annotations have incorrect ALLELE_NUMs.
        info={'CSQ': ['T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
                      'T|C5|I5|TEST', 'T|C6|I6|']})
    counter_factory = _CounterSpyFactory()
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'],
        use_allele_num=True,
        minimal_match=True,  # This should be ignored by the factory method.
        counter_factory=counter_factory)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('T')
    alt1._info = {
        'CSQ': [
            {annotation_parser.ANNOTATION_ALT: 'T', 'Consequence': 'C1',
             'IMPACT': 'I1', 'ALLELE_NUM': '1'}]
    }
    alt2 = processed_variant.AlternateBaseData('CT')
    alt2._info = {
        'CSQ': [
            {annotation_parser.ANNOTATION_ALT: 'T', 'Consequence': 'C2',
             'IMPACT': 'I2', 'ALLELE_NUM': '2'}]
    }
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
    # `dict.has_key` only exists on Python 2; a membership assertion works on
    # both Python 2.7 and Python 3 and reports a clearer failure message.
    self.assertNotIn('CSQ', proc_var.non_alt_info)

    counters = counter_factory.counter_map
    self.assertEqual(counters[CEnum.VARIANT.value].get_value(), 1)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 0)
    self.assertEqual(
        counters[CEnum.ALLELE_NUM_INCORRECT.value].get_value(), 4)
def __init__(self, vcf_file, **opts):
    """Load and quality-filter a VCF file and prepare it for annotation.

    :param vcf_file: path of the VCF file to open for reading
    :param opts: keyword options. Keys read here:
        ``filter_args`` (required; passed to ``init_quality_filters``),
        ``verbose`` (optional, defaults to False) and
        ``annotation_manager`` (optional; when absent an
        ``EntrezAnnotationManager`` is built from ``opts``).
    :raises KeyError: if ``filter_args`` is not present in ``opts``
    """
    self.vcf_file = vcf_file
    self.opts = opts
    self.verbose = opts.get("verbose", False)
    # NOTE(review): the file handle is owned by the reader and is never
    # closed explicitly here; the reader is kept on ``self`` so the handle
    # cannot simply be closed at this point.
    self.reader = vcf.VCFReader(open(vcf_file, 'r'))
    self.variants = vcf_utils.filter_vcf_in_memory(
        self.reader,
        init_quality_filters(self.opts['filter_args']),
        keep=True)
    self.annotated_variants = []
    # Build the default manager only when the caller did not supply one;
    # passing it as the default of ``dict.get`` would construct it eagerly
    # on every call, even when it is immediately discarded.
    if "annotation_manager" in opts:
        self.annotation_manager = opts["annotation_manager"]
    else:
        self.annotation_manager = EntrezAnnotationManager(**opts)
    # Declare the GENE INFO field on the reader's header.
    self.reader.infos['GENE'] = _Info(
        'GENE', 1, "String", "Gene containing this variant")
    self.gene_list = defaultdict(list)
def _get_sample_variant_and_header_with_csq(self):
    """Return the sample variant with a CSQ info field and a matching header.

    The variant comes from `_get_sample_variant` and gets three CSQ
    annotation strings; the header declares a single CSQ INFO field whose
    description lists the annotation sub-fields.

    Returns:
      A `(variant, header_fields)` tuple.
    """
    csq_values = ['A|C1|I1|S1|G1', 'TT|C2|I2|S2|G2', 'A|C3|I3|S3|G3']
    sample = self._get_sample_variant()
    sample.info['CSQ'] = vcfio.VariantInfo(data=csq_values, field_count='.')

    csq_header = parser._Info(
        id=None,
        num='.',
        type=None,
        desc='some desc Allele|Consequence|IMPACT|SYMBOL|Gene',
        source=None,
        version=None)
    return sample, vcf_header_io.VcfHeader(infos={'CSQ': csq_header})
def merge_hc_mity(fhc, fmity, fout, priority):
    """Merges the given HaplotypeCaller and mity VCFs into a new VCF.

    Records on any contig other than "MT" are taken from the HaplotypeCaller
    file and records on "MT" are taken from the mity file. A record that only
    exists in the file that is not used for its contig is not written.

    Args:
        fhc: file object of the HaplotypeCaller VCF (given to `vcf.Reader`).
        fmity: file object of the mity VCF (given to `vcf.Reader`).
        fout: file object the merged VCF is written to.
        priority: not used by the current implementation; kept so existing
            callers keep working.

    Raises:
        ValueError: if the two inputs differ in samples, contigs or formats.
    """
    hc = vcf.Reader(fhc)
    mity = vcf.Reader(fmity)

    # some sanity checks
    # TODO: possible to make it handle different samples in the two VCFs?
    if sorted(hc.samples) != sorted(mity.samples):
        raise ValueError(
            "Input VCF files must have the same sample column headers.")
    if sorted(hc.contigs.keys()) != sorted(mity.contigs.keys()):
        raise ValueError("Input VCF files must denote the same contigs.")
    if sorted(hc.formats.keys()) != sorted(mity.formats.keys()):
        raise ValueError("Input VCF files must contain the same formats.")

    # NOTE: arbitrarily picking mity as the base template ~ we're doing
    #       dict updates, so the hc values will take precedence
    # merge infos
    mity.infos.update(hc.infos)
    # merge formats ~ not necessary since they're equal
    # TODO: merge filters?

    # merge metadata: keep each caller's command line under its own key
    if 'GATKCommandLine' in mity.metadata:
        mity.metadata['UnifiedGenotyperCommandLine'] = \
            mity.metadata['GATKCommandLine']
    if 'GATKCommandLine' in hc.metadata:
        mity.metadata['HaplotypeCallerCommandLine'] = \
            hc.metadata['GATKCommandLine']
    # Drop the generic key from both sides. `pop` with a default is used
    # because either file may lack the key, in which case `del` would raise
    # KeyError.
    mity.metadata.pop('GATKCommandLine', None)
    hc.metadata.pop('GATKCommandLine', None)
    mity.metadata.update(hc.metadata)

    # add custom INFO field, denoting the variant caller for each variant
    mity.infos['GATKCaller'] = _Info(
        'GATKCaller', '.', 'String',
        'GATK ' 'variant caller used to call the variant')

    out_writer = vcf.Writer(fout, mity)
    for hc_rec, mity_rec in walk_together(hc, mity):
        # walk_together yields None for a reader that has no record at the
        # current position, so each record is checked before it is used.
        if hc_rec is not None and hc_rec.CHROM != "MT":
            out_writer.write_record(hc_rec)
        elif mity_rec is not None and mity_rec.CHROM == "MT":
            out_writer.write_record(mity_rec)
        # Remaining cases (an MT record seen only by HaplotypeCaller, or a
        # non-MT record seen only by mity) come from the caller that is not
        # used for that contig, so they are skipped.
def make_header(header_num_dict):
    # type: (Dict[str, str]) -> VcfHeader
    """Builds a VcfHeader based on the header_num_dict.

    All fields of parser._Info are set to their default values except for
    the 'id', which is set to the keys in header_num_dict, and 'num', which
    is set based on header_num_dict values mapped according to
    parser.field_counts. Values that are not keys of parser.field_counts are
    converted with int().

    Args:
      header_num_dict: a dictionary mapping info keys to string num values.

    Returns:
      A VcfHeader whose infos contain one entry per key of header_num_dict.

    Raises:
      ValueError: if a value is neither a key of parser.field_counts nor
        parseable as an integer.
    """
    infos = {}
    # `items()` is available on both Python 2 and Python 3, whereas
    # `iteritems()` was removed in Python 3.
    for k, v in header_num_dict.items():
        # Membership is tested explicitly so that whatever value the table
        # maps a key to is used as-is.
        if v in parser.field_counts:
            pyvcf_num_field_value = parser.field_counts[v]
        else:
            pyvcf_num_field_value = int(v)
        infos[k] = parser._Info(id=k, num=pyvcf_num_field_value, type=None,
                                desc='', source=None, version=None)
    return vcf_header_io.VcfHeader(infos=infos)
def main(args):
    """Apply call-level and locus-level filters to a VCF and write the result.

    Reads `args.vcf`, applies the call filters and locus filters built from
    `args`, recalculates the AC/REFAC/HET/HWEP/HRUN INFO fields for every
    record that is kept, and writes `<args.out>.vcf` plus the
    `<args.out>.samplog.tab` and `<args.out>.loclog.tab` summaries.

    Args:
        args: parsed command-line options. Attributes read here: vcf, out,
            verbose, num_records, drop_filtered, use_length, die_on_warning
            (further attributes are read by the helper functions).

    Returns:
        0 on success, 1 if the input or output could not be set up, the
        filters are invalid, or a record failed to parse while
        `args.die_on_warning` is set.
    """
    # Load VCF file
    if not os.path.exists(args.vcf):
        common.WARNING("%s does not exist" % args.vcf)
        return 1
    invcf = vcf.Reader(filename=args.vcf)

    # Set up record harmonizer and infer VCF type
    vcftype = trh.InferVCFType(invcf)

    # Check filters all make sense
    if not CheckFilters(invcf, args, vcftype):
        return 1

    # Set up locus-level filter list
    try:
        filter_list = BuildLocusFilters(args, vcftype)
    except ValueError:
        return 1
    invcf.filters = {}
    for f in filter_list:
        # Use the first line of the filter's docstring as its description.
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        invcf.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # Set up call-level filters
    call_filters = BuildCallFilters(args)

    # Add new FORMAT fields
    if "FILTER" not in invcf.formats:
        invcf.formats["FILTER"] = _Format("FILTER", 1, "String",
                                          "Call-level filter")

    # Add new INFO fields
    invcf.infos["AC"] = _Info("AC", -1, "Integer", "Alternate allele counts",
                              source=None, version=None)
    invcf.infos["REFAC"] = _Info("REFAC", 1, "Integer",
                                 "Reference allele count",
                                 source=None, version=None)
    invcf.infos["HET"] = _Info("HET", 1, "Float", "Heterozygosity",
                               source=None, version=None)
    invcf.infos["HWEP"] = _Info("HWEP", 1, "Float",
                                "HWE p-value for obs. vs. exp het rate",
                                source=None, version=None)
    invcf.infos["HRUN"] = _Info("HRUN", 1, "Integer",
                                "Length of longest homopolymer run",
                                source=None, version=None)

    # Set up output files
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING("Output directory does not exist")
        return 1
    outvcf = MakeWriter(args.out + ".vcf", invcf, " ".join(sys.argv))
    if outvcf is None:
        return 1

    # Set up sample info
    all_reasons = GetAllCallFilters(call_filters)
    sample_info = {}
    for s in invcf.samples:
        sample_info[s] = {"numcalls": 0, "totaldp": 0}
        for r in all_reasons:
            sample_info[s][r] = 0

    # Set up locus info
    loc_info = {"totalcalls": 0, "PASS": 0}
    for filt in filter_list:
        loc_info[filt.filter_name()] = 0

    # Go through each record
    record_counter = 0
    while True:
        try:
            record = next(invcf)
        except IndexError:
            common.WARNING(
                "Skipping TR that couldn't be parsed by PyVCF. "
                "Check VCF format")
            if args.die_on_warning:
                return 1
            # Actually skip it: without this the loop would go on with the
            # previous record (or with no record at all on the first pass).
            continue
        except StopIteration:
            break
        if args.verbose:
            common.MSG("Processing %s:%s" % (record.CHROM, record.POS))
        record_counter += 1
        if args.num_records is not None and record_counter > args.num_records:
            break

        # Call-level filters
        record = ApplyCallFilters(record, invcf, call_filters, sample_info)

        # Locus-level filters
        record.FILTER = None
        output_record = True
        for filt in filter_list:
            # A filter returning None means the record passed it.
            if filt(record) is None:
                continue
            if args.drop_filtered:
                output_record = False
                break
            record.add_filter(filt.filter_name())
            loc_info[filt.filter_name()] += 1
        if args.drop_filtered:
            if record.call_rate == 0:
                output_record = False

        if output_record:
            trrecord = trh.HarmonizeRecord(vcftype, record)
            # Recalculate locus-level INFO fields
            record.INFO["HRUN"] = utils.GetHomopolymerRun(record.REF)
            if record.num_called > 0:
                allele_freqs = trrecord.GetAlleleFreqs(
                    uselength=args.use_length)
                genotype_counts = trrecord.GetGenotypeCounts(
                    uselength=args.use_length)
                record.INFO["HET"] = utils.GetHeterozygosity(allele_freqs)
                record.INFO["HWEP"] = utils.GetHardyWeinbergBinomialTest(
                    allele_freqs, genotype_counts)
                # Allele counts are allele frequencies scaled by the number
                # of called alleles. AC and REFAC must use the same total
                # (two alleles per called sample) so that they add up.
                num_alleles = 2 * record.num_called
                record.INFO["AC"] = [
                    int(item * num_alleles) for item in record.aaf
                ]
                record.INFO["REFAC"] = int(
                    (1 - sum(record.aaf)) * num_alleles)
            else:
                # No calls left at this locus: use placeholder values.
                record.INFO["HET"] = -1
                record.INFO["HWEP"] = -1
                record.INFO["AC"] = [0] * len(record.ALT)
                record.INFO["REFAC"] = 0
            # Recalc filter
            if record.FILTER is None and not args.drop_filtered:
                record.FILTER = "PASS"
                loc_info["PASS"] += 1
                loc_info["totalcalls"] += record.num_called
            # Output the record
            outvcf.write_record(record)

    # Output log info
    WriteSampLog(sample_info, all_reasons, args.out + ".samplog.tab")
    WriteLocLog(loc_info, args.out + ".loclog.tab")
    return 0
def add_vcf_header_info(self, vcf_template):
    """
    Register the gnomad_genomes_AN INFO header on the given template.

    :param vcf_template: object whose ``infos`` mapping receives the header
    :return: None
    """
    field_id = 'gnomad_genomes_AN'
    header = _Info(
        field_id,
        '.',
        'Integer',
        'GnomAD genomes allele number value',
        None,
        None,
    )
    vcf_template.infos[field_id] = header
def add_vcf_header_info(self, vcf_template):
    """
    Add the INFO header describing the gnomad_genomes_AN annotation.

    :param vcf_template: object whose ``infos`` mapping is updated in place
    :return: None
    """
    info_key = 'gnomad_genomes_AN'
    description = 'GnomAD genomes allele number value'
    entry = _Info(info_key, '.', 'Integer', description, None, None)
    vcf_template.infos[info_key] = entry