def add_convenience_annotations(annotation):
    """
    Add a bunch of convenience lookups to an annotation.
    This is kind of a historical relic - should try to remove as many as we can
    TODO: yeah let's aim to get rid of this completely

    The dict is modified in place; nothing is returned. Keys written:
    'gene_ids', 'coding_gene_ids', 'worst_vep_annotation_index',
    'worst_vep_index_per_gene', 'annotation_tags', 'vep_consequence'
    and 'vep_group'.

    :param annotation: dict that must already contain 'vep_annotation',
        a sequence of per-transcript dicts each having a 'consequence' key.
    """
    vep_annotation = annotation['vep_annotation']
    annotation['gene_ids'] = vep_annotations.get_gene_ids(vep_annotation)
    annotation["coding_gene_ids"] = vep_annotations.get_coding_gene_ids(vep_annotation)

    worst_index = vep_annotations.get_worst_vep_annotation_index(vep_annotation)
    annotation['worst_vep_annotation_index'] = worst_index

    # Only coding genes end up in this mapping. An earlier pass over *all*
    # gene ids used to populate it first, but its result was always replaced
    # by the coding-only dict, so that pass has been dropped.
    annotation['worst_vep_index_per_gene'] = {
        gene_id: vep_annotations.get_worst_vep_annotation_index(vep_annotation, gene_id=gene_id)
        for gene_id in annotation['coding_gene_ids']
    }

    # De-duplicated consequences across all transcripts (order is not defined).
    annotation['annotation_tags'] = list({a['consequence'] for a in vep_annotation})

    worst_vep_annotation = vep_annotation[worst_index]

    # Both fields stay None when the worst annotation entry is falsy.
    annotation['vep_consequence'] = None
    annotation['vep_group'] = None
    if worst_vep_annotation:
        consequence = worst_vep_annotation['consequence']
        annotation['vep_consequence'] = consequence
        annotation['vep_group'] = constants.ANNOTATION_GROUP_REVERSE_MAP[consequence]
def test_get_worst_vep_annotation_index(self):
    """Exercise get_worst_vep_annotation_index on a fixed set of 12 transcripts.

    Cases covered, in order: a single canonical transcript, two canonical
    transcripts, no canonical transcript, a worse consequence on a
    non-canonical transcript, a protein_coding biotype, and the gene_id
    argument. Each case mutates the shared fixture, so the order matters.
    """
    annotations = [
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000479049',
         'Consequence': 'non_coding_transcript_exon_variant', 'Protein_position': '',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000258104',
         'Consequence': 'stop_gained', 'Protein_position': '1968',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000394120',
         'Consequence': 'stop_gained', 'Protein_position': '1969',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000409366',
         'Consequence': 'stop_gained', 'Protein_position': '1990',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000409582',
         'Consequence': 'stop_gained', 'Protein_position': '2006',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000409651',
         'Consequence': 'stop_gained', 'Protein_position': '2000',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000409744',
         'Consequence': 'stop_gained', 'Protein_position': '1976',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000409762',
         'Consequence': 'stop_gained', 'Protein_position': '1985',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000410020',
         'Consequence': 'stop_gained', 'Protein_position': '2007',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': 'YES'},
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000410041',
         'Consequence': 'stop_gained', 'Protein_position': '1986',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000413539',
         'Consequence': 'stop_gained', 'Protein_position': '1999',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
        {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000429174',
         'Consequence': 'stop_gained', 'Protein_position': '1989',
         'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
    ]

    # convert keys to lower case
    for annot_dict in annotations:
        # Iterate over a snapshot: adding keys while iterating the live
        # items() view raises RuntimeError on Python 3.
        for key, value in list(annot_dict.items()):
            annot_dict[key.lower()] = value
        annot_dict['is_nc'] = False
        annot_dict['is_nmd'] = False

    # test basic case
    self.assertEqual(get_worst_vep_annotation_index(annotations), 8)

    # test 2 annotations being canonical - choose the worst one
    annotations[0]['canonical'] = 'YES'
    self.assertEqual(get_worst_vep_annotation_index(annotations), 8)

    # test 0 annotations being canonical - choose the worst one
    annotations[0]['canonical'] = ''
    annotations[8]['canonical'] = ''
    i = get_worst_vep_annotation_index(annotations)
    # assertEqual, not assertTrue: assertTrue would treat 'stop_gained' as
    # the failure message and pass for any non-empty consequence.
    self.assertEqual(annotations[i]['consequence'], 'stop_gained')
    self.assertEqual(annotations[i]['feature'], 'ENST00000258104')

    # test where worst-affected transcript is not the canonical one
    annotations[1]['consequence'] = 'splice_donor_variant'
    self.assertEqual(get_worst_vep_annotation_index(annotations), 1)
    self.assertFalse(annotations[1]['canonical'])

    # test protein coding filter
    annotations[6]['biotype'] = 'protein_coding'
    self.assertEqual(get_worst_vep_annotation_index(annotations), 6)

    # test the gene_id arg
    annotations[8]['canonical'] = 'YES'
    annotations[1]['gene'] = 'OTHER_GENE1'
    annotations[2]['gene'] = 'OTHER_GENE2'
    self.assertEqual(get_worst_vep_annotation_index(annotations, gene_id='OTHER_GENE1'), 1)
    self.assertEqual(get_worst_vep_annotation_index(annotations, gene_id='OTHER_GENE2'), 2)
def get_output_row(self, variant, xpos, ref, alt, individual_id, family, all_fields=False, comments="", gene_id=""):
    """Build one report row (a list of strings) for a variant in one individual.

    Args:
        variant: object exposing `genotypes` (mapping keyed by individual id),
            `annotation` (dict with 'vep_annotation', 'rsid', 'freqs') and
            `toJSON()`.
        xpos: combined chromosome/position value, decoded with
            genomeloc.get_chr_pos.
        ref: reference allele string.
        alt: alternate allele string.
        individual_id: key into `variant.genotypes`.
        family: object with a `family_id` attribute (used only in messages).
        all_fields: when true and a ClinVar significance was found, the
            ClinVar web page is fetched to look up the star rating.
        comments: free text copied into the last column.
        gene_id: optional gene id; anything after the first '.' is dropped.

    Returns:
        A list of str: gene name, genotype, variant, functional class,
        HGVS c., HGVS p., rsid, ExAC global AF, ExAC popmax AF, ExAC popmax
        population, ClinVar significance, ClinVar review status, number of
        stars, ClinVar URL, comments.
        None (after printing a message) when the individual has no entry in
        `variant.genotypes`, when the genotype's `gq` is None, or when no
        gene_id was given and no VEP annotation has biotype 'protein_coding'.

    Raises:
        ValueError: if the genotype's `num_alt` is None.
    """
    v = variant
    if individual_id not in v.genotypes:
        print("skipping variant: %s because individual %s not in %s" % (
            str(xpos) + " " + ref + ">" + alt, individual_id, family.family_id))
        return None

    # strip off the gene_id suffix (eg. '.3')
    gene_id = gene_id.split(".")[0] if gene_id else None

    genotype = v.genotypes[individual_id]
    if genotype.gq is None:
        print("skipping variant: %s because this variant is not called in this individual (%s)" % (
            str(xpos) + " " + ref + ">" + alt, individual_id))
        return None

    chrom, pos = genomeloc.get_chr_pos(xpos)
    chrom_without_chr = chrom.replace("chr", "")

    annot = v.annotation
    if gene_id:
        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
            annot["vep_annotation"], gene_id=gene_id)
    else:
        # collect the ids of all protein-coding genes touched by this variant
        protein_coding_gene_ids = set(
            a['gene'] for a in annot["vep_annotation"] if a['biotype'] == 'protein_coding')
        if not protein_coding_gene_ids:
            print("skipping variant %s in this individual (%s) because none of the transcripts are protein coding: %s" % (
                str(xpos) + " " + ref + ">" + alt, individual_id, annot))
            return None

        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
            annot["vep_annotation"], gene_id=protein_coding_gene_ids)
        if len(protein_coding_gene_ids) > 1:
            print("Selected %s from %s" % (
                annot["vep_annotation"][worst_vep_annotation_index]['symbol'],
                set([a['symbol'] for a in annot["vep_annotation"] if a['gene'] in protein_coding_gene_ids])))

    vep = annot["vep_annotation"][worst_vep_annotation_index]

    # NOTE(review): the gene-restricted choice above is immediately replaced
    # by the worst annotation across ALL transcripts, so `gene_id` currently
    # only affects the early return and the "Selected ..." message. This is
    # kept as-is to avoid changing report contents - confirm whether the
    # override is intentional.
    worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(annot["vep_annotation"])
    vep = annot["vep_annotation"][worst_vep_annotation_index]

    if "symbol" in vep and "consequence" in vep:
        gene_name = vep["symbol"]  # vep["gene"]
        functional_class = vep["consequence"]
    else:
        gene_name = functional_class = ""
        # the message is formatted from locals(), so the name `vep` must stay
        print("ERROR: gene_name and functional_class not found in annot['vep_annotation'][annot['worst_vep_annotation_index']]: %(vep)s" % locals())

    if genotype.num_alt is None:
        # dump every genotype of the variant to help debugging
        s = "\n\n"
        for i, g in v.genotypes.items():
            s += str(i) + ": " + str(g) + "\n"
        raise ValueError("genotype.num_alt is None: " + str(genotype) + "\n" + str(v.toJSON()) + "\n" + s)

    genotype_str = genotype_map[genotype.num_alt]

    variant_str = "%s:%s %s>%s" % (chrom, pos, ref, alt)

    if "hgvsc" in vep and "hgvsp" in vep:
        hgvs_c = urllib.unquote(vep["hgvsc"])
        hgvs_p = urllib.unquote(vep["hgvsp"])
    else:
        hgvs_c = hgvs_p = ""

    rsid = annot["rsid"] or ""

    exac_global_af, exac_popmax_af, exac_popmax_population = get_exac_af(chrom, pos, ref, alt)
    if exac_global_af is None:
        exac_global_af, exac_popmax_af, exac_popmax_population = 0, 0, "[variant not found in ExACv0.3]"
    else:
        # sanity check the looked-up frequency against the stored annotation
        exac_global_af_annot = str(annot["freqs"]["exac_v3"])
        if abs(float(exac_global_af) - float(exac_global_af_annot)) > 0.01:
            # the annotated value goes first so it lines up with its label
            print("Error annot['freqs']['exac_v3'] (%s) doesn't match %s" % (
                float(exac_global_af_annot), float(exac_global_af)))

    clinvar_clinsig = ""
    clinvar_clnrevstat = ""

    clinvar_records = [
        record for record in clinvar_vcf_file.fetch(chrom_without_chr, pos, pos)
        if record.POS == pos and record.REF == ref
    ]

    if clinvar_records:
        # when several records match, the last one is used
        clinvar_record = clinvar_records[-1]

        # real lists (not map objects) because they are indexed and searched below
        clinvar_allele_indexes = [int(index) for index in clinvar_record.INFO["CLNALLE"]]
        clinvar_alleles = [str(allele) for allele in [clinvar_record.REF] + clinvar_record.ALT]
        xbrowse_alleles = [str(allele) for allele in [ref] + [alt]]

        # positions in the per-allele INFO lists that refer to our ref or alt
        clinvar_value_indexes_to_use = [
            i for i, clinvar_allele_index in enumerate(clinvar_allele_indexes)
            if str(clinvar_alleles[clinvar_allele_index]).upper() in xbrowse_alleles
        ]

        clnrevstat = clinvar_record.INFO["CLNREVSTAT"]
        clnrevstat = [clnrevstat[i] for i in clinvar_value_indexes_to_use]
        clnsig = clinvar_record.INFO["CLNSIG"]
        clnsig = [clnsig[i] for i in clinvar_value_indexes_to_use]
        if clnsig:
            clinvar_clinsig_numbers = [int(number) for number in clnsig[0].split("|")]
            clinvar_clinsig = "|".join(set([
                clinsig_map[clinvar_clinsig_number][0]
                for clinvar_clinsig_number in clinvar_clinsig_numbers
            ]))
            clinvar_clnrevstat = "|".join(set(clnrevstat[0].split("|")))

    # get number_of_stars
    number_of_stars = "[not found]" if all_fields else "[not retrieved to save time]"
    clinvar_url = "http://www.ncbi.nlm.nih.gov/clinvar/?term=" + chrom_without_chr + "[chr]+AND+" + str(pos) + "[chrpos37]"
    if clinvar_clinsig and all_fields:
        print("Reading from: " + clinvar_url)
        url_opener = urllib2.build_opener()
        url_opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11")]
        response = url_opener.open(clinvar_url)
        try:
            page_contents = response.read()
        finally:
            # release the connection even if the read fails
            response.close()
        match = re.search(r"(\d) star.? out of maximum of 4 stars", page_contents)
        if match:
            number_of_stars = int(match.group(1))
        else:
            print("No match in page: " + clinvar_url)
            for line in page_contents.split("\n"):
                if "rev_stat_text hide" in line:
                    print(" -- this line was expected to contain number of stars: " + line)

    row = [str(column) for column in [
        gene_name, genotype_str, variant_str, functional_class, hgvs_c, hgvs_p, rsid,
        exac_global_af, exac_popmax_af, exac_popmax_population,
        clinvar_clinsig, clinvar_clnrevstat, number_of_stars, clinvar_url, comments,
    ]]
    return row
def get_output_row(self, variant, xpos, ref, alt, individual_id, family, all_fields=False, comments="", gene_id=""):
    """Return the report columns for `variant` as seen in `individual_id`.

    The result is a list of 15 strings, in this order: gene name, genotype,
    variant ("chrom:pos ref>alt"), functional class, HGVS c., HGVS p., rsid,
    ExAC global AF, ExAC popmax AF, ExAC popmax population, ClinVar
    significance, ClinVar review status, number of stars, ClinVar URL and
    `comments`.

    None is returned, after printing the reason, when:
      * `individual_id` is not a key of `variant.genotypes`;
      * the individual's genotype has `gq` equal to None;
      * `gene_id` is empty and no VEP annotation has biotype
        'protein_coding'.

    `gene_id` may carry a version suffix such as '.3'; only the part before
    the first '.' is used. With `all_fields` true and a ClinVar significance
    available, the ClinVar web page is downloaded to read the star rating.

    Raises ValueError if the genotype's `num_alt` is None.
    """
    v = variant
    if individual_id not in v.genotypes:
        print("skipping variant: %s because individual %s not in %s" % (
            str(xpos) + " " + ref + ">" + alt, individual_id, family.family_id))
        return None

    # strip off the gene_id suffix (eg. '.3')
    gene_id = gene_id.split(".")[0] if gene_id else None

    genotype = v.genotypes[individual_id]
    if genotype.gq is None:
        print("skipping variant: %s because this variant is not called in this individual (%s)" % (
            str(xpos) + " " + ref + ">" + alt, individual_id))
        return None

    chrom, pos = genomeloc.get_chr_pos(xpos)
    chrom_without_chr = chrom.replace("chr", "")

    annot = v.annotation
    if gene_id:
        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
            annot["vep_annotation"], gene_id=gene_id)
    else:
        # gather every protein-coding gene id present in the VEP annotations
        protein_coding_gene_ids = set(
            a['gene'] for a in annot["vep_annotation"] if a['biotype'] == 'protein_coding')
        if not protein_coding_gene_ids:
            print("skipping variant %s in this individual (%s) because none of the transcripts are protein coding: %s" % (
                str(xpos) + " " + ref + ">" + alt, individual_id, annot))
            return None

        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
            annot["vep_annotation"], gene_id=protein_coding_gene_ids)
        if len(protein_coding_gene_ids) > 1:
            print("Selected %s from %s" % (
                annot["vep_annotation"][worst_vep_annotation_index]['symbol'],
                set([a['symbol'] for a in annot["vep_annotation"] if a['gene'] in protein_coding_gene_ids])))

    vep = annot["vep_annotation"][worst_vep_annotation_index]

    # NOTE(review): the index chosen above is discarded here in favour of the
    # worst annotation over all transcripts, regardless of `gene_id`. Kept
    # unchanged so existing reports do not shift - confirm this is intended.
    worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(annot["vep_annotation"])
    vep = annot["vep_annotation"][worst_vep_annotation_index]

    if "symbol" in vep and "consequence" in vep:
        gene_name = vep["symbol"]  # vep["gene"]
        functional_class = vep["consequence"]
    else:
        gene_name = functional_class = ""
        # %(vep)s is resolved through locals(); do not rename `vep`
        print("ERROR: gene_name and functional_class not found in annot['vep_annotation'][annot['worst_vep_annotation_index']]: %(vep)s" % locals())

    if genotype.num_alt is None:
        # include all genotypes of this variant in the error for context
        s = "\n\n"
        for i, g in v.genotypes.items():
            s += str(i) + ": " + str(g) + "\n"
        raise ValueError("genotype.num_alt is None: " + str(genotype) + "\n" + str(v.toJSON()) + "\n" + s)

    genotype_str = genotype_map[genotype.num_alt]

    variant_str = "%s:%s %s>%s" % (chrom, pos, ref, alt)

    if "hgvsc" in vep and "hgvsp" in vep:
        hgvs_c = urllib.unquote(vep["hgvsc"])
        hgvs_p = urllib.unquote(vep["hgvsp"])
    else:
        hgvs_c = hgvs_p = ""

    rsid = annot["rsid"] or ""

    exac_global_af, exac_popmax_af, exac_popmax_population = get_exac_af(chrom, pos, ref, alt)
    if exac_global_af is None:
        exac_global_af, exac_popmax_af, exac_popmax_population = 0, 0, "[variant not found in ExACv0.3]"
    else:
        # cross-check against the frequency stored with the annotation
        exac_global_af_annot = str(annot["freqs"]["exac_v3"])
        if abs(float(exac_global_af) - float(exac_global_af_annot)) > 0.01:
            # report the annotated value under its own label (arguments were
            # previously passed in the opposite order)
            print("Error annot['freqs']['exac_v3'] (%s) doesn't match %s" % (
                float(exac_global_af_annot), float(exac_global_af)))

    clinvar_clinsig = ""
    clinvar_clnrevstat = ""

    clinvar_records = [
        record for record in clinvar_vcf_file.fetch(chrom_without_chr, pos, pos)
        if record.POS == pos and record.REF == ref
    ]

    if clinvar_records:
        # multiple matches are tolerated; the last record wins
        clinvar_record = clinvar_records[-1]

        # build concrete lists: they are subscripted and used with `in` below
        clinvar_allele_indexes = [int(index) for index in clinvar_record.INFO["CLNALLE"]]
        clinvar_alleles = [str(allele) for allele in [clinvar_record.REF] + clinvar_record.ALT]
        xbrowse_alleles = [str(allele) for allele in [ref] + [alt]]

        # keep only the INFO entries whose allele equals our ref or alt
        clinvar_value_indexes_to_use = [
            i for i, clinvar_allele_index in enumerate(clinvar_allele_indexes)
            if str(clinvar_alleles[clinvar_allele_index]).upper() in xbrowse_alleles
        ]

        clnrevstat = clinvar_record.INFO["CLNREVSTAT"]
        clnrevstat = [clnrevstat[i] for i in clinvar_value_indexes_to_use]
        clnsig = clinvar_record.INFO["CLNSIG"]
        clnsig = [clnsig[i] for i in clinvar_value_indexes_to_use]
        if clnsig:
            clinvar_clinsig_numbers = [int(number) for number in clnsig[0].split("|")]
            clinvar_clinsig = "|".join(set([
                clinsig_map[clinvar_clinsig_number][0]
                for clinvar_clinsig_number in clinvar_clinsig_numbers
            ]))
            clinvar_clnrevstat = "|".join(set(clnrevstat[0].split("|")))

    # get number_of_stars
    number_of_stars = "[not found]" if all_fields else "[not retrieved to save time]"
    clinvar_url = "http://www.ncbi.nlm.nih.gov/clinvar/?term=" + chrom_without_chr + "[chr]+AND+" + str(pos) + "[chrpos37]"
    if clinvar_clinsig and all_fields:
        print("Reading from: " + clinvar_url)
        url_opener = urllib2.build_opener()
        url_opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11")]
        response = url_opener.open(clinvar_url)
        try:
            page_contents = response.read()
        finally:
            # always close the HTTP response, even when read() raises
            response.close()
        match = re.search(r"(\d) star.? out of maximum of 4 stars", page_contents)
        if match:
            number_of_stars = int(match.group(1))
        else:
            print("No match in page: " + clinvar_url)
            for line in page_contents.split("\n"):
                if "rev_stat_text hide" in line:
                    print(" -- this line was expected to contain number of stars: " + line)

    row = [str(column) for column in [
        gene_name, genotype_str, variant_str, functional_class, hgvs_c, hgvs_p, rsid,
        exac_global_af, exac_popmax_af, exac_popmax_population,
        clinvar_clinsig, clinvar_clnrevstat, number_of_stars, clinvar_url, comments,
    ]]
    return row
def test_get_worst_vep_annotation_index(self):
    """Verify the index returned by get_worst_vep_annotation_index.

    The fixture holds 12 transcript annotations of one gene; index 0 is a
    non-coding exon variant, the rest are stop_gained, and index 8 is the
    only canonical transcript. The scenarios below mutate the fixture step
    by step, so they depend on running in this order.
    """
    def make_annotation(feature, consequence, protein_position, canonical=''):
        # Every row shares the same constant fields; only these four vary.
        return {
            'Feature_type': 'Transcript',
            'biotype': 'other',
            'GMAF': '',
            'Feature': feature,
            'Consequence': consequence,
            'Protein_position': protein_position,
            'Gene': 'ENSG00000135636',
            'STRAND': '1',
            'CANONICAL': canonical,
        }

    annotations = [
        make_annotation('ENST00000479049', 'non_coding_transcript_exon_variant', ''),
        make_annotation('ENST00000258104', 'stop_gained', '1968'),
        make_annotation('ENST00000394120', 'stop_gained', '1969'),
        make_annotation('ENST00000409366', 'stop_gained', '1990'),
        make_annotation('ENST00000409582', 'stop_gained', '2006'),
        make_annotation('ENST00000409651', 'stop_gained', '2000'),
        make_annotation('ENST00000409744', 'stop_gained', '1976'),
        make_annotation('ENST00000409762', 'stop_gained', '1985'),
        make_annotation('ENST00000410020', 'stop_gained', '2007', canonical='YES'),
        make_annotation('ENST00000410041', 'stop_gained', '1986'),
        make_annotation('ENST00000413539', 'stop_gained', '1999'),
        make_annotation('ENST00000429174', 'stop_gained', '1989'),
    ]

    # convert keys to lower case
    for annot_dict in annotations:
        # list(...) takes a snapshot so that inserting the lower-cased keys
        # does not change the dict while it is being iterated.
        for key, value in list(annot_dict.items()):
            annot_dict[key.lower()] = value
        annot_dict['is_nc'] = False
        annot_dict['is_nmd'] = False

    # test basic case
    self.assertEqual(get_worst_vep_annotation_index(annotations), 8)

    # test 2 annotations being canonical - choose the worst one
    annotations[0]['canonical'] = 'YES'
    self.assertEqual(get_worst_vep_annotation_index(annotations), 8)

    # test 0 annotations being canonical - choose the worst one
    annotations[0]['canonical'] = ''
    annotations[8]['canonical'] = ''
    i = get_worst_vep_annotation_index(annotations)
    # Compare the value: assertTrue(x, 'stop_gained') would only check that
    # x is truthy and use the string as the failure message.
    self.assertEqual(annotations[i]['consequence'], 'stop_gained')
    self.assertEqual(annotations[i]['feature'], 'ENST00000258104')

    # test where worst-affected transcript is not the canonical one
    annotations[1]['consequence'] = 'splice_donor_variant'
    self.assertEqual(get_worst_vep_annotation_index(annotations), 1)
    self.assertFalse(annotations[1]['canonical'])

    # test protein coding filter
    annotations[6]['biotype'] = 'protein_coding'
    self.assertEqual(get_worst_vep_annotation_index(annotations), 6)

    # test the gene_id arg
    annotations[8]['canonical'] = 'YES'
    annotations[1]['gene'] = 'OTHER_GENE1'
    annotations[2]['gene'] = 'OTHER_GENE2'
    self.assertEqual(
        get_worst_vep_annotation_index(annotations, gene_id='OTHER_GENE1'), 1)
    self.assertEqual(
        get_worst_vep_annotation_index(annotations, gene_id='OTHER_GENE2'), 2)