Example #1
0
def add_convenience_annotations(annotation):
    """
    Add a bunch of convenience lookups to an annotation.

    Reads ``annotation['vep_annotation']`` and mutates ``annotation`` in
    place, setting these keys:

    - ``gene_ids`` / ``coding_gene_ids``: from ``vep_annotations.get_gene_ids``
      and ``vep_annotations.get_coding_gene_ids``
    - ``worst_vep_annotation_index``: index of the worst VEP annotation overall
    - ``annotation_tags``: the distinct ``consequence`` values (built from a
      set, so the order is arbitrary)
    - ``worst_vep_index_per_gene``: worst index per gene, for the coding
      genes only
    - ``vep_consequence`` / ``vep_group``: consequence of the worst annotation
      and its group from ``constants.ANNOTATION_GROUP_REVERSE_MAP``; both stay
      None when the worst annotation is falsy

    This is kind of a historical relic - should try to remove as many as we can
    TODO: yeah let's aim to get rid of this completely
    """
    vep_annotation = annotation['vep_annotation']
    annotation['gene_ids'] = vep_annotations.get_gene_ids(vep_annotation)
    annotation['coding_gene_ids'] = vep_annotations.get_coding_gene_ids(vep_annotation)
    annotation['worst_vep_annotation_index'] = vep_annotations.get_worst_vep_annotation_index(vep_annotation)
    annotation['annotation_tags'] = list({a['consequence'] for a in vep_annotation})

    # Only coding genes are indexed here; non-coding genes get no entry.
    annotation['worst_vep_index_per_gene'] = {
        gene_id: vep_annotations.get_worst_vep_annotation_index(vep_annotation, gene_id=gene_id)
        for gene_id in annotation['coding_gene_ids']
    }

    worst_vep_annotation = vep_annotation[annotation['worst_vep_annotation_index']]

    annotation['vep_consequence'] = None
    annotation['vep_group'] = None
    if worst_vep_annotation:
        annotation['vep_consequence'] = worst_vep_annotation['consequence']
        annotation['vep_group'] = constants.ANNOTATION_GROUP_REVERSE_MAP[annotation['vep_consequence']]
Example #2
0
def add_convenience_annotations(annotation):
    """
    Add a bunch of convenience lookups to an annotation.

    The annotation dict is updated in place from its ``'vep_annotation'``
    list. Keys written:

    - ``gene_ids``, ``coding_gene_ids``
    - ``worst_vep_annotation_index`` (overall worst VEP annotation)
    - ``annotation_tags`` (distinct ``consequence`` values, arbitrary order)
    - ``worst_vep_index_per_gene`` (one entry per coding gene)
    - ``vep_consequence`` and ``vep_group`` (None unless the worst annotation
      is truthy)

    This is kind of a historical relic - should try to remove as many as we can
    TODO: yeah let's aim to get rid of this completely
    """
    vep_annotation = annotation['vep_annotation']
    worst_index_for = vep_annotations.get_worst_vep_annotation_index

    annotation['gene_ids'] = vep_annotations.get_gene_ids(vep_annotation)
    annotation['coding_gene_ids'] = vep_annotations.get_coding_gene_ids(vep_annotation)
    annotation['worst_vep_annotation_index'] = worst_index_for(vep_annotation)
    annotation['annotation_tags'] = list({a['consequence'] for a in vep_annotation})

    # The per-gene lookup covers coding genes only.
    per_gene = {}
    for gene_id in annotation['coding_gene_ids']:
        per_gene[gene_id] = worst_index_for(vep_annotation, gene_id=gene_id)
    annotation['worst_vep_index_per_gene'] = per_gene

    worst_vep_annotation = vep_annotation[annotation['worst_vep_annotation_index']]

    annotation['vep_consequence'] = None
    annotation['vep_group'] = None
    if worst_vep_annotation:
        consequence = worst_vep_annotation['consequence']
        annotation['vep_consequence'] = consequence
        annotation['vep_group'] = constants.ANNOTATION_GROUP_REVERSE_MAP[consequence]
Example #3
0
    def test_get_worst_vep_annotation_index(self):
        """Check which index get_worst_vep_annotation_index picks as the fixture is varied.

        Covers: one canonical transcript, two canonical transcripts, no
        canonical transcript, a non-canonical transcript with a different
        consequence, a protein_coding transcript, and the gene_id argument.
        """
        annotations = [
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000479049', 'Consequence': 'non_coding_transcript_exon_variant', 'Protein_position': '', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000258104', 'Consequence': 'stop_gained', 'Protein_position': '1968', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000394120', 'Consequence': 'stop_gained', 'Protein_position': '1969', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000409366', 'Consequence': 'stop_gained', 'Protein_position': '1990', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000409582', 'Consequence': 'stop_gained', 'Protein_position': '2006', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000409651', 'Consequence': 'stop_gained', 'Protein_position': '2000', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000409744', 'Consequence': 'stop_gained', 'Protein_position': '1976', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000409762', 'Consequence': 'stop_gained', 'Protein_position': '1985', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000410020', 'Consequence': 'stop_gained', 'Protein_position': '2007', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': 'YES'},
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000410041', 'Consequence': 'stop_gained', 'Protein_position': '1986', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000413539', 'Consequence': 'stop_gained', 'Protein_position': '1999', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
            {'Feature_type': 'Transcript', 'biotype': 'other', 'GMAF': '', 'Feature': 'ENST00000429174', 'Consequence': 'stop_gained', 'Protein_position': '1989', 'Gene': 'ENSG00000135636', 'STRAND': '1', 'CANONICAL': ''},
        ]

        # add a lower-case copy of every key; iterate over a snapshot because
        # the dict grows while the copies are inserted
        for annot_dict in annotations:
            for key, value in list(annot_dict.items()):
                annot_dict[key.lower()] = value
            annot_dict['is_nc'] = False
            annot_dict['is_nmd'] = False

        # test basic case
        self.assertEqual(get_worst_vep_annotation_index(annotations), 8)

        # test 2 annotations being canonical - choose the worst one
        annotations[0]['canonical'] = 'YES'
        self.assertEqual(get_worst_vep_annotation_index(annotations), 8)

        # test 0 annotations being canonical - choose the worst one
        annotations[0]['canonical'] = ''
        annotations[8]['canonical'] = ''
        i = get_worst_vep_annotation_index(annotations)
        # assertEqual, not assertTrue: the second argument of assertTrue is
        # only a failure message, so it would never compare the values
        self.assertEqual(annotations[i]['consequence'], 'stop_gained')
        self.assertEqual(annotations[i]['feature'], 'ENST00000258104')

        # test where worst-affected transcript is not the canonical one
        annotations[1]['consequence'] = 'splice_donor_variant'
        self.assertEqual(get_worst_vep_annotation_index(annotations), 1)
        self.assertFalse(annotations[1]['canonical'])

        # test protein coding filter
        annotations[6]['biotype'] = 'protein_coding'
        self.assertEqual(get_worst_vep_annotation_index(annotations), 6)

        # test the gene_id arg
        annotations[8]['canonical'] = 'YES'
        annotations[1]['gene'] = 'OTHER_GENE1'
        annotations[2]['gene'] = 'OTHER_GENE2'
        self.assertEqual(get_worst_vep_annotation_index(annotations, gene_id='OTHER_GENE1'), 1)
        self.assertEqual(get_worst_vep_annotation_index(annotations, gene_id='OTHER_GENE2'), 2)
        # NOTE(review): nothing is asserted after this change in this method
        annotations[8]['canonical'] = 'NO'
Example #4
0
    def get_output_row(self,
                       variant,
                       xpos,
                       ref,
                       alt,
                       individual_id,
                       family,
                       all_fields=False,
                       comments="",
                       gene_id=""):
        """Build one output row describing `variant` as seen in `individual_id`.

        Args:
            variant: object exposing `genotypes` (mapping of individual id to
                genotype), `annotation` (dict) and `toJSON()`.
            xpos: combined chromosome/position value, decoded with
                `genomeloc.get_chr_pos`.
            ref: reference allele string.
            alt: alternate allele string.
            individual_id: key into `variant.genotypes`.
            family: object with a `family_id` attribute (used in messages only).
            all_fields: when true and a ClinVar significance was found, the
                ClinVar web page is fetched to read the star rating.
            comments: free text copied into the last column.
            gene_id: optional gene id; any '.N' suffix is stripped.

        Returns:
            A list of 15 strings (gene name, genotype, variant, functional
            class, HGVS c., HGVS p., rsid, ExAC global AF, ExAC popmax AF,
            ExAC popmax population, ClinVar significance, ClinVar review
            status, number of stars, ClinVar URL, comments), or None when the
            individual has no genotype for the variant, the genotype's `gq`
            is None, or no gene_id was given and none of the transcripts are
            protein coding.

        Raises:
            ValueError: if the genotype's `num_alt` is None.
        """
        v = variant
        variant_label = "%s %s>%s" % (xpos, ref, alt)
        if individual_id not in v.genotypes:
            print("skipping variant: %s because individual %s not in %s" %
                  (variant_label, individual_id, family.family_id))
            return None

        # strip off the gene_id suffix (eg. '.3')
        gene_id = gene_id.split(".")[0] if gene_id else None

        genotype = v.genotypes[individual_id]
        if genotype.gq is None:
            print(
                "skipping variant: %s because this variant is not called in this individual (%s)"
                % (variant_label, individual_id))
            return None

        chrom, pos = genomeloc.get_chr_pos(xpos)
        chrom_without_chr = chrom.replace("chr", "")

        annot = v.annotation
        vep_list = annot["vep_annotation"]
        if gene_id:
            worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
                vep_list, gene_id=gene_id)
        else:
            # collect the gene ids of all protein coding transcripts
            protein_coding_gene_ids = set(a['gene'] for a in vep_list
                                          if a['biotype'] == 'protein_coding')
            if not protein_coding_gene_ids:
                print(
                    "skipping variant %s in this individual (%s) because none of the transcripts are protein coding: %s"
                    % (variant_label, individual_id, annot))
                return None

            worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
                vep_list, gene_id=protein_coding_gene_ids)
            if len(protein_coding_gene_ids) > 1:
                print("Selected %s from %s" %
                      (vep_list[worst_vep_annotation_index]['symbol'],
                       set([
                           a['symbol'] for a in vep_list
                           if a['gene'] in protein_coding_gene_ids
                       ])))

        # Keys seen on a vep annotation: ea_maf, swissprot, existing_variation,
        # pubmed, aa_maf, ccds, high_inf_pos, cdna_position, canonical, tsl,
        # feature_type, intron, trembl, feature, codons, polyphen, clin_sig,
        # motif_pos, protein_position, afr_maf, amino_acids, cds_position,
        # symbol, uniparc, eur_maf, hgnc_id, consequence, sift, exon, biotype,
        # is_nc, gmaf, motif_name, strand, motif_score_change, distance, hgvsp,
        # ensp, allele, symbol_source, amr_maf, somatic, hgvsc, asn_maf,
        # is_nmd, domains, gene
        vep = vep_list[worst_vep_annotation_index]

        # NOTE(review): the gene-specific selection above is discarded here;
        # the row is always built from the overall worst annotation. Confirm
        # whether this override is intentional.
        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
            vep_list)
        vep = vep_list[worst_vep_annotation_index]

        if "symbol" in vep and "consequence" in vep:
            gene_name = vep["symbol"]
            functional_class = vep["consequence"]
        else:
            gene_name = functional_class = ""
            # the message is filled from locals(), so the `vep` name must exist
            print(
                "ERROR: gene_name and functional_class not found in annot['vep_annotation'][annot['worst_vep_annotation_index']]: %(vep)s"
                % locals())
        if genotype.num_alt is None:
            s = "\n\n" + "".join(
                str(i) + ": " + str(g) + "\n" for i, g in v.genotypes.items())
            raise ValueError("genotype.num_alt is None: " + str(genotype) +
                             "\n" + str(v.toJSON()) + "\n" + s)

        genotype_str = genotype_map[genotype.num_alt]

        variant_str = "%s:%s %s>%s" % (chrom, pos, ref, alt)
        if "hgvsc" in vep and "hgvsp" in vep:
            hgvs_c = urllib.unquote(vep["hgvsc"])
            hgvs_p = urllib.unquote(vep["hgvsp"])
        else:
            hgvs_c = hgvs_p = ""

        rsid = annot["rsid"] or ""

        exac_global_af, exac_popmax_af, exac_popmax_population = get_exac_af(
            chrom, pos, ref, alt)
        if exac_global_af is None:
            exac_global_af, exac_popmax_af, exac_popmax_population = 0, 0, "[variant not found in ExACv0.3]"
        else:
            exac_global_af_annot = str(annot["freqs"]["exac_v3"])
            if abs(float(exac_global_af) - float(exac_global_af_annot)) > 0.01:
                # the annotated frequency goes in the parentheses, matching
                # the label in the message
                print(
                    "Error annot['freqs']['exac_v3']  (%s) doesn't match %s" %
                    (float(exac_global_af_annot), float(exac_global_af)))

        clinvar_clinsig = ""
        clinvar_clnrevstat = ""

        clinvar_records = [
            record
            for record in clinvar_vcf_file.fetch(chrom_without_chr, pos, pos)
            if record.POS == pos and record.REF == ref
        ]

        if clinvar_records:
            # when several records match, the last one is used
            clinvar_record = clinvar_records[-1]
            clinvar_allele_indexes = [
                int(index) for index in clinvar_record.INFO["CLNALLE"]
            ]
            clinvar_alleles = [
                str(allele)
                for allele in [clinvar_record.REF] + clinvar_record.ALT
            ]
            xbrowse_alleles = [str(ref), str(alt)]
            # A negative CLNALLE value means "no matching allele"; skip it so
            # it is not treated as a from-the-end Python index.
            clinvar_value_indexes_to_use = [
                i for i, clinvar_allele_index in enumerate(
                    clinvar_allele_indexes)
                if clinvar_allele_index >= 0
                and clinvar_alleles[clinvar_allele_index].upper() in
                xbrowse_alleles
            ]
            clnrevstat = clinvar_record.INFO["CLNREVSTAT"]
            clnrevstat = [clnrevstat[i] for i in clinvar_value_indexes_to_use]
            clnsig = clinvar_record.INFO["CLNSIG"]
            clnsig = [clnsig[i] for i in clinvar_value_indexes_to_use]
            if clnsig:
                clinvar_clinsig_numbers = [
                    int(number) for number in clnsig[0].split("|")
                ]
                clinvar_clinsig = "|".join(
                    set([
                        clinsig_map[clinvar_clinsig_number][0]
                        for clinvar_clinsig_number in clinvar_clinsig_numbers
                    ]))

                clinvar_clnrevstat = "|".join(set(clnrevstat[0].split("|")))

        # the star rating is only looked up on request, since it needs a web fetch
        number_of_stars = "[not found]" if all_fields else "[not retrieved to save time]"
        clinvar_url = "http://www.ncbi.nlm.nih.gov/clinvar/?term=" + chrom_without_chr + "[chr]+AND+" + str(
            pos) + "[chrpos37]"
        if clinvar_clinsig and all_fields:
            print("Reading from: " + clinvar_url)
            url_opener = urllib2.build_opener()
            url_opener.addheaders = [(
                'User-agent',
                "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"
            )]
            response = url_opener.open(clinvar_url)
            try:
                page_contents = response.read()
            finally:
                response.close()
            match = re.search(r"(\d) star.? out of maximum of 4 stars",
                              page_contents)
            if match:
                number_of_stars = int(match.group(1))
            else:
                print("No match in page: " + clinvar_url)
                for line in page_contents.split("\n"):
                    if "rev_stat_text hide" in line:
                        print(
                            " -- this line was expected to contain number of stars: "
                            + line)

        row = [
            str(value) for value in (
                gene_name, genotype_str, variant_str, functional_class,
                hgvs_c, hgvs_p, rsid, exac_global_af, exac_popmax_af,
                exac_popmax_population, clinvar_clinsig, clinvar_clnrevstat,
                number_of_stars, clinvar_url, comments)
        ]
        return row
    def get_output_row(self, variant, xpos, ref, alt, individual_id, family, all_fields=False, comments="", gene_id=""):
        """Return the output row for `variant` in `individual_id`, or None if it is skipped.

        Args:
            variant: object with `genotypes` (individual id -> genotype),
                `annotation` (dict) and `toJSON()`.
            xpos: combined chromosome/position, decoded by `genomeloc.get_chr_pos`.
            ref: reference allele string.
            alt: alternate allele string.
            individual_id: key looked up in `variant.genotypes`.
            family: object with `family_id`; only used in the skip message.
            all_fields: if true and a ClinVar significance was found, fetch the
                ClinVar web page to read the star rating.
            comments: text copied into the last column.
            gene_id: optional gene id; a '.N' suffix is stripped.

        Returns:
            A list of 15 strings: gene name, genotype, variant, functional
            class, HGVS c., HGVS p., rsid, ExAC global AF, ExAC popmax AF,
            ExAC popmax population, ClinVar significance, ClinVar review
            status, number of stars, ClinVar URL, comments. None is returned
            (after printing why) when the individual is not in the variant's
            genotypes, the genotype's `gq` is None, or no gene_id was given
            and no transcript is protein coding.

        Raises:
            ValueError: if the genotype's `num_alt` is None.
        """
        v = variant
        variant_label = "%s %s>%s" % (xpos, ref, alt)
        if individual_id not in v.genotypes:
            print("skipping variant: %s because individual %s not in %s" % (variant_label, individual_id, family.family_id))
            return None

        gene_id = gene_id.split(".")[0] if gene_id else None  # strip off the gene_id suffix (eg. '.3')

        genotype = v.genotypes[individual_id]
        if genotype.gq is None:
            print("skipping variant: %s because this variant is not called in this individual (%s)"  % (variant_label, individual_id))
            return None

        chrom, pos = genomeloc.get_chr_pos(xpos)
        chrom_without_chr = chrom.replace("chr", "")

        annot = v.annotation
        vep_list = annot["vep_annotation"]
        if gene_id:
            worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(vep_list, gene_id=gene_id)
        else:
            # collect the gene ids of all protein coding transcripts
            protein_coding_gene_ids = set(a['gene'] for a in vep_list if a['biotype'] == 'protein_coding')
            if not protein_coding_gene_ids:
                print("skipping variant %s in this individual (%s) because none of the transcripts are protein coding: %s"  % (variant_label, individual_id, annot))
                return None

            worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(vep_list, gene_id=protein_coding_gene_ids)
            if len(protein_coding_gene_ids) > 1:
                candidate_symbols = set([a['symbol'] for a in vep_list if a['gene'] in protein_coding_gene_ids])
                print("Selected %s from %s" % (vep_list[worst_vep_annotation_index]['symbol'], candidate_symbols))

        # Keys seen on a vep annotation: ea_maf, swissprot, existing_variation,
        # pubmed, aa_maf, ccds, high_inf_pos, cdna_position, canonical, tsl,
        # feature_type, intron, trembl, feature, codons, polyphen, clin_sig,
        # motif_pos, protein_position, afr_maf, amino_acids, cds_position,
        # symbol, uniparc, eur_maf, hgnc_id, consequence, sift, exon, biotype,
        # is_nc, gmaf, motif_name, strand, motif_score_change, distance, hgvsp,
        # ensp, allele, symbol_source, amr_maf, somatic, hgvsc, asn_maf,
        # is_nmd, domains, gene
        vep = vep_list[worst_vep_annotation_index]

        # NOTE(review): the gene-specific selection above is discarded here;
        # the row is always built from the overall worst annotation. Confirm
        # whether this override is intentional.
        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(vep_list)
        vep = vep_list[worst_vep_annotation_index]

        if "symbol" in vep and "consequence" in vep:
            gene_name = vep["symbol"]
            functional_class = vep["consequence"]
        else:
            gene_name = functional_class = ""
            # the message is filled from locals(), so the `vep` name must exist
            print("ERROR: gene_name and functional_class not found in annot['vep_annotation'][annot['worst_vep_annotation_index']]: %(vep)s" % locals())
        if genotype.num_alt is None:
            s = "\n\n" + "".join(str(i) + ": " + str(g) + "\n" for i, g in v.genotypes.items())
            raise ValueError("genotype.num_alt is None: " + str(genotype) + "\n" + str(v.toJSON()) + "\n" + s)

        genotype_str = genotype_map[genotype.num_alt]

        variant_str = "%s:%s %s>%s" % (chrom, pos, ref, alt)
        if "hgvsc" in vep and "hgvsp" in vep:
            hgvs_c = urllib.unquote(vep["hgvsc"])
            hgvs_p = urllib.unquote(vep["hgvsp"])
        else:
            hgvs_c = hgvs_p = ""

        rsid = annot["rsid"] or ""

        exac_global_af, exac_popmax_af, exac_popmax_population = get_exac_af(chrom, pos, ref, alt)
        if exac_global_af is None:
            exac_global_af, exac_popmax_af, exac_popmax_population = 0, 0, "[variant not found in ExACv0.3]"
        else:
            exac_global_af_annot = str(annot["freqs"]["exac_v3"])
            if abs(float(exac_global_af) - float(exac_global_af_annot)) > 0.01:
                # the annotated frequency goes in the parentheses, matching the label in the message
                print("Error annot['freqs']['exac_v3']  (%s) doesn't match %s" % (float(exac_global_af_annot), float(exac_global_af)))

        clinvar_clinsig = ""
        clinvar_clnrevstat = ""

        clinvar_records = [record for record in clinvar_vcf_file.fetch(chrom_without_chr, pos, pos) if record.POS == pos and record.REF == ref]

        if clinvar_records:
            # when several records match, the last one is used
            clinvar_record = clinvar_records[-1]
            clinvar_allele_indexes = [int(index) for index in clinvar_record.INFO["CLNALLE"]]
            clinvar_alleles = [str(allele) for allele in [clinvar_record.REF] + clinvar_record.ALT]
            xbrowse_alleles = [str(ref), str(alt)]
            # A negative CLNALLE value means "no matching allele"; skip it so
            # it is not treated as a from-the-end Python index.
            clinvar_value_indexes_to_use = [
                i for i, clinvar_allele_index in enumerate(clinvar_allele_indexes)
                if clinvar_allele_index >= 0 and clinvar_alleles[clinvar_allele_index].upper() in xbrowse_alleles
            ]
            clnrevstat = clinvar_record.INFO["CLNREVSTAT"]
            clnrevstat = [clnrevstat[i] for i in clinvar_value_indexes_to_use]
            clnsig = clinvar_record.INFO["CLNSIG"]
            clnsig = [clnsig[i] for i in clinvar_value_indexes_to_use]
            if clnsig:
                clinvar_clinsig_numbers = [int(number) for number in clnsig[0].split("|")]
                clinvar_clinsig = "|".join(set([clinsig_map[clinvar_clinsig_number][0] for clinvar_clinsig_number in clinvar_clinsig_numbers]))

                clinvar_clnrevstat = "|".join(set(clnrevstat[0].split("|")))

        # the star rating is only looked up on request, since it needs a web fetch
        number_of_stars = "[not found]" if all_fields else "[not retrieved to save time]"
        clinvar_url = "http://www.ncbi.nlm.nih.gov/clinvar/?term="+chrom_without_chr+"[chr]+AND+"+str(pos)+"[chrpos37]"
        if clinvar_clinsig and all_fields:
            print("Reading from: " + clinvar_url)
            url_opener = urllib2.build_opener()
            url_opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11")]
            response = url_opener.open(clinvar_url)
            try:
                page_contents = response.read()
            finally:
                response.close()
            match = re.search(r"(\d) star.? out of maximum of 4 stars", page_contents)
            if match:
                number_of_stars = int(match.group(1))
            else:
                print("No match in page: " + clinvar_url)
                for line in page_contents.split("\n"):
                    if "rev_stat_text hide" in line:
                        print(" -- this line was expected to contain number of stars: " + line)

        columns = (gene_name, genotype_str, variant_str, functional_class, hgvs_c, hgvs_p, rsid, exac_global_af, exac_popmax_af, exac_popmax_population, clinvar_clinsig, clinvar_clnrevstat, number_of_stars, clinvar_url, comments)
        row = [str(value) for value in columns]
        return row
Example #6
0
    def test_get_worst_vep_annotation_index(self):
        """Check which index get_worst_vep_annotation_index picks as the fixture is varied.

        Covers: one canonical transcript, two canonical transcripts, no
        canonical transcript, a non-canonical transcript with a different
        consequence, a protein_coding transcript, and the gene_id argument.
        """
        # (Feature, Consequence, Protein_position, CANONICAL) for each
        # transcript; every other field is identical across the fixture.
        transcripts = [
            ('ENST00000479049', 'non_coding_transcript_exon_variant', '', ''),
            ('ENST00000258104', 'stop_gained', '1968', ''),
            ('ENST00000394120', 'stop_gained', '1969', ''),
            ('ENST00000409366', 'stop_gained', '1990', ''),
            ('ENST00000409582', 'stop_gained', '2006', ''),
            ('ENST00000409651', 'stop_gained', '2000', ''),
            ('ENST00000409744', 'stop_gained', '1976', ''),
            ('ENST00000409762', 'stop_gained', '1985', ''),
            ('ENST00000410020', 'stop_gained', '2007', 'YES'),
            ('ENST00000410041', 'stop_gained', '1986', ''),
            ('ENST00000413539', 'stop_gained', '1999', ''),
            ('ENST00000429174', 'stop_gained', '1989', ''),
        ]

        annotations = []
        for feature, consequence, protein_position, canonical in transcripts:
            annot_dict = {
                'Feature_type': 'Transcript',
                'biotype': 'other',
                'GMAF': '',
                'Feature': feature,
                'Consequence': consequence,
                'Protein_position': protein_position,
                'Gene': 'ENSG00000135636',
                'STRAND': '1',
                'CANONICAL': canonical,
            }
            # add a lower-case copy of every key; iterate over a snapshot
            # because the dict grows while the copies are inserted
            for key, value in list(annot_dict.items()):
                annot_dict[key.lower()] = value
            annot_dict['is_nc'] = False
            annot_dict['is_nmd'] = False
            annotations.append(annot_dict)

        # test basic case
        self.assertEqual(get_worst_vep_annotation_index(annotations), 8)

        # test 2 annotations being canonical - choose the worst one
        annotations[0]['canonical'] = 'YES'
        self.assertEqual(get_worst_vep_annotation_index(annotations), 8)

        # test 0 annotations being canonical - choose the worst one
        annotations[0]['canonical'] = ''
        annotations[8]['canonical'] = ''
        i = get_worst_vep_annotation_index(annotations)
        # assertEqual, not assertTrue: the second argument of assertTrue is
        # only a failure message, so it would never compare the values
        self.assertEqual(annotations[i]['consequence'], 'stop_gained')
        self.assertEqual(annotations[i]['feature'], 'ENST00000258104')

        # test where worst-affected transcript is not the canonical one
        annotations[1]['consequence'] = 'splice_donor_variant'
        self.assertEqual(get_worst_vep_annotation_index(annotations), 1)
        self.assertFalse(annotations[1]['canonical'])

        # test protein coding filter
        annotations[6]['biotype'] = 'protein_coding'
        self.assertEqual(get_worst_vep_annotation_index(annotations), 6)

        # test the gene_id arg
        annotations[8]['canonical'] = 'YES'
        annotations[1]['gene'] = 'OTHER_GENE1'
        annotations[2]['gene'] = 'OTHER_GENE2'
        self.assertEqual(
            get_worst_vep_annotation_index(annotations, gene_id='OTHER_GENE1'),
            1)
        self.assertEqual(
            get_worst_vep_annotation_index(annotations, gene_id='OTHER_GENE2'),
            2)
        # NOTE(review): nothing visible is asserted after this change
        annotations[8]['canonical'] = 'NO'