Exemple #1
0
    def fetch(self, is_dl_forced=False):
        """
        Run ``get_files`` for this source, then fetch the HGNC source too.

        :param is_dl_forced: handed unchanged to ``get_files`` and to the
            HGNC source's ``fetch``
        :return: None
        """
        self.get_files(is_dl_forced)

        # This source depends on the HGNC files, so pull those in as well.
        hgnc_source = HGNC(self.graph_type, self.are_bnodes_skolemized)
        hgnc_source.fetch(is_dl_forced)
Exemple #2
0
    def fetch(self, is_dl_forced=False):
        """
        Run ``get_files`` for this source, then fetch the HGNC source too.

        :param is_dl_forced: handed unchanged to ``get_files`` and to the
            HGNC source's ``fetch``
        :return: None
        """
        self.get_files(is_dl_forced)

        # This source depends on the HGNC files, so pull those in as well.
        hgnc_source = HGNC()
        hgnc_source.fetch(is_dl_forced)
Exemple #3
0
    def fetch(self, is_dl_forced=False):
        """
        Get this source's files and then those of the HGNC source.

        :param is_dl_forced: forwarded as-is to ``get_files`` and to the
            ``fetch`` of the HGNC source
        :return: None
        """
        self.get_files(is_dl_forced)

        # HGNC files are a dependency of this source; fetch them as well.
        hgnc_dependency = HGNC()
        hgnc_dependency.fetch(is_dl_forced)
Exemple #4
0
    def fetch(self, is_dl_forced=False):
        """
        Get this source's files and then those of the HGNC source.

        :param is_dl_forced: forwarded as-is to ``get_files`` and to the
            ``fetch`` of the HGNC source
        :return: None
        """
        self.get_files(is_dl_forced)

        # HGNC files are a dependency of this source; fetch them as well.
        hgnc_dependency = HGNC(self.graph_type, self.are_bnodes_skolemized)
        hgnc_dependency.fetch(is_dl_forced)
Exemple #5
0
class HGNCTestCase(SourceTestCase):
    """Test case for the HGNC source, built on top of SourceTestCase."""

    def setUp(self):
        """Create an HGNC source in test-only mode with the gene test ids."""
        hgnc = HGNC('rdf_graph', True)
        self.source = hgnc
        hgnc.test_ids = self._get_testids()['gene']
        hgnc.settestonly(True)
        self._setDirToSource()

    def tearDown(self):
        """Drop the reference to the source created in setUp."""
        self.source = None
Exemple #6
0
class HGNCTestCase(SourceTestCase):
    """Test case for the HGNC source, built on top of SourceTestCase."""

    def setUp(self):
        """Create an HGNC source in test-only mode with the gene test ids."""
        hgnc = HGNC()
        self.source = hgnc
        hgnc.test_ids = self._get_conf()['test_ids']['gene']
        hgnc.settestonly(True)
        self._setDirToSource()

    def tearDown(self):
        """Drop the reference to the source created in setUp."""
        self.source = None
Exemple #7
0
 def setUp(self):
     """Create an HGNC source in test-only mode with the gene test ids."""
     hgnc = HGNC('rdf_graph', True)
     self.source = hgnc
     hgnc.test_ids = self._get_testids()['gene']
     hgnc.settestonly(True)
     self._setDirToSource()
Exemple #8
0
    def _process_ddg2p_annotations(self, limit):
        """
        The ddg2p annotations associate a gene symbol to an omim disease,
        along with some HPO ids and pubs. The gene symbols come from gencode,
        which in turn come from HGNC official gene symbols.  Therefore,
        we use the HGNC source class to get the id/symbol mapping for
        use in our annotations here.

        According to http://www.gencodegenes.org/faq.html,
        "Gene names are usually HGNC or MGI-approved gene symbols mapped
        to the GENCODE genes by the Ensembl xref pipeline. Sometimes,
        when there is no official gene symbol, the Havana clone-based
        name is used."

        The kind of variation that is linked to a disease is indicated
        (LOF, GOF, CNV, etc) in the source data.
        Here, we create an anonymous variant of the specified gene of
        the indicated type (mapped to the sequence ontology (SO)).

        Rows are read from ``ddg2p.txt`` inside the zip archive named by
        ``self.files['annot']['file']`` under ``self.rawdir``.  Blank rows
        and rows starting with ``#`` are skipped; rows whose gene symbol has
        no HGNC id, or whose category is 'Not DD gene', produce no
        association.

        :param limit: when not None and not in test mode, stop once more
            than this many lines have been read
        :return: None
        :raises ValueError: if a data row does not have exactly 9 columns

        """

        line_counter = 0
        g = self.g if self.g is not None else self.graph
        gu = GraphUtils(curie_map.get())

        # in order for this to work, we need to map the HGNC id-symbol;
        hgnc = HGNC()
        hgnc_symbol_id_map = hgnc.get_symbol_id_map()

        zip_path = '/'.join((self.rawdir, self.files['annot']['file']))

        # use the ddg2p.txt file
        fname = 'ddg2p.txt'

        unmapped_omim_counter = 0
        unmapped_gene_count = 0
        # Both the archive and the member are context-managed so they are
        # closed even when a malformed row raises part-way through.
        with ZipFile(zip_path, 'r') as myzip, \
                myzip.open(fname, 'r') as raw:
            reader = csv.reader(
                io.TextIOWrapper(raw), delimiter='\t', quotechar='\"')
            for row in reader:
                line_counter += 1
                # csv.reader yields [] for a blank line; skip it together
                # with comment lines instead of failing on row[0]
                if not row or row[0].startswith('#'):
                    continue

                # the full unpack doubles as a column-count check
                (gencode_gene_name, mode, category, consequence, disease, omim,
                 ddg2p_id, pubmed_ids, hpo_codes) = row

                hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip())
                if hgnc_id is None:
                    logger.error(
                        "Couldn't map the gene symbol %s to HGNC.",
                        gencode_gene_name)
                    unmapped_gene_count += 1
                    continue
                # add the gene
                gu.addClassToGraph(g, hgnc_id, gencode_gene_name)

                # TODO make VSLC with the variation
                #   to associate with the disorder
                # TODO use the Inheritance and Mutation consequence
                #   to classify the VSLCs

                allele_id = self.make_allele_by_consequence(
                    consequence, hgnc_id, gencode_gene_name)

                omim = omim.strip()
                if omim != '':
                    omim_id = 'OMIM:' + omim
                    # assume this is declared elsewhere in ontology
                    gu.addClassToGraph(g, omim_id, None)

                    # TODO pick the association relation from the category
                    #   (Confirmed/Probable -> has_phenotype,
                    #    Possible -> contributes_to); it is not used yet
                    if category.strip() == 'Not DD gene':
                        # TODO negative annotation
                        continue
                    assoc = G2PAssoc(self.name, allele_id, omim_id)

                    for p in pubmed_ids.split(';'):
                        p = p.strip()
                        if p != '':
                            pmid = 'PMID:' + p
                            r = Reference(
                                pmid, Reference.ref_types['journal_article'])
                            r.addRefToGraph(g)
                            assoc.add_source(pmid)

                    assoc.add_association_to_graph(g)
                else:
                    # these are unmapped to a disease id.
                    # note that some match OMIM disease labels
                    # but the identifiers are just not included.
                    # TODO consider mapping to OMIM or DOIDs in other ways
                    logger.warning(
                        "No omim id on line %d\n%s", line_counter, str(row))
                    unmapped_omim_counter += 1

                # TODO hpo phenotypes
                # since the DDG2P file is not documented,
                # I don't know what the HPO annotations are actually about
                # are they about the gene?  the omim disease?  something else?
                # So, we wont create associations until this is clarified

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.warning(
            "gene-disorder associations with no omim id: %d",
            unmapped_omim_counter)
        logger.warning("unmapped gene count: %d", unmapped_gene_count)

        gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)
        gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)

        return
Exemple #9
0
    def _process_ddg2p_annotations(self, limit):
        """
        The ddg2p annotations associate a gene symbol to an omim disease,
        along with some HPO ids and pubs. The gene symbols come from gencode,
        which in turn come from HGNC official gene symbols.  Therefore,
        we use the HGNC source class to get the id/symbol mapping for
        use in our annotations here.

        According to http://www.gencodegenes.org/faq.html,
        "Gene names are usually HGNC or MGI-approved gene symbols mapped
        to the GENCODE genes by the Ensembl xref pipeline. Sometimes,
        when there is no official gene symbol, the Havana clone-based
        name is used."

        The kind of variation that is linked to a disease is indicated
        (LOF, GOF, CNV, etc) in the source data.
        Here, we create an anonymous variant of the specified gene of
        the indicated type (mapped to the sequence ontology (SO)).

        Rows are read from ``ddg2p.txt`` inside the zip archive named by
        ``self.files['annot']['file']`` under ``self.rawdir``.  Blank rows
        and rows starting with ``#`` are skipped; rows whose gene symbol has
        no HGNC id produce no association.

        :param limit: when not None and not in test mode, stop once more
            than this many lines have been read
        :return: None
        :raises ValueError: if a data row does not have exactly 9 columns

        """

        line_counter = 0
        graph = self.graph

        # in order for this to work, we need to map the HGNC id-symbol;
        hgnc = HGNC(self.graph_type, self.are_bnodes_skolemized)
        hgnc_symbol_id_map = hgnc.get_symbol_id_map()

        zip_path = '/'.join((self.rawdir, self.files['annot']['file']))

        # use the ddg2p.txt file
        fname = 'ddg2p.txt'

        unmapped_omim_counter = 0
        unmapped_gene_count = 0
        # Both the archive and the member are context-managed so they are
        # closed even when a malformed row raises part-way through.
        with ZipFile(zip_path, 'r') as myzip, \
                myzip.open(fname, 'r') as raw:
            reader = csv.reader(
                io.TextIOWrapper(raw), delimiter='\t', quotechar='\"')
            for row in reader:
                line_counter += 1
                # csv.reader yields [] for a blank line; skip it together
                # with comment lines instead of failing on row[0]
                if not row or row[0].startswith('#'):
                    continue

                # the full unpack doubles as a column-count check
                (gencode_gene_name, mode, category, consequence, disease, omim,
                 ddg2p_id, pubmed_ids, hpo_codes) = row

                hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip())
                if hgnc_id is None:
                    LOG.error("Couldn't map the gene symbol %s to HGNC.",
                              gencode_gene_name)
                    unmapped_gene_count += 1
                    continue
                # add the gene
                self.model.addClassToGraph(hgnc_id, gencode_gene_name)

                # TODO make VSLC with the variation
                #   to associate with the disorder
                # TODO use the Inheritance and Mutation consequence
                #   to classify the VSLCs

                allele_id = self.make_allele_by_consequence(
                    consequence, hgnc_id, gencode_gene_name)

                omim = omim.strip()
                if omim != '':
                    omim_id = 'OMIM:' + omim
                    # assume this is declared elsewhere in ontology
                    self.model.addClassToGraph(omim_id, None)

                    # TODO pick the association relation from the category
                    #   (Confirmed/Probable DD gene -> 'has phenotype',
                    #    Possible DD gene -> 'contributes to')
                    # NOTE(review): rows whose category is 'Not DD gene' are
                    #   not filtered out here, so they receive the same
                    #   association as any other category -- confirm this is
                    #   intended (TODO negative annotation)
                    assoc = G2PAssoc(graph, self.name, allele_id, omim_id)

                    for p in pubmed_ids.split(';'):
                        p = p.strip()
                        if p != '':
                            pmid = 'PMID:' + p
                            r = Reference(graph, pmid,
                                          self.globaltt['journal article'])
                            r.addRefToGraph()
                            assoc.add_source(pmid)

                    assoc.add_association_to_graph()
                else:
                    # these are unmapped to a disease id.
                    # note that some match OMIM disease labels
                    # but the identifiers are just not included.
                    # TODO consider mapping to OMIM or DOIDs in other ways
                    LOG.warning("No omim id on line %d\n%s", line_counter,
                                str(row))
                    unmapped_omim_counter += 1

                # TODO hpo phenotypes
                # since the DDG2P file is not documented,
                # I don't know what the HPO annotations are actually about
                # are they about the gene?  the omim disease?  something else?
                # So, we wont create associations until this is clarified

                if not self.test_mode and limit is not None \
                        and line_counter > limit:
                    break

        LOG.warning("gene-disorder associations with no omim id: %d",
                    unmapped_omim_counter)
        LOG.warning("unmapped gene count: %d", unmapped_gene_count)

        return
Exemple #10
0
 def setUp(self):
     """Create an HGNC source in test-only mode with the gene test ids."""
     hgnc = HGNC()
     self.source = hgnc
     hgnc.test_ids = self._get_conf()['test_ids']['gene']
     hgnc.settestonly(True)
     self._setDirToSource()