Beispiel #1
0
    def test_get_molecular_function_description(self):
        """Check the molecular-function description lookup for a known and an unknown ID."""
        known_id = 'GO:0004689'
        found = QEBI.get_molecular_function_description(known_id)
        self.assertIsNotNone(found)
        self.assertEqual(found, get_from_test_file(known_id))

        # The same ID with one extra trailing digit is expected to yield the
        # string 'None' (not the None object).
        missing = QEBI.get_molecular_function_description('GO:00046890')
        self.assertEqual(missing, 'None')
Beispiel #2
0
    def test_get_disease_description(self):
        """Check the disease description lookup for a known and an unknown ID."""
        known_id = 'OMIM:613573'
        found = QEBI.get_disease_description(known_id)
        self.assertIsNotNone(found)
        self.assertEqual(found, get_from_test_file(known_id))

        # The same ID with one extra trailing digit is expected to yield the
        # string 'None' (not the None object).
        missing = QEBI.get_disease_description('OMIM:6135730')
        self.assertEqual(missing, 'None')
Beispiel #3
0
    def test_get_cellular_component_description(self):
        """Check the cellular-component description lookup for a known and an unknown ID."""
        known_id = 'GO:0005573'
        found = QEBI.get_cellular_component_description(known_id)
        self.assertIsNotNone(found)
        self.assertEqual(found, get_from_test_file(known_id))

        # The same ID with one extra trailing digit is expected to yield the
        # string 'None' (not the None object).
        missing = QEBI.get_cellular_component_description('GO:00055730')
        self.assertEqual(missing, 'None')
Beispiel #4
0
    def test_get_bio_process_description(self):
        """Check the biological-process description lookup for a known and an unknown ID."""
        # NOTE(review): an HP-prefixed ID is passed to the bio-process lookup
        # here -- confirm this ID is the one intended for this test.
        known_id = 'HP:0011105'
        found = QEBI.get_bio_process_description(known_id)
        self.assertIsNotNone(found)
        self.assertEqual(found, get_from_test_file(known_id))

        # The same ID with one extra trailing digit is expected to yield the
        # string 'None' (not the None object).
        missing = QEBI.get_bio_process_description('HP:00111050')
        self.assertEqual(missing, 'None')
Beispiel #5
0
    def test_get_phenotype_description(self):
        """Check the phenotype description lookup for a known and an unknown ID."""
        # NOTE(review): a GO-prefixed ID is passed to the phenotype lookup
        # here -- confirm this ID is the one intended for this test.
        known_id = 'GO:0042535'
        found = QEBI.get_phenotype_description(known_id)
        self.assertIsNotNone(found)
        self.assertEqual(found, get_from_test_file(known_id))

        # The same ID with one extra trailing digit is expected to yield the
        # string 'None' (not the None object).
        missing = QEBI.get_phenotype_description('GO:00425350')
        self.assertEqual(missing, 'None')
Beispiel #6
0
    def test_get_anatomy_description(self):
        """Check the anatomy description lookup for UBERON and CL IDs.

        For each ontology a known ID must match the stored test data, and the
        same ID with one extra trailing digit must yield the string 'None'.
        """
        id_pairs = (
            ('UBERON:0004476', 'UBERON:00044760'),
            ('CL:0000038', 'CL:00000380'),
        )
        for known_id, unknown_id in id_pairs:
            found = QEBI.get_anatomy_description(known_id)
            self.assertIsNotNone(found)
            self.assertEqual(found, get_from_test_file(known_id))

            missing = QEBI.get_anatomy_description(unknown_id)
            self.assertEqual(missing, 'None')
Beispiel #7
0
    def update_molecular_function_nodes_desc(self):
        """Fetch a description for every molecular_function node and store it in Neo4j.

        The node IDs are read through a new Neo4jConnection, one description
        per node is requested from
        QueryEBIOLS.get_molecular_function_description(), and the resulting
        ``{'node_id': ..., 'desc': ...}`` records are written back in batches
        of at most 10000. Node count and elapsed times are printed to stdout.

        The connection is closed even if a lookup or an update raises.
        """
        from time import time

        chunk_size = 10000  # maximum number of records per update call

        conn = Neo4jConnection(self.neo4j_url, self.neo4j_user,
                               self.neo4j_password)
        try:
            nodes = conn.get_molecular_function_nodes()
            print("the number of molecular_function nodes: %d" % len(nodes))

            t = time()

            nodes_array = [
                {
                    'node_id': node_id,
                    'desc': QueryEBIOLS.get_molecular_function_description(
                        node_id),
                }
                for node_id in nodes
            ]

            print("molecular_function pulling time: %f" % (time() - t))

            # Stepping by chunk_size never produces an empty trailing batch,
            # so no update call is issued when the record count is zero or an
            # exact multiple of chunk_size.
            for start in range(0, len(nodes_array), chunk_size):
                conn.update_molecular_function_nodes_desc(
                    nodes_array[start:start + chunk_size])

            print("molecular_function total time: %f" % (time() - t))
        finally:
            conn.close()
Beispiel #8
0
    def update_disease_nodes_desc(self):
        """Fetch a description for every disease node and store it in Neo4j.

        The node IDs are read through a new Neo4jConnection. IDs starting with
        "OMIM" are described via QueryOMIM.disease_mim_to_description(), IDs
        starting with "DOID" via QueryEBIOLS.get_disease_description(). The
        records are written back in batches of at most 10000. Node count and
        elapsed times are printed to stdout.

        The connection is closed even if a lookup or an update raises.
        """
        from time import time

        chunk_size = 10000  # maximum number of records per update call

        conn = Neo4jConnection(self.neo4j_url, self.neo4j_user,
                               self.neo4j_password)
        try:
            nodes = conn.get_disease_nodes()
            print("the number of disease nodes: %d" % len(nodes))

            t = time()

            nodes_array = []
            qo = QueryOMIM()
            for node_id in nodes:
                node = {'node_id': node_id}
                if node_id.startswith("OMIM"):
                    node['desc'] = qo.disease_mim_to_description(node_id)
                elif node_id.startswith("DOID"):
                    node['desc'] = QueryEBIOLS.get_disease_description(node_id)
                # NOTE(review): IDs with any other prefix are still appended,
                # but without a 'desc' key (unchanged behaviour) -- confirm the
                # update query tolerates such records.
                nodes_array.append(node)

            print("disease api pulling time: %f" % (time() - t))

            # Stepping by chunk_size never produces an empty trailing batch,
            # so no update call is issued when the record count is zero or an
            # exact multiple of chunk_size.
            for start in range(0, len(nodes_array), chunk_size):
                conn.update_disease_nodes_desc(
                    nodes_array[start:start + chunk_size])

            print("disease total time: %f" % (time() - t))
        finally:
            conn.close()
    def test_update_phenotype_nodes_desc(self):
        """Spot-check stored phenotype descriptions against the live API.

        Picks 100 indexes via random_int_list() over the phenotype node list,
        and for each selected node compares the description returned by
        QueryEBIOLS.get_phenotype_description() with the one stored in Neo4j.
        Stored descriptions equal to the string "None" are not compared.
        """
        conn = Neo4jConnection(self.rtxConfig.neo4j_bolt,
                               self.rtxConfig.neo4j_username,
                               self.rtxConfig.neo4j_password)
        # Close the connection even when an assertion below fails.
        try:
            nodes = conn.get_phenotype_nodes()

            # generate random number array
            random_indexes = random_int_list(0, len(nodes) - 1, 100)

            for i in random_indexes:
                # retrieve data from API
                node_id = nodes[i]
                desc = QueryEBIOLS.get_phenotype_description(node_id)

                # retrieve data from Neo4j
                node = conn.get_phenotype_node(node_id)
                self.assertIsNotNone(node)
                self.assertIsNotNone(node['n']['id'])
                self.assertIsNotNone(node['n']['description'])
                self.assertEqual(node_id, node['n']['id'])
                if node['n']['description'] != "None":
                    self.assertEqual(desc, node['n']['description'])
        finally:
            conn.close()
Beispiel #10
0
    def test_update_phenotype_nodes_desc(self):
        """Spot-check stored phenotype descriptions against the live API.

        Connection settings ('url', 'username', 'password') are read from
        ``config.json`` in the current working directory. Picks 100 indexes
        via random_int_list() over the phenotype node list, and for each
        selected node compares the description returned by
        QueryEBIOLS.get_phenotype_description() with the one stored in Neo4j.
        Stored descriptions equal to the string "None" are not compared.
        """
        # The context manager closes the file even if parsing fails.
        with open('config.json', 'r') as f:
            config = json.load(f)

        conn = Neo4jConnection(config['url'], config['username'], config['password'])
        # Close the connection even when an assertion below fails.
        try:
            nodes = conn.get_phenotype_nodes()

            # generate random number array
            random_indexes = random_int_list(0, len(nodes) - 1, 100)

            for i in random_indexes:
                # retrieve data from API
                node_id = nodes[i]
                desc = QueryEBIOLS.get_phenotype_description(node_id)

                # retrieve data from Neo4j
                node = conn.get_phenotype_node(node_id)
                self.assertIsNotNone(node)
                self.assertIsNotNone(node['n']['id'])
                self.assertIsNotNone(node['n']['description'])
                self.assertEqual(node_id, node['n']['id'])
                if node['n']['description'] != "None":
                    self.assertEqual(desc, node['n']['description'])
        finally:
            conn.close()
 def get_mesh_term_for_all(curie_id, description):
     """
     Takes a curie ID, detects the ontology from the curie id, and then finds the mesh term

     Lookup order:
         1. If QueryNCBIeUtils.is_mesh_term() accepts the description, the description
            itself is returned with a '[MeSH Terms]' suffix.
         2. Otherwise NormGoogleDistance.get_mesh_from_oxo() is asked for the curie id.
         3. Only if that returns None, a lookup specific to the curie prefix is tried.

     Params:
         curie_id - A string containing the curie id of the node. Formatted <source abbreviation>:<number> e.g. DOID:8398
         description - A string containing the English name for the node (any non-str value is converted with str())

     Returns:
         - a one-element list with the first name ending in '[MeSH Terms]', when the
           lookup produced a list that contains such a name;
         - otherwise whatever the lookup produced, unchanged;
         - when every lookup produced None, a one-element list holding the description
           with ';' replaced by '|'.

     Prefix-specific lookups (+ means has it, - means does not have it)
         "Reactome" + (any prefix whose lowercase form starts with "react")
         "GO" - found gene conversion but no biological process conversion
         "UniProt" +
         "HP" +
         "UBERON" +
         "CL" - no branch below; not supposed to be here?
         "NCBIGene" +
         "DOID" +
         "OMIM" +
         "ChEMBL" +
     """
     if type(description) != str:
         description = str(description)
     curie_list = curie_id.split(':')
     names = None
     # Shortcut: the description is already usable as a MeSH term.
     if QueryNCBIeUtils.is_mesh_term(description):
         return [description + '[MeSH Terms]']
     names = NormGoogleDistance.get_mesh_from_oxo(curie_id)
     if names is None:
         # Fall back to a lookup chosen by the curie prefix.
         if curie_list[0].lower().startswith("react"):
             res = QueryNCBIeUtils.get_reactome_names(curie_list[1])
             if res is not None:
                 # The result is split on '|' into individual names.
                 names = res.split('|')
         elif curie_list[0] == "GO":
             # No GO conversion: names stays None and the description is used below.
             pass
         elif curie_list[0].startswith("UniProt"):
             res = QueryNCBIeUtils.get_uniprot_names(curie_list[1])
             if res is not None:
                 names = res.split('|')
         elif curie_list[0] == "HP":
             names = QueryNCBIeUtils.get_mesh_terms_for_hp_id(curie_id)
         elif curie_list[0] == "UBERON":
             # Drop a trailing 'PHENOTYPE' (9 characters) before the lookup.
             if curie_id.endswith('PHENOTYPE'):
                 curie_id = curie_id[:-9]
             mesh_id = QueryEBIOLS.get_mesh_id_for_uberon_id(curie_id)
             names = []
             for entry in mesh_id:
                 # Entries containing a '.' are resolved through the MeSH tree lookup
                 # (presumably they are tree numbers -- confirm); note that the
                 # split(':')[1] on the next line is outside any try block.
                 if len(entry.split('.')) > 1:
                     uids=QueryNCBIeUtils.get_mesh_uids_for_mesh_tree(entry.split(':')[1])
                     for uid in uids:
                         try:
                             # Take the part after ':', drop its first character and add
                             # 68000000 (presumably turns an ID like 'D012345' into the
                             # numeric MeSH UID -- confirm).
                             uid_num = int(uid.split(':')[1][1:]) + 68000000
                             names += QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(uid_num)
                         except IndexError:
                             # No ':' in the uid: treat it as an already-numeric UID.
                             # An IndexError raised inside the lookup call above also
                             # lands here.
                             uid_num = int(uid)
                             names += QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(uid_num)
                 else:
                     try:
                         uid = entry.split(':')[1]
                         uid_num = int(uid[1:]) + 68000000
                         names += QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(uid_num)
                     except IndexError:
                         # No ':' in the entry: treat it as an already-numeric UID.
                         uid_num = int(entry)
                         names += QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(uid_num)
             if len(names) == 0:
                 names = None
             else:
                 # Tag the first name so the final selection below returns it alone.
                 names[0] = names[0] + '[MeSH Terms]'
         elif curie_list[0] == "NCBIGene":
             gene_id = curie_id.split(':')[1]
             names = QueryNCBIeUtils.get_pubmed_from_ncbi_gene(gene_id)
         elif curie_list[0] == "DOID":
             mesh_id = QueryDisont.query_disont_to_mesh_id(curie_id)
             names = []
             for uid in mesh_id:
                 # Same ID-to-UID arithmetic as in the UBERON branch.
                 uid_num = int(uid[1:]) + 68000000
                 name = QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(uid_num)
                 if name is not None:
                     names += name
             if len(names) == 0:
                 names = None
             else:
                 # Tag the first name so the final selection below returns it alone.
                 names[0] = names[0] + '[MeSH Terms]'
         elif curie_list[0] == "OMIM":
             names = QueryNCBIeUtils.get_mesh_terms_for_omim_id(curie_list[1])
         elif curie_list[0] == "ChEMBL":
             # The lookup key is the curie id without ':' and upper-cased.
             chembl_id = curie_id.replace(':', '').upper()
             mesh_id = QueryMyChem.get_mesh_id(chembl_id)
             if mesh_id is not None:
                 mesh_id = int(mesh_id[1:]) + 68000000
                 names = QueryNCBIeUtils.get_mesh_terms_for_mesh_uid(mesh_id)
     if names is not None:
         if type(names) == list:
             # Prefer the first name that is explicitly tagged as a MeSH term.
             for name in names:
                 if name.endswith('[MeSH Terms]'):
                     return [name]
         return names
     # Nothing found: fall back to the description, with ';' turned into '|'.
     return [description.replace(';', '|')]
Beispiel #12
0
 def setUpClass(cls):
     """Create one QueryEBIOLS instance and expose it as ``cls.ebiols``."""
     ols_client = QueryEBIOLS()
     cls.ebiols = ols_client
Beispiel #13
0
    def query_neighbor_genes_for_gene_set_in_a_given_anatomy(self,
                                                             anatomy_curie_id_str,
                                                             protein_set_curie_id_str):
        """Find proteins whose genes are reported alongside the query genes in one anatomy.

        Params:
            anatomy_curie_id_str - a str starting with "UBERON:"
            protein_set_curie_id_str - a non-empty tuple of strings, each starting
                with "UniProtKB:"

        Returns:
            A dict mapping "UniProtKB:<id>" to the "aveCorr" value of the result row
            that introduced the corresponding gene (if a gene appears in several
            rows, the last row wins). Only IDs that appear in the list parsed from
            get_nodes_that_match_in_list(..., 'protein') are kept. An empty dict is
            returned when no BTO ID is found for the anatomy.

        Raises:
            AssertionError - when an argument or a result row fails one of the
                checks below (the checks are plain ``assert`` statements).
        """
        assert type(protein_set_curie_id_str) is tuple
        assert len(protein_set_curie_id_str) > 0
        assert type(anatomy_curie_id_str) is str

        # The UBERON ID is translated to a BTO term, which is what the query
        # sends as "tissue".
        assert anatomy_curie_id_str.startswith("UBERON:")
        bto_ids = QueryEBIOLS.get_bto_id_for_uberon_id(anatomy_curie_id_str)
        if len(bto_ids) == 0:
            return dict()

        assert len(bto_ids) == 1

        only_bto_id = next(iter(bto_ids))
        bto_term = QueryEBIOLS.get_bto_term_for_bto_id(only_bto_id).replace(" ", "_")

        # Translate every UniProt accession to Entrez gene IDs, keeping both the
        # raw values (for membership tests) and their string forms (for the query).
        query_gene_ids = set()
        query_gene_id_strs = set()
        for protein_curie in protein_set_curie_id_str:
            assert protein_curie.startswith("UniProtKB:")
            accession = protein_curie.split(":")[1]
            for entrez_id in self.mg.convert_uniprot_id_to_entrez_gene_ID(accession):
                query_gene_ids.add(entrez_id)
                query_gene_id_strs.add(str(entrez_id))

        payload = {"ids": ",".join(query_gene_id_strs),
                   "tissue": bto_term,
                   "limit": self.limit}

        results = self._wrapper(self.ENDPOINT, payload)

        # Map each gene that is not part of the query set to its correlation.
        corr_by_new_gene = dict()
        for _, row in results.iterrows():
            gene1 = row["Gene1"]
            gene2 = row["Gene2"]
            avg_corr = row["aveCorr"]
            assert type(gene1) is int
            assert type(gene2) is int
            assert type(avg_corr) is float

            gene1_queried = gene1 in query_gene_ids
            gene2_queried = gene2 in query_gene_ids
            if gene1_queried and gene2_queried:
                # both genes belong to the query set, so the row adds nothing new
                new_gene_id = None
            elif gene1_queried:
                new_gene_id = gene2
            elif gene2_queried:
                new_gene_id = gene1
            else:
                print("neither gene was in the set of query genes, this should not happen", file=sys.stderr)
                assert False
            if new_gene_id is not None:
                corr_by_new_gene[new_gene_id] = avg_corr

        # Translate the new genes back to UniProt IDs.
        corr_by_protein = dict()
        for gene_id, avg_corr in corr_by_new_gene.items():
            uniprot_ids = self.mg.convert_entrez_gene_id_to_uniprot_id(gene_id)
            if len(uniprot_ids) == 0:
                continue
            for uniprot_id in uniprot_ids:
                corr_by_protein["UniProtKB:" + uniprot_id] = avg_corr

        # The first query result is rendered as text; the bracketed list inside
        # it (from the first '[' up to, but excluding, the last character) is
        # parsed back into a Python list.
        query_res = get_nodes_that_match_in_list(corr_by_protein.keys(), 'protein')
        rendered = str(query_res[0])
        matched_ids = ast.literal_eval(rendered[rendered.find('['):-1])

        # Drop every protein that is not in the parsed list.
        unmatched = [key for key in corr_by_protein if key not in matched_ids]
        for key in unmatched:
            corr_by_protein.pop(key)

        return corr_by_protein