Esempio n. 1
0
 def standardize_relationship(self,relationship):
     """Map a source relationship onto its standard concept-model relation.

     Looks up ``relationship.identifier`` in ``self.relations_by_xref`` and
     returns a ``LabeledID`` built from the mapped relation's ``identifier``
     and ``name``.  When the lookup yields ``None``, the placeholder
     ``GAMMA:0`` / ``Unmapped_Relation`` is returned instead.

     NOTE(review): the lookup uses subscript access, so the ``None`` branch is
     only reachable if ``relations_by_xref`` supplies a default for unknown
     keys -- confirm where that mapping is built.
     """
     mapped = self.relations_by_xref[relationship.identifier]
     if mapped is not None:
         return LabeledID(identifier=mapped.identifier, label=mapped.name)
     return LabeledID(identifier="GAMMA:0", label="Unmapped_Relation")
Esempio n. 2
0
def test_combined_gene_annotation(gene_annotator):
    """Check that one annotate() call fills in both Ensembl and HGNC properties.

    Per the original note, these lookups come from the annotator's cache after
    the first run.
    """

    def annotated_gene(hgnc_curie, ensembl_curie, symbol):
        # Build a gene node with a single Ensembl synonym and annotate it.
        node = KNode(hgnc_curie, type=node_types.GENE)
        node.add_synonyms({LabeledID(identifier=ensembl_curie, label=symbol)})
        gene_annotator.annotate(node)
        return node

    ptgs1 = annotated_gene('HGNC:9604', 'ENSEMBL:ENSG00000095303', 'PTGS1')
    # Ensembl-derived properties
    assert ptgs1.properties['ensembl_name'] == 'PTGS1'
    assert ptgs1.properties['chromosome'] == '9'
    # HGNC-derived properties
    assert ptgs1.properties['location'] == '9q33.2'

    znf3 = annotated_gene('HGNC:13089', 'ENSEMBL:ENSG00000166526', 'ZNF3')
    # Ensembl-derived properties
    assert znf3.properties['ensembl_name'] == 'ZNF3'
    assert znf3.properties['chromosome'] == '7'
    # HGNC-derived properties
    assert 'Zinc fingers C2H2-type' in znf3.properties['gene_family']
    assert 28 in znf3.properties['gene_family_id']

    acp1 = annotated_gene('HGNC:122', 'ENSEMBL:ENSG00000143727', 'ACP1')
    # Ensembl-derived properties
    assert acp1.properties['ensembl_name'] == 'ACP1'
    assert acp1.properties['chromosome'] == '2'
    # HGNC-derived properties
    assert 1071 in acp1.properties['gene_family_id']
Esempio n. 3
0
    def __init__(self, context):
        """Set up the Ensembl service: shared handles, predicates, URLs and SQL.

        Only attributes are assigned here; no database or network work is done.
        ``context`` must expose ``core.clingen`` and ``cache`` (with ``redis``).
        """
        super(Ensembl, self).__init__("ensembl", context)

        # Handles borrowed from the shared context.
        shared_cache = context.cache
        self.clingen = context.core.clingen
        self.cache = shared_cache
        self.redis = shared_cache.redis

        # Predicates attached to the edges this service produces.
        self.var_to_gene_predicate = LabeledID(identifier='GAMMA:0000102',
                                               label='nearby_variant_of')
        self.var_to_var_predicate = LabeledID(identifier='NCIT:C16798',
                                              label='linked_to')

        # Local sqlite gene database lives next to this module.
        module_dir = os.path.dirname(__file__)
        self.gene_db_successfully_created = False
        self.gene_db_path = os.path.join(module_dir, 'genes.sqlite3')

        self.persistent_conn = None
        self.all_gene_annotations = None

        # The attribute order in this query is assumed elsewhere: if it changes,
        # the indexing in create_genes_db has to change with it.
        self.ensembl_genes_url = """http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?>
                                    <!DOCTYPE Query>
                                    <Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
                                        <Dataset name = "hsapiens_gene_ensembl" interface = "default" >
                                            <Attribute name = "ensembl_gene_id" />
                                            <Attribute name = "gene_biotype" />
                                            <Attribute name = "external_gene_name" />
                                            <Attribute name = "start_position" />
                                            <Attribute name = "end_position" />
                                            <Attribute name = "description" />
                                            <Attribute name = "chromosome_name" />
                                        </Dataset>
                                    </Query>"""
        self.gene_batch_url = 'http://www.ensembl.org/biomart/martservice'

        # --- SQL used against the local genes database ---

        # Detects whether the genes table already exists.
        self.check_if_already_done_sql = "SELECT name FROM sqlite_master WHERE type='table' AND name='genes';"

        # Schema.
        self.genes_table_sql = """CREATE TABLE IF NOT EXISTS genes (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        ensembl_id text, 
        gene_name text, 
        chromosome INTEGER,
        start_pos INTEGER, 
        end_pos INTEGER, 
        gene_type text,
        description text);"""

        # Indexes.
        self.genes_table_ensembl_id_index_sql = "CREATE UNIQUE INDEX ensembl_ids on genes(ensembl_id);"
        self.genes_table_composite_index_sql = "CREATE INDEX gene_composite on genes(chromosome, start_pos, end_pos, ensembl_id);"

        # Row insertion (seven positional parameters).
        self.gene_entry_sql = """INSERT INTO genes 
        (ensembl_id, gene_name, chromosome, start_pos, end_pos, gene_type, description) 
        VALUES (?,?,?,?,?,?,?);"""

        # Range lookup: one chromosome parameter followed by three
        # (?, ?) pairs compared against start_pos/end_pos.
        self.gene_range_select_sql = """SELECT ensembl_id, start_pos, end_pos
        FROM genes WHERE chromosome = ? AND ((? >= start_pos AND ? <= end_pos)
        OR (? >= start_pos AND ? <= end_pos) OR (? <= start_pos AND ? >= end_pos));"""

        # Single-gene lookup by Ensembl id.
        self.gene_ensembl_id_select_sql = "SELECT * FROM genes WHERE ensembl_id = ?"
Esempio n. 4
0
 def normalize(self, node):
     """Given a node, which will have many potential identifiers, choose the best identifier to be the node ID,
     where 'best' is defined by the order in which identifiers appear in the id prefix configurations within the concept model.

     The node is modified in place:

     * unlabeled duplicates of a synonym id (label ``None`` or ``''``) are
       dropped when another entry with the same id exists;
     * ``node.id`` is set from the first configured prefix that has a synonym,
       and ``node.name`` is set only when that synonym carries a real label;
     * synonyms whose prefix is not configured for ``node.type`` are removed.

     Nothing is returned.
     """
     # If we have two synonyms with the same id, but one has no label, chuck it
     labels_by_id = defaultdict(list)
     for labeledid in node.synonyms:
         labels_by_id[labeledid.identifier].append(labeledid.label)
     for lid, labels in labels_by_id.items():
         if len(labels) <= 1:
             continue
         for empty_label in (None, ''):
             if empty_label in labels:
                 node.synonyms.remove(LabeledID(identifier=lid, label=empty_label))
     # Now find the best one for an id
     type_curies = self.concepts.get(node.type).id_prefixes
     synonyms_by_curie = defaultdict(list)
     for s in node.synonyms:
         synonyms_by_curie[Text.get_curie(s.identifier)].append(s)
     # Walk the prefixes in preference order and stop at the first match.
     for type_curie in type_curies:
         potential_identifiers = synonyms_by_curie[type_curie]
         if not potential_identifiers:
             continue
         if len(potential_identifiers) > 1:
             # Prefer candidates that carry a label, then take the smallest.
             ids_with_labels = [
                 x for x in potential_identifiers if x.label is not None
             ]
             if ids_with_labels:
                 potential_identifiers = ids_with_labels
             potential_identifiers.sort()
         best = potential_identifiers[0]
         node.id = best.identifier
         # Only replace the name if we actually have a label; a missing label
         # (None) must not wipe out an existing name any more than '' does.
         if best.label not in (None, ''):
             node.name = best.label
         break
     #Remove any synonyms with extraneous prefixes.  The point of this is not so much to remove
     # unknown prefixes, as to make sure that if we got e.g. a meddra, and we downcast it to a disease,
     # that we don't end up with HP's in the equivalent ids.
     bad_synonyms = set()
     for synonym in node.synonyms:
         # Synonyms may be LabeledIDs or bare curie strings.
         raw_id = synonym.identifier if isinstance(synonym, LabeledID) else synonym
         if Text.get_curie(raw_id) not in type_curies:
             bad_synonyms.add(synonym)
     for bs in bad_synonyms:
         node.synonyms.remove(bs)
     if node.id.startswith('DOID'):
         logger.warning("We are ending up with a DOID here")
         logger.warning(node.id)
         logger.warning(node.synonyms)
         logger.warning(node.type)
Esempio n. 5
0
 def standardize_predicate(self, predicate, sourcenode=None, targetnode=None):
     """Standardize a CTD predicate, collapsing '|'-joined compound labels.

     * A label without '|' goes straight to the concept model.
     * Otherwise the parts mentioning 'reaction' or 'cotreatment' are dropped.
       If exactly one part remains, it is rewritten to ``CTD:affects^<parent>``
       (parent taken from ``self.term_parents``) and normalized; in every other
       case the generic ``CTD:interacts_with`` relation is standardized.

     ``sourcenode`` and ``targetnode`` are accepted but not used here.
     """
     label = predicate.label
     if '|' in label:
         informative = [
             part for part in label.split('|')
             if 'reaction' not in part and 'cotreatment' not in part
         ]
         if len(informative) == 1:
             # Use "affects" as the modifier: we no longer know what the
             # discarded part of the compound label contributed.
             thing = self.term_parents[informative[0].split('^')[1]]
             new_id = f'CTD:affects^{thing}'
             return self.normalize_predicate(LabeledID(identifier=new_id, label=new_id))
         return self.concept_model.standardize_relationship(LabeledID(identifier='CTD:interacts_with', label='interacts_with'))
     return self.concept_model.standardize_relationship(predicate)
Esempio n. 6
0
 def get_drug_from_adverse_events(self, input_node):
     """Find drugs whose adverse-event statistics implicate ``input_node``.

     For each MEDDRA labeled id on the node, the service is queried by outcome
     name.  An outcome is used only if its name matches exactly, it has more
     than 5 cases, and its ``prr_95_ci`` interval lies entirely above 1
     (predicate ``causes_or_contributes_to``) or entirely below 1
     (``prevents``).

     Returns a list of ``(edge, drug_node)`` tuples; each edge runs from the
     drug node to ``input_node``.
     """
     meddras = input_node.get_labeled_ids_by_prefix('MEDDRA')
     found = []
     for meddra in meddras:
         mname = meddra.label
         murl = f'{self.url}query?q=aeolus.outcomes.name:{mname}'
         for hit in self.page_calls(murl, 100):
             if 'aeolus' not in hit:
                 continue
             for outcome in hit['aeolus']['outcomes']:
                 # The query can return outcomes other than the one asked for.
                 if outcome['name'] != mname:
                     continue
                 print(outcome['name'], outcome['case_count'],
                       outcome['prr_95_ci'])
                 # Too few cases: not enough evidence either way.
                 if not outcome['case_count'] > 5:
                     continue
                 if min(outcome['prr_95_ci']) > 1:
                     predicate = LabeledID(
                         identifier="RO:0003302",
                         label="causes_or_contributes_to")
                 elif max(outcome['prr_95_ci']) < 1:
                     predicate = LabeledID(identifier="RO:0002599",
                                           label="prevents")
                 else:
                     # Interval straddles 1: no usable signal.
                     continue
                 drug_node = self.make_drug_node(hit)
                 if drug_node is None:
                     continue
                 stats = {
                     'prr': outcome['prr'],
                     'ror': outcome['ror'],
                     'case_count': outcome['case_count']
                 }
                 edge = self.create_edge(drug_node,
                                         input_node,
                                         'mychem.get_adverse_events',
                                         mname,
                                         predicate,
                                         url=murl,
                                         properties=stats)
                 found.append((edge, drug_node))
     return found
Esempio n. 7
0
 def anatomy_to_gene (self, anat):
     """Find genes connected to an anatomy node in the graph behind ``self.query``.

     Uses the first UBERON synonym of ``anat`` as the lookup key.  Returns a
     list of ``(edge, gene_node)`` tuples, with edges running from ``anat`` to
     the gene; returns an empty list when the node has no UBERON synonym.

     NOTE(review): nodes and relationships are paired positionally with
     ``zip``, which assumes ``self.query`` returns them in matching order.
     """
     anat_identifiers = list(anat.get_synonyms_by_prefix('UBERON'))
     if not anat_identifiers:
         # Nothing to look up; avoids an IndexError on the [0] below.
         return []
     anat_identifier = anat_identifiers[0]
     nodes,edges = self.query ( "MATCH (a:Anatomy)-[ar]-(g:Gene) WHERE a.identifier='{0}' RETURN a, ar, g ".format (anat_identifier),
                           labels=['Gene'], kinds=['node','relationship'])
     node_ids = [ LabeledID(identifier=f"NCBIGENE:{node['identifier']}", label=node['name']) for node in nodes ]
     edge_ids = [ edge['type'] for edge in edges ]
     results = []
     for node_id, predicate_label in zip(node_ids,edge_ids):
         predicate = LabeledID(identifier=f'hetio:{predicate_label}', label=predicate_label)
         gene = KNode(node_id.identifier, type=node_types.GENE, name=node_id.label)
         #These edges all go from anatomy to gene
         edge = self.create_edge(anat, gene,'hetio.anatomy_to_gene',anat_identifier,predicate)
         results.append((edge, gene))
     return results
Esempio n. 8
0
 def gene_to_anatomy (self, gene):
     """Find anatomy terms connected to a gene in the graph behind ``self.query``.

     Uses the first NCBIGENE synonym of ``gene`` (with the prefix stripped) as
     the lookup key; the query is capped at 200 rows.  Returns a list of
     ``(edge, anatomy_node)`` tuples, with edges running from the anatomy node
     to ``gene``; returns an empty list when the node has no NCBIGENE synonym.

     NOTE(review): nodes and relationships are paired positionally with
     ``zip``, which assumes ``self.query`` returns them in matching order.
     """
     gene_identifiers = list(gene.get_synonyms_by_prefix('NCBIGENE'))
     if not gene_identifiers:
         # Nothing to look up; avoids an IndexError on the [0] below.
         return []
     gene_identifier = Text.un_curie(gene_identifiers[0])
     nodes,edges = self.query ( "MATCH (a:Anatomy)-[ar]-(g:Gene) WHERE g.identifier={0} RETURN a, ar, g LIMIT 200".format (gene_identifier),
                           labels=['Anatomy'], kinds=['node','relationship'])
     node_ids = [ LabeledID(identifier=node['identifier'], label=node['name']) for node in nodes ]
     edge_ids = [ edge['type'] for edge in edges ]
     results = []
     for node_id, predicate_label in zip(node_ids,edge_ids):
         predicate = LabeledID(identifier=f'hetio:{predicate_label}', label=predicate_label)
         anatomy = KNode(node_id.identifier, type=node_types.ANATOMY, name=node_id.label)
         #These edges all go from anatomy to gene
         edge = self.create_edge(anatomy, gene,'hetio.gene_to_anatomy',gene_identifier,predicate)
         results.append((edge, anatomy))
     return results
 def drug_get_gene(self, subject):
     """ Get a gene from a drug.

     For every CHEMBL.COMPOUND synonym of ``subject``, the Pharos ligand record
     is fetched and each linked target is resolved to an HGNC id through
     ``self.target_to_hgnc``.  Identifiers whose response is not valid JSON are
     skipped.

     Returns a list of ``(edge, gene_node)`` tuples, with edges running from
     ``subject`` to the gene.
     """
     resolved_edge_nodes = []
     identifiers = subject.get_synonyms_by_prefix('CHEMBL.COMPOUND')
     for s in identifiers:
         pharosid = Text.un_curie(s)
         url = 'https://pharos.nih.gov/idg/api/v1/ligands(%s)?view=full' % pharosid
         r = requests.get(url)
         try:
             result = r.json()
         except ValueError:
             #Pharos returns a 404 if it doesn't recognize the identifier, which ends up producing
             # errors in turning into json. Skip to next identifier
             continue
         predicate = LabeledID(identifier='PHAROS:drug_targets', label='is_target')
         for link in result['links']:
             if link['kind'] != 'ix.idg.models.Target':
                 continue
             pharos_target_id = int(link['refid'])
             hgnc = self.target_to_hgnc(pharos_target_id)
             if hgnc is None:
                 logging.getLogger('application').warning('Did not get HGNC for pharosID %d', pharos_target_id)
                 continue
             hgnc_node = KNode(hgnc, type=node_types.GENE)
             edge = self.create_edge(subject,hgnc_node,'pharos.drug_get_gene',pharosid,predicate,url=url)
             resolved_edge_nodes.append((edge, hgnc_node))
     return resolved_edge_nodes
 def gene_get_drug(self, gene_node):
     """ Get a drug from a gene.

     For every UNIPROTKB synonym of ``gene_node``, the Pharos target record is
     fetched and each linked ligand is resolved through
     ``self.drugid_to_identifiers``.  This is best-effort: an identifier whose
     response is not valid JSON is skipped, and any other error for one
     identifier is logged at debug level without aborting the rest.

     Returns a list of ``(edge, drug_node)`` tuples, with edges running from
     the drug to ``gene_node``.
     """
     resolved_edge_nodes = []
     identifiers = gene_node.get_synonyms_by_prefix('UNIPROTKB')
     for s in identifiers:
         try:
             logger.debug(f'Call with {s}')
             pharosid = Text.un_curie(s)
             url = 'https://pharos.nih.gov/idg/api/v1/targets(%s)?view=full' % pharosid
             r = requests.get(url)
             try:
                 result = r.json()
             except ValueError:
                 #If pharos doesn't know the identifier, it just 404s.  move to the next
                 logger.debug('404')
                 continue
             else:
                 logger.debug('back')
             predicate = LabeledID(identifier='PHAROS:drug_targets', label='is_target')
             for link in result['links']:
                 if link['kind'] != 'ix.idg.models.Ligand':
                     continue
                 pharos_drug_id = link['refid']
                 chembl_id, label = self.drugid_to_identifiers(pharos_drug_id)
                 if chembl_id is not None:
                     drug_node = KNode(chembl_id, type=node_types.CHEMICAL_SUBSTANCE, name=label)
                     edge = self.create_edge(drug_node,gene_node, 'pharos.gene_get_drug',
                             pharosid,predicate, url=url)
                     resolved_edge_nodes.append( (edge,drug_node) )
         except Exception:
             # The identifier is passed as a %s argument; handing it over
             # without a placeholder makes the logging module report a
             # formatting error instead of the message.
             logger.debug("Error encountered calling pharos with %s", s, exc_info=True)
         logger.debug('ok')
     return resolved_edge_nodes
Esempio n. 11
0
 def graph_drugbank_to_uniprot(self, drugbank):
     """Find UniProt genes associated with a DrugBank drug via the triplestore.

     Runs the SPARQL template below with the drug id and builds one
     ``UNIPROT:`` gene node per returned ``uniprotGeneID`` (its last
     '/'-separated segment).

     Returns a list of ``(edge, gene_node)`` tuples, with edges running from
     ``drugbank`` to the gene.

     NOTE(review): the query input reads ``drugbank.identifier`` while the
     edge uses ``drugbank.id`` -- confirm both attributes exist on the node.
     """
     response = self.triplestore.query_template(
         inputs={"drugID": "DB{0}".format(Text.un_curie(drugbank.identifier))},
         outputs=["uniprotGeneID"],
         template_text="""
         prefix drugbank:      <http://chem2bio2rdf.org/drugbank/resource/>
         prefix drugbank_drug: <http://chem2bio2rdf.org/drugbank/resource/drugbank_drug/>
         prefix ctd:           <http://chem2bio2rdf.org/ctd/resource/>
         select distinct ?uniprotGeneID where {
            values ( ?drugID ) { ( drugbank_drug:${drugID} ) }
            ?dbInter     drugbank:GeneBank_ID        ?geneBankID ;
                         drugbank:gene               ?uniprotGeneID .
            ?drugID      drugbank:CID                ?pubchemCID ;
                         drugbank:Generic_Name       ?drugGenericName .
            ?ctd_disease ctd:diseaseid               ?diseaseID ;
                         ctd:cid                     ?pubchemCID .
         }""")
     predicate = LabeledID(identifier='SIO:001257',
                           label='chemical to gene association')
     results = []
     for r in response:
         uniprot_id = r['uniprotGeneID'].split('/')[-1]
         node = KNode("UNIPROT:{0}".format(uniprot_id),
                      type=node_types.GENE)
         # Positional order is (source, target, provided_by, input_id,
         # predicate); the input id and predicate were previously swapped.
         edge = self.create_edge(drugbank, node,
                                 'chembio.graph_drugbank_to_uniprot',
                                 drugbank.id, predicate)
         # list.append takes a single argument, so the pair must be a tuple.
         results.append((edge, node))
     return results
    def get_edges_from_file(self, file_name, provided_by, delimiter):
        """
        Yield one edge per row of a delimited edge file.

        This is a stop-gap until kgx can merge edges: each row is turned into
        an edge shaped like the ones the other services produce, and the
        writer deals with it from there.

        Each row must provide the columns ``edge_label``, ``relation``,
        ``subject`` and ``object``.  The predicate identifier is the row's
        ``relation``; its label is the part of ``edge_label`` after the last
        ':'.  The same predicate is also stored as the edge's
        ``standard_predicate``.

        :param file_name: path of the file to read; a falsy value yields nothing
        :param provided_by: provenance string passed to ``create_edge``
        :param delimiter: column delimiter passed to ``csv.DictReader``
        :return: generator of edges
        """
        if not file_name:
            return

        # newline='' leaves line-ending handling to the csv module, as its
        # documentation requires for fields with embedded newlines.
        with open(file_name, newline='') as edge_file:
            reader = csv.DictReader(edge_file, delimiter=delimiter)
            for raw_edge in reader:
                edge_label = raw_edge['edge_label'].split(':')[-1]
                predicate = LabeledID(identifier=raw_edge['relation'],
                                      label=edge_label)
                source_node = KNode(raw_edge['subject'])
                target_node = KNode(raw_edge['object'])
                edge = self.create_edge(
                    source_node=source_node,
                    target_node=target_node,
                    input_id=source_node.id,
                    provided_by=provided_by,
                    predicate=predicate,
                )
                edge.standard_predicate = predicate
                yield edge
Esempio n. 13
0
 def disease_get_gene(self, disease_node):
     """ Get a gene from a pharos disease id.

     Each supported synonym prefix on ``disease_node`` is translated into the
     id spellings used by the Pharos disease table, and every spelling is
     looked up through ``self.db``.  Each HGNC id is reported once, however
     many disease ids lead to it.

     Returns a list of ``(edge, gene_node)`` tuples, with edges running from
     the gene to ``disease_node``.
     """
     resolved_edge_nodes = []
     hgncs = set()
     # WD:P2293 gene assoc with condition.
     # domain is gene and range is disease or phenotype for this relationship
     predicate = LabeledID(identifier='WD:P2293', label='gene_involved')
     #Pharos contains multiple kinds of disease identifiers in its disease table:
     # For OMIM identifiers, they can have either prefix OMIM or MIM
     # UMLS doesn't have any prefixes.... :(
     pharos_predicates = {'DOID':('DOID',),'UMLS':(None,),'MESH':('MESH',),'OMIM':('OMIM','MIM'),'ORPHANET':('Orphanet',)}
     # The disease id is bound as a parameter rather than formatted into the SQL.
     query = ("select distinct x.value, p.sym  from disease d join xref x on x.protein_id = d.target_id "
              "join protein p on d.target_id = p.id where x.xtype = 'HGNC' and d.dtype <> 'Expression Atlas' and d.did=%s;")
     for ppred,dbpreds in pharos_predicates.items():
         pharos_candidates = [Text.un_curie(x) for x in disease_node.get_synonyms_by_prefix(ppred)]
         for dbpred in dbpreds:
             if dbpred is None:
                 pharos_ids = pharos_candidates
             else:
                 pharos_ids = [f'{dbpred}:{x}' for x in pharos_candidates]
             # This loop must run for both branches above; when it lived inside
             # the else, un-prefixed (UMLS) ids were never queried.
             for pharos_id in pharos_ids:
                 cursor = self.db.cursor(dictionary = True, buffered = True)
                 try:
                     cursor.execute(query, (pharos_id,))
                     for result in cursor:
                         label = result['sym']
                         hgnc = result['value']
                         if hgnc in hgncs:
                             continue
                         hgncs.add(hgnc)
                         gene_node = KNode(hgnc, type=node_types.GENE, name=label)
                         edge = self.create_edge(gene_node, disease_node, 'pharos.disease_get_gene', pharos_id, predicate)
                         resolved_edge_nodes.append((edge, gene_node))
                 finally:
                     cursor.close()
     return resolved_edge_nodes
Esempio n. 14
0
 def disease_to_phenotype (self, disease):
     """Find symptoms connected to a disease in the graph behind ``self.query``.

     Uses the first DOID synonym of ``disease`` as the lookup key; a node
     without one yields an empty list.  Returns a list of
     ``(edge, phenotype_node)`` tuples, with edges running from ``disease`` to
     a ``MESH:``-prefixed phenotype node.
     """
     doids = list(disease.get_synonyms_by_prefix('DOID'))
     if not doids:
         return []
     disease_identifier = doids[0]
     cypher = """MATCH (d:Disease{identifier:'%s'})-[r]-(s:Symptom) RETURN d,r,s""" % (disease_identifier)
     nodes,edges = self.query (cypher, labels=['Symptom'], kinds=['node','relationship'])
     symptom_ids = [ LabeledID(identifier=f"MESH:{node['identifier']}", label=node['name']) for node in nodes ]
     edge_types = [ edge['type'] for edge in edges ]
     pairs = []
     # Nodes and relationship types are matched up by position.
     for symptom_id, edge_type in zip(symptom_ids, edge_types):
         predicate = LabeledID(identifier=f'hetio:{edge_type}', label=edge_type)
         phenotype = KNode(symptom_id.identifier, type=node_types.PHENOTYPE, name=symptom_id.label)
         edge = self.create_edge(disease, phenotype, 'hetio.disease_to_phenotype', disease_identifier, predicate)
         pairs.append( (edge, phenotype) )
     return pairs
Esempio n. 15
0
 def term_get_ancestors(self, node_type, root_iri):
     """Collect child -> ancestor 'subclass of' edges from the triplestore.

     Runs ``self.query`` with ``root_iri`` as ``root_uri`` and builds a child
     node and an ancestor node (both typed ``node_type``) from every row.
     Rows where child and ancestor share an id are skipped.

     Returns a ``(nodes, edges)`` pair of sets.
     """
     rows = self.triplestore.query_template(
         template_text=self.query,
         inputs={'root_uri': root_iri},
         outputs=['parent_id', 'parent_label', 'child_id', 'child_label'])
     print('found total ', len(rows), ' results.')
     nodes, edges = set(), set()
     for row in rows:
         # Output type would be same as input type?
         ancestor_node = KNode(Text.obo_to_curie(row['parent_id']),
                               name=row['parent_label'],
                               type=node_type)
         child_node = KNode(Text.obo_to_curie(row['child_id']),
                            name=row['child_label'],
                            type=node_type)
         if ancestor_node.id != child_node.id:
             # Only distinct pairs get an edge; a term is never linked to itself.
             predicate = LabeledID(identifier='rdfs:subClassOf',
                                   label='subclass of')
             edge = self.create_edge(
                 source_node=child_node,
                 target_node=ancestor_node,
                 predicate=predicate,
                 provided_by='uberongraph.term_get_ancestors',
                 input_id=child_node.id)
             nodes.add(child_node)
             nodes.add(ancestor_node)
             edges.add(edge)
     return nodes, edges
Esempio n. 16
0
 def gene_to_drug_expanded(self, gene_node):
     """Fetch chemical-gene interactions for a gene from the expanded CTD table.

     Each NCBIGENE synonym of ``gene_node`` is queried; rows rejected by
     ``check_expanded_gene_chemical_row`` are ignored.  The row's
     ``direction`` decides which way the edge points: '->' means drug to gene,
     anything else gene to drug.  Per identifier, only the first edge for each
     (drug id, original predicate label) pair is kept.

     Returns a list of ``(edge, drug_node)`` tuples.
     """
     output = []
     for identifier in gene_node.get_synonyms_by_prefix('NCBIGENE'):
         seen = set()
         geneid = Text.un_curie(identifier)
         url = f"{self.url}CTD_chem_gene_expanded_geneID/ncbigene:{geneid}/"
         rows = requests.get (url).json ()
         for r in rows:
             good_row, predicate_label, props, pmids = self.check_expanded_gene_chemical_row(r)
             if not good_row:
                 continue
             predicate = self.normalize_predicate(
                 LabeledID(identifier=f"CTD:{Text.snakify(predicate_label)}", label=predicate_label)
             )
             #Should this be substance?
             drug_node = KNode(Text.upper_curie(r['chemicalID']), type=node_types.CHEMICAL_SUBSTANCE, name=r['chem_label'])
             if r['direction'] == '->':
                 source, target = drug_node, gene_node
             else:
                 source, target = gene_node, drug_node
             edge = self.create_edge(source,target,'ctd.gene_to_drug_expanded',identifier,predicate,properties = props,url=url,publications=pmids)
             #Ideally the key would use edge.standard_predicate.label, but right now
             # there's not enough real specificity on the standardized predicates.
             key = (drug_node.id, edge.original_predicate.label)
             if key in seen:
                 continue
             output.append( (edge,drug_node) )
             seen.add(key)
     return output
Esempio n. 17
0
 def disease_to_exposure(self, disease_node):
     """Fetch CTD exposure events for a disease.

     Each MESH synonym of ``disease_node`` is queried.  Rows whose
     ``outcomerelationship`` is 'no correlation' are ignored.  Per identifier,
     only the first edge for each (stressor id, standard predicate) pair is
     kept.

     Returns a list of ``(edge, stressor_node)`` tuples, with edges running
     from the exposure stressor to ``disease_node``.
     """
     logger.info("disease-to-exposure")
     output = []
     for identifier in disease_node.get_synonyms_by_prefix('MESH'):
         seen = set()
         url = f"{self.url}CTD_exposure_events_diseaseid/{Text.un_curie(identifier)}/"
         rows = requests.get (url).json ()
         logger.info(url)
         logger.info(len(rows))
         for r in rows:
             predicate_label = r['outcomerelationship']
             if predicate_label == 'no correlation':
                 continue
             # The predicate id is the label with all whitespace removed.
             compact_label = ''.join(predicate_label.split())
             predicate = self.normalize_predicate(
                 LabeledID(identifier=f"CTD:{compact_label}", label=predicate_label)
             )
             #Should this be substance?
             drug_node = KNode(f"MESH:{r['exposurestressorid']}", type=node_types.CHEMICAL_SUBSTANCE, name=r['exposurestressorname'])
             edge = self.create_edge(drug_node,disease_node,'ctd.disease_to_exposure',identifier,predicate,
                                     publications=[f"PMID:{r['reference']}"],url=url)
             key = (drug_node.id, edge.standard_predicate)
             if key in seen:
                 continue
             output.append( (edge,drug_node) )
             seen.add(key)
     return output
Esempio n. 18
0
 def gene_to_drug(self, gene_node):
     """Fetch chemical-gene interactions for a gene from CTD.

     Each NCBIGENE synonym of ``gene_node`` is queried.  Rows for a different
     ``GeneID`` or rejected by ``check_gene_chemical_row`` are ignored.  The
     edge points from gene to drug when the normalized predicate identifier
     contains any of ``self.g2d_strings``, and from drug to gene otherwise.
     Per identifier, only the first edge for each (drug id, original predicate
     label) pair is kept.

     Returns a list of ``(edge, drug_node)`` tuples.
     """
     output = []
     for identifier in gene_node.get_synonyms_by_prefix('NCBIGENE'):
         seen = set()
         geneid = Text.un_curie(identifier)
         url = f"{self.url}/CTD_chem_gene_ixns_GeneID/{geneid}/"
         rows = requests.get (url).json ()
         for r in rows:
             if r['GeneID'] != geneid:
                 continue
             good_row, predicate_label, props = self.check_gene_chemical_row(r)
             if not good_row:
                 continue
             predicate = self.normalize_predicate(
                 LabeledID(identifier=f'CTD:{predicate_label}', label=predicate_label)
             )
             #Should this be substance?
             drug_node = KNode(f"MESH:{r['ChemicalID']}", type=node_types.CHEMICAL_SUBSTANCE, name=f"{r['ChemicalName']}")
             gene_acts_on_drug = any([s in predicate.identifier for s in self.g2d_strings])
             if gene_acts_on_drug:
                 source, target = gene_node, drug_node
             else:
                 source, target = drug_node, gene_node
             pubs = [f"PMID:{x}" for x in r['PubMedIDs'].split('|') ]
             edge = self.create_edge(source,target,'ctd.gene_to_drug',identifier,predicate,
                                     publications=pubs,url=url,properties=props)
             #Ideally the key would use edge.standard_predicate.label, but right now
             # there's not enough real specificity on the standardized predicates.
             key = (drug_node.id, edge.original_predicate.label)
             if key in seen:
                 continue
             output.append( (edge,drug_node) )
             seen.add(key)
     return output
Esempio n. 19
0
 def drug_to_gene_expanded(self, drug):
     """Fetch chemical-gene interactions for a drug from the expanded CTD table.

     Each MESH synonym of ``drug`` is queried; rows rejected by
     ``check_expanded_gene_chemical_row`` are ignored.  The row's
     ``direction`` decides which way the edge points: '->' means drug to gene,
     anything else gene to drug.  No de-duplication is applied.

     Returns a list of ``(edge, gene_node)`` tuples.
     """
     output = []
     for identifier in drug.get_synonyms_by_prefix('MESH'):
         url=f"{self.url}CTD_chem_gene_expanded_chemicalID/mesh:{Text.un_curie(identifier)}/"
         rows = requests.get(url).json()
         for r in rows:
             good_row, predicate_label, props, pmids = self.check_expanded_gene_chemical_row(r)
             if not good_row:
                 continue
             predicate = self.normalize_predicate(
                 LabeledID(identifier=f"CTD:{Text.snakify(predicate_label)}", label=predicate_label)
             )
             gene_node = KNode(Text.upper_curie(r['geneID']), name=r['gene_label'],type=node_types.GENE)
             if r['direction'] == '->':
                 source, target = drug, gene_node
             else:
                 source, target = gene_node, drug
             edge = self.create_edge(source,target,'ctd.drug_to_gene_expanded',identifier,predicate,publications=pmids,properties=props,url=url )
             output.append( (edge,gene_node) )
     return output
    def get_pathway_by_gene_family(self, gene_family_node):
        """Link a gene family node to the pathways listed in its rows.

        Rows are fetched with ``get_rows_using_curie``.  Every non-empty entry
        of a row's ``pathway`` field is split on '>' into a pathway part and a
        component part; the pathway part is then split on '#' into name and
        accession.  The component part is not used.

        Returns a list of ``(edge, pathway_node)`` tuples, with edges running
        from ``gene_family_node`` to a ``PANTHER.PATHWAY:`` node.
        """
        results = []
        predicate = LabeledID('BFO:0000054', 'related_to')
        for gene_family_data in self.get_rows_using_curie(gene_family_node.id):
            entries = [entry for entry in gene_family_data['pathway'] if entry != '']
            for entry in entries:
                pathway_raw, _component_raw = self.split_with(entry,
                                                              splitter='>')
                pathway = self.split_with(
                    pathway_raw,
                    splitter='#',
                    keys=['pathway_name', 'pathway_access'])
                pathway_node = KNode(
                    f"PANTHER.PATHWAY:{pathway['pathway_access']}",
                    type=node_types.PATHWAY,
                    name=pathway['pathway_name'])
                edge = self.create_edge(gene_family_node, pathway_node,
                                        'panther.get_pathway_by_gene_family',
                                        gene_family_node.id, predicate)
                results.append((edge, pathway_node))
        return results
 def disease_get_gene(self, subject):
     """ Get a gene from a pharos disease id.

     For every DOID synonym of ``subject``, the Pharos disease record is
     fetched and each linked target is resolved to an HGNC id through
     ``self.target_to_hgnc``.  Identifiers whose response is not valid JSON
     are skipped.

     Returns a list of ``(edge, gene_node)`` tuples covering all DOID
     synonyms, with edges running from ``subject`` to the gene; the list is
     empty when there are no DOID synonyms.
     """
     pharos_ids = subject.get_synonyms_by_prefix('DOID')
     resolved_edge_nodes = []
     for pharosid in pharos_ids:
         logging.getLogger('application').debug("Identifier:" + subject.id)
         url='https://pharos.nih.gov/idg/api/v1/diseases/%s?view=full' % pharosid
         logger.info(url)
         r = requests.get(url)
         try:
             result = r.json()
         except ValueError:
             # Same handling as the other Pharos lookups: an unrecognized id
             # comes back as a non-JSON 404 page, so move on to the next one.
             continue
         predicate=LabeledID(identifier='PHAROS:gene_involved', label='gene_involved')
         for link in result['links']:
             if link['kind'] != 'ix.idg.models.Target':
                 continue
             pharos_target_id = int(link['refid'])
             logger.info(f"Pharos ID: {pharos_target_id}")
             hgnc = self.target_to_hgnc(pharos_target_id)
             if hgnc is None:
                 logging.getLogger('application').warning('Did not get HGNC for pharosID %d', pharos_target_id)
                 continue
             hgnc_node = KNode(hgnc, type=node_types.GENE)
             edge = self.create_edge(subject,hgnc_node,'pharos.disease_get_gene',pharosid,predicate,url=url)
             resolved_edge_nodes.append((edge, hgnc_node))
             logger.info(f" HGNC ID: {hgnc}")
     # Return only after every identifier has been processed; the return used
     # to sit inside the loop, which stopped after the first id and produced
     # None when there were no ids at all.
     return resolved_edge_nodes
Esempio n. 22
0
    def disease_or_phenotypic_feature_to_sequence_variant(
            self, phenotype_node):
        """Find sequence variants associated with a disease or phenotype.

        The first non-empty set of synonyms among the prefixes EFO, ORPHANET
        and HP is used to query the ``efoTraits`` endpoint under ``self.url``.
        Every SNP of every returned association becomes a ``DBSNP`` variant
        node linked to ``phenotype_node`` by a ``related_to`` edge. The edge
        carries the association p-value when it is numeric and the PubMed id
        when ``self.get_pubmed_id_by_association`` returns one.

        :param phenotype_node: node providing ``id`` and
            ``get_synonyms_by_prefix``.
        :return: list of ``(edge, variant_node)`` tuples; empty when nothing
            matched.
        """
        ## TODO this could support Orphanet etc
        return_results = []

        # (synonym prefix on the node, prefix used in the query URL), tried in
        # order until one of them yields identifiers.
        prefix_candidates = (('EFO', 'EFO'), ('ORPHANET', 'Orphanet'), ('HP', 'HP'))
        trait_ids = []
        trait_prefix = 'EFO'
        for synonym_prefix, trait_prefix in prefix_candidates:
            trait_ids = phenotype_node.get_synonyms_by_prefix(synonym_prefix)
            if trait_ids:
                break

        for trait_id in trait_ids:
            query_url = f'{self.url}efoTraits/{trait_prefix}_{Text.un_curie(trait_id)}/associations?projection=associationByEfoTrait'
            query_json = self.query_service(query_url)
            if not query_json:
                continue
            try:
                for association in query_json['_embedded']['associations']:
                    variant_nodes = [
                        KNode(f"DBSNP:{snp['rsId']}",
                              name=f"{snp['rsId']}",
                              type=node_types.SEQUENCE_VARIANT)
                        for snp in association['snps']
                    ]
                    if not variant_nodes:
                        continue

                    props = {}
                    try:
                        props['pvalue'] = float(association['pvalue'])
                    except (TypeError, ValueError):
                        # A null (TypeError) or non-numeric (ValueError)
                        # p-value is simply left out of the edge properties.
                        pass

                    pubs = []
                    association_id = association['_links']['self'][
                        'href'].rsplit('/', 1)[1]
                    pubmed_id = self.get_pubmed_id_by_association(
                        association_id)
                    if pubmed_id:
                        pubs.append(f'PMID:{pubmed_id}')

                    predicate = LabeledID(identifier='RO:0002609',
                                          label='related_to')
                    for new_node in variant_nodes:
                        edge = self.create_edge(
                            phenotype_node,
                            new_node,
                            'gwascatalog.disease_or_phenotypic_feature_to_sequence_variant',
                            phenotype_node.id,
                            predicate,
                            url=query_url,
                            properties=props,
                            publications=pubs)
                        return_results.append((edge, new_node))

            except (KeyError, IndexError) as e:
                # A malformed payload stops processing of this trait only;
                # results gathered so far are kept.
                logger.warning(
                    f'problem parsing results from GWASCatalog: {e}')

        return return_results
Esempio n. 23
0
 def graph_get_pathways_by_gene(self, gene):  #reasoner
     """Return (edge, pathway node) pairs for the KEGG pathways of a gene.

     The part of ``gene.id`` after the first ':' is upper-cased and bound to
     ``$gene`` in a SPARQL template run through ``self.triplestore``. Each
     returned ``keggPath`` value is reduced to its last '/'-separated segment
     to form a ``KEGG:`` pathway node, linked to ``gene`` by a
     'participates_in' edge.

     :param gene: gene node providing ``id`` in ``PREFIX:LOCAL`` form.
     :return: list of ``(edge, pathway_node)`` tuples.
     """
     gene_symbol = gene.id.split(':')[1].upper()
     sparql = """
         prefix kegg:      <http://chem2bio2rdf.org/kegg/resource/>
         prefix drugbank:  <http://chem2bio2rdf.org/drugbank/resource/>
         prefix uniprot:   <http://chem2bio2rdf.org/uniprot/resource/gene/>
         prefix ctd:       <http://chem2bio2rdf.org/ctd/resource/>
         prefix mesh:      <http://bio2rdf.org/mesh:>
         select ?drugGenericName ?uniprotGeneID ?pathwayName ?keggPath where {
            ?keggPath    kegg:protein                ?swissProtID ;
                         kegg:Pathway_name           ?pathwayName .
            ?keggInter   kegg:cid                    ?pubchemCID .
            ?dbInter     drugbank:GeneBank_ID        ?geneBankID ;
                         drugbank:SwissProt_ID       ?swissProtID ;
                         drugbank:gene               ?uniprotGeneID .
            ?drugID      drugbank:CID                ?pubchemCID ;
                         drugbank:Generic_Name       ?drugGenericName .
            ?ctd_disease ctd:diseaseid               ?diseaseID ;
                         ctd:cid                     ?pubchemCID .
            values ( ?uniprotGeneID ) {
               ( uniprot:$gene )
            }
         } LIMIT 2000"""
     rows = self.triplestore.query_template(
         inputs={"gene": gene_symbol},
         outputs=['keggPath'],
         template_text=sparql)

     participates_in = LabeledID(identifier='RO:0000056', label='participates_in')
     pairs = []
     for row in rows:
         # Only the trailing path segment of the resource URI is the KEGG id.
         kegg_id = row['keggPath'].split('/')[-1]
         pathway = KNode("KEGG:{0}".format(kegg_id), type=node_types.PATHWAY)
         link = self.create_edge(gene, pathway,
                                 'chembio.graph_get_pathways_by_gene',
                                 gene.id, participates_in)
         pairs.append((link, pathway))
     return pairs
    def get_biological_process_or_activity_by_gene_family(
            self, gene_family_node):
        """
        Build biological process/activity nodes linked to a gene family.

        For every row returned by ``self.get_rows_using_curie`` the non-empty
        entries of 'panther_molecular_func' followed by those of
        'panther_biological_process' are split on '#' into a label and an
        identifier. Each becomes a node connected to ``gene_family_node`` by a
        'participates_in' edge.

        :param gene_family_node: gene family node providing ``id``.
        :return: list of ``(edge, node)`` tuples.
        """
        # @TODO make sensible edge here
        participates_in = LabeledID('RO:0000056', 'participates_in')
        pairs = []
        for record in self.get_rows_using_curie(gene_family_node.id):
            annotations = [
                entry for entry in record['panther_molecular_func']
                if entry != ''
            ]
            annotations += [
                entry for entry in record['panther_biological_process']
                if entry != ''
            ]
            for annotation in annotations:
                # Entries are expected as "<label>#<identifier>".
                term_label, term_id = annotation.split('#')
                term_node = KNode(
                    term_id,
                    type=node_types.BIOLOGICAL_PROCESS_OR_ACTIVITY,
                    name=term_label)
                link = self.create_edge(
                    gene_family_node, term_node,
                    'panther.get_biological_process_or_activity_by_gene_family',
                    gene_family_node.id, participates_in)
                pairs.append((link, term_node))
        return pairs
Esempio n. 25
0
    def process_variant_to_gene_relationships(self, variant_nodes: list, writer: WriterDelegator):
        """Write gene nodes and variant-to-gene edges for the given variants.

        Results come from ``self.genetics_services.get_variant_to_gene`` as a
        mapping of source variant id to ``(edge, node)`` pairs. Each pair is
        rebuilt as a ``KNode``/``KEdge`` and handed to ``writer``. Gene nodes
        are written once per id (tracked in ``self.written_genes``). When
        ``self.recreate_sv_node`` is set, the variant node is written as well.

        :param variant_nodes: variants passed through to the genetics service.
        :param writer: sink exposing ``write_node`` and ``write_edge``.
        """
        results_by_variant = self.genetics_services.get_variant_to_gene(self.crawl_for_service, variant_nodes)
        for variant_id, pairs in results_by_variant.items():
            # convert the simple edges and nodes to rags objects and write them to the graph
            for simple_edge, simple_node in pairs:
                gene = KNode(simple_node.id,
                             type=simple_node.type,
                             name=simple_node.name,
                             properties=simple_node.properties)

                if self.recreate_sv_node:
                    variant = KNode(variant_id, type=node_types.SEQUENCE_VARIANT)
                    variant.add_export_labels([node_types.SEQUENCE_VARIANT])
                    writer.write_node(variant)

                if gene.id not in self.written_genes:
                    writer.write_node(gene)
                    self.written_genes.add(gene.id)

                original_predicate = LabeledID(identifier=simple_edge.predicate_id,
                                               label=simple_edge.predicate_label)
                writer.write_edge(
                    KEdge(source_id=variant_id,
                          target_id=gene.id,
                          provided_by=simple_edge.provided_by,
                          ctime=simple_edge.ctime,
                          original_predicate=original_predicate,
                          # standard_predicate=predicate,
                          input_id=simple_edge.input_id,
                          properties=simple_edge.properties))
            logger.info(f'added {len(pairs)} variant relationships for {variant_id}')
Esempio n. 26
0
 def graph_diseaseid_to_uniprot(self, drugbank):
     """Return (edge, gene node) pairs for genes tied to a disease id.

     The lower-cased ``drugbank.id`` is printed, then bound to ``$diseaseID``
     in a SPARQL template run through ``self.triplestore``. Each returned
     ``uniprotGeneID`` is reduced to its last '/'-separated segment to form a
     ``UNIPROT:`` gene node, linked to the input node by a
     'disease to gene association' edge.

     :param drugbank: input node providing ``id``.
     :return: list of ``(edge, gene_node)`` tuples.
     """
     disease_id = drugbank.id.lower()
     print(disease_id)
     sparql = """
         prefix drugbank:      <http://chem2bio2rdf.org/drugbank/resource/>
         prefix drugbank_drug: <http://chem2bio2rdf.org/drugbank/resource/drugbank_drug/>
         prefix ctd:           <http://chem2bio2rdf.org/ctd/resource/>
         prefix mesh.disease:          <http://bio2rdf.org/mesh:> 
         select distinct ?uniprotGeneID where {
            values ( ?diseaseID ) { ( $diseaseID ) }
            ?dbInter     drugbank:gene               ?uniprotGeneID .
            ?drugID      drugbank:CID                ?pubchemCID.
            ?ctd_disease ctd:diseaseid               ?diseaseID ;
                         ctd:cid                     ?pubchemCID .
         }"""
     rows = self.triplestore.query_template(
         inputs={"diseaseID": disease_id},
         outputs=["uniprotGeneID"],
         template_text=sparql)

     association = LabeledID(identifier='NCIT:R176',
                             label='disease to gene association')
     pairs = []
     for row in rows:
         # Only the trailing path segment of the resource URI is the gene id.
         uniprot_id = row['uniprotGeneID'].split('/')[-1]
         gene = KNode("UNIPROT:{0}".format(uniprot_id), type=node_types.GENE)
         link = self.create_edge(drugbank, gene,
                                 'chembio.graph_diseaseid_to_uniprot',
                                 drugbank.id, association)
         pairs.append((link, gene))
     return pairs
 def parse_edges(self, provided_by, limit=0):
     """Lazily build edges from the tab-separated ``edges.txt`` file.

     The file is read from ``self.cord_dir``. Each row's 'Term1' and 'Term2'
     columns become the source and target nodes of a 'related_to' edge whose
     properties hold the 'Effective_Pubs' and 'Enrichment_p' columns as
     floats.

     :param provided_by: provenance value stored on every edge; required.
     :param limit: maximum number of edges to yield; a falsy value means no
         limit.
     :raises RuntimeError: if ``provided_by`` is falsy (raised when iteration
         starts, since this is a generator).
     :return: generator of ``(zero_based_row_index, edge)`` tuples.
     """
     if not provided_by:
         raise RuntimeError(
             'Error edge property provided by is not specified')
     edges_path = os.path.join(self.cord_dir, 'edges.txt')
     with open(edges_path) as edges_file:
         rows = csv.DictReader(edges_file, delimiter='\t')
         for index, row in enumerate(rows):
             related_to = LabeledID(identifier='SEMMEDDB:ASSOCIATED_WITH',
                                    label='related_to')
             subject_node = KNode(row['Term1'])
             object_node = KNode(row['Term2'])
             edge_properties = {
                 'num_publications': float(row['Effective_Pubs']),
                 'enrichment_p': float(row['Enrichment_p']),
             }
             edge = self.create_edge(source_node=subject_node,
                                     target_node=object_node,
                                     input_id=row['Term1'],
                                     provided_by=provided_by,
                                     predicate=related_to,
                                     publications=[],
                                     properties=edge_properties)
             edge.standard_predicate = related_to
             # Stop once more than `limit` rows have been consumed.
             if limit and index + 1 > limit:
                 break
             yield index, edge
    def get_gene_family_by_gene_family(self, gene_family_node):
        """
        Create related gene family nodes for a gene family node.

        The node id is split into a family id and an optional sub-family id
        by ``self.get_family_sub_family_ids_from_curie``.

        * Family (no sub-family id): one node per sub-family listed in
          ``self.gene_family_data``, each linked to the family with a
          'part of' edge whose subject is the sub-family.
        * Sub-family: a single node for its parent family, linked with a
          'part of' edge whose subject is ``gene_family_node``.

        :param gene_family_node: gene family node providing ``id``.
        :return: list of ``(edge, gene_family_node)`` tuples.
        """
        fam_id, sub_fam_id = self.get_family_sub_family_ids_from_curie(
            gene_family_node.id)

        predicate = LabeledID('BFO:0000050', 'part of')
        family_entry = self.gene_family_data[fam_id]

        if sub_fam_id is None:
            # we are looking for subfamilies; every key except the family's
            # own name is a sub-family id
            response = []
            for sub_id in family_entry:
                if sub_id == 'family_name':
                    continue
                sub_family_node = self.__create_gene_family_node(
                    f'{fam_id}:{sub_id}',
                    family_entry[sub_id]['sub_family_name'])
                edge = self.create_edge(
                    sub_family_node, gene_family_node,
                    'panther.get_gene_family_by_gene_family',
                    sub_family_node.id, predicate)
                response.append((edge, sub_family_node))
            return response

        # else we are a sub family
        family_node = self.__create_gene_family_node(
            fam_id, family_entry['family_name'])
        edge = self.create_edge(gene_family_node, family_node,
                                'panther.get_gene_family_by_gene_family',
                                gene_family_node.id, predicate)
        return [(edge, family_node)]
Esempio n. 29
0
 def gene_to_tissues(self, drug):
     """Return (edge, anatomy node) pairs for tissues correlated with a gene.

     For every 'NCBIGENE' synonym of the input node, the
     ``RNAseqDB_bicluster_gene_to_tissue_gene`` endpoint under ``self.url``
     is queried. Each returned row with a non-empty 'col_enrich_UBERON'
     value becomes an anatomical entity node linked to the input node by a
     'correlated with' edge.

     :param drug: input node providing ``get_synonyms_by_prefix``.
         NOTE(review): the parameter name looks like a copy-paste leftover;
         the NCBIGENE lookup suggests this is a gene node -- confirm.
     :return: list of ``(edge, anatomy_node)`` tuples; empty when the node
         has no NCBIGENE synonym.
     """
     output = []
     for identifier in drug.get_synonyms_by_prefix('NCBIGENE'):
         url = f"{self.url}/RNAseqDB_bicluster_gene_to_tissue_gene/ncbigene:{Text.un_curie(identifier)}/"
         for r in requests.get(url).json():
             anatomy_id = r['col_enrich_UBERON']
             if anatomy_id == '':
                 continue
             predicate = LabeledID(identifier='RO:0002610',
                                   label='correlated with')
             anat_node = KNode(anatomy_id,
                               type=node_types.ANATOMICAL_ENTITY)
             # Attach a publication only when the row actually carries one.
             pubmed_ids = r.get('PubMedIDs')
             publications = [f"PMID:{pubmed_ids}"] if pubmed_ids else []
             # The previous body referenced the undefined names `gene_node`
             # and `props` and never used the anatomy node it had just
             # built. The predicate is constant here, so the edge always
             # runs from the input node to the anatomy node.
             # NOTE(review): the provenance string 'ctd.drug_to_gene' is
             # kept unchanged but looks copied from another service --
             # confirm the intended value.
             edge = self.create_edge(
                 drug,
                 anat_node,
                 'ctd.drug_to_gene',
                 identifier,
                 predicate,
                 publications=publications,
                 url=url)
             output.append((edge, anat_node))
     return output
Esempio n. 30
0
    def create_phenotype_to_variant_components(self,
                                               query_url,
                                               phenotype_node,
                                               variant_id,
                                               variant_label,
                                               pubmed_id=None,
                                               properties=None):
        """Build the edge and variant node for one phenotype/variant pair.

        :param query_url: URL recorded on the edge as its source query.
        :param phenotype_node: subject node of the edge; its ``id`` is used
            as the edge's input id.
        :param variant_id: identifier of the sequence variant node to create.
        :param variant_label: name given to the variant node.
        :param pubmed_id: optional PubMed id; when truthy it is attached to
            the edge as ``PMID:<id>``.
        :param properties: optional dict of edge properties; a new empty
            dict is used when omitted.
        :return: ``(edge, variant_node)`` tuple.
        """
        # A `{}` default would be a single dict shared by every call (and by
        # every edge that stores it); create a fresh one per call instead.
        if properties is None:
            properties = {}

        variant_node = KNode(variant_id,
                             name=variant_label,
                             type=node_types.SEQUENCE_VARIANT)

        pubs = [f'PMID:{pubmed_id}'] if pubmed_id else []

        predicate = LabeledID(identifier='RO:0002609', label='related_to')
        edge = self.create_edge(
            phenotype_node,
            variant_node,
            'gwascatalog.disease_or_phenotypic_feature_to_sequence_variant',
            phenotype_node.id,
            predicate,
            url=query_url,
            properties=properties,
            publications=pubs)
        return (edge, variant_node)