def get_or_create_metabolite(self, row, metabolites, metabolite_list):
     """Return the metabolite element for ``row``, creating it on first use.

     ``metabolites`` is the cache keyed by the bare ChEMBL id taken from
     ``row['metabolite_chembl_id']``. A newly created element is stored in
     that cache and appended to ``metabolite_list``; a cached element is
     returned untouched.
     """
     chembl_id = row['metabolite_chembl_id']
     if chembl_id in metabolites:
         return metabolites[chembl_id]

     primary_name = row['metabolite_name']
     names = Names(name=primary_name, synonyms=[], source=SOURCE)
     preferred_name = row['metabolite_pref_name']
     # The preferred name is only a synonym when it differs from the name.
     if preferred_name is not None and preferred_name != primary_name:
         names.synonyms.append(preferred_name)

     curie = CHEMBL + chembl_id
     metabolite = Element(
         id=curie,
         biolink_class=CHEMICAL_SUBSTANCE,
         identifiers={'chembl': curie},
         names_synonyms=[names],
         connections=[],
         attributes=[],
     )

     # Copy whichever structure columns are populated; a single
     # 'structure source' attribute is added if at least one was found.
     structure_keys = [
         key for key in ('inchi', 'inchikey', 'smiles')
         if row[key] is not None
     ]
     for key in structure_keys:
         metabolite.identifiers[key] = row[key]
     if structure_keys:
         metabolite.attributes.append(
             Attribute(name='structure source',
                       value=SOURCE,
                       source=self.info.name))

     metabolites[chembl_id] = metabolite
     metabolite_list.append(metabolite)
     return metabolite
    def add_element(self, row, gene_list):
        """Build a Gene element from ``row`` and append it to ``gene_list``.

        Rows whose ``HGNC_ID`` is None are skipped (the original comment
        describes these as targets that are not human). The element id and
        its 'hgnc' identifier are both ``'HGNC:' + HGNC_ID``.

        Synonyms are the non-empty values of TARGET_SYSTEMATIC_NAME,
        TARGET_ABBREVIATED_NAME and HGNC_NAME, followed by whatever
        ``self.get_target_synonyms`` yields for the row's TARGET_ID.
        """
        hgnc_id = row['HGNC_ID']
        if hgnc_id is None:
            return
        gene_curie = 'HGNC:' + str(hgnc_id)

        synonyms = [
            row[column]
            for column in ('TARGET_SYSTEMATIC_NAME',
                           'TARGET_ABBREVIATED_NAME',
                           'HGNC_NAME')
            if row[column] is not None and row[column] != ''
        ]
        synonyms.extend(self.get_target_synonyms(row['TARGET_ID']))

        names = Names(name=row['TARGET_NAME'], synonyms=synonyms, source=SOURCE)

        # The stray bare `Element()` call that used to precede this
        # construction created an object that was immediately discarded.
        gene = Element(
            id=gene_curie,
            biolink_class='Gene',
            identifiers={'hgnc': gene_curie},
            names_synonyms=[names],
            attributes=[],
            connections=[],
            source=self.info.name
        )
        self.get_element_attributes(row, gene)

        gene_list.append(gene)
    def find_compound_by_unii(self, unii):
        """
            Find compound by a unii

            Runs the RXNCONSO/UNII join for the given UNII and returns a
            one-element list holding a ChemicalSubstance element. When no row
            matches, the element keeps only the identifiers it was created
            with and has no names.

            NULL columns produce None identifier values instead of raising on
            string concatenation or yielding a literal 'CID:None'.
        """
        compound_id = "UNII:" + unii

        # Columns are listed explicitly rather than using `select *`.
        query = """
        select
            UNII.UNII,
            UNII.PT,
            UNII.RN,
            UNII.NCIT,
            UNII.PUBCHEM,
            UNII.INCHIKEY,
            UNII.SMILES,
            RXNCONSO.CODE
        from RXNCONSO
        join UNII on RXNCONSO.CODE = UNII.UNII
        where (UNII.UNII = ?)
        """
        # The trailing comma makes the parameter an explicit one-element tuple.
        cur = connection.execute(query, (unii, ))

        compound = Element(id=compound_id,
                           biolink_class='ChemicalSubstance',
                           identifiers={'unii': unii},
                           attributes=self.find_compound_attributes(unii),
                           connections=[],
                           source=self.info.name)

        def prefixed(prefix, value):
            # Keep a missing database value as None rather than a bogus CURIE.
            return None if value is None else prefix + str(value)

        # More than one row can come back (the join may repeat the UNII);
        # the last matching row wins, as before.
        for row in cur.fetchall():
            if row['UNII'] == unii:
                compound.names_synonyms = [
                    Names(name=row['PT'], synonyms=[], source=UNIISOURCE)
                ]

                compound.identifiers = {
                    'unii': compound_id,
                    'cas': prefixed('CAS:', row['RN']),
                    'ncit': prefixed('NCIT:', row['NCIT']),
                    'inchikey': row['INCHIKEY'],
                    'smiles': row['SMILES'],
                    # Same case as the selected column, so the lookup does not
                    # depend on a case-insensitive row type.
                    'pubchem': prefixed('CID:', row['PUBCHEM'])
                }

        return [compound]
Example #4
0
 def find_compound_by_name(self, name):
     """
         Find compound by a name

         Queries CHEMBL_NAME_URL with the upper-cased name and returns one
         ChemicalSubstance element per entry of the response's 'molecules'
         list. Every element carries a 'query name' attribute holding the
         name as given by the caller.

         Structure identifiers (smiles, inchi, inchikey) and the
         'structure source' attribute are added only when the molecule has a
         'molecule_structures' object; a null value there no longer raises.
     """
     compounds = []
     molecules = requests.get(CHEMBL_NAME_URL.format(name.upper())).json()
     for molecule in molecules['molecules']:
         chembl_id = molecule['molecule_chembl_id']
         compound_id = CHEMBL + chembl_id
         identifiers = {'chembl': compound_id}
         attributes = [
             Attribute(name='query name',
                       value=name,
                       source=self.info.name)
         ]
         structures = molecule['molecule_structures']
         # The structures object can be null for molecules without a
         # defined structure; subscripting None would abort the whole search.
         if structures is not None:
             identifiers['smiles'] = structures['canonical_smiles']
             identifiers['inchi'] = structures['standard_inchi']
             identifiers['inchikey'] = structures['standard_inchi_key']
             attributes.append(
                 Attribute(name='structure source',
                           value=SOURCE,
                           source=self.info.name))
         compound = Element(id=compound_id,
                            biolink_class=CHEMICAL_SUBSTANCE,
                            identifiers=identifiers,
                            names_synonyms=self.get_names_synonyms(
                                chembl_id, molecule['pref_name'],
                                molecule['molecule_synonyms']),
                            attributes=attributes,
                            connections=[],
                            source=self.info.name)
         compounds.append(compound)
     return compounds
    def add_element(self, row, compounds):
        """Build a ChemicalSubstance element from ``row`` and append it to ``compounds``.

        Identifiers are added only for columns that are neither None nor the
        empty string. The element id is the stringified STEREO_CHEMICAL_ID.
        """
        def has_value(column):
            value = row[column]
            return value is not None and value != ''

        identifiers = {}
        if has_value('SMILES_string'):
            identifiers['smiles'] = str(row['SMILES_string'])
        if has_value('stereo_chemical_id'):
            # Drop the first four characters, then round-trip through int so
            # any zero padding disappears from the CID.
            numeric_part = str(row['stereo_chemical_id'])[4:]
            identifiers['pubchem'] = "CID:" + str(int(numeric_part))
        if has_value('inchikey'):
            identifiers['inchikey'] = str(row['inchikey'])

        names = Names(name=row['name'], synonyms=[], source=SOURCE)

        compound = Element(
            id=str(row['STEREO_CHEMICAL_ID']),
            biolink_class='ChemicalSubstance',
            identifiers=identifiers,
            names_synonyms=[names],
            attributes=[],
            connections=[],
            source=self.info.name
        )
        self.get_attributes(row, compound)
        compounds.append(compound)
 def create_element(self, hit_id):
     """Wrap ``hit_id`` in a bare element of this object's output class.

     The biolink class and the identifier key are both looked up by
     ``self.output_class``; the element has no attributes or connections.
     """
     biolink_class = BIOLINK_CLASS[self.output_class]
     identifiers = {ID_KEY[self.output_class]: hit_id}
     element = Element(
         id=hit_id,
         biolink_class=biolink_class,
         identifiers=identifiers,
         attributes=[],
         connections=[],
         source=self.info.name,
     )
     return element
    def get_or_create_assay(self, row, assays, assay_list):
        """Return the assay element for ``row``, creating it on first use.

        ``assays`` is the cache keyed by the prefixed assay id. A new element
        is named after ``row['assay_description']`` (falling back to the id
        when that is None), stored in the cache and appended to
        ``assay_list``.
        """
        assay_id = CHEMBL + row['assay_chembl_id']
        if assay_id in assays:
            return assays[assay_id]

        description = row['assay_description']
        display_name = assay_id if description is None else description
        names = Names(name=display_name, synonyms=[], source=SOURCE)

        assay = Element(
            id=assay_id,
            biolink_class=ASSAY,
            identifiers={'chembl': assay_id},
            names_synonyms=[names],
            connections=[],
            attributes=[],
        )

        # add_attribute is invoked once per column, in this order.
        attribute_columns = (
            'BAO_label',
            'assay_organism',
            'target_chembl_id',
            'target_name',
            'target_organism',
            'target_type',
            'cell_chembl_id',
            'assay_type',
            'bao_format',
            'assay_tissue_chembl_id',
            'assay_tissue_name',
            'assay_cell_type',
            'assay_subcellular_fraction',
        )
        for column in attribute_columns:
            add_attribute(self, assay, row, column)

        assays[assay_id] = assay
        assay_list.append(assay)
        return assay
Example #8
0
 def map(self, compound_list, controls):
     """Return the gene elements targeted by the given compounds.

     Compounds for which ``find_metabolite`` returns None are ignored. Each
     distinct ``target['entrez']`` yields one Gene element, created on first
     encounter; every target's connection is appended to its gene, so a gene
     reached more than once accumulates several connections. ``controls`` is
     not used.
     """
     genes_by_id = {}
     gene_list = []
     for compound in compound_list:
         metabolite = find_metabolite(compound)
         if metabolite is None:
             continue
         for target in self.find_targets(metabolite, compound.id):
             gene_id = target['entrez']
             gene = genes_by_id.get(gene_id)
             if gene is None:
                 names = Names(name=target['name'])
                 uniprot = Attribute(name='UniProtKB',
                                     value=target['uniprot'],
                                     source=self.info.label,
                                     provided_by=self.info.name)
                 gene = Element(id=gene_id,
                                biolink_class=GENE,
                                identifiers={'entrez': gene_id},
                                names_synonyms=[names],
                                attributes=[uniprot],
                                connections=[],
                                source=self.info.name)
                 gene_list.append(gene)
                 genes_by_id[gene_id] = gene
             gene.connections.append(target['connection'])
     return gene_list
Example #9
0
 def create_element(self, disorder):
     """Build a disease element for ``disorder``.

     Entries from ``get_synonyms`` with EXACT_MATCH == 0 become name
     synonyms. Entries with EXACT_MATCH == 1 are parsed as CURIEs; when the
     prefix is known to ``self.id_map`` the re-prefixed CURIE is recorded
     through ``add_identifier``. The element id comes from
     ``self.disorder_id``.
     """
     concept_id = disorder['beacon_concept_id']
     identifiers = {}
     plain_synonyms = []
     for entry in get_synonyms(concept_id):
         match_flag = entry['EXACT_MATCH']
         if match_flag == 0:
             plain_synonyms.append(entry['SYNONYM'])
         elif match_flag == 1:
             prefix, suffix = parse_curie(entry['SYNONYM'])
             if prefix is None or prefix not in self.id_map:
                 continue
             add_identifier(identifiers,
                            self.id_map[prefix],
                            self.prefix_map[prefix] + suffix)

     disorder_id = self.disorder_id(disorder, identifiers)
     names = Names(name=disorder['name'],
                   synonyms=plain_synonyms,
                   source=self.info.label)
     return Element(id=disorder_id,
                    biolink_class=DISEASE,
                    identifiers=identifiers,
                    names_synonyms=[names],
                    attributes=[],
                    connections=[],
                    source=self.info.name)
    def find_drug(self, rxcui, source_element_id):
        """Return one element per RXNREL row whose RXCUI1 equals ``rxcui``.

        Rows with a NULL RXCUI2 are skipped. Each element is identified by
        'RXCUI:<RXCUI2>' and carries a single connection back to
        ``source_element_id`` using the first predicate of the knowledge map.
        """
        query = """
        SELECT
            RXNREL.RXCUI1,
            RXNREL.RXCUI2
        from RXNREL

        where RXNREL.RXCUI1 = ?
        """
        related = []
        for row in connection.execute(query, (rxcui, )).fetchall():
            related_rxcui = row['RXCUI2']
            if related_rxcui is None:
                continue

            curie = 'RXCUI:' + str(related_rxcui)
            substance = Element(id=curie,
                                biolink_class='related_to',
                                identifiers={'rxnorm': curie},
                                connections=[],
                                source=self.info.name)
            related.append(substance)

            predicate = self.info.knowledge_map.predicates[0].predicate
            substance.connections.append(
                Connection(source_element_id=source_element_id,
                           type=predicate,
                           attributes=self.connection_attributes(
                               rxcui, related_rxcui)))

        return related
    def find_drug_by_rxcui(self, rxcui):
        """
            Find drug by a rxcui

            Returns a one-element list with a Drug element identified by
            'RXCUI:<rxcui>'. Attributes and names come from
            find_drug_attributes and find_drug_synonyms.
        """
        drug_id = "RXCUI:" + str(rxcui)

        # Columns are listed explicitly rather than using `select *`.
        query = """
        select
            RXNCONSO.RXCUI,
            RXNCONSO.CODE
        from RXNCONSO

        where (RXNCONSO.RXCUI = ?)
        """

        # The trailing comma makes the parameter an explicit one-element tuple.
        # NOTE(review): the cursor is never read, so the element is returned
        # whether or not the rxcui exists in RXNCONSO - confirm that is intended.
        parameters = (rxcui, )
        cur = connection.execute(query, parameters)

        attributes = self.find_drug_attributes(rxcui)
        names_synonyms = self.find_drug_synonyms(rxcui)
        drug = Element(id=drug_id,
                       biolink_class='Drug',
                       identifiers={'rxnorm': drug_id},
                       attributes=attributes,
                       names_synonyms=names_synonyms,
                       connections=[],
                       source=self.info.name)

        return [drug]
    def find_drugs_by_gene(info_name, drug_list, gene):
        """
            Collect the drugs that interact with the gene.

            The query joins every interaction recorded for the gene's Entrez
            id; it applies no filter on interaction type. One
            ChemicalSubstance element per interaction row is appended to
            drug_list, each with a connection back to the gene.

            info_name: used as the source of the elements and names created
            drug_list: list that receives the new elements (mutated in place)
            gene:      element whose identifiers['entrez'] is a CURIE; the
                       part after the first ':' is the lookup key
        """
        entrez_id = gene.identifiers['entrez'].split(":", 1)[1].strip()

        # The gene's long name gets its own alias: two result columns both
        # called `name` made row['name'] depend on column order.
        query4 = """ 
                    SELECT
                        drugs.id AS drug_id,
                        drugs.chembl_id,
                        drugs.name AS name,
                        drugs.immunotherapy,
                        drugs.anti_neoplastic,
                        drugs.fda_approved,
                        genes.long_name AS gene_name,
                        interactions.id AS interaction_id
                    FROM genes
                    JOIN interactions on interactions.gene_id = genes.id
                    JOIN drugs ON drugs.id = interactions.drug_id
                    WHERE genes.entrez_id = ?;
                    """
        global connection
        connection = DGIdbDataSupply.get_db()
        cur4 = connection.execute(query4, (entrez_id, ))

        for row in cur4.fetchall():  # loop for each drug interaction
            dgidb_drug_id = row['drug_id']
            interaction_id = row['interaction_id']
            drug_id = "ChEMBL:" + (row['chembl_id'])
            drug = Element(id=drug_id,
                           biolink_class="ChemicalSubstance",
                           identifiers={'chembl': drug_id},
                           names_synonyms=[],
                           attributes=[],
                           connections=[],
                           source=info_name)

            #   Start with the drug name from the drugs table
            drug.names_synonyms.append(
                Names(
                    name=row['name'],
                    synonyms=[],
                    source=info_name,
                ))

            DGIdbDataSupply.get_names_synonyms(dgidb_drug_id, drug)

            #   Append to drug additional attributes collected from DGIdb drug_attributes table
            DGIdbDataSupply.get_drug_attributes(info_name, dgidb_drug_id, drug)

            #   Append connection to gene, per interaction_id
            DGIdbDataSupply.get_connection_data(drug, info_name,
                                                interaction_id, gene.id,
                                                "affected_by")

            drug_list.append(drug)
 def add_element(self, row, compound_list):
     """Build a ChemicalSubstance element from ``row`` and append it to ``compound_list``.

     Identifier keys are lowercase and CURIE prefixes uppercase. A column
     contributes an identifier only when it is neither None nor ''. The
     element id is 'GTOPDB:<LIGAND_ID>'.

     The primary name is the INN when one is present, in which case the
     row's NAME is demoted to the first synonym; otherwise it is NAME.
     IUPAC (when present) and the results of ``self.get_names_synonyms``
     follow as further synonyms.
     """
     def present(column):
         value = row[column]
         return value is not None and value != ''

     # (identifier key, source column, CURIE prefix or None for raw values)
     identifier_columns = (
         ('pubchem', 'PUBCHEMCID', 'CID:'),
         ('smiles', 'SMILES', None),
         ('inchikey', 'INCHIKEY', None),
         ('inchi', 'INCHI', None),
         ('pubchem.substance', 'PUBCHEMSID', 'SID:'),
         ('gtopdb', 'LIGAND_ID', 'GTOPDB:'),
     )
     identifiers = {}
     for key, column, prefix in identifier_columns:
         if present(column):
             value = row[column]
             identifiers[key] = value if prefix is None else prefix + str(value)

     name = row['NAME']
     synonyms = []
     if present('INN'):
         synonyms.append(name)
         name = row['INN']
     if present('IUPAC'):
         synonyms.append(row['IUPAC'])
     synonyms.extend(self.get_names_synonyms(row['LIGAND_ID']))

     names = Names(name=name, synonyms=synonyms, source=SOURCE)

     # The stray bare `Element()` call that used to precede this
     # construction created an object that was immediately discarded.
     compound = Element(
         id="GTOPDB:" + str(row['LIGAND_ID']),
         biolink_class='ChemicalSubstance',
         identifiers=identifiers,
         names_synonyms=[names],
         attributes=[],
         connections=[],
         source=self.info.name
     )
     self.get_element_attributes(row, compound)

     compound_list.append(compound)
    def find_genes_by_drug(info_name, gene_list, compound):
        """
            Collect all the genes that the drug interacts with

            The lookup key is the part of compound.identifiers['chembl'] after
            the first ':'. One Gene element per interaction row is appended to
            gene_list, each connected back to compound.id with "affects".
        """
        chembl_curie = compound.identifiers['chembl']
        drugs_chembl_id = chembl_curie.split(":", 1)[1].strip()

        #       Targets SQL query.
        query2 = """ 
            SELECT
                drugs.id AS drug_id,
                genes.entrez_id,
                genes.name AS symbol,
                genes.long_name AS name,
                genes.id AS gene_id,
                interactions.id AS interaction_id
            FROM drugs
            JOIN interactions on interactions.drug_id = drugs.id
            JOIN genes ON genes.id = interactions.gene_id
            WHERE drugs.chembl_id = ?;
            """
        global connection
        connection = DGIdbDataSupply.get_db()
        interaction_rows = connection.execute(query2,
                                              (drugs_chembl_id, )).fetchall()

        for row in interaction_rows:
            dgidb_gene_id = row['gene_id']
            interaction_id = row['interaction_id']
            gene_curie = "NCBIGene:" + str(row['entrez_id'])
            gene = Element(id=gene_curie,
                           biolink_class="Gene",
                           identifiers={"entrez": gene_curie},
                           names_synonyms=[],
                           attributes=[],
                           connections=[],
                           source=info_name)

            #   Long name is the primary name; the symbol is its synonym
            gene_names = Names(name=row['name'],
                               synonyms=[row['symbol']],
                               source=info_name)
            gene.names_synonyms.append(gene_names)

            #   Additional attributes from the DGIdb gene_attributes table
            DGIdbDataSupply.get_gene_attributes(gene, info_name, dgidb_gene_id)

            #   Connection to the compound, per interaction_id
            DGIdbDataSupply.get_connection_data(gene, info_name,
                                                interaction_id, compound.id,
                                                "affects")

            gene_list.append(gene)
Example #15
0
 def get_element(self, gene_id, polypeptideId):
     """Create an empty Gene element and hand it to the identifier lookup.

     The element starts with id None and no identifiers. It is passed to
     ``get_connection_polypeptide_identifiers`` together with the
     polypeptide id and the HGNC resource name, then returned. ``gene_id``
     is not used here.
     """
     hgnc_resource = 'HUGO Gene Nomenclature Committee (HGNC)'
     gene = Element(id=None,
                    biolink_class="Gene",
                    identifiers={},
                    names_synonyms=[],
                    attributes=[],
                    connections=[],
                    source=self.info.name)
     # Per the original note: queries the database for the gene id of the
     # given polypeptide id (e.g., HGNC:1097). Presumably fills in the
     # element in place - confirm against the helper.
     get_connection_polypeptide_identifiers(polypeptideId, gene, hgnc_resource)
     return gene
    def add_element(self, protein_id, protein_list):
        """Build a Protein element for ``protein_id`` and append it to ``protein_list``.

        The id serves as element id, as the 'ensembl' identifier and as the
        element's name; no synonyms are recorded.
        ``self.get_element_attributes`` is called on the element before it
        is appended.
        """
        # The id is stored unconditionally as the 'ensembl' identifier.
        identifiers = {'ensembl': protein_id}

        names = Names(name=protein_id, synonyms=[], source=SOURCE)

        # The stray bare `Element()` call that used to precede this
        # construction created an object that was immediately discarded.
        protein = Element(
            id=protein_id,
            biolink_class='Protein',
            identifiers=identifiers,
            names_synonyms=[names],
            attributes=[],
            connections=[],
            source=self.info.name
        )
        self.get_element_attributes(protein)

        protein_list.append(protein)
Example #17
0
    def get_element(self, protein_id, polypeptideId):
        """Create an empty Protein element and hand it to the identifier lookup.

        The element starts with id None and no identifiers. It is passed to
        ``get_connection_polypeptide_identifiers`` together with the
        polypeptide id and the 'UniProtKB' resource name, then returned.
        ``protein_id`` is not used here.
        """
        uniprot_resource = 'UniProtKB'
        protein = Element(id=None,
                          biolink_class="Protein",
                          identifiers={},
                          names_synonyms=[],
                          attributes=[],
                          connections=[],
                          source=self.info.name)
        # Per the original note: queries the database for the protein id of
        # the given polypeptide id. Presumably fills in the element in
        # place - confirm against the helper.
        get_connection_polypeptide_identifiers(polypeptideId, protein,
                                               uniprot_resource)
        return protein
Example #18
0
 def Element(self,
             id,
             biolink_class,
             identifiers,
             names_synonyms=None,
             attributes=None):
     """Create an element stamped with this object's SOURCE and PROVIDED_BY.

     ``names_synonyms`` and ``attributes`` default to fresh empty lists when
     omitted or None; connections always start empty.
     """
     if names_synonyms is None:
         names_synonyms = []
     if attributes is None:
         attributes = []
     return Element(id=id,
                    biolink_class=biolink_class,
                    identifiers=identifiers,
                    names_synonyms=names_synonyms,
                    attributes=attributes,
                    connections=[],
                    source=self.SOURCE,
                    provided_by=self.PROVIDED_BY)
Example #19
0
    def row_to_element(self, row):
        """Convert a database row into a chemical substance element.

        The row's ID becomes the element id and its 'hmdb' identifier.
        Synonym entries with EXACT_MATCH == 0 become name synonyms; entries
        with EXACT_MATCH == 1 are parsed as CURIEs and, when their prefix is
        in ``self.id_map``, recorded as identifiers. Detail rows whose TAG is
        in ``self.id_map`` are recorded as identifiers too, and an 'inchi'
        detail additionally adds a 'structure source' attribute.
        """
        hmdb_id = row['ID']
        concept_id = row['BEACON_CONCEPT_ID']
        identifiers = {'hmdb': hmdb_id}

        synonyms = []
        for entry in get_synonyms(concept_id):
            match_flag = entry['EXACT_MATCH']
            if match_flag == 0:
                synonyms.append(entry['SYNONYM'])
                continue
            if match_flag != 1:
                continue
            prefix, suffix = parse_curie(entry['SYNONYM'])
            if prefix is None or prefix not in self.id_map:
                continue
            key = self.id_map[prefix]
            curie = self.prefix_map[prefix] + suffix
            # A 'DrugBank' entry whose suffix does not start with ':DB' is
            # stored under 'pubchem' with a 'CID' prefix instead.
            if prefix == 'DrugBank' and not suffix.startswith(':DB'):
                key, curie = 'pubchem', 'CID' + suffix
            add_identifier(identifiers, key, curie)

        attributes = []
        for detail in get_details(concept_id):
            tag = detail['TAG']
            if tag not in self.id_map:
                continue
            add_identifier(identifiers, self.id_map[tag], detail['VALUE'])
            if tag == 'inchi':
                attributes.append(
                    Attribute(name='structure source',
                              value=self.info.label,
                              source=self.info.label,
                              provided_by=self.info.name))

        names = Names(name=row['NAME'],
                      synonyms=synonyms,
                      source=self.info.label)
        return Element(id=hmdb_id,
                       biolink_class=CHEMICAL_SUBSTANCE,
                       identifiers=identifiers,
                       names_synonyms=[names],
                       attributes=attributes,
                       connections=[],
                       source=self.info.name)
Example #20
0
 def map(self, compound_list, controls):
     """Return the gene elements targeted by the given compounds.

     Each distinct prefixed ``target['gene_id']`` yields one Gene element,
     created on first encounter; every target's connection is appended to
     its gene. ``controls`` is not used.
     """
     genes_by_id = {}
     gene_list = []
     for compound in compound_list:
         for target in self.get_targets(compound):
             gene_id = ENSEMBL + target['gene_id']
             gene = genes_by_id.get(gene_id)
             if gene is None:
                 # The 'ensembl' identifier is stored as a one-element list.
                 gene = Element(
                     id=gene_id,
                     biolink_class=GENE,
                     identifiers={'ensembl': [gene_id]},
                     connections=[],
                     attributes=[],
                     source=self.info.name,
                 )
                 gene_list.append(gene)
                 genes_by_id[gene_id] = gene
             gene.connections.append(target['connection'])
     return gene_list
Example #21
0
 def find_compound_by_name(self, name):
     """
         Find compound by a name

         When ``name`` carries the 'chembank' compound prefix the lookup goes
         through find_compound_by_id; otherwise the module-level
         find_compound_by_name is used. Returns one element per record found.
     """
     if self.has_prefix('chembank', str(name), "compound"):
         compounds = self.find_compound_by_id(name)
     else:
         compounds = find_compound_by_name(name)

     def is_filled(value):
         return value is not None and value != ''

     compound_list = []
     for compound in compounds:
         chembank_id = compound['CHEMBANK_ID']
         compound_id = self.add_prefix('chembank', str(chembank_id))
         names = self.get_names(chembank_id)

         identifiers = {}
         if is_filled(chembank_id):
             identifiers['chembank'] = self.add_prefix(
                 'chembank', str(chembank_id))
         # Structure columns are copied verbatim when populated.
         for key, column in (('smiles', 'SMILES'),
                             ('inchi', 'INCHI'),
                             ('inchikey', 'INCHI_KEY')):
             if is_filled(compound[column]):
                 identifiers[key] = compound[column]
         # Cross-references use the first value listed under each name type.
         for key, name_type in (('drugbank', 'DrugBank'),
                                ('pubchem', 'PubChem')):
             if name_type in names:
                 identifiers[key] = self.add_prefix(
                     key, str(names[name_type][0]))
         if 'CAS' in names:
             identifiers['cas'] = self.add_prefix('cas', names['CAS'][0])

         compound_list.append(
             Element(id=compound_id,
                     biolink_class=self.biolink_class('ChemicalSubstance'),
                     identifiers=identifiers,
                     names_synonyms=self.get_names_synonyms(names),
                     attributes=[],
                     connections=[],
                     provided_by=self.PROVIDED_BY,
                     source=self.SOURCE))
     return compound_list
Example #22
0
 def get_compound(self, cpd_id):
     """Return a ChemicalSubstance element for ``cpd_id``.

     Uses the first record returned by the module-level ``get_compound``.
     The record's PUBCHEM_CID is both the element id and its 'pubchem'
     identifier; BROAD_CPD_ID is recorded as the only synonym of
     COMPOUND_NAME.
     """
     record = get_compound(cpd_id)[0]
     pubchem_cid = record['PUBCHEM_CID']
     identifiers = {
         "pubchem": pubchem_cid,
         "smiles": record['SMILES'],
         "inchi": record['INCHI'],
         "inchikey": record['INCHI_KEY'],
     }
     names = Names(name=record['COMPOUND_NAME'],
                   synonyms=[record['BROAD_CPD_ID']],
                   source=self.info.label)
     return Element(id=pubchem_cid,
                    biolink_class='ChemicalSubstance',
                    identifiers=identifiers,
                    names_synonyms=[names],
                    attributes=[],
                    connections=[],
                    source=self.info.name)
 def get_or_create_mechanism(self, row, mechanisms, mechanism_list):
     """Return the mechanism element for ``row``, creating it on first use.

     ``mechanisms`` is the cache keyed by the prefixed target id. A new
     element is named after the bare ``row['target_chembl_id']``, stored in
     the cache and appended to ``mechanism_list``.
     """
     target_chembl_id = row['target_chembl_id']
     mechanism_id = CHEMBL + target_chembl_id
     if mechanism_id in mechanisms:
         return mechanisms[mechanism_id]

     names = Names(name=target_chembl_id, synonyms=[], source=SOURCE)
     mechanism = Element(
         id=mechanism_id,
         biolink_class=MOLECULAR_ENTITY,
         identifiers={'chembl': mechanism_id},
         names_synonyms=[names],
         connections=[],
         attributes=[],
     )
     # add_attribute is invoked once per column, in this order.
     for column in ('target_name', 'target_type', 'target_organism'):
         add_attribute(self, mechanism, row, column)

     mechanisms[mechanism_id] = mechanism
     mechanism_list.append(mechanism)
     return mechanism
 def row_to_element(self, row):
     """Convert a molecule row into a chemical substance element.

     The prefixed ``row['chembl_id']`` is the element id and 'chembl'
     identifier; the structure columns are copied as-is, None included. A
     'structure source' attribute is added only when the InChIKey is present,
     after which ``self.add_attributes`` is applied.
     """
     compound_id = CHEMBL + row['chembl_id']
     identifiers = {
         'chembl': compound_id,
         'smiles': row['canonical_smiles'],
         'inchi': row['standard_inchi'],
         'inchikey': row['standard_inchi_key'],
     }
     names_synonyms = self.get_names_synonyms(row['chembl_id'],
                                              row['pref_name'],
                                              row['molregno'])
     element = Element(id=compound_id,
                       biolink_class=CHEMICAL_SUBSTANCE,
                       identifiers=identifiers,
                       names_synonyms=names_synonyms,
                       attributes=[],
                       connections=[],
                       source=self.info.name)
     if row['standard_inchi_key'] is not None:
         structure_source = Attribute(name='structure source',
                                      value=SOURCE,
                                      source=SOURCE,
                                      provided_by=self.info.name)
         element.attributes.append(structure_source)
     self.add_attributes(element, row)
     return element
 def get_or_create_indication(self, row, indications, indication_list):
     """Return the indication element for ``row``, creating it on first use.

     ``indications`` is the cache and is keyed by both the prefixed MeSH id
     and the EFO id, so a row matching either key reuses the same element.
     A new element takes the MeSH id as its id when one exists and the EFO
     id otherwise, is named after ``row['mesh_heading']``, and is appended
     to ``indication_list``.

     A row without a MeSH id no longer raises: the prefix is applied only
     when ``row['mesh_id']`` is not None, which makes the EFO fallback
     reachable.
     """
     raw_mesh_id = row['mesh_id']
     # Prefixing first would raise on None and would make every later
     # `mesh_id is not None` test trivially true.
     mesh_id = MESH + raw_mesh_id if raw_mesh_id is not None else None
     efo_id = row['efo_id']

     for key in (mesh_id, efo_id):
         if key is not None and key in indications:
             return indications[key]

     indication_id = mesh_id if mesh_id is not None else efo_id
     names = Names(name=row['mesh_heading'], synonyms=[], source=SOURCE)
     indication = Element(id=indication_id,
                          biolink_class=DISEASE,
                          connections=[])
     # 'efo' starts as an empty list; nothing in this method fills it.
     indication.identifiers = {'efo': []}
     indication.names_synonyms = [names]
     indication.source = self.info.name
     if mesh_id is not None:
         indications[mesh_id] = indication
         indication.identifiers['mesh'] = mesh_id
     if efo_id is not None:
         indications[efo_id] = indication
     indication_list.append(indication)
     return indication
Example #26
0
    def export(self, gene_list, controls):
        """Return the MSigDB gene sets enriched for the genes in ``gene_list``.

        Every gene set read from the files listed in ``msigdb_gmt_files`` that
        shares at least one gene with the query list is scored with
        ``scipy.stats.fisher_exact`` on a 2x2 contingency table.  The p-values
        of all scored gene sets are then adjusted with
        ``correct_pvalues_for_multiple_testing`` (Benjamini-Hochberg).

        Args:
            gene_list: genes to test.  Each gene is identified by
                ``entrez_gene_id(gene)`` or, when that returns ``None``, by
                ``gene.gene_id``.
            controls: mapping that provides the ``'max p-value'`` and
                ``'max q-value'`` cut-offs; a gene set is reported only when
                its p-value and its q-value are strictly below them.

        Returns:
            list: one ``Element`` with biolink class ``'Pathway'`` per gene set
            that passes both cut-offs, ordered by ascending q-value.  Each
            element carries the p-value, q-value and odds ratio as string
            valued attributes.
        """
        # Identifiers of the query genes (Entrez id preferred, gene_id as fallback).
        genes = set()
        for gene in gene_list:
            entrez_id = entrez_gene_id(gene)
            genes.add(entrez_id if entrez_id is not None else gene.gene_id)

        # Contingency-table cells per gene set ("y" = member, "n" = non-member).
        gene_set_y_gene_list_y = {}
        gene_set_y_gene_list_n = {}
        gene_set_n_gene_list_y = {}
        all_gene_set_gene_ids = set()
        for msigdb_gmt_file in msigdb_gmt_files:
            # The context manager closes the file even when parsing raises.
            with open(msigdb_gmt_file) as msigdb_gmt_fh:
                for line in msigdb_gmt_fh:
                    # Tab-separated line: column 0 is the gene set id, column 1
                    # is not used, every remaining column is a gene id.
                    cols = line.strip().split('\t')
                    if len(cols) < 3:
                        continue
                    gene_set_id = cols[0]
                    gene_ids = cols[2:]
                    overlap = sum(1 for gene_id in gene_ids if gene_id in genes)
                    if overlap == 0:
                        # Gene sets without any query gene are not scored and
                        # do not contribute to the gene universe below.
                        continue
                    gene_set_y_gene_list_y[gene_set_id] = overlap
                    gene_set_y_gene_list_n[gene_set_id] = len(gene_ids) - overlap
                    gene_set_n_gene_list_y[gene_set_id] = len(genes) - overlap
                    all_gene_set_gene_ids.update(gene_ids)
        # Universe size: distinct genes over all overlapping gene sets.
        M = len(all_gene_set_gene_ids)

        gene_set_pvalues = {}
        gene_set_odds_ratios = {}
        all_pvalues = []
        all_gene_set_ids = []
        for gene_set_id, in_both in gene_set_y_gene_list_y.items():
            set_only = gene_set_y_gene_list_n[gene_set_id]
            list_only = gene_set_n_gene_list_y[gene_set_id]
            # NOTE(review): this cell becomes negative when the query list and
            # the gene set together outnumber the universe M (e.g. many query
            # genes that occur in no gene set); confirm how fisher_exact
            # should be fed in that case.
            in_neither = M - in_both - set_only - list_only
            odds_ratio, pvalue = scipy.stats.fisher_exact(
                [[in_both, set_only], [list_only, in_neither]])

            # Every scored gene set takes part in the multiple-testing
            # correction, not only those below the p-value cut-off.
            all_pvalues.append(pvalue)
            all_gene_set_ids.append(gene_set_id)

            if pvalue < controls['max p-value']:
                gene_set_pvalues[gene_set_id] = pvalue
                gene_set_odds_ratios[gene_set_id] = odds_ratio

        all_qvalues = correct_pvalues_for_multiple_testing(
            all_pvalues, correction_type="Benjamini-Hochberg")
        gene_set_qvalues = {}
        for gene_set_id, qvalue in zip(all_gene_set_ids, all_qvalues):
            if (gene_set_id in gene_set_pvalues
                    and qvalue < controls['max q-value']):
                gene_set_qvalues[gene_set_id] = qvalue

        pathways = []
        for gene_set_id in sorted(gene_set_qvalues, key=gene_set_qvalues.get):
            enriched_gene_set = Element(
                id='MSigDB:' + gene_set_id,
                biolink_class='Pathway',
                identifiers={'MSigDB': 'MSigDB:' + gene_set_id},
                names_synonyms=[
                    Names(
                        name=gene_set_id,
                        synonyms=[],
                        source='MSigDB',
                        url=
                        'http://software.broadinstitute.org/gsea/msigdb/cards/{}.html'
                        .format(gene_set_id))
                ],
                attributes=[
                    Attribute(name='p-value',
                              value=str(gene_set_pvalues[gene_set_id]),
                              source=self.info.name),
                    Attribute(name='q-value',
                              value=str(gene_set_qvalues[gene_set_id]),
                              source=self.info.name),
                    Attribute(name='odds ratio',
                              value=str(gene_set_odds_ratios[gene_set_id]),
                              source=self.info.name),
                ],
                source=self.info.name)
            pathways.append(enriched_gene_set)
        return pathways
    def get_mixtures(self, relationship_list, substance):
        """Append the mixtures that include ``substance`` as a component.

        Looks up the mixture substances that "has part" the given substance
        and appends one ``Element`` per mixture to ``relationship_list``.  Each
        mixture carries a ``has_part`` ``Connection`` annotated per biolink
        (https://biolink.github.io/biolink-model/docs/has_part.html) whose
        ``source_element_id`` is the UNII CURIE of ``substance``.  Rows whose
        biolink class resolves to ``'ignore'`` are not reported.

        Args:
            relationship_list: list that is extended in place with the
                mixtures found.
            substance: element whose ``identifiers['unii']`` holds a CURIE;
                the text after the first colon is the UNII used in the query.

        Returns:
            None.
        """
        source_element_id = substance.identifiers['unii']
        substance_unii = source_element_id.split(":", 1)[1].strip()
        query14 = """
            SELECT 
                substances._name AS substance_name,
                mixture_substances.uuid AS mixture_uuid,
                mixture_substances._name AS mixture_substance,
                mixture_substances.UNII AS mixture_substance_unii,
                mixture_substances.substanceClass,
                unii_lookup.RXCUI AS mixture_RXCUI,
                unii_lookup.PUBCHEM AS mixture_PUBCHEM,
                unii_lookup.INCHIKEY AS mixture_InChiKey,
                unii_lookup.NCBI AS mixture_NCBI
            FROM substances
            JOIN components ON substances.uuid = components.refuuid
            JOIN mixtures ON components.mixture_id = mixtures.uuid
            JOIN substances AS mixture_substances ON mixtures.uuid = mixture_substances.mixture
            LEFT JOIN unii_lookup ON mixture_substances.UNII = unii_lookup.UNII 
            WHERE substances.UNII = ?   ;
            """
        # Named ``db`` so that the per-row Connection below cannot shadow it.
        db = Inxight_Drugs_DataSupply.get_db()
        cur14 = db.execute(query14, (substance_unii, ))
        for row in cur14.fetchall():  # loop for each mixture substance found
            biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
                row['substanceClass'], row['mixture_InChiKey'])
            if biolink_class == 'ignore':
                # Ignored rows are never reported, so nothing is built for them.
                continue

            mixture_id = "UNII:" + str(row['mixture_substance_unii'])
            # Create identifiers by annotating ids with appropriate CURIE prefix.
            # str() keeps the concatenations working for non-text columns.
            identifiers = {'unii': mixture_id}
            if row['mixture_InChiKey']:
                identifiers["inchikey"] = row['mixture_InChiKey']
            if row['mixture_PUBCHEM']:
                identifiers["pubchem"] = "CID:" + str(row['mixture_PUBCHEM'])
            if row['mixture_RXCUI'] and biolink_class == 'Drug':
                # An RxNorm identifier is only attached to drugs.
                identifiers["rxnorm"] = "RXCUI:" + str(row['mixture_RXCUI'])

            has_part = Connection(source_element_id=source_element_id,
                                  type='has_part',
                                  relation='has_part',
                                  evidence_type="",
                                  attributes=[])
            mixture = Element(
                id=mixture_id,
                biolink_class=biolink_class,
                identifiers=identifiers,
                names_synonyms=[
                    Names(name=row['mixture_substance'],
                          synonyms=[],
                          source=SOURCE)
                ],  # add name & later synonyms from the database
                attributes=[],
                connections=[has_part],
                source=self.info.name)
            relationship_list.append(mixture)
    def get_active_ingredients(self, related_list, drug):
        """Append the active ingredients of ``drug`` to ``related_list``.

        Selects the substances related to the drug (matched by its RxNorm
        RXCUI) through a relationship whose type contains ``ACTIVE``, minus
        the relationship types excluded in the query.  One ``Element`` per
        related substance is appended, carrying a ``has_active_ingredient``
        ``Connection`` back to the drug.  When the relationship has a
        qualification, its average/high/low amounts become attributes of that
        connection.  Rows whose biolink class resolves to ``'ignore'`` are not
        reported.

        Args:
            related_list: list that is extended in place with the ingredients.
            drug: element providing ``identifiers['rxnorm']`` (a CURIE; the
                text after the first colon is the RXCUI used in the query)
                and ``id`` (used as ``source_element_id`` of the connections).

        Returns:
            None.
        """
        drug_rxcui = drug.identifiers['rxnorm'].split(":", 1)[1].strip()
        source_element_id = drug.id.strip()
        query7 = """
            SELECT DISTINCT
                unii_lookup.PT,
                unii_lookup.RXCUI,
                unii_lookup.PUBCHEM,
                unii_lookup.NCBI,
                substances._name,
                relatedSubstances._name AS related_substance,
                relatedSubstances.uuid AS related_substance_uuid,
                relatedSubstances.UNII AS related_substance_unii,
                relatedSubstances.substanceClass AS related_substance_class,
                relationships.type AS relationships_type,
                relationships.qualification,
                relationships.amount_average AS average,
                relationships.amount_high AS high,
                relationships.amount_low AS low,
                relationships.amount_units,
                unii_lookup.INCHIKEY AS InChiKey,
                unii_lookup.INGREDIENT_TYPE
            FROM unii_lookup
            JOIN substances ON unii_lookup.UNII = substances.UNII
            LEFT JOIN relationships ON substances.uuid = relationships.substance_id
            JOIN substances AS relatedSubstances ON relationships.relatedSubstance_id = relatedSubstances.uuid
            WHERE
                relationships.type LIKE ("%ACTIVE%") 
                AND NOT relationships.type LIKE ("%INACTIVE%")
                AND NOT relationships.type LIKE ("%PARENT->%")
                AND NOT relationships.type LIKE ("%PRODRUG->%")
                AND NOT relationships.type LIKE ("%RACEMATE->%")
                AND NOT relationships.type LIKE ("%SUBSTANCE->%")  
                AND NOT relationships.type LIKE ("%METABOLITE ACTIVE%") 
                AND NOT relationships.type LIKE ("%METABOLITE LESS ACTIVE%") 
                AND NOT relationships.type LIKE ("%ACTIVE CONSTITUENT ALWAYS PRESENT%")
                AND RXCUI = ?; 
        """
        db = Inxight_Drugs_DataSupply.get_db()
        cur7 = db.execute(query7, (drug_rxcui, ))

        for row in cur7.fetchall():  # loop for each related substance found
            biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
                row['related_substance_class'], row['InChiKey'])
            if biolink_class == 'ignore':
                # Ignored rows are never reported; skipping early also avoids
                # the synonym lookup for them.
                continue

            substance_id = "UNII:" + str(row['related_substance_unii'])
            # str() keeps the CURIE concatenations working for non-text columns.
            identifiers = {'unii': substance_id}
            if row['InChiKey']:
                identifiers['inchikey'] = row['InChiKey']
            if row['PUBCHEM']:
                identifiers["pubchem"] = "CID:" + str(row['PUBCHEM'])
            if row['RXCUI'] and biolink_class == 'Drug':
                # An RxNorm identifier is only attached to drugs.
                identifiers["rxnorm"] = "RXCUI:" + str(row['RXCUI'])

            substance = Element(
                id=substance_id,
                biolink_class=biolink_class,
                identifiers=identifiers,
                names_synonyms=[
                    Names(name=row['related_substance'],
                          synonyms=[],
                          source=SOURCE)
                ],  # add name & later synonyms from the database
                attributes=[],
                connections=[],
                source=self.info.name)
            # Append synonyms to the substance
            Inxight_Drugs_DataSupply.get_names_synonyms(
                row['related_substance_uuid'], substance)

            relationship = Connection(source_element_id=source_element_id,
                                      type=row['relationships_type'],
                                      relation="has_active_ingredient",
                                      evidence_type="",
                                      attributes=[])
            if row['qualification']:
                # active ingredients attributes, named
                # "<average|high|low> <qualification> (<amount_units>)"
                for amount in ('average', 'high', 'low'):
                    amount_value = row[amount]
                    # str() before strip() so a numeric amount does not raise.
                    if amount_value is None or not str(amount_value).strip():
                        continue
                    attribute_name = amount + ' ' + str(
                        row['qualification']) + ' (' + str(
                            row['amount_units']) + ')'
                    relationship.attributes.append(
                        Attribute(provided_by=self.info.name,
                                  name=attribute_name,
                                  value=str(amount_value),
                                  source=SOURCE,
                                  type=attribute_name))
            substance.connections.append(relationship)
            related_list.append(substance)
    def get_relationships(self, relationship_list, substance):
        """Append every substance related to ``substance`` to ``relationship_list``.

        Three sources are combined, in this order: the components of the
        substance (``get_components``), the mixtures containing it
        (``get_mixtures``) and the rows of the ``relationships`` table found
        by the substance UNII.  For the latter, one ``Element`` per related
        substance is appended with:

        * element attributes for ``substanceClass`` and, when present, the
          NCBI id (as ``NCBITaxon:<id>`` under the name ``OrganismTaxon``);
        * one ``Connection`` whose type and relation are the relationship
          type, with attributes for the interaction type, the comments and,
          when the relationship has a qualification, its average/high/low
          amounts.

        Rows whose biolink class resolves to ``'ignore'`` are not reported.

        Args:
            relationship_list: list that is extended in place.
            substance: element whose ``identifiers['unii']`` holds a CURIE;
                the text after the first colon is the UNII used in the query
                and the whole CURIE is the ``source_element_id`` of the
                connections.

        Returns:
            None.
        """
        #   substance is a mixture that must have components
        Inxight_Drugs_DataSupply.get_components(self, relationship_list,
                                                substance)
        # also check if it is a component
        Inxight_Drugs_DataSupply.get_mixtures(self, relationship_list,
                                              substance)

        source_element_id = substance.identifiers['unii']
        substance_unii = source_element_id.split(":", 1)[1].strip()
        query5 = """
            SELECT 
                substances._name AS substance_name,
                substances.mixture,
                relationships.type AS relationships_type,
                relationships.mediatorSubstance_id,
                relationships.interactionType AS interaction_type,
                relationships.qualification,
                relationships.amount_average AS average,
                relationships.amount_high AS high,
                relationships.amount_low AS low,
                relationships.amount_units,
                relationships.comments,
                related.uuid AS related_uuid,
                related._name AS related_substance,
                related.UNII AS related_substance_unii,
                related.substanceClass,
                unii_lookup.RXCUI AS relatedRXCUI,
                unii_lookup.PUBCHEM AS relatedPUBCHEM,
                unii_lookup.INCHIKEY AS InChiKey,
                unii_lookup.NCBI
            FROM substances
            JOIN relationships ON substances.uuid = relationships.substance_id
            JOIN substances AS related ON relationships.relatedSubstance_id = related.uuid
            LEFT JOIN unii_lookup ON related.UNII = unii_lookup.UNII
            WHERE substances.UNII = ?; 
        """
        db = Inxight_Drugs_DataSupply.get_db()
        cur5 = db.execute(query5, (substance_unii, ))

        for row in cur5.fetchall():  # loop for each related substance found
            biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
                row['substanceClass'], row['InChiKey'])
            if biolink_class == 'ignore':
                # Ignored rows are never reported, so nothing is built for them.
                continue

            related_id = "UNII:" + str(row['related_substance_unii'])
            #   Create identifiers by annotating ids with appropriate CURIE prefix;
            #   str() keeps the concatenations working for non-text columns.
            identifiers = {'unii': related_id}
            if row['InChiKey']:
                identifiers["inchikey"] = row['InChiKey']
            if row['relatedPUBCHEM']:
                identifiers["pubchem"] = "CID:" + str(row['relatedPUBCHEM'])
            if row['relatedRXCUI'] and biolink_class == 'Drug':
                # An RxNorm identifier is only attached to drugs.
                identifiers["rxnorm"] = "RXCUI:" + str(row['relatedRXCUI'])

            related_substances = Element(
                id=related_id,
                biolink_class=biolink_class,
                identifiers=identifiers,
                names_synonyms=[
                    Names(name=row['related_substance'],
                          synonyms=[],
                          source=SOURCE)
                ],  # add name & later synonyms from the database
                attributes=[],
                connections=[],
                source=self.info.name)

            #   Append additional attributes collected from Inxight:Drugs substances & unii tables
            for column in ('substanceClass', 'NCBI'):
                column_value = row[column]
                # str() before strip() so a non-text value does not raise.
                if column_value is None or not str(column_value).strip():
                    continue
                if column == 'NCBI':  # NCBI id
                    related_substances.attributes.append(
                        Attribute(provided_by=self.info.name,
                                  name='OrganismTaxon',
                                  value='NCBITaxon:' + str(column_value),
                                  source=SOURCE,
                                  type='biolink:OrganismTaxon'))
                else:
                    related_substances.attributes.append(
                        Attribute(provided_by=self.info.name,
                                  name=column,
                                  value=str(column_value),
                                  source=SOURCE,
                                  type=column))

            relationship = Connection(source_element_id=source_element_id,
                                      type=row['relationships_type'],
                                      relation=row['relationships_type'],
                                      evidence_type="",
                                      attributes=[])
            #   Append additional attributes collected from Inxight:Drugs relationships table
            for column in ('interaction_type', 'comments'):
                column_value = row[column]
                if column_value is None or not str(column_value).strip():
                    continue
                relationship.attributes.append(
                    Attribute(provided_by=self.info.name,
                              name=column,
                              value=str(column_value),
                              source=SOURCE,
                              type=column))
            if row['qualification']:
                for amount in ('average', 'high', 'low'):
                    amount_value = row[amount]
                    if amount_value is None or not str(amount_value).strip():
                        continue
                    # e.g., attribute.name: average IC50 (NANOMOLAR), attribute.value: 2.7
                    attribute_name = amount + ' ' + str(
                        row['qualification']) + ' (' + str(
                            row['amount_units']) + ')'
                    relationship.attributes.append(
                        Attribute(provided_by=self.info.name,
                                  name=attribute_name,
                                  value=str(amount_value),
                                  source=SOURCE,
                                  type=attribute_name))

            related_substances.connections.append(relationship)
            relationship_list.append(related_substances)
    def find_substance(self, substance_list, name_value):
        search_column = '_name'  # by default, assume a search for substance by name
        inchikey_regex = re.compile('[A-Z]{14}-[A-Z]{10}-[A-Z]')

        #   check if submitted name is native CURIE, e.g., CID:2244
        #   or InChiKey e.g., BSYNRYMUTXBXSQ-UHFFFAOYSA-N
        #   or else just a substance name
        if name_value.find(':') > -1:
            if name_value.find(
                    'UNII'
            ) > -1:  # a search for substance by UNII, i.e., column 'substanceUNII'
                search_column = 'substanceUNII'
            elif name_value.find(
                    'CID'
            ) > -1:  # a search for substance by CID, i.e., column 'structurePubChem'
                search_column = 'structurePubChem'
            name = name_value.split(":", 1)[1].strip()
        elif inchikey_regex.match(name_value) is not None:
            search_column = 'structureInChiKey'  # a search for substance by inchikey, i.e., column 'structureInChiKey'
            name = name_value
        else:
            name = name_value
            search_column = '_name'
        """
            Find substance by a name
        """
        query1 = """
            SELECT DISTINCT 
                substances.uuid AS uuid,
                substanceClass,
                substances.UNII AS substanceUNII,
                _name,
                structurallyDiverse,
                protein,
                nucleicAcid,
                mixture,
                polymer,
                structure_id,
                formula,
                opticalActivity,
                atropisomerism,
                stereoCenters,
                definedStereo,
                ezCenters,
                charge,
                mwt AS molecularWeight,
                stereochemistry,
                structures.InChiKey AS structureInChiKey,
                structures.pubChem AS structurePubChem,
                RXCUI,
                NCBI,
                stereoComments
            FROM substances
            LEFT JOIN structures ON substances.structure_id = structures.id
            LEFT JOIN unii_lookup ON substances.UNII = unii_lookup.UNII
            WHERE {search_column} = ?;
        """.format(search_column=search_column)
        connection = Inxight_Drugs_DataSupply.get_db()
        if search_column is not None:
            cur = connection.execute(query1, (name, ))
        #   for each hit (i.e., of the same substance name, an unlikely but possible occurrence)
        for row in cur.fetchall():
            uuid = row["uuid"]
            inchikey = None
            biolink_class = None
            substanceClass = row['substanceClass']
            #       Create identifiers by annotating ids with appropriate CURIE
            id = "UNII:" + row['substanceUNII']
            identifiers = {'unii': id}
            if (row['structureInChiKey']):
                identifiers["inchikey"] = row['structureInChiKey']
            if (row['structurePubChem']):
                identifiers["pubchem"] = "CID:" + row['structurePubChem']
        #       Select the correct biolink_class based on substanceClass
            if (row['substanceClass'] in [
                    'structurallyDiverse', 'polymer', 'protein', 'nucleicAcid',
                    'chemical', 'mixture'
            ]):
                biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
                    row['substanceClass'], row['structureInChiKey'])
            substance = Element(
                id=id,
                biolink_class=biolink_class,
                identifiers=identifiers,
                names_synonyms=[
                    Names(name=name_value, synonyms=[], source=SOURCE)
                ],  # add names & synonyms from the database
                attributes=[
                    Attribute(name='query name',
                              value=name_value,
                              provided_by=self.info.name,
                              source=SOURCE),
                ],
                connections=[],
                source=self.info.name)
            #   Append additional attributes collected from Inxight:Drugs substances & unii tables
            attributes_list = [
                'substanceClass', 'formula', 'opticalActivity',
                'atropisomerism', 'stereoCenters', 'definedStereo',
                'ezCenters', 'charge', 'molecularWeight', 'stereochemistry',
                'NCBI'
            ]
            for attribute in attributes_list:
                if row[attribute] is not None and len(
                        row[attribute].strip()) > 0:
                    if attribute != 'NCBI':
                        substance.attributes.append(
                            Attribute(provided_by=self.info.name,
                                      name=attribute,
                                      value=str(row[attribute]),
                                      source=SOURCE,
                                      type=attribute))
                    else:
                        substance.attributes.append(
                            Attribute(provided_by=self.info.name,
                                      name='OrganismTaxon',
                                      value='NCBITaxon:' + row[attribute],
                                      source=SOURCE,
                                      type='biolink:OrganismTaxon'))
            if biolink_class != 'ignore':
                substance_list.append(substance)
                # Append synonyms to the substance
                Inxight_Drugs_DataSupply.get_names_synonyms(uuid, substance)
                # Append references to the substance
                Inxight_Drugs_DataSupply.get_references(self, uuid, substance)
                # Append codes as refererences to the substance
                Inxight_Drugs_DataSupply.get_codes(self, uuid, substance)

                ##### Need to put this in a dictionary with protein, polymer, nucleic acid, ...
                if substanceClass == 'protein':
                    Inxight_Drugs_DataSupply.get_protein_info(
                        self, uuid, substance)
                elif substanceClass == 'nucleicAcid':
                    Inxight_Drugs_DataSupply.get_nucleicAcid_info(
                        self, uuid, substance)