Beispiel #1
0
 def create_element(self, disorder):
     """
     Build a disease Element from a disorder record.

     Synonym rows returned by get_synonyms() are split on their
     EXACT_MATCH flag: rows flagged 0 are kept as plain synonyms, rows
     flagged 1 are parsed as CURIEs and stored as identifiers when their
     prefix is listed in self.id_map.

     disorder -- mapping providing 'beacon_concept_id' and 'name'
     Returns the populated Element.
     """
     concept_id = disorder['beacon_concept_id']
     identifiers = {}
     inexact_names = []
     for entry in get_synonyms(concept_id):
         match_flag = entry['EXACT_MATCH']
         if match_flag == 0:
             inexact_names.append(entry['SYNONYM'])
         elif match_flag == 1:
             prefix, suffix = parse_curie(entry['SYNONYM'])
             if prefix is None or prefix not in self.id_map:
                 continue
             identifier_key = self.id_map[prefix]
             identifier_value = self.prefix_map[prefix] + suffix
             add_identifier(identifiers, identifier_key, identifier_value)

     # The element id is derived only after every identifier is collected.
     element_id = self.disorder_id(disorder, identifiers)
     primary_names = Names(name=disorder['name'],
                           synonyms=inexact_names,
                           source=self.info.label)
     return Element(id=element_id,
                    biolink_class=DISEASE,
                    identifiers=identifiers,
                    names_synonyms=[primary_names],
                    attributes=[],
                    connections=[],
                    source=self.info.name)
    def find_drugs_by_gene(info_name, drug_list, gene):
        """
        Collect the drugs that DGIdb records as interacting with the gene.

        One drug Element is appended to drug_list per interaction row. The
        query does not filter on interaction type, so every interaction
        recorded for the gene is returned.

        info_name -- source label stamped on every object created here
        drug_list -- list receiving the drug Elements (mutated in place)
        gene      -- Element whose identifiers['entrez'] is a
                     "<prefix>:<id>" string; only the part after the
                     first colon is used in the query
        """
        # The module-level connection is (re)bound here, as in the sibling
        # query functions.
        global connection

        entrez_id = gene.identifiers['entrez'].split(":", 1)[1].strip()

        # Only one column is aliased "name" (the drug name): a second
        # "genes.long_name AS name" made row['name'] ambiguous.
        query4 = """
                    SELECT
                        drugs.id AS drug_id,
                        drugs.chembl_id,
                        drugs.name AS name,
                        drugs.immunotherapy,
                        drugs.anti_neoplastic,
                        drugs.fda_approved,
                        interactions.id AS interaction_id
                    FROM genes
                    JOIN interactions on interactions.gene_id = genes.id
                    JOIN drugs ON drugs.id = interactions.drug_id
                    WHERE genes.entrez_id = ?;
                    """
        connection = DGIdbDataSupply.get_db()
        cur4 = connection.execute(query4, (entrez_id, ))

        for row in cur4.fetchall():  # one row per drug interaction
            dgidb_drug_id = row['drug_id']
            interaction_id = row['interaction_id']
            drug_curie = "ChEMBL:" + row['chembl_id']
            drug = Element(id=drug_curie,
                           biolink_class="ChemicalSubstance",
                           identifiers={'chembl': drug_curie},
                           names_synonyms=[],
                           attributes=[],
                           connections=[],
                           source=info_name)

            # Primary drug name from the drugs table.
            drug.names_synonyms.append(
                Names(name=row['name'], synonyms=[], source=info_name))

            # Further names and synonyms are added by the data-supply helper.
            DGIdbDataSupply.get_names_synonyms(dgidb_drug_id, drug)

            # Additional attributes from the DGIdb drug_attributes table.
            DGIdbDataSupply.get_drug_attributes(info_name, dgidb_drug_id, drug)

            # Connection back to the query gene, per interaction_id.
            DGIdbDataSupply.get_connection_data(drug, info_name,
                                                interaction_id, gene.id,
                                                "affected_by")

            drug_list.append(drug)
Beispiel #3
0
    def get_names_synonyms(self, names):
        """
        Build the list of Names objects from a mapping of name lists.

        The 'common', 'brand' and 'chemical' groups each yield one Names
        entry (first 'primary-*' value as name, the plain list as synonyms)
        when either of their two keys is present. Every remaining key,
        except 'DrugBank', 'PubChem' and 'CAS', yields one more entry
        typed by the key itself.
        """
        # (primary key, synonyms key, name_type) for the grouped entries.
        grouped = (
            ('primary-common', 'common', ""),
            ('primary-brand', 'brand', 'brand name'),
            ('primary-chemical', 'chemical', 'chemical name'),
        )
        collected = []
        for primary_key, synonyms_key, label in grouped:
            has_primary = primary_key in names
            has_synonyms = synonyms_key in names
            if not (has_primary or has_synonyms):
                continue
            collected.append(
                Names(name=names[primary_key][0] if has_primary else None,
                      synonyms=names[synonyms_key] if has_synonyms else None,
                      source=self.SOURCE,
                      provided_by=self.PROVIDED_BY,
                      name_type=label))

        # Keys handled above, or deliberately left out of the names list.
        skipped = {
            'DrugBank', 'PubChem', 'CAS', 'primary-common', 'common',
            'primary-brand', 'brand', 'primary-chemical', 'chemical'
        }
        for name_type, name_list in names.items():
            if name_type in skipped:
                continue
            count = len(name_list)
            # A single value is reported as the name, several as synonyms.
            collected.append(
                Names(name=name_list[0] if count == 1 else None,
                      synonyms=name_list if count > 1 else None,
                      source=self.SOURCE,
                      provided_by=self.PROVIDED_BY,
                      name_type='ChemBank ID'
                      if name_type == 'ChemBank' else name_type))
        return collected
    def find_genes_by_drug(info_name, gene_list, compound):
        """
        Collect the genes that the given drug interacts with.

        Queries the DGIdb tables by the drug's ChEMBL id and appends one
        gene Element per interaction row to gene_list, each carrying its
        name, symbol, attributes and an "affects" connection to the drug.
        """
        global connection

        # identifiers['chembl'] is "<prefix>:<id>"; SQL needs the bare id.
        chembl_curie = compound.identifiers['chembl']
        bare_chembl_id = chembl_curie.split(":", 1)[1].strip()

        # Targets SQL query.
        targets_sql = """ 
            SELECT
                drugs.id AS drug_id,
                genes.entrez_id,
                genes.name AS symbol,
                genes.long_name AS name,
                genes.id AS gene_id,
                interactions.id AS interaction_id
            FROM drugs
            JOIN interactions on interactions.drug_id = drugs.id
            JOIN genes ON genes.id = interactions.gene_id
            WHERE drugs.chembl_id = ?;
            """
        connection = DGIdbDataSupply.get_db()
        cursor = connection.execute(targets_sql, (bare_chembl_id, ))
        interaction_rows = cursor.fetchall()

        for record in interaction_rows:  # one record per gene interaction
            gene_curie = "NCBIGene:" + str(record['entrez_id'])
            gene = Element(id=gene_curie,
                           biolink_class="Gene",
                           identifiers={"entrez": gene_curie},
                           names_synonyms=[],
                           attributes=[],
                           connections=[],
                           source=info_name)

            # Long name as the name, gene symbol as the only synonym.
            gene_names = Names(name=record['name'],
                               synonyms=[record['symbol']],
                               source=info_name)
            gene.names_synonyms.append(gene_names)

            # Attributes collected from the DGIdb gene_attributes table.
            DGIdbDataSupply.get_gene_attributes(gene, info_name,
                                                record['gene_id'])

            # Connection from the gene back to the query drug.
            DGIdbDataSupply.get_connection_data(gene, info_name,
                                                record['interaction_id'],
                                                compound.id, "affects")

            gene_list.append(gene)
    def get_names_synonyms(self, names):
        """
        Build the list of Names objects from a mapping of name lists.

        The 'common', 'brand' and 'chemical' groups each yield one Names
        entry with a fixed ChemBank source label when either of their two
        keys is present. Every remaining key, except 'DrugBank', 'PubChem'
        and 'CAS', yields one more entry whose source is derived from the
        key.
        """
        # (primary key, synonyms key, source label) for the grouped entries.
        grouped = (
            ('primary-common', 'common', 'ChemBank'),
            ('primary-brand', 'brand', 'brand-name@ChemBank'),
            ('primary-chemical', 'chemical', 'chemical-name@ChemBank'),
        )
        collected = []
        for primary_key, synonyms_key, label in grouped:
            has_primary = primary_key in names
            has_synonyms = synonyms_key in names
            if not (has_primary or has_synonyms):
                continue
            collected.append(
                Names(name=names[primary_key][0] if has_primary else None,
                      synonyms=names[synonyms_key] if has_synonyms else None,
                      source=label))

        # Keys handled above, or deliberately left out of the names list.
        skipped = {
            'DrugBank', 'PubChem', 'CAS', 'primary-common', 'common',
            'primary-brand', 'brand', 'primary-chemical', 'chemical'
        }
        for name_type, name_list in names.items():
            if name_type in skipped:
                continue
            count = len(name_list)
            if name_type == 'ChemBank':
                label = 'ChemBank ID'
            else:
                label = name_type + '@ChemBank'
            # A single value is reported as the name, several as synonyms.
            collected.append(
                Names(
                    name=name_list[0] if count == 1 else None,
                    synonyms=name_list if count > 1 else None,
                    source=label,
                ))
        return collected
 def names(self, id, name):
     """
     Build the Names list for a ChEBI entry.

     The first entry is the ChEBI name itself, with a URL built from id.
     Each synonym returned by get_synonyms(id) is then filed under a
     per-source Names object ('<source>@ChEBI'), created on first use. An
     English 'INN' or 'NAME' synonym becomes that object's name while it
     has none; all other synonyms are appended to its synonym list.
     """
     primary = Names(
         name=name,
         synonyms=[],
         source='ChEBI',
         url='https://www.ebi.ac.uk/chebi/searchId.do?chebiId={}'.format(id))
     by_source = {'ChEBI': primary}
     ordered = [primary]
     for text, kind, origin, lang in get_synonyms(id):
         if origin not in by_source:
             created = Names(synonyms=[], source=origin + '@ChEBI')
             by_source[origin] = created
             ordered.append(created)
         entry = by_source[origin]
         preferred = kind in ('INN', 'NAME') and lang == 'en'
         if preferred and entry.name is None:
             entry.name = text
         else:
             entry.synonyms.append(text)
     return ordered
    def find_compound_by_unii(self, unii):
        """
        Find a compound by its UNII.

        Returns a one-element list holding a ChemicalSubstance Element whose
        id is "UNII:<unii>". When the UNII table (joined to RXNCONSO) has a
        matching row, the element also receives the preferred term as its
        name and the identifiers stored in that row.

        unii -- bare UNII code (no prefix)
        """
        curie = "UNII:" + unii

        # Columns are listed explicitly rather than using SELECT *.
        query = """
        select
            UNII.UNII,
            UNII.PT,
            UNII.RN,
            UNII.NCIT,
            UNII.PUBCHEM,
            UNII.INCHIKEY,
            UNII.SMILES,
            RXNCONSO.CODE
        from RXNCONSO
        join UNII on RXNCONSO.CODE = UNII.UNII
        where (UNII.UNII = ?)
        """
        # (unii, ) is an explicit one-element parameter tuple.
        cur = connection.execute(query, (unii, ))

        # The identifier is the CURIE even when no row matches, so found
        # and not-found results agree on the 'unii' format.
        compound = Element(id=curie,
                           biolink_class='ChemicalSubstance',
                           identifiers={'unii': curie},
                           attributes=self.find_compound_attributes(unii),
                           connections=[],
                           source=self.info.name)

        # The join can return several rows for one UNII; the last match wins.
        for row in cur.fetchall():
            if row['UNII'] != unii:
                continue
            compound.names_synonyms = [
                Names(name=row['PT'], synonyms=[], source=UNIISOURCE)
            ]

            identifiers = {'unii': curie}
            # Prefixed identifiers are emitted only when the column has a
            # value: a NULL used to raise TypeError (CAS, NCIT) or produce
            # the bogus "CID:None" (PubChem).
            if row['RN'] is not None and row['RN'] != '':
                identifiers['cas'] = 'CAS:' + str(row['RN'])
            if row['NCIT'] is not None and row['NCIT'] != '':
                identifiers['ncit'] = 'NCIT:' + str(row['NCIT'])
            # Structure identifiers are passed through unchanged.
            identifiers['inchikey'] = row['INCHIKEY']
            identifiers['smiles'] = row['SMILES']
            if row['PUBCHEM'] is not None and row['PUBCHEM'] != '':
                identifiers['pubchem'] = 'CID:' + str(row['PUBCHEM'])
            compound.identifiers = identifiers

        return [compound]
 def get_compound(self, cpd_id):
     """
     Wrap the first record returned for cpd_id in a CompoundInfo.

     The record is read by position: 1 = name, 2 = synonym,
     3 = PubChem id (also used as compound id), 4 = SMILES, 5 = InChI,
     6 = InChIKey.
     """
     # Inside the method body this name resolves to the module-level
     # get_compound lookup, not to this method.
     record = get_compound(cpd_id)[0]
     pubchem_id = record[3]
     compound_names = Names(name=record[1],
                            synonyms=[record[2]],
                            source=self.info.name)
     structure = CompoundInfoStructure(smiles=record[4],
                                       inchi=record[5],
                                       inchikey=record[6],
                                       source='CTRP')
     return CompoundInfo(
         compound_id=pubchem_id,
         identifiers=CompoundInfoIdentifiers(pubchem=pubchem_id),
         names_synonyms=[compound_names],
         structure=structure,
         attributes=[],
         source=self.info.name)
Beispiel #9
0
    def row_to_element(self, row):
        """
        Convert a database row into a chemical-substance Element.

        The row's ID is used as element id and as the 'hmdb' identifier.
        Synonym rows flagged EXACT_MATCH == 0 become synonyms; rows flagged
        1 are parsed as CURIEs and stored as identifiers when their prefix
        is in self.id_map. Detail rows whose TAG is in self.id_map are
        stored as identifiers too, and an 'inchi' detail additionally adds
        a 'structure source' attribute.
        """
        element_id = row['ID']
        concept_id = row['BEACON_CONCEPT_ID']
        identifiers = {'hmdb': element_id}

        synonyms = []
        for entry in get_synonyms(concept_id):
            match_flag = entry['EXACT_MATCH']
            if match_flag == 0:
                synonyms.append(entry['SYNONYM'])
            elif match_flag == 1:
                prefix, suffix = parse_curie(entry['SYNONYM'])
                if prefix is None or prefix not in self.id_map:
                    continue
                key = self.id_map[prefix]
                curie = self.prefix_map[prefix] + suffix
                # A 'DrugBank' CURIE whose suffix does not start with ':DB'
                # is stored as a PubChem CID instead.
                if prefix == 'DrugBank' and not suffix.startswith(':DB'):
                    key, curie = 'pubchem', 'CID' + suffix
                add_identifier(identifiers, key, curie)

        attributes = []
        for detail in get_details(concept_id):
            tag = detail['TAG']
            if tag not in self.id_map:
                continue
            add_identifier(identifiers, self.id_map[tag], detail['VALUE'])
            if tag == 'inchi':
                attributes.append(
                    Attribute(name='structure source',
                              value=self.info.label,
                              source=self.info.label,
                              provided_by=self.info.name))

        primary_names = Names(name=row['NAME'],
                              synonyms=synonyms,
                              source=self.info.label)
        return Element(id=element_id,
                       biolink_class=CHEMICAL_SUBSTANCE,
                       identifiers=identifiers,
                       names_synonyms=[primary_names],
                       attributes=attributes,
                       connections=[],
                       source=self.info.name)
 def add_element(self, row, compound_list):
     """
     Build a ChemicalSubstance Element from a ligand row and append it
     to compound_list.

     Identifier columns are copied only when they hold a non-empty value.
     The INN, when present, becomes the name and the original NAME moves
     to the synonyms; the IUPAC name and the values returned by
     self.get_names_synonyms() are added as further synonyms.

     row           -- mapping with the ligand columns read below
     compound_list -- list receiving the new Element (mutated in place)
     """
     def has_value(column):
         # Treat both NULL and the empty string as "missing".
         value = row[column]
         return value is not None and value != ''

     # (identifier key, row column, CURIE prefix or None for raw values).
     # Keys are lowercase, prefixes are uppercase.
     identifier_columns = (
         ('pubchem', 'PUBCHEMCID', 'CID:'),
         ('smiles', 'SMILES', None),
         ('inchikey', 'INCHIKEY', None),
         ('inchi', 'INCHI', None),
         ('pubchem.substance', 'PUBCHEMSID', 'SID:'),
         ('gtopdb', 'LIGAND_ID', 'GTOPDB:'),
     )
     identifiers = {}
     for key, column, prefix in identifier_columns:
         if not has_value(column):
             continue
         value = row[column]
         identifiers[key] = value if prefix is None else prefix + str(value)

     # Prefer the INN as the name; the original name becomes a synonym.
     name = row['NAME']
     synonyms = []
     if has_value('INN'):
         synonyms.append(name)
         name = row['INN']
     if has_value('IUPAC'):
         synonyms.append(row['IUPAC'])
     synonyms.extend(self.get_names_synonyms(row['LIGAND_ID']))

     names = Names(name=name, synonyms=synonyms, source=SOURCE)
     # The element id is the ligand id. The stray Element() that was
     # built and discarded here has been dropped.
     compound = Element(
         id="GTOPDB:" + str(row['LIGAND_ID']),
         biolink_class='ChemicalSubstance',
         identifiers=identifiers,
         names_synonyms=[names],
         attributes=[],
         connections=[],
         source=self.info.name,
     )
     self.get_element_attributes(row, compound)

     compound_list.append(compound)
 def get_or_create_mechanism(self, row, mechanisms, mechanism_list):
     """
     Return the mechanism Element for the row's target, creating it on
     first use.

     row            -- mapping providing 'target_chembl_id' plus the
                       target_* columns passed on to add_attribute()
     mechanisms     -- cache of Elements keyed by prefixed ChEMBL id
     mechanism_list -- list receiving each newly created Element

     A new Element is registered in both containers before it is returned.
     """
     target_chembl_id = row['target_chembl_id']
     mechanism_id = CHEMBL + target_chembl_id
     if mechanism_id in mechanisms:
         return mechanisms[mechanism_id]

     names = Names(name=target_chembl_id, synonyms=[], source=SOURCE)
     mechanism = Element(
         id=mechanism_id,
         biolink_class=MOLECULAR_ENTITY,
         identifiers={'chembl': mechanism_id},
         names_synonyms=[names],
         connections=[],
         attributes=[],
         # Stamp the producer like the sibling element builders do;
         # previously the mechanism was left without a source.
         source=self.info.name,
     )
     for column in ('target_name', 'target_type', 'target_organism'):
         add_attribute(self, mechanism, row, column)

     mechanisms[mechanism_id] = mechanism
     mechanism_list.append(mechanism)
     return mechanism
 def get_or_create_indication(self, row, indications, indication_list):
     """
     Return the indication Element for the row, creating it on first use.

     The indication is looked up by its prefixed MeSH id first, then by its
     EFO id. A new Element uses the MeSH id as its id, falling back to the
     EFO id, and is cached under every id the row provides.

     row             -- mapping providing 'mesh_id', 'efo_id' and
                        'mesh_heading'
     indications     -- cache of Elements keyed by MeSH / EFO id
     indication_list -- list receiving each newly created Element
     """
     # Prefix only when a MeSH id exists. Concatenating unconditionally
     # raised TypeError on a missing id and made the None checks below
     # (and the EFO fallback) unreachable.
     raw_mesh_id = row['mesh_id']
     mesh_id = MESH + raw_mesh_id if raw_mesh_id is not None else None
     efo_id = row['efo_id']

     for known_id in (mesh_id, efo_id):
         if known_id is not None and known_id in indications:
             return indications[known_id]

     indication_id = mesh_id if mesh_id is not None else efo_id
     names = Names(name=row['mesh_heading'], synonyms=[], source=SOURCE)
     indication = Element(id=indication_id,
                          biolink_class=DISEASE,
                          connections=[])
     # The 'efo' list starts empty and is not filled in this method.
     indication.identifiers = {'efo': []}
     indication.names_synonyms = [names]
     indication.source = self.info.name
     if mesh_id is not None:
         indications[mesh_id] = indication
         indication.identifiers['mesh'] = mesh_id
     if efo_id is not None:
         indications[efo_id] = indication
     indication_list.append(indication)
     return indication
Beispiel #13
0
 def get_compound(self, cpd_id):
     """
     Wrap the first record returned for cpd_id in a ChemicalSubstance
     Element.

     The record's PUBCHEM_CID is used both as element id and as the
     'pubchem' identifier; BROAD_CPD_ID is reported as the only synonym.
     """
     # Inside the method body this name resolves to the module-level
     # get_compound lookup, not to this method.
     record = get_compound(cpd_id)[0]
     pubchem_cid = record['PUBCHEM_CID']
     identifiers = {
         "pubchem": pubchem_cid,
         "smiles": record['SMILES'],
         "inchi": record['INCHI'],
         "inchikey": record['INCHI_KEY'],
     }
     compound_names = Names(name=record['COMPOUND_NAME'],
                            synonyms=[record['BROAD_CPD_ID']],
                            source=self.info.label)
     return Element(id=pubchem_cid,
                    biolink_class='ChemicalSubstance',
                    identifiers=identifiers,
                    names_synonyms=[compound_names],
                    attributes=[],
                    connections=[],
                    source=self.info.name)
Beispiel #14
0
def get_names(element, drug_id):
    """
    Add the synonyms stored for drug_id to element.names_synonyms.

    Each synonym is filed under one source label per coder, built as
    '<coder>[<language>]@DrugBank' (the bracketed language is omitted when
    the row has none). The first synonym seen for a source becomes the
    name of a new Names object. When a second one arrives, that name is
    moved into the synonym list, the name is cleared, and all later
    synonyms are appended to the list.
    """
    synonym_sql = """
            SELECT
                SYNONYM,
                LANGUAGE,
                CODER
            FROM SYNONYM
            LEFT JOIN LANGUAGE ON LANGUAGE.LANGUAGE_ID = SYNONYM.LANGUAGE_ID
            LEFT JOIN CODER ON CODER.CODER_ID = SYNONYM.CODER_ID
            WHERE DRUG_ID = ?
    """
    cursor = get_db().cursor()
    cursor.execute(synonym_sql, (drug_id,))

    for record in cursor.fetchall():
        synonym = record['SYNONYM']
        language = record['LANGUAGE']
        language_tag = '' if language is None else '[' + language + ']'

        # A CODER value may list several coders separated by '/'.
        for coder in str(record['CODER']).split('/'):
            source = coder + language_tag + '@DrugBank'
            matched = False
            for existing in element.names_synonyms:
                if existing.source != source:
                    continue
                if existing.synonyms == [] and source != 'DrugBank':
                    # Second synonym for this source: demote the name.
                    existing.synonyms.append(existing.name)
                    existing.name = None
                existing.synonyms.append(synonym)
                matched = True
            if not matched:
                element.names_synonyms.append(
                    Names(name=synonym, synonyms=[], source=source))
 def compound_info(self, sample):
     """
     Build a CompoundInfo from a sample record.

     sample is read by position: 0 = drug id, 1 = sample id, 2 = SMILES,
     3 = InChIKey, 4 = PubChem CID. The compound id is the prefixed
     PubChem CID when one is available, otherwise the InChIKey. The name
     comes from the drug record and the synonyms from the sample's
     synonym rows.
     """
     drug_id = sample[0]
     sample_id = sample[1]
     smiles = sample[2]
     inchi_key = sample[3]
     raw_cid = sample[4]
     # A missing CID may arrive as NULL as well as ''. Either one must
     # fall back to the InChIKey instead of yielding PUBCHEM + "None".
     if raw_cid is not None and raw_cid != '':
         pubchem_cid = PUBCHEM + str(raw_cid)
     else:
         pubchem_cid = None

     compound = get_compound(drug_id)[0]
     synonyms = get_synonyms(sample_id)
     compound_info = CompoundInfo(
         compound_id=pubchem_cid if pubchem_cid is not None else inchi_key,
         identifiers=CompoundInfoIdentifiers(pubchem=pubchem_cid),
         structure=CompoundInfoStructure(smiles=smiles,
                                         inchikey=inchi_key,
                                         source=self.info.label),
         names_synonyms=[
             Names(name=compound[1],
                   synonyms=[synonym[1] for synonym in synonyms],
                   source='Drug Repurposing Hub')
         ],
         attributes=[],
         source=self.info.name)
     return compound_info
Beispiel #16
0
    def get_drug(self, row):
        """
        Convert a drug row into a CompoundInfo.

        row is read by position: 0 = drug id, 1 = name, 2 = CAS number,
        3 = SMILES, 4 = InChI, 5 = InChIKey. The compound id and the
        'drugcentral' identifier are both 'DrugCentral:<drug id>'.
        """
        drug_id = str(row[0])
        drug_name, cas, smiles, inchi, inchi_key = (row[1], row[2], row[3],
                                                    row[4], row[5])
        curie = 'DrugCentral:' + drug_id

        identifiers = CompoundInfoIdentifiers(drugcentral=curie, cas=cas)
        drug_names = Names(name=drug_name,
                           synonyms=[],
                           source='DrugCentral',
                           url='http://drugcentral.org/drugcard/' + drug_id)
        structure = CompoundInfoStructure(smiles=smiles,
                                          inchi=inchi,
                                          inchikey=inchi_key,
                                          source='DrugCentral')
        return CompoundInfo(compound_id=curie,
                            identifiers=identifiers,
                            names_synonyms=[drug_names],
                            structure=structure,
                            source=self.info.name)
    def add_element(self, protein_id, protein_list):
        """
        Build a Protein Element for protein_id and append it to
        protein_list.

        The id is used as element id, as the 'ensembl' identifier and as
        the display name; no synonyms are recorded. Attributes are filled
        in by self.get_element_attributes().

        protein_id   -- protein identifier, stored under 'ensembl'
        protein_list -- list receiving the new Element (mutated in place)
        """
        identifiers = {'ensembl': protein_id}
        names = Names(name=protein_id, synonyms=[], source=SOURCE)

        # The stray Element() that was built and discarded here has been
        # dropped.
        protein = Element(
            id=protein_id,
            biolink_class='Protein',
            identifiers=identifiers,
            names_synonyms=[names],
            attributes=[],
            connections=[],
            source=self.info.name,
        )
        self.get_element_attributes(protein)

        protein_list.append(protein)
Beispiel #18
0
    def export(self, gene_list, controls):
        """
        Find MSigDB gene sets enriched in the submitted gene list.

        Every gene set in the GMT files that shares at least one gene with
        the list is scored with Fisher's exact test, and the p-values are
        corrected with the Benjamini-Hochberg procedure. Gene sets passing
        both thresholds are returned as Pathway Elements, sorted by
        ascending q-value.

        gene_list -- genes to test; each is identified by its Entrez gene
                     id, falling back to gene.gene_id
        controls  -- mapping providing 'max p-value' and 'max q-value'
        """
        genes = set()
        for gene in gene_list:
            entrez_id = entrez_gene_id(gene)
            genes.add(entrez_id if entrez_id is not None else gene.gene_id)

        # Per gene set, the first three cells of the contingency table:
        # (in set & in list, in set & not in list, not in set & in list).
        contingency = {}
        all_gene_set_gene_ids = set()
        for msigdb_gmt_file in msigdb_gmt_files:
            # "with" closes the file even if parsing raises.
            with open(msigdb_gmt_file) as msigdb_gmt_fh:
                for line in msigdb_gmt_fh:
                    # GMT columns: set id, description, then gene ids.
                    cols = line.strip().split('\t')
                    if len(cols) < 3:
                        continue
                    gene_set_id = cols[0]
                    gene_ids = cols[2:]
                    overlap = sum(1 for x in gene_ids if x in genes)
                    if overlap == 0:
                        continue
                    contingency[gene_set_id] = (overlap,
                                                len(gene_ids) - overlap,
                                                len(genes) - overlap)
                    all_gene_set_gene_ids.update(gene_ids)
        # Background size: distinct genes across the overlapping gene sets.
        M = len(all_gene_set_gene_ids)

        gene_set_pvalues = {}
        gene_set_odds_ratios = {}
        all_pvalues = []
        all_gene_set_ids = []
        for gene_set_id, (in_both, set_only, list_only) in contingency.items():
            neither = M - in_both - set_only - list_only
            table = [[in_both, set_only], [list_only, neither]]
            odds_ratio, pvalue = scipy.stats.fisher_exact(table)

            # All p-values take part in the multiple-testing correction,
            # not only the ones passing the threshold.
            all_pvalues.append(pvalue)
            all_gene_set_ids.append(gene_set_id)

            if pvalue < controls['max p-value']:
                gene_set_pvalues[gene_set_id] = pvalue
                gene_set_odds_ratios[gene_set_id] = odds_ratio

        all_qvalues = correct_pvalues_for_multiple_testing(
            all_pvalues, correction_type="Benjamini-Hochberg")
        gene_set_qvalues = {}
        for i, gene_set_id in enumerate(all_gene_set_ids):
            if (gene_set_id in gene_set_pvalues
                    and all_qvalues[i] < controls['max q-value']):
                gene_set_qvalues[gene_set_id] = all_qvalues[i]

        pathways = []
        for gene_set_id in sorted(gene_set_qvalues, key=gene_set_qvalues.get):
            enriched_gene_set = Element(
                id='MSigDB:' + gene_set_id,
                biolink_class='Pathway',
                identifiers={'MSigDB': 'MSigDB:' + gene_set_id},
                names_synonyms=[
                    Names(
                        name=gene_set_id,
                        synonyms=[],
                        source='MSigDB',
                        url=
                        'http://software.broadinstitute.org/gsea/msigdb/cards/{}.html'
                        .format(gene_set_id))
                ],
                attributes=[
                    Attribute(name='p-value',
                              value=str(gene_set_pvalues[gene_set_id]),
                              source=self.info.name),
                    Attribute(name='q-value',
                              value=str(gene_set_qvalues[gene_set_id]),
                              source=self.info.name),
                    Attribute(name='odds ratio',
                              value=str(gene_set_odds_ratios[gene_set_id]),
                              source=self.info.name),
                ],
                source=self.info.name)
            pathways.append(enriched_gene_set)
        return pathways
    def get_mixtures(self, relationship_list, substance):
        """
        Get mixtures that "has part" that includes the substance as a
        component, append them to relationship_list, and annotate their
        Connection per biolink:
        (https://biolink.github.io/biolink-model/docs/has_part.html)

        relationship_list -- list receiving the mixture Elements
                             (mutated in place)
        substance         -- Element whose identifiers['unii'] is a
                             "<prefix>:<id>" string; the full value is
                             used as the connection's source element id

        Mixtures whose biolink class resolves to 'ignore' are skipped.
        """
        source_element_id = substance.identifiers['unii']
        substance_unii = source_element_id.split(":", 1)[1].strip()

        query14 = """
            SELECT
                substances._name AS substance_name,
                mixture_substances.uuid AS mixture_uuid,
                mixture_substances._name AS mixture_substance,
                mixture_substances.UNII AS mixture_substance_unii,
                mixture_substances.substanceClass,
                unii_lookup.RXCUI AS mixture_RXCUI,
                unii_lookup.PUBCHEM AS mixture_PUBCHEM,
                unii_lookup.INCHIKEY AS mixture_InChiKey,
                unii_lookup.NCBI AS mixture_NCBI
            FROM substances
            JOIN components ON substances.uuid = components.refuuid
            JOIN mixtures ON components.mixture_id = mixtures.uuid
            JOIN substances AS mixture_substances ON mixtures.uuid = mixture_substances.mixture
            LEFT JOIN unii_lookup ON mixture_substances.UNII = unii_lookup.UNII
            WHERE substances.UNII = ?;
            """
        # The DB handle has its own name so the Connection objects built
        # below no longer overwrite it.
        db = Inxight_Drugs_DataSupply.get_db()
        cur14 = db.execute(query14, (substance_unii, ))

        for row in cur14.fetchall():  # one row per mixture substance found
            biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
                row['substanceClass'], row['mixture_InChiKey'])
            if biolink_class == 'ignore':
                # Nothing is reported for ignored classes, so skip before
                # building any objects.
                continue

            mixture_id = "UNII:" + str(row['mixture_substance_unii'])
            # Annotate ids with the appropriate CURIE prefix; str() keeps
            # the concatenation safe if a column holds a number.
            identifiers = {'unii': mixture_id}
            if row['mixture_InChiKey']:
                identifiers["inchikey"] = row['mixture_InChiKey']
            if row['mixture_PUBCHEM']:
                identifiers["pubchem"] = "CID:" + str(row['mixture_PUBCHEM'])
            if row['mixture_RXCUI'] and biolink_class == 'Drug':
                identifiers["rxnorm"] = "RXCUI:" + str(row['mixture_RXCUI'])

            has_part = Connection(source_element_id=source_element_id,
                                  type='has_part',
                                  relation='has_part',
                                  evidence_type="",
                                  attributes=[])
            mixture = Element(
                id=mixture_id,
                biolink_class=biolink_class,
                identifiers=identifiers,
                names_synonyms=[
                    Names(name=row['mixture_substance'],
                          synonyms=[],
                          source=SOURCE)
                ],
                attributes=[],
                connections=[has_part],
                source=self.info.name)
            relationship_list.append(mixture)
    def get_active_ingredients(self, related_list, drug):
        """Append the active ingredients of ``drug`` to ``related_list``.

        The drug's RxNorm id is matched against ``unii_lookup.RXCUI``. Every
        relationship of the matching substance whose type contains ACTIVE
        (minus the exclusions listed in the WHERE clause below) produces one
        Element carrying a ``has_active_ingredient`` connection whose source
        is the drug.

        Args:
            self: object exposing ``info.name``; used as the element source
                and as ``provided_by`` on the connection attributes.
            related_list: list that receives the new elements (mutated in
                place).
            drug: element with an ``identifiers['rxnorm']`` CURIE of the
                form ``prefix:value`` and an ``id``.

        Returns:
            None. Results are delivered through ``related_list``.
        """
        drug_rxcui = drug.identifiers['rxnorm'].split(":", 1)[1].strip()
        source_element_id = drug.id.strip()
        # unii_lookup is joined twice: once (unaliased) to locate the drug by
        # its RXCUI, and once as related_lookup so that the InChIKey, PubChem
        # CID and RXCUI attached to each ingredient are the ingredient's own
        # rather than the parent drug's.
        query7 = """
            SELECT DISTINCT
                unii_lookup.PT,
                unii_lookup.RXCUI,
                unii_lookup.PUBCHEM,
                unii_lookup.NCBI,
                substances._name,
                relatedSubstances._name AS related_substance,
                relatedSubstances.uuid AS related_substance_uuid,
                relatedSubstances.UNII AS related_substance_unii,
                relatedSubstances.substanceClass AS related_substance_class,
                relationships.type AS relationships_type,
                relationships.qualification,
                relationships.amount_average AS average,
                relationships.amount_high AS high,
                relationships.amount_low AS low,
                relationships.amount_units,
                unii_lookup.INCHIKEY AS InChiKey,
                unii_lookup.INGREDIENT_TYPE,
                related_lookup.RXCUI AS related_RXCUI,
                related_lookup.PUBCHEM AS related_PUBCHEM,
                related_lookup.INCHIKEY AS related_InChiKey
            FROM unii_lookup
            JOIN substances ON unii_lookup.UNII = substances.UNII
            LEFT JOIN relationships ON substances.uuid = relationships.substance_id
            JOIN substances AS relatedSubstances ON relationships.relatedSubstance_id = relatedSubstances.uuid
            LEFT JOIN unii_lookup AS related_lookup ON relatedSubstances.UNII = related_lookup.UNII
            WHERE
                relationships.type LIKE ("%ACTIVE%")
                AND NOT relationships.type LIKE ("%INACTIVE%")
                AND NOT relationships.type LIKE ("%PARENT->%")
                AND NOT relationships.type LIKE ("%PRODRUG->%")
                AND NOT relationships.type LIKE ("%RACEMATE->%")
                AND NOT relationships.type LIKE ("%SUBSTANCE->%")
                AND NOT relationships.type LIKE ("%METABOLITE ACTIVE%")
                AND NOT relationships.type LIKE ("%METABOLITE LESS ACTIVE%")
                AND NOT relationships.type LIKE ("%ACTIVE CONSTITUENT ALWAYS PRESENT%")
                AND unii_lookup.RXCUI = ?;
        """
        connection = Inxight_Drugs_DataSupply.get_db()
        cur7 = connection.execute(query7, (drug_rxcui, ))

        for row in cur7.fetchall():  # one row per related substance found
            inchikey = row['related_InChiKey']
            biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
                row['related_substance_class'], inchikey)
            if biolink_class == 'ignore':
                # Skip before building anything, so no synonym query is
                # spent on a substance that would be discarded anyway.
                continue

            #   Create identifiers by annotating ids with their CURIE prefix
            element_id = "UNII:" + str(row['related_substance_unii'])
            identifiers = {'unii': element_id}
            if inchikey:
                identifiers['inchikey'] = inchikey
            if row['related_PUBCHEM']:
                identifiers["pubchem"] = "CID:" + str(row['related_PUBCHEM'])
            if row['related_RXCUI'] and biolink_class == 'Drug':
                identifiers["rxnorm"] = "RXCUI:" + str(row['related_RXCUI'])

            substance = Element(
                id=element_id,
                biolink_class=biolink_class,
                identifiers=identifiers,
                names_synonyms=[
                    Names(name=row['related_substance'],
                          synonyms=[],
                          source=SOURCE)
                ],  # primary name; synonyms are appended just below
                attributes=[],
                connections=[],
                source=self.info.name)
            # Append synonyms to the substance
            Inxight_Drugs_DataSupply.get_names_synonyms(
                row['related_substance_uuid'], substance)

            relationship = Connection(source_element_id=source_element_id,
                                      type=row['relationships_type'],
                                      relation="has_active_ingredient",
                                      evidence_type="",
                                      attributes=[])
            qualification = row['qualification']
            if qualification:
                #   active ingredient amounts; each attribute is named
                #   "<kind> <qualification> (<units>)"
                for amount_kind in ('average', 'high', 'low'):
                    amount = row[amount_kind]
                    # str() keeps this working when the column holds a
                    # number instead of text.
                    if amount is None or not str(amount).strip():
                        continue
                    label = amount_kind + ' ' + str(
                        qualification) + ' (' + str(
                            row['amount_units']) + ')'
                    relationship.attributes.append(
                        Attribute(provided_by=self.info.name,
                                  name=label,
                                  value=str(amount),
                                  source=SOURCE,
                                  type=label))
            substance.connections.append(relationship)
            related_list.append(substance)
    def get_relationships(self, relationship_list, substance):
        """Append every substance related to ``substance`` to ``relationship_list``.

        Three sources are combined, all writing into the same list:
        ``get_components`` and ``get_mixtures`` are called first (presumably
        they add mixture components and containing mixtures -- confirm
        against their definitions), then the ``relationships`` table is
        queried by the substance UNII. Each resulting row becomes one
        Element with a single Connection back to the substance.

        Args:
            self: object exposing ``info.name``; used as the element source
                and as ``provided_by`` on attributes.
            relationship_list: list that receives the new elements (mutated
                in place).
            substance: element whose ``identifiers['unii']`` is a CURIE of
                the form ``prefix:value``.

        Returns:
            None. Results are delivered through ``relationship_list``.
        """
        #   substance is a mixture that must have components
        Inxight_Drugs_DataSupply.get_components(self, relationship_list,
                                                substance)
        # also check if it is a component
        Inxight_Drugs_DataSupply.get_mixtures(self, relationship_list,
                                              substance)

        source_element_id = substance.identifiers['unii']
        substance_unii = source_element_id.split(":", 1)[1].strip()

        #   Find relationships to other substances by a substance UNII
        query5 = """
            SELECT 
                substances._name AS substance_name,
                substances.mixture,
                relationships.type AS relationships_type,
                relationships.mediatorSubstance_id,
                relationships.interactionType AS interaction_type,
                relationships.qualification,
                relationships.amount_average AS average,
                relationships.amount_high AS high,
                relationships.amount_low AS low,
                relationships.amount_units,
                relationships.comments,
                related.uuid AS related_uuid,
                related._name AS related_substance,
                related.UNII AS related_substance_unii,
                related.substanceClass,
                unii_lookup.RXCUI AS relatedRXCUI,
                unii_lookup.PUBCHEM AS relatedPUBCHEM,
                unii_lookup.INCHIKEY AS InChiKey,
                unii_lookup.NCBI
            FROM substances
            JOIN relationships ON substances.uuid = relationships.substance_id
            JOIN substances AS related ON relationships.relatedSubstance_id = related.uuid
            LEFT JOIN unii_lookup ON related.UNII = unii_lookup.UNII
            WHERE substances.UNII = ?; 
        """
        connection = Inxight_Drugs_DataSupply.get_db()
        cur5 = connection.execute(query5, (substance_unii, ))

        def _has_text(value):
            # True for any non-NULL value with visible content; str() keeps
            # the check valid when a column holds a number instead of text.
            return value is not None and len(str(value).strip()) > 0

        for row in cur5.fetchall():  # one row per related substance found
            biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
                row['substanceClass'], row['InChiKey'])
            if biolink_class == 'ignore':
                # Nothing built for this row would be kept, so skip it now.
                continue

            #   Create identifiers by annotating ids with appropriate CURIE prefix
            element_id = "UNII:" + str(row['related_substance_unii'])
            identifiers = {'unii': element_id}
            if row['InChiKey']:
                identifiers["inchikey"] = row['InChiKey']
            if row['relatedPUBCHEM']:
                identifiers["pubchem"] = "CID:" + str(row['relatedPUBCHEM'])
            if row['relatedRXCUI'] and biolink_class == 'Drug':
                identifiers["rxnorm"] = "RXCUI:" + str(row['relatedRXCUI'])

            related_substance = Element(
                id=element_id,
                biolink_class=biolink_class,
                identifiers=identifiers,
                names_synonyms=[
                    Names(name=row['related_substance'],
                          synonyms=[],
                          source=SOURCE)
                ],
                attributes=[],
                connections=[],
                source=self.info.name)

            #   Element attributes collected from the substances & unii tables
            if _has_text(row['substanceClass']):
                related_substance.attributes.append(
                    Attribute(provided_by=self.info.name,
                              name='substanceClass',
                              value=str(row['substanceClass']),
                              source=SOURCE,
                              type='substanceClass'))
            if _has_text(row['NCBI']):
                # The NCBI id is exposed as a taxon CURIE
                related_substance.attributes.append(
                    Attribute(provided_by=self.info.name,
                              name='OrganismTaxon',
                              value='NCBITaxon:' + str(row['NCBI']),
                              source=SOURCE,
                              type='biolink:OrganismTaxon'))

            relationship = Connection(source_element_id=source_element_id,
                                      type=row['relationships_type'],
                                      relation=row['relationships_type'],
                                      evidence_type="",
                                      attributes=[])
            #   Connection attributes collected from the relationships table
            for column in ('interaction_type', 'comments'):
                if _has_text(row[column]):
                    relationship.attributes.append(
                        Attribute(provided_by=self.info.name,
                                  name=column,
                                  value=str(row[column]),
                                  source=SOURCE,
                                  type=column))
            qualification = row['qualification']
            if qualification:
                for amount_kind in ('average', 'high', 'low'):
                    if not _has_text(row[amount_kind]):
                        continue
                    # e.g., attribute.name: average IC50 (NANOMOLAR), attribute.value: 2.7
                    label = amount_kind + ' ' + str(
                        qualification) + ' (' + str(
                            row['amount_units']) + ')'
                    relationship.attributes.append(
                        Attribute(provided_by=self.info.name,
                                  name=label,
                                  value=str(row[amount_kind]),
                                  source=SOURCE,
                                  type=label))

            related_substance.connections.append(relationship)
            relationship_list.append(related_substance)
    def find_substance(self, substance_list, name_value):
        """Look up substances matching ``name_value`` and append them to ``substance_list``.

        ``name_value`` is interpreted, in this order, as:
          * a native CURIE -- ``UNII:<id>`` searches by UNII and
            ``CID:<id>`` by PubChem CID; only the exact text before the
            first colon is treated as the prefix;
          * an InChIKey, e.g. BSYNRYMUTXBXSQ-UHFFFAOYSA-N;
          * otherwise a substance name, matched as given. Names that merely
            contain a colon are searched whole.

        Args:
            self: object exposing ``info.name``; used as the element source
                and as ``provided_by`` on attributes.
            substance_list: list that receives the new elements (mutated in
                place).
            name_value: the query string; it is also stored as the element
                name and in a 'query name' attribute.

        Returns:
            None. Results are delivered through ``substance_list``.
        """
        # Result-set column to filter on for each recognised CURIE prefix
        curie_columns = {'UNII': 'substanceUNII', 'CID': 'structurePubChem'}
        inchikey_regex = re.compile('[A-Z]{14}-[A-Z]{10}-[A-Z]')

        prefix, separator, suffix = name_value.partition(':')
        prefix = prefix.strip()
        if separator and prefix in curie_columns:
            search_column = curie_columns[prefix]
            name = suffix.strip()
        elif inchikey_regex.match(name_value) is not None:
            search_column = 'structureInChiKey'
            name = name_value
        else:
            search_column = '_name'
            name = name_value

        # Only the column name is interpolated, and it always comes from the
        # fixed choices above; the search value itself is bound as a
        # parameter.
        query1 = """
            SELECT DISTINCT 
                substances.uuid AS uuid,
                substanceClass,
                substances.UNII AS substanceUNII,
                _name,
                structurallyDiverse,
                protein,
                nucleicAcid,
                mixture,
                polymer,
                structure_id,
                formula,
                opticalActivity,
                atropisomerism,
                stereoCenters,
                definedStereo,
                ezCenters,
                charge,
                mwt AS molecularWeight,
                stereochemistry,
                structures.InChiKey AS structureInChiKey,
                structures.pubChem AS structurePubChem,
                RXCUI,
                NCBI,
                stereoComments
            FROM substances
            LEFT JOIN structures ON substances.structure_id = structures.id
            LEFT JOIN unii_lookup ON substances.UNII = unii_lookup.UNII
            WHERE {search_column} = ?;
        """.format(search_column=search_column)
        connection = Inxight_Drugs_DataSupply.get_db()
        cur = connection.execute(query1, (name, ))

        #   for each hit (i.e., of the same substance name, an unlikely but possible occurrence)
        for row in cur.fetchall():
            uuid = row["uuid"]
            substance_class = row['substanceClass']
            #   Create identifiers by annotating ids with appropriate CURIE
            element_id = "UNII:" + row['substanceUNII']
            identifiers = {'unii': element_id}
            if row['structureInChiKey']:
                identifiers["inchikey"] = row['structureInChiKey']
            if row['structurePubChem']:
                identifiers["pubchem"] = "CID:" + str(row['structurePubChem'])

            #   Select the correct biolink_class based on substanceClass.
            #   NOTE(review): classes outside this list leave biolink_class
            #   as None, and such rows are still appended below -- confirm
            #   this is intended.
            biolink_class = None
            if substance_class in ('structurallyDiverse', 'polymer',
                                   'protein', 'nucleicAcid', 'chemical',
                                   'mixture'):
                biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
                    substance_class, row['structureInChiKey'])

            substance = Element(
                id=element_id,
                biolink_class=biolink_class,
                identifiers=identifiers,
                names_synonyms=[
                    Names(name=name_value, synonyms=[], source=SOURCE)
                ],  # names & synonyms from the database are appended below
                attributes=[
                    Attribute(name='query name',
                              value=name_value,
                              provided_by=self.info.name,
                              source=SOURCE),
                ],
                connections=[],
                source=self.info.name)

            #   Append additional attributes collected from Inxight:Drugs substances & unii tables
            attributes_list = [
                'substanceClass', 'formula', 'opticalActivity',
                'atropisomerism', 'stereoCenters', 'definedStereo',
                'ezCenters', 'charge', 'molecularWeight', 'stereochemistry',
                'NCBI'
            ]
            for attribute in attributes_list:
                value = row[attribute]
                # str() keeps the emptiness check valid when a column holds
                # a number instead of text.
                if value is None or not str(value).strip():
                    continue
                if attribute != 'NCBI':
                    substance.attributes.append(
                        Attribute(provided_by=self.info.name,
                                  name=attribute,
                                  value=str(value),
                                  source=SOURCE,
                                  type=attribute))
                else:  # the NCBI id is exposed as a taxon CURIE
                    substance.attributes.append(
                        Attribute(provided_by=self.info.name,
                                  name='OrganismTaxon',
                                  value='NCBITaxon:' + str(value),
                                  source=SOURCE,
                                  type='biolink:OrganismTaxon'))

            if biolink_class != 'ignore':
                substance_list.append(substance)
                # Append synonyms to the substance
                Inxight_Drugs_DataSupply.get_names_synonyms(uuid, substance)
                # Append references to the substance
                Inxight_Drugs_DataSupply.get_references(self, uuid, substance)
                # Append codes as references to the substance
                Inxight_Drugs_DataSupply.get_codes(self, uuid, substance)

                # Class-specific details; only proteins and nucleic acids
                # have a dedicated loader here.
                if substance_class == 'protein':
                    Inxight_Drugs_DataSupply.get_protein_info(
                        self, uuid, substance)
                elif substance_class == 'nucleicAcid':
                    Inxight_Drugs_DataSupply.get_nucleicAcid_info(
                        self, uuid, substance)
    def find_compound_by_name(info_name, compound_list, name):
        """
            Search the drugs table by name (upper-cased by the query) or by
            alias, and append one ChemicalSubstance element per matching row
            to compound_list.

            info_name is recorded as the element source and as provided_by
            on every attribute; name is the query string and is also stored
            as the element name.
        """
        name_query = """
        SELECT DISTINCT 
            drugs.id AS drug_id,
            drugs.name AS drug_name, 
            drugs.fda_approved, 
            drugs.immunotherapy, 
            drugs.anti_neoplastic, 
            drugs.chembl_id AS ChEMBL_id
        FROM drugs
        JOIN drug_aliases ON drugs.id = drug_aliases.drug_id
        WHERE drugs.name = upper(?)
        OR drug_aliases.alias = ?;
        """
        # The module-level handle is (re)bound here, exactly as before.
        global connection
        connection = DGIdbDataSupply.get_db()
        hits = connection.execute(name_query, (name, name)).fetchall()

        # (column, attribute name/type, attribute value) for each 't' flag
        # of the drugs table that is surfaced as an attribute.
        flag_attributes = (
            ('fda_approved', 'FDA approval', "approved"),
            ('immunotherapy', "Drug Class", "immunotherapy"),
            ('anti_neoplastic', "Drug Class", "anti_neoplastic"),
        )

        # More than one hit for the same drug name is unlikely but possible.
        for hit in hits:
            chembl_curie = "ChEMBL:" + hit['ChEMBL_id']
            drug_id = hit['drug_id']

            names = [Names(name=name, synonyms=[], source=SOURCE)]
            query_attributes = [
                Attribute(name='query name',
                          value=name,
                          provided_by=info_name,
                          source=SOURCE),
            ]
            compound = Element(id=chembl_curie,
                               biolink_class='ChemicalSubstance',
                               identifiers={'chembl': chembl_curie},
                               names_synonyms=names,
                               attributes=query_attributes,
                               connections=[],
                               source=info_name)

            # Synonyms come first, then the flag-derived attributes.
            DGIdbDataSupply.get_names_synonyms(drug_id, compound)

            for column, label, value in flag_attributes:
                if hit[column] == 't':
                    compound.attributes.append(
                        Attribute(provided_by=info_name,
                                  name=label,
                                  value=value,
                                  source=SOURCE,
                                  type=label))

            # Remaining attributes from the drug attributes table
            DGIdbDataSupply.get_drug_attributes(info_name, drug_id, compound)
            compound_list.append(compound)