def create_element(self, disorder):
    """Build a disease ``Element`` for one disorder record.

    Synonym rows of the disorder's beacon concept come from
    ``get_synonyms``. Rows with ``EXACT_MATCH == 0`` are kept as plain
    synonyms. Rows with ``EXACT_MATCH == 1`` are parsed as CURIEs and, when
    the prefix is known to ``self.id_map``, stored as identifiers.

    :param disorder: mapping providing ``'beacon_concept_id'`` and ``'name'``
    :return: the ``Element`` describing the disorder
    """
    concept_id = disorder['beacon_concept_id']
    identifiers = {}
    plain_synonyms = []

    for entry in get_synonyms(concept_id):
        exact = entry['EXACT_MATCH']
        if exact == 0:
            plain_synonyms.append(entry['SYNONYM'])
        elif exact == 1:
            prefix, suffix = parse_curie(entry['SYNONYM'])
            if prefix is None or prefix not in self.id_map:
                continue
            id_key = self.id_map[prefix]
            mapped_curie = self.prefix_map[prefix] + suffix
            add_identifier(identifiers, id_key, mapped_curie)

    # The element id is derived from the identifiers collected above.
    element_id = self.disorder_id(disorder, identifiers)
    disorder_names = Names(
        name=disorder['name'],
        synonyms=plain_synonyms,
        source=self.info.label,
    )
    return Element(
        id=element_id,
        biolink_class=DISEASE,
        identifiers=identifiers,
        names_synonyms=[disorder_names],
        attributes=[],
        connections=[],
        source=self.info.name,
    )
def find_drugs_by_gene(info_name, drug_list, gene):
    """Collect the drugs that interact with ``gene`` and append them to ``drug_list``.

    The gene is looked up by the numeric part of its ``'entrez'`` identifier.
    One ``Element`` is appended to ``drug_list`` per interaction row. Each
    element is then enriched through ``DGIdbDataSupply`` with synonyms, drug
    attributes and a connection (predicate ``"affected_by"``) back to the
    gene.

    :param info_name: transformer name, used as ``source`` of created objects
    :param drug_list: list that receives the created drug elements (mutated)
    :param gene: gene element; must carry ``identifiers['entrez']`` as a CURIE
    """
    # The DGIdb connection is published through the module-level global.
    global connection

    entrez_id = gene.identifiers['entrez'].split(":", 1)[1].strip()

    # Inhibitors SQL query.
    # genes.long_name used to be aliased to "name" as well, which made
    # row['name'] ambiguous; it now has its own alias so that "name" is
    # unambiguously the drug name.
    query4 = (
        " SELECT drugs.id AS drug_id, drugs.chembl_id, drugs.name AS name,"
        " drugs.immunotherapy, drugs.anti_neoplastic, drugs.fda_approved,"
        " genes.long_name AS gene_name, interactions.id AS interaction_id"
        " FROM genes"
        " JOIN interactions on interactions.gene_id = genes.id"
        " JOIN drugs ON drugs.id = interactions.drug_id"
        " WHERE genes.entrez_id = ?; "
    )
    connection = DGIdbDataSupply.get_db()
    cur4 = connection.execute(query4, (entrez_id, ))

    for row in cur4.fetchall():  # one row per drug interaction
        dgidb_drug_id = row['drug_id']
        interaction_id = row['interaction_id']
        drug_curie = "ChEMBL:" + row['chembl_id']

        drug = Element(
            id=drug_curie,
            biolink_class="ChemicalSubstance",
            identifiers={'chembl': drug_curie},
            # Start with the drug name from the drugs table.
            names_synonyms=[
                Names(name=row['name'], synonyms=[], source=info_name),
            ],
            attributes=[],
            connections=[],
            source=info_name,
        )

        # Add synonyms known to DGIdb for this drug.
        DGIdbDataSupply.get_names_synonyms(dgidb_drug_id, drug)
        # Append additional attributes collected from the DGIdb
        # drug_attributes table.
        DGIdbDataSupply.get_drug_attributes(info_name, dgidb_drug_id, drug)
        # Append the connection to the gene for this interaction_id.
        DGIdbDataSupply.get_connection_data(
            drug, info_name, interaction_id, gene.id, "affected_by")

        drug_list.append(drug)
def get_names_synonyms(self, names):
    """Build the list of ``Names`` objects from a name-type -> values mapping.

    The common, brand and chemical groups each yield at most one ``Names``
    entry: the first ``primary-*`` value is the name and the matching
    non-primary list supplies the synonyms. Every other key, except
    ``'DrugBank'``, ``'PubChem'`` and ``'CAS'``, yields its own entry with
    the key as name type (``'ChemBank'`` is reported as ``'ChemBank ID'``).

    :param names: mapping of name type to a list of strings
    :return: list of ``Names``
    """
    # (key holding the primary name, key holding synonyms, reported name type)
    groups = (
        ('primary-common', 'common', ""),
        ('primary-brand', 'brand', 'brand name'),
        ('primary-chemical', 'chemical', 'chemical name'),
    )
    skipped = {
        'DrugBank', 'PubChem', 'CAS',
        'primary-common', 'common',
        'primary-brand', 'brand',
        'primary-chemical', 'chemical',
    }

    collected = []
    for primary_key, synonyms_key, label in groups:
        has_primary = primary_key in names
        has_synonyms = synonyms_key in names
        if not (has_primary or has_synonyms):
            continue
        collected.append(
            Names(
                name=names[primary_key][0] if has_primary else None,
                synonyms=names[synonyms_key] if has_synonyms else None,
                source=self.SOURCE,
                name_type=label,
                provided_by=self.PROVIDED_BY,
            ))

    for name_type, name_list in names.items():
        if name_type in skipped:
            continue
        count = len(name_list)
        # A single value is the name; several values are all synonyms.
        collected.append(
            Names(
                name=name_list[0] if count == 1 else None,
                synonyms=name_list if count > 1 else None,
                source=self.SOURCE,
                provided_by=self.PROVIDED_BY,
                name_type='ChemBank ID' if name_type == 'ChemBank' else name_type,
            ))
    return collected
def find_genes_by_drug(info_name, gene_list, compound):
    """Collect the genes the drug interacts with and append them to ``gene_list``.

    The drug is looked up by the part of ``compound.identifiers['chembl']``
    after the first colon. One gene ``Element`` is appended per interaction
    row, enriched through ``DGIdbDataSupply`` with gene attributes and a
    connection (predicate ``"affects"``) back to the compound.

    :param info_name: transformer name, used as ``source`` of created objects
    :param gene_list: list that receives the created gene elements (mutated)
    :param compound: drug element; must carry ``identifiers['chembl']``
    """
    # The DGIdb connection is published through the module-level global.
    global connection

    chembl_id = compound.identifiers['chembl'].split(":", 1)[1].strip()

    # Targets SQL query.
    targets_sql = (
        " SELECT drugs.id AS drug_id, genes.entrez_id, genes.name AS symbol,"
        " genes.long_name AS name, genes.id AS gene_id,"
        " interactions.id AS interaction_id"
        " FROM drugs"
        " JOIN interactions on interactions.drug_id = drugs.id"
        " JOIN genes ON genes.id = interactions.gene_id"
        " WHERE drugs.chembl_id = ?; "
    )
    connection = DGIdbDataSupply.get_db()
    cursor = connection.execute(targets_sql, (chembl_id, ))

    for hit in cursor.fetchall():  # one row per gene interaction
        gene_row_id = hit['gene_id']
        interaction = hit['interaction_id']
        gene_curie = "NCBIGene:" + str(hit['entrez_id'])

        # Long name from the genes table, with the gene symbol as synonym.
        gene_names = Names(
            name=hit['name'],
            synonyms=[hit['symbol']],
            source=info_name,
        )
        gene = Element(
            id=gene_curie,
            biolink_class="Gene",
            identifiers={"entrez": gene_curie},
            names_synonyms=[gene_names],
            attributes=[],
            connections=[],
            source=info_name,
        )

        # Additional attributes from the DGIdb gene_attributes table.
        DGIdbDataSupply.get_gene_attributes(gene, info_name, gene_row_id)
        # Connection back to the query compound, per interaction id.
        DGIdbDataSupply.get_connection_data(
            gene, info_name, interaction, compound.id, "affects")

        gene_list.append(gene)
def get_names_synonyms(self, names):
    """Build the list of ``Names`` objects from a name-type -> values mapping.

    The common, brand and chemical groups each yield at most one entry whose
    ``source`` identifies the group. Every other key, except ``'DrugBank'``,
    ``'PubChem'`` and ``'CAS'``, yields its own entry with source
    ``'<key>@ChemBank'`` (``'ChemBank ID'`` for the ``'ChemBank'`` key).

    :param names: mapping of name type to a list of strings
    :return: list of ``Names``
    """
    # (key holding the primary name, key holding synonyms, source label)
    groups = (
        ('primary-common', 'common', 'ChemBank'),
        ('primary-brand', 'brand', 'brand-name@ChemBank'),
        ('primary-chemical', 'chemical', 'chemical-name@ChemBank'),
    )
    skipped = {
        'DrugBank', 'PubChem', 'CAS',
        'primary-common', 'common',
        'primary-brand', 'brand',
        'primary-chemical', 'chemical',
    }

    collected = []
    for primary_key, synonyms_key, source_label in groups:
        has_primary = primary_key in names
        has_synonyms = synonyms_key in names
        if not (has_primary or has_synonyms):
            continue
        collected.append(
            Names(
                name=names[primary_key][0] if has_primary else None,
                synonyms=names[synonyms_key] if has_synonyms else None,
                source=source_label,
            ))

    for name_type, name_list in names.items():
        if name_type in skipped:
            continue
        count = len(name_list)
        if name_type == 'ChemBank':
            source_label = 'ChemBank ID'
        else:
            source_label = name_type + '@ChemBank'
        # A single value is the name; several values are all synonyms.
        collected.append(
            Names(
                name=name_list[0] if count == 1 else None,
                synonyms=name_list if count > 1 else None,
                source=source_label,
            ))
    return collected
def names(self, id, name):
    """Return the ``Names`` entries for a ChEBI compound, grouped by source.

    The first entry is the ChEBI entry carrying ``name`` and a link to the
    ChEBI page. Each distinct synonym source gets one additional entry
    (``'<source>@ChEBI'``). Within an entry, the first English synonym of
    type ``INN`` or ``NAME`` becomes the name; everything else is a synonym.

    :param id: ChEBI identifier, passed to ``get_synonyms`` and the URL
    :param name: primary compound name
    :return: list of ``Names`` in order of first appearance
    """
    chebi_entry = Names(
        name=name,
        synonyms=[],
        source='ChEBI',
        url='https://www.ebi.ac.uk/chebi/searchId.do?chebiId={}'.format(id),
    )
    by_source = {'ChEBI': chebi_entry}
    ordered = [chebi_entry]

    for synonym, kind, source, language in get_synonyms(id):
        if source not in by_source:
            by_source[source] = Names(synonyms=[], source=source + '@ChEBI')
            ordered.append(by_source[source])
        entry = by_source[source]

        is_english_name = kind in ('INN', 'NAME') and language == 'en'
        if is_english_name and entry.name is None:
            entry.name = synonym
        else:
            entry.synonyms.append(synonym)

    return ordered
def find_compound_by_unii(self, unii):
    """Find a compound by its UNII and return it as a one-element list.

    An ``Element`` is always returned. When the UNII table has a matching
    row, the element's name and identifiers are filled in from that row.
    Identifiers built by prefixing a column value (CAS, NCIT, PubChem) are
    only added when the column holds a value, so that NULL columns neither
    raise ``TypeError`` nor produce strings such as ``'CID:None'``.

    :param unii: bare UNII code (without the ``UNII:`` prefix)
    :return: list containing the single compound ``Element``
    """
    compound_curie = "UNII:" + unii
    # Columns are listed explicitly rather than using "select *".
    query = (
        " select UNII.UNII, UNII.PT, UNII.RN, UNII.NCIT, UNII.PUBCHEM,"
        " UNII.INCHIKEY, UNII.SMILES, RXNCONSO.CODE"
        " from RXNCONSO"
        " join UNII on RXNCONSO.CODE = UNII.UNII"
        " where (UNII.UNII = ?) "
    )
    # `connection` is the module-level database handle; the trailing comma
    # makes the parameters an explicit one-element tuple.
    cur = connection.execute(query, (unii, ))

    # NOTE(review): until a row matches, 'unii' holds the bare code while
    # the row-based identifiers below use the prefixed CURIE — confirm which
    # form callers expect when nothing is found.
    compound = Element(
        id=compound_curie,
        biolink_class='ChemicalSubstance',
        identifiers={'unii': unii},
        attributes=self.find_compound_attributes(unii),
        connections=[],
        source=self.info.name,
    )

    # (identifier key, CURIE prefix, source column) for prefixed identifiers
    prefixed = (
        ('cas', 'CAS:', 'RN'),
        ('ncit', 'NCIT:', 'NCIT'),
    )

    # The join can return several rows for the same UNII; the last one wins.
    for row in cur.fetchall():
        if row['UNII'] != unii:
            continue
        compound.names_synonyms = [
            Names(name=row['PT'], synonyms=[], source=UNIISOURCE)
        ]
        identifiers = {'unii': compound_curie}
        for key, prefix, column in prefixed:
            value = row[column]
            if value is not None and value != '':
                identifiers[key] = prefix + value
        identifiers['inchikey'] = row['INCHIKEY']
        identifiers['smiles'] = row['SMILES']
        # Use the column name exactly as selected so the lookup does not
        # depend on a case-insensitive row type.
        pubchem = row['PUBCHEM']
        if pubchem is not None and pubchem != '':
            identifiers['pubchem'] = 'CID:' + str(pubchem)
        compound.identifiers = identifiers

    return [compound]
def get_compound(self, cpd_id):
    """Return a ``CompoundInfo`` for the first record of compound ``cpd_id``.

    The record comes from the module-level ``get_compound`` lookup and is
    read by position: 1 = name, 2 = synonym, 3 = PubChem id (also used as
    the compound id), 4 = SMILES, 5 = InChI, 6 = InChIKey.

    :param cpd_id: identifier passed to the module-level ``get_compound``
    :return: the populated ``CompoundInfo``
    """
    record = get_compound(cpd_id)[0]
    pubchem_id = record[3]

    compound_names = Names(
        name=record[1],
        synonyms=[record[2]],
        source=self.info.name,
    )
    structure = CompoundInfoStructure(
        smiles=record[4],
        inchi=record[5],
        inchikey=record[6],
        source='CTRP',
    )
    return CompoundInfo(
        compound_id=pubchem_id,
        identifiers=CompoundInfoIdentifiers(pubchem=pubchem_id),
        names_synonyms=[compound_names],
        structure=structure,
        attributes=[],
        source=self.info.name,
    )
def row_to_element(self, row):
    """Convert one database row into a chemical-substance ``Element``.

    Identifiers start with the row's own id under ``'hmdb'`` and are
    extended from exact-match synonyms (parsed as CURIEs) and from detail
    records whose tag is known to ``self.id_map``. Non-exact synonyms become
    the element's synonyms.

    :param row: mapping providing ``'ID'``, ``'BEACON_CONCEPT_ID'``, ``'NAME'``
    :return: the populated ``Element``
    """
    element_id = row['ID']
    concept_id = row['BEACON_CONCEPT_ID']
    identifiers = {'hmdb': element_id}

    plain_synonyms = []
    for entry in get_synonyms(concept_id):
        if entry['EXACT_MATCH'] == 0:
            plain_synonyms.append(entry['SYNONYM'])
        if entry['EXACT_MATCH'] == 1:
            prefix, suffix = parse_curie(entry['SYNONYM'])
            if prefix is None or prefix not in self.id_map:
                continue
            id_key = self.id_map[prefix]
            mapped_curie = self.prefix_map[prefix] + suffix
            # DrugBank-prefixed values whose suffix does not start with
            # ':DB' are recorded as PubChem CIDs instead.
            if prefix == 'DrugBank' and not suffix.startswith(':DB'):
                id_key = 'pubchem'
                mapped_curie = 'CID' + suffix
            add_identifier(identifiers, id_key, mapped_curie)

    attributes = []
    for detail in get_details(concept_id):
        tag = detail['TAG']
        if tag in self.id_map:
            add_identifier(identifiers, self.id_map[tag], detail['VALUE'])
        # NOTE(review): the 'structure source' attribute is emitted for
        # every 'inchi' detail, independently of the id_map check above —
        # confirm this matches the intended nesting.
        if tag == 'inchi':
            attributes.append(
                Attribute(
                    name='structure source',
                    value=self.info.label,
                    source=self.info.label,
                    provided_by=self.info.name,
                ))

    row_names = Names(
        name=row['NAME'],
        synonyms=plain_synonyms,
        source=self.info.label,
    )
    return Element(
        id=element_id,
        biolink_class=CHEMICAL_SUBSTANCE,
        identifiers=identifiers,
        names_synonyms=[row_names],
        attributes=attributes,
        connections=[],
        source=self.info.name,
    )
def add_element(self, row, compound_list):
    """Create a ligand ``Element`` from ``row`` and append it to ``compound_list``.

    Identifiers are added only for columns that hold a value (neither
    ``None`` nor the empty string). When an INN is available it becomes the
    primary name and the row's ``NAME`` is demoted to a synonym. The IUPAC
    name and the synonyms returned by ``self.get_names_synonyms`` are added
    as further synonyms.

    :param row: mapping providing the ligand columns read below
    :param compound_list: list that receives the new element (mutated)
    """

    def has_value(column):
        # Only None and '' count as missing, so a numeric 0 is kept.
        value = row[column]
        return value is not None and value != ''

    # (source column, identifier key, CURIE prefix or None for raw values).
    # Keys are lowercase, prefixes are uppercase.
    identifier_columns = (
        ('PUBCHEMCID', 'pubchem', 'CID:'),
        ('SMILES', 'smiles', None),
        ('INCHIKEY', 'inchikey', None),
        ('INCHI', 'inchi', None),
        ('PUBCHEMSID', 'pubchem.substance', 'SID:'),
        ('LIGAND_ID', 'gtopdb', 'GTOPDB:'),
    )
    identifiers = {}
    for column, key, prefix in identifier_columns:
        if not has_value(column):
            continue
        if prefix is None:
            identifiers[key] = row[column]
        else:
            identifiers[key] = prefix + str(row[column])

    # Prefer the INN as the primary name when one is available.
    name = row['NAME']
    synonyms = []
    if has_value('INN'):
        synonyms.append(name)
        name = row['INN']

    if has_value('IUPAC'):
        synonyms.append(row['IUPAC'])
    synonyms.extend(self.get_names_synonyms(row['LIGAND_ID']))

    names = Names(name=name, synonyms=synonyms, source=SOURCE)

    # A stray `Element()` that was built and immediately discarded before
    # this call has been removed.
    compound = Element(
        id="GTOPDB:" + str(row['LIGAND_ID']),
        biolink_class='ChemicalSubstance',
        identifiers=identifiers,
        names_synonyms=[names],
        attributes=[],
        connections=[],
        source=self.info.name,
    )
    self.get_element_attributes(row, compound)
    compound_list.append(compound)
def get_or_create_mechanism(self, row, mechanisms, mechanism_list):
    """Return the mechanism element for the row's target, creating it if needed.

    Elements are cached in ``mechanisms`` under their prefixed ChEMBL id. A
    newly created element is also appended to ``mechanism_list``.

    :param row: mapping providing ``'target_chembl_id'`` and the target columns
    :param mechanisms: cache of id -> element (mutated)
    :param mechanism_list: ordered list of created elements (mutated)
    :return: the cached or newly created ``Element``
    """
    target = row['target_chembl_id']
    mechanism_id = CHEMBL + target

    if mechanism_id not in mechanisms:
        created = Element(
            id=mechanism_id,
            biolink_class=MOLECULAR_ENTITY,
            identifiers={'chembl': mechanism_id},
            names_synonyms=[Names(name=target, synonyms=[], source=SOURCE)],
            connections=[],
            attributes=[],
        )
        for column in ('target_name', 'target_type', 'target_organism'):
            add_attribute(self, created, row, column)
        mechanisms[mechanism_id] = created
        mechanism_list.append(created)

    return mechanisms[mechanism_id]
def get_or_create_indication(self, row, indications, indication_list):
    """Return the indication element for ``row``, creating it if needed.

    Elements are cached in ``indications`` under both their prefixed MeSH id
    and their EFO id, whichever are present. The element id is the MeSH id
    when available, otherwise the EFO id. A newly created element is also
    appended to ``indication_list``.

    :param row: mapping providing ``'mesh_id'``, ``'efo_id'``, ``'mesh_heading'``
    :param indications: cache of id -> element (mutated)
    :param indication_list: ordered list of created elements (mutated)
    :return: the cached or newly created ``Element``
    """
    # Prefix only a real MeSH id. Concatenating unconditionally raised
    # TypeError on a NULL mesh_id and left the EFO fallback unreachable.
    raw_mesh_id = row['mesh_id']
    mesh_id = MESH + raw_mesh_id if raw_mesh_id is not None else None
    efo_id = row['efo_id']

    for known_id in (mesh_id, efo_id):
        if known_id is not None and known_id in indications:
            return indications[known_id]

    indication_id = mesh_id if mesh_id is not None else efo_id
    names = Names(name=row['mesh_heading'], synonyms=[], source=SOURCE)
    indication = Element(id=indication_id, biolink_class=DISEASE, connections=[])
    # The 'efo' list starts empty; this function does not fill it.
    indication.identifiers = {'efo': []}
    indication.names_synonyms = [names]
    indication.source = self.info.name

    if mesh_id is not None:
        indications[mesh_id] = indication
        indication.identifiers['mesh'] = mesh_id
    if efo_id is not None:
        indications[efo_id] = indication

    indication_list.append(indication)
    return indication
def get_compound(self, cpd_id):
    """Return a chemical-substance ``Element`` for compound ``cpd_id``.

    Uses the first record returned by the module-level ``get_compound``
    lookup. The PubChem CID serves as both element id and ``pubchem``
    identifier, and the Broad compound id is the only synonym.

    :param cpd_id: identifier passed to the module-level ``get_compound``
    :return: the populated ``Element``
    """
    record = get_compound(cpd_id)[0]
    pubchem_cid = record['PUBCHEM_CID']

    identifiers = {
        "pubchem": pubchem_cid,
        "smiles": record['SMILES'],
        "inchi": record['INCHI'],
        "inchikey": record['INCHI_KEY'],
    }
    compound_names = Names(
        name=record['COMPOUND_NAME'],
        synonyms=[record['BROAD_CPD_ID']],
        source=self.info.label,
    )
    return Element(
        id=pubchem_cid,
        biolink_class='ChemicalSubstance',
        identifiers=identifiers,
        names_synonyms=[compound_names],
        attributes=[],
        connections=[],
        source=self.info.name,
    )
def get_names(element, drug_id):
    """Add the synonyms recorded for ``drug_id`` to ``element.names_synonyms``.

    Each synonym row is filed under one source per coder, written as
    ``'<coder>[<language>]@DrugBank'`` (the bracketed language is omitted
    when the row has none). The first synonym of a source becomes that
    entry's name. When a second one arrives, the name is moved into the
    synonym list and the name is cleared.

    :param element: element whose ``names_synonyms`` list is extended in place
    :param drug_id: value bound to ``DRUG_ID`` in the synonym query
    """
    synonym_sql = (
        " SELECT SYNONYM, LANGUAGE, CODER FROM SYNONYM"
        " LEFT JOIN LANGUAGE ON LANGUAGE.LANGUAGE_ID = SYNONYM.LANGUAGE_ID"
        " LEFT JOIN CODER ON CODER.CODER_ID = SYNONYM.CODER_ID"
        " WHERE DRUG_ID = ? "
    )
    cursor = get_db().cursor()
    cursor.execute(synonym_sql, (drug_id,))

    for row in cursor.fetchall():
        synonym = row['SYNONYM']
        language_tag = ''
        if row['LANGUAGE'] is not None:
            language_tag = '[' + row['LANGUAGE'] + ']'

        # CODER may list several coders separated by '/'.
        for coder in str(row['CODER']).split('/'):
            source = coder + language_tag + '@DrugBank'
            existing = [
                entry for entry in element.names_synonyms
                if entry.source == source
            ]
            for entry in existing:
                if entry.synonyms == [] and source != 'DrugBank':
                    # Second synonym for this source: demote the name.
                    entry.synonyms.append(entry.name)
                    entry.name = None
                entry.synonyms.append(synonym)
            if not existing:
                element.names_synonyms.append(
                    Names(name=synonym, synonyms=[], source=source))
def compound_info(self, sample):
    """Build a ``CompoundInfo`` from one sample record.

    ``sample`` is read by position: 0 = drug id, 1 = sample id, 2 = SMILES,
    3 = InChIKey, 4 = PubChem CID. The compound id is the prefixed PubChem
    CID when one is present, otherwise the InChIKey.

    :param sample: indexable record with at least five fields
    :return: the populated ``CompoundInfo``
    """
    drug_id = sample[0]
    sample_id = sample[1]
    smiles = sample[2]
    inchi_key = sample[3]
    raw_cid = sample[4]

    # Treat a missing CID (None) like an empty one. Previously None produced
    # the bogus identifier PUBCHEM + 'None'.
    has_cid = raw_cid is not None and raw_cid != ''
    pubchem_cid = PUBCHEM + str(raw_cid) if has_cid else None

    compound = get_compound(drug_id)[0]
    synonyms = get_synonyms(sample_id)

    return CompoundInfo(
        compound_id=pubchem_cid if pubchem_cid is not None else inchi_key,
        identifiers=CompoundInfoIdentifiers(pubchem=pubchem_cid),
        structure=CompoundInfoStructure(
            smiles=smiles,
            inchikey=inchi_key,
            source=self.info.label,
        ),
        names_synonyms=[
            Names(
                name=compound[1],
                # Field 1 of each synonym record holds the synonym text.
                synonyms=[synonym[1] for synonym in synonyms],
                source='Drug Repurposing Hub',
            )
        ],
        attributes=[],
        source=self.info.name,
    )
def get_drug(self, row):
    """Convert a drug row into a ``CompoundInfo``.

    ``row`` is read by position: 0 = drug id, 1 = name, 2 = CAS number,
    3 = SMILES, 4 = InChI, 5 = InChIKey.

    :param row: indexable record with at least six fields
    :return: the populated ``CompoundInfo``
    """
    drug_id = str(row[0])
    drug_curie = 'DrugCentral:' + drug_id

    identifiers = CompoundInfoIdentifiers(drugcentral=drug_curie, cas=row[2])
    drug_names = Names(
        name=row[1],
        synonyms=[],
        source='DrugCentral',
        url='http://drugcentral.org/drugcard/' + drug_id,
    )
    structure = CompoundInfoStructure(
        smiles=row[3],
        inchi=row[4],
        inchikey=row[5],
        source='DrugCentral',
    )
    return CompoundInfo(
        compound_id=drug_curie,
        identifiers=identifiers,
        names_synonyms=[drug_names],
        structure=structure,
        source=self.info.name,
    )
def add_element(self, protein_id, protein_list):
    """Create a protein ``Element`` for ``protein_id`` and append it to ``protein_list``.

    The protein id serves as element id, as the ``'ensembl'`` identifier and
    as the name. No synonyms are set here. Attributes are then added by
    ``self.get_element_attributes``.

    :param protein_id: identifier of the protein
    :param protein_list: list that receives the new element (mutated)
    """
    # The id is recorded unconditionally under the 'ensembl' key.
    identifiers = {'ensembl': protein_id}
    names = Names(name=protein_id, synonyms=[], source=SOURCE)

    # A stray `Element()` that was built and immediately discarded before
    # this call has been removed.
    protein = Element(
        id=protein_id,
        biolink_class='Protein',
        identifiers=identifiers,
        names_synonyms=[names],
        attributes=[],
        connections=[],
        source=self.info.name,
    )
    self.get_element_attributes(protein)
    protein_list.append(protein)
def export(self, gene_list, controls):
    """Return the MSigDB gene sets enriched in ``gene_list`` as pathway elements.

    Every gene set in the files listed in ``msigdb_gmt_files`` that shares
    at least one gene with the query list is tested with Fisher's exact
    test on a 2x2 table (in set / not in set versus in list / not in list).
    P-values are corrected with ``correct_pvalues_for_multiple_testing``
    (Benjamini-Hochberg). Gene sets passing both thresholds are returned
    sorted by ascending q-value.

    :param gene_list: genes to test; each is identified by its Entrez gene
        id when ``entrez_gene_id`` returns one, otherwise by ``gene.gene_id``
    :param controls: mapping providing ``'max p-value'`` and ``'max q-value'``
    :return: list of pathway ``Element`` objects carrying p-value, q-value
        and odds-ratio attributes
    """
    # Identifiers of the query genes; only membership and size are needed.
    genes = set()
    for gene in gene_list:
        entrez_id = entrez_gene_id(gene)
        genes.add(entrez_id if entrez_id is not None else gene.gene_id)

    # Read in the gene sets.
    gene_set_y_gene_list_y = {}  # in gene set, in gene list
    gene_set_y_gene_list_n = {}  # in gene set, not in gene list
    gene_set_n_gene_list_y = {}  # not in gene set, in gene list
    gene_set_n_gene_list_n = {}  # in neither
    all_gene_set_gene_ids = set()
    for msigdb_gmt_file in msigdb_gmt_files:
        # `with` guarantees the file is closed even if a line fails to parse.
        with open(msigdb_gmt_file) as msigdb_gmt_fh:
            for line in msigdb_gmt_fh:
                cols = line.strip().split('\t')
                if len(cols) < 3:
                    continue
                gene_set_id = cols[0]
                # Column 1 is skipped; genes start at column 2.
                gene_ids = cols[2:]
                overlap = sum(1 for x in gene_ids if x in genes)
                if overlap == 0:
                    continue
                gene_set_y_gene_list_y[gene_set_id] = overlap
                gene_set_y_gene_list_n[gene_set_id] = len(gene_ids) - overlap
                gene_set_n_gene_list_y[gene_set_id] = len(genes) - overlap
                # Only gene sets overlapping the query contribute to the
                # background gene universe.
                all_gene_set_gene_ids.update(gene_ids)

    M = len(all_gene_set_gene_ids)

    gene_set_pvalues = {}
    gene_set_qvalues = {}
    gene_set_odds_ratios = {}
    all_pvalues = []
    all_gene_set_ids = []
    for gene_set_id, in_both in gene_set_y_gene_list_y.items():
        set_only = gene_set_y_gene_list_n[gene_set_id]
        list_only = gene_set_n_gene_list_y[gene_set_id]
        gene_set_n_gene_list_n[gene_set_id] = M - in_both - set_only - list_only
        table = [
            [in_both, set_only],
            [list_only, gene_set_n_gene_list_n[gene_set_id]],
        ]
        odds_ratio, pvalue = scipy.stats.fisher_exact(table)
        # Every tested set takes part in the multiple-testing correction,
        # not only those below the p-value threshold.
        all_pvalues.append(pvalue)
        all_gene_set_ids.append(gene_set_id)
        if pvalue < controls['max p-value']:
            gene_set_pvalues[gene_set_id] = pvalue
            gene_set_odds_ratios[gene_set_id] = odds_ratio

    all_qvalues = correct_pvalues_for_multiple_testing(
        all_pvalues, correction_type="Benjamini-Hochberg")
    for i, gene_set_id in enumerate(all_gene_set_ids):
        if (gene_set_id in gene_set_pvalues
                and all_qvalues[i] < controls['max q-value']):
            gene_set_qvalues[gene_set_id] = all_qvalues[i]

    pathways = []
    for gene_set_id in sorted(gene_set_qvalues, key=gene_set_qvalues.get):
        enriched_gene_set = Element(
            id='MSigDB:' + gene_set_id,
            biolink_class='Pathway',
            identifiers={'MSigDB': 'MSigDB:' + gene_set_id},
            names_synonyms=[
                Names(
                    name=gene_set_id,
                    synonyms=[],
                    source='MSigDB',
                    url='http://software.broadinstitute.org/gsea/msigdb/cards/{}.html'
                    .format(gene_set_id))
            ],
            attributes=[
                Attribute(name='p-value',
                          value=str(gene_set_pvalues[gene_set_id]),
                          source=self.info.name),
                Attribute(name='q-value',
                          value=str(gene_set_qvalues[gene_set_id]),
                          source=self.info.name),
                Attribute(name='odds ratio',
                          value=str(gene_set_odds_ratios[gene_set_id]),
                          source=self.info.name),
            ],
            source=self.info.name)
        pathways.append(enriched_gene_set)
    return pathways
def get_mixtures(self, relationship_list, substance):
    """Append the mixtures that contain ``substance`` as a component.

    Each mixture found is added to ``relationship_list`` with a ``has_part``
    connection back to the substance, per biolink
    (https://biolink.github.io/biolink-model/docs/has_part.html). Mixtures
    whose biolink class resolves to ``'ignore'`` are not added.

    :param relationship_list: list that receives mixture elements (mutated)
    :param substance: element carrying ``identifiers['unii']`` as a CURIE
    """
    substance_unii = substance.identifiers['unii'].split(":", 1)[1].strip()
    source_element_id = substance.identifiers['unii']

    mixtures_sql = (
        " SELECT substances._name AS substance_name,"
        " mixture_substances.uuid AS mixture_uuid,"
        " mixture_substances._name AS mixture_substance,"
        " mixture_substances.UNII AS mixture_substance_unii,"
        " mixture_substances.substanceClass,"
        " unii_lookup.RXCUI AS mixture_RXCUI,"
        " unii_lookup.PUBCHEM AS mixture_PUBCHEM,"
        " unii_lookup.INCHIKEY AS mixture_InChiKey,"
        " unii_lookup.NCBI AS mixture_NCBI"
        " FROM substances"
        " JOIN components ON substances.uuid = components.refuuid"
        " JOIN mixtures ON components.mixture_id = mixtures.uuid"
        " JOIN substances AS mixture_substances"
        " ON mixtures.uuid = mixture_substances.mixture"
        " LEFT JOIN unii_lookup ON mixture_substances.UNII = unii_lookup.UNII"
        " WHERE substances.UNII = ? \n; "
    )
    db = Inxight_Drugs_DataSupply.get_db()
    cursor = db.execute(mixtures_sql, (substance_unii, ))

    for row in cursor.fetchall():  # one row per mixture substance found
        mixture_curie = "UNII:" + str(row['mixture_substance_unii'])
        mixture_name = row['mixture_substance']

        # Annotate each id with the appropriate CURIE prefix.
        identifiers = {'unii': mixture_curie}
        if row['mixture_InChiKey']:
            identifiers["inchikey"] = row['mixture_InChiKey']
        if row['mixture_PUBCHEM']:
            identifiers["pubchem"] = "CID:" + row['mixture_PUBCHEM']

        biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
            row['substanceClass'], row['mixture_InChiKey'])
        if row['mixture_RXCUI'] and biolink_class == 'Drug':
            identifiers["rxnorm"] = "RXCUI:" + row['mixture_RXCUI']

        has_part = Connection(
            source_element_id=source_element_id,
            type='has_part',
            relation='has_part',
            evidence_type="",
            attributes=[],
        )
        mixture = Element(
            id=mixture_curie,
            biolink_class=biolink_class,
            identifiers=identifiers,
            # Only the name is set here; no synonyms are added.
            names_synonyms=[Names(name=mixture_name, synonyms=[], source=SOURCE)],
            attributes=[],
            connections=[has_part],
            source=self.info.name,
        )
        if biolink_class != 'ignore':
            relationship_list.append(mixture)
def get_active_ingredients(self, related_list, drug):
    """Append the active ingredients of ``drug`` to ``related_list``.

    The drug is looked up by the numeric part of its ``'rxnorm'``
    identifier. Each related substance whose relationship type matches the
    SQL filter becomes an ``Element`` with a ``has_active_ingredient``
    connection back to the drug. When the relationship has a qualification,
    its average/high/low amounts are attached as connection attributes.
    Substances whose biolink class resolves to ``'ignore'`` are not added.

    :param related_list: list that receives ingredient elements (mutated)
    :param drug: drug element carrying ``identifiers['rxnorm']`` and ``id``
    """
    drug_rxcui = drug.identifiers['rxnorm'].split(":", 1)[1].strip()
    source_element_id = drug.id.strip()

    ingredients_sql = (
        ' SELECT DISTINCT unii_lookup.PT, unii_lookup.RXCUI,'
        ' unii_lookup.PUBCHEM, unii_lookup.NCBI, substances._name,'
        ' relatedSubstances._name AS related_substance,'
        ' relatedSubstances.uuid AS related_substance_uuid,'
        ' relatedSubstances.UNII AS related_substance_unii,'
        ' relatedSubstances.substanceClass AS related_substance_class,'
        ' relationships.type AS relationships_type,'
        ' relationships.qualification,'
        ' relationships.amount_average AS average,'
        ' relationships.amount_high AS high,'
        ' relationships.amount_low AS low,'
        ' relationships.amount_units,'
        ' unii_lookup.INCHIKEY AS InChiKey,'
        ' unii_lookup.INGREDIENT_TYPE'
        ' FROM unii_lookup'
        ' JOIN substances ON unii_lookup.UNII = substances.UNII'
        ' LEFT JOIN relationships ON substances.uuid = relationships.substance_id'
        ' JOIN substances AS relatedSubstances'
        ' ON relationships.relatedSubstance_id = relatedSubstances.uuid'
        ' WHERE relationships.type LIKE ("%ACTIVE%")'
        ' AND NOT relationships.type LIKE ("%INACTIVE%")'
        ' AND NOT relationships.type LIKE ("%PARENT->%")'
        ' AND NOT relationships.type LIKE ("%PRODRUG->%")'
        ' AND NOT relationships.type LIKE ("%RACEMATE->%")'
        ' AND NOT relationships.type LIKE ("%SUBSTANCE->%")'
        ' AND NOT relationships.type LIKE ("%METABOLITE ACTIVE%")'
        ' AND NOT relationships.type LIKE ("%METABOLITE LESS ACTIVE%")'
        ' AND NOT relationships.type LIKE ("%ACTIVE CONSTITUENT ALWAYS PRESENT%")'
        ' AND RXCUI = ?; '
    )
    db = Inxight_Drugs_DataSupply.get_db()
    cursor = db.execute(ingredients_sql, (drug_rxcui, ))

    for row in cursor.fetchall():  # one row per related substance found
        ingredient_curie = "UNII:" + str(row['related_substance_unii'])
        ingredient_uuid = row['related_substance_uuid']

        identifiers = {'unii': ingredient_curie}
        if row['InChiKey']:
            identifiers['inchikey'] = row['InChiKey']
        if row['PUBCHEM']:
            identifiers["pubchem"] = "CID:" + row['PUBCHEM']

        ingredient_name = row['related_substance']
        biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
            row['related_substance_class'], row['InChiKey'])
        relationships_type = row['relationships_type']
        if row['RXCUI'] and biolink_class == 'Drug':
            identifiers["rxnorm"] = "RXCUI:" + row['RXCUI']

        substance = Element(
            id=ingredient_curie,
            biolink_class=biolink_class,
            identifiers=identifiers,
            names_synonyms=[
                Names(name=ingredient_name, synonyms=[], source=SOURCE)
            ],
            attributes=[],
            connections=[],
            source=self.info.name,
        )
        # Append synonyms to the substance.
        Inxight_Drugs_DataSupply.get_names_synonyms(ingredient_uuid, substance)

        relationship = Connection(
            source_element_id=source_element_id,
            type=relationships_type,
            relation="has_active_ingredient",
            evidence_type="",
            attributes=[],
        )
        if row['qualification']:
            # Amount attributes, named "<stat> <qualification> (<units>)".
            for stat in ('average', 'high', 'low'):
                amount = row[stat]
                if amount is None or len(amount.strip()) == 0:
                    continue
                label = (stat + ' ' + str(row['qualification'])
                         + ' (' + str(row['amount_units']) + ')')
                relationship.attributes.append(
                    Attribute(
                        provided_by=self.info.name,
                        name=label,
                        value=str(amount),
                        source=SOURCE,
                        type=label,
                    ))
        substance.connections.append(relationship)

        if biolink_class != 'ignore':  # the ingredient is not a "concept"
            related_list.append(substance)
def get_relationships(self, relationship_list, substance):
    """Find relationships to other substances by the substance's UNII.

    Components and containing mixtures are collected first, through
    ``get_components`` and ``get_mixtures``. Then every related substance in
    the relationships table becomes an ``Element`` with a connection typed
    by the relationship. Elements whose biolink class resolves to
    ``'ignore'`` are not added.

    :param relationship_list: list that receives related elements (mutated)
    :param substance: element carrying ``identifiers['unii']`` as a CURIE
    """

    def filled(value):
        # True for a string that is neither NULL nor blank.
        return value is not None and len(value.strip()) > 0

    # The substance may be a mixture with components ...
    Inxight_Drugs_DataSupply.get_components(self, relationship_list, substance)
    # ... and may itself be a component of mixtures.
    Inxight_Drugs_DataSupply.get_mixtures(self, relationship_list, substance)

    substance_unii = substance.identifiers['unii'].split(":", 1)[1].strip()
    source_element_id = substance.identifiers['unii']

    relationships_sql = (
        " SELECT substances._name AS substance_name, substances.mixture,"
        " relationships.type AS relationships_type,"
        " relationships.mediatorSubstance_id,"
        " relationships.interactionType AS interaction_type,"
        " relationships.qualification,"
        " relationships.amount_average AS average,"
        " relationships.amount_high AS high,"
        " relationships.amount_low AS low,"
        " relationships.amount_units, relationships.comments,"
        " related.uuid AS related_uuid,"
        " related._name AS related_substance,"
        " related.UNII AS related_substance_unii,"
        " related.substanceClass,"
        " unii_lookup.RXCUI AS relatedRXCUI,"
        " unii_lookup.PUBCHEM AS relatedPUBCHEM,"
        " unii_lookup.INCHIKEY AS InChiKey,"
        " unii_lookup.NCBI"
        " FROM substances"
        " JOIN relationships ON substances.uuid = relationships.substance_id"
        " JOIN substances AS related"
        " ON relationships.relatedSubstance_id = related.uuid"
        " LEFT JOIN unii_lookup ON related.UNII = unii_lookup.UNII"
        " WHERE substances.UNII = ?; "
    )
    db = Inxight_Drugs_DataSupply.get_db()
    cursor = db.execute(relationships_sql, (substance_unii, ))

    for row in cursor.fetchall():  # one row per related substance found
        related_curie = "UNII:" + str(row['related_substance_unii'])
        related_name = row['related_substance']

        # Annotate each id with the appropriate CURIE prefix.
        identifiers = {'unii': related_curie}
        if row['InChiKey']:
            identifiers["inchikey"] = row['InChiKey']
        if row['relatedPUBCHEM']:
            identifiers["pubchem"] = "CID:" + row['relatedPUBCHEM']
        biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
            row['substanceClass'], row['InChiKey'])
        if row['relatedRXCUI'] and biolink_class == 'Drug':
            identifiers["rxnorm"] = "RXCUI:" + row['relatedRXCUI']

        related = Element(
            id=related_curie,
            biolink_class=biolink_class,
            identifiers=identifiers,
            names_synonyms=[
                Names(name=related_name, synonyms=[], source=SOURCE)
            ],
            attributes=[],
            connections=[],
            source=self.info.name,
        )

        # Element attributes from the substances and unii_lookup tables.
        if filled(row['substanceClass']):
            related.attributes.append(
                Attribute(
                    provided_by=self.info.name,
                    name='substanceClass',
                    value=str(row['substanceClass']),
                    source=SOURCE,
                    type='substanceClass',
                ))
        if filled(row['NCBI']):
            # The NCBI id is reported as an organism taxon.
            related.attributes.append(
                Attribute(
                    provided_by=self.info.name,
                    name='OrganismTaxon',
                    value='NCBITaxon:' + row['NCBI'],
                    source=SOURCE,
                    type='biolink:OrganismTaxon',
                ))

        relationship = Connection(
            source_element_id=source_element_id,
            type=row['relationships_type'],
            relation=row['relationships_type'],
            evidence_type="",
            attributes=[],
        )

        # Connection attributes from the relationships table.
        for column in ('interaction_type', 'comments'):
            if filled(row[column]):
                relationship.attributes.append(
                    Attribute(
                        provided_by=self.info.name,
                        name=column,
                        value=str(row[column]),
                        source=SOURCE,
                        type=column,
                    ))

        if row['qualification']:
            for stat in ('average', 'high', 'low'):
                if not filled(row[stat]):
                    continue
                # e.g., name: average IC50 (NANOMOLAR), value: 2.7
                label = (stat + ' ' + str(row['qualification'])
                         + ' (' + str(row['amount_units']) + ')')
                relationship.attributes.append(
                    Attribute(
                        provided_by=self.info.name,
                        name=label,
                        value=str(row[stat]),
                        source=SOURCE,
                        type=label,
                    ))

        related.connections.append(relationship)
        if biolink_class != 'ignore':
            relationship_list.append(related)
def find_substance(self, substance_list, name_value):
    """Find substances matching ``name_value`` and append them to ``substance_list``.

    ``name_value`` is interpreted as one of:

    * a CURIE (contains ``':'``): searched by UNII when it contains
      ``'UNII'``, by PubChem CID when it contains ``'CID'``, otherwise by
      name; the part after the first colon is the search value
    * an InChIKey, e.g. ``BSYNRYMUTXBXSQ-UHFFFAOYSA-N``
    * otherwise a plain substance name

    Each hit becomes an ``Element`` with attributes from the substances and
    unii_lookup tables. Synonyms, references, codes and (for proteins and
    nucleic acids) class-specific information are then added through
    ``Inxight_Drugs_DataSupply``. Hits whose biolink class resolves to
    ``'ignore'`` are not appended.

    :param substance_list: list that receives substance elements (mutated)
    :param name_value: the query string as submitted
    """
    inchikey_pattern = re.compile('[A-Z]{14}-[A-Z]{10}-[A-Z]')

    # Decide which column to search and which value to bind.
    if ':' in name_value:
        if 'UNII' in name_value:
            search_column = 'substanceUNII'
        elif 'CID' in name_value:
            search_column = 'structurePubChem'
        else:
            search_column = '_name'
        name = name_value.split(":", 1)[1].strip()
    elif inchikey_pattern.match(name_value) is not None:
        search_column = 'structureInChiKey'
        name = name_value
    else:
        search_column = '_name'
        name = name_value

    # search_column always comes from the fixed set above, never from user
    # input, so formatting it into the SQL text is safe.
    substance_sql = (
        " SELECT DISTINCT substances.uuid AS uuid, substanceClass,"
        " substances.UNII AS substanceUNII, _name, structurallyDiverse,"
        " protein, nucleicAcid, mixture, polymer, structure_id, formula,"
        " opticalActivity, atropisomerism, stereoCenters, definedStereo,"
        " ezCenters, charge, mwt AS molecularWeight, stereochemistry,"
        " structures.InChiKey AS structureInChiKey,"
        " structures.pubChem AS structurePubChem,"
        " RXCUI, NCBI, stereoComments"
        " FROM substances"
        " LEFT JOIN structures ON substances.structure_id = structures.id"
        " LEFT JOIN unii_lookup ON substances.UNII = unii_lookup.UNII"
        " WHERE {search_column} = ?; "
    ).format(search_column=search_column)
    db = Inxight_Drugs_DataSupply.get_db()
    cursor = db.execute(substance_sql, (name, ))

    known_classes = [
        'structurallyDiverse', 'polymer', 'protein', 'nucleicAcid',
        'chemical', 'mixture'
    ]
    attribute_columns = [
        'substanceClass', 'formula', 'opticalActivity', 'atropisomerism',
        'stereoCenters', 'definedStereo', 'ezCenters', 'charge',
        'molecularWeight', 'stereochemistry', 'NCBI'
    ]

    # One iteration per hit; several substances may share the same name.
    for row in cursor.fetchall():
        substance_uuid = row["uuid"]
        substance_class = row['substanceClass']

        # Annotate each id with the appropriate CURIE prefix.
        substance_curie = "UNII:" + row['substanceUNII']
        identifiers = {'unii': substance_curie}
        if row['structureInChiKey']:
            identifiers["inchikey"] = row['structureInChiKey']
        if row['structurePubChem']:
            identifiers["pubchem"] = "CID:" + row['structurePubChem']

        # The biolink class stays None for unrecognised substance classes.
        biolink_class = None
        if substance_class in known_classes:
            biolink_class = Inxight_Drugs_DataSupply.get_biolink_class(
                substance_class, row['structureInChiKey'])

        substance = Element(
            id=substance_curie,
            biolink_class=biolink_class,
            identifiers=identifiers,
            names_synonyms=[
                Names(name=name_value, synonyms=[], source=SOURCE)
            ],
            attributes=[
                Attribute(name='query name',
                          value=name_value,
                          provided_by=self.info.name,
                          source=SOURCE),
            ],
            connections=[],
            source=self.info.name,
        )

        # Additional attributes from the substances and unii_lookup tables.
        for column in attribute_columns:
            value = row[column]
            if value is None or len(value.strip()) == 0:
                continue
            if column == 'NCBI':
                substance.attributes.append(
                    Attribute(provided_by=self.info.name,
                              name='OrganismTaxon',
                              value='NCBITaxon:' + value,
                              source=SOURCE,
                              type='biolink:OrganismTaxon'))
            else:
                substance.attributes.append(
                    Attribute(provided_by=self.info.name,
                              name=column,
                              value=str(value),
                              source=SOURCE,
                              type=column))

        if biolink_class != 'ignore':
            substance_list.append(substance)

        # Append synonyms, references and codes to the substance.
        Inxight_Drugs_DataSupply.get_names_synonyms(substance_uuid, substance)
        Inxight_Drugs_DataSupply.get_references(self, substance_uuid, substance)
        Inxight_Drugs_DataSupply.get_codes(self, substance_uuid, substance)

        # Class-specific details.
        if substance_class == 'protein':
            Inxight_Drugs_DataSupply.get_protein_info(
                self, substance_uuid, substance)
        elif substance_class == 'nucleicAcid':
            Inxight_Drugs_DataSupply.get_nucleicAcid_info(
                self, substance_uuid, substance)
def find_compound_by_name(info_name, compound_list, name):
    """Find compounds by name or alias and append them to ``compound_list``.

    A drug matches when its name equals the upper-cased query or one of its
    aliases equals the query as given. Each hit becomes an ``Element``
    carrying the query name, the synonyms known to DGIdb, the
    approval/drug-class flags set to ``'t'`` in the drugs table, and the
    attributes added by ``DGIdbDataSupply.get_drug_attributes``.

    :param info_name: transformer name, used as ``source``/``provided_by``
    :param compound_list: list that receives compound elements (mutated)
    :param name: the drug name or alias to look up
    """
    # The DGIdb connection is published through the module-level global.
    global connection

    lookup_sql = (
        " SELECT DISTINCT drugs.id AS drug_id, drugs.name AS drug_name,"
        " drugs.fda_approved, drugs.immunotherapy, drugs.anti_neoplastic,"
        " drugs.chembl_id AS ChEMBL_id"
        " FROM drugs"
        " JOIN drug_aliases ON drugs.id = drug_aliases.drug_id"
        " WHERE drugs.name = upper(?) OR drug_aliases.alias = ?; "
    )
    connection = DGIdbDataSupply.get_db()
    cursor = connection.execute(lookup_sql, (name, name))

    # (flag column, attribute name/type, attribute value)
    flag_attributes = (
        ('fda_approved', 'FDA approval', "approved"),
        ('immunotherapy', "Drug Class", "immunotherapy"),
        ('anti_neoplastic', "Drug Class", "anti_neoplastic"),
    )

    # One iteration per hit; several drugs may share the same name.
    for hit in cursor.fetchall():
        compound_curie = "ChEMBL:" + hit['ChEMBL_id']
        query_attribute = Attribute(
            name='query name',
            value=name,
            provided_by=info_name,
            source=SOURCE,
        )
        compound = Element(
            id=compound_curie,
            biolink_class='ChemicalSubstance',
            identifiers={'chembl': compound_curie},
            names_synonyms=[Names(name=name, synonyms=[], source=SOURCE)],
            attributes=[query_attribute],
            connections=[],
            source=info_name,
        )

        # Append synonyms.
        DGIdbDataSupply.get_names_synonyms(hit['drug_id'], compound)

        # Flags from the drugs table; 't' marks a set flag.
        for column, label, value in flag_attributes:
            if hit[column] == 't':
                compound.attributes.append(
                    Attribute(
                        provided_by=info_name,
                        name=label,
                        value=value,
                        source=SOURCE,
                        type=label,
                    ))

        # Additional attributes from the drug attributes table.
        DGIdbDataSupply.get_drug_attributes(info_name, hit['drug_id'], compound)
        compound_list.append(compound)