def substituteSymbolsAndSearch(met, lSymbols, db):
    """Search KEGG for variants of ``met`` whose symbols are replaced by '+'.

    For every entry of ``lSymbols`` that occurs in ``met``, each single
    occurrence is replaced by '+' and the resulting name is searched with
    ``queryKeggCompound``; then the variant with all occurrences replaced is
    searched as well. When a search raises, it is retried once on the name
    returned by ``replaceSpacesWithPlusAndSearch``.

    Input:
    - met: metabolite name to manipulate;
    - lSymbols: symbols to replace. Each entry is used as a regular expression,
      so regex metacharacters must be given in escaped form;
    - db: KEGG database name forwarded to ``queryKeggCompound``.
    Output:
    - lOutputs: collected identifiers, de-duplicated through ``gL.unique``.
    """

    def _occurs(symbol):
        # Entries are regex patterns: an escaped one (e.g. '\[') never passes
        # a plain substring test even though it matches '[' in the name.
        if symbol in met:
            return True
        try:
            return re.search(symbol, met) is not None
        except re.error:
            # Not a valid pattern and not literally present: nothing to do.
            return False

    def _search(name):
        # One query plus the single "spaces replaced" retry. Only ordinary
        # exceptions are treated as "no result", so Ctrl-C still interrupts.
        try:
            return list(queryKeggCompound(db, name, name, lSymbols))
        except Exception:
            plusName = replaceSpacesWithPlusAndSearch(name)
            try:
                return list(queryKeggCompound(db, plusName, plusName, lSymbols))
            except Exception:
                print('No results')
                return []

    lOutputs = []
    for symbol in lSymbols:
        if not _occurs(symbol):
            continue
        # One occurrence at a time; the whole match is replaced so the result
        # stays consistent with the re.sub() call below.
        for match in re.finditer(symbol, met):
            lOutputs += _search(met[:match.start()] + '+' + met[match.end():])
        # All occurrences at once.
        lOutputs += _search(re.sub(symbol, '+', met))
    return gL.unique(lOutputs)
def findRxnsAfterFilter(lDfs2Concat, colNameS, colNameP, db, dfR, dfP, lReactants_ids, lProducts_ids):
    """Concatenate the candidate tables, filter them and list putative reactions.

    Input:
    - lDfs2Concat: dataframes to concatenate before filtering;
    - colNameS, colNameP: names of the two columns converted to tuples and
      passed to ``filterwExtractRows``;
    - db: 'metacyc', 'kegg' or 'rhea'; any other value yields an empty result;
    - dfR, dfP: dataframes forwarded to ``filterwExtractRows``;
    - lReactants_ids, lProducts_ids: accepted but not used in this function.
    Output:
    - list of putative reaction identifiers, de-duplicated via ``gL.unique``.
    """
    merged = pd.concat(lDfs2Concat)
    for column in (colNameS, colNameP):
        merged[column + '_t'] = merged[column].apply(tuple)

    candidates = []
    matches = filterwExtractRows(merged, dfR, dfP, colNameS, colNameP)
    if len(matches) > 0:
        df = pd.concat(matches)
        if db == 'metacyc' or db == 'kegg':
            searchColumns = (colNameS, colNameP)
        elif db == 'rhea':
            # For Rhea the last five characters of both column names are dropped.
            searchColumns = (colNameS[:-5], colNameP[:-5])
        else:
            searchColumns = None
        if searchColumns is not None and not df.empty:
            candidates = findPutativeRxns(df, searchColumns[0], searchColumns[1], db)
    return gL.unique(candidates)
def generateTruthTable(lGenes1, lGenes2):
    '''
    Generate the truth matrix over the genes of two rules.
    Input:
    - lGenes1, lGenes2: lists of gene identifiers.
    Output:
    - mTruth: array with one row per False/True assignment of the genes, i.e.
      every element of itertools.product([False, True], repeat=n);
    - lGenes: the genes of both lists after gL.unique;
    - n: number of genes in lGenes.
    '''
    lGenes = gL.unique(lGenes1 + lGenes2)
    n = len(lGenes)
    mTruth = np.array(list(itertools.product([False, True], repeat=n)))
    return mTruth, lGenes, n
def generateOrganismSpecificRegex(keggGeneList):
    """Compose a regex matching the characters used by KEGG gene identifiers.

    The identifiers are scanned to decide which character ranges are needed:
    upper-case letters, lower-case letters, digits, plus every other character
    that appears in at least one identifier.

    Input:
    - keggGeneList: list of gene identifiers (strings).
    Output:
    - regexOrgSpecific: pattern of the form '([...]+)' capturing one identifier.
    """
    # Decide the letter range on characters rather than on whole identifiers:
    # mixed-case identifiers and lists mixing all-lower with all-upper
    # identifiers then get both ranges, and digit-only identifiers get none.
    hasLower = any(c.islower() for gene in keggGeneList for c in gene)
    hasUpper = any(c.isupper() for gene in keggGeneList for c in gene)
    if hasLower and hasUpper:
        p1 = 'A-Za-z'
    elif hasUpper:
        p1 = 'A-Z'
    elif hasLower:
        p1 = 'a-z'
    else:
        p1 = ''

    # Digits are included as soon as one identifier contains one.
    hasDigit = any(c.isdigit() for gene in keggGeneList for c in gene)
    p2 = '0-9' if hasDigit else ''

    # Every character not covered by the ranges above ('_' included), collected
    # once in first-seen order and escaped so that '-', ']', '^' or '\' cannot
    # alter the meaning of the character class.
    lExtraChars = dict.fromkeys(
        c for gene in keggGeneList
        for c in reModule.findall(r'[^A-Za-z0-9]', gene))
    p3 = ''.join(reModule.escape(c) for c in lExtraChars)

    # Construct the regex
    regexOrgSpecific = '([' + p1 + p2 + p3 + ']+)'
    return regexOrgSpecific
def substituteWordsAndSearch(met, dWords, db, lSymbols):
    """Search KEGG for variants of ``met`` obtained by swapping known words.

    For every key of ``dWords`` contained in ``met`` and every replacement
    listed for it, the name with that single substitution is searched with
    ``queryKeggCompound`` and then handed to ``substituteSymbolsAndSearch``.
    Finally the name carrying all substitutions accumulated so far is searched.
    When a search raises, it is retried once on the name returned by
    ``replaceSpacesWithPlusAndSearch``.

    Input:
    - met: metabolite name to manipulate;
    - dWords: dictionary mapping a word to the list of its replacements;
    - db: KEGG database name forwarded to ``queryKeggCompound``;
    - lSymbols: symbols forwarded to the search helpers.
    Output:
    - lOutputs: collected identifiers, de-duplicated through ``gL.unique``.
    """

    def _search(name):
        # Returns (hits, name last queried). The second item matters because
        # the symbol substitution below works on the retried name when the
        # first query failed. Only ordinary exceptions mean "no result".
        try:
            return list(queryKeggCompound(db, name, name, lSymbols)), name
        except Exception:
            plusName = replaceSpacesWithPlusAndSearch(name)
            try:
                hits = list(queryKeggCompound(db, plusName, plusName, lSymbols))
            except Exception:
                print('No results')
                hits = []
            return hits, plusName

    lOutputs = []
    met_substAll = met
    for word in [k for k in dWords if k in met]:
        for replacement in dWords[word]:
            hits, metSubstituted = _search(met.replace(word, replacement))
            lOutputs += hits
            # Accumulate every substitution for the final combined search.
            met_substAll = met_substAll.replace(word, replacement)
            try:
                lOutputs += substituteSymbolsAndSearch(metSubstituted, lSymbols, db)
            except Exception:
                print('No results')

    hits, _ = _search(met_substAll)
    lOutputs += hits
    return gL.unique(lOutputs)
dfChebiCompounds = pd.read_csv(os.path.join(RAWDIR, 'chebi_compounds_20201216.tsv.bz2'), sep = '\t', compression='bz2', dtype=str) dfChebiDb = pd.read_csv(os.path.join(RAWDIR, 'chebi_database_accession_20201216.tsv.bz2'), sep = '\t',compression='bz2', dtype = {'ID': str, 'COMPOUND_ID': str, 'ACCESSION_NUMBER': str}) dfChebiRelations = pd.read_csv(os.path.join(RAWDIR, 'chebi_relation_20201216.tsv.bz2'), sep = '\t', compression='bz2', dtype = {'ID': str, 'TYPE': str, 'INIT_ID': str, 'FINAL_ID': str}) dfChebiInchi = pd.read_csv(os.path.join(RAWDIR, 'chebiId_inchi_20201216.tsv.bz2'), sep = '\t', compression='bz2', dtype=str) dfChebiInchi['InChI_splitted'] = dfChebiInchi.InChI.str.split('/') dfChebiInchi['InChI_formula'] = dfChebiInchi.InChI.str.split('/').str[1] if includeCompartment is True: df['toMatch'] = df.Name.str.replace("\[(\w*\s*)+\]$", "", regex = True) df['toMatch'] = df.toMatch.str.strip() lMets2Search = list(df['toMatch']) else: lMets2Search = list(df['Name']) lMets2Search = gL.unique(lMets2Search) dfChebiNames['NAME'] = dfChebiNames['NAME'].str.lower() dfChebiCompounds['NAME'] = dfChebiCompounds['NAME'].str.lower() dfChebiUniprot['NAME'] = dfChebiUniprot['NAME'].str.lower() dizMet2Ids = {} for met in lMets2Search: keggId = '' if includeCompartment is True: dfMet = df[df['Name'].str.startswith(met + ' ' + '[')] else: dfMet = df[df['Name'] == met] dfMet = dfMet.reset_index(drop = True) if dfMet.empty == False: inchiOriginal = dfMet.iloc[0]['Inchi']
## Compute Jaccard score for each reaction and for the entire model lAllGenes_original = [] lAllGenes_gpruler = [] lJScore_sngRxns = [] for row in dfComparison.itertuples(): originalRule = row.rule_original gprulerRule = row.rule_GPRuler if (pd.isna(originalRule) == True or originalRule == ''): lGenes_originalRule = [] else: dfGenes_originalRule = gL.extractRegexFromItem(originalRule, regexOrgSpecific) lGenes_originalRule = gL.unique(list(dfGenes_originalRule[0])) lGenes_originalRule.sort() if (pd.isna(gprulerRule) == True or gprulerRule == ''): lGenes_gprulerRule = [] else: dfGenes_gprulerRule = gL.extractRegexFromItem(gprulerRule, regexOrgSpecific) lGenes_gprulerRule = gL.unique(list(dfGenes_gprulerRule[0])) lGenes_gprulerRule.sort() lAllGenes_original += lGenes_originalRule lAllGenes_gpruler += lGenes_gprulerRule if len(lGenes_originalRule) == 0 and len(lGenes_gprulerRule) == 0: jScore = 1.0
def mergeData(df):
    '''
    This function joins all retrieved information from explored databases and assembles them to generate the final GPR rule.
    Input:
    - df: dataframe generated by getKeggData function.
    Output:
    - dfFinal: dataframe where for each input gene the list of its AND and OR relationships is returned.
    '''
    lGeneColumns = ['geneName_fromKEGG', 'proteinNames', 'geneNames', 'id_uniprot', 'subunitsFromName']
    lTotalGenes = gL.unique([item for col in lGeneColumns for elem in list(df[col]) for item in elem])
    # The per-row 'subunitsFromName' lists are appended as whole elements.
    lTotalAndSubs = lTotalGenes + list(df['subunitsFromName'])
    dErroneousNames = {'SLCA7A7': 'SLC7A7', 'SLCA7A11': 'SLC7A11'}

    def _keepKnown(lNames):
        # Keep only the names that belong to the input gene set.
        return [x for x in lNames if x in lTotalAndSubs]

    def _optionalColumn(row, field):
        # Best effort: a column that is missing or holds a non-iterable value
        # contributes nothing instead of stopping the merge.
        try:
            return _keepKnown(list(getattr(row, field)))
        except Exception:
            return []

    list_and = []
    list_or = []
    for r in df.itertuples():
        # Names of the current gene itself, built once per row.
        lOwnIds = list(r.id_uniprot)
        lOwnNames = lOwnIds + list(r.proteinNames) + list(r.geneNames) + list(r.subunitsFromName) + list(r.geneName_fromKEGG)

        # Clean the information retrieved from the explored databases by
        # removing the elements that are not included in the input gene list.
        finallComplexPortal_unipId = _keepKnown(
            [x for x in list(r.complexPortal_uniprotId) if x not in lOwnIds])
        finallComplexPortal_protName = _keepKnown(
            [x for x in list(r.complexPortal_protName) if x not in lOwnIds])
        ldfInteract = _optionalColumn(r, 'interact')
        ldfSimilarity = _optionalColumn(r, 'by_similarity')
        finalStringSubs = _keepKnown(list(r.stringSubunits))
        finalOtherSubunits = _keepKnown(
            [x for x in list(r.otherSubunits) if x not in lOwnNames])

        lIsoformCandidates = list(r.otherIsoforms) + list(r.isoform)
        lAllIsoforms = gL.unique(lIsoformCandidates)

        # AND relationships: subunits from every source; STRING subunits that
        # are also isoforms are left out.
        subunit = gL.unique(
            finallComplexPortal_unipId + finallComplexPortal_protName
            + ldfInteract + ldfSimilarity
            + gL.difference(finalStringSubs, lAllIsoforms) + finalOtherSubunits)
        # Use the dictionary of known wrong names to correct them.
        finalSubunitSet = [dErroneousNames.get(s, s) for s in subunit]

        # OR relationships: isoforms that are not names of the gene itself and
        # that fall within the input gene list.
        finalIsoforms = _keepKnown([x for x in lIsoformCandidates if x not in lOwnNames])

        list_and.append(finalSubunitSet)
        list_or.append(finalIsoforms)

    dfFinal = pd.DataFrame({
        'gene': df['geneNames'],
        'uniprotId': df['id_uniprot']
    })
    dfFinal['AND'] = list_and
    dfFinal['OR'] = list_or
    return dfFinal
def getStringData(df, gene='id_uniprot'):
    '''
    This function retrieves from STRING database known and predicted protein-protein interactions established by each queried metabolic gene.
    Input:
    - df: dataframe generated by the previous step; the columns 'proteinNames' and 'geneNames' are read in addition to the one named by gene;
    - gene: column name of uniprot identifiers of genes. By default it is set equal to 'id_uniprot'.
    Output:
    - df: enriched input dataframe with information retrieved from STRING database (new column 'stringSubunits').
    '''

    def _fetchJson(url):
        # NOTE(review): verify=False disables TLS certificate verification and
        # the retry on HTTP 524 has no upper bound; both are kept as they were.
        response = requests.get(url, verify=False)
        while response.status_code == 524:
            response = requests.get(url, verify=False)
        return response.json()

    dizUniprotString = {}  # uniprot id -> (STRING name of the query, interactors)
    dizlInteractors = {}   # (STRING name of the query, interactor) -> enrichment
    stringSubunits = []
    for row in df.itertuples():
        lStringInteractors = []
        for uniprotId in getattr(row, gene):
            if uniprotId not in dizUniprotString:
                originalUniprotNames = row.proteinNames + row.geneNames
                net = _fetchJson("https://string-db.org/api/json/network?identifiers=" + uniprotId)
                lInteractors = []
                original = ''
                # Anything that is not a list of edges is ignored.
                if isinstance(net, list):
                    for edge in net:
                        nameA = edge['preferredName_A']
                        nameB = edge['preferredName_B']
                        if nameA == uniprotId or nameA in originalUniprotNames:
                            original = nameA
                            lInteractors.append(nameB)
                        elif nameB == uniprotId or nameB in originalUniprotNames:
                            # NOTE(review): 'original' is not set on this side,
                            # so a query seen only as partner B is never
                            # enriched -- confirm this is intended.
                            lInteractors.append(nameA)
                dizUniprotString[uniprotId] = (original, lInteractors)
            else:
                original, lInteractors = dizUniprotString[uniprotId]

            if original == '':
                continue
            for interactor in lInteractors:
                # The enrichment request depends on both names, so the cache is
                # keyed by the pair; keying by the interactor alone would reuse
                # the answer obtained for a different query protein.
                pair = (original, interactor)
                if pair not in dizlInteractors:
                    dizlInteractors[pair] = _fetchJson(
                        'https://string-db.org//api/json/enrichment?identifiers=' + original + '%0d' + interactor)
                diz = dizlInteractors[pair]
                if not isinstance(diz, list):
                    # Not a list of enrichment terms: nothing to inspect.
                    continue
                for stringElement in diz:
                    description = str(stringElement['description'])
                    if (stringElement['category'] == 'Component'
                            and ("complex" in description or "chain" in description)
                            and original in stringElement['inputGenes']):
                        lStringInteractors.append(interactor)
        stringSubunits.append(gL.unique(lStringInteractors))
    df['stringSubunits'] = stringSubunits
    return df
for row in dfAll.itertuples(): dfIsolateMatches = dfChebiCompounds[dfChebiCompounds['NAME'].isin( row.Matches)] if dfIsolateMatches.empty is True: lCorrespondences = [] else: lCorrespondences = list(dfIsolateMatches['ID'].dropna()) if row.Name.strip() not in dMetMapping: dMetMapping[row.Name.strip()] = lCorrespondences else: dMetMapping[row.Name.strip()] += lCorrespondences dMetMapping_woDuplicates = {} for met in dMetMapping: dMetMapping_woDuplicates[met.strip()] = gL.unique(dMetMapping[met]) dfMatches = pd.DataFrame(dMetMapping_woDuplicates.items(), columns=['Name', 'Identifiers']) dfMatches.to_csv(os.path.join(OUTDIR, prefix_modelName + '_mappingFuzzy.tsv'), sep='\t', index=False) ## Joining FuzzyWuzzy and output from Step1 dfFuzzyMatches = pd.read_csv(os.path.join( OUTDIR, prefix_modelName + '_mappingFuzzy.tsv'), sep='\t') dfFuzzyMatches['Identifiers_fuzzy'] = dfFuzzyMatches['Identifiers'].apply( ast.literal_eval) dfClassicMatches = pd.read_csv(os.path.join(
for llUnipRhea in list(dfSearch['UniprotId'].dropna()): for lUnipRhea in llUnipRhea: if any('up:' + el in list(dfuniprot2Org['uniprot']) for el in lUnipRhea) is True: dfCorrespondingGenes = dfuniprot2Org[ dfuniprot2Org['uniprot'].isin( ['up:' + el for el in lUnipRhea])] for foundGene in list(dfCorrespondingGenes['keggGeneId']): if [foundGene.split(':')[1]] not in lMetaEnzOR: geneId2search = foundGene.split(':')[1] lMetaEnzOR, dGenesFromKegg = rxnL.checkNadNadpDependencies_or( nadp, nad, geneId2search, orgCode + ':' + geneId2search, lMetaEnzOR, dGenesFromKegg) lEc = gL.unique(lEc) for ec in lEc: if ec in dEcFromKegg: dEcs = dEcFromKegg[ec] else: dEcs = rxnL.getKeggInfo('ec:' + ec) dEcFromKegg[ec] = dEcs if dEcs != 400 and dEcs != 404 and 'GENES' in dEcs and orgCode.upper( ) in dEcs['GENES']: for item in dEcs['GENES'][orgCode.upper()].split(): par = item.find('(') if par != -1: if [item[:par]] not in lMetaEnzOR: geneId2search = item[:par] lMetaEnzOR, dGenesFromKegg = rxnL.checkNadNadpDependencies_or( nadp, nad, geneId2search,
def extractKeggIdComp(met, dfChebiNames, dfChebiDb, dfChebiRelations, dfChebiUniprot, dfChebiCompounds, dfChebiInchi, inchiOriginal):
    '''
    Collect identifiers for a metabolite name by querying KEGG and the ChEBI tables.
    The searches run from the most direct to the most permissive one:
    KEGG queries on the name and on manipulated variants, ChEBI name lookups,
    lookups on the text split at '(' or ',', an InChI comparison and, last,
    KEGG queries on the name truncated at the last '(' or ','.
    Input:
    - met: metabolite name;
    - dfChebiNames, dfChebiDb, dfChebiRelations, dfChebiUniprot, dfChebiCompounds: ChEBI tables forwarded to the helper functions;
    - dfChebiInchi: table iterated for the InChI comparison; the attributes InChI_splitted and CHEBI_ID of each row are read;
    - inchiOriginal: InChI string of the metabolite, or NaN/'' when unknown.
    Output:
    - keggId: identifiers found, passed through gL.unique and without empty strings.
    '''
    # Word -> alternative spellings tried by the word-substitution searches.
    # NOTE(review): the key '-ld-pe-pool' appears twice in this literal.
    dWords = {'ic acid':['ate'], 'ate':['ic acid'], 'bisphosphate':['diphosphate'], 'diphosphate':['bisphosphate'],
              'aminium': ['amine'], 'amine': ['aminium'], 'ammonia': ['nh4+', 'nh3'], 'ammonium': ['nh4+', 'nh3'],
              'proton': ['H+'], 'adenosine triphosphate': ['atp'], 'adenosine diphosphate': ['adp'],
              'coenzyme a': ['coa'], 'coa': ['coenzyme a'], 'apotransferin': ['apotransferrin'],
              "adenosine-5'-diphosphate": ['adp'], "adenosine 5'-diphosphate": ['adp'],
              "uridine-5'-diphosphate": ['udp'], "uridine 5'-diphosphate": ['udp'],
              "deoxyuridine-5'-diphosphate": ['dudp'], "deoxyuridine-5'-triphosphate": ['dutp'],
              'acp': ['[acp]', 'acyl-carrier protein', '[acyl-carrier protein]', 'acyl-carrier-protein', '[acyl-carrier-protein]'],
              '[acp]': ['acp'], "cytidine-5'-monophosphate": ['cmp'], "cytidine 5'-monophosphate": ['cmp'],
              '-ld-pe-pool': [''], '-ld-ps-pool': [''], '-ld-pc-pool': [''], '-ld-pe-pool': [''],
              '-ld-tg1-pool': [''], '-ld-tg2-pool': [''], '-ld-tg3-pool': [''], '-ld-pi-pool': [''],
              '-vldl-pool': [''], '-bile-pc-pool': [''], '-uptake-pool': [''], '-pool': [''], 'ide': ['ic acid']}
    # NOTE(review): the bracket/parenthesis entries are regex-escaped, while the
    # `symbol in met` tests below are plain substring checks.
    lSymbols = ['/', '-', "'", '\[', '\]', '\(', '\)']
    keggId = []
    lDbs = ['compound', 'glycan']
    for database in lDbs:
        # 1) name as it is; on failure retry with the spaces replaced and, if
        #    that fails too, fall back to symbol and word substitutions.
        try:
            lkeggId_perfectMatch = queryKeggCompound(database, met, met, lSymbols)
        except:
            try:
                metNew = replaceSpacesWithPlusAndSearch(met)
                lkeggId_perfectMatch = queryKeggCompound(database, metNew, met, lSymbols)
            except:
                if any(symbol in met for symbol in lSymbols) is True:
                    lOutputs = substituteSymbolsAndSearch(met, lSymbols, database)
                    keggId += lOutputs
                if any(k in met for k,v in dWords.items()) is True:
                    lOutputs = substituteWordsAndSearch(met, dWords, database, lSymbols)
                    keggId += lOutputs
            else:
                keggId += lkeggId_perfectMatch
        else:
            if len(lkeggId_perfectMatch) != 0:
                keggId += lkeggId_perfectMatch
            if any(symbol in met for symbol in lSymbols) is True:
                lOutputs = substituteSymbolsAndSearch(met, lSymbols, database)
                keggId += lOutputs
            if any(k in met for k,v in dWords.items()) is True:
                lOutputs = substituteWordsAndSearch(met, dWords, database, lSymbols)
                keggId += lOutputs
        # 2) name rewritten by replaceSpacesWithNoneAndSearch
        try:
            metNew_woSpaces = replaceSpacesWithNoneAndSearch(met)
            lkeggId_perfectMatch = queryKeggCompound(database, metNew_woSpaces, met, lSymbols)
        except:
            print('not found')
        else:
            keggId += lkeggId_perfectMatch
        # 3) name with one single space removed at a time
        lPositions = [p.start() for p in re.finditer(' ', met)]
        for posFound in lPositions:
            met_subst = met[:posFound] + met[posFound + 1:]
            try:
                lkeggId_perfectMatch = queryKeggCompound(database, met_subst, met_subst, lSymbols)
            except:
                print('not found')
            else:
                keggId += lkeggId_perfectMatch
        # 4) characters removed cumulatively, from left to right
        # NOTE(review): lPositions holds indices into the original name; after
        # the first removal the remaining indices no longer point at the
        # spaces of met_subst -- confirm whether this is intended.
        met_subst = (met + '.')[:-1]
        for posFound in lPositions:
            met_subst = met_subst[:posFound] + met_subst[posFound + 1:]
            try:
                lkeggId_perfectMatch = queryKeggCompound(database, met_subst, met_subst, lSymbols)
            except:
                print('not found')
            else:
                keggId += lkeggId_perfectMatch
    ## search the metabolite in ChEBI
    lchebiId = name2ChebiIds_perfectMatch(met, dfChebiNames, dfChebiUniprot, dfChebiCompounds)
    keggId += lchebiId
    ## And from the list of ChEBI identifiers, search information in ChEBI about KEGG and MetaCyc identifiers
    keggIdentifiers = fromChebi2KeggAndMetaCyc(lchebiId, dfChebiDb, dfChebiRelations)
    keggId += keggIdentifiers
    ## If the perfect match is not found, try to manipulate the name of the metabolite
    lchebiId = name2ChebiIds_subsSymbolsInChebiOutput(met, lSymbols, dfChebiNames, dfChebiUniprot, dfChebiCompounds)
    keggId += lchebiId
    keggIdentifier = fromChebi2KeggAndMetaCyc(lchebiId, dfChebiDb, dfChebiRelations)
    keggId += keggIdentifier
    ## try to substitute words within the metabolite name to find the match in ChEBI or if not found try again to manipulate the string
    lAllId = name2ChebiIds_subsWords(met, dWords, lSymbols, dfChebiNames, dfChebiUniprot, dfChebiCompounds, dfChebiDb, dfChebiRelations)
    keggId += lAllId
    ## try to search the string before the parenthesis or the comma symbol
    # From here on every step runs only while nothing has been found yet.
    if len(keggId) == 0:
        lchebiId_fromChebiCompounds = splitChebiResultsOnSymbol(dfChebiCompounds, 'NAME', 'ID', '(')
        lchebiId_fromChebiNames = splitChebiResultsOnSymbol(dfChebiNames, 'NAME', 'COMPOUND_ID', '(')
        lchebiId_fromChebiUniprot = splitChebiResultsOnSymbol(dfChebiUniprot, 'NAME', 'ID', '(')
        lchebiId = lchebiId_fromChebiCompounds + lchebiId_fromChebiNames + lchebiId_fromChebiUniprot
        keggId += lchebiId
        keggIdentifier = fromChebi2KeggAndMetaCyc(lchebiId, dfChebiDb, dfChebiRelations)
        keggId += keggIdentifier
    if len(keggId) == 0:
        lchebiId_fromChebiCompounds = splitChebiResultsOnSymbol(dfChebiCompounds, 'NAME', 'ID', ',')
        lchebiId_fromChebiNames = splitChebiResultsOnSymbol(dfChebiNames, 'NAME', 'COMPOUND_ID', ',')
        lchebiId_fromChebiUniprot = splitChebiResultsOnSymbol(dfChebiUniprot, 'NAME', 'ID', ',')
        lchebiId = lchebiId_fromChebiCompounds + lchebiId_fromChebiNames + lchebiId_fromChebiUniprot
        keggId += lchebiId
        keggIdentifier = fromChebi2KeggAndMetaCyc(lchebiId, dfChebiDb, dfChebiRelations)
        keggId += keggIdentifier
    # InChI comparison: an entry matches when the second '/'-separated field
    # and the fields starting with 'c' and 'h' are all non-empty and equal.
    if len(keggId) == 0 and pd.isna(inchiOriginal) is False and inchiOriginal != '':
        lParts = inchiOriginal.split('/')
        originalInchi_formula = lParts[1]
        i = 2
        originalInchi_atomConnection = ''
        originalInchi_hydrogen = ''
        while i < len(lParts):
            if lParts[i].startswith('c'):
                originalInchi_atomConnection = lParts[i]
            elif lParts[i].startswith('h'):
                originalInchi_hydrogen = lParts[i]
            i+=1
        lchebiId = []
        for row in dfChebiInchi.itertuples():
            currentChebiInchi = row.InChI_splitted
            currentInchi_formula = currentChebiInchi[1]
            i = 2
            currentInchi_atomConnection = ''
            currentInchi_hydrogen = ''
            while i < len(currentChebiInchi):
                if currentChebiInchi[i].startswith('c'):
                    currentInchi_atomConnection = currentChebiInchi[i]
                elif currentChebiInchi[i].startswith('h'):
                    currentInchi_hydrogen = currentChebiInchi[i]
                i+=1
            if originalInchi_formula != '' and currentInchi_formula != '' and originalInchi_atomConnection != '' and currentInchi_atomConnection != '' and originalInchi_hydrogen != '' and currentInchi_hydrogen != '':
                if originalInchi_formula == currentInchi_formula and originalInchi_atomConnection == currentInchi_atomConnection and originalInchi_hydrogen == currentInchi_hydrogen:
                    lchebiId.append(row.CHEBI_ID)
        keggId += lchebiId
        ## From the list of ChEBI identifiers search the associated KEGG and MetaCyc identifiers
        keggIdentifiers = fromChebi2KeggAndMetaCyc(lchebiId, dfChebiDb, dfChebiRelations)
        keggId += keggIdentifiers
    # Last attempt: query KEGG with the name cut at its last '(' and then at
    # its last ','.
    if len(keggId) == 0:
        for database in lDbs:
            if '(' in met:
                lPositions = [p.start() for p in re.finditer('\(', met)]
                met_untilParenthesis = met[:lPositions[-1]].strip()
                try:
                    lkeggId_perfectMatch = queryKeggCompound(database, met_untilParenthesis, met_untilParenthesis, lSymbols)
                except:
                    if any(symbol in met_untilParenthesis for symbol in lSymbols) is True:
                        lOutputs = substituteSymbolsAndSearch(met_untilParenthesis, lSymbols, database)
                        keggId += lOutputs
                    if any(k in met_untilParenthesis for k,v in dWords.items()) is True:
                        lOutputs = substituteWordsAndSearch(met_untilParenthesis, dWords, database, lSymbols)
                        keggId += lOutputs
                else:
                    keggId += lkeggId_perfectMatch
            if ',' in met:
                # The variable name is reused: here the cut is at the last comma.
                lPositions = [p.start() for p in re.finditer(',', met)]
                met_untilParenthesis = met[:lPositions[-1]].strip()
                try:
                    lkeggId_perfectMatch = queryKeggCompound(database, met_untilParenthesis, met_untilParenthesis, lSymbols)
                except:
                    if any(symbol in met_untilParenthesis for symbol in lSymbols) is True:
                        lOutputs = substituteSymbolsAndSearch(met_untilParenthesis, lSymbols, database)
                        keggId += lOutputs
                    if any(k in met_untilParenthesis for k,v in dWords.items()) is True:
                        lOutputs = substituteWordsAndSearch(met_untilParenthesis, dWords, database, lSymbols)
                        keggId += lOutputs
                else:
                    keggId += lkeggId_perfectMatch
    keggId = gL.unique(keggId)
    keggId = [k for k in keggId if k != '']
    return keggId
else: rxnEquation = '' rxnDefinition = '' dCompleteRxns_equation[ rxnName] = rxnEquation dCompleteRxns_definition[ rxnName] = rxnDefinition gL.writeLineByLineToFile(rxnFile, [ rxnName, rxnEquation, rxnDefinition ], sep='\t') if rxnName not in lAssociatedRxns: lAssociatedRxns.append(rxnName) lAssociatedRxns = gL.unique(lAssociatedRxns) dGene2RxnsList[gene] = lAssociatedRxns gL.writeLineByLineToFile(geneFile, [gene, lAssociatedRxns], sep='\t') rxnFile.close() geneFile.close() for k, v in dRxn2EcNumber.items(): gL.writeLineByLineToFile(rxn2EcFile, [k, v], sep='\t') rxn2EcFile.close() # Generate the file including reaction to the corresponding catalysing genes list flatdGene2RxnsList_values = [ rxn for associatedRxns in dGene2RxnsList.values() for rxn in associatedRxns ] flatdGene2RxnsList_values = gL.unique(flatdGene2RxnsList_values) outFile = open(os.path.join(OUTDIR, model + '_Rxns2Genes.csv'), mode='w')
elif organismChoice == '2': organismCode = input('Insert the KEGG organism code: ') dfRxnToGenes = pd.read_csv(os.path.join(OUTDIR, model + '_Rxns2Genes.csv'), sep='\t') dfRxnToGenes['Genes'] = dfRxnToGenes['Genes'].apply(literal_eval) dfKegg2UniprotId = pd.read_csv(os.path.join(OUTDIR, model + '_Kegg2UniprotGenes.csv'), sep="\t", dtype={'keggId': str}) lOrganismGenesSet = [] for gene in dfRxnToGenes['Genes']: lRxnGenes = [el for g in gene for el in g] lOrganismGenesSet += lRxnGenes lOrganismGenesSet = gL.unique(lOrganismGenesSet) ## Select from the dfKegg2UniprotId dataframe only the metabolic genes included in 'lOrganismGenesSet' list dfKegg2UniprotId = dfKegg2UniprotId[dfKegg2UniprotId.keggId.isin( lOrganismGenesSet)] dfKegg2UniprotId = dfKegg2UniprotId.reset_index(drop=True) ############################################################# # Execute getUniprotAndComplexPortalData function ############################################################# print('Get data from Uniprot and Complex Portal: DOING') dfData = gprL.getUniprotAndComplexPortalData(dfKegg2UniprotId) print('Get data from Uniprot and Complex Portal: DONE\n') ############################################################# # Execute textMiningFromUniprot function
# Filter genes associated to each reaction according to the associated compartment dfGenes2Compartment = pd.read_csv(os.path.join(OUTDIR, dfGenes2Comp + '.csv'), sep = '\t', dtype = {'Gene': str}) dfGenes2Compartment['lCompartments'] = dfGenes2Compartment['lCompartments'].apply(literal_eval) dGenes2Compartment = dfGenes2Compartment.set_index('Gene')['lCompartments'].to_dict() dfModelRxns2Genes = pd.read_csv(os.path.join(OUTDIR, dfrxns2Genes + '.csv'), sep = '\t', dtype = {'RxnId': str}) dfModelRxns2Genes['Genes'] = dfModelRxns2Genes['Genes'].apply(literal_eval) dRxn2GeneswLocation = {} for row in dfModelRxns2Genes.itertuples(): ## get all the genes of the current reaction lAllGene_currentRxn = [] for l in row.Genes: lAllGene_currentRxn += l lAllGene_currentRxn = gL.unique(lAllGene_currentRxn) # reduce dGenes2Compartment to only consider genes of the current reaction dGenesCurrentRxn = {g: dGenes2Compartment[g] for g in lAllGene_currentRxn} # retrieved all the possible compartments of this reaction and create the same number of duplicates of the current reaction lRetrievedCompartments = [] for g in dGenesCurrentRxn: lRetrievedCompartments += dGenesCurrentRxn[g] lRetrievedCompartments = gL.unique(lRetrievedCompartments) rxnSuffix = 1 dRxn2AnnotatedCompartments = {} for putativeComp in lRetrievedCompartments: dRxn2AnnotatedCompartments[row.RxnId + '_' + str(rxnSuffix)] = putativeComp rxnSuffix += 1
dfkeggC_filter = dfkeggC[pd.DataFrame(dfkeggC.ChebiId.tolist()).isin( dizMetsIdentifiers[k]).any(1).values] if dfkeggC_filter.empty is False: lCompleteIds += list(dfkeggC_filter['Id'].dropna()) keggDbG = dfkeggG[dfkeggG['Id'].isin(dizMetsIdentifiers[k])] if keggDbG.empty is False: lCompleteIds += [ el for l in list(keggDbG['ChebiId'].dropna()) for el in l ] keggDbG_filter = keggDbG[pd.DataFrame(keggDbG.ChebiId.tolist()).isin( dizMetsIdentifiers[k]).any(1).values] if keggDbG_filter.empty is False: lCompleteIds += list(keggDbG_filter['Id'].dropna()) lCompleteIds = gL.unique(lCompleteIds) dizMetsIdentifiers[k] = lCompleteIds lIdentifiers = [] for row in dfMetsFromModel.itertuples(): lInferredIds = dizMetsIdentifiers[row.Name.strip()] lIdentifiers.append(lInferredIds) dfMetsFromModel['lIdentifiers'] = lIdentifiers dfMetsFromModel.to_csv(os.path.join(OUTDIR, dfmetsInfo + '_enriched.csv'), sep='\t', index=False) ## Read from ChEBI all the parental identifiers of each metabolite dfChebiFormula = pd.read_csv(os.path.join(RAWDIR, 'chebi_chemical_data_20201216.tsv'),
for k in lCompartmentsOrganization: if dModelCompartments[comp].lower() in lCompartmentsOrganization[k]: lRxnComps.append(k) dRxn2Compartments[r.id] = lRxnComps # Filter genes associated to each reaction according to the associated compartment dfGenes2Compartment = pd.read_csv(os.path.join(OUTDIR, dfGenes2Comp + '.csv'), sep = '\t', dtype = {'Gene': str}) dfGenes2Compartment['lCompartments'] = dfGenes2Compartment['lCompartments'].apply(literal_eval) dGenes2Compartment = dfGenes2Compartment.set_index('Gene')['lCompartments'].to_dict() dfModelRxns2Genes = pd.read_csv(os.path.join(OUTDIR, dfrxns2Genes + '.csv'), sep = '\t', dtype = {'Rxn': str, 'KeggId': str, 'GPR': str, 'Name': str, 'IsTransport': bool, 'IsExchange': bool, 'GPRrule': str}) dfModelRxns2Genes['lGenes'] = dfModelRxns2Genes['lGenes'].apply(literal_eval) lGenesFiltered_all = [] for row in dfModelRxns2Genes.itertuples(): lCompRxnModel = gL.unique(dRxn2Compartments[row.Rxn_conv]) if len(row.lGenes) != 0 and len(lCompRxnModel) != 0: lAllGene_currentRxn = [] for l in row.lGenes: lAllGene_currentRxn += l lAllGene_currentRxn = gL.unique(lAllGene_currentRxn) lGenes2Remove = [] for g in lAllGene_currentRxn: if len(dGenes2Compartment[g]) != 0 and len(gL.intersect(gL.unique(dGenes2Compartment[g]), lCompRxnModel)) == 0: lGenes2Remove.append(g) lGenesFiltered = [[el for el in l if el not in lGenes2Remove] for l in row.lGenes] lGenesFiltered = [subL for subL in lGenesFiltered if subL != []] lGenesFiltered_all.append(lGenesFiltered) else: lGenesFiltered_all.append(row.lGenes)
def findPutativeRxns(dfEqualMets, colL, colR, db):
    """List the reaction identifiers of the rows matching the model reaction.

    Four boolean columns are added to ``dfEqualMets`` (the dataframe is
    modified in place): each row is compared with ``lReactants_ids`` and
    ``lProducts_ids`` -- names this function does not receive as parameters
    and reads from the enclosing scope -- in both directions and in both
    orientations. A row is kept when both checks of at least one orientation
    are True.

    Input:
    - dfEqualMets: dataframe of candidate reactions;
    - colL, colR: names of the columns holding the two sides of each candidate;
    - db: 'metacyc', 'kegg' or 'rhea', selecting the comparison functions.
    Output:
    - identifiers gathered from the identifier columns of the kept rows,
      de-duplicated through ``gL.unique``.
    """
    if db == 'metacyc' or db == 'kegg':
        dChecks = {
            'fromRxn2Putative1': lambda row: findReaction_fromRxn2Putative(
                lReactants_ids, lProducts_ids, row[colL], row[colR]),
            'fromPutative2Rxn1': lambda row: findReaction_fromPutative2Rxn(
                lReactants_ids, lProducts_ids, row[colL], row[colR]),
            'fromRxn2Putative2': lambda row: findReaction_fromRxn2Putative(
                lProducts_ids, lReactants_ids, row[colL], row[colR]),
            'fromPutative2Rxn2': lambda row: findReaction_fromPutative2Rxn(
                lProducts_ids, lReactants_ids, row[colL], row[colR]),
        }
    elif db == 'rhea':
        dChecks = {
            'fromRxn2Putative1': lambda row: findReaction_rhea(
                lReactants_ids, lProducts_ids, row[colL], row[colR]),
            'fromPutative2Rxn1': lambda row: findReaction_rhea(
                row[colL], row[colR], lReactants_ids, lProducts_ids),
            'fromRxn2Putative2': lambda row: findReaction_rhea(
                lReactants_ids, lProducts_ids, row[colR], row[colL]),
            'fromPutative2Rxn2': lambda row: findReaction_rhea(
                row[colR], row[colL], lReactants_ids, lProducts_ids),
        }
    else:
        dChecks = {}
    for newColumn, check in dChecks.items():
        dfEqualMets[newColumn] = dfEqualMets.apply(check, axis=1)

    sameOrientation = (dfEqualMets['fromRxn2Putative1'] == True) & (dfEqualMets['fromPutative2Rxn1'] == True)
    swappedOrientation = (dfEqualMets['fromRxn2Putative2'] == True) & (dfEqualMets['fromPutative2Rxn2'] == True)
    dfMatchesEqual_L = dfEqualMets[sameOrientation | swappedOrientation]

    lPutativeRxns = []
    if not dfMatchesEqual_L.empty:
        # Columns holding one identifier per row.
        for column in ('MetaCycId', 'RheaId', 'KeggId_fromKegg'):
            lPutativeRxns += list(dfMatchesEqual_L[column].dropna())
        # Columns holding a list of identifiers per row.
        for column in ('OtherRheaId_fromKegg', 'RheaId_master', 'RheaId_lr', 'RheaId_rl', 'RheaId_bi'):
            for lIds in list(dfMatchesEqual_L[column].dropna()):
                lPutativeRxns += [el for el in lIds]
        # Column holding a list of lists per row.
        for lNested in list(dfMatchesEqual_L['KeggId_y'].dropna()):
            for sublist in lNested:
                lPutativeRxns += [el for el in sublist]
        # Column holding comma-separated identifiers.
        for joined in list(dfMatchesEqual_L['MetaCycId_fromRhea'].dropna()):
            lPutativeRxns += joined.split(',')
    return gL.unique(lPutativeRxns)
dfrxns2Genes = '' orgCode = '' lCompartmentsOrganization = {} # Extract for the identified genes all the corresponding compartment information dfModelRxns2Genes = pd.read_csv(os.path.join(OUTDIR, dfrxns2Genes + '.csv'), sep = '\t', dtype = {'Rxn': str, 'KeggId': str, 'GPR': str, 'Name': str, 'IsTransport': bool, 'IsExchange': bool, 'GPRrule': str}) dfModelRxns2Genes['lGenes'] = dfModelRxns2Genes['lGenes'].apply(literal_eval) lAllGenes = [] for row in dfModelRxns2Genes.itertuples(): if row.lGenes != []: for l in row.lGenes: lAllGenes += l lAllGenes = gL.unique(lAllGenes) # Convert KEGG gene identifiers to Uniprot identifiers dGene2Uniprot = {} uniprot2Org = RESTmod.kegg_conv(orgCode, "uniprot").readlines() for line in uniprot2Org: separo = line.strip().split('\t') ncbi = separo[1].split(':')[1] unip = separo[0].split(':')[1] if ncbi not in dGene2Uniprot: dGene2Uniprot[ncbi] = [unip] else: dGene2Uniprot[ncbi] += [unip] dCompartments = {'cytosolic small ribosomal subunit': 'ribosome', 'Slx1-Slx4 complex': 'nucleus', 'cytosol': 'cytoplasm',
def extractChebiIds(l):
    """Extract one identifier from each string of ``l``.

    For every element the text before the first ';' is taken and its first six
    characters are dropped (presumably a 'CHEBI:' prefix -- confirm against the
    caller). The result is de-duplicated through ``gL.unique``.
    """
    return gL.unique([el.split(';')[0][6:] for el in l])
chebiIdentifiers)] lAllChebi2Search = [] if dfChebiExplodedFilter.empty is False: for riga in list( dfChebiExplodedFilter['ParentalChebiIds']): lAllChebi2Search += riga for riga in list(dfChebiExplodedFilter['AllChebiIds']): lAllChebi2Search += riga if dfChebiFilter.empty is False: for riga in list(dfChebiFilter['ParentalChebiIds']): lAllChebi2Search += riga for riga in list(dfChebiFilter['AllChebiIds']): lAllChebi2Search += riga lAllChebi2Search = gL.unique(lAllChebi2Search) dfChebiFormula_filtered = dfChebiFormula[ dfChebiFormula['COMPOUND_ID'].isin(lAllChebi2Search)] if dfChebiFormula_filtered.empty is False: dfChebiFormula_filtered = dfChebiFormula_filtered.reset_index( drop=True) dfChebiFormula_filtered_toKeep = dfChebiFormula_filtered[ dfChebiFormula_filtered['TYPE'] == 'FORMULA'] if dfChebiFormula_filtered_toKeep.empty is False: l += list( dfChebiFormula_filtered_toKeep['COMPOUND_ID']) for riga in list( dfChebiFormula_filtered_toKeep['COMPOUND_ID']): dfChebiExplodedFilter2 = dfChebiCompounds_exploded[ dfChebiCompounds_exploded[ 'ParentalChebiIds_exploded'].isin(
dfFinal_p2 = dfFinal_p2.drop_duplicates(subset = ['ID', 'name']) dfFinal_p2 = dfFinal_p2.reset_index(drop = True) ## join dfFinal and dfFinal_p2 dfAllMets = pd.concat([dfFinal, dfFinal_p2]) dfAllMets.to_csv(os.path.join(OUTDIR, modelName + '_mappingMetaNetX_20210901.csv'), sep = '\t') ## STEP 2. Comparison of MetaNetX and Fuzzy Wuzzy output dfMetsFromMetaNetX = pd.read_csv(os.path.join(OUTDIR, modelName + '_mappingMetaNetX_20210901.csv'), sep = '\t', dtype=str, index_col = 0) lName = [] for row in dfMetsFromMetaNetX.itertuples(): if row.name.startswith("['"): name = literal_eval(row.name) name = gL.unique(name) lName.append(name) elif row.name.startswith('["'): name = literal_eval(row.name) name = gL.unique(name) lName.append(name) else: lName.append([row.name]) dfMetsFromMetaNetX['name'] = lName dfMetsFromMetaNetX = dfMetsFromMetaNetX.explode('name') dfMetsFromMetaNetX.to_csv(os.path.join(OUTDIR, modelName + '_mappingMetaNetX_20210901.csv'), sep = '\t') dfmetsFuzzy = pd.read_csv(os.path.join(OUTDIR, inputFuzzy), sep = '\t', dtype=str) dfmetsFuzzy['Name'] = dfmetsFuzzy['Name'].str.lower() lMetsModel = dfmetsFuzzy['Name'].tolist()