Example #1
0
def substituteSymbolsAndSearch(met, lSymbols, db):
    '''
    Query KEGG with variants of a metabolite name in which symbols are
    replaced by '+'.

    For every entry of lSymbols found in met, two kinds of variants are
    searched through queryKeggCompound: one variant per occurrence (only that
    occurrence replaced) and one variant with all occurrences replaced.
    When a query raises, the variant is passed through
    replaceSpacesWithPlusAndSearch and queried once more; if that also
    raises, 'No results' is printed and the variant contributes nothing.

    Input:
    - met: metabolite name to search;
    - lSymbols: symbols to substitute, written as regular-expression patterns
      (callers escape brackets and parentheses); the same list is forwarded
      to queryKeggCompound;
    - db: database name forwarded to queryKeggCompound.
    Output:
    - lOutputs: identifiers returned by all the queries, passed through
      gL.unique.
    '''

    def _queryWithFallback(name):
        # One lookup plus the space-handling retry shared by every variant.
        try:
            return list(queryKeggCompound(db, name, name, lSymbols))
        except Exception:
            name = replaceSpacesWithPlusAndSearch(name)
            try:
                return list(queryKeggCompound(db, name, name, lSymbols))
            except Exception:
                print('No results')
                return []

    lOutputs = []
    for k in lSymbols:
        # The symbols are regex patterns, so presence is tested with the regex
        # engine; a plain substring test never matches the escaped entries.
        lMatches = list(re.finditer(k, met))
        if not lMatches:
            continue

        # Replace one occurrence at a time
        for match in lMatches:
            metSubstituted = met[:match.start()] + '+' + met[match.end():]
            lOutputs += _queryWithFallback(metSubstituted)

        # Replace every occurrence at once
        lOutputs += _queryWithFallback(re.sub(k, '+', met))

    lOutputs = gL.unique(lOutputs)
    return lOutputs
Example #2
0
def findRxnsAfterFilter(lDfs2Concat, colNameS, colNameP, db, dfR, dfP,
                        lReactants_ids, lProducts_ids):
    '''
    Concatenate the input dataframes, filter their rows with
    filterwExtractRows and return the putative reactions found in the
    filtered rows.

    Input:
    - lDfs2Concat: list of dataframes to concatenate;
    - colNameS, colNameP: names of the two columns that are converted to
      tuples (stored in new '<name>_t' columns) and forwarded to the helpers;
    - db: 'metacyc', 'kegg' or 'rhea'; any other value yields no reactions.
      For 'rhea' the last five characters of both column names are dropped
      before calling findPutativeRxns;
    - dfR, dfP: dataframes forwarded to filterwExtractRows;
    - lReactants_ids, lProducts_ids: not used by this function.
    Output:
    - list of putative reactions passed through gL.unique.
    '''
    dfMerged = pd.concat(lDfs2Concat)
    for colName in (colNameS, colNameP):
        dfMerged[colName + '_t'] = dfMerged[colName].apply(tuple)

    lFiltered = filterwExtractRows(dfMerged, dfR, dfP, colNameS, colNameP)

    lPutativeRxns = []
    if len(lFiltered) != 0:
        dfCandidates = pd.concat(lFiltered)
        if not dfCandidates.empty:
            if db in ('metacyc', 'kegg'):
                lPutativeRxns = findPutativeRxns(dfCandidates, colNameS,
                                                 colNameP, db)
            elif db == 'rhea':
                lPutativeRxns = findPutativeRxns(dfCandidates, colNameS[:-5],
                                                 colNameP[:-5], 'rhea')

    return gL.unique(lPutativeRxns)
Example #3
0
def generateTruthTable(lGenes1, lGenes2):
    '''
    Generate the truth matrix over the genes of two gene lists.

    Input:
    - lGenes1, lGenes2: lists of genes; they are concatenated and passed
      through gL.unique.
    Output:
    - mTruth: array with one row per combination of False/True values over
      the n genes (2**n rows, so it grows exponentially with n);
    - lGenes: the list returned by gL.unique;
    - n: number of genes in lGenes.
    '''
    lGenes = gL.unique(lGenes1 + lGenes2)
    n = len(lGenes)
    mTruth = np.array(list(itertools.product([False, True], repeat=n)))
    return mTruth, lGenes, n
Example #4
0
def generateOrganismSpecificRegex(keggGeneList):
    '''
    Compose a regex according to the characters present in KEGG gene
    identifiers.

    The gene list is scanned to establish whether uppercase letters,
    lowercase letters and digits occur; every other character found in the
    identifiers (underscore included) is added to the character class.

    Input:
    - keggGeneList: list of gene identifiers (strings).
    Output:
    - regexOrgSpecific: string of the form '([...]+)'. With no characters at
      all (e.g. an empty list) the class is empty and '([]+)' is returned.
    '''
    # Letter ranges: decided per character, so identifiers mixing both cases
    # (and lists mixing lowercase-only with uppercase-only identifiers) are
    # covered by 'A-Za-z'.
    p1 = ''
    if any(c.isupper() for gene in keggGeneList for c in gene):
        p1 += 'A-Z'
    if any(c.islower() for gene in keggGeneList for c in gene):
        p1 += 'a-z'

    # Digit range: needed if at least one identifier contains a digit
    if any(c.isdigit() for gene in keggGeneList for c in gene):
        p2 = '0-9'
    else:
        p2 = ''

    # All remaining characters, one by one and in a deterministic order
    lSpecialChars = sorted({
        c
        for gene in keggGeneList
        for c in reModule.findall(r'[^A-Za-z0-9]', gene)
    })
    # Escape the characters that are meaningful inside a character class so
    # that they are matched literally (e.g. '-' must not build a range).
    p3 = ''.join('\\' + c if c in '\\]^-[' else c for c in lSpecialChars)

    # Construct the regex
    regexOrgSpecific = '([' + p1 + p2 + p3 + ']+)'
    return regexOrgSpecific
Example #5
0
def substituteWordsAndSearch(met, dWords, db, lSymbols):
    '''
    Query KEGG with variants of a metabolite name in which known words are
    replaced by their alternatives.

    For every key of dWords contained in met and every alternative listed
    for that key, the name with that single replacement is searched through
    queryKeggCompound and then through substituteSymbolsAndSearch. Finally
    the name carrying all the replacements accumulated along the way is
    searched. When a query raises, the name is passed through
    replaceSpacesWithPlusAndSearch and queried once more; if that also
    raises, 'No results' is printed.

    Input:
    - met: metabolite name to search;
    - dWords: dictionary mapping a word to the list of its alternatives;
    - db: database name forwarded to queryKeggCompound;
    - lSymbols: list of symbols forwarded to queryKeggCompound and
      substituteSymbolsAndSearch.
    Output:
    - lOutputs: identifiers returned by all the queries, passed through
      gL.unique.
    '''

    def _queryWithFallback(name):
        # Returns the results together with the name that was finally
        # queried, because the symbol search below works on that name.
        try:
            return list(queryKeggCompound(db, name, name, lSymbols)), name
        except Exception:
            name = replaceSpacesWithPlusAndSearch(name)
            try:
                return list(queryKeggCompound(db, name, name, lSymbols)), name
            except Exception:
                print('No results')
                return [], name

    lOutputs = []
    lWords2Change = [k for k in dWords if k in met]
    met_substAll = met
    for k in lWords2Change:
        for v in dWords[k]:
            metSubstituted = met.replace(k, v)
            met_substAll = met_substAll.replace(k, v)

            lFound, metSubstituted = _queryWithFallback(metSubstituted)
            lOutputs += lFound

            try:
                lOutputs += substituteSymbolsAndSearch(metSubstituted, lSymbols, db)
            except Exception:
                print('No results')

    lFound, met_substAll = _queryWithFallback(met_substAll)
    lOutputs += lFound

    lOutputs = gL.unique(lOutputs)
    return lOutputs
# Load the bz2-compressed, tab-separated ChEBI tables from RAWDIR
dfChebiCompounds = pd.read_csv(
    os.path.join(RAWDIR, 'chebi_compounds_20201216.tsv.bz2'),
    sep='\t',
    compression='bz2',
    dtype=str)

dfChebiDb = pd.read_csv(
    os.path.join(RAWDIR, 'chebi_database_accession_20201216.tsv.bz2'),
    sep='\t',
    compression='bz2',
    dtype=dict.fromkeys(['ID', 'COMPOUND_ID', 'ACCESSION_NUMBER'], str))
dfChebiRelations = pd.read_csv(
    os.path.join(RAWDIR, 'chebi_relation_20201216.tsv.bz2'),
    sep='\t',
    compression='bz2',
    dtype=dict.fromkeys(['ID', 'TYPE', 'INIT_ID', 'FINAL_ID'], str))

dfChebiInchi = pd.read_csv(
    os.path.join(RAWDIR, 'chebiId_inchi_20201216.tsv.bz2'),
    sep='\t',
    compression='bz2',
    dtype=str)
# Keep the '/'-separated parts of every InChI string and, separately, the
# second part (index 1) of that split.
dfChebiInchi['InChI_splitted'] = dfChebiInchi['InChI'].str.split('/')
dfChebiInchi['InChI_formula'] = dfChebiInchi['InChI_splitted'].str[1]

# Build the list of metabolite names to search
if includeCompartment is True:
    # Drop a trailing "[...]" tag made of word characters and whitespace.
    # A single character class is used instead of nested quantifiers so that
    # matching stays linear on names with an unclosed bracket.
    df['toMatch'] = df.Name.str.replace(r"\[[\w\s]*\]$", "", regex=True)
    df['toMatch'] = df.toMatch.str.strip()
    lMets2Search = list(df['toMatch'])
else:
    lMets2Search = list(df['Name'])
lMets2Search = gL.unique(lMets2Search)

# ChEBI names are compared in lowercase
dfChebiNames['NAME'] = dfChebiNames['NAME'].str.lower()
dfChebiCompounds['NAME'] = dfChebiCompounds['NAME'].str.lower()
dfChebiUniprot['NAME'] = dfChebiUniprot['NAME'].str.lower()

dizMet2Ids = {}
for met in lMets2Search:
    keggId = ''
    # Select the rows of the current metabolite
    if includeCompartment is True:
        dfMet = df[df['Name'].str.startswith(met + ' ' + '[')]
    else:
        dfMet = df[df['Name'] == met]
    dfMet = dfMet.reset_index(drop=True)
    if not dfMet.empty:
        # The InChI of the first matching row is taken as reference
        inchiOriginal = dfMet.iloc[0]['Inchi']
Example #7
0
## Compute Jaccard score for each reaction and for the entire model
lAllGenes_original = []
lAllGenes_gpruler = []

lJScore_sngRxns = []
for row in dfComparison.itertuples():
    originalRule = row.rule_original
    gprulerRule = row.rule_GPRuler

    # Genes of the original rule (none when the rule is missing or empty)
    lGenes_originalRule = []
    if not (pd.isna(originalRule) == True or originalRule == ''):
        dfGenes_originalRule = gL.extractRegexFromItem(
            originalRule, regexOrgSpecific)
        lGenes_originalRule = gL.unique(list(dfGenes_originalRule[0]))
        lGenes_originalRule.sort()

    # Genes of the GPRuler rule (none when the rule is missing or empty)
    lGenes_gprulerRule = []
    if not (pd.isna(gprulerRule) == True or gprulerRule == ''):
        dfGenes_gprulerRule = gL.extractRegexFromItem(
            gprulerRule, regexOrgSpecific)
        lGenes_gprulerRule = gL.unique(list(dfGenes_gprulerRule[0]))
        lGenes_gprulerRule.sort()

    # Accumulate the genes of the whole model
    lAllGenes_original += lGenes_originalRule
    lAllGenes_gpruler += lGenes_gprulerRule

    # Two rules without genes are scored 1.0
    if not lGenes_originalRule and not lGenes_gprulerRule:
        jScore = 1.0
Example #8
0
def mergeData(df):
    '''
    This function joins all retrieved information from explored databases and assembles them to generate the final GPR rule.
    Input:
    - df: dataframe generated by getKeggData function.
    Output:
    - dfFinal: dataframe where for each input gene the list of its AND and OR relationships is returned.
      Columns: 'gene' (from 'geneNames'), 'uniprotId' (from 'id_uniprot'), 'AND' and 'OR'.
    '''

    def _flatten(colName):
        # Every cell of the column is an iterable of names
        return [item for elem in list(df[colName]) for item in elem]

    lTotalGenes = gL.unique(
        _flatten('geneName_fromKEGG') + _flatten('proteinNames') +
        _flatten('geneNames') + _flatten('id_uniprot') +
        _flatten('subunitsFromName'))

    # Besides the single names, the whole per-row 'subunitsFromName' cells are
    # accepted as members too.
    lTotalAndSubs = lTotalGenes + list(df['subunitsFromName'])

    # Names known to be wrong and their corrected form
    dErroneousNames = {'SLCA7A7': 'SLC7A7', 'SLCA7A11': 'SLC7A11'}

    list_and = []
    list_or = []
    for r in df.itertuples():
        ## clean all information retrieved from the explored databases by removing those elements that are not included into the lTotalGenes list
        lUniprotIds = list(r.id_uniprot)
        # Every name under which the current gene itself is known
        lOwnNames = lUniprotIds + list(r.proteinNames) + list(
            r.geneNames) + list(r.subunitsFromName) + list(
                r.geneName_fromKEGG)

        # NOTE: no DIP-derived subunits are collected here (the DIP lists of
        # this function were always empty), so they do not contribute.

        finallComplexPortal_unipId = [
            x for x in list(r.complexPortal_uniprotId)
            if x not in lUniprotIds and x in lTotalAndSubs
        ]  # Select only the elements falling within input genes list

        finallComplexPortal_protName = [
            x for x in list(r.complexPortal_protName)
            if x not in lUniprotIds and x in lTotalAndSubs
        ]

        # NOTE(review): 'gene_from_structure' was filtered in the previous
        # version but never added to the AND list; confirm whether it should be.

        # These columns may be absent or hold non-iterable values
        try:
            ldfInteract = [x for x in list(r.interact) if x in lTotalAndSubs]
        except Exception:
            ldfInteract = []

        try:
            ldfSimilarity = [
                x for x in list(r.by_similarity) if x in lTotalAndSubs
            ]
        except Exception:
            ldfSimilarity = []

        lAllIsoforms = gL.unique(list(r.otherIsoforms) + list(r.isoform))

        finalStringSubs = [
            x for x in list(r.stringSubunits) if x in lTotalAndSubs
        ]

        finalOtherSubunits = [
            x for x in list(r.otherSubunits)
            if x not in lOwnNames and x in lTotalAndSubs
        ]

        # AND relationships: subunits from all sources; STRING subunits that
        # are isoforms of the gene are left out.
        subunit = []
        subunit += finallComplexPortal_unipId + finallComplexPortal_protName
        subunit += ldfInteract + ldfSimilarity + gL.difference(
            finalStringSubs, lAllIsoforms) + finalOtherSubunits
        subunit = gL.unique(subunit)

        # use the dictionary of wrong names that need to be corrected
        finalSubunitSet = [dErroneousNames.get(s, s) for s in subunit]

        # OR relationships: isoforms other than the gene itself
        finalIsoforms = [
            x for x in list(r.otherIsoforms) + list(r.isoform)
            if x not in lOwnNames and x in lTotalAndSubs
        ]

        list_and.append(finalSubunitSet)
        list_or.append(finalIsoforms)

    dfFinal = pd.DataFrame({
        'gene': df['geneNames'],
        'uniprotId': df['id_uniprot']
    })
    dfFinal['AND'] = list_and
    dfFinal['OR'] = list_or
    return (dfFinal)
Example #9
0
def getStringData(df, gene='id_uniprot'):
    '''
    This function retrieves from STRING database known and predicted protein-protein interactions
    established by each queried metabolic gene.
    Input:
    - df: dataframe generated by the previous step;
    - gene: column name of uniprot identifiers of genes. By default it is set equal to 'id_uniprot'.
    Output:
    - df: enriched input dataframe with information retrieved from STRING database
      (new column 'stringSubunits').
    '''

    def _getJson(url):
        # Requests answered with status 524 are repeated until another status
        # is returned (there is no retry limit).
        # NOTE(review): certificate verification is disabled (verify=False);
        # confirm this is still required.
        response = requests.get(url, verify=False)
        while response.status_code == 524:
            response = requests.get(url, verify=False)
        return response.json()

    dizUniprotString = {}  # uniprotId -> (STRING preferred name, interactors)
    dizEnrichment = {}  # (original, interactor) -> enrichment response
    stringSubunits = []
    for row in df.itertuples():
        lStringInteractors = []
        for uniprotId in getattr(row, gene):
            if uniprotId not in dizUniprotString:
                originalUniprotNames = row.proteinNames + row.geneNames
                url = "https://string-db.org/api/json/network?identifiers=" + uniprotId
                net = _getJson(url)
                lInteractors = []
                original = ''
                if isinstance(net, list):
                    for edge in net:
                        nameA = edge['preferredName_A']
                        nameB = edge['preferredName_B']
                        if nameA == uniprotId or any(
                                nameA == name
                                for name in originalUniprotNames):
                            original = nameA
                            lInteractors.append(nameB)
                        elif nameB == uniprotId or any(
                                nameB == name
                                for name in originalUniprotNames):
                            lInteractors.append(nameA)
                dizUniprotString[uniprotId] = (original, lInteractors)
            else:
                original, lInteractors = dizUniprotString[uniprotId]

            if original != '':
                for interactor in lInteractors:
                    # The enrichment request depends on both proteins, so the
                    # cache is keyed on the pair and not on the interactor
                    # alone.
                    cacheKey = (original, interactor)
                    if cacheKey not in dizEnrichment:
                        url = 'https://string-db.org//api/json/enrichment?identifiers=' + original + '%0d' + interactor
                        dizEnrichment[cacheKey] = _getJson(url)
                    diz = dizEnrichment[cacheKey]
                    for stringElement in diz:
                        # Keep the interactor when a 'Component' term whose
                        # description mentions "complex" or "chain" lists the
                        # queried protein among its input genes.
                        if stringElement['category'] == 'Component' and (
                                "complex" in str(stringElement['description'])
                                or "chain" in str(stringElement['description'])
                        ) and original in stringElement['inputGenes']:
                            lStringInteractors.append(interactor)
        lStringInteractors = gL.unique(lStringInteractors)
        stringSubunits.append(lStringInteractors)

    df['stringSubunits'] = stringSubunits
    return (df)
# Collect, for every metabolite name, the ChEBI identifiers of its matches
for row in dfAll.itertuples():
    dfIsolateMatches = dfChebiCompounds[dfChebiCompounds['NAME'].isin(
        row.Matches)]
    lCorrespondences = [] if dfIsolateMatches.empty is True else list(
        dfIsolateMatches['ID'].dropna())
    metName = row.Name.strip()
    dMetMapping.setdefault(metName, [])
    dMetMapping[metName] += lCorrespondences

# Remove duplicated identifiers from every entry
dMetMapping_woDuplicates = {}
for met, lIdentifiers in dMetMapping.items():
    dMetMapping_woDuplicates[met.strip()] = gL.unique(lIdentifiers)

# Save the mapping as a two-column table
fuzzyMappingPath = os.path.join(OUTDIR, prefix_modelName + '_mappingFuzzy.tsv')
dfMatches = pd.DataFrame(dMetMapping_woDuplicates.items(),
                         columns=['Name', 'Identifiers'])
dfMatches.to_csv(fuzzyMappingPath, sep='\t', index=False)

## Joining FuzzyWuzzy and output from Step1
dfFuzzyMatches = pd.read_csv(fuzzyMappingPath, sep='\t')
# The identifier lists were written as text: parse them back into lists
dfFuzzyMatches['Identifiers_fuzzy'] = dfFuzzyMatches['Identifiers'].apply(
    ast.literal_eval)
dfClassicMatches = pd.read_csv(os.path.join(
    for llUnipRhea in list(dfSearch['UniprotId'].dropna()):
        for lUnipRhea in llUnipRhea:
            if any('up:' + el in list(dfuniprot2Org['uniprot'])
                   for el in lUnipRhea) is True:
                dfCorrespondingGenes = dfuniprot2Org[
                    dfuniprot2Org['uniprot'].isin(
                        ['up:' + el for el in lUnipRhea])]
                for foundGene in list(dfCorrespondingGenes['keggGeneId']):
                    if [foundGene.split(':')[1]] not in lMetaEnzOR:
                        geneId2search = foundGene.split(':')[1]
                        lMetaEnzOR, dGenesFromKegg = rxnL.checkNadNadpDependencies_or(
                            nadp, nad, geneId2search,
                            orgCode + ':' + geneId2search, lMetaEnzOR,
                            dGenesFromKegg)

    lEc = gL.unique(lEc)
    for ec in lEc:
        if ec in dEcFromKegg:
            dEcs = dEcFromKegg[ec]
        else:
            dEcs = rxnL.getKeggInfo('ec:' + ec)
            dEcFromKegg[ec] = dEcs
        if dEcs != 400 and dEcs != 404 and 'GENES' in dEcs and orgCode.upper(
        ) in dEcs['GENES']:
            for item in dEcs['GENES'][orgCode.upper()].split():
                par = item.find('(')
                if par != -1:
                    if [item[:par]] not in lMetaEnzOR:
                        geneId2search = item[:par]
                        lMetaEnzOR, dGenesFromKegg = rxnL.checkNadNadpDependencies_or(
                            nadp, nad, geneId2search,
Example #12
0
def extractKeggIdComp(met, dfChebiNames, dfChebiDb, dfChebiRelations, dfChebiUniprot, dfChebiCompounds, dfChebiInchi, inchiOriginal):
    '''
    Collect the identifiers associated with a metabolite name.

    The search proceeds in this order:
    1. KEGG 'compound' and 'glycan' queries with the name as is, with the
       variants produced by substituteSymbolsAndSearch and
       substituteWordsAndSearch, and with the spaces removed (each one alone
       and all together);
    2. ChEBI name searches (perfect match, symbol substitution, word
       substitution), each followed by fromChebi2KeggAndMetaCyc;
    3. only while nothing has been found: ChEBI names split on '(' and on
       ',', the comparison of the InChI layers of inchiOriginal with
       dfChebiInchi, and KEGG queries with the name truncated at its last
       '(' or ','.

    Input:
    - met: metabolite name;
    - dfChebiNames, dfChebiDb, dfChebiRelations, dfChebiUniprot, dfChebiCompounds:
      ChEBI tables forwarded to the ChEBI helper functions;
    - dfChebiInchi: table with the columns 'InChI_splitted' and 'CHEBI_ID';
    - inchiOriginal: InChI string of the metabolite, or a missing value / ''.
    Output:
    - keggId: list of the identifiers found, passed through gL.unique and
      without empty strings.
    '''
    dWords = {'ic acid':['ate'], 'ate':['ic acid'], 'bisphosphate':['diphosphate'], 'diphosphate':['bisphosphate'],
                'aminium': ['amine'], 'amine': ['aminium'], 'ammonia': ['nh4+', 'nh3'], 'ammonium': ['nh4+', 'nh3'], 'proton': ['H+'],
                'adenosine triphosphate': ['atp'], 'adenosine diphosphate': ['adp'], 'coenzyme a': ['coa'], 'coa': ['coenzyme a'], 'apotransferin': ['apotransferrin'],
                "adenosine-5'-diphosphate": ['adp'], "adenosine 5'-diphosphate": ['adp'], "uridine-5'-diphosphate": ['udp'], "uridine 5'-diphosphate": ['udp'],
                "deoxyuridine-5'-diphosphate": ['dudp'], "deoxyuridine-5'-triphosphate": ['dutp'],
                'acp': ['[acp]', 'acyl-carrier protein', '[acyl-carrier protein]', 'acyl-carrier-protein', '[acyl-carrier-protein]'], '[acp]': ['acp'], "cytidine-5'-monophosphate": ['cmp'], "cytidine 5'-monophosphate": ['cmp'],
                '-ld-pe-pool': [''], '-ld-ps-pool': [''], '-ld-pc-pool': [''], '-ld-tg1-pool': [''], '-ld-tg2-pool': [''], '-ld-tg3-pool': [''], '-ld-pi-pool': [''], '-vldl-pool': [''], '-bile-pc-pool': [''],
                '-uptake-pool': [''], '-pool': [''], 'ide': ['ic acid']}
    # The symbols are regular-expression patterns (brackets and parentheses
    # are escaped), written as raw strings.
    lSymbols = ['/', '-', "'", r'\[', r'\]', r'\(', r'\)']
    keggId = []

    def _searchVariants(name, database):
        # KEGG searches on the symbol- and word-substituted forms of a name
        lFound = []
        if any(re.search(symbol, name) for symbol in lSymbols):
            lFound += substituteSymbolsAndSearch(name, lSymbols, database)
        if any(k in name for k in dWords):
            lFound += substituteWordsAndSearch(name, dWords, database, lSymbols)
        return lFound

    def _inchiLayers(lParts):
        # Formula, atom-connection ('c...') and hydrogen ('h...') layers of a
        # split InChI; '' when a layer is absent, the last one when repeated.
        atomConnection = ''
        hydrogen = ''
        for part in lParts[2:]:
            if part.startswith('c'):
                atomConnection = part
            elif part.startswith('h'):
                hydrogen = part
        return lParts[1], atomConnection, hydrogen

    lDbs = ['compound', 'glycan']
    for database in lDbs:
        try:
            lkeggId_perfectMatch = queryKeggCompound(database, met, met, lSymbols)
        except Exception:
            try:
                metNew = replaceSpacesWithPlusAndSearch(met)
                lkeggId_perfectMatch = queryKeggCompound(database, metNew, met, lSymbols)
            except Exception:
                keggId += _searchVariants(met, database)
            else:
                keggId += lkeggId_perfectMatch
        else:
            if len(lkeggId_perfectMatch) != 0:
                keggId += lkeggId_perfectMatch
            keggId += _searchVariants(met, database)

        try:
            metNew_woSpaces = replaceSpacesWithNoneAndSearch(met)
            lkeggId_perfectMatch = queryKeggCompound(database, metNew_woSpaces, met, lSymbols)
        except Exception:
            print('not found')
        else:
            keggId += lkeggId_perfectMatch

        # Remove one space at a time
        lPositions = [p.start() for p in re.finditer(' ', met)]
        for posFound in lPositions:
            met_subst = met[:posFound] + met[posFound + 1:]
            try:
                lkeggId_perfectMatch = queryKeggCompound(database, met_subst, met_subst, lSymbols)
            except Exception:
                print('not found')
            else:
                keggId += lkeggId_perfectMatch

        # Remove all the spaces together. Deleting by the original positions
        # would hit the wrong characters after the first deletion shifts the
        # string, so the spaces are removed in one step.
        met_subst = met.replace(' ', '')
        try:
            lkeggId_perfectMatch = queryKeggCompound(database, met_subst, met_subst, lSymbols)
        except Exception:
            print('not found')
        else:
            keggId += lkeggId_perfectMatch

    ## search the metabolite in ChEBI
    lchebiId = name2ChebiIds_perfectMatch(met, dfChebiNames, dfChebiUniprot, dfChebiCompounds)
    keggId += lchebiId
    ## And from the list of ChEBI identifiers, search information in ChEBI about KEGG and MetaCyc identifiers
    keggIdentifiers = fromChebi2KeggAndMetaCyc(lchebiId, dfChebiDb, dfChebiRelations)
    keggId += keggIdentifiers
    ## If the perfect match is not found, try to manipulate the name of the metabolite
    lchebiId = name2ChebiIds_subsSymbolsInChebiOutput(met, lSymbols, dfChebiNames, dfChebiUniprot, dfChebiCompounds)
    keggId += lchebiId
    keggIdentifier = fromChebi2KeggAndMetaCyc(lchebiId, dfChebiDb, dfChebiRelations)
    keggId += keggIdentifier
    ## try to substitute words within the metabolite name to find the match in ChEBI or if not found try again to manipulate the string
    lAllId = name2ChebiIds_subsWords(met, dWords, lSymbols, dfChebiNames, dfChebiUniprot, dfChebiCompounds, dfChebiDb, dfChebiRelations)
    keggId += lAllId

    ## try to search the string before the parenthesis or the comma symbol
    for symbol in ('(', ','):
        if len(keggId) == 0:
            lchebiId_fromChebiCompounds = splitChebiResultsOnSymbol(dfChebiCompounds, 'NAME', 'ID', symbol)
            lchebiId_fromChebiNames = splitChebiResultsOnSymbol(dfChebiNames, 'NAME', 'COMPOUND_ID', symbol)
            lchebiId_fromChebiUniprot = splitChebiResultsOnSymbol(dfChebiUniprot, 'NAME', 'ID', symbol)
            lchebiId = lchebiId_fromChebiCompounds + lchebiId_fromChebiNames + lchebiId_fromChebiUniprot
            keggId += lchebiId
            keggIdentifier = fromChebi2KeggAndMetaCyc(lchebiId, dfChebiDb, dfChebiRelations)
            keggId += keggIdentifier

    ## compare the InChI layers of the metabolite with those stored in ChEBI
    if len(keggId) == 0 and pd.isna(inchiOriginal) is False and inchiOriginal != '':
        originalLayers = _inchiLayers(inchiOriginal.split('/'))

        lchebiId = []
        # A match requires formula, atom connection and hydrogen layers to be
        # all present and identical, so an incomplete reference cannot match.
        if all(layer != '' for layer in originalLayers):
            for row in dfChebiInchi.itertuples():
                if _inchiLayers(row.InChI_splitted) == originalLayers:
                    lchebiId.append(row.CHEBI_ID)

        keggId += lchebiId
        ## From the list of ChEBI identifiers search the associated KEGG and MetaCyc identifiers
        keggIdentifiers = fromChebi2KeggAndMetaCyc(lchebiId, dfChebiDb, dfChebiRelations)
        keggId += keggIdentifiers

    ## search in KEGG the name truncated at its last parenthesis or comma
    if len(keggId) == 0:
        for database in lDbs:
            for symbol in ('(', ','):
                if symbol not in met:
                    continue
                metTruncated = met[:met.rfind(symbol)].strip()
                try:
                    lkeggId_perfectMatch = queryKeggCompound(database, metTruncated, metTruncated, lSymbols)
                except Exception:
                    keggId += _searchVariants(metTruncated, database)
                else:
                    keggId += lkeggId_perfectMatch

    keggId = gL.unique(keggId)
    keggId = [k for k in keggId if k != '']
    return keggId
Example #13
0
                                        else:
                                            rxnEquation = ''
                                            rxnDefinition = ''

                                        dCompleteRxns_equation[
                                            rxnName] = rxnEquation
                                        dCompleteRxns_definition[
                                            rxnName] = rxnDefinition
                                        gL.writeLineByLineToFile(rxnFile, [
                                            rxnName, rxnEquation, rxnDefinition
                                        ],
                                                                 sep='\t')

                                    if rxnName not in lAssociatedRxns:
                                        lAssociatedRxns.append(rxnName)
    lAssociatedRxns = gL.unique(lAssociatedRxns)
    dGene2RxnsList[gene] = lAssociatedRxns
    gL.writeLineByLineToFile(geneFile, [gene, lAssociatedRxns], sep='\t')
# The reaction and gene files are complete: close them
rxnFile.close()
geneFile.close()

# Write one line per entry of dRxn2EcNumber
for rxn, ecNumber in dRxn2EcNumber.items():
    gL.writeLineByLineToFile(rxn2EcFile, [rxn, ecNumber], sep='\t')
rxn2EcFile.close()

# Generate the file including reaction to the corresponding catalysing genes list
flatdGene2RxnsList_values = []
for associatedRxns in dGene2RxnsList.values():
    flatdGene2RxnsList_values.extend(associatedRxns)
flatdGene2RxnsList_values = gL.unique(flatdGene2RxnsList_values)
outFile = open(os.path.join(OUTDIR, model + '_Rxns2Genes.csv'), mode='w')
Example #14
0
    elif organismChoice == '2':
        organismCode = input('Insert the KEGG organism code: ')

# Reaction -> genes table; the 'Genes' column is stored as text and parsed
# back into Python objects.
dfRxnToGenes = pd.read_csv(
    os.path.join(OUTDIR, model + '_Rxns2Genes.csv'), sep='\t')
dfRxnToGenes['Genes'] = dfRxnToGenes['Genes'].apply(literal_eval)
dfKegg2UniprotId = pd.read_csv(
    os.path.join(OUTDIR, model + '_Kegg2UniprotGenes.csv'),
    sep="\t",
    dtype={'keggId': str})

# Flatten the nested gene lists of every reaction into one list
lOrganismGenesSet = []
for lRxnGroups in dfRxnToGenes['Genes']:
    for lGroup in lRxnGroups:
        lOrganismGenesSet.extend(lGroup)
lOrganismGenesSet = gL.unique(lOrganismGenesSet)

## Select from the dfKegg2UniprotId dataframe only the metabolic genes included in 'lOrganismGenesSet' list
dfKegg2UniprotId = dfKegg2UniprotId[
    dfKegg2UniprotId['keggId'].isin(lOrganismGenesSet)
].reset_index(drop=True)

#############################################################
# Execute getUniprotAndComplexPortalData function
#############################################################
print('Get data from Uniprot and Complex Portal: DOING')
dfData = gprL.getUniprotAndComplexPortalData(dfKegg2UniprotId)
print('Get data from Uniprot and Complex Portal: DONE\n')

#############################################################
# Execute textMiningFromUniprot function
Example #15
0
# Filter genes associated to each reaction according to the associated compartment
genes2CompPath = os.path.join(OUTDIR, dfGenes2Comp + '.csv')
dfGenes2Compartment = pd.read_csv(genes2CompPath, sep='\t', dtype={'Gene': str})
# 'lCompartments' is stored as the text of a Python literal: parse it back
dfGenes2Compartment['lCompartments'] = dfGenes2Compartment['lCompartments'].apply(literal_eval)
dGenes2Compartment = dfGenes2Compartment.set_index('Gene')['lCompartments'].to_dict()

rxns2GenesPath = os.path.join(OUTDIR, dfrxns2Genes + '.csv')
dfModelRxns2Genes = pd.read_csv(rxns2GenesPath, sep='\t', dtype={'RxnId': str})
dfModelRxns2Genes['Genes'] = dfModelRxns2Genes['Genes'].apply(literal_eval)

dRxn2GeneswLocation = {}
for row in dfModelRxns2Genes.itertuples():

    ## flatten the nested 'Genes' entry to get all the genes of the current reaction
    lAllGene_currentRxn = gL.unique(
        [geneId for lGroup in row.Genes for geneId in lGroup])

    # restrict dGenes2Compartment to the genes of the current reaction
    # (a gene missing from dGenes2Compartment raises KeyError here)
    dGenesCurrentRxn = {}
    for geneId in lAllGene_currentRxn:
        dGenesCurrentRxn[geneId] = dGenes2Compartment[geneId]

    # gather every compartment annotated for those genes; one duplicate of the
    # current reaction is then created per retrieved compartment
    lRetrievedCompartments = gL.unique(
        [comp for lComps in dGenesCurrentRxn.values() for comp in lComps])

    # duplicates are keyed '<RxnId>_<n>' with n counting up from 1
    rxnSuffix = 1
    dRxn2AnnotatedCompartments = {}
    for putativeComp in lRetrievedCompartments:
        dRxn2AnnotatedCompartments[row.RxnId + '_' + str(rxnSuffix)] = putativeComp
        rxnSuffix += 1
Example #16
0
    # Rows of dfkeggC whose ChebiId list contains at least one of the identifiers
    # already collected for k: their 'Id' values are added to the collection.
    # NOTE: `axis` is passed by keyword because DataFrame.any() rejects
    # positional arguments in pandas >= 2.0 (the former `.any(1)` raises TypeError).
    dfkeggC_filter = dfkeggC[pd.DataFrame(dfkeggC.ChebiId.tolist()).isin(
        dizMetsIdentifiers[k]).any(axis=1).values]
    if not dfkeggC_filter.empty:
        lCompleteIds += list(dfkeggC_filter['Id'].dropna())

    # Rows of dfkeggG whose 'Id' is one of the identifiers of k: every element
    # of their ChebiId lists is added.
    keggDbG = dfkeggG[dfkeggG['Id'].isin(dizMetsIdentifiers[k])]
    if not keggDbG.empty:
        lCompleteIds += [
            el for l in list(keggDbG['ChebiId'].dropna()) for el in l
        ]

    # Among those rows, the ones whose ChebiId list also contains an identifier
    # of k contribute their 'Id' as well.
    keggDbG_filter = keggDbG[pd.DataFrame(keggDbG.ChebiId.tolist()).isin(
        dizMetsIdentifiers[k]).any(axis=1).values]
    if not keggDbG_filter.empty:
        lCompleteIds += list(keggDbG_filter['Id'].dropna())

    # Store the enlarged identifier list back for k after passing it through gL.unique
    lCompleteIds = gL.unique(lCompleteIds)
    dizMetsIdentifiers[k] = lCompleteIds

# For every row of dfMetsFromModel, fetch the identifiers stored in
# dizMetsIdentifiers under the whitespace-stripped 'Name' of that row
lIdentifiers = [
    dizMetsIdentifiers[metRow.Name.strip()]
    for metRow in dfMetsFromModel.itertuples()
]

# Attach them as a new column and save the table as a tab-separated file
dfMetsFromModel['lIdentifiers'] = lIdentifiers
enrichedPath = os.path.join(OUTDIR, dfmetsInfo + '_enriched.csv')
dfMetsFromModel.to_csv(enrichedPath, sep='\t', index=False)

## Read from ChEBI all the parental identifiers of each metabolite
dfChebiFormula = pd.read_csv(os.path.join(RAWDIR,
                                          'chebi_chemical_data_20201216.tsv'),
        for k in lCompartmentsOrganization:
            if dModelCompartments[comp].lower() in lCompartmentsOrganization[k]:
                lRxnComps.append(k)
    dRxn2Compartments[r.id] = lRxnComps

# Filter genes associated to each reaction according to the associated compartment
genes2CompPath = os.path.join(OUTDIR, dfGenes2Comp + '.csv')
dfGenes2Compartment = pd.read_csv(genes2CompPath, sep='\t', dtype={'Gene': str})
# 'lCompartments' is stored as the text of a Python literal: parse it back
dfGenes2Compartment['lCompartments'] = dfGenes2Compartment['lCompartments'].apply(literal_eval)
dGenes2Compartment = dfGenes2Compartment.set_index('Gene')['lCompartments'].to_dict()

rxns2GenesPath = os.path.join(OUTDIR, dfrxns2Genes + '.csv')
dfModelRxns2Genes = pd.read_csv(
    rxns2GenesPath,
    sep='\t',
    dtype={'Rxn': str, 'KeggId': str, 'GPR': str, 'Name': str, 'IsTransport': bool, 'IsExchange': bool, 'GPRrule': str})
dfModelRxns2Genes['lGenes'] = dfModelRxns2Genes['lGenes'].apply(literal_eval)

# One entry per reaction row: its nested gene list, stripped of the genes whose
# annotated compartments never overlap with the compartments of the reaction
lGenesFiltered_all = []
for row in dfModelRxns2Genes.itertuples():
    lCompRxnModel = gL.unique(dRxn2Compartments[row.Rxn_conv])

    # Without genes or without reaction compartments the gene list is kept untouched
    if len(row.lGenes) == 0 or len(lCompRxnModel) == 0:
        lGenesFiltered_all.append(row.lGenes)
        continue

    lAllGene_currentRxn = gL.unique(
        [geneId for lGroup in row.lGenes for geneId in lGroup])

    # A gene is discarded only when it has compartment annotations and none of
    # them is shared with the reaction; genes with no annotation are kept
    lGenes2Remove = [
        geneId for geneId in lAllGene_currentRxn
        if len(dGenes2Compartment[geneId]) != 0
        and len(gL.intersect(gL.unique(dGenes2Compartment[geneId]), lCompRxnModel)) == 0
    ]

    # Drop the discarded genes from every inner list, then drop inner lists left empty
    lGenesFiltered = []
    for lGroup in row.lGenes:
        lKept = [el for el in lGroup if el not in lGenes2Remove]
        if lKept != []:
            lGenesFiltered.append(lKept)
    lGenesFiltered_all.append(lGenesFiltered)
Example #18
0
def findPutativeRxns(dfEqualMets, colL, colR, db):
    """Collect the reaction identifiers of the rows of dfEqualMets that pass the matching checks.

    Four boolean-like columns ('fromRxn2Putative1', 'fromPutative2Rxn1',
    'fromRxn2Putative2', 'fromPutative2Rxn2') are written into dfEqualMets
    (the caller's dataframe is modified in place). A row is retained when both
    '...1' columns equal True or both '...2' columns equal True.

    NOTE: `lReactants_ids` and `lProducts_ids` are not parameters; they are
    looked up in the enclosing scope when the function runs.

    Args:
        dfEqualMets: dataframe of candidate reactions; must also carry the
            identifier columns read at the end of the function.
        colL: name of the column passed as first row argument to the checks.
        colR: name of the column passed as second row argument to the checks.
        db: 'metacyc' or 'kegg' (checks done by findReaction_fromRxn2Putative /
            findReaction_fromPutative2Rxn) or 'rhea' (checks done by
            findReaction_rhea). For any other value no check column is
            computed, so the four columns must already exist in dfEqualMets.

    Returns:
        The identifiers gathered from the retained rows, passed through
        gL.unique; an empty list when no row is retained.
    """
    # Map each output column to the row-wise check that fills it.
    # Insertion order is the order in which the columns are created.
    if db in ('metacyc', 'kegg'):
        dChecks = {
            'fromRxn2Putative1':
            lambda row: findReaction_fromRxn2Putative(
                lReactants_ids, lProducts_ids, row[colL], row[colR]),
            'fromPutative2Rxn1':
            lambda row: findReaction_fromPutative2Rxn(
                lReactants_ids, lProducts_ids, row[colL], row[colR]),
            # second pair: same checks with reactants and products swapped
            'fromRxn2Putative2':
            lambda row: findReaction_fromRxn2Putative(
                lProducts_ids, lReactants_ids, row[colL], row[colR]),
            'fromPutative2Rxn2':
            lambda row: findReaction_fromPutative2Rxn(
                lProducts_ids, lReactants_ids, row[colL], row[colR]),
        }
    elif db == 'rhea':
        dChecks = {
            'fromRxn2Putative1':
            lambda row: findReaction_rhea(
                lReactants_ids, lProducts_ids, row[colL], row[colR]),
            'fromPutative2Rxn1':
            lambda row: findReaction_rhea(
                row[colL], row[colR], lReactants_ids, lProducts_ids),
            # second pair: same checks with the two row columns swapped
            'fromRxn2Putative2':
            lambda row: findReaction_rhea(
                lReactants_ids, lProducts_ids, row[colR], row[colL]),
            'fromPutative2Rxn2':
            lambda row: findReaction_rhea(
                row[colR], row[colL], lReactants_ids, lProducts_ids),
        }
    else:
        dChecks = {}

    for checkCol, rowCheck in dChecks.items():
        dfEqualMets[checkCol] = dfEqualMets.apply(rowCheck, axis=1)

    # A row matches when both checks of the first pair, or both checks of the
    # second pair, are True
    isMatchPair1 = ((dfEqualMets['fromRxn2Putative1'] == True) &
                    (dfEqualMets['fromPutative2Rxn1'] == True))
    isMatchPair2 = ((dfEqualMets['fromRxn2Putative2'] == True) &
                    (dfEqualMets['fromPutative2Rxn2'] == True))
    dfMatchesEqual_L = dfEqualMets[isMatchPair1 | isMatchPair2]

    lPutativeRxns = []
    if not dfMatchesEqual_L.empty:
        # Columns whose non-null cells are taken as they are
        for idCol in ('MetaCycId', 'RheaId', 'KeggId_fromKegg'):
            lPutativeRxns += list(dfMatchesEqual_L[idCol].dropna())

        # Columns whose non-null cells are iterables of identifiers
        for idCol in ('OtherRheaId_fromKegg', 'RheaId_master', 'RheaId_lr',
                      'RheaId_rl', 'RheaId_bi'):
            for lIds in dfMatchesEqual_L[idCol].dropna():
                lPutativeRxns.extend(lIds)

        # 'KeggId_y' cells are nested one level deeper
        for lNested in dfMatchesEqual_L['KeggId_y'].dropna():
            for lIds in lNested:
                lPutativeRxns.extend(lIds)

        # 'MetaCycId_fromRhea' cells are comma-separated strings
        for joinedIds in dfMatchesEqual_L['MetaCycId_fromRhea'].dropna():
            lPutativeRxns.extend(joinedIds.split(','))

        lPutativeRxns = gL.unique(lPutativeRxns)

    return lPutativeRxns
Example #19
0
    dfrxns2Genes = ''
    orgCode = ''
    lCompartmentsOrganization = {}


# Extract for the identified genes all the corresponding compartment information
rxns2GenesPath = os.path.join(OUTDIR, dfrxns2Genes + '.csv')
dfModelRxns2Genes = pd.read_csv(
    rxns2GenesPath,
    sep='\t',
    dtype={'Rxn': str, 'KeggId': str, 'GPR': str, 'Name': str, 'IsTransport': bool, 'IsExchange': bool, 'GPRrule': str})
# 'lGenes' is stored as the text of a Python literal: parse it back
dfModelRxns2Genes['lGenes'] = dfModelRxns2Genes['lGenes'].apply(literal_eval)

# Flatten the nested gene lists of all the reactions into a single list
lAllGenes = []
for row in dfModelRxns2Genes.itertuples():
    if row.lGenes == []:
        continue
    for lGroup in row.lGenes:
        lAllGenes.extend(lGroup)

lAllGenes = gL.unique(lAllGenes)

# Convert KEGG gene identifiers to Uniprot identifiers.
# Each returned line is split on tabs; the first field gives the Uniprot id and
# the second the gene id, both taken as the text after the first ':'.
dGene2Uniprot = {}
uniprot2Org = RESTmod.kegg_conv(orgCode, "uniprot").readlines()
for convLine in uniprot2Org:
    fields = convLine.strip().split('\t')
    geneId = fields[1].split(':')[1]
    uniprotId = fields[0].split(':')[1]
    # a gene may map to several Uniprot ids: accumulate them in a list
    dGene2Uniprot.setdefault(geneId, []).append(uniprotId)


dCompartments = {'cytosolic small ribosomal subunit': 'ribosome', 'Slx1-Slx4 complex': 'nucleus', 'cytosol': 'cytoplasm',
Example #20
0
def extractChebiIds(l):
    """Return the identifiers extracted from the strings in l, passed through gL.unique.

    For each string, the text before the first ';' is taken and its first six
    characters are dropped (presumably a 'CHEBI:' prefix -- confirm against the
    callers).
    """
    return gL.unique([entry.split(';')[0][6:] for entry in l])
Example #21
0
                    chebiIdentifiers)]

                lAllChebi2Search = []
                if dfChebiExplodedFilter.empty is False:
                    for riga in list(
                            dfChebiExplodedFilter['ParentalChebiIds']):
                        lAllChebi2Search += riga
                    for riga in list(dfChebiExplodedFilter['AllChebiIds']):
                        lAllChebi2Search += riga

                if dfChebiFilter.empty is False:
                    for riga in list(dfChebiFilter['ParentalChebiIds']):
                        lAllChebi2Search += riga
                    for riga in list(dfChebiFilter['AllChebiIds']):
                        lAllChebi2Search += riga
                lAllChebi2Search = gL.unique(lAllChebi2Search)
                dfChebiFormula_filtered = dfChebiFormula[
                    dfChebiFormula['COMPOUND_ID'].isin(lAllChebi2Search)]
                if dfChebiFormula_filtered.empty is False:
                    dfChebiFormula_filtered = dfChebiFormula_filtered.reset_index(
                        drop=True)
                    dfChebiFormula_filtered_toKeep = dfChebiFormula_filtered[
                        dfChebiFormula_filtered['TYPE'] == 'FORMULA']
                    if dfChebiFormula_filtered_toKeep.empty is False:
                        l += list(
                            dfChebiFormula_filtered_toKeep['COMPOUND_ID'])
                        for riga in list(
                                dfChebiFormula_filtered_toKeep['COMPOUND_ID']):
                            dfChebiExplodedFilter2 = dfChebiCompounds_exploded[
                                dfChebiCompounds_exploded[
                                    'ParentalChebiIds_exploded'].isin(
Example #22
0
# Remove the rows duplicated on ('ID', 'name') and renumber the index from zero
dfFinal_p2 = dfFinal_p2.drop_duplicates(subset=['ID', 'name']).reset_index(drop=True)

## join dfFinal and dfFinal_p2
mappingPath = os.path.join(OUTDIR, modelName + '_mappingMetaNetX_20210901.csv')
dfAllMets = pd.concat([dfFinal, dfFinal_p2])
dfAllMets.to_csv(mappingPath, sep='\t')

## STEP 2. Comparison of MetaNetX and Fuzzy Wuzzy output
# The file just written is read back with every column as text
dfMetsFromMetaNetX = pd.read_csv(mappingPath, sep='\t', dtype=str, index_col=0)

# A 'name' cell is either a plain string or the text of a list of strings
# (recognised by its leading "['" or '["'): parse the latter back into a list,
# wrap the former in a one-element list
lName = []
for row in dfMetsFromMetaNetX.itertuples():
    if row.name.startswith(("['", '["')):
        lName.append(gL.unique(literal_eval(row.name)))
    else:
        lName.append([row.name])

# One row per single name, then overwrite the mapping file
dfMetsFromMetaNetX['name'] = lName
dfMetsFromMetaNetX = dfMetsFromMetaNetX.explode('name')
dfMetsFromMetaNetX.to_csv(mappingPath, sep='\t')

# Load the fuzzy-matching output and lower-case its 'Name' column
dfmetsFuzzy = pd.read_csv(os.path.join(OUTDIR, inputFuzzy), sep='\t', dtype=str)
dfmetsFuzzy['Name'] = dfmetsFuzzy['Name'].str.lower()

lMetsModel = dfmetsFuzzy['Name'].tolist()