def kegg2UniprotGenesId(organismCode, model, dirPath):
    """Export the KEGG-to-UniProt gene identifier mapping of an organism.

    Calls ``kegg_conv('uniprot', organismCode)``, groups the returned
    UniProt identifiers by KEGG gene identifier and writes one
    ``keggId``/``uniprotId`` pair per row to
    ``<dirPath>/<model>_Kegg2UniprotGenes.csv``.

    Args:
        organismCode: organism code forwarded to ``kegg_conv``.
        model: prefix of the output file name.
        dirPath: directory in which the output file is created.

    Returns:
        None. The only result is the file written to disk.
    """
    ## Conversion of kegg identifiers to uniprot identifiers
    kegg2uniprot = kegg_conv('uniprot', organismCode)
    dOrgKegg2Uniprot = {}
    for el in kegg2uniprot.readlines():
        el = el.strip()
        if not el:
            # A blank line carries no mapping; skipping it avoids an
            # IndexError on the colon split below.
            continue
        elSplt = el.split('\t')
        # Each field looks like '<prefix>:<identifier>'; keep the part
        # following the first colon.
        keggId = elSplt[0].split(':')[1]
        uniprotId = elSplt[1].split(':')[1]
        # One KEGG gene can be associated with several UniProt entries.
        dOrgKegg2Uniprot.setdefault(keggId, []).append(uniprotId)

    outPath = os.path.join(dirPath, model + '_Kegg2UniprotGenes.csv')
    # The context manager guarantees the file is closed even if a write fails.
    with open(outPath, mode='w') as outFile:
        # '\t' is handed to the project helper (presumably the field separator).
        gL.writeLineByLineToFile(outFile, ['keggId', 'uniprotId'], '\t')
        for k, v in dOrgKegg2Uniprot.items():
            for vv in v:
                gL.writeLineByLineToFile(outFile, [k, vv], '\t')
'mcm complex':'cytoplasm', 'mcm core complex':'cytoplasm'} dEvidenceCodes = {'manual': ['ECO:0000269', 'ECO:0000303', 'ECO:0000305', 'ECO:0000250', 'ECO:0000255','ECO:0000256', 'ECO:0000259', 'ECO:0000312', 'ECO:0007005', 'ECO:0000315','ECO:0000244', 'ECO:0000314', 'ECO:0000304', 'ECO:0000318', 'ECO:0000501'], 'automatic': ['ECO:0000313', 'ECO:0000213']} lPossibleCompartments = ['nucleus', 'nuclear lumen', 'nucleoplasm', 'nuclear envelope', 'mitochondrion inner membrane', 'mitochondrial inner membrane', 'mitochondrial membrane', 'mitochondrion', 'cytoplasm', 'vacuole', 'vacuolar membrane', 'endoplasmic reticulum', 'lipid droplet', 'plasma membrane', 'mitochondrial matrix', 'preribosome', 'cell periphery', 'mitochondrial intermembrane space', 'Golgi apparatus', 'peroxisome', 'endoplasmic reticulum membrane', 'cytoskeleton', 'mating projection', 'replication compartment', 'peroxisomal membrane', 'mitochondrial outer membrane', 'golgi membrane', 'golgi apparatus membrane', 'Golgi membrane', 'lysosomal membrane', 'lysosome'] dname2GO = {} dAnc2Name = {} outFile = open(os.path.join(OUTDIR, dfGenesLoc + '.csv'), mode='w') gL.writeLineByLineToFile(outFile, ['Gene', 'lCompartments'], '\t') u = UniProt(verbose=False) for gene in lAllGenes: lDefinitiveCompartment = [] if gene in dGene2Uniprot: lCompartments = [] lUniprotIds = dGene2Uniprot[gene] for unip in lUniprotIds: uniprotSearch = u.search("%s" % unip, frmt="xml") if uniprotSearch != '': dUniprotSearch = xmltodict.parse(uniprotSearch) dEcoEvidence = {} # Construct dictionary of the Evidence codes if 'evidence' in dUniprotSearch['uniprot']['entry']:
sep='\t') dfMatchesEmpty.to_csv(os.path.join( OUTDIR, prefix_modelName + '_mappingMetaCyc_empty.tsv'), sep='\t') ## Automatic interaction with the user to curate the list of putative matches with score between 91 and 99 dfMatches91_99 = pd.read_csv(os.path.join( OUTDIR, prefix_modelName + '_mappingMetaCyc_91_99.tsv'), sep="\t", index_col=0) dfMatches91_99['Matches'] = dfMatches91_99['Matches'].apply(ast.literal_eval) curatedFWFile = open(os.path.join( OUTDIR, prefix_modelName + '_mappingMetaCyc_91_99.tsv'), mode='w') gL.writeLineByLineToFile(curatedFWFile, ['Name', 'Matches'], '\t') for row in dfMatches91_99.itertuples(): lMatches = [] for mat in row.Matches: print('Target metabolite:\t', row.Name) print('Proposed match:\t', mat, '\n') choice = input( 'Do you want to keep this proposed metabolite name inside the final list: yes (y) or no (n)? ' ) if choice == 'y': lMatches.append(mat) gL.writeLineByLineToFile(curatedFWFile, [row.Name, lMatches], '\t') curatedFWFile.close()
# Generate the first output: reaction --> list of catalysing genes
dfRxnswGenes = pd.read_csv(os.path.join(OUTDIR, rxnswGenesFileName + '.csv'),
                           sep='\t',
                           dtype=str)
# Only the reaction identifier and the filtered gene list are exported; the
# gene column is renamed to 'Genes' in the output file.
dfRxnswGenes_filter = dfRxnswGenes[['Rxn', 'lGenes_filtered']]
dfRxnswGenes_filter = dfRxnswGenes_filter.rename(
    columns={'lGenes_filtered': 'Genes'})
dfRxnswGenes_filter.to_csv(os.path.join(OUTDIR,
                                        outputFileName + '_Rxns2Genes.csv'),
                           sep='\t',
                           index=False)

# Generate the second output: KEGG gene identifier --> corresponding Uniprot identifier
kegg2uniprot = RESTmod.kegg_conv('uniprot', organismCode)
dOrgKegg2Uniprot = {}
for el in kegg2uniprot.readlines():
    el = el.strip()
    if not el:
        # A blank line carries no mapping; skipping it avoids an IndexError
        # on the colon split below.
        continue
    elSplt = el.split('\t')
    # Each field looks like '<prefix>:<identifier>'; keep the part following
    # the first colon. One KEGG gene may map to several UniProt entries.
    keggId = elSplt[0].split(':')[1]
    uniprotId = elSplt[1].split(':')[1]
    dOrgKegg2Uniprot.setdefault(keggId, []).append(uniprotId)

# The context manager guarantees the file is closed even if a write fails.
with open(os.path.join(OUTDIR, outputFileName + '_Kegg2UniprotGenes.csv'),
          mode='w') as outFile:
    gL.writeLineByLineToFile(outFile, ['keggId', 'uniprotId'], '\t')
    for k, v in dOrgKegg2Uniprot.items():
        for vv in v:
            gL.writeLineByLineToFile(outFile, [k, vv], '\t')
'Type the correct KEGG code among the returned ones: ') elif organismChoice == '2': organismCode = input('Insert the KEGG organism code: ') # Extract the entire list of genes of the target organism according to KEGG annotation k = kegg.KEGG() keggGenes = k.list(organismCode) keggGenesSplt = keggGenes.strip().split("\n") lOrganismGenes = [] for gene in keggGenesSplt: lOrganismGenes.append(gene.split('\t')[0].split(organismCode + ':')[1]) dGene2RxnsList = {} cont = 1 geneFile = open(os.path.join(OUTDIR, model + '_GeneId2Rxns.csv'), mode='w') gL.writeLineByLineToFile(geneFile, ['GeneId', 'Rxns'], '\t') rxnFile = open(os.path.join(OUTDIR, model + '_RxnId2Equation.csv'), mode='w') gL.writeLineByLineToFile(rxnFile, ['RxnId', 'Equation', 'Definition'], '\t') rxn2EcFile = open(os.path.join(OUTDIR, model + '_RxnId2ECs.csv'), mode='w') gL.writeLineByLineToFile(rxn2EcFile, ['RxnId', 'EC number'], '\t') # Extract for each gene the list of catalysed reactions dRxn2EcNumber = {} dCompleteRxns_equation = { } # to save all the reactions candidate for the model avoiding duplicated elements (equation field of KEGG database) dCompleteRxns_definition = { } # to save all the reactions candidate for the model avoiding duplicated elements (definition field of KEGG database) for gene in lOrganismGenes: ## get for each gene its BRITE information in order to select only metabolic genes
'16908', 'C00004', '|NADH|', '57945', '25805', '|OXYGEN-MOLECULE|', '15379', 'C00007', '|NADP|', '58349', '|B-HEP-1:5|', 'C00667', '18009', 'C00006', '16474', '77312', '|NADPH|', '|CPD-16005|', 'C20745', '77177', '57783', 'C00005', 'CPD-16005', '15996', 'C00044', '37565', '|GTP|', 'C00010', '57287', '|CO-A|', '15346', 'CARBON-DIOXIDE', 'C00011', '|CARBON-DIOXIDE|', '16526', 'AMMONIUM', 'C01342', '|AMMONIUM|', '|AMMONIA|', 'C00014', 'AMMONIA', '16134', '28938', 'C00002', '15422', '22258', '|ATP|', '30616', '456216', '73342', '|ADP|', '22251', 'C00008', 'G11113', '22252', '16761' ] if testModel == 'y7' or testModel == 'y8': dfMetsFromModel['Name'] = dfMetsFromModel['Name'].str.strip() outFile = open(os.path.join(OUTDIR, dfrxnsInfo + '_wIds.csv'), mode='w') gL.writeLineByLineToFile(outFile, ['Rxn', 'PutativeIdentifiers'], '\t') lcofactors = gL.unique(lcofactors) for rowRxn in dfRxns.itertuples(): isTransport = rowRxn.IsTransport isExchange = rowRxn.IsExchange if rowRxn.Rxn.startswith('R_'): rxnId = rowRxn.Rxn[2:] else: rxnId = rowRxn.Rxn rxn = model.reactions.get_by_id(rxnId) lIdentifiersRxn = [ ] # list where all the retrieved identifiers of the current reactions are saved lReactants = [] if isTransport == True: lReactants = rowRxn.trasportedMets