#!/usr/bin/env python import os, sys temp=list(); header=1; sys.path.append('../../Libs/Python') from BiochemPy import Reactions, Compounds, InChIs compounds_helper = Compounds() compounds_dict = compounds_helper.loadCompounds() cpds_aliases_dict = compounds_helper.loadMSAliases() cpds_names_dict = compounds_helper.loadNames() # We actually don't want obsolete reactions and compounds in our database # So we're striving to remove any 'new' ones that are obsolete # Any information attached to them should be associated with their linked counterpart # We need to retain older compounds that are now obsolete as these may be present in prior published models # The number used here is the last compound entered before we re-integrated updates from KEGG and MetaCyc # In the fall of 2018, so after this point, we'll take out obsolete compounds last_cpd_str='cpd31000' last_cpd_int=int(last_cpd_str[3:]) delete_cpds=list() for cpd in compounds_dict: cpd_int = int(cpd[3:]) if(cpd_int > last_cpd_int and compounds_dict[cpd]['is_obsolete']): delete_cpds.append(cpd) for cpd in delete_cpds:
#!/usr/bin/env python import os, sys from csv import DictReader temp = list() header = 1 sys.path.append('../../Libs/Python') from BiochemPy import Reactions, Compounds, InChIs CompoundsHelper = Compounds() Compounds_Dict = CompoundsHelper.loadCompounds() Aliases_Dict = CompoundsHelper.loadMSAliases() Names_Dict = CompoundsHelper.loadNames() Source_Classes = dict() reader = DictReader( open('../../../Biochemistry/Aliases/Source_Classifiers.txt'), dialect='excel-tab') for line in reader: if (line['Source Type'] not in Source_Classes): Source_Classes[line['Source Type']] = dict() Source_Classes[line['Source Type']][line['Source ID']] = 1 for cpd in sorted(Compounds_Dict.keys()): if (cpd not in Aliases_Dict): continue Cpd_Aliases = dict() Alias_Count = 0 for source_type in 'Primary Database', 'Secondary Database', 'Published Model': for source in sorted(Aliases_Dict[cpd].keys()):
'formula': array[1], 'charge': array[2] } #Load Curated Structures Ignored_Structures = dict() with open(Structures_Root + "Ignored_ModelSEED_Structures.txt") as ignore_file: for line in ignore_file.readlines(): array = line.split('\t') Ignored_Structures[array[0]] = 1 ignore_file.close() #Load Structures and Aliases Structures_Dict = CompoundsHelper.loadStructures( ["SMILE", "InChIKey", "InChI"], ["KEGG", "MetaCyc"]) MS_Aliases_Dict = CompoundsHelper.loadMSAliases(["KEGG", "MetaCyc"]) master_structs_file = open(Structures_Root + "All_ModelSEED_Structures.txt", 'w') unique_structs_file = open(Structures_Root + "Unique_ModelSEED_Structures.txt", 'w') unique_structs_file.write("ID\tType\tAliases\tFormula\tCharge\tStructure\n") structure_conflicts_file = open("Structure_Conflicts.txt", 'w') formula_conflicts_file = open("Formula_Conflicts.txt", 'w') for msid in sorted(MS_Aliases_Dict.keys()): #Build collection of all structures for the ModelSEED ID Structs = dict() Formulas = dict() for source in 'KEGG', 'MetaCyc': if (source not in MS_Aliases_Dict[msid].keys()):
# Look the compound up once; every field used below comes from this record.
from_cpd_record = compounds_dict[disambiguating_cpd]

# An obsolete compound only triggers a warning -- processing continues.
if from_cpd_record['is_obsolete'] == 1:
    print("Warning: compound " + disambiguating_cpd +
          " is obsolete, consider using the non-obsolete version")

# Build the 'from' entry: the structures/aliases/names maps start empty,
# while formula, charge and mass are copied from the compound record.
from_entry = {
    'id': disambiguating_cpd,
    'structures': {},
    'aliases': {},
    'names': {},
}
for field in ('formula', 'charge', 'mass'):
    from_entry[field] = from_cpd_record[field]
Disambiguation_Object['from'] = from_entry

Aliases_Dict = compounds_helper.loadMSAliases()
Names_Dict = compounds_helper.loadNames()
Structures_Dict = compounds_helper.loadStructures(["InChI", "SMILE"],
                                                  ["KEGG", "MetaCyc"])

# For reverse lookup: alias -> source -> compound id -> 1
reverse_aliases_dict = {}
for cpd in Aliases_Dict:
    for source in Aliases_Dict[cpd]:
        for alias in Aliases_Dict[cpd][source]:
            sources_for_alias = reverse_aliases_dict.setdefault(alias, {})
            sources_for_alias.setdefault(source, {})[cpd] = 1
compounds_dict = compounds_helper.loadCompounds()
names_dict = compounds_helper.loadNames()

# Name indexes:
#   all_names_dict   -- every name seen, used as a set (value is always 1)
#   searchnames_dict -- key returned by compounds_helper.searchname(name)
#                       mapped to the first id that produced it; ids are
#                       visited in sorted order, so the lowest-sorting wins
searchnames_dict = {}
all_names_dict = {}
new_name_count = {}
for msid in sorted(names_dict):
    for name in names_dict[msid]:
        all_names_dict[name] = 1
        searchname = compounds_helper.searchname(name)
        # Avoid redundancy where possible: keep only the first id per key
        searchnames_dict.setdefault(searchname, msid)

# Alias indexes:
#   all_aliases       -- alias -> {id: 1} for every id carrying that alias
#   source_alias_dict -- source -> alias -> list (created empty here)
original_alias_dict = compounds_helper.loadMSAliases()
source_alias_dict = {}
all_aliases = {}
new_alias_count = {}
for msid in original_alias_dict:
    for source in original_alias_dict[msid]:
        source_alias_dict.setdefault(source, {})
        for alias in original_alias_dict[msid][source]:
            all_aliases.setdefault(alias, {})[msid] = 1
            source_alias_dict[source].setdefault(alias, [])
#!/usr/bin/env python import os import sys import json from BiochemPy import Compounds, Reactions #Load Compounds CompoundsHelper = Compounds() Compounds_Dict = CompoundsHelper.loadCompounds() MS_Aliases_Dict = CompoundsHelper.loadMSAliases(["MetaCyc", "PlantCyc"]) for cpd in MS_Aliases_Dict: if ('PlantCyc' not in MS_Aliases_Dict[cpd]): MS_Aliases_Dict[cpd]['PlantCyc'] = [] if ('MetaCyc' not in MS_Aliases_Dict[cpd]): MS_Aliases_Dict[cpd]['MetaCyc'] = [] print("\t".join([ cpd, "|".join(MS_Aliases_Dict[cpd]['PlantCyc']), "|".join(MS_Aliases_Dict[cpd]["MetaCyc"]) ])) #Load Reactions ReactionsHelper = Reactions() Reactions_Dict = ReactionsHelper.loadReactions() MS_Aliases_Dict = ReactionsHelper.loadMSAliases(["MetaCyc", "PlantCyc"]) for rxn in MS_Aliases_Dict: if ('PlantCyc' not in MS_Aliases_Dict[rxn]): MS_Aliases_Dict[rxn]['PlantCyc'] = [] if ('MetaCyc' not in MS_Aliases_Dict[rxn]):