def main():
    """
        Transforms the 'testing' formation energies to fixed aqueous
        conditions and writes them to
        '../res/formation_energies_transformed.csv'.

        Every compound in the table loaded from FormationEnergyFileName
        (label='testing') is transformed using its own pseudoisomer map at
        pH 7.0, I = 0.25, pMg = 14 and T = 298.15, and written as one row.
    """
    ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
        FormationEnergyFileName, label='testing')
    kegg = Kegg.getInstance()
    pH, I, pMg, T = (7.0, 0.25, 14, 298.15)

    # The context manager guarantees the file is flushed and closed, also
    # when one of the compounds raises while being transformed.
    with open('../res/formation_energies_transformed.csv', 'w') as out_file:
        output_csv = csv.writer(out_file)
        output_csv.writerow([
            "cid", "name", "dG'0", "pH", "I", "pMg", "T", "anchor",
            "compound_ref", "remark"
        ])
        for cid in ptable.get_all_cids():
            pmap = ptable.cid2PseudoisomerMap(cid)
            dG0_prime = pmap.Transform(pH=pH, I=I, pMg=pMg, T=T)
            # The trailing None is the (empty) 'remark' cell, so that each
            # row has as many cells as the header has columns.
            output_csv.writerow([
                cid, kegg.cid2name(cid), "%.1f" % dG0_prime, pH, I, pMg, T,
                1, ptable.cid2source_string[cid], None
            ])
def main():
    """
        Transforms the 'testing' formation energies to fixed aqueous
        conditions using the public dissociation constants, and writes them
        to '../res/formation_energies_transformed.csv'.

        Each compound must have exactly one species in the formation energy
        table. That species fixes the formation energy and charge of the
        compound's dissociation table, which is then transformed at
        pH 7.0, I = 0.25, pMg = 14.0 and T = 298.15.

        Raises:
            Exception - if a compound has more than one species in the table,
                        or if no dissociation table is returned for it.
    """
    pH, I, pMg, T = 7.0, 0.25, 14.0, 298.15

    dissociation = DissociationConstants.FromPublicDB()
    kegg = Kegg.getInstance()
    obs_fname = "../data/thermodynamics/formation_energies.csv"
    res_fname = "../res/formation_energies_transformed.csv"

    train_species = PsuedoisomerTableThermodynamics.FromCsvFile(
        obs_fname, label='testing')

    # The context manager guarantees the file is flushed and closed, also
    # when one of the exceptions below aborts the export half way through.
    with open(res_fname, 'w') as res_file:
        csv_out = csv.writer(res_file)
        csv_out.writerow([
            'cid', 'name', "dG'0", 'pH', 'I', 'pMg', 'T', 'anchor',
            'compound_ref', 'remark'
        ])
        for cid in train_species.get_all_cids():
            pmap = train_species.cid2PseudoisomerMap(cid)
            source = train_species.cid2source_string[cid]
            # ToMatrix returns tuples of (nH, z, nMg, dG0)
            pmatrix = pmap.ToMatrix()
            if len(pmatrix) != 1:
                raise Exception("multiple training species for C%05d" % cid)
            nH, charge, nMg, dG0 = pmatrix[0]

            # the compound name is needed both for logging and for the row
            compound_name = kegg.cid2name(cid)
            name = "%s (%d)" % (compound_name, nH)
            logging.info('Adding the formation energy of %s', name)

            diss_table = dissociation.GetDissociationTable(
                cid, create_if_missing=True)
            if diss_table is None:
                raise Exception("%s [C%05d, nH=%d, nMg=%d] does not have a "
                                "dissociation table" % (name, cid, nH, nMg))

            diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
            diss_table.SetCharge(nH, charge, nMg)
            dG0_prime = diss_table.Transform(pH, I, pMg, T)
            csv_out.writerow([
                cid, compound_name, "%.1f" % dG0_prime, pH, I, pMg, T, True,
                source, None
            ])
def ReadFormationEnergies(self):
    """
        Loads the 'training' and 'testing' formation energies from
        self.FormationEnergyFileName into self.formation_dict, so that they
        can later be added to the observed data table and be used for
        normalizing NIST data.

        For every compound, self.formation_dict[cid] is set to the tuple
        (label, ref, dG0_prime, dG0, nH, charge, nMg), where dG0_prime is
        the formation energy transformed to self.pH, self.I, self.pMg and
        self.T. Compounds that are not yet in self.cid2nH_nMg are added
        to it.

        Raises:
            Exception - if a compound has more than one species in the table,
                        if its (nH, nMg) disagrees with self.cid2nH_nMg,
                        or if it has no dissociation table.
    """
    self.formation_dict = {}

    for label in ['training', 'testing']:
        ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
            self.FormationEnergyFileName, label=label)

        for cid in ptable.get_all_cids():
            species = ptable.cid2PseudoisomerMap(cid).ToMatrix()
            if len(species) != 1:
                raise Exception("multiple training species for C%05d" % cid)
            nH, charge, nMg, dG0 = species[0]

            # the species must agree with what is already known about
            # this compound; unknown compounds are simply registered
            if cid not in self.cid2nH_nMg:
                self.cid2nH_nMg[cid] = (nH, nMg)
            elif (nH, nMg) != self.cid2nH_nMg[cid]:
                raise Exception(
                    "The pseudoisomer of C%05d "
                    "in the formation energy table (nH=%d) "
                    "is not consistent with the pKa table (nH=%d)." %
                    (cid, nH, self.cid2nH_nMg[cid][0]))

            diss_table = self.dissociation.GetDissociationTable(cid, False)
            if diss_table is None:
                raise Exception("C%05d has no pKa data" % cid)

            diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
            dG0_prime = diss_table.Transform(pH=self.pH, I=self.I,
                                             pMg=self.pMg, T=self.T)
            ref = ptable.cid2SourceString(cid)
            self.formation_dict[cid] = (label, ref, dG0_prime, dG0, nH,
                                        charge, nMg)
def ExportJSONFiles():
    """
        Exports the Group Contribution nullspace matrix and the KEGG
        compounds, reactions and enzymes as JSON files.

        The output filenames and the thermodynamic sources are taken from
        the command line options. The selected estimator is registered in
        KEGG with priority 1 and the CSV formation energies with priority 2,
        before anything is exported.
    """
    estimators = LoadAllEstimators()
    options, _ = MakeOpts(estimators).parse_args(sys.argv)

    thermo_list = [
        estimators[options.thermodynamics_source],
        PsuedoisomerTableThermodynamics.FromCsvFile(
            options.thermodynamics_csv),
    ]

    # Make sure we have all the data.
    kegg = Kegg.getInstance()
    for priority, thermo in enumerate(thermo_list, 1):
        print("Priority %d - formation energies of: %s" %
              (priority, thermo.name))
        kegg.AddThermodynamicData(thermo, priority=priority)

    db = SqliteDatabase('../res/gibbs.sqlite')

    print('Exporting Group Contribution Nullspace matrix as JSON.')
    nullspace_vectors = []
    for row in db.DictReader('ugc_conservations'):
        msg = row['msg']
        sparse = json.loads(row['json'])
        # each conservation law is exported as a list of [coeff, "Cxxxxx"]
        reaction = [[coeff, "C%05d" % int(cid)]
                    for cid, coeff in sparse.iteritems()]
        nullspace_vectors.append({'msg': msg, 'reaction': reaction})
    WriteJSONFile(nullspace_vectors, options.nullspace_out_filename)

    print('Exporting KEGG compounds as JSON.')
    WriteJSONFile(kegg.AllCompounds(), options.compounds_out_filename)

    print('Exporting KEGG reactions as JSON.')
    WriteJSONFile(kegg.AllReactions(), options.reactions_out_filename)

    print('Exporting KEGG enzymes as JSON.')
    WriteJSONFile(kegg.AllEnzymes(), options.enzymes_out_filename)
def EstimateKeggCids(self):
    """
        Uses the Group Contributions to estimate the entire set of compounds
        in KEGG, and then writes the results to the database table named by
        self.THERMODYNAMICS_TABLE_NAME.

        Observed formation energies are used instead of the GC estimation
        for the 'testing' compounds of the formation energy table and for
        the redox carriers. This method takes no arguments and reads no
        override option.

        For every KEGG compound, one of the following happens:
        - observed, one species and a dissociation table: the other
          pseudoisomers are completed using the dissociation table.
        - observed, otherwise: the observed pseudoisomer map is used as is.
        - not observed, no dissociation table: skipped with an HTML warning.
        - not observed, no group vector: skipped with an HTML warning.
        - otherwise: the GC estimation is stored in the dissociation table.
    """
    logging.info("Estimating formation energies for all KEGG")

    observed_species = PsuedoisomerTableThermodynamics.FromCsvFile(
        '../data/thermodynamics/formation_energies.csv', label='testing')

    for rc in RedoxCarriers().itervalues():
        observed_species.AddPseudoisomer(rc.cid_ox, nH=rc.nH_ox, z=rc.z_ox,
                                         nMg=0, dG0=0.0, ref=rc.ref)
        observed_species.AddPseudoisomer(rc.cid_red, nH=rc.nH_red,
                                         z=rc.z_red, nMg=0, dG0=rc.ddG0,
                                         ref=rc.ref)
        observed_species.cid2source_string[rc.cid_ox] = rc.ref
        observed_species.cid2source_string[rc.cid_red] = rc.ref

    # observed_species is not modified below, so its CIDs are collected
    # once instead of being fetched again for every KEGG compound
    observed_cids = set(observed_species.get_all_cids())

    self.cid2pmap_dict = {}
    self.cid2source_string = {}

    self.html_writer.write(
        '</br><b>Estimated formation energies for KEGG compounds</b>\n')
    self.html_writer.insert_toggle(start_here=True)
    for cid in sorted(self.kegg.get_all_cids()):
        self.html_writer.write('<b>C%05d - %s</b></br>\n' %
                               (cid, self.kegg.cid2name(cid)))

        diss_table = self.GetDissociationTable(cid)
        if cid in observed_cids:
            pmap_obs = observed_species.cid2PseudoisomerMap(cid)
            self.cid2source_string[cid] = \
                observed_species.cid2SourceString(cid)
            # ToMatrix returns a list of (nH, z, nMg, dG0)
            pmatrix = pmap_obs.ToMatrix()
            if len(pmatrix) == 1 and diss_table is not None:
                # assume that only the most abundant pseudoisomer is given
                # and complete the formation energies of the others using
                # the pKa values in the dissociation table
                nH, _z, nMg, dG0 = pmatrix[0]
                diss_table.SetFormationEnergyByNumHydrogens(dG0=dG0, nH=nH,
                                                            nMg=nMg)
                pmap = diss_table.GetPseudoisomerMap()
                self.SetPseudoisomerMap(cid, pmap)
            else:
                if diss_table is not None:
                    logging.warning("C%05d has multiple training species, "
                                    "overriding the dissociation table" %
                                    cid)
                self.SetPseudoisomerMap(cid, pmap_obs)
        elif diss_table is None:
            self.html_writer.write('Warning: no dissociation table</br>\n')
            continue
        else:
            nH, nMg = self.cid2nH_nMg[cid]
            groupvector = self.cid2groupvec.get(cid, None)
            if groupvector is None:
                self.html_writer.write(
                    'Warning: no group vector (%s)</br>\n' %
                    self.cid2error[cid])
                continue
            try:
                dG0 = self.groupvec2val(groupvector)
            except GroupMissingTrainDataError as e:
                # in this case we do not care if a compound violated the
                # group conservation laws because it might cancel out later
                # when we use it to calculate reactions.
                dG0 = e.value
                self.html_writer.write('Warning: %s</br>\n' % str(e))
                logging.debug("C%05d: %s" % (cid, str(e)))

            self.cid2source_string[cid] = "Group Contribution"

            if self.transformed:
                # the estimation is a transformed energy, given at the
                # default conditions
                diss_table.SetTransformedFormationEnergy(dG0_tag=dG0,
                                                         pH=default_pH,
                                                         I=default_I,
                                                         pMg=default_pMg,
                                                         T=default_T)
            else:
                diss_table.SetFormationEnergyByNumHydrogens(dG0=dG0, nH=nH,
                                                            nMg=nMg)
            pmap = diss_table.GetPseudoisomerMap()
            self.SetPseudoisomerMap(cid, pmap)

        self.cid2PseudoisomerMap(cid).WriteToHTML(self.html_writer)

    self.html_writer.div_end()

    logging.info("Writing the results to the database")
    self.ToDatabase(self.db, self.THERMODYNAMICS_TABLE_NAME)
# dG0 = -E'*nE*F - R*T*ln(10)*nH*pH # Where: # F = 96.48 kC/mol # nE - change in e- # nH - change in H+ # pH - the conditions in which the E' was measured # # Ferredoxin ox/red: E' = -0.380V (nE = 1, nH = 0) -> dG0 = 38.0 kJ/mol [1] # Ubiqinone ox/red: E' = 0.113V (nE = 2, nH = 2) -> dG0 = -103.2 kJ/mol [1] # Menaquinone ox/red: E' = -0.074V (nE = 2, nH = 2) -> dG0 = -65.8 kJ/mol [1] # # [1] - Thauer 1977 observed_thermo_fname = options.thermodynamics_filename print 'Loading observed thermodynamic data from %s' % observed_thermo_fname observed_thermo = PsuedoisomerTableThermodynamics.FromCsvFile( observed_thermo_fname) if thermo_source == 'hatzi_only': thermo = PsuedoisomerTableThermodynamics.FromDatabase( db, 'hatzi_thermodynamics') thermo.AddPseudoisomer(139, nH=0, z=1, nMg=0, dG0=0) # Ferrodoxin(ox) thermo.AddPseudoisomer(138, nH=0, z=0, nMg=0, dG0=38.0) # Ferrodoxin(red) thermo.AddPseudoisomer(399, nH=90, z=0, nMg=0, dG0=0) # Ubiquinone-10(ox) thermo.AddPseudoisomer(390, nH=92, z=0, nMg=0, dG0=-103.2) # Ubiquinone-10(red) thermo.AddPseudoisomer(828, nH=16, z=0, nMg=0, dG0=0) # Menaquinone(ox) thermo.AddPseudoisomer(5819, nH=18, z=0, nMg=0, dG0=-65.8) # Menaquinone(red)
def main():
    """
        Starts one DissociationThreads worker for every compound whose
        dissociation constants are not yet in the database table.

        With the 'nist' option only the compounds of NIST, the redox
        carriers and the formation energy table are considered; otherwise
        all KEGG compounds are. Compounds without a usable SMILES string are
        skipped, and the rest are handled in order of increasing mass.
    """
    options, _ = MakeOpts().parse_args(sys.argv)
    db = SqliteDatabase(options.db_file)
    kegg = Kegg.getInstance()

    if options.override_table:
        db.Execute("DROP TABLE IF EXISTS " + options.table_name)

    DissociationConstants._CreateDatabase(
        db, options.table_name, drop_if_exists=options.override_table)

    candidate_cids = set()
    if options.nist:
        candidate_cids.update(Nist().GetAllCids())
        candidate_cids.update(RedoxCarriers().GetAllCids())
        ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
            "../data/thermodynamics/formation_energies.csv")
        candidate_cids.update(ptable.get_all_cids())
    else:
        candidate_cids.update(kegg.get_all_cids())

    # Do not recalculate pKas for CIDs that are already in the database
    query = "SELECT distinct(cid) FROM %s" % options.table_name
    for row in db.Execute(query):
        candidate_cids.discard(row[0])

    cid2smiles_and_mw = {}
    for cid in candidate_cids:
        # the compound CO is a special case where the conversion from InChI
        # to SMILES fails, so we add a specific override for it only
        if cid == 237:
            cid2smiles_and_mw[cid] = ("[C-]#[O+]", 28)
            continue

        try:
            comp = kegg.cid2compound(cid)
            mol = comp.GetMolecule()
            cid2smiles_and_mw[cid] = (mol.ToSmiles(), mol.GetExactMass())
        except KeggParseException:
            logging.debug("%s (C%05d) has no SMILES, skipping..." %
                          (kegg.cid2name(cid), cid))
        except OpenBabelError:
            logging.debug(
                "%s (C%05d) cannot be converted to SMILES, skipping..." %
                (kegg.cid2name(cid), cid))

    # lightest compounds first, ties are broken by the compound ID
    ordered_cids = sorted(
        cid2smiles_and_mw,
        key=lambda cid: (cid2smiles_and_mw[cid][1], cid))

    db_lock = threading.Lock()
    semaphore = threading.Semaphore(options.n_threads)
    for cid in ordered_cids:
        smiles = cid2smiles_and_mw[cid][0]
        if not smiles:
            logging.info("The following compound is blacklisted: C%05d" % cid)
            continue

        worker = DissociationThreads(
            group=None, target=None, name=None,
            args=(cid, smiles, semaphore, db_lock, options), kwargs={})
        worker.start()
help="Group Contribution Table Name") return opt_parser options, _ = MakeOpts().parse_args(sys.argv) if options.sbml_model_filename == None: raise ValueError("Must provide a SBML model") print 'SBML model filename:', options.sbml_model_filename print 'CSV output filename:', options.csv_output_filename print 'KEGG Database filename:', options.kegg_db_filename print 'Observed Thermodynamics filename:', options.thermo_filename print 'Thermodynamic Database filename:', options.db_filename print 'Group Contribution Table Name:', options.gc_table_name db = SqliteDatabase(options.db_filename) observed_thermo = PsuedoisomerTableThermodynamics.FromCsvFile( options.thermo_filename) if not db.DoesTableExist(options.gc_table_name): raise ValueError('The table %s does not exist in the database. ' 'Please run the groups.py script and try again.' % options.gc_table_name) thermo = PsuedoisomerTableThermodynamics.FromDatabase( db, options.gc_table_name) thermo.override_data(observed_thermo) kegg = Kegg.getInstance() document = libsbml.readSBML(options.sbml_model_filename) if document.getNumErrors(): raise Exception('cannot read SBML model from file %s due to error: %s' % (options.sbml_model_filename, document.getError(0).getMessage())) model = document.getModel() logging.info('Done parsing the model: ' + model.getName())
nist_row_data.pMg = 14 nist_row_data.evaluation = "A" nist_row_data.url = "" nist_row_data.ref_id = "" #nist_row_data.reaction = Reaction(["triose isomerase"], {111:-1, 118:1}) #cid2nH = {111:6, 118:5} #cid2min_dG0 = {111:-1296.3, 118:-1288.6} nist_row_data.reaction = Reaction(["triose isomerase"], { 1: -1, 78: -1, 22: 1, 14: 1, 463: 1 }) train_species = PsuedoisomerTableThermodynamics.FromCsvFile( '../data/thermodynamics/dG0.csv') cid2nH = {} cid2dG0 = {} for cid in train_species.get_all_cids(): pmap = train_species.cid2PseudoisomerMap(cid) pmatrix = pmap.ToMatrix() if len(pmatrix) != 1: raise Exception("C%05d has more than one species in the training set" % cid) cid2nH[cid] = pmatrix[0][0] # ToMatrix returns tuples of (nH, z, nMg, dG0) cid2dG0[cid] = pmatrix[0][3] ddG0_r = 0 dG0_r = 0 dG0_tag_r = 0 for cid, coeff in nist_row_data.reaction.sparse.iteritems():
def ConvertFormation2Reaction(self, output_fname):
    """
        Writes every formation energy as the formation reaction of the
        compound from its elements, in the CSV format used for TECRDB.

        Arguments:
            output_fname - the path of the CSV file to write (overwritten).

        Compounds without a dissociation table or without an atom bag are
        skipped. The written dG0' is the formation energy transformed to
        self.pH, self.I, self.pMg and self.T.

        Raises:
            Exception - if one of the elements cannot be found in KEGG, or
                        if a compound has more than one species in the table.
    """
    logging.info("Converting all formation energies to reactions")

    # The context manager guarantees the file is flushed and closed, also
    # when one of the exceptions below aborts the conversion.
    with open(output_fname, 'w') as output_file:
        output_csv = csv.writer(output_file)

        # keep the format used for TECRDB
        output_csv.writerow(
            ('ref', 'ID', 'method', 'eval', 'EC', 'name', 'kegg_reaction',
             'reaction', 'dG0\'', 'T', 'I', 'pH', 'pMg'))

        atom2cid = {}
        for atom, (name, stoich) in KeggObservation.ATOM2ELEMENT.iteritems():
            cid, _, _ = self.kegg.name2cid(name, 0)
            if cid is None:
                raise Exception(
                    "Cannot find the element %s in the KEGG database" % name)
            atom2cid[atom] = (cid, stoich)
            #output_csv.writerow(('element',
            #    'C%05d' % cid, 'formation', 'A', '',
            #    'formation of %s' % self.kegg.cid2name(cid),
            #    "C%05d" % cid,
            #    name, 0, self.T, self.I, self.pH, self.pMg))

        for label in ['training', 'testing']:
            ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
                self.FormationEnergyFileName, label=label)
            for cid in ptable.get_all_cids():
                pmatrix = ptable.cid2PseudoisomerMap(cid).ToMatrix()
                if len(pmatrix) != 1:
                    raise Exception(
                        "multiple training species for C%05d" % cid)
                nH, _charge, nMg, dG0 = pmatrix[0]

                # NOTE(review): 'dissociation' is a bare name here, not an
                # attribute of self - confirm that it is defined at module
                # level.
                diss_table = dissociation.GetDissociationTable(cid, False)
                if diss_table is None:
                    continue
                diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
                dG0_prime = diss_table.Transform(pH=self.pH, I=self.I,
                                                 pMg=self.pMg, T=self.T)
                ref = ptable.cid2SourceString(cid)

                atom_bag = self.kegg.cid2atom_bag(cid)
                if not atom_bag:
                    continue

                ne = self.kegg.cid2num_electrons(cid)
                elem_ne = 0
                sparse = {cid: 1}
                for elem, count in atom_bag.iteritems():
                    if elem == 'H':
                        continue
                    elem_ne += count * Molecule.GetAtomicNum(elem)
                    elem_cid, elem_coeff = atom2cid[elem]
                    sparse.setdefault(elem_cid, 0)
                    sparse[elem_cid] += -count * elem_coeff

                # use the H element to balance the electrons in the formation
                # reactions (we don't need to balance protons since this is
                # a biochemical reaction, so H+ are 'free').
                H_cid, H_coeff = atom2cid['H']
                sparse[H_cid] = (elem_ne - ne) * H_coeff

                # the same name is used for the reaction and for the row
                reaction_name = "formation of %s" % self.kegg.cid2name(cid)
                reaction = Reaction(reaction_name, sparse)

                output_csv.writerow(
                    (ref, 'C%05d' % cid, 'formation', 'A', '',
                     reaction_name,
                     reaction.FullReactionString(),
                     reaction.FullReactionString(show_cids=False),
                     '%.2f' % dG0_prime, self.T, self.I, self.pH, self.pMg))