def AddNistDatabase(self, assert_decomposition=True):
    """
        Add one observation per selected NIST row (equilibrium-constant data)
        and write an HTML summary of every row to self.html_writer.

        When self.transformed is false, each row is reverse
        Legendre-transformed through
        self.dissociation.ReverseTransformNistRow(row, self.cid2nH_nMg) to
        obtain a chemical reaction energy; otherwise the row's dG0_r is used
        unchanged. According to the original author, the reverse transform
        tries to use the same pseudoisomers as the table of formation
        energies, in order to have fewer columns in the final stoichiometric
        matrix.

        Rows whose reverse transform raises MissingDissociationConstantError
        are not added as observations; a warning is logged and included in
        the HTML output instead.

        Arguments:
            assert_decomposition - not used by this method's body.
    """
    nist = Nist()

    for r, nist_row_data in enumerate(nist.SelectRowsFromNist()):
        obs_id = "NIST%03d" % r
        msg = ""
        try:
            if not self.transformed:
                dG0 = self.dissociation.ReverseTransformNistRow(
                    nist_row_data, self.cid2nH_nMg)
            else:
                # we are using transformed energies
                dG0 = nist_row_data.dG0_r
            self.AddObservation(obs_id=obs_id,
                                obs_type=KeggObservation.TYPE_REACTION,
                                anchored=False,
                                dG0=dG0,
                                sparse=nist_row_data.reaction.sparse,
                                url=nist_row_data.url)
        except MissingDissociationConstantError as e:
            # The message used to read "because of of a missing ..." due to
            # a duplicated word across the two concatenated literals.
            msg = ('Cannot reverse transform NIST%03d because'
                   ' of a missing dissociation constant for C%05d'
                   % (r, e.cid))
            logging.warning(msg)
            dG0 = None  # marks the row as failed for the HTML summary below

        html_parts = [
            "<b id=%s>%s</b></br>\n" % (obs_id, obs_id),
            '<font size="1">\n',
            "NIST conditions: pH = %g, I = %g, pMg = %g, T = %g</br>\n" %
            (nist_row_data.pH, nist_row_data.I,
             nist_row_data.pMg, nist_row_data.T),
            'NIST reference: <a href="%s">%s</a></br>\n' %
            (nist_row_data.url, nist_row_data.ref_id),
            'EC = %s</br>\n' % nist_row_data.ec,
            "Reaction: %s</br>\n" %
            nist_row_data.reaction.to_hypertext(show_cids=False),
            "%s = %.1f</br>\n" %
            (thermodynamic_constants.symbol_dr_G0_prime,
             nist_row_data.dG0_r),
        ]
        if dG0 is None:
            html_parts.append('WARNING: %s</br>\n' % msg)
        else:
            html_parts.append('%s = %.1f</br>\n' % (self.gibbs_symbol, dG0))
        html_parts.append('</font></br></br>\n')
        self.html_writer.write("".join(html_parts))
def main():
    """
        Run Nist.verify_formation for each listed estimator (currently only
        'alberty') and write the output to ../res/formation_resolve.html.
    """
    report = HtmlWriter("../res/formation_resolve.html")
    all_estimators = LoadAllEstimators()
    estimator_names = ['alberty']

    for estimator_name in estimator_names:
        estimator = all_estimators[estimator_name]
        # a fresh Nist object is created for every estimator
        nist_data = Nist()
        nist_data.verify_formation(html_writer=report,
                                   thermodynamics=estimator,
                                   name=estimator_name)

    report.close()
def __init__(self, db, dissociation=None, html_writer=None, nist=None):
    """
        Initialize the regression object.

        Arguments:
            db           - database handle, stored as self.db
            dissociation - optional dissociation-constants object, stored as
                           self.dissociation (None when omitted)
            html_writer  - optional HTML writer; a NullHtmlWriter is used
                           when none (or a falsy value) is given
            nist         - optional Nist object; a default Nist() is created
                           when none (or a falsy value) is given
    """
    PsuedoisomerTableThermodynamics.__init__(self)
    self.db = db
    self.html_writer = html_writer or NullHtmlWriter()
    self.nist = nist or Nist()

    # Honour the caller-supplied object. The argument used to be silently
    # discarded (the attribute was always reset to None); with the default
    # of None the behaviour for existing callers is unchanged.
    self.dissociation = dissociation

    self.cid2pmap_dict = {}
    self.assume_no_pKa_by_default = False
    # infinite by default; main() lowers it to 5 to print per-reaction analyses
    self.std_diff_threshold = np.inf
def main():
    """
        Train the NIST regression and write an HTML report with three
        collapsible sections: the regression tables, the PRC results and the
        comparison of transformed reaction energies against observations.

        The output file name and the use_prior / from_database switches are
        read from the command-line options.
    """
    options, _ = MakeOpts().parse_args(sys.argv)
    db = SqliteDatabase("../res/gibbs.sqlite")
    public_db = SqliteDatabase("../data/public_data.sqlite")

    output_filename = os.path.abspath(options.output_filename)
    logging.info('Will write output to %s' % output_filename)
    html_writer = HtmlWriter(output_filename)

    nist = Nist(T_range=None)
    nist_regression = NistRegression(db, html_writer=html_writer, nist=nist)
    # the threshold over which to print an analysis of a reaction
    nist_regression.std_diff_threshold = 5
    # Disabled overrides left by the author: nist.T_range,
    # nist.override_I = 0.25, nist.override_pMg = 14.0

    def start_section(title):
        # writes the bold section title and opens a toggled <div>
        html_writer.write('</br><b>%s</b>\n' % title)
        html_writer.insert_toggle(start_here=True)

    html_writer.write("<h2>NIST regression:</h2>")

    prior_thermo = None
    if options.use_prior:
        logging.info('Using the data from Alberty as fixed prior')
        prior_thermo = PsuedoisomerTableThermodynamics.FromDatabase(
            public_db, 'alberty_pseudoisomers', name="Alberty")

    start_section('Regression Tables')
    nist_regression.Train(options.from_database, prior_thermo)
    html_writer.div_end()

    start_section('PRC results')
    nist_regression.WriteDataToHtml(html_writer)
    html_writer.div_end()

    start_section('Transformed reaction energies - PRC vs. Observed')
    N, rmse = nist_regression.VerifyResults()
    html_writer.div_end()

    logging.info("Regression results for transformed data:")
    logging.info("N = %d, RMSE = %.1f" % (N, rmse))

    html_writer.close()
from toolbox.database import SqliteDatabase
from pygibbs.kegg_reaction import Reaction
from pygibbs.kegg import Kegg
from pygibbs.thermodynamic_constants import R
import logging
import csv

# Output locations for the comparison data, the CSV table and the figure.
DATA_FNAME = '../res/compare_ugcm_to_hatzi.txt'
CSV_FNAME = '../res/compare_ugcm_to_hatzi.csv'
FIG_FNAME = '../res/compare_ugcm_to_iAF1260'

# Conditions and pH bounds used by the script.
pH, pMg, I, T = 7.0, 14.0, 0.25, 298.15
pH_min, pH_max = 6.0, 8.0

if True:
    estimators = LoadAllEstimators()
    nist = Nist()
    feist = Feist.FromFiles()

    reactions = {}

    # Keys that are not listed in fieldnames are ignored when rows are
    # written (extrasaction='ignore'). The file handle is not closed here.
    csv_writer = csv.DictWriter(
        open(CSV_FNAME, 'w'),
        fieldnames=[
            'name',
            'reaction',
            'E[dG' '0] nist',
            'dG' '0 iAF1260',
            'dG' '0 UGCM',
        ],
        extrasaction='ignore')
    csv_writer.writeheader()

    data = []
def main():
    """
        Verify several formation-energy sources (Alberty, Hatzimanikatis and
        the group-contribution object) against the NIST data and write the
        result to ../res/nist/report.html.

        Only the first branch of the if/elif chain below is executed; the
        'elif False' branches are disabled experiments (gradient-ascent
        variants, a KEGG-wide dG0 calculation and a pseudoisomer table) that
        are kept for reference.
    """
    db = database.SqliteDatabase('../res/gibbs.sqlite')
    html_writer = HtmlWriter("../res/nist/report.html")

    gc = GroupContribution(db)
    gc.override_gc_with_measurements = True
    gc.init()

    grad = GradientAscent(gc)
    nist = Nist(db, html_writer, gc.kegg())
    nist.FromDatabase()
    alberty = Alberty()
    hatzi = Hatzi()

    # message shared by the disabled training experiments
    training_msg = "Training %d compounds using %d reactions: "

    if True:
        grad.load_nist_data(nist, alberty, skip_missing_reactions=False,
                            T_range=(298, 314))
        # Disabled by the author: grad.write_pseudoisomers(...), the
        # "Hatzimanikatis_Reduced" verification with its HTML headers, and
        # reloading the NIST data with hatzi / gc before each verification.
        sources = (("Alberty", alberty),
                   ("Hatzimanikatis", hatzi),
                   ("Milo", gc))
        for label, thermo in sources:
            grad.verify_results(label, thermo, html_writer)

    elif False:
        # Run the gradient ascent algorithm, where the starting point is the
        # same file used for training the GC algorithm
        grad.load_dG0_data("../data/thermodynamics/dG0.csv")
        # load the data for the anchors (i.e. compounds whose dG0 should not
        # be changed - usually their value will be 0).
        grad.anchors = grad.load_dG0_data(
            "../data/thermodynamics/nist_anchors.csv")
        grad.load_nist_data(nist, grad, skip_missing_reactions=True)
        print(training_msg % (len(grad.cid2pmap_dict.keys()),
                              len(grad.data)))
        grad.hill_climb(max_i=20000)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient1")

    elif False:
        # Run the gradient ascent algorithm, where the starting point is
        # Alberty's table from (Mathematica 2006)
        grad.load_nist_data(nist, alberty, skip_missing_reactions=True)
        print(training_msg % (len(grad.cid2pmap_dict.keys()),
                              len(grad.data)))
        grad.cid2pmap_dict = alberty.cid2pmap_dict
        grad.hill_climb(max_i=20000)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient2")

    elif False:
        # Run the gradient ascent algorithm, where the starting point is
        # Alberty's table from (Mathematica 2006)
        # Use DETERMINISTIC gradient ascent
        grad.load_nist_data(nist, alberty, skip_missing_reactions=True,
                            T_range=(24 + 273.15, 40 + 273.15))
        print(training_msg % (len(grad.cid2pmap_dict.keys()),
                              len(grad.data)))
        grad.cid2pmap_dict = alberty.cid2pmap_dict
        grad.deterministic_hill_climb(max_i=200)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient_deterministic")

    elif False:
        # Run the gradient ascent algorithm, where the starting point
        # arbitrary (predict all of the NIST compounds)
        grad = GradientAscent(gc)
        grad.load_nist_data(nist, skip_missing_reactions=False)
        print(training_msg % (len(grad.cid2pmap_dict.keys()),
                              len(grad.data)))
        grad.hill_climb(max_i=20000)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient3")

    elif False:
        # Use Alberty's table from (Mathematica 2006) to calculate the dG0 of
        # all possible reactions in KEGG
        grad = GradientAscent(gc)
        grad.cid2pmap_dict = alberty.cid2pmap_dict
        pH, I, T = 7, 0, 300
        counter = 0
        for rid in grad.kegg.get_all_rids():
            sparse_reaction = grad.kegg.rid2sparse_reaction(rid)
            try:
                dG0 = grad.reaction_to_dG0(sparse_reaction, pH, I, T)
                print("R%05d: dG0_r = %.2f [kJ/mol]" % (rid, dG0))
                counter += 1
            except MissingCompoundFormationEnergy:
                # reactions with a missing formation energy are skipped
                pass
        print("Managed to calculate the dG0 of %d reactions" % counter)

    elif False:
        util._mkdir("../res/nist/fig")
        csv_writer = csv.writer(open("../res/nist/pseudoisomers.csv", "w"))

        # collect every compound that takes part in a NIST reaction
        cid_set = set()
        for row in nist.data:
            cid_set.update(row['sparse'].keys())

        html_writer.write("<table border=1>\n")
        for cid in sorted(cid_set):
            html_writer.write("  <tr><td>C%05d</td><td>%s</td><td>" %
                              (cid, grad.kegg.cid2name(cid)))
            try:
                mol = grad.kegg.cid2mol(cid)
                img_fname = '../res/nist/fig/C%05d.png' % cid
                html_writer.embed_img(img_fname, "C%05d" % cid)
                mol.draw(show=False, filename=img_fname)
            except (AssertionError, KeggParseException) as e:
                html_writer.write("WARNING: cannot draw C%05d - %s" %
                                  (cid, str(e)))
            html_writer.write("</td><td>")

            if cid in alberty.cid2pmap_dict:
                for (nH, z) in alberty.cid2pmap_dict[cid].keys():
                    html_writer.write("(nH=%d, z=%d)<br>" % (nH, z))
                    csv_writer.writerow((cid, nH, z))
            else:
                nH = grad.kegg.cid2num_hydrogens(cid)
                z = grad.kegg.cid2charge(cid)
                html_writer.write("unknown pseudoisomers<br>")
                html_writer.write("(nH=%d, z=%d)" % (nH, z))
                csv_writer.writerow((cid, nH, z))
            html_writer.write("</td></tr>\n")
        html_writer.write("</table>\n")

    html_writer.close()
def main():
    """
        Write the NIST verification report to ../res/nist/report.html.

        For each estimator in a fixed list the function runs
        Nist.verify_results, records N and the RMSE, and computes the
        estimator's coverage of three reaction collections (balanced KEGG
        reactions, the Feist reactions and the unique NIST reactions). The
        results are written as one summary table at the end of the report.
    """
    html_writer = HtmlWriter("../res/nist/report.html")
    estimators = LoadAllEstimators()
    nist = Nist()
    nist.T_range = (273.15 + 24, 273.15 + 40)
    #nist.override_I = 0.25
    #nist.override_pMg = 14.0
    #nist.override_T = 298.15

    html_writer.write('<p>\n')
    html_writer.write("Total number of reaction in NIST: %d</br>\n" %
                      len(nist.data))
    html_writer.write("Total number of reaction in range %.1fK < T < %.1fK: %d</br>\n" %
                      (nist.T_range[0], nist.T_range[1],
                       len(nist.SelectRowsFromNist())))
    html_writer.write('</p>\n')

    reactions = {}
    reactions['KEGG'] = []
    for reaction in Kegg.getInstance().AllReactions():
        try:
            reaction.Balance(balance_water=True, exception_if_unknown=True)
            reactions['KEGG'].append(reaction)
        except (KeggReactionNotBalancedException, KeggParseException,
                OpenBabelError):
            # reactions that cannot be balanced or parsed are left out
            pass

    reactions['FEIST'] = Feist.FromFiles().reactions
    reactions['NIST'] = nist.GetUniqueReactionSet()

    # The pairwise comparisons are currently disabled (empty list).
    pairs = []
    #pairs += [('hatzi_gc', 'UGC')], ('PGC', 'PRC'), ('alberty', 'PRC')]
    for t1, t2 in pairs:
        logging.info('Writing the NIST report for %s vs. %s' %
                     (estimators[t1].name, estimators[t2].name))
        html_writer.write('<p><b>%s vs. %s</b> ' %
                          (estimators[t1].name, estimators[t2].name))
        html_writer.insert_toggle(start_here=True)
        two_way_comparison(html_writer=html_writer,
                           thermo1=estimators[t1],
                           thermo2=estimators[t2],
                           reaction_list=reactions['FEIST'],
                           name='%s_vs_%s' % (t1, t2))
        html_writer.div_end()
        html_writer.write('</p>')

    if False:
        estimators['alberty'].CompareOverKegg(
            html_writer,
            other=estimators['PRC'],
            fig_name='kegg_compare_alberty_vs_nist')

    rowdicts = []
    # first row: the size of every reaction collection
    rowdict = {'Method': 'Total'}
    for db_name, reaction_list in reactions.items():
        rowdict[db_name + ' coverage'] = len(reaction_list)
    rowdicts.append(rowdict)

    for name in ['UGC', 'PGC', 'PRC', 'alberty', 'merged', 'hatzi_gc']:
        thermo = estimators[name]
        logging.info('Writing the NIST report for %s' % thermo.name)
        html_writer.write('<p><b>%s</b> ' % thermo.name)
        html_writer.insert_toggle(start_here=True)
        num_estimations, rmse = nist.verify_results(html_writer=html_writer,
                                                    thermodynamics=thermo,
                                                    name=name)
        html_writer.div_end()
        html_writer.write('N = %d, RMSE = %.1f</p>\n' %
                          (num_estimations, rmse))
        logging.info('N = %d, RMSE = %.1f' % (num_estimations, rmse))

        rowdict = {'Method': thermo.name,
                   'RMSE (kJ/mol)': "%.1f (N=%d)" % (rmse, num_estimations)}
        for db_name, reaction_list in reactions.items():
            n_covered = thermo.CalculateCoverage(reaction_list)
            n_reactions = len(reaction_list)
            # An empty collection (e.g. no NIST reaction inside T_range)
            # is reported as 0% instead of raising ZeroDivisionError.
            if n_reactions:
                percent = n_covered * 100.0 / n_reactions
            else:
                percent = 0.0
            rowdict[db_name + " coverage"] = "%.1f%% (%d)" % (percent,
                                                              n_covered)
            logging.info(db_name + " coverage = %.1f%%" % percent)
        rowdicts.append(rowdict)

    headers = ['Method', 'RMSE (kJ/mol)'] + \
              [db_name + ' coverage' for db_name in reactions]
    html_writer.write_table(rowdicts, headers=headers)

    # Close the report explicitly, as the other report-writing entry points
    # do; previously the writer was never closed here.
    html_writer.close()
def main():
    """
        Start one DissociationThreads worker for every compound whose
        dissociation constants are not yet in the database table.

        The set of compounds is either (NIST + redox carriers + the formation
        energies CSV) when options.nist is set, or all KEGG compounds.
        Compounds already present in the table, and compounds that cannot be
        converted to SMILES, are skipped. Workers are started in order of
        increasing exact mass (ties broken by compound ID).
    """
    options, _ = MakeOpts().parse_args(sys.argv)
    db = SqliteDatabase(options.db_file)
    kegg = Kegg.getInstance()

    if options.override_table:
        db.Execute("DROP TABLE IF EXISTS " + options.table_name)

    DissociationConstants._CreateDatabase(
        db, options.table_name, drop_if_exists=options.override_table)

    cids_to_calculate = set()
    if options.nist:
        cids_to_calculate.update(Nist().GetAllCids())
        cids_to_calculate.update(RedoxCarriers().GetAllCids())

        ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
            "../data/thermodynamics/formation_energies.csv")
        cids_to_calculate.update(ptable.get_all_cids())
    else:
        cids_to_calculate.update(kegg.get_all_cids())

    # Do not recalculate pKas for CIDs that are already in the database
    query = "SELECT distinct(cid) FROM %s" % options.table_name
    for row in db.Execute(query):
        cids_to_calculate.discard(row[0])

    cid2smiles_and_mw = {}
    for cid in cids_to_calculate:
        # the compound CO is a special case where the conversion from InChI
        # to SMILES fails, so we add a specific override for it only
        if cid == 237:
            cid2smiles_and_mw[cid] = ("[C-]#[O+]", 28)
            continue

        try:
            comp = kegg.cid2compound(cid)
            mol = comp.GetMolecule()
            cid2smiles_and_mw[cid] = (mol.ToSmiles(), mol.GetExactMass())
        except KeggParseException:
            logging.debug("%s (C%05d) has no SMILES, skipping..." %
                          (kegg.cid2name(cid), cid))
        except OpenBabelError:
            logging.debug(
                "%s (C%05d) cannot be converted to SMILES, skipping..." %
                (kegg.cid2name(cid), cid))

    # order the remaining compounds by (exact mass, cid)
    cids_to_calculate = sorted(
        cid2smiles_and_mw,
        key=lambda cid: (cid2smiles_and_mw[cid][1], cid))

    db_lock = threading.Lock()
    semaphore = threading.Semaphore(options.n_threads)
    for cid in cids_to_calculate:
        smiles, _ = cid2smiles_and_mw[cid]
        if not smiles:
            logging.info("The following compound is blacklisted: C%05d" % cid)
            continue

        worker = DissociationThreads(
            group=None, target=None, name=None,
            args=(cid, smiles, semaphore, db_lock, options),
            kwargs={})
        worker.start()
def main():
    """
        Plot the predicted reaction energy of one reaction as a function of
        pH for the estimators that have a colour assigned below, together
        with the NIST measurements of that reaction (if any).

        The reaction is taken from options.rid when given, otherwise it is
        parsed from the last command-line argument. The figure is shown
        interactively unless options.output is set, in which case it is
        saved in SVG format.
    """
    kegg = Kegg.getInstance()
    options, args = MakeOpts().parse_args(sys.argv)
    print('Parameters: T=%f K, pMg=%.2g, I=%.2gM' %
          (options.T, options.pMg, options.I))
    print("reaction: %s" % args[-1])

    estimators = LoadAllEstimators()

    plt.rcParams['legend.fontsize'] = 8
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.size'] = 12
    plt.rcParams['lines.linewidth'] = 2

    # Only estimators with an entry here are plotted. The alpha component of
    # 'markers' used to be 3.0, which is outside the valid [0, 1] range of
    # an RGBA tuple; it is now fully opaque like the other entries.
    colormap = {}
    colormap['markers'] = (64.0/255, 111.0/255, 29.0/255, 1.0)
    #colormap['hatzi_gc'] = (54.0/255, 182.0/255, 202.0/255, 1.0)
    colormap['UGC'] = (202.0/255, 101.0/255, 54.0/255, 1.0)
    #colormap['alberty'] = (202.0/255, 54.0/255, 101.0/255, 1.0)
    colormap['PGC'] = (101.0/255, 202.0/255, 54.0/255, 1.0)

    fig = plt.figure(figsize=(6,6), dpi=90)
    fig.hold(True)

    if options.rid is None:
        reaction = GetSparseReactionInput(args[-1], kegg)
    else:
        reaction = kegg.rid2reaction(options.rid)
    reaction.Balance()
    print('Reaction: %s' % reaction.FullReactionString())

    nist = Nist()
    nist_rows = nist.SelectRowsFromNist(reaction, check_reverse=True)

    # default pH window: options.pH units wide, centred on pH 7
    pH_min = 7.0 - options.pH/2.0
    pH_max = 7.0 + options.pH/2.0

    if nist_rows:
        dG0_list = []
        pH_list = []
        for row_data in nist_rows:
            pH_list.append(row_data.pH)
            if row_data.reaction == reaction:
                dG0_list.append(row_data.dG0_r)
            else:
                # the row matched via check_reverse, so flip the sign
                dG0_list.append(-row_data.dG0_r)

        plt.plot(pH_list, dG0_list, marker='.', linestyle='none',
                 label='measured data', markeredgecolor='none',
                 markerfacecolor=colormap['markers'], markersize=5)

        # widen the window so that every measurement is inside it
        pH_max = max(pH_list + [pH_max])
        pH_min = min(pH_list + [pH_min])

    pH_range = np.arange(pH_min-0.1, pH_max+0.1, 0.02)
    for key, thermo in estimators.items():
        if key not in colormap:
            continue
        print('%s dG0 at pH=7: %.2f' %
              (key, reaction.PredictReactionEnergy(thermo,
                                                   pH=7.0,
                                                   pMg=options.pMg,
                                                   I=options.I,
                                                   T=options.T)))
        dG0 = []
        for pH in pH_range:
            dG0.append(reaction.PredictReactionEnergy(thermo,
                                                      pH=pH,
                                                      pMg=options.pMg,
                                                      I=options.I,
                                                      T=options.T))
        plt.plot(pH_range, dG0, marker='None', linestyle='solid',
                 color=colormap[key], figure=fig, label=thermo.name)

    plt.xlabel('pH')
    plt.ylabel(r'$\Delta_r G^\circ$ [kJ/mol]')
    plt.title(kegg.reaction2string(reaction), fontsize=8)
    plt.legend(loc='lower left')

    if not options.output:
        plt.tight_layout()
        plt.show()
    else:
        fig.savefig(options.output, format='svg')
def main():
    """
        Write the NIST verification report to ../res/nist/report.html.

        For each estimator in a fixed list the function runs
        Nist.verify_results, records N and the RMSE, and computes the
        estimator's coverage of three reaction collections (balanced KEGG
        reactions, the Feist reactions and the unique NIST reactions). The
        results are written as one summary table at the end of the report.
    """
    html_writer = HtmlWriter("../res/nist/report.html")
    estimators = LoadAllEstimators()
    nist = Nist()
    nist.T_range = (273.15 + 24, 273.15 + 40)
    #nist.override_I = 0.25
    #nist.override_pMg = 14.0
    #nist.override_T = 298.15

    html_writer.write('<p>\n')
    html_writer.write("Total number of reaction in NIST: %d</br>\n" %
                      len(nist.data))
    html_writer.write("Total number of reaction in range %.1fK < T < %.1fK: %d</br>\n" %
                      (nist.T_range[0], nist.T_range[1],
                       len(nist.SelectRowsFromNist())))
    html_writer.write('</p>\n')

    reactions = {}
    reactions['KEGG'] = []
    for reaction in Kegg.getInstance().AllReactions():
        try:
            reaction.Balance(balance_water=True, exception_if_unknown=True)
            reactions['KEGG'].append(reaction)
        except (KeggReactionNotBalancedException, KeggParseException,
                OpenBabelError):
            # reactions that cannot be balanced or parsed are left out
            pass

    reactions['FEIST'] = Feist.FromFiles().reactions
    reactions['NIST'] = nist.GetUniqueReactionSet()

    # The pairwise comparisons are currently disabled (empty list).
    pairs = []
    #pairs += [('hatzi_gc', 'UGC')], ('PGC', 'PRC'), ('alberty', 'PRC')]
    for t1, t2 in pairs:
        logging.info('Writing the NIST report for %s vs. %s' %
                     (estimators[t1].name, estimators[t2].name))
        html_writer.write('<p><b>%s vs. %s</b> ' %
                          (estimators[t1].name, estimators[t2].name))
        html_writer.insert_toggle(start_here=True)
        two_way_comparison(html_writer=html_writer,
                           thermo1=estimators[t1],
                           thermo2=estimators[t2],
                           reaction_list=reactions['FEIST'],
                           name='%s_vs_%s' % (t1, t2))
        html_writer.div_end()
        html_writer.write('</p>')

    if False:
        estimators['alberty'].CompareOverKegg(
            html_writer,
            other=estimators['PRC'],
            fig_name='kegg_compare_alberty_vs_nist')

    rowdicts = []
    # first row: the size of every reaction collection
    rowdict = {'Method': 'Total'}
    for db_name, reaction_list in reactions.items():
        rowdict[db_name + ' coverage'] = len(reaction_list)
    rowdicts.append(rowdict)

    for name in ['UGC', 'PGC', 'PRC', 'alberty', 'merged', 'hatzi_gc']:
        thermo = estimators[name]
        logging.info('Writing the NIST report for %s' % thermo.name)
        html_writer.write('<p><b>%s</b> ' % thermo.name)
        html_writer.insert_toggle(start_here=True)
        num_estimations, rmse = nist.verify_results(html_writer=html_writer,
                                                    thermodynamics=thermo,
                                                    name=name)
        html_writer.div_end()
        html_writer.write('N = %d, RMSE = %.1f</p>\n' %
                          (num_estimations, rmse))
        logging.info('N = %d, RMSE = %.1f' % (num_estimations, rmse))

        rowdict = {'Method': thermo.name,
                   'RMSE (kJ/mol)': "%.1f (N=%d)" % (rmse, num_estimations)}
        for db_name, reaction_list in reactions.items():
            n_covered = thermo.CalculateCoverage(reaction_list)
            n_reactions = len(reaction_list)
            # An empty collection (e.g. no NIST reaction inside T_range)
            # is reported as 0% instead of raising ZeroDivisionError.
            if n_reactions:
                percent = n_covered * 100.0 / n_reactions
            else:
                percent = 0.0
            rowdict[db_name + " coverage"] = "%.1f%% (%d)" % (percent,
                                                              n_covered)
            logging.info(db_name + " coverage = %.1f%%" % percent)
        rowdicts.append(rowdict)

    headers = ['Method', 'RMSE (kJ/mol)'] + \
              [db_name + ' coverage' for db_name in reactions]
    html_writer.write_table(rowdicts, headers=headers)

    # Close the report explicitly, as the other report-writing entry points
    # do; previously the writer was never closed here.
    html_writer.close()