def GetC1Thermodynamics(
        html_writer,
        reaction_fname='../data/thermodynamics/c1_reaction_thermodynamics.csv'):
    """Build the 'C1' reaction-based thermodynamics and report its inputs.

    Loads the Alberty pseudoisomer table from the public database, adds a
    zero-energy pseudoisomer for compound 101, and then registers every
    reaction listed in the CSV file together with its measured dG'0 and the
    conditions under which it was measured.

    Args:
        html_writer: report writer; receives a heading and one table row per
            reaction (columns: acronym, enzyme, formula, dG0_r_prime).
        reaction_fname: path of the CSV file.  Every row must provide the
            columns 'formula', 'enzyme', 'dG0_r_prime', 'pH', 'I', 'pMg'
            and 'T'; the 'acronym' column is only used for the HTML table.

    Returns:
        The ReactionThermodynamics object, after _Recalculate() was called.

    Raises:
        KeyError: if a required column is missing from a CSV row.
        ValueError: if a numeric column cannot be parsed as a float.
    """
    html_writer.write("<h1>C1 thermodynamics</h1>\n")

    db_public = SqliteDatabase('../data/public_data.sqlite')
    alberty = PsuedoisomerTableThermodynamics.FromDatabase(
        db_public, 'alberty_pseudoisomers', name='alberty')
    # Compound 101 is anchored at dG0 = 0 so reactions involving it can be
    # expressed relative to that reference.
    alberty.AddPseudoisomer(101, nH=23, z=0, nMg=0, dG0=0)

    reacthermo = ReactionThermodynamics(alberty, 'C1')
    reacthermo.pH = 7
    reacthermo.I = 0.1
    reacthermo.T = 298.15
    reacthermo.pMg = 14

    dict_list = []
    # The context manager guarantees the CSV handle is released even when a
    # row fails to parse (the original left the file open).
    with open(reaction_fname, 'r') as csv_file:
        for row in csv.DictReader(csv_file):
            r = Reaction.FromFormula(row['formula'])
            r.Balance(balance_water=False)
            r.SetNames(row['enzyme'])
            dG0_r_prime = float(row['dG0_r_prime'])
            pH, I, pMg, T = [float(row[k]) for k in ['pH', 'I', 'pMg', 'T']]
            reacthermo.AddReaction(r, dG0_r_prime, pH=pH, I=I, pMg=pMg, T=T)

            # Replace the raw formula by its hypertext form for the report.
            row['formula'] = r.to_hypertext(show_cids=False)
            dict_list.append(row)

    html_writer.write_table(
        dict_list, headers=['acronym', 'enzyme', 'formula', 'dG0_r_prime'])

    reacthermo._Recalculate()
    return reacthermo
def LoadAllEstimators():
    """Load every available thermodynamic estimator into a dictionary.

    The returned dictionary may contain the keys 'alberty', 'PRC',
    'hatzi_gc', 'BGC', 'PGC', 'UGC', 'C1', 'merged' and 'merged_C1'.
    Table-backed estimators are only added when their table exists; 'merged'
    and 'merged_C1' are only added when 'PGC' is available.  If the
    'prc_pseudoisomers' table is missing, the NIST regression is trained
    first.  Concentration bounds are loaded into every estimator before
    returning.

    Returns:
        dict mapping estimator key to estimator object.

    Raises:
        KeyError: if the 'alberty_pseudoisomers' table is missing, because
            the 'C1' estimator is built on top of estimators['alberty'].
    """
    db_public = SqliteDatabase('../data/public_data.sqlite')
    db_gibbs = SqliteDatabase('../res/gibbs.sqlite')

    if not db_gibbs.DoesTableExist('prc_pseudoisomers'):
        nist_regression = NistRegression(db_gibbs)
        nist_regression.Train()

    tables = {
        'alberty': (db_public, 'alberty_pseudoisomers', 'Alberty'),
        'PRC': (db_gibbs, 'prc_pseudoisomers', 'our method (PRC)'),
    }
    estimators = {}
    for key, (db, table_name, thermo_name) in tables.items():
        if db.DoesTableExist(table_name):
            estimators[key] = PsuedoisomerTableThermodynamics.FromDatabase(
                db, table_name, name=thermo_name)
        else:
            logging.warning('The table %s does not exist in %s',
                            table_name, db)

    estimators['hatzi_gc'] = Hatzi(use_pKa=False)
    #estimators['hatzi_gc_pka'] = Hatzi(use_pKa=True)

    # The group-contribution estimators read from db_gibbs, so that is the
    # database whose tables must be probed.  The original code tested `db`,
    # the variable left over from the loop above, which refers to whichever
    # database the dictionary happened to yield last.
    if db_gibbs.DoesTableExist('bgc_pseudoisomers'):
        estimators['BGC'] = GroupContribution(db=db_gibbs, transformed=True)
        estimators['BGC'].init()
        estimators['BGC'].name = 'our method (BGC)'

    if db_gibbs.DoesTableExist('pgc_pseudoisomers'):
        estimators['PGC'] = GroupContribution(db=db_gibbs, transformed=False)
        estimators['PGC'].init()
        estimators['PGC'].name = 'our method (PGC)'

    estimators['UGC'] = UnifiedGroupContribution(db=db_gibbs)
    estimators['UGC'].init()
    estimators['UGC'].name = 'our method (UGC)'

    estimators['C1'] = ReactionThermodynamics.FromCsv(
        '../data/thermodynamics/c1_reaction_thermodynamics.csv',
        estimators['alberty'])

    if 'PGC' in estimators:
        # Binary estimators combine a table-based source with PGC.
        estimators['merged'] = BinaryThermodynamics(estimators['alberty'],
                                                    estimators['PGC'])
        estimators['merged_C1'] = BinaryThermodynamics(estimators['C1'],
                                                       estimators['PGC'])

    for thermo in estimators.values():
        thermo.load_bounds('../data/thermodynamics/concentration_bounds.csv')

    return estimators
def compare_charges():
    """Write an HTML table comparing the most abundant pseudoisomers.

    For every compound known to either the Hatzimanikatis estimator or the
    'gc_pseudoisomers' table, the most abundant pseudoisomer under the
    chosen conditions is looked up in both estimators.  The report (written
    to ../res/groups_report.html) lists the compound, the charge reported
    by each estimator and the error stored in the 'gc_errors' table.
    """
    db_gibbs = SqliteDatabase('../res/gibbs.sqlite')
    print("Writing Compare Charges report to ../res/groups_report.html")
    html_writer = HtmlWriter("../res/groups_report.html")
    kegg = Kegg.getInstance()

    # Ionic strength 0 and pMg 14 are used here instead of the module
    # defaults; pH and T keep their default values.
    pH, I, pMg, T = default_pH, 0, 14, default_T

    cid2error = dict((int(record['cid']), record['error'])
                     for record in db_gibbs.DictReader("gc_errors"))

    estimators = {
        'hatzi': Hatzi(use_pKa=False),
        'milo': PsuedoisomerTableThermodynamics.FromDatabase(
            db_gibbs, 'gc_pseudoisomers', name='Milo Group Contribution'),
    }

    all_cids = set(lsum([est.get_all_cids() for est in estimators.values()]))

    table_rows = []
    for cid in all_cids:
        try:
            name = kegg.cid2name(cid)
            link = kegg.cid2compound(cid).get_link()
        except KeyError:
            name, link = "unknown", ""

        table_row = {
            'cid': '<a href="%s">C%05d</a>' % (link, cid),
            'name': name,
            'error': cid2error.get(cid),
        }

        for key, est in estimators.items():
            try:
                pmap = est.cid2PseudoisomerMap(cid)
                dG0, dG0_tag, nH, z, nMg = pmap.GetMostAbundantPseudoisomer(
                    pH, I, pMg, T)
            except MissingCompoundFormationEnergy:
                # Leave the cells blank for compounds this estimator lacks.
                dG0 = dG0_tag = nH = z = nMg = ""
            table_row.update({
                'nH_' + key: nH,
                'charge_' + key: z,
                'nMg_' + key: nMg,
                'dG0_' + key: dG0,
                'dG0_tag_' + key: dG0_tag,
            })

        table_rows.append(table_row)

    html_writer.write_table(
        table_rows,
        headers=['cid', 'name', 'charge_hatzi', 'charge_milo', 'error'])
    html_writer.close()
def main():
    """Train the NIST regression and write its HTML report.

    Command-line options select the output file, whether the Alberty table
    is used as a fixed prior, and the `from_database` flag passed to
    Train().  The report has three collapsible sections: the regression
    tables, the PRC results, and the comparison of transformed reaction
    energies against the observations.  The number of data points and the
    RMSE returned by VerifyResults() are logged at the end.
    """
    options, _ = MakeOpts().parse_args(sys.argv)

    db = SqliteDatabase("../res/gibbs.sqlite")
    public_db = SqliteDatabase("../data/public_data.sqlite")

    output_filename = os.path.abspath(options.output_filename)
    logging.info('Will write output to %s' % output_filename)
    html_writer = HtmlWriter(output_filename)

    def open_section(heading_html):
        # Write a section heading and open the collapsible block below it.
        html_writer.write(heading_html)
        html_writer.insert_toggle(start_here=True)

    nist_regression = NistRegression(db, html_writer=html_writer,
                                     nist=Nist(T_range=None))
    # the threshold over which to print an analysis of a reaction
    nist_regression.std_diff_threshold = 5
    # Optional overrides, left disabled in the original:
    #nist_regression.nist.override_I = 0.25
    #nist_regression.nist.override_pMg = 14.0

    html_writer.write("<h2>NIST regression:</h2>")

    prior_thermo = None
    if options.use_prior:
        logging.info('Using the data from Alberty as fixed prior')
        prior_thermo = PsuedoisomerTableThermodynamics.FromDatabase(
            public_db, 'alberty_pseudoisomers', name="Alberty")

    open_section('</br><b>Regression Tables</b>\n')
    nist_regression.Train(options.from_database, prior_thermo)
    html_writer.div_end()

    open_section('</br><b>PRC results</b>\n')
    nist_regression.WriteDataToHtml(html_writer)
    html_writer.div_end()

    open_section(
        '</br><b>Transformed reaction energies - PRC vs. Observed</b>\n')
    N, rmse = nist_regression.VerifyResults()
    html_writer.div_end()

    logging.info("Regression results for transformed data:")
    logging.info("N = %d, RMSE = %.1f" % (N, rmse))

    html_writer.close()
def AnalyzeStats(self, html_writer):
    """
        Writes statistics about the NIST database to the HTML report:
        histograms of temperature, pMg, pH, ionic strength and publication
        year, the number of compounds shared with Alberty's table, and a
        stacked bar chart of how many reactions each compound appears in.

        Raises an Exception if the database has no rows.
    """
    logging.info('Calculating statistics for NIST database (%d rows)' %
                 len(self.data))

    if not self.data:
        raise Exception("The database has no rows in it")

    # Rows with a false-y I, pMg or year are left out of the respective
    # histogram; pH and T are taken from every row.
    pH_list = [row.pH for row in self.data]
    T_list = [row.T - 273.15 for row in self.data]
    I_list = [row.I for row in self.data if row.I]
    pMg_list = [row.pMg for row in self.data if row.pMg]
    year_list = [year for year in (row.GetYear() for row in self.data)
                 if year]

    def embed_histogram(values, bins, title, xlabel, name):
        # Draw one histogram on a fresh figure and embed it in the report.
        fig = plt.figure()
        plt.hist(values, bins)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel("No. of measurements")
        html_writer.embed_matplotlib_figure(fig, width=320, height=240,
                                            name=name)

    html_writer.write("<p><h2>NIST database statistics</h2>\n")

    embed_histogram(T_list,
                    np.arange(int(min(T_list)), int(max(T_list) + 1), 2.5),
                    "Temperature histogram", "Temperature (C)", 'hist_T')
    embed_histogram(pMg_list, np.arange(0, 10.1, 0.1),
                    "pMg histogram", "pMg", 'hist_pMg')
    embed_histogram(pH_list, np.arange(4, 11, 0.1),
                    "pH histogram", "pH", 'hist_pH')
    embed_histogram(I_list, np.arange(0, 1, 0.025),
                    "Ionic Strength histogram", "Ionic Strength [M]",
                    'hist_I')
    # histogram of publication years
    embed_histogram(year_list, np.arange(1930, 2010, 5),
                    "Year of publication histogram", "Year of publication",
                    'hist_year')

    db_public = SqliteDatabase('../data/public_data.sqlite')
    alberty = PsuedoisomerTableThermodynamics.FromDatabase(
        db_public, 'alberty_pseudoisomers')
    alberty_cids = set(alberty.get_all_cids())
    nist_cids = set(self.GetAllCids())

    html_writer.write_ul([
        "Alberty #compounds = %d" % len(alberty_cids),
        "NIST #compounds = %d" % len(nist_cids),
        "intersection #compounds = %d" %
        len(alberty_cids.intersection(nist_cids)),
    ])

    N = 60  # cutoff for the number of counts in the histogram
    hist_a = np.zeros(N)  # compounds that exist in Alberty's table
    hist_b = np.zeros(N)  # compounds that do not
    for cid, cnt in self.cid2count.items():
        bucket = min(cnt, N - 1)  # everything >= N shares the last bin
        if cid in alberty_cids:
            hist_a[bucket] += 1
        else:
            hist_b[bucket] += 1
    # Bin 0 counts the Alberty compounds that appear in no counted reaction.
    hist_a[0] = len(alberty_cids.difference(self.cid2count.keys()))

    fig = plt.figure()
    plt.rc('font', size=10)
    plt.hold(True)
    p1 = plt.bar(range(N), hist_a, color='b')
    p2 = plt.bar(range(N), hist_b, color='r', bottom=hist_a)
    plt.text(N - 1, hist_a[N - 1] + hist_b[N - 1], '> %d' % (N - 1),
             fontsize=10, horizontalalignment='right',
             verticalalignment='baseline')
    plt.title("Overlap with Alberty's database")
    plt.xlabel("N reactions")
    plt.ylabel("no. of compounds measured in N reactions")
    plt.legend((p1[0], p2[0]),
               ("Exist in Alberty's database", "New compounds"))
    html_writer.embed_matplotlib_figure(fig, width=320, height=240,
                                        name='connectivity')
# nH - change in H+ # pH - the conditions in which the E' was measured # # Ferredoxin ox/red: E' = -0.380V (nE = 1, nH = 0) -> dG0 = 38.0 kJ/mol [1] # Ubiqinone ox/red: E' = 0.113V (nE = 2, nH = 2) -> dG0 = -103.2 kJ/mol [1] # Menaquinone ox/red: E' = -0.074V (nE = 2, nH = 2) -> dG0 = -65.8 kJ/mol [1] # # [1] - Thauer 1977 observed_thermo_fname = options.thermodynamics_filename print 'Loading observed thermodynamic data from %s' % observed_thermo_fname observed_thermo = PsuedoisomerTableThermodynamics.FromCsvFile( observed_thermo_fname) if thermo_source == 'hatzi_only': thermo = PsuedoisomerTableThermodynamics.FromDatabase( db, 'hatzi_thermodynamics') thermo.AddPseudoisomer(139, nH=0, z=1, nMg=0, dG0=0) # Ferrodoxin(ox) thermo.AddPseudoisomer(138, nH=0, z=0, nMg=0, dG0=38.0) # Ferrodoxin(red) thermo.AddPseudoisomer(399, nH=90, z=0, nMg=0, dG0=0) # Ubiquinone-10(ox) thermo.AddPseudoisomer(390, nH=92, z=0, nMg=0, dG0=-103.2) # Ubiquinone-10(red) thermo.AddPseudoisomer(828, nH=16, z=0, nMg=0, dG0=0) # Menaquinone(ox) thermo.AddPseudoisomer(5819, nH=18, z=0, nMg=0, dG0=-65.8) # Menaquinone(red) thermo.SetPseudoisomerMap(101, PseudoisomerMap(nH=23, z=0, nMg=0, dG0=0.0)) # THF thermo.SetPseudoisomerMap(234,
print 'SBML model filename:', options.sbml_model_filename print 'CSV output filename:', options.csv_output_filename print 'KEGG Database filename:', options.kegg_db_filename print 'Observed Thermodynamics filename:', options.thermo_filename print 'Thermodynamic Database filename:', options.db_filename print 'Group Contribution Table Name:', options.gc_table_name db = SqliteDatabase(options.db_filename) observed_thermo = PsuedoisomerTableThermodynamics.FromCsvFile( options.thermo_filename) if not db.DoesTableExist(options.gc_table_name): raise ValueError('The table %s does not exist in the database. ' 'Please run the groups.py script and try again.' % options.gc_table_name) thermo = PsuedoisomerTableThermodynamics.FromDatabase( db, options.gc_table_name) thermo.override_data(observed_thermo) kegg = Kegg.getInstance() document = libsbml.readSBML(options.sbml_model_filename) if document.getNumErrors(): raise Exception('cannot read SBML model from file %s due to error: %s' % (options.sbml_model_filename, document.getError(0).getMessage())) model = document.getModel() logging.info('Done parsing the model: ' + model.getName()) model_annotation = semanticSBML.annotate.ModelElementsAnnotations(model, suppress_errors=True) species_ids_to_names = dict([(s.getId(), s.getName()) for s in model.getListOfSpecies()]) reaction_ids_to_names = dict([(r.getId(), r.getName()) for r in model.getListOfReactions()]) rowdicts = []
def main():
    """Run the fast PRC regression and write its HTML, CSV and LaTeX reports.

    Steps:
      1. Define the anchored compounds (fixed_cids) and read the most
         abundant pseudoisomer of every compound in Alberty's table.
      2. Obtain the stoichiometric matrix S, the dG0 vector and the compound
         list, either by reverse-transforming the NIST data (and caching the
         matrices as ../res/prc_*.txt) or by loading the cached files.
      3. Solve the least-squares problem with the anchored values fixed.
      4. Classify every compound as anchored, fully determined by the fixed
         compounds, or dependent on kernel dimensions, and plot the results
         against Alberty's values.
      5. Express every non-anchored formation energy as a function of the
         sparse null-space vectors and export the table to
         ../res/regression_fast.html, ../res/prc_results.csv and
         ../res/prc_latex.txt.

    Raises:
        ValueError: if an anchored compound does not appear in the compound
            list of the stoichiometric matrix.
    """
    kegg = Kegg.getInstance()
    prefix = '../res/prc_'

    fixed_cids = {}  # a dictionary from CID to pairs of (nH, dG0)

    # Alberty formation energies directly measured, linearly independent:
    fixed_cids[1] = (2, -237.19)  # H2O
    fixed_cids[9] = (1, -1096.1)  # HPO3(-2)
    fixed_cids[14] = (4, -79.31)  # NH4(+1)
    fixed_cids[59] = (0, -744.53)  # SO4(-2)
    fixed_cids[288] = (1, -586.77)  # HCO3(-1)

    # Alberty zeros:
    fixed_cids[3] = (26, 0.0)  # NAD(ox)
    fixed_cids[10] = (32, 0.0)  # CoA
    fixed_cids[127] = (30, 0.0)  # glutathione(ox)
    fixed_cids[376] = (28, 0.0)  # retinal(ox)

    # Directly measured values
    fixed_cids[4] = (27, 22.65)  # NAD(red) -- relative to NAD(ox)
    fixed_cids[212] = (13, -194.5)  # adenosine
    #fixed_cids[294] = (12, -409.2) # inosine - linearly dependent on other 'anchors'

    # Alberty zeros which are not in NIST:
    #fixed_cids[524] = ( 0, 0.0) # cytochrome c(ox)
    #fixed_cids[16] = (31, 0.0) # FAD(ox)
    #fixed_cids[139] = ( 0, 0.0) # ferredoxin(ox)
    #fixed_cids[61] = (19, 0.0) # FMN(ox)
    #fixed_cids[343] = ( 0, 0.0) # thioredoxin(ox)
    #fixed_cids[399] = (90, 0.0) # ubiquinone(ox)

    public_db = SqliteDatabase("../data/public_data.sqlite")
    alberty = PsuedoisomerTableThermodynamics.FromDatabase(
        public_db, 'alberty_pseudoisomers', label=None, name='Alberty')
    alberty_cid2dG0 = {}
    alberty_cid2nH = {}
    for cid in alberty.get_all_cids():
        pmap = alberty.cid2PseudoisomerMap(cid)
        dG0_f, _dG0_tag, nH, _z, _nMg = pmap.GetMostAbundantPseudoisomer(
            pH=default_pH, I=default_I, pMg=default_pMg, T=default_T)
        alberty_cid2nH[cid] = nH
        alberty_cid2dG0[cid] = dG0_f

    if not os.path.exists(prefix + 'S.txt'):
        db = SqliteDatabase("../res/gibbs.sqlite")
        nist_regression = NistRegression(db)

        # Choose nH per compound: anchors first, then Alberty, then the
        # dissociation table, and finally 0 as a last resort.
        cid2nH = {}
        for cid in nist_regression.nist.GetAllCids():
            if cid in fixed_cids:
                cid2nH[cid] = fixed_cids[cid][0]
            elif cid in alberty_cid2nH:
                cid2nH[cid] = alberty_cid2nH[cid]
            else:
                tmp = nist_regression.dissociation.GetMostAbundantPseudoisomer(
                    cid, pH=default_pH, I=default_I, pMg=default_pMg,
                    T=default_T)
                if tmp is not None:
                    cid2nH[cid] = tmp[0]
                else:
                    logging.warning(
                        'The most abundant pseudoisomer of %s (C%05d) '
                        'cannot be resolved. Using nH = 0.' %
                        (kegg.cid2name(cid), cid))
                    cid2nH[cid] = 0

        #nist_regression.std_diff_threshold = 2.0 # the threshold over which to print an analysis of a reaction
        #nist_regression.nist.T_range = None#(273.15 + 24, 273.15 + 40)
        S, dG0, cids = nist_regression.ReverseTransform(cid2nH=cid2nH)

        # export the raw data matrices to text files
        C = np.array([[cid, cid2nH.get(cid, 0)] for cid in cids])
        np.savetxt(prefix + 'CID.txt', C, fmt='%d', delimiter=',')
        np.savetxt(prefix + 'S.txt', S, fmt='%g', delimiter=',')
        np.savetxt(prefix + 'dG0.txt', dG0, fmt='%.2f', delimiter=',')
    else:
        C = np.loadtxt(prefix + 'CID.txt', delimiter=',')
        cids = [int(cid) for cid in C[:, 0]]
        cid2nH = dict((cid, int(C[i, 1])) for i, cid in enumerate(cids))
        S = np.loadtxt(prefix + 'S.txt', delimiter=',')
        dG0 = np.loadtxt(prefix + 'dG0.txt', delimiter=',')
        # Turn the loaded vector into an explicit column.
        dG0 = np.reshape(dG0, (dG0.shape[0], 1))

    html_writer = HtmlWriter('../res/regression_fast.html')
    html_writer.write("<h1>Pseudoisomeric Reactant Contributions</h1>\n")
    html_writer.write("<p>The stoichiometric matrix (S):")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, S, cids)
    html_writer.div_end()
    html_writer.write('</p>')

    # Map the column index of every anchored compound to its fixed dG0.
    # (The original also built an extended matrix here that was never used.)
    index2value = {}
    for cid, (_nH, dG0_fixed) in fixed_cids.items():
        index2value[cids.index(cid)] = dG0_fixed

    solution, _K = LinearRegression.LeastSquaresWithFixedPoints(
        S, dG0, index2value)
    cid2dG0 = dict((cid, solution[i]) for i, cid in enumerate(cids))

    # Calculate the Kernel of the reduced stoichiometric matrix (after removing
    # the columns of the fixed compounds).
    cids_red = [cid for cid in cids if cid not in fixed_cids]
    index_red = [i for i in range(len(cids)) if i not in index2value]
    S_red = S[:, index_red]
    K_red = LinearRegression.Kernel(S_red)

    # Find all CIDs that are completely determined and do not depend on any
    # free variable. In other words, all zero columns in the reduced kernel.
    determined_indices = np.where(np.sum(abs(K_red), 0) < 1e-10)[0]
    determined_cids = set(cids_red[i] for i in determined_indices)

    dict_list = []
    plot_data = []
    for cid in cids:
        d = {
            'CID': 'C%05d' % cid,
            'Compound': kegg.cid2name(cid),
            'nH': '%d' % cid2nH[cid],
            'dG0 (PRC)': '%.1f' % cid2dG0[cid],
        }

        if cid in alberty_cid2dG0:
            d['dG0 (Alberty)'] = '%.1f' % alberty_cid2dG0[cid]
            # Anchored compounds match Alberty by construction, so only the
            # others are interesting in the comparison plot.
            if cid not in fixed_cids:
                plot_data.append((alberty_cid2dG0[cid], cid2dG0[cid],
                                  kegg.cid2name(cid)))
        else:
            d['dG0 (Alberty)'] = ''

        if cid in fixed_cids:
            d['Depends on'] = 'anchored'
        elif cid in determined_cids:
            d['Depends on'] = 'fixed compounds'
        else:
            d['Depends on'] = 'kernel dimensions'

        dict_list.append(d)

    dict_list.sort(key=lambda row: (row['Depends on'], row['CID']))
    html_writer.write(
        "<p>Formation energies determined by the linear constraints:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(dict_list, headers=[
        '#', 'Compound', 'CID', 'nH', 'dG0 (PRC)', 'dG0 (Alberty)',
        'Depends on'
    ])
    html_writer.write('</font>')
    html_writer.div_end()
    html_writer.write('</p>')

    # Plot a comparison between PRC and Alberty formation energies
    fig = plt.figure(figsize=(8, 8), dpi=80)
    plt.plot([point[0] for point in plot_data],
             [point[1] for point in plot_data], 'b.', figure=fig)
    for x_val, y_val, name in plot_data:
        plt.text(x_val, y_val, name, fontsize=6)
    # Raw strings: the backslashes belong to the TeX markup.
    plt.xlabel(r'Alberty $\Delta_f G^\circ$')
    plt.ylabel(r'PRC $\Delta_f G^\circ$')
    html_writer.write("<p>Plot comparing PRC and Alberty results:")
    html_writer.insert_toggle(start_here=True)
    html_writer.embed_matplotlib_figure(fig)
    html_writer.div_end()
    html_writer.write("</p>")

    K_sparse = SparseKernel(S_red).Solve()
    html_writer.write(
        "<p>The sparse null-space of the reduced stoichiometric matrix:")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, K_sparse, cids_red)
    html_writer.div_end()
    html_writer.write("</p>")

    dict_list = []
    index2string_html = dict(
        (i, "V<sub>%02d</sub>" % i) for i in range(K_sparse.shape[0]))
    index2string = dict((i, "V%d" % i) for i in range(K_sparse.shape[0]))
    for i, cid in enumerate(cids_red):
        d = {}
        d['KEGG ID'] = '<a href="%s">C%05d</a>' % (kegg.cid2link(cid), cid)
        d['KEGG ID plain'] = 'C%05d' % cid
        d['Compound'] = kegg.cid2name(cid)
        d['nH'] = '%d' % cid2nH[cid]

        if cid in alberty_cid2dG0:
            d['dG0 (Alberty)'] = '%.1f' % alberty_cid2dG0[cid]
        else:
            d['dG0 (Alberty)'] = ''

        d['dG0 (PRC)'] = '%.1f' % cid2dG0[cid]
        d['dG0 (PRC) plain'] = '%.1f' % cid2dG0[cid]

        # Sort key: which kernel vectors the compound depends on, with the
        # last vector taking precedence.
        indic = np.where(abs(K_sparse[:, i]) > 1e-10, 1, 0).tolist()
        indic.reverse()
        d['order_key'] = indic

        if mlab.rms_flat(K_sparse[:, i]) > 1e-10:
            d['dG0 (PRC)'] += " + (" + vector2string(
                K_sparse[:, i], index2string_html) + ")"
            d['dG0 (PRC) plain'] += " + (" + vector2string(
                K_sparse[:, i], index2string) + ")"
        dict_list.append(d)

    dict_list.sort(key=lambda row: (row['order_key'], row['KEGG ID plain']))

    # Export the results to CSV; the context manager makes sure the file is
    # flushed and closed (the original never closed it).
    with open('../res/prc_results.csv', 'w') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(
            ['KEGG ID', 'Compound', 'nH', 'dG0 (PRC)', 'dG0 (Alberty)'])
        for d in dict_list:
            csv_writer.writerow([
                d['KEGG ID plain'], d['Compound'], d['nH'],
                d['dG0 (PRC) plain'], d['dG0 (Alberty)']
            ])

    html_writer.write(
        "<p>All formation energies as a function of the free variables:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(dict_list, headers=[
        '#', 'KEGG ID', 'Compound', 'nH', 'dG0 (PRC)', 'dG0 (Alberty)'
    ])
    html_writer.write('</font>')
    html_writer.div_end()
    html_writer.write('</p>')

    with open('../res/prc_latex.txt', 'w') as fp:
        fp.write(
            latex.table2LaTeX(dict_list, headers=[
                '#', 'KEGG ID plain', 'Compound', 'nH', 'dG0 (PRC) plain',
                'dG0 (Alberty)'
            ]))

    # Finalize the HTML report explicitly rather than relying on garbage
    # collection.  NOTE(review): assumes HtmlWriter.close() is the intended
    # way to finish a report -- confirm against the HtmlWriter class.
    html_writer.close()