def __init__(self, db, html_writer=None, dissociation=None,
             anchor_all=False):
        """Set up a Unified Group Contribution estimator.

        Args:
            db: database handle; stored on the instance unchanged.
            html_writer: report writer; a NullHtmlWriter is substituted when
                a falsy value is given.
            dissociation: stored on the instance unchanged.
            anchor_all: when true, FORMATION_ENERGY_FILENAME points at the
                'anchor_all' variant of the formation-energy CSV.
        """
        PsuedoisomerTableThermodynamics.__init__(
            self, name="Unified Group Contribution")

        # Collaborators handed in by the caller.
        self.db = db
        if html_writer:
            self.html_writer = html_writer
        else:
            self.html_writer = NullHtmlWriter()
        self.dissociation = dissociation

        # Fixed settings.
        self.transformed = False
        self.CollapseReactions = False
        self.epsilon = 1e-10
        self.kegg = Kegg.getInstance()

        # Names of the database tables used by this estimator.
        self.STOICHIOMETRIC_TABLE_NAME = 'ugc_S'
        self.GROUP_TABLE_NAME = 'ugc_G'
        self.GIBBS_ENERGY_TABLE_NAME = 'ugc_b'
        self.ANCHORED_TABLE_NAME = 'ugc_anchored'
        self.COMPOUND_TABLE_NAME = 'ugc_compounds'
        self.OBSERVATION_TABLE_NAME = 'ugc_observations'
        self.GROUPVEC_TABLE_NAME = 'ugc_groupvectors'
        self.UNIQUE_OBSERVATION_TABLE_NAME = 'ugc_unique_observations'
        self.THERMODYNAMICS_TABLE_NAME = 'ugc_pseudoisomers'
        self.ERRORS_TABLE_NAME = 'ugc_errors'
        self.CONSERVATIONS_TABLE_NAME = 'ugc_conservations'

        # Which formation-energy CSV to use.
        self.FORMATION_ENERGY_FILENAME = (
            '../data/thermodynamics/formation_energies_anchor_all.csv'
            if anchor_all else
            '../data/thermodynamics/formation_energies.csv')
 def init(self):
     """Populate the estimator, reading from the database when possible.

     When the pseudoisomer table exists in the database, the formation
     energies are read from it and ``self.P_L_tot`` is rebuilt with one row
     per stored conservation record and one column per KEGG compound ID
     (in ascending order). Otherwise the Load* steps are run and all KEGG
     compounds are estimated.

     Raises:
         Exception: a stored conservation record mentions a compound ID
             that KEGG does not list.
     """
     if not self.db.DoesTableExist(self.THERMODYNAMICS_TABLE_NAME):
         self.LoadGroups(True)
         self.LoadObservations(True)
         self.LoadGroupVectors(True)
         self.LoadData(True)
         self.EstimateKeggCids()
         return

     logging.info('Reading thermodynamic data from database')
     table_reader = self.db.DictReader(self.THERMODYNAMICS_TABLE_NAME)
     PsuedoisomerTableThermodynamics._FromDictReader(
         table_reader, self, label=None, name="Unified Group Contribution",
         warn_for_conflicting_refs=False)

     # Each record stores a sparse vector as JSON; JSON keys are strings,
     # so they are converted back to integer compound IDs.
     conservations = []
     for record in self.db.DictReader(self.CONSERVATIONS_TABLE_NAME):
         coeff_by_cid = {}
         for cid_str, coeff in json.loads(record['json']).iteritems():
             coeff_by_cid[int(cid_str)] = coeff
         conservations.append((record['msg'], coeff_by_cid))

     logging.info('Reading conservation matrix data from database')
     sorted_cids = sorted(self.kegg.get_all_cids())
     column_of = dict((cid, col) for (col, cid) in enumerate(sorted_cids))
     self.P_L_tot = np.matrix(
         np.zeros((len(conservations), len(sorted_cids))))
     for row_idx, (_, coeff_by_cid) in enumerate(conservations):
         for cid, coeff in coeff_by_cid.iteritems():
             if cid not in column_of:
                 raise Exception("ERROR: C%05d is not found in KEGG but appears in our database"
                                 % cid)
             self.P_L_tot[row_idx, column_of[cid]] = float(coeff)
 def __init__(self, db, html_writer=None, dissociation=None, anchor_all=False):
     """Set up a Unified Group Contribution estimator.

     Args:
         db: database handle; stored on the instance unchanged.
         html_writer: report writer; a NullHtmlWriter is substituted when
             a falsy value is given.
         dissociation: stored on the instance unchanged.
         anchor_all: when true, FORMATION_ENERGY_FILENAME points at the
             'anchor_all' variant of the formation-energy CSV.
     """
     PsuedoisomerTableThermodynamics.__init__(self, name="Unified Group Contribution")

     # Collaborators handed in by the caller.
     self.db = db
     if html_writer:
         self.html_writer = html_writer
     else:
         self.html_writer = NullHtmlWriter()
     self.dissociation = dissociation

     # Fixed settings.
     self.transformed = False
     self.CollapseReactions = False
     self.epsilon = 1e-10
     self.kegg = Kegg.getInstance()

     # Names of the database tables used by this estimator.
     self.STOICHIOMETRIC_TABLE_NAME = 'ugc_S'
     self.GROUP_TABLE_NAME = 'ugc_G'
     self.GIBBS_ENERGY_TABLE_NAME = 'ugc_b'
     self.ANCHORED_TABLE_NAME = 'ugc_anchored'
     self.COMPOUND_TABLE_NAME = 'ugc_compounds'
     self.OBSERVATION_TABLE_NAME = 'ugc_observations'
     self.GROUPVEC_TABLE_NAME = 'ugc_groupvectors'
     self.UNIQUE_OBSERVATION_TABLE_NAME = 'ugc_unique_observations'
     self.THERMODYNAMICS_TABLE_NAME = 'ugc_pseudoisomers'
     self.ERRORS_TABLE_NAME = 'ugc_errors'
     self.CONSERVATIONS_TABLE_NAME = 'ugc_conservations'

     # Which formation-energy CSV to use.
     self.FORMATION_ENERGY_FILENAME = (
         '../data/thermodynamics/formation_energies_anchor_all.csv'
         if anchor_all else
         '../data/thermodynamics/formation_energies.csv')
class TestThermoJsonOutput(unittest.TestCase):
    def setUp(self):
        fake_csv_file = StringIO(CSV_DATA)
        csv_reader = csv.DictReader(fake_csv_file)
        self.fake_thermo_csv = PsuedoisomerTableThermodynamics()
        self.fake_thermo_csv = PsuedoisomerTableThermodynamics._FromDictReader(
            csv_reader, self.fake_thermo_csv, warn_for_conflicting_refs=False)

        db = SqliteDatabase(PUBLIC_DB_FNAME)
        db_reader = db.DictReader('fake_pseudoisomers')
        self.fake_thermo_db = PsuedoisomerTableThermodynamics()
        self.fake_thermo_db = PsuedoisomerTableThermodynamics._FromDictReader(
            db_reader, self.fake_thermo_db, warn_for_conflicting_refs=False)

    def testGetJsonDictionary(self):
        json_list = [
            self.fake_thermo_csv.GetJSONDictionary(),
            self.fake_thermo_db.GetJSONDictionary()
        ]

        for json_data in json_list:
            self.assertEqual(json_data[0]['cid'], 1)
            self.assertEqual(json_data[0]['source'], 'Alberty (2003)')
            self.assertEqual(json_data[0]['inchi'], 'InChI=1S/H2O/h1H2')
            self.assertEqual(json_data[0]['num_electrons'], 10)
            self.assertAlmostEqual(json_data[0]['species'][0]['dG0_f'],
                                   -237.19,
                                   places=5)

            self.assertEqual(json_data[1]['cid'], 2)
            self.assertEqual(json_data[1]['source'], 'Alberty (2003)')
            self.assertEqual(
                json_data[1]['inchi'],
                'InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(17)6(16)4(26-10)1-25-30(21,22)28-31(23,24)27-29(18,19)20/h2-4,6-7,10,16-17H,1H2,(H,21,22)(H,23,24)(H2,11,12,13)(H2,18,19,20)/t4-,6-,7-,10-/m1/s1'
            )
            self.assertEqual(json_data[1]['num_electrons'], 260)
            self.assertEqual(json_data[1]['species'][0]['nH'], 12)
            self.assertEqual(json_data[1]['species'][0]['nMg'], 0)
            self.assertEqual(json_data[1]['species'][0]['z'], -4)
            self.assertAlmostEqual(json_data[1]['species'][0]['dG0_f'],
                                   -2768.1,
                                   places=5)
            self.assertEqual(json_data[1]['species'][1]['nH'], 12)
            self.assertEqual(json_data[1]['species'][1]['nMg'], 1)
            self.assertEqual(json_data[1]['species'][1]['z'], -2)
            self.assertAlmostEqual(json_data[1]['species'][1]['dG0_f'],
                                   -3258.7,
                                   places=5)

            self.assertEqual(json_data[2]['cid'], 7)
            self.assertEqual(json_data[2]['source'], 'Alberty (2003)')
            self.assertEqual(json_data[2]['inchi'], 'InChI=1S/O2/c1-2')
            self.assertEqual(json_data[2]['num_electrons'], 16)
            self.assertAlmostEqual(json_data[2]['species'][0]['dG0_f'],
                                   16.4,
                                   places=5)
Example #5
0
 def __init__(self, db, dissociation=None,
              html_writer=None, nist=None):
     """Set up the regression object.

     Args:
         db: database handle; stored on the instance unchanged.
         dissociation: dissociation-constant data to use; stored on the
             instance (None when not supplied).
         html_writer: report writer; a NullHtmlWriter is substituted when
             a falsy value is given.
         nist: NIST data object; a default Nist() is created when a falsy
             value is given.
     """
     PsuedoisomerTableThermodynamics.__init__(self)
     self.db = db
     self.html_writer = html_writer or NullHtmlWriter()
     self.nist = nist or Nist()
     # Keep the caller-supplied value. Previously the argument was accepted
     # but unconditionally replaced by None, so it could never take effect.
     self.dissociation = dissociation

     self.cid2pmap_dict = {}

     self.assume_no_pKa_by_default = False
     # With an infinite threshold no standard deviation can exceed it.
     self.std_diff_threshold = np.inf
Example #6
0
 def init(self):
     """Load the group data, obtain the contributions, then read the
     pseudoisomer table from the database into this object.

     Contributions are read from the database when the contribution table
     exists; otherwise the model is trained and all KEGG compounds are
     estimated.
     """
     self.LoadGroups(True)
     self.LoadObservations(True)
     self.LoadGroupVectors(True)

     has_contributions = self.db.DoesTableExist(self.CONTRIBUTION_TABLE_NAME)
     if not has_contributions:
         self.Train()
         self.EstimateKeggCids()
     else:
         self.LoadContributionsFromDB()

     table_reader = self.db.DictReader(self.THERMODYNAMICS_TABLE_NAME)
     PsuedoisomerTableThermodynamics._FromDictReader(
         table_reader,
         self,
         label=None,
         name="Group Contribution",
         warn_for_conflicting_refs=False)
Example #7
0
def main():
    """Export reaction and compound energies based on the Alberty table.

    Computes a transformed energy for every compound in the table at fixed
    conditions, builds a stoichiometric matrix from the KEGG reactions whose
    compounds are all covered by the table, and writes four CSV files under
    '../res/arren': the stoichiometric matrix, the reaction energies, the EC
    numbers and the compound energies.
    """
    pH, pMg, I, T = (7.0, 3, 0.1, 298.15)

    # NOTE(review): `db` is not used below; it is kept because constructing
    # the handle may have side effects -- confirm and remove.
    db = SqliteDatabase('../res/gibbs.sqlite')
    kegg = Kegg.getInstance()
    # NOTE(review): a CSV path is passed straight to the constructor here,
    # while other callers use FromCsvFile/FromDatabase -- confirm this is
    # the intended way to load the table.
    alberty = PsuedoisomerTableThermodynamics(
        '../data/thermodynamics/alberty_pseudoisomers.csv')

    cids = alberty.get_all_cids()
    # Built once: membership set and compound -> column lookup. setdefault
    # keeps the first position of a compound, exactly like list.index().
    known_cids = set(cids)
    cid2column = {}
    for column, cid in enumerate(cids):
        cid2column.setdefault(cid, column)

    dG0_f = pylab.zeros((len(cids), 1))
    for i, cid in enumerate(cids):
        dG0_f[i, 0] = alberty.cid2dG0_tag(cid, pH=pH, pMg=pMg, I=I, T=T)

    rids = []
    ec_numbers = []
    S_rows = []

    for rid in kegg.get_all_rids():
        sparse = kegg.rid2sparse_reaction(rid)
        # Skip reactions that involve a compound missing from the table.
        if not known_cids.issuperset(sparse.keys()):
            continue

        rids.append(rid)
        ec_numbers.append(kegg.rid2ec_list(rid))
        S_row = pylab.zeros((1, len(cids)))
        for cid, coeff in sparse.iteritems():
            S_row[0, cid2column[cid]] = coeff
        S_rows.append(S_row)

    # Stack once at the end instead of re-copying the matrix per reaction.
    if S_rows:
        S = pylab.vstack(S_rows)
    else:
        S = pylab.zeros((0, len(cids)))

    dG0_r = pylab.dot(S, dG0_f)

    util._mkdir('../res/arren')
    # The with-blocks guarantee that every output file is flushed and closed,
    # also when a row fails to format.
    with open('../res/arren/stoichiomety.csv', 'w') as s_file, \
            open('../res/arren/reactions.csv', 'w') as r_file, \
            open('../res/arren/ec_numbers.csv', 'w') as e_file:
        s_writer = csv.writer(s_file)
        r_writer = csv.writer(r_file)
        e_writer = csv.writer(e_file)
        r_writer.writerow(['rid', 'dG0_r'])
        e_writer.writerow(['rid', 'ec0', 'ec1', 'ec2', 'ec3'])
        for i in xrange(S.shape[0]):
            s_writer.writerow(["%d" % x for x in S[i, :]])
            # One row per EC number; the EC list is ';'-separated.
            for ec in ec_numbers[i].split(';'):
                e_writer.writerow(['%d' % rids[i]] + ec.split('.'))
            r_writer.writerow(["%d" % rids[i], '%.1f' % dG0_r[i, 0]])

    with open('../res/arren/compounds.csv', 'w') as c_file:
        c_writer = csv.writer(c_file)
        c_writer.writerow(['cid', 'dG0_f'])
        for j in xrange(len(cids)):
            c_writer.writerow(['%d' % cids[j], '%.1f' % dG0_f[j, 0]])
Example #8
0
def GetC1Thermodynamics(
        html_writer,
        reaction_fname='../data/thermodynamics/c1_reaction_thermodynamics.csv'
):
    """Build a ReactionThermodynamics object from a CSV file of reactions.

    The Alberty pseudoisomer table from the public database is the base;
    each CSV row adds one reaction with its transformed energy and the
    conditions at which it was given. The rows are also written to
    `html_writer` as a table.

    Args:
        html_writer: writer that receives the heading and the reaction table.
        reaction_fname: path of the CSV file; the columns 'formula',
            'enzyme', 'dG0_r_prime', 'pH', 'I', 'pMg' and 'T' are read, and
            'acronym' is listed in the table.

    Returns:
        The ReactionThermodynamics object, after recalculation.
    """
    html_writer.write("<h1>C1 thermodynamics</h1>\n")

    dict_list = []
    db_public = SqliteDatabase('../data/public_data.sqlite')
    alberty = PsuedoisomerTableThermodynamics.FromDatabase(
        db_public, 'alberty_pseudoisomers', name='alberty')
    # NOTE(review): adds a zero-energy pseudoisomer for compound 101; the
    # reason is not visible here -- confirm.
    alberty.AddPseudoisomer(101, nH=23, z=0, nMg=0, dG0=0)
    reacthermo = ReactionThermodynamics(alberty, 'C1')
    reacthermo.pH = 7
    reacthermo.I = 0.1
    reacthermo.T = 298.15
    reacthermo.pMg = 14

    c1_reactions = []
    # The with-block closes the CSV file; it used to be left open.
    with open(reaction_fname, 'r') as reaction_file:
        for row in csv.DictReader(reaction_file):
            r = Reaction.FromFormula(row['formula'])
            r.Balance(balance_water=False)
            r.SetNames(row['enzyme'])
            dG0_r_prime = float(row['dG0_r_prime'])
            pH, I, pMg, T = [float(row[k]) for k in ['pH', 'I', 'pMg', 'T']]
            reacthermo.AddReaction(r, dG0_r_prime, pH=pH, I=I, pMg=pMg, T=T)
            c1_reactions.append(r)

            # Replace the raw formula by its hypertext form for the report.
            row['formula'] = r.to_hypertext(show_cids=False)
            dict_list.append(row)

    html_writer.write_table(
        dict_list, headers=['acronym', 'enzyme', 'formula', 'dG0_r_prime'])

    reacthermo._Recalculate()
    return reacthermo
Example #9
0
def main():
    """Export reaction and compound energies based on the Alberty table.

    Computes a transformed energy for every compound in the table at fixed
    conditions, builds a stoichiometric matrix from the KEGG reactions whose
    compounds are all covered by the table, and writes four CSV files under
    '../res/arren': the stoichiometric matrix, the reaction energies, the EC
    numbers and the compound energies.
    """
    pH, pMg, I, T = (7.0, 3, 0.1, 298.15)

    # NOTE(review): `db` is not used below; it is kept because constructing
    # the handle may have side effects -- confirm and remove.
    db = SqliteDatabase('../res/gibbs.sqlite')
    kegg = Kegg.getInstance()
    # NOTE(review): a CSV path is passed straight to the constructor here,
    # while other callers use FromCsvFile/FromDatabase -- confirm this is
    # the intended way to load the table.
    alberty = PsuedoisomerTableThermodynamics('../data/thermodynamics/alberty_pseudoisomers.csv')

    cids = alberty.get_all_cids()
    # Built once: membership set and compound -> column lookup. setdefault
    # keeps the first position of a compound, exactly like list.index().
    known_cids = set(cids)
    cid2column = {}
    for column, cid in enumerate(cids):
        cid2column.setdefault(cid, column)

    dG0_f = pylab.zeros((len(cids), 1))
    for i, cid in enumerate(cids):
        dG0_f[i, 0] = alberty.cid2dG0_tag(cid, pH=pH, pMg=pMg, I=I, T=T)

    rids = []
    ec_numbers = []
    S_rows = []

    for rid in kegg.get_all_rids():
        sparse = kegg.rid2sparse_reaction(rid)
        # Skip reactions that involve a compound missing from the table.
        if not known_cids.issuperset(sparse.keys()):
            continue

        rids.append(rid)
        ec_numbers.append(kegg.rid2ec_list(rid))
        S_row = pylab.zeros((1, len(cids)))
        for cid, coeff in sparse.iteritems():
            S_row[0, cid2column[cid]] = coeff
        S_rows.append(S_row)

    # Stack once at the end instead of re-copying the matrix per reaction.
    if S_rows:
        S = pylab.vstack(S_rows)
    else:
        S = pylab.zeros((0, len(cids)))

    dG0_r = pylab.dot(S, dG0_f)

    util._mkdir('../res/arren')
    # The with-blocks guarantee that every output file is flushed and closed,
    # also when a row fails to format.
    with open('../res/arren/stoichiomety.csv', 'w') as s_file, \
            open('../res/arren/reactions.csv', 'w') as r_file, \
            open('../res/arren/ec_numbers.csv', 'w') as e_file:
        s_writer = csv.writer(s_file)
        r_writer = csv.writer(r_file)
        e_writer = csv.writer(e_file)
        r_writer.writerow(['rid', 'dG0_r'])
        e_writer.writerow(['rid', 'ec0', 'ec1', 'ec2', 'ec3'])
        for i in xrange(S.shape[0]):
            s_writer.writerow(["%d" % x for x in S[i,:]])
            # One row per EC number; the EC list is ';'-separated.
            for ec in ec_numbers[i].split(';'):
                e_writer.writerow(['%d' % rids[i]] + ec.split('.'))
            r_writer.writerow(["%d" % rids[i], '%.1f' % dG0_r[i,0]])

    with open('../res/arren/compounds.csv', 'w') as c_file:
        c_writer = csv.writer(c_file)
        c_writer.writerow(['cid', 'dG0_f'])
        for j in xrange(len(cids)):
            c_writer.writerow(['%d' % cids[j], '%.1f' % dG0_f[j, 0]])
Example #10
0
    def init(self):
        """Load the group data, obtain the contributions, then read the
        pseudoisomer table from the database into this object.

        Contributions are read from the database when the contribution table
        exists; otherwise the model is trained and all KEGG compounds are
        estimated.
        """
        self.LoadGroups(True)
        self.LoadObservations(True)
        self.LoadGroupVectors(True)

        has_contributions = self.db.DoesTableExist(
            self.CONTRIBUTION_TABLE_NAME)
        if not has_contributions:
            self.Train()
            self.EstimateKeggCids()
        else:
            self.LoadContributionsFromDB()

        table_reader = self.db.DictReader(self.THERMODYNAMICS_TABLE_NAME)
        PsuedoisomerTableThermodynamics._FromDictReader(
            table_reader, self, label=None, name="Group Contribution",
            warn_for_conflicting_refs=False)
Example #11
0
    def __init__(self, db, html_writer=None, transformed=False):
        """Construct a GroupContribution instance.

        Args:
            db: the database handle to read from.
            html_writer: the HtmlWriter to write to; a NullHtmlWriter is
                used when a falsy value is given.
            transformed: stored on the instance; also selects the prefix of
                every table name ('bgc' when true, 'pgc' otherwise).
        """
        PsuedoisomerTableThermodynamics.__init__(self,
                                                 name="Group Contribution")
        self.db = db
        if html_writer:
            self.html_writer = html_writer
        else:
            self.html_writer = NullHtmlWriter()
        self.dissociation = None
        self.transformed = transformed

        self.epsilon = 1e-10

        self.kegg = Kegg.getInstance()
        # Private copy, so that changes do not reach the shared Kegg object.
        self.bounds = deepcopy(self.kegg.cid2bounds)

        # Not yet available at construction time.
        self.group_nullspace = None
        self.group_contributions = None
        self.obs_collection = None
        self.cid2groupvec = None

        self.cid2error = {}

        prefix = 'bgc' if transformed else 'pgc'

        self.OBSERVATION_TABLE_NAME = prefix + '_observations'
        self.GROUPVEC_TABLE_NAME = prefix + '_groupvector'
        self.NULLSPACE_TABLE_NAME = prefix + '_nullspace'
        self.CONTRIBUTION_TABLE_NAME = prefix + '_contribution'
        self.REGRESSION_TABLE_NAME = prefix + '_regression'

        self.THERMODYNAMICS_TABLE_NAME = prefix + '_pseudoisomers'
        self.STOICHIOMETRIC_MATRIX_TABLE_NAME = prefix + '_stoichiometry'
        # The 'TALBE' spelling is part of the attribute names as they stand.
        self.ANCHORED_CONTRIBUTIONS_TALBE_NAME = prefix + '_anchored_g'
        self.ANCHORED_CIDS_TABLE_NAME = prefix + '_anchored_cids'
        self.ANCHORED_P_L_TALBE_NAME = prefix + '_anchored_P_L'
Example #12
0
    def __init__(self, db, html_writer=None, transformed=False):
        """Construct a GroupContribution instance.

        Args:
            db: the database handle to read from.
            html_writer: the HtmlWriter to write to; a NullHtmlWriter is
                used when a falsy value is given.
            transformed: stored on the instance; also selects the prefix of
                every table name ('bgc' when true, 'pgc' otherwise).
        """
        PsuedoisomerTableThermodynamics.__init__(self, name="Group Contribution")
        self.db = db
        if html_writer:
            self.html_writer = html_writer
        else:
            self.html_writer = NullHtmlWriter()
        self.dissociation = None
        self.transformed = transformed

        self.epsilon = 1e-10

        self.kegg = Kegg.getInstance()
        # Private copy, so that changes do not reach the shared Kegg object.
        self.bounds = deepcopy(self.kegg.cid2bounds)

        # Not yet available at construction time.
        self.group_nullspace = None
        self.group_contributions = None
        self.obs_collection = None
        self.cid2groupvec = None

        self.cid2error = {}

        prefix = 'bgc' if transformed else 'pgc'

        self.OBSERVATION_TABLE_NAME = prefix + '_observations'
        self.GROUPVEC_TABLE_NAME = prefix + '_groupvector'
        self.NULLSPACE_TABLE_NAME = prefix + '_nullspace'
        self.CONTRIBUTION_TABLE_NAME = prefix + '_contribution'
        self.REGRESSION_TABLE_NAME = prefix + '_regression'

        self.THERMODYNAMICS_TABLE_NAME = prefix + '_pseudoisomers'
        self.STOICHIOMETRIC_MATRIX_TABLE_NAME = prefix + '_stoichiometry'
        # The 'TALBE' spelling is part of the attribute names as they stand.
        self.ANCHORED_CONTRIBUTIONS_TALBE_NAME = prefix + '_anchored_g'
        self.ANCHORED_CIDS_TABLE_NAME = prefix + '_anchored_cids'
        self.ANCHORED_P_L_TALBE_NAME = prefix + '_anchored_P_L'
Example #13
0
def LoadAllEstimators():
    """Create every available thermodynamic estimator.

    Returns:
        A dict mapping a short key ('alberty', 'PRC', 'hatzi_gc', 'BGC',
        'PGC', 'UGC', 'C1', 'merged', 'merged_C1') to its estimator.
        Table-based entries are present only when their table exists; the
        concentration bounds are loaded into every returned estimator.
    """
    db_public = SqliteDatabase('../data/public_data.sqlite')
    db_gibbs = SqliteDatabase('../res/gibbs.sqlite')

    # Train the NIST regression when its result table is missing.
    if not db_gibbs.DoesTableExist('prc_pseudoisomers'):
        nist_regression = NistRegression(db_gibbs)
        nist_regression.Train()

    tables = {
        'alberty': (db_public, 'alberty_pseudoisomers', 'Alberty'),
        'PRC': (db_gibbs, 'prc_pseudoisomers', 'our method (PRC)')
    }
    estimators = {}
    for key, (table_db, table_name, thermo_name) in tables.iteritems():
        if table_db.DoesTableExist(table_name):
            estimators[key] = PsuedoisomerTableThermodynamics.FromDatabase(
                table_db, table_name, name=thermo_name)
        else:
            logging.warning('The table %s does not exist in %s' %
                            (table_name, str(table_db)))

    estimators['hatzi_gc'] = Hatzi(use_pKa=False)
    #estimators['hatzi_gc_pka'] = Hatzi(use_pKa=True)

    # The group-contribution estimators read from db_gibbs, so that is the
    # database to probe. The old code probed the loop variable left over
    # from the loop above, which could be either database depending on
    # dict iteration order.
    if db_gibbs.DoesTableExist('bgc_pseudoisomers'):
        estimators['BGC'] = GroupContribution(db=db_gibbs, transformed=True)
        estimators['BGC'].init()
        estimators['BGC'].name = 'our method (BGC)'

    if db_gibbs.DoesTableExist('pgc_pseudoisomers'):
        estimators['PGC'] = GroupContribution(db=db_gibbs, transformed=False)
        estimators['PGC'].init()
        estimators['PGC'].name = 'our method (PGC)'

    estimators['UGC'] = UnifiedGroupContribution(db=db_gibbs)
    estimators['UGC'].init()
    estimators['UGC'].name = 'our method (UGC)'

    # NOTE(review): raises KeyError when the Alberty table was missing
    # above (only a warning is logged there) -- confirm this is intended.
    estimators['C1'] = ReactionThermodynamics.FromCsv(
        '../data/thermodynamics/c1_reaction_thermodynamics.csv',
        estimators['alberty'])

    if 'PGC' in estimators:
        estimators['merged'] = BinaryThermodynamics(estimators['alberty'],
                                                    estimators['PGC'])
        estimators['merged_C1'] = BinaryThermodynamics(estimators['C1'],
                                                       estimators['PGC'])

    for thermo in estimators.values():
        thermo.load_bounds('../data/thermodynamics/concentration_bounds.csv')

    return estimators
Example #14
0
def compare_charges():
    """Write an HTML table comparing, per compound, the charge of the most
    abundant pseudoisomer according to the 'hatzi' and 'milo' estimators,
    next to the error stored for that compound in the 'gc_errors' table.
    """
    db_gibbs = SqliteDatabase('../res/gibbs.sqlite')
    print("Writing Compare Charges report to ../res/groups_report.html")
    html_writer = HtmlWriter("../res/groups_report.html")
    kegg = Kegg.getInstance()

    # I=0 and pMg=14 are fixed here; pH and T come from the module defaults.
    pH, I, pMg, T = default_pH, 0, 14, default_T

    cid2error = dict((int(record['cid']), record['error'])
                     for record in db_gibbs.DictReader("gc_errors"))

    estimators = {}
    estimators['hatzi'] = Hatzi(use_pKa=False)
    estimators['milo'] = PsuedoisomerTableThermodynamics.FromDatabase(
        db_gibbs, 'gc_pseudoisomers', name='Milo Group Contribution')

    cid_lists = [est.get_all_cids() for est in estimators.values()]
    all_cids = set(lsum(cid_lists))

    table_rows = []
    for cid in all_cids:
        try:
            name = kegg.cid2name(cid)
            link = kegg.cid2compound(cid).get_link()
        except KeyError:
            # Compound unknown to KEGG: use placeholders.
            name, link = "unknown", ""

        table_row = {
            'cid': '<a href="%s">C%05d</a>' % (link, cid),
            'name': name,
            'error': cid2error.get(cid, None)
        }
        for est_key, estimator in estimators.iteritems():
            try:
                pmap = estimator.cid2PseudoisomerMap(cid)
                dG0, dG0_tag, nH, z, nMg = pmap.GetMostAbundantPseudoisomer(
                    pH, I, pMg, T)
            except MissingCompoundFormationEnergy:
                # No estimate from this estimator: leave the cells empty.
                dG0 = dG0_tag = nH = z = nMg = ""
            table_row.update({
                'nH_' + est_key: nH,
                'charge_' + est_key: z,
                'nMg_' + est_key: nMg,
                'dG0_' + est_key: dG0,
                'dG0_tag_' + est_key: dG0_tag,
            })
        table_rows.append(table_row)

    html_writer.write_table(
        table_rows,
        headers=['cid', 'name', 'charge_hatzi', 'charge_milo', 'error'])
    html_writer.close()
    def init(self):
        """Populate the estimator, reading from the database when possible.

        When the pseudoisomer table exists in the database, the formation
        energies are read from it and ``self.P_L_tot`` is rebuilt with one
        row per stored conservation record and one column per KEGG compound
        ID (in ascending order). Otherwise the Load* steps are run and all
        KEGG compounds are estimated.

        Raises:
            Exception: a stored conservation record mentions a compound ID
                that KEGG does not list.
        """
        if not self.db.DoesTableExist(self.THERMODYNAMICS_TABLE_NAME):
            self.LoadGroups(True)
            self.LoadObservations(True)
            self.LoadGroupVectors(True)
            self.LoadData(True)
            self.EstimateKeggCids()
            return

        logging.info('Reading thermodynamic data from database')
        table_reader = self.db.DictReader(self.THERMODYNAMICS_TABLE_NAME)
        PsuedoisomerTableThermodynamics._FromDictReader(
            table_reader, self, label=None,
            name="Unified Group Contribution",
            warn_for_conflicting_refs=False)

        # Each record stores a sparse vector as JSON; JSON keys are strings,
        # so they are converted back to integer compound IDs.
        conservations = []
        for record in self.db.DictReader(self.CONSERVATIONS_TABLE_NAME):
            coeff_by_cid = {}
            for cid_str, coeff in json.loads(record['json']).iteritems():
                coeff_by_cid[int(cid_str)] = coeff
            conservations.append((record['msg'], coeff_by_cid))

        logging.info('Reading conservation matrix data from database')
        sorted_cids = sorted(self.kegg.get_all_cids())
        column_of = dict((cid, col) for (col, cid) in enumerate(sorted_cids))
        shape = (len(conservations), len(sorted_cids))
        self.P_L_tot = np.matrix(np.zeros(shape))
        for row_idx, (_, coeff_by_cid) in enumerate(conservations):
            for cid, coeff in coeff_by_cid.iteritems():
                if cid not in column_of:
                    raise Exception(
                        "ERROR: C%05d is not found in KEGG but appears in our database"
                        % cid)
                self.P_L_tot[row_idx, column_of[cid]] = float(coeff)
    def setUp(self):
        """Load the same fake pseudoisomer data from CSV text and from the
        'fake_pseudoisomers' database table."""
        csv_rows = csv.DictReader(StringIO(CSV_DATA))
        self.fake_thermo_csv = PsuedoisomerTableThermodynamics()
        self.fake_thermo_csv = PsuedoisomerTableThermodynamics._FromDictReader(
            csv_rows, self.fake_thermo_csv, warn_for_conflicting_refs=False)

        public_db = SqliteDatabase(PUBLIC_DB_FNAME)
        db_rows = public_db.DictReader('fake_pseudoisomers')
        self.fake_thermo_db = PsuedoisomerTableThermodynamics()
        self.fake_thermo_db = PsuedoisomerTableThermodynamics._FromDictReader(
            db_rows, self.fake_thermo_db, warn_for_conflicting_refs=False)
Example #17
0
def main():
    """Train the NIST regression and write an HTML report.

    The report has three collapsible sections: the regression tables, the
    PRC results, and a comparison of the results against the observations.
    The number of data points and the RMSE of that comparison are logged.
    """
    options, _ = MakeOpts().parse_args(sys.argv)

    db = SqliteDatabase("../res/gibbs.sqlite")
    public_db = SqliteDatabase("../data/public_data.sqlite")
    output_filename = os.path.abspath(options.output_filename)
    logging.info('Will write output to %s' % output_filename)

    html_writer = HtmlWriter(output_filename)

    def begin_section(heading):
        # Write a heading followed by the start of a collapsible section.
        html_writer.write(heading)
        html_writer.insert_toggle(start_here=True)

    nist = Nist(T_range=None)
    nist_regression = NistRegression(db, html_writer=html_writer, nist=nist)
    # the threshold over which to print an analysis of a reaction
    nist_regression.std_diff_threshold = 5
    #nist_regression.nist.T_range = None(273.15 + 24, 273.15 + 40)
    #nist_regression.nist.override_I = 0.25
    #nist_regression.nist.override_pMg = 14.0

    html_writer.write("<h2>NIST regression:</h2>")
    prior_thermo = None
    if options.use_prior:
        logging.info('Using the data from Alberty as fixed prior')
        prior_thermo = PsuedoisomerTableThermodynamics.FromDatabase(
            public_db, 'alberty_pseudoisomers', name="Alberty")

    begin_section('</br><b>Regression Tables</b>\n')
    nist_regression.Train(options.from_database, prior_thermo)
    html_writer.div_end()

    begin_section('</br><b>PRC results</b>\n')
    nist_regression.WriteDataToHtml(html_writer)
    html_writer.div_end()

    begin_section('</br><b>Transformed reaction energies - PRC vs. Observed</b>\n')
    N, rmse = nist_regression.VerifyResults()
    html_writer.div_end()

    logging.info("Regression results for transformed data:")
    logging.info("N = %d, RMSE = %.1f" % (N, rmse))

    html_writer.close()
def main():
    """Write the transformed formation energy of every 'testing' compound
    in the formation-energy file to
    '../res/formation_energies_transformed.csv'.
    """
    ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
        FormationEnergyFileName, label='testing')
    kegg = Kegg.getInstance()
    pH, I, pMg, T = (7.0, 0.25, 14, 298.15)

    # The with-block guarantees the output is flushed and closed, also when
    # a compound fails halfway; the file used to be left open.
    with open('../res/formation_energies_transformed.csv', 'w') as out_file:
        output_csv = csv.writer(out_file)
        output_csv.writerow([
            "cid", "name", "dG'0", "pH", "I", "pMg", "T", "anchor",
            "compound_ref", "remark"
        ])
        for cid in ptable.get_all_cids():
            pmap = ptable.cid2PseudoisomerMap(cid)
            dG0_prime = pmap.Transform(pH=pH, I=I, pMg=pMg, T=T)
            # NOTE(review): the header names ten columns but only nine
            # values are written (no 'remark') -- confirm this is intended.
            output_csv.writerow([
                cid,
                kegg.cid2name(cid),
                "%.1f" % dG0_prime, pH, I, pMg, T, 1,
                ptable.cid2source_string[cid]
            ])
def main():
    """Transform the 'testing' formation energies through the compounds'
    dissociation tables and write them to
    '../res/formation_energies_transformed.csv'.

    Raises:
        Exception: a compound has more than one species in the input table,
            or no dissociation table could be obtained for it.
    """
    pH, I, pMg, T = 7.0, 0.25, 14.0, 298.15

    dissociation = DissociationConstants.FromPublicDB()
    kegg = Kegg.getInstance()
    obs_fname = "../data/thermodynamics/formation_energies.csv"
    res_fname = "../res/formation_energies_transformed.csv"

    train_species = PsuedoisomerTableThermodynamics.FromCsvFile(
        obs_fname, label='testing')

    # The with-block guarantees the output is flushed and closed, also when
    # one of the exceptions below aborts the loop; the file used to be left
    # open.
    with open(res_fname, 'w') as res_file:
        csv_out = csv.writer(res_file)
        csv_out.writerow([
            'cid', 'name', "dG'0", 'pH', 'I', 'pMg', 'T', 'anchor',
            'compound_ref', 'remark'
        ])
        for cid in train_species.get_all_cids():
            pmap = train_species.cid2PseudoisomerMap(cid)
            source = train_species.cid2source_string[cid]
            # ToMatrix returns tuples of (nH, z, nMg, dG0)
            pmatrix = pmap.ToMatrix()
            if len(pmatrix) != 1:
                raise Exception("multiple training species for C%05d" % cid)
            nH, charge, nMg, dG0 = pmatrix[0]
            name = "%s (%d)" % (kegg.cid2name(cid), nH)
            logging.info('Adding the formation energy of %s', name)
            diss_table = dissociation.GetDissociationTable(
                cid, create_if_missing=True)
            if diss_table is None:
                raise Exception("%s [C%05d, nH=%d, nMg=%d] does not have a "
                                "dissociation table" % (name, cid, nH, nMg))

            # Set the known energy and charge of this species, then
            # transform to the chosen conditions.
            diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
            diss_table.SetCharge(nH, charge, nMg)
            dG0_prime = diss_table.Transform(pH, I, pMg, T)
            csv_out.writerow([
                cid,
                kegg.cid2name(cid),
                "%.1f" % dG0_prime, pH, I, pMg, T, True, source, None
            ])
Example #20
0
    def ReadFormationEnergies(self):
        """
            Loads the 'training' and 'testing' formation energies and fills
            self.formation_dict with, per compound, the tuple
            (label, ref, dG0_prime, dG0, nH, charge, nMg).
            The (nH, nMg) of each compound is recorded in self.cid2nH_nMg,
            and its formation energy is set on its dissociation table.

            Raises an Exception when a compound has more than one species,
            disagrees with the (nH, nMg) already recorded for it, or has
            no dissociation table.
        """

        self.formation_dict = {}

        for label in ('training', 'testing'):
            ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
                self.FormationEnergyFileName, label=label)
            for cid in ptable.get_all_cids():
                species = ptable.cid2PseudoisomerMap(cid).ToMatrix()
                if len(species) != 1:
                    raise Exception("multiple training species for C%05d" %
                                    cid)
                nH, charge, nMg, dG0 = species[0]

                # Record the pseudoisomer, or check it against the one
                # already on record.
                if cid not in self.cid2nH_nMg:
                    self.cid2nH_nMg[cid] = (nH, nMg)
                elif (nH, nMg) != self.cid2nH_nMg[cid]:
                    raise Exception(
                        "The pseudoisomer of C%05d "
                        "in the formation energy table (nH=%d) "
                        "is not consistent with the pKa table (nH=%d)." %
                        (cid, nH, self.cid2nH_nMg[cid][0]))

                diss_table = self.dissociation.GetDissociationTable(cid, False)
                if diss_table is None:
                    raise Exception("C%05d has no pKa data" % cid)
                diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
                dG0_prime = diss_table.Transform(
                    pH=self.pH, I=self.I, pMg=self.pMg, T=self.T)
                ref = ptable.cid2SourceString(cid)
                entry = (label, ref, dG0_prime, dG0, nH, charge, nMg)
                self.formation_dict[cid] = entry
    def GetTransfromedReactionEnergies(self,
                                       S,
                                       cids,
                                       pH=None,
                                       I=None,
                                       pMg=None,
                                       T=None,
                                       conc=1):
        """Return the reaction energies of S, with NaN for every reaction
        that violates a conservation law.

        The energies come from the base-class implementation; when `conc`
        is not 1, a concentration term is added to them. A reaction counts
        as violating when the summed absolute product of self.P_L_tot
        (restricted to the columns of `cids`) with S exceeds self.epsilon.
        """
        energies = PsuedoisomerTableThermodynamics.GetTransfromedReactionEnergies(
            self, S, cids, pH, I, pMg, T)
        if conc != 1:
            pH, I, pMg, T = self.GetConditions(pH, I, pMg, T)
            energies += AddConcentrationsToReactionEnergies(S, cids, T, conc)

        # P_L_tot has one column per KEGG compound in ascending order, so
        # look up the column of every compound that S uses.
        sorted_cids = sorted(self.kegg.get_all_cids())
        columns = []
        for cid in cids:
            columns.append(sorted_cids.index(cid))

        # mark the reactions in S that break any conservation law
        leak = self.P_L_tot[:, columns] * S
        violations = abs(leak).sum(0) > self.epsilon
        energies[violations] = np.nan
        return energies
Example #22
0
def ExportJSONFiles():
    """Export the conservation vectors and the KEGG compounds, reactions
    and enzymes as JSON files, at the paths given on the command line.

    Before exporting, two thermodynamic sources are registered with the
    Kegg instance: the selected estimator (priority 1) and the CSV file
    named on the command line (priority 2).
    """
    estimators = LoadAllEstimators()
    options, _ = MakeOpts(estimators).parse_args(sys.argv)

    thermo_list = [
        estimators[options.thermodynamics_source],
        PsuedoisomerTableThermodynamics.FromCsvFile(
            options.thermodynamics_csv),
    ]

    # Make sure we have all the data.
    kegg = Kegg.getInstance()
    for priority, thermo in enumerate(thermo_list, 1):
        print("Priority %d - formation energies of: %s" % (priority, thermo.name))
        kegg.AddThermodynamicData(thermo, priority=priority)

    db = SqliteDatabase('../res/gibbs.sqlite')

    print('Exporting Group Contribution Nullspace matrix as JSON.')
    nullspace_vectors = []
    for record in db.DictReader('ugc_conservations'):
        msg = record['msg']
        sparse = json.loads(record['json'])
        # One [coefficient, "Cxxxxx"] pair per compound of the vector.
        reaction = [[coeff, "C%05d" % int(cid)]
                    for cid, coeff in sparse.iteritems()]
        nullspace_vectors.append({'msg': msg, 'reaction': reaction})
    WriteJSONFile(nullspace_vectors, options.nullspace_out_filename)

    print('Exporting KEGG compounds as JSON.')
    WriteJSONFile(kegg.AllCompounds(), options.compounds_out_filename)

    print('Exporting KEGG reactions as JSON.')
    WriteJSONFile(kegg.AllReactions(), options.reactions_out_filename)

    print('Exporting KEGG enzymes as JSON.')
    WriteJSONFile(kegg.AllEnzymes(), options.enzymes_out_filename)
Example #23
0
# Remaining fields of the NIST row; the object itself is created earlier in
# the script, outside this excerpt.
nist_row_data.pMg = 14
nist_row_data.evaluation = "A"
nist_row_data.url = ""
nist_row_data.ref_id = ""
# Alternative reaction and hard-coded species data, currently disabled:
#nist_row_data.reaction = Reaction(["triose isomerase"], {111:-1, 118:1})
#cid2nH = {111:6, 118:5}
#cid2min_dG0 = {111:-1296.3, 118:-1288.6}

# The reaction as a sparse dict of {compound ID: coefficient}.
# NOTE(review): the name is still "triose isomerase" although the compounds
# differ from the disabled reaction above -- confirm the name is correct.
nist_row_data.reaction = Reaction(["triose isomerase"], {
    1: -1,
    78: -1,
    22: 1,
    14: 1,
    463: 1
})
# Load the training species and record, per compound, the number of
# hydrogens and the formation energy of its single species.
train_species = PsuedoisomerTableThermodynamics.FromCsvFile(
    '../data/thermodynamics/dG0.csv')
cid2nH = {}
cid2dG0 = {}
for cid in train_species.get_all_cids():
    pmap = train_species.cid2PseudoisomerMap(cid)
    pmatrix = pmap.ToMatrix()
    # Exactly one species per compound is required.
    if len(pmatrix) != 1:
        raise Exception("C%05d has more than one species in the training set" %
                        cid)
    cid2nH[cid] = pmatrix[0][0]  # ToMatrix returns tuples of (nH, z, nMg, dG0)
    cid2dG0[cid] = pmatrix[0][3]

# Accumulators for the loop over the reaction's compounds that follows.
ddG0_r = 0
dG0_r = 0
dG0_tag_r = 0
for cid, coeff in nist_row_data.reaction.sparse.iteritems():
Example #24
0
    def EstimateKeggCids(self):
        """
            Estimates the formation energies of the entire set of compounds in
            KEGG and writes the results to the database, in the table named by
            self.THERMODYNAMICS_TABLE_NAME.

            For every CID (in ascending order) the pseudoisomer map is taken from:
                1) the observed species, i.e. the rows of formation_energies.csv
                   loaded with label='testing' plus the redox carriers (oxidized
                   form at dG0 = 0, reduced form at dG0 = ddG0). If exactly one
                   species is observed and a dissociation table exists, the
                   other pseudoisomers are completed from the dissociation table.
                2) otherwise, the Group Contribution estimate of the compound's
                   group vector, expanded using its dissociation table.
            Compounds with no observation and no dissociation table, or with no
            group vector, are skipped and a warning is written to the HTML report.

            Side effects:
                resets self.cid2pmap_dict and self.cid2source_string, writes a
                section to self.html_writer and stores the table in self.db.
        """
        logging.info("Estimating formation energies for all KEGG")

        observed_species = PsuedoisomerTableThermodynamics.FromCsvFile(
            '../data/thermodynamics/formation_energies.csv', label='testing')

        # the redox carriers are added as observations: the oxidized form is
        # the reference (dG0 = 0) and the reduced form carries the difference
        for rc in RedoxCarriers().itervalues():
            observed_species.AddPseudoisomer(rc.cid_ox,
                                             nH=rc.nH_ox,
                                             z=rc.z_ox,
                                             nMg=0,
                                             dG0=0.0,
                                             ref=rc.ref)
            observed_species.AddPseudoisomer(rc.cid_red,
                                             nH=rc.nH_red,
                                             z=rc.z_red,
                                             nMg=0,
                                             dG0=rc.ddG0,
                                             ref=rc.ref)
            observed_species.cid2source_string[rc.cid_ox] = rc.ref
            observed_species.cid2source_string[rc.cid_red] = rc.ref

        # observed_species does not change from here on, so the membership
        # test used for every KEGG compound can rely on a set built only once
        observed_cids = set(observed_species.get_all_cids())

        self.cid2pmap_dict = {}
        self.cid2source_string = {}

        self.html_writer.write(
            '</br><b>Estimated formation energies for KEGG compounds</b>\n')
        self.html_writer.insert_toggle(start_here=True)
        for cid in sorted(self.kegg.get_all_cids()):
            self.html_writer.write('<b>C%05d - %s</b></br>\n' %
                                   (cid, self.kegg.cid2name(cid)))

            diss_table = self.GetDissociationTable(cid)
            if cid in observed_cids:
                pmap_obs = observed_species.cid2PseudoisomerMap(cid)
                self.cid2source_string[
                    cid] = observed_species.cid2SourceString(cid)
                pmatrix = pmap_obs.ToMatrix(
                )  # returns a list of (nH, z, nMg, dG0)
                if len(pmatrix) == 1 and diss_table is not None:
                    # assume that only the most abundant pseudoisomer is given
                    # and complete the formation energies of the others using the
                    # pKa values in the dissociation table
                    nH, _z, nMg, dG0 = pmatrix[0]
                    diss_table.SetFormationEnergyByNumHydrogens(dG0=dG0,
                                                                nH=nH,
                                                                nMg=nMg)
                    pmap = diss_table.GetPseudoisomerMap()
                    self.SetPseudoisomerMap(cid, pmap)
                else:
                    # either there is no dissociation table, or the number of
                    # observed species is not 1; use the observations verbatim
                    if diss_table is not None:
                        logging.warning("C%05d has multiple training species, "
                                        "overriding the dissociation table" %
                                        cid)
                    self.SetPseudoisomerMap(cid, pmap_obs)
            elif diss_table is None:
                self.html_writer.write('Warning: no dissociation table</br>\n')
                continue
            else:
                nH, nMg = self.cid2nH_nMg[cid]
                groupvector = self.cid2groupvec.get(cid, None)
                if groupvector is None:
                    self.html_writer.write(
                        'Warning: no group vector (%s)</br>\n' %
                        self.cid2error[cid])
                    continue
                try:
                    dG0 = self.groupvec2val(groupvector)
                except GroupMissingTrainDataError as e:
                    # in this case we do not care if a compound violated the group
                    # conservation laws because it might cancel out later when we
                    # use it to calculate reactions.
                    dG0 = e.value
                    self.html_writer.write('Warning: %s</br>\n' % str(e))
                    logging.debug("C%05d: %s" % (cid, str(e)))

                self.cid2source_string[cid] = "Group Contribution"

                if self.transformed:
                    # the estimate is a transformed energy at the default
                    # conditions
                    diss_table.SetTransformedFormationEnergy(dG0_tag=dG0,
                                                             pH=default_pH,
                                                             I=default_I,
                                                             pMg=default_pMg,
                                                             T=default_T)
                else:
                    diss_table.SetFormationEnergyByNumHydrogens(dG0=dG0,
                                                                nH=nH,
                                                                nMg=nMg)
                pmap = diss_table.GetPseudoisomerMap()
                self.SetPseudoisomerMap(cid, pmap)
            self.cid2PseudoisomerMap(cid).WriteToHTML(self.html_writer)

        self.html_writer.div_end()

        logging.info("Writing the results to the database")
        self.ToDatabase(self.db, self.THERMODYNAMICS_TABLE_NAME)
Example #25
0
    def AnalyzeStats(self, html_writer):
        """
            Produces a set of plots that show some statistics about the NIST database

            The report contains histograms of the temperature (in Celsius),
            pMg, pH, ionic strength and publication year of the rows in
            self.data, the number of compounds shared with the
            'alberty_pseudoisomers' table, and a stacked bar chart of the
            number of compounds as a function of the number of reactions in
            which they are measured (self.cid2count).

            Arguments:
                html_writer - receives the text and the embedded figures

            Raises:
                Exception - if self.data is empty
        """
        logging.info('Calculating statistics for NIST database (%d rows)' %
                     len(self.data))

        if not self.data:
            raise Exception("The database has no rows in it")

        def embed_histogram(values, bins, title, xlabel, name):
            # draws a single histogram on a new figure and embeds it in the report
            fig = plt.figure()
            plt.hist(values, bins)
            plt.title(title)
            plt.xlabel(xlabel)
            plt.ylabel("No. of measurements")
            html_writer.embed_matplotlib_figure(fig,
                                                width=320,
                                                height=240,
                                                name=name)

        T_list = []
        I_list = []
        pH_list = []
        pMg_list = []
        year_list = []
        for nist_row_data in self.data:
            pH_list.append(nist_row_data.pH)
            T_list.append(nist_row_data.T - 273.15)  # Kelvin -> Celsius

            # falsy values (e.g. None or 0) are left out of these histograms
            if nist_row_data.I:
                I_list.append(nist_row_data.I)
            if nist_row_data.pMg:
                pMg_list.append(nist_row_data.pMg)

            year = nist_row_data.GetYear()
            if year:
                year_list.append(year)

        html_writer.write("<p><h2>NIST database statistics</h2>\n")
        embed_histogram(T_list,
                        np.arange(int(min(T_list)), int(max(T_list) + 1), 2.5),
                        "Temperature histogram", "Temperature (C)", 'hist_T')
        embed_histogram(pMg_list, np.arange(0, 10.1, 0.1), "pMg histogram",
                        "pMg", 'hist_pMg')
        embed_histogram(pH_list, np.arange(4, 11, 0.1), "pH histogram", "pH",
                        'hist_pH')
        embed_histogram(I_list, np.arange(0, 1, 0.025),
                        "Ionic Strength histogram", "Ionic Strength [M]",
                        'hist_I')
        # histogram of publication years
        embed_histogram(year_list, np.arange(1930, 2010, 5),
                        "Year of publication histogram",
                        "Year of publication", 'hist_year')

        db_public = SqliteDatabase('../data/public_data.sqlite')
        alberty = PsuedoisomerTableThermodynamics.FromDatabase(
            db_public, 'alberty_pseudoisomers')
        alberty_cids = set(alberty.get_all_cids())
        nist_cids = set(self.GetAllCids())

        count_list = [
            "Alberty #compounds = %d" % len(alberty_cids),
            "NIST #compounds = %d" % len(nist_cids),
            "intersection #compounds = %d" %
            len(alberty_cids.intersection(nist_cids))
        ]
        html_writer.write_ul(count_list)

        N = 60  # cutoff for the number of counts in the histogram
        hist_a = np.zeros(N)  # compounds that also appear in Alberty's table
        hist_b = np.zeros(N)  # compounds that appear only in NIST
        for cid, cnt in self.cid2count.iteritems():
            # all the counts above the cutoff are collapsed into the last bin
            bin_index = min(cnt, N - 1)
            if cid in alberty_cids:
                hist_a[bin_index] += 1
            else:
                hist_b[bin_index] += 1
        # bin 0 holds the Alberty compounds that have no count in cid2count
        hist_a[0] = len(alberty_cids.difference(self.cid2count.keys()))

        fig = plt.figure()
        plt.rc('font', size=10)
        # NOTE(review): pyplot.hold is deprecated in newer matplotlib releases;
        # confirm the matplotlib version this script is run with
        plt.hold(True)
        p1 = plt.bar(range(N), hist_a, color='b')
        p2 = plt.bar(range(N), hist_b, color='r', bottom=hist_a[0:N])
        plt.text(N - 1,
                 hist_a[N - 1] + hist_b[N - 1],
                 '> %d' % (N - 1),
                 fontsize=10,
                 horizontalalignment='right',
                 verticalalignment='baseline')
        plt.title("Overlap with Alberty's database")
        plt.xlabel("N reactions")
        plt.ylabel("no. of compounds measured in N reactions")
        plt.legend((p1[0], p2[0]),
                   ("Exist in Alberty's database", "New compounds"))

        html_writer.embed_matplotlib_figure(fig,
                                            width=320,
                                            height=240,
                                            name='connectivity')
    # dG0 =  -E'*nE*F - R*T*ln(10)*nH*pH
    # Where:
    #    F  = 96.48 kC/mol
    #    nE - change in e-
    #    nH - change in H+
    #    pH - the conditions in which the E' was measured
    #
    # Ferredoxin  ox/red: E' = -0.380V (nE = 1, nH = 0) -> dG0 = 38.0 kJ/mol [1]
    # Ubiquinone  ox/red: E' =  0.113V (nE = 2, nH = 2) -> dG0 = -103.2 kJ/mol [1]
    # Menaquinone ox/red: E' = -0.074V (nE = 2, nH = 2) -> dG0 = -65.8 kJ/mol [1]
    #
    # [1] - Thauer 1977

    observed_thermo_fname = options.thermodynamics_filename
    print 'Loading observed thermodynamic data from %s' % observed_thermo_fname
    observed_thermo = PsuedoisomerTableThermodynamics.FromCsvFile(
        observed_thermo_fname)

    if thermo_source == 'hatzi_only':
        thermo = PsuedoisomerTableThermodynamics.FromDatabase(
            db, 'hatzi_thermodynamics')
        thermo.AddPseudoisomer(139, nH=0, z=1, nMg=0, dG0=0)  # Ferrodoxin(ox)
        thermo.AddPseudoisomer(138, nH=0, z=0, nMg=0,
                               dG0=38.0)  # Ferrodoxin(red)
        thermo.AddPseudoisomer(399, nH=90, z=0, nMg=0,
                               dG0=0)  # Ubiquinone-10(ox)
        thermo.AddPseudoisomer(390, nH=92, z=0, nMg=0,
                               dG0=-103.2)  # Ubiquinone-10(red)
        thermo.AddPseudoisomer(828, nH=16, z=0, nMg=0,
                               dG0=0)  # Menaquinone(ox)
        thermo.AddPseudoisomer(5819, nH=18, z=0, nMg=0,
                               dG0=-65.8)  # Menaquinone(red)
Example #27
0
    def ConvertFormation2Reaction(self, output_fname):
        """
            Converts the formation energies (both the 'training' and the
            'testing' rows of self.FormationEnergyFileName) to formation
            reactions and writes them to a CSV file in the TECRDB format.

            Each formation energy is transformed to the conditions
            (self.pH, self.I, self.pMg, self.T) and written as the reaction
            that forms the compound from its elements. Compounds without a
            dissociation table or without an atom bag are skipped.

            Arguments:
                output_fname - the path of the CSV file (overwritten if exists)

            Raises:
                Exception - if an element cannot be found in KEGG, or if the
                            number of species given for a compound is not 1
        """
        logging.info("Converting all formation energies to reactions")

        # the 'with' block guarantees that the rows are flushed and that the
        # file is closed, also when an exception aborts the conversion
        with open(output_fname, 'w') as output_file:
            output_csv = csv.writer(output_file)

            # keep the format used for TECRDB
            output_csv.writerow(
                ('ref', 'ID', 'method', 'eval', 'EC', 'name', 'kegg_reaction',
                 'reaction', 'dG0\'', 'T', 'I', 'pH', 'pMg'))

            # map each atom symbol to the KEGG compound of its element and to
            # the stoichiometry of that element
            atom2cid = {}
            for atom, (name,
                       stoich) in KeggObservation.ATOM2ELEMENT.iteritems():
                cid, _, _ = self.kegg.name2cid(name, 0)
                if cid is None:
                    raise Exception(
                        "Cannot find the element %s in the KEGG database" %
                        name)
                atom2cid[atom] = (cid, stoich)
                #output_csv.writerow(('element',
                #                     'C%05d' % cid, 'formation', 'A', '',
                #                     'formation of %s' % self.kegg.cid2name(cid),
                #                     "C%05d" % cid,
                #                     name, 0, self.T, self.I, self.pH, self.pMg))

            for label in ['training', 'testing']:
                ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
                    self.FormationEnergyFileName, label=label)
                for cid in ptable.get_all_cids():
                    pmatrix = ptable.cid2PseudoisomerMap(cid).ToMatrix()
                    if len(pmatrix) != 1:
                        raise Exception("multiple training species for C%05d" %
                                        cid)
                    nH, _charge, nMg, dG0 = pmatrix[0]
                    # NOTE(review): 'dissociation' is not defined in this
                    # method, so it must come from the enclosing scope --
                    # confirm that it is not meant to be an attribute of self
                    diss_table = dissociation.GetDissociationTable(cid, False)
                    if diss_table is None:
                        continue
                    diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
                    dG0_prime = diss_table.Transform(pH=self.pH,
                                                     I=self.I,
                                                     pMg=self.pMg,
                                                     T=self.T)
                    ref = ptable.cid2SourceString(cid)

                    atom_bag = self.kegg.cid2atom_bag(cid)
                    if not atom_bag:
                        continue

                    ne = self.kegg.cid2num_electrons(cid)
                    elem_ne = 0  # no. of electrons in the non-H elements
                    sparse = {cid: 1}
                    for elem, count in atom_bag.iteritems():
                        if elem == 'H':
                            continue
                        elem_ne += count * Molecule.GetAtomicNum(elem)
                        elem_cid, elem_coeff = atom2cid[elem]
                        sparse.setdefault(elem_cid, 0)
                        sparse[elem_cid] += -count * elem_coeff

                    # use the H element to balance the electrons in the formation
                    # reactions (we don't need to balance protons since this is
                    # a biochemical reaction, so H+ are 'free').
                    H_cid, H_coeff = atom2cid['H']
                    sparse[H_cid] = (elem_ne - ne) * H_coeff
                    reaction = Reaction(
                        "formation of %s" % self.kegg.cid2name(cid), sparse)

                    output_csv.writerow(
                        (ref, 'C%05d' % cid, 'formation', 'A', '',
                         'formation of %s' % self.kegg.cid2name(cid),
                         reaction.FullReactionString(),
                         reaction.FullReactionString(show_cids=False),
                         '%.2f' % dG0_prime, self.T, self.I, self.pH,
                         self.pMg))
Example #28
0
def main():
    """Run the Pseudoisomeric Reactant Contributions (PRC) regression report.

    The stoichiometric matrix (S), the reaction energies (dG0) and the
    compound list (cids) are computed from the NIST regression the first time
    and cached in '../res/prc_S.txt', '../res/prc_dG0.txt' and
    '../res/prc_CID.txt'; later runs load them from these files.  The
    formation energies are then solved by least squares while a set of
    anchored compounds (fixed_cids) is held at given values.

    Outputs: the HTML report '../res/regression_fast.html', the table
    '../res/prc_results.csv' and its LaTeX version '../res/prc_latex.txt'.

    Raises:
        ValueError: if one of the anchored compounds is missing from cids.
    """
    kegg = Kegg.getInstance()
    prefix = '../res/prc_'

    fixed_cids = {}  # a dictionary from CID to pairs of (nH, dG0)

    # Alberty formation energies directly measured, linearly independent:
    fixed_cids[1] = (2, -237.19)  # H2O
    fixed_cids[9] = (1, -1096.1)  # HPO3(-2)
    fixed_cids[14] = (4, -79.31)  # NH4(+1)
    fixed_cids[59] = (0, -744.53)  # SO4(-2)
    fixed_cids[288] = (1, -586.77)  # HCO3(-1)

    # Alberty zeros:
    fixed_cids[3] = (26, 0.0)  # NAD(ox)
    fixed_cids[10] = (32, 0.0)  # CoA
    fixed_cids[127] = (30, 0.0)  # glutathione(ox)
    fixed_cids[376] = (28, 0.0)  # retinal(ox)

    # Directly measured values
    fixed_cids[4] = (27, 22.65)  # NAD(red) -- relative to NAD(ox)
    fixed_cids[212] = (13, -194.5)  # adenosine
    #fixed_cids[294] = (12, -409.2) # inosine - linearly dependent on other 'anchors'

    # Alberty zeros which are not in NIST:
    #fixed_cids[524] = ( 0, 0.0) # cytochrome c(ox)
    #fixed_cids[16]  = (31, 0.0) # FAD(ox)
    #fixed_cids[139] = ( 0, 0.0) # ferredoxin(ox)
    #fixed_cids[61]  = (19, 0.0) # FMN(ox)
    #fixed_cids[343] = ( 0, 0.0) # thioredoxin(ox)
    #fixed_cids[399] = (90, 0.0) # ubiquinone(ox)

    # Alberty's values (most abundant pseudoisomer at the default conditions)
    # are used for choosing nH and as the reference in the comparison.
    public_db = SqliteDatabase("../data/public_data.sqlite")
    alberty = PsuedoisomerTableThermodynamics.FromDatabase(
        public_db, 'alberty_pseudoisomers', label=None, name='Alberty')
    alberty_cid2dG0 = {}
    alberty_cid2nH = {}
    for cid in alberty.get_all_cids():
        pmap = alberty.cid2PseudoisomerMap(cid)
        dG0, _dG0_tag, nH, _z, _nMg = pmap.GetMostAbundantPseudoisomer(
            pH=default_pH, I=default_I, pMg=default_pMg, T=default_T)
        alberty_cid2nH[cid] = nH
        alberty_cid2dG0[cid] = dG0

    if not os.path.exists(prefix + 'S.txt'):
        db = SqliteDatabase("../res/gibbs.sqlite")
        nist_regression = NistRegression(db)

        # Choose nH for each compound; precedence: anchored value, Alberty,
        # the dissociation table, and finally 0 with a warning.
        cid2nH = {}
        for cid in nist_regression.nist.GetAllCids():
            if cid in fixed_cids:
                cid2nH[cid] = fixed_cids[cid][0]
            elif cid in alberty_cid2nH:
                cid2nH[cid] = alberty_cid2nH[cid]
            else:
                tmp = nist_regression.dissociation.GetMostAbundantPseudoisomer(
                    cid,
                    pH=default_pH,
                    I=default_I,
                    pMg=default_pMg,
                    T=default_T)
                if tmp is not None:
                    cid2nH[cid] = tmp[0]
                else:
                    logging.warning(
                        'The most abundant pseudoisomer of %s (C%05d) '
                        'cannot be resolved. Using nH = 0.' %
                        (kegg.cid2name(cid), cid))
                    cid2nH[cid] = 0

        #nist_regression.std_diff_threshold = 2.0 # the threshold over which to print an analysis of a reaction
        #nist_regression.nist.T_range = None#(273.15 + 24, 273.15 + 40)
        S, dG0, cids = nist_regression.ReverseTransform(cid2nH=cid2nH)

        # export the raw data matrices to text files

        C = np.array([[cid, cid2nH.get(cid, 0)] for cid in cids])
        np.savetxt(prefix + 'CID.txt', C, fmt='%d', delimiter=',')
        np.savetxt(prefix + 'S.txt', S, fmt='%g', delimiter=',')
        np.savetxt(prefix + 'dG0.txt', dG0, fmt='%.2f', delimiter=',')
    else:
        # reload the cached matrices; the CID file has the columns (cid, nH)
        C = np.loadtxt(prefix + 'CID.txt', delimiter=',')
        cids = [int(cid) for cid in C[:, 0]]
        cid2nH = {}
        for i, cid in enumerate(cids):
            cid2nH[cid] = int(C[i, 1])
        S = np.loadtxt(prefix + 'S.txt', delimiter=',')
        dG0 = np.loadtxt(prefix + 'dG0.txt', delimiter=',')
        dG0 = np.reshape(dG0, (dG0.shape[0], 1))  # make it a column vector

    html_writer = HtmlWriter('../res/regression_fast.html')
    html_writer.write("<h1>Pseudoisomeric Reactant Contributions</h1>\n")
    html_writer.write("<p>The stoichiometric matrix (S):")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, S, cids)
    html_writer.div_end()
    html_writer.write('</p>')

    # Map the column index of every anchored compound to its fixed value.
    # (The previously built "extended" stoichiometric matrix was never used
    # by the regression and has been dropped.)
    index2value = {}
    for cid, (_nH, dG0_fixed) in fixed_cids.iteritems():
        index2value[cids.index(cid)] = dG0_fixed

    x, _K = LinearRegression.LeastSquaresWithFixedPoints(S, dG0, index2value)
    cid2dG0 = {}
    for i, cid in enumerate(cids):
        cid2dG0[cid] = x[i]

    # Calculate the Kernel of the reduced stoichiometric matrix (after removing
    # the columns of the fixed compounds).
    cids_red = [cid for cid in cids if cid not in fixed_cids]
    index_red = [i for i in xrange(len(cids)) if i not in index2value]
    S_red = S[:, index_red]
    K_red = LinearRegression.Kernel(S_red)

    #print "Reduced Stoichiometric Matrix:"
    #print matrix2string(S_red, cids_red, kegg)
    #print '-'*80

    # Find all CIDs that are completely determined and do not depend on any
    # free variable. In other words, all zeros columns in K2.
    dict_list = []

    determined_indices = np.where(
        np.sum(abs(K_red), 0) < 1e-10)[0]  # all zero-columns in reducedK
    # a set, since it is only used for membership tests
    determined_cids = set(cids_red[i] for i in determined_indices)
    plot_data = []
    for cid in cids:
        d = {
            'CID': 'C%05d' % cid,
            'Compound': kegg.cid2name(cid),
            'nH': '%d' % cid2nH[cid],
            'dG0 (PRC)': '%.1f' % cid2dG0[cid]
        }
        if cid in alberty_cid2dG0:
            d['dG0 (Alberty)'] = '%.1f' % alberty_cid2dG0[cid]
            # anchored compounds are left out of the comparison plot
            if cid not in fixed_cids:
                plot_data.append(
                    (alberty_cid2dG0[cid], cid2dG0[cid], kegg.cid2name(cid)))
        else:
            d['dG0 (Alberty)'] = ''

        if cid in fixed_cids:
            d['Depends on'] = 'anchored'
        elif cid in determined_cids:
            d['Depends on'] = 'fixed compounds'
        else:
            d['Depends on'] = 'kernel dimensions'

        dict_list.append(d)

    dict_list.sort(key=lambda row: (row['Depends on'], row['CID']))
    html_writer.write(
        "<p>Formation energies determined by the linear constraints:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(dict_list,
                            headers=[
                                '#', 'Compound', 'CID', 'nH', 'dG0 (PRC)',
                                'dG0 (Alberty)', 'Depends on'
                            ])
    html_writer.write('</font>')
    html_writer.div_end()
    html_writer.write('</p>')

    # Plot a comparison between PRC and Alberty formation energies
    # (distinct loop names keep the regression solution 'x' intact)
    fig = plt.figure(figsize=(8, 8), dpi=80)
    plt.plot([point[0] for point in plot_data],
             [point[1] for point in plot_data],
             'b.',
             figure=fig)
    for alberty_dG0, prc_dG0, name in plot_data:
        plt.text(alberty_dG0, prc_dG0, name, fontsize=6)
    # raw strings, so that the LaTeX backslashes are never read as escapes
    plt.xlabel(r'Alberty $\Delta_f G^\circ$')
    plt.ylabel(r'PRC $\Delta_f G^\circ$')
    html_writer.write("<p>Plot comparing PRC and Alberty results:")
    html_writer.insert_toggle(start_here=True)
    html_writer.embed_matplotlib_figure(fig)
    html_writer.div_end()
    html_writer.write("</p>")

    K_sparse = SparseKernel(S_red).Solve()
    html_writer.write(
        "<p>The sparse null-space of the reduced stoichiometric matrix:")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, K_sparse, cids_red)
    html_writer.div_end()
    html_writer.write("</p>")

    # Express every non-anchored formation energy as its PRC value plus a
    # combination of the free variables V0, V1, ... (the rows of K_sparse).
    dict_list = []
    index2string_html = dict(
        (i, "V<sub>%02d</sub>" % i) for i in xrange(K_sparse.shape[0]))
    index2string = dict((i, "V%d" % i) for i in xrange(K_sparse.shape[0]))
    for i, cid in enumerate(cids_red):
        d = {}
        d['KEGG ID'] = '<a href="%s">C%05d</a>' % (kegg.cid2link(cid), cid)
        d['KEGG ID plain'] = 'C%05d' % cid
        d['Compound'] = kegg.cid2name(cid)
        d['nH'] = '%d' % cid2nH[cid]

        if cid in alberty_cid2dG0:
            d['dG0 (Alberty)'] = '%.1f' % alberty_cid2dG0[cid]
        else:
            d['dG0 (Alberty)'] = ''

        d['dG0 (PRC)'] = '%.1f' % cid2dG0[cid]
        d['dG0 (PRC) plain'] = '%.1f' % cid2dG0[cid]

        # the sort key is the reversed 0/1 pattern of the free variables that
        # the compound depends on
        indic = np.where(abs(K_sparse[:, i]) > 1e-10, 1, 0).tolist()
        indic.reverse()
        d['order_key'] = indic
        if mlab.rms_flat(K_sparse[:, i]) > 1e-10:
            d['dG0 (PRC)'] += " + (" + vector2string(K_sparse[:, i],
                                                     index2string_html) + ")"
            d['dG0 (PRC) plain'] += " + (" + vector2string(
                K_sparse[:, i], index2string) + ")"
        dict_list.append(d)

    dict_list.sort(key=lambda row: (row['order_key'], row['KEGG ID plain']))

    # Export the results to CSV (the file is closed when the block ends)
    with open('../res/prc_results.csv', 'w') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(
            ['KEGG ID', 'Compound', 'nH', 'dG0 (PRC)', 'dG0 (Alberty)'])
        for d in dict_list:
            csv_writer.writerow([
                d['KEGG ID plain'], d['Compound'], d['nH'],
                d['dG0 (PRC) plain'], d['dG0 (Alberty)']
            ])

    html_writer.write(
        "<p>All formation energies as a function of the free variables:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(dict_list,
                            headers=[
                                '#', 'KEGG ID', 'Compound', 'nH', 'dG0 (PRC)',
                                'dG0 (Alberty)'
                            ])
    html_writer.write('</font>')
    html_writer.div_end()
    html_writer.write('</p>')

    with open('../res/prc_latex.txt', 'w') as fp:
        fp.write(
            latex.table2LaTeX(dict_list,
                              headers=[
                                  '#', 'KEGG ID plain', 'Compound', 'nH',
                                  'dG0 (PRC) plain', 'dG0 (Alberty)'
                              ]))
Example #29
0
                          help="Group Contribution Table Name")    
    return opt_parser

options, _ = MakeOpts().parse_args(sys.argv)
if options.sbml_model_filename == None:
    raise ValueError("Must provide a SBML model")

print 'SBML model filename:', options.sbml_model_filename
print 'CSV output filename:', options.csv_output_filename
print 'KEGG Database filename:', options.kegg_db_filename
print 'Observed Thermodynamics filename:', options.thermo_filename
print 'Thermodynamic Database filename:', options.db_filename
print 'Group Contribution Table Name:', options.gc_table_name

db = SqliteDatabase(options.db_filename)
observed_thermo = PsuedoisomerTableThermodynamics.FromCsvFile(
    options.thermo_filename)
if not db.DoesTableExist(options.gc_table_name):
    raise ValueError('The table %s does not exist in the database. '
                     'Please run the groups.py script and try again.'
                     % options.gc_table_name)
thermo = PsuedoisomerTableThermodynamics.FromDatabase(
    db, options.gc_table_name)
thermo.override_data(observed_thermo)
kegg = Kegg.getInstance()

document = libsbml.readSBML(options.sbml_model_filename)
if document.getNumErrors():
    raise Exception('cannot read SBML model from file %s due to error: %s' % 
                    (options.sbml_model_filename, document.getError(0).getMessage()))
model = document.getModel()
logging.info('Done parsing the model: ' + model.getName())
def main():
    """Start a dissociation-constant calculation for each missing compound.

    The command line (sys.argv, parsed by the parser that MakeOpts builds)
    selects the database file, the table name, whether to drop the existing
    table, the set of compounds (those related to NIST, or all of KEGG) and
    the value used to create the semaphore handed to the worker threads.

    Compounds that already have rows in the table are not recalculated.  One
    DissociationThreads object is started for each remaining compound that
    has a SMILES string, in ascending order of the stored mass (ties are
    broken by the compound ID).
    """
    options, _ = MakeOpts().parse_args(sys.argv)
    db = SqliteDatabase(options.db_file)
    kegg = Kegg.getInstance()

    # NOTE: the table name is interpolated into the SQL statements below; it
    # comes from the command-line options and must therefore be trusted.
    if options.override_table:
        db.Execute("DROP TABLE IF EXISTS " + options.table_name)

    DissociationConstants._CreateDatabase(
        db, options.table_name, drop_if_exists=options.override_table)

    cids_to_calculate = set()
    if options.nist:
        cids_to_calculate.update(Nist().GetAllCids())
        cids_to_calculate.update(RedoxCarriers().GetAllCids())

        ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
            "../data/thermodynamics/formation_energies.csv")
        cids_to_calculate.update(ptable.get_all_cids())
    else:
        cids_to_calculate.update(kegg.get_all_cids())

    # Do not recalculate pKas for CIDs that are already in the database
    for row in db.Execute("SELECT distinct(cid) FROM %s" % options.table_name):
        cids_to_calculate.discard(row[0])

    cid2smiles_and_mw = {}
    for cid in cids_to_calculate:
        # the compound CO is a special case where the conversion from InChI
        # to SMILES fails, so we add a specific override for it only
        if cid == 237:
            cid2smiles_and_mw[cid] = ("[C-]#[O+]", 28)
            continue

        try:
            comp = kegg.cid2compound(cid)
            mol = comp.GetMolecule()
            cid2smiles_and_mw[cid] = (mol.ToSmiles(), mol.GetExactMass())
        except KeggParseException:
            logging.debug("%s (C%05d) has no SMILES, skipping..." %
                          (kegg.cid2name(cid), cid))
        except OpenBabelError:
            logging.debug(
                "%s (C%05d) cannot be converted to SMILES, skipping..." %
                (kegg.cid2name(cid), cid))

    # Only the compounds with a SMILES entry remain; process them in
    # ascending order of the stored mass, then of the compound ID.
    cids_to_calculate = sorted(
        cid2smiles_and_mw,
        key=lambda cid: (cid2smiles_and_mw[cid][1], cid))

    db_lock = threading.Lock()
    # presumably acquired inside DissociationThreads to limit the number of
    # concurrent calculations -- confirm against that class
    semaphore = threading.Semaphore(options.n_threads)
    for cid in cids_to_calculate:
        smiles, _ = cid2smiles_and_mw[cid]
        if not smiles:
            logging.info("The following compound is blacklisted: C%05d" % cid)
            continue

        thread = DissociationThreads(group=None,
                                     target=None,
                                     name=None,
                                     args=(cid, smiles, semaphore, db_lock,
                                           options),
                                     kwargs={})
        thread.start()