def main():
    """Transform the 'testing' formation energies and export them as CSV.

    Loads the pseudoisomer table from ``FormationEnergyFileName`` (label
    'testing'), transforms every compound with pH=7.0, I=0.25, pMg=14 and
    T=298.15, and writes one row per compound to
    ``../res/formation_energies_transformed.csv``.
    """
    ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
        FormationEnergyFileName, label='testing')
    kegg = Kegg.getInstance()
    pH, I, pMg, T = (7.0, 0.25, 14, 298.15)

    # The context manager guarantees the file is flushed and closed, even if
    # one of the transforms below raises.
    with open('../res/formation_energies_transformed.csv', 'w') as out_file:
        output_csv = csv.writer(out_file)
        output_csv.writerow([
            "cid", "name", "dG'0", "pH", "I", "pMg", "T", "anchor",
            "compound_ref", "remark"
        ])
        for cid in ptable.get_all_cids():
            pmap = ptable.cid2PseudoisomerMap(cid)
            dG0_prime = pmap.Transform(pH=pH, I=I, pMg=pMg, T=T)
            # The trailing None becomes an empty "remark" cell so that every
            # data row has as many fields as the header.
            output_csv.writerow([
                cid,
                kegg.cid2name(cid),
                "%.1f" % dG0_prime, pH, I, pMg, T, 1,
                ptable.cid2source_string[cid], None
            ])
def main():
    """Export transformed formation energies using the dissociation tables.

    For every compound in the 'testing' rows of the formation-energy CSV,
    the single given pseudoisomer is written into the compound's dissociation
    table, the table is transformed with pH=7.0, I=0.25, pMg=14.0 and
    T=298.15, and the result is written as one CSV row.

    Raises:
        Exception: if a compound does not have exactly one pseudoisomer in
            the table, or if no dissociation table is returned for it.
    """
    pH, I, pMg, T = 7.0, 0.25, 14.0, 298.15

    dissociation = DissociationConstants.FromPublicDB()
    kegg = Kegg.getInstance()
    obs_fname = "../data/thermodynamics/formation_energies.csv"
    res_fname = "../res/formation_energies_transformed.csv"

    train_species = PsuedoisomerTableThermodynamics.FromCsvFile(
        obs_fname, label='testing')

    # The context manager guarantees the output is flushed and closed, even
    # when one of the consistency checks below raises.
    with open(res_fname, 'w') as res_file:
        csv_out = csv.writer(res_file)
        csv_out.writerow([
            'cid', 'name', "dG'0", 'pH', 'I', 'pMg', 'T', 'anchor',
            'compound_ref', 'remark'
        ])
        for cid in train_species.get_all_cids():
            pmap = train_species.cid2PseudoisomerMap(cid)
            source = train_species.cid2source_string[cid]
            # ToMatrix returns tuples of (nH, z, nMg, dG0)
            pmatrix = pmap.ToMatrix()
            if len(pmatrix) != 1:
                raise Exception("multiple training species for C%05d" % cid)
            nH, charge, nMg, dG0 = pmatrix[0]

            compound_name = kegg.cid2name(cid)
            name = "%s (%d)" % (compound_name, nH)
            logging.info('Adding the formation energy of %s', name)

            diss_table = dissociation.GetDissociationTable(
                cid, create_if_missing=True)
            if diss_table is None:
                raise Exception("%s [C%05d, nH=%d, nMg=%d] does not have a "
                                "dissociation table" % (name, cid, nH, nMg))

            diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
            diss_table.SetCharge(nH, charge, nMg)
            dG0_prime = diss_table.Transform(pH, I, pMg, T)
            csv_out.writerow([
                cid,
                compound_name,
                "%.1f" % dG0_prime, pH, I, pMg, T, True, source, None
            ])
Example #3
0
    def ReadFormationEnergies(self):
        """
            Reads the entire table of formation energies which are to be used
            later both to add them directly to the observed data table and to
            be used for normalizing NIST data.

            Fills self.formation_dict, mapping each CID to the tuple
            (label, ref, dG0_prime, dG0, nH, charge, nMg), and records the
            (nH, nMg) of every compound that is not yet in self.cid2nH_nMg.

            Raises:
                Exception - if a compound does not have exactly one
                    pseudoisomer, if its (nH, nMg) disagrees with the pair
                    already stored in self.cid2nH_nMg, or if it has no
                    dissociation table.
        """

        self.formation_dict = {}

        for label in ['training', 'testing']:
            ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
                self.FormationEnergyFileName, label=label)
            for cid in ptable.get_all_cids():
                pmatrix = ptable.cid2PseudoisomerMap(cid).ToMatrix()
                if len(pmatrix) != 1:
                    raise Exception("multiple training species for C%05d" %
                                    cid)
                nH, charge, nMg, dG0 = pmatrix[0]
                if cid in self.cid2nH_nMg:
                    known_nH, known_nMg = self.cid2nH_nMg[cid]
                    # Both nH and nMg are compared, so both are reported;
                    # otherwise an nMg-only mismatch prints two equal nH.
                    if (nH, nMg) != (known_nH, known_nMg):
                        raise Exception(
                            "The pseudoisomer of C%05d "
                            "in the formation energy table (nH=%d, nMg=%d) "
                            "is not consistent with the pKa table "
                            "(nH=%d, nMg=%d)." %
                            (cid, nH, nMg, known_nH, known_nMg))
                else:
                    self.cid2nH_nMg[cid] = (nH, nMg)
                diss_table = self.dissociation.GetDissociationTable(cid, False)
                if diss_table is None:
                    raise Exception("C%05d has no pKa data" % cid)
                diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
                dG0_prime = diss_table.Transform(pH=self.pH,
                                                 I=self.I,
                                                 pMg=self.pMg,
                                                 T=self.T)
                ref = ptable.cid2SourceString(cid)
                # A CID present under both labels keeps the entry of the
                # label processed last ('testing').
                self.formation_dict[cid] = (label, ref, dG0_prime, dG0, nH,
                                            charge, nMg)
Example #4
0
def ExportJSONFiles():
    """Export the nullspace matrix and the KEGG entities as JSON files.

    Two thermodynamic sources are registered with the Kegg singleton (the
    estimator chosen on the command line gets priority 1, the CSV table
    priority 2). Afterwards the rows of the 'ugc_conservations' table and
    all KEGG compounds, reactions and enzymes are written to the output
    files named in the command-line options.
    """
    estimators = LoadAllEstimators()
    options, _ = MakeOpts(estimators).parse_args(sys.argv)

    selected_estimator = estimators[options.thermodynamics_source]
    csv_thermo = PsuedoisomerTableThermodynamics.FromCsvFile(
        options.thermodynamics_csv)
    thermo_list = [selected_estimator, csv_thermo]

    # Make sure we have all the data.
    kegg = Kegg.getInstance()
    for priority, thermo in enumerate(thermo_list, 1):
        print("Priority %d - formation energies of: %s" %
              (priority, thermo.name))
        kegg.AddThermodynamicData(thermo, priority=priority)

    db = SqliteDatabase('../res/gibbs.sqlite')

    print('Exporting Group Contribution Nullspace matrix as JSON.')
    nullspace_vectors = []
    for row in db.DictReader('ugc_conservations'):
        message = row['msg']
        coefficients = json.loads(row['json'])
        # each entry is a [coefficient, KEGG compound ID string] pair
        reaction = [[coeff, "C%05d" % int(cid)]
                    for cid, coeff in coefficients.iteritems()]
        nullspace_vectors.append({'msg': message, 'reaction': reaction})
    WriteJSONFile(nullspace_vectors, options.nullspace_out_filename)

    print('Exporting KEGG compounds as JSON.')
    WriteJSONFile(kegg.AllCompounds(), options.compounds_out_filename)

    print('Exporting KEGG reactions as JSON.')
    WriteJSONFile(kegg.AllReactions(), options.reactions_out_filename)

    print('Exporting KEGG enzymes as JSON.')
    WriteJSONFile(kegg.AllEnzymes(), options.enzymes_out_filename)
Example #5
0
    def EstimateKeggCids(self):
        """
            Uses the Group Contributions to estimate the entire set of compounds in KEGG,
            and then writes the results to the database as 'gc_pseudoisomers' table

            Options:
                override_all_observed_compounds - If True, any observed formation energy is
                    used instead of the GC estimation. If False, only 'test' compounds are used.

            NOTE(review): this method takes no such option; it always loads
            the rows labelled 'testing' (plus the redox carriers) as the
            observed species - confirm whether the option is still relevant.

            For every KEGG compound, in order of CID:
              - an observed compound gets its observed pseudoisomer map,
                completed through the dissociation table when exactly one
                pseudoisomer is given and a table exists;
              - an unobserved compound without a dissociation table or
                without a group vector is skipped with an HTML warning;
              - any other compound gets the group contribution estimate.
        """
        logging.info("Estimating formation energies for all KEGG")

        observed_species = PsuedoisomerTableThermodynamics.FromCsvFile(
            '../data/thermodynamics/formation_energies.csv', label='testing')

        # Add both forms of every redox carrier to the observed species: the
        # oxidized form with dG0 = 0 and the reduced form with dG0 = ddG0.
        for rc in RedoxCarriers().itervalues():
            observed_species.AddPseudoisomer(rc.cid_ox,
                                             nH=rc.nH_ox,
                                             z=rc.z_ox,
                                             nMg=0,
                                             dG0=0.0,
                                             ref=rc.ref)
            observed_species.AddPseudoisomer(rc.cid_red,
                                             nH=rc.nH_red,
                                             z=rc.z_red,
                                             nMg=0,
                                             dG0=rc.ddG0,
                                             ref=rc.ref)
            observed_species.cid2source_string[rc.cid_ox] = rc.ref
            observed_species.cid2source_string[rc.cid_red] = rc.ref

        # The observed table is not modified past this point, so collect its
        # CIDs once instead of re-querying them for every KEGG compound.
        observed_cids = set(observed_species.get_all_cids())

        self.cid2pmap_dict = {}
        self.cid2source_string = {}

        self.html_writer.write(
            '</br><b>Estimated formation energies for KEGG compounds</b>\n')
        self.html_writer.insert_toggle(start_here=True)
        for cid in sorted(self.kegg.get_all_cids()):
            self.html_writer.write('<b>C%05d - %s</b></br>\n' %
                                   (cid, self.kegg.cid2name(cid)))

            diss_table = self.GetDissociationTable(cid)
            if cid in observed_cids:
                pmap_obs = observed_species.cid2PseudoisomerMap(cid)
                self.cid2source_string[
                    cid] = observed_species.cid2SourceString(cid)
                # returns a list of (nH, z, nMg, dG0)
                pmatrix = pmap_obs.ToMatrix()
                if len(pmatrix) == 1 and diss_table is not None:
                    # assume that only the most abundant pseudoisomer is given
                    # and complete the formation energies of the others using the
                    # pKa values in the dissociation table
                    nH, _z, nMg, dG0 = pmatrix[0]
                    diss_table.SetFormationEnergyByNumHydrogens(dG0=dG0,
                                                                nH=nH,
                                                                nMg=nMg)
                    pmap = diss_table.GetPseudoisomerMap()
                    self.SetPseudoisomerMap(cid, pmap)
                else:
                    # either several pseudoisomers were observed or there is
                    # no dissociation table: use the observed map as it is
                    if diss_table is not None:
                        logging.warning("C%05d has multiple training species, "
                                        "overriding the dissociation table" %
                                        cid)
                    self.SetPseudoisomerMap(cid, pmap_obs)
            elif diss_table is None:
                self.html_writer.write('Warning: no dissociation table</br>\n')
                continue
            else:
                nH, nMg = self.cid2nH_nMg[cid]
                groupvector = self.cid2groupvec.get(cid, None)
                if groupvector is None:
                    self.html_writer.write(
                        'Warning: no group vector (%s)</br>\n' %
                        self.cid2error[cid])
                    continue
                try:
                    dG0 = self.groupvec2val(groupvector)
                except GroupMissingTrainDataError as e:
                    # in this case we do not care if a compound violated the group
                    # conservation laws because it might cancel out later when we
                    # use it to calculate reactions.
                    dG0 = e.value
                    self.html_writer.write('Warning: %s</br>\n' % str(e))
                    logging.debug("C%05d: %s" % (cid, str(e)))

                self.cid2source_string[cid] = "Group Contribution"

                if self.transformed:
                    diss_table.SetTransformedFormationEnergy(dG0_tag=dG0,
                                                             pH=default_pH,
                                                             I=default_I,
                                                             pMg=default_pMg,
                                                             T=default_T)
                else:
                    diss_table.SetFormationEnergyByNumHydrogens(dG0=dG0,
                                                                nH=nH,
                                                                nMg=nMg)
                pmap = diss_table.GetPseudoisomerMap()
                self.SetPseudoisomerMap(cid, pmap)
            # only reached for compounds that received a pseudoisomer map
            self.cid2PseudoisomerMap(cid).WriteToHTML(self.html_writer)

        self.html_writer.div_end()

        logging.info("Writing the results to the database")
        self.ToDatabase(self.db, self.THERMODYNAMICS_TABLE_NAME)
    # dG0 =  -E'*nE*F - R*T*ln(10)*nH*pH
    # Where:
    #    F  = 96.48 kC/mol
    #    nE - change in e-
    #    nH - change in H+
    #    pH - the conditions in which the E' was measured
    #
    # Ferredoxin  ox/red: E' = -0.380V (nE = 1, nH = 0) -> dG0 = 38.0 kJ/mol [1]
    # Ubiquinone  ox/red: E' =  0.113V (nE = 2, nH = 2) -> dG0 = -103.2 kJ/mol [1]
    # Menaquinone ox/red: E' = -0.074V (nE = 2, nH = 2) -> dG0 = -65.8 kJ/mol [1]
    #
    # [1] - Thauer 1977

    observed_thermo_fname = options.thermodynamics_filename
    print 'Loading observed thermodynamic data from %s' % observed_thermo_fname
    observed_thermo = PsuedoisomerTableThermodynamics.FromCsvFile(
        observed_thermo_fname)

    if thermo_source == 'hatzi_only':
        thermo = PsuedoisomerTableThermodynamics.FromDatabase(
            db, 'hatzi_thermodynamics')
        thermo.AddPseudoisomer(139, nH=0, z=1, nMg=0, dG0=0)  # Ferrodoxin(ox)
        thermo.AddPseudoisomer(138, nH=0, z=0, nMg=0,
                               dG0=38.0)  # Ferrodoxin(red)
        thermo.AddPseudoisomer(399, nH=90, z=0, nMg=0,
                               dG0=0)  # Ubiquinone-10(ox)
        thermo.AddPseudoisomer(390, nH=92, z=0, nMg=0,
                               dG0=-103.2)  # Ubiquinone-10(red)
        thermo.AddPseudoisomer(828, nH=16, z=0, nMg=0,
                               dG0=0)  # Menaquinone(ox)
        thermo.AddPseudoisomer(5819, nH=18, z=0, nMg=0,
                               dG0=-65.8)  # Menaquinone(red)
def main():
    """Start one DissociationThreads worker per compound that lacks pKa data.

    The set of compounds is either the NIST, redox-carrier and
    formation-energy compounds (when ``options.nist`` is set) or all of KEGG.
    Compounds that already have rows in the target table are skipped, and so
    are compounds for which no SMILES string can be produced.
    """
    options, _ = MakeOpts().parse_args(sys.argv)
    db = SqliteDatabase(options.db_file)
    kegg = Kegg.getInstance()

    # NOTE: the table name is interpolated into the SQL text because SQL
    # placeholders cannot bind identifiers; it comes from the command-line
    # options.
    if options.override_table:
        db.Execute("DROP TABLE IF EXISTS " + options.table_name)

    DissociationConstants._CreateDatabase(
        db, options.table_name, drop_if_exists=options.override_table)

    cids_to_calculate = set()
    if options.nist:
        cids_to_calculate.update(Nist().GetAllCids())
        cids_to_calculate.update(RedoxCarriers().GetAllCids())

        ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
            "../data/thermodynamics/formation_energies.csv")
        cids_to_calculate.update(ptable.get_all_cids())
    else:
        cids_to_calculate.update(kegg.get_all_cids())

    # Do not recalculate pKas for CIDs that are already in the database
    for row in db.Execute("SELECT distinct(cid) FROM %s" % options.table_name):
        cids_to_calculate.discard(row[0])

    cid2smiles_and_mw = {}
    for cid in cids_to_calculate:
        # the compound CO is a special case where the conversion from InChI
        # to SMILES fails, so we add a specific override for it only
        if cid == 237:
            cid2smiles_and_mw[cid] = ("[C-]#[O+]", 28)
            continue

        try:
            comp = kegg.cid2compound(cid)
            mol = comp.GetMolecule()
            cid2smiles_and_mw[cid] = (mol.ToSmiles(), mol.GetExactMass())
        except KeggParseException:
            logging.debug("%s (C%05d) has no SMILES, skipping...",
                          kegg.cid2name(cid), cid)
        except OpenBabelError:
            logging.debug(
                "%s (C%05d) cannot be converted to SMILES, skipping...",
                kegg.cid2name(cid), cid)

    # Only compounds with a SMILES entry remain; handle them in order of
    # increasing mass, with the CID as tie-breaker.
    cids_to_calculate = sorted(
        cid2smiles_and_mw,
        key=lambda cid: (cid2smiles_and_mw[cid][1], cid))

    db_lock = threading.Lock()
    semaphore = threading.Semaphore(options.n_threads)
    for cid in cids_to_calculate:
        smiles, _ = cid2smiles_and_mw[cid]
        if not smiles:
            logging.info("The following compound is blacklisted: C%05d" % cid)
            continue

        thread = DissociationThreads(group=None,
                                     target=None,
                                     name=None,
                                     args=(cid, smiles, semaphore, db_lock,
                                           options),
                                     kwargs={})
        thread.start()
Example #8
0
                          help="Group Contribution Table Name")    
    return opt_parser

options, _ = MakeOpts().parse_args(sys.argv)
if options.sbml_model_filename == None:
    raise ValueError("Must provide a SBML model")

print 'SBML model filename:', options.sbml_model_filename
print 'CSV output filename:', options.csv_output_filename
print 'KEGG Database filename:', options.kegg_db_filename
print 'Observed Thermodynamics filename:', options.thermo_filename
print 'Thermodynamic Database filename:', options.db_filename
print 'Group Contribution Table Name:', options.gc_table_name

db = SqliteDatabase(options.db_filename)
observed_thermo = PsuedoisomerTableThermodynamics.FromCsvFile(
    options.thermo_filename)
if not db.DoesTableExist(options.gc_table_name):
    raise ValueError('The table %s does not exist in the database. '
                     'Please run the groups.py script and try again.'
                     % options.gc_table_name)
thermo = PsuedoisomerTableThermodynamics.FromDatabase(
    db, options.gc_table_name)
thermo.override_data(observed_thermo)
kegg = Kegg.getInstance()

document = libsbml.readSBML(options.sbml_model_filename)
if document.getNumErrors():
    raise Exception('cannot read SBML model from file %s due to error: %s' % 
                    (options.sbml_model_filename, document.getError(0).getMessage()))
model = document.getModel()
logging.info('Done parsing the model: ' + model.getName())
Example #9
0
# Fill in the remaining fields of the hand-built NIST row.
nist_row_data.pMg = 14
nist_row_data.evaluation = "A"
nist_row_data.url = ""
nist_row_data.ref_id = ""
#nist_row_data.reaction = Reaction(["triose isomerase"], {111:-1, 118:1})
#cid2nH = {111:6, 118:5}
#cid2min_dG0 = {111:-1296.3, 118:-1288.6}

# NOTE(review): the label is still "triose isomerase", but the stoichiometry
# below uses different compounds than the commented-out example above -
# confirm that the name is intended.
nist_row_data.reaction = Reaction(["triose isomerase"], {
    1: -1,
    78: -1,
    22: 1,
    14: 1,
    463: 1
})

# Collect nH and dG0 of the single pseudoisomer of every compound in the
# table; a compound with any other number of pseudoisomers is an error.
train_species = PsuedoisomerTableThermodynamics.FromCsvFile(
    '../data/thermodynamics/dG0.csv')
cid2nH = {}
cid2dG0 = {}
for cid in train_species.get_all_cids():
    pmap = train_species.cid2PseudoisomerMap(cid)
    pmatrix = pmap.ToMatrix()
    if len(pmatrix) != 1:
        raise Exception("C%05d has more than one species in the training set" %
                        cid)
    cid2nH[cid] = pmatrix[0][0]  # ToMatrix returns tuples of (nH, z, nMg, dG0)
    cid2dG0[cid] = pmatrix[0][3]

# Totals start at zero; presumably accumulated over the reaction's compounds
# by the loop that follows - verify.
ddG0_r = 0
dG0_r = 0
dG0_tag_r = 0
for cid, coeff in nist_row_data.reaction.sparse.iteritems():
Example #10
0
    def ConvertFormation2Reaction(self, output_fname):
        """
            Writes every formation energy as a formation reaction of the
            compound from its elements, one CSV row per compound, using the
            column layout used for TECRDB.

            Arguments:
                output_fname - path of the CSV file to write (overwritten).

            Compounds without a dissociation table or without an atom bag
            are skipped.

            Raises:
                Exception - if an element cannot be found in KEGG or if a
                    compound does not have exactly one pseudoisomer.
        """
        logging.info("Converting all formation energies to reactions")

        # The context manager guarantees the file is flushed and closed, even
        # if one of the checks below raises.
        with open(output_fname, 'w') as output_file:
            output_csv = csv.writer(output_file)

            # keep the format used for TECRDB
            output_csv.writerow(
                ('ref', 'ID', 'method', 'eval', 'EC', 'name', 'kegg_reaction',
                 'reaction', 'dG0\'', 'T', 'I', 'pH', 'pMg'))

            # map every atom symbol to the CID of its element compound and
            # the stoichiometry given in ATOM2ELEMENT
            atom2cid = {}
            for atom, (name,
                       stoich) in KeggObservation.ATOM2ELEMENT.iteritems():
                cid, _, _ = self.kegg.name2cid(name, 0)
                if cid is None:
                    raise Exception(
                        "Cannot find the element %s in the KEGG database" %
                        name)
                atom2cid[atom] = (cid, stoich)
                #output_csv.writerow(('element',
                #                     'C%05d' % cid, 'formation', 'A', '',
                #                     'formation of %s' % self.kegg.cid2name(cid),
                #                     "C%05d" % cid,
                #                     name, 0, self.T, self.I, self.pH, self.pMg))

            for label in ['training', 'testing']:
                ptable = PsuedoisomerTableThermodynamics.FromCsvFile(
                    self.FormationEnergyFileName, label=label)
                for cid in ptable.get_all_cids():
                    pmatrix = ptable.cid2PseudoisomerMap(cid).ToMatrix()
                    if len(pmatrix) != 1:
                        raise Exception("multiple training species for C%05d" %
                                        cid)
                    nH, _charge, nMg, dG0 = pmatrix[0]
                    # NOTE(review): 'dissociation' is not bound inside this
                    # method, so it must resolve to a module-level name at
                    # runtime - confirm it exists, or whether
                    # self.dissociation was intended.
                    diss_table = dissociation.GetDissociationTable(cid, False)
                    if diss_table is None:
                        continue
                    diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
                    dG0_prime = diss_table.Transform(pH=self.pH,
                                                     I=self.I,
                                                     pMg=self.pMg,
                                                     T=self.T)
                    ref = ptable.cid2SourceString(cid)

                    atom_bag = self.kegg.cid2atom_bag(cid)
                    if not atom_bag:
                        continue

                    ne = self.kegg.cid2num_electrons(cid)
                    elem_ne = 0
                    sparse = {cid: 1}
                    for elem, count in atom_bag.iteritems():
                        if elem == 'H':
                            continue
                        elem_ne += count * Molecule.GetAtomicNum(elem)
                        elem_cid, elem_coeff = atom2cid[elem]
                        sparse.setdefault(elem_cid, 0)
                        sparse[elem_cid] += -count * elem_coeff

                    # use the H element to balance the electrons in the formation
                    # reactions (we don't need to balance protons since this is
                    # a biochemical reaction, so H+ are 'free').
                    H_cid, H_coeff = atom2cid['H']
                    sparse[H_cid] = (elem_ne - ne) * H_coeff

                    formation_name = ("formation of %s" %
                                      self.kegg.cid2name(cid))
                    reaction = Reaction(formation_name, sparse)

                    output_csv.writerow(
                        (ref, 'C%05d' % cid, 'formation', 'A', '',
                         formation_name,
                         reaction.FullReactionString(),
                         reaction.FullReactionString(show_cids=False),
                         '%.2f' % dG0_prime, self.T, self.I, self.pH,
                         self.pMg))