Esempio n. 1
0
def add_thermodynamics(cursor):
    """Rebuild the yeast_inchi2thermo table from group-contribution estimates.

    Drops and recreates the table and its index on ``inchi``. Then, for every
    distinct non-NULL InChI found in yeast_species2inchi, inserts one row per
    (charge, nH) pseudoisomer returned by GroupContribution.Mol2PseudoisomerMap.

    An InChI that raises IOError, GroupMissingTrainDataError or
    GroupDecompositionError is skipped and reported on stderr.

    Args:
        cursor: database cursor accepting '?' placeholders and iterable after
            execute() (presumably sqlite3 -- confirm against the caller).
    """
    from groups import GroupMissingTrainDataError, GroupDecompositionError

    gc = GroupContribution(sqlite_name="gibbs.sqlite", html_name="pathologic")
    gc.init()

    cursor.execute("DROP TABLE IF EXISTS yeast_inchi2thermo")
    cursor.execute(
        "CREATE TABLE yeast_inchi2thermo (inchi TEXT, charge INT, nH INT, dG0_f REAL)"
    )
    cursor.execute("DROP INDEX IF EXISTS yeast_inchi2thermo_idx")
    cursor.execute(
        "CREATE INDEX yeast_inchi2thermo_idx ON yeast_inchi2thermo (inchi);")

    # Collect the SELECT results up front: the same cursor is reused for the
    # INSERT statements below, so it must not be iterated at the same time.
    inchi_list = [str(row[0]) for row in cursor.execute(
        "SELECT distinct(inchi) "
        "FROM yeast_species2inchi WHERE inchi IS NOT NULL")]

    for inchi in inchi_list:
        try:
            mol = Molecule.FromInChI(inchi)
            pmap = gc.Mol2PseudoisomerMap(mol)
            for ((z, nH), dG0) in pmap.iteritems():
                cursor.execute(
                    "INSERT INTO yeast_inchi2thermo VALUES(?,?,?,?)",
                    [inchi, z, nH, dG0])
        except (IOError, GroupMissingTrainDataError, GroupDecompositionError):
            # Name the offending InChI and end the line so that successive
            # failures do not run together on stderr.
            sys.stderr.write(
                "Cannot convert the following InChI to a pybel Molecule: %s\n"
                % inchi)
Esempio n. 2
0
def add_thermodynamics(cursor):
    """Populate yeast_inchi2thermo with pseudoisomer formation energies.

    The table and its ``inchi`` index are dropped and recreated. Each distinct
    non-NULL InChI in yeast_species2inchi is converted to a Molecule, and
    every ((charge, nH), dG0) entry of its pseudoisomer map is inserted as a
    row.

    InChIs for which the conversion or the group contribution raises IOError,
    GroupMissingTrainDataError or GroupDecompositionError are reported on
    stderr and skipped.

    Args:
        cursor: database cursor accepting '?' placeholders and iterable after
            execute() (presumably sqlite3 -- confirm against the caller).
    """
    from groups import GroupMissingTrainDataError, GroupDecompositionError

    gc = GroupContribution(sqlite_name="gibbs.sqlite", html_name="pathologic")
    gc.init()

    cursor.execute("DROP TABLE IF EXISTS yeast_inchi2thermo")
    cursor.execute("CREATE TABLE yeast_inchi2thermo (inchi TEXT, charge INT, nH INT, dG0_f REAL)")
    cursor.execute("DROP INDEX IF EXISTS yeast_inchi2thermo_idx")
    cursor.execute("CREATE INDEX yeast_inchi2thermo_idx ON yeast_inchi2thermo (inchi);")

    # Read all InChIs before inserting: the INSERTs below reuse this cursor.
    inchi_list = []
    for row in cursor.execute("SELECT distinct(inchi) " \
                              "FROM yeast_species2inchi WHERE inchi IS NOT NULL"):
        inchi_list.append(str(row[0]))

    for inchi in inchi_list:
        try:
            mol = Molecule.FromInChI(inchi)
            pmap = gc.Mol2PseudoisomerMap(mol)
            for ((z, nH), dG0) in pmap.iteritems():
                cursor.execute("INSERT INTO yeast_inchi2thermo VALUES(?,?,?,?)", [inchi, z, nH, dG0])
        except (IOError, GroupMissingTrainDataError, GroupDecompositionError):
            # The message announces "the following InChI", so actually print
            # it, and terminate the line.
            sys.stderr.write("Cannot convert the following InChI to a pybel Molecule: %s\n" % inchi)
def main():
    options, _ = flags.MakeOpts().parse_args(sys.argv)
    c_mid = options.c_mid
    pH = options.ph
    pMg = options.pmg
    I = options.i_s
    T = default_T

    db = SqliteDatabase("../res/gibbs.sqlite")
    kegg = Kegg.getInstance()
    G = GroupContribution(db)
    G.init()

    print ("Parameters: T=%f K, pH=%.2g, pMg=%.2g, " "I=%.2gM, Median concentration=%.2gM" % (T, pH, pMg, I, c_mid))

    cmap = {}
    if not options.ignore_cofactors:
        if options.full_metabolites:
            print "Fixing concentrations of all known metabolites"
            cmap = reversibility.GetFullConcentrationMap(G)
        else:
            print "Fixing concentrations of co-factors"
            cmap = reversibility.GetConcentrationMap(kegg)
    else:
        print "Not fixing concentrations of co-factors"

    if options.report_mode:
        print "Output used metabolites concentrations"

    while True:
        mid = GetModuleIdInput()

        rid_flux_list = kegg.mid2rid_map[mid]

        for rid, flux in rid_flux_list:
            try:
                reaction = kegg.rid2reaction(rid)
                print "Reaction Name", reaction.name
                print "\tKegg Id", reaction.rid
                print "\tEC", reaction.ec_list
                rev = reversibility.CalculateReversability(
                    reaction.sparse, G, pH=pH, I=I, pMg=pMg, T=T, concentration_map=cmap
                )
                if rev == None:
                    dG = G.estimate_dG_reaction(reaction.sparse, pH=pH, pMg=pMg, I=I, T=T, c0=c_mid, media="glucose")
                    print "\tReversibility: No free compounds, dG = %.2g" % dG
                else:
                    corrected_reversibility = flux * rev
                    print "\tReversibility %.2g" % corrected_reversibility

                if options.report_mode:
                    for cid, s in reaction.sparse.iteritems():
                        if cid in cmap:
                            print "(%d C%05d) %s\t: %.2g" % (s, cid, kegg.cid2name(cid), cmap[cid])
                        else:
                            print "(%d C%05d) %s\t: Free concentration" % (s, cid, kegg.cid2name(cid))
            except Exception:
                print "\tCouldn't calculate irreversibility"
Esempio n. 4
0
def test_single_modules(mids):
    """Run the thermodynamic pathway analysis on each module in ``mids``.

    For every module id, an ``<h2>`` heading is written to
    ../res/thermodynamic_module_analysis.html, and the four values returned
    by ``kegg.get_module`` are passed on to thermodynamic_pathway_analysis()
    together with the group-contribution object and the HTML writer.

    Args:
        mids: iterable of integer module ids (formatted with ``%05d``).
    """
    from pygibbs.groups import GroupContribution
    gibbs_db = SqliteDatabase('../res/gibbs.sqlite')
    report = HtmlWriter("../res/thermodynamic_module_analysis.html")
    group_contrib = GroupContribution(gibbs_db, report)
    group_contrib.init()

    for module_id in mids:
        report.write("<h2>M%05d</h2>\n" % module_id)
        s_matrix, reaction_ids, module_fluxes, compound_ids = \
            group_contrib.kegg.get_module(module_id)
        thermodynamic_pathway_analysis(s_matrix, reaction_ids, module_fluxes,
                                       compound_ids, group_contrib, report)
Esempio n. 5
0
def test_single_modules(mids):
    """Analyze the given KEGG modules one by one and report to HTML.

    Each module gets an ``<h2>M#####</h2>`` heading in
    ../res/thermodynamic_module_analysis.html, followed by whatever
    thermodynamic_pathway_analysis() writes for it.

    Args:
        mids: iterable of integer module ids.
    """
    from pygibbs.groups import GroupContribution
    database = SqliteDatabase('../res/gibbs.sqlite')
    writer = HtmlWriter("../res/thermodynamic_module_analysis.html")
    contribution = GroupContribution(database, writer)
    contribution.init()

    for current_mid in mids:
        heading = "<h2>M%05d</h2>\n" % current_mid
        writer.write(heading)
        module = contribution.kegg.get_module(current_mid)
        # get_module is expected to yield exactly four items.
        S, rids, fluxes, cids = module
        thermodynamic_pathway_analysis(S, rids, fluxes, cids,
                                       contribution, writer)
Esempio n. 6
0
def test_all_modules():
    """Scan all KEGG modules for feasibility and write the results to CSV.

    For every module id in ``gc.kegg().mid2rid_map``, and for every
    combination of pH in [5..9] and ionic strength in [0.0..0.4], this
    builds the vector of formation energies at T = 300, runs find_pCr and
    find_mtdf, and appends one row to ../res/feasibility.csv.

    Modules raising KeggMissingModuleException are skipped with a warning on
    stderr. Compounds without a formation energy get NaN, reported once per
    compound. When a linear program has no solution, the corresponding
    pCr / MTDF value is written as None.
    """
    from pygibbs.groups import GroupContribution
    gc = GroupContribution(sqlite_name="gibbs.sqlite", html_name="dG0_test")
    gc.init()
    c_range = (1e-6, 1e-2)
    c_mid = 1e-3
    T = 300
    # CIDs that should be mapped to other CIDs because they are unspecific
    # (like NTP => ATP)
    map_cid = {201: 2, 454: 8}

    cids_with_missing_dG_f = set()

    # "with" guarantees the CSV is flushed and closed even if one of the
    # solvers raises something other than LinProgNoSolutionException.
    with open("../res/feasibility.csv", "w") as f:
        csv_output = csv.writer(f)
        csv_output.writerow(("MID", "module name", "pH", "I", "T", "pCr", "MTDF"))
        for mid in sorted(gc.kegg().mid2rid_map.keys()):
            module_name = gc.kegg().mid2name_map[mid]
            try:
                S, _rids, _fluxes, cids = gc.kegg().get_module(mid)
            except KeggMissingModuleException as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue
            _Nr, Nc = S.shape
            for pH in [5, 6, 7, 8, 9]:
                for I in [0.0, 0.1, 0.2, 0.3, 0.4]:
                    dG0_f = pylab.zeros((Nc, 1))
                    for c in range(Nc):
                        cid = map_cid.get(cids[c], cids[c])
                        try:
                            pmap = gc.cid2PseudoisomerMap(cid)
                            dG0_f[c] = gc.pmap_to_dG0(pmap, pH, I, T)
                        except MissingCompoundFormationEnergy as e:
                            if cid not in cids_with_missing_dG_f:
                                sys.stderr.write("Setting the dG0_f of C%05d to NaN because: %s\n"
                                                 % (cid, str(e)))
                                cids_with_missing_dG_f.add(cid)
                            dG0_f[c] = pylab.nan

                    # Bounds are looked up by the original, unmapped CIDs.
                    bounds = [gc.kegg().cid2bounds.get(cid, (None, None)) for cid in cids]

                    try:
                        _dG_f, _concentrations, pCr = find_pCr(S, dG0_f, c_mid=c_mid, ratio=3.0, bounds=bounds)
                    except LinProgNoSolutionException:
                        sys.stderr.write("M%05d: Pathway is theoretically infeasible\n" % mid)
                        pCr = None

                    try:
                        _dG_f, _concentrations, MTDF = find_mtdf(S, dG0_f, c_range=c_range, bounds=bounds)
                    except LinProgNoSolutionException:
                        sys.stderr.write("M%05d: Pathway is theoretically infeasible\n" % mid)
                        MTDF = None

                    csv_output.writerow([mid, module_name, pH, I, T, pCr, MTDF])
def LoadAllEstimators():
    """Build the dictionary of all available thermodynamic estimators.

    If the 'prc_pseudoisomers' table is missing from the gibbs database, a
    NistRegression is trained first. The returned dict may contain the keys
    'alberty', 'PRC', 'hatzi_gc', 'BGC', 'PGC', 'UGC', 'C1', 'merged' and
    'merged_C1'; table-backed estimators are added only when their table
    exists. Every estimator gets the concentration bounds loaded from
    ../data/thermodynamics/concentration_bounds.csv.

    Returns:
        dict mapping estimator key (str) to estimator object.

    Raises:
        KeyError: if the 'alberty_pseudoisomers' table is missing, because
            the 'C1' estimator is built on top of estimators['alberty'].
    """
    db_public = SqliteDatabase('../data/public_data.sqlite')
    db_gibbs = SqliteDatabase('../res/gibbs.sqlite')

    if not db_gibbs.DoesTableExist('prc_pseudoisomers'):
        nist_regression = NistRegression(db_gibbs)
        nist_regression.Train()

    tables = {'alberty': (db_public, 'alberty_pseudoisomers', 'Alberty'),
              'PRC': (db_gibbs, 'prc_pseudoisomers', 'our method (PRC)')}
    estimators = {}
    for key, (table_db, table_name, thermo_name) in tables.iteritems():
        if table_db.DoesTableExist(table_name):
            estimators[key] = PsuedoisomerTableThermodynamics.FromDatabase(
                                            table_db, table_name, name=thermo_name)
        else:
            logging.warning('The table %s does not exist in %s',
                            table_name, str(table_db))

    estimators['hatzi_gc'] = Hatzi(use_pKa=False)
    #estimators['hatzi_gc_pka'] = Hatzi(use_pKa=True)

    # The group-contribution estimators are built on db_gibbs, so that is the
    # database whose tables must be checked. The loop variable left over from
    # the loop above pointed to whichever database the dict happened to yield
    # last.
    if db_gibbs.DoesTableExist('bgc_pseudoisomers'):
        estimators['BGC'] = GroupContribution(db=db_gibbs, transformed=True)
        estimators['BGC'].init()
        estimators['BGC'].name = 'our method (BGC)'

    if db_gibbs.DoesTableExist('pgc_pseudoisomers'):
        estimators['PGC'] = GroupContribution(db=db_gibbs, transformed=False)
        estimators['PGC'].init()
        estimators['PGC'].name = 'our method (PGC)'

    estimators['UGC'] = UnifiedGroupContribution(db=db_gibbs)
    estimators['UGC'].init()
    estimators['UGC'].name = 'our method (UGC)'

    estimators['C1'] = ReactionThermodynamics.FromCsv(
        '../data/thermodynamics/c1_reaction_thermodynamics.csv',
        estimators['alberty'])

    if 'PGC' in estimators:
        estimators['merged'] = BinaryThermodynamics(estimators['alberty'],
                                                    estimators['PGC'])
        estimators['merged_C1'] = BinaryThermodynamics(estimators['C1'],
                                                       estimators['PGC'])

    for thermo in estimators.values():
        thermo.load_bounds('../data/thermodynamics/concentration_bounds.csv')

    return estimators
Esempio n. 8
0
def try_kegg_api():
    db = SqliteDatabase('../res/gibbs.sqlite')
    html_writer = HtmlWriter('../res/dG0_test.html')
    G = GroupContribution(db, html_writer=html_writer)
    G.init()
    
    wsdl = 'http://soap.genome.jp/KEGG.wsdl'
    serv = WSDL.Proxy(wsdl)
    
    rid_file = open('../res/eco_rids.txt', 'w')
    rids = set()
    for x in serv.list_pathways('eco'):
        pathway_id = x['entry_id']
        for reaction_id in serv.get_reactions_by_pathway(pathway_id):
            rid = int(reaction_id[4:])
            if rid not in rids:
                rids.add(rid)
                rid_file.write('%d\n' % rid)
    rid_file.close()
            
    c_mid = 1e-3
    pH, pMg, I, T = (7.0, 3.0, 0.1, 298.15)
    
    rid2reversibility = {}
    misses = 0
    for rid in sorted(rids):
        try:
            reaction = G.kegg.rid2reaction(rid)
            r = CalculateReversability(reaction, G, c_mid, pH, pMg, I, T)
            rid2reversibility[rid] = r
        except thermodynamics.MissingCompoundFormationEnergy:
            misses += 1
            continue
    
    print 'hits = %d, misses = %d' % len(rid2reversibility), misses
    median = pylab.median(rid2reversibility.values())
    print 'median = %.1f' % median

    pylab.figure()
    pylab.hold(True)
    plotting.cdf(rid2reversibility.values(), 'all reactions', 'r', show_median=True)
    pylab.show()
Esempio n. 9
0
def main():
    estimators = LoadAllEstimators()
    args, _ = MakeOpts(estimators).parse_args(sys.argv)

    # Make sure we have all the data.
    db = SqliteDatabase("../res/gibbs.sqlite")
    G = GroupContribution(db=db, html_writer=NullHtmlWriter(), transformed=args.transformed)
    G.init()

    print "Exporting KEGG compounds to %s" % args.compounds_out_filename
    csv_writer = csv.writer(open(args.compounds_out_filename, "w"))
    csv_writer.writerow(["KEGG ID", "nH", "CHARGE", "nMg", "dG0_f"])
    for cid in sorted(G.get_all_cids()):
        try:
            for nH, z, nMg, dG0 in G.cid2PseudoisomerMap(cid).ToMatrix():
                csv_writer.writerow(["C%05d" % cid, nH, z, nMg, "%.1f" % dG0])
        except MissingCompoundFormationEnergy as e:
            csv_writer.writerow(["C%05d" % cid, None, None, None, str(e)])

    print "Exporting KEGG reactions to %s" % args.reactions_out_filename
    csv_writer = csv.writer(open(args.reactions_out_filename, "w"))
    csv_writer.writerow(
        ["KEGG ID", "dG'0_r (pH=%.1f, I=%.2f, pMg=%.1f, T=%.1f)" % (args.ph, args.i_s, args.pmg, args.temp)]
    )
    for rid in sorted(G.kegg.get_all_rids()):
        reaction = G.kegg.rid2reaction(rid)
        try:
            reaction.Balance(balance_water=True)
            dG0_r = reaction.PredictReactionEnergy(G, pH=args.ph, pMg=args.pmg, I=args.i_s, T=args.temp)
            csv_writer.writerow(["R%05d" % rid, "%.1f" % dG0_r])
        except (
            KeggParseException,
            MissingCompoundFormationEnergy,
            KeggReactionNotBalancedException,
            MissingReactionEnergy,
            KeyError,
            OpenBabelError,
        ) as e:
            csv_writer.writerow(["R%05d" % rid, str(e)])
Esempio n. 10
0
def LoadAllEstimators():
    """Create every thermodynamic estimator that the local databases support.

    Trains a NistRegression first when the gibbs database has no
    'prc_pseudoisomers' table. The table-backed estimators ('alberty', 'PRC',
    'BGC', 'PGC') are added only if their table exists. 'hatzi_gc', 'UGC' and
    'C1' are always added, and 'merged' / 'merged_C1' only when 'PGC' is
    available. Concentration bounds are loaded into every estimator before
    returning.

    Returns:
        dict mapping estimator key (str) to estimator object.

    Raises:
        KeyError: if the 'alberty_pseudoisomers' table is missing, because
            'C1' is constructed from estimators['alberty'].
    """
    db_public = SqliteDatabase('../data/public_data.sqlite')
    db_gibbs = SqliteDatabase('../res/gibbs.sqlite')

    if not db_gibbs.DoesTableExist('prc_pseudoisomers'):
        nist_regression = NistRegression(db_gibbs)
        nist_regression.Train()

    tables = {
        'alberty': (db_public, 'alberty_pseudoisomers', 'Alberty'),
        'PRC': (db_gibbs, 'prc_pseudoisomers', 'our method (PRC)')
    }
    estimators = {}
    for key, (table_db, table_name, thermo_name) in tables.iteritems():
        if table_db.DoesTableExist(table_name):
            estimators[key] = PsuedoisomerTableThermodynamics.FromDatabase(
                table_db, table_name, name=thermo_name)
        else:
            logging.warning('The table %s does not exist in %s',
                            table_name, str(table_db))

    estimators['hatzi_gc'] = Hatzi(use_pKa=False)
    #estimators['hatzi_gc_pka'] = Hatzi(use_pKa=True)

    # Check the database the estimators are actually built on (db_gibbs),
    # not the variable left over from the loop above, whose value depends on
    # the dict's iteration order.
    if db_gibbs.DoesTableExist('bgc_pseudoisomers'):
        estimators['BGC'] = GroupContribution(db=db_gibbs, transformed=True)
        estimators['BGC'].init()
        estimators['BGC'].name = 'our method (BGC)'

    if db_gibbs.DoesTableExist('pgc_pseudoisomers'):
        estimators['PGC'] = GroupContribution(db=db_gibbs, transformed=False)
        estimators['PGC'].init()
        estimators['PGC'].name = 'our method (PGC)'

    estimators['UGC'] = UnifiedGroupContribution(db=db_gibbs)
    estimators['UGC'].init()
    estimators['UGC'].name = 'our method (UGC)'

    estimators['C1'] = ReactionThermodynamics.FromCsv(
        '../data/thermodynamics/c1_reaction_thermodynamics.csv',
        estimators['alberty'])

    if 'PGC' in estimators:
        estimators['merged'] = BinaryThermodynamics(estimators['alberty'],
                                                    estimators['PGC'])
        estimators['merged_C1'] = BinaryThermodynamics(estimators['C1'],
                                                       estimators['PGC'])

    for thermo in estimators.values():
        thermo.load_bounds('../data/thermodynamics/concentration_bounds.csv')

    return estimators
Esempio n. 11
0
def main():
    estimators = LoadAllEstimators()
    args, _ = MakeOpts(estimators).parse_args(sys.argv)
    
    # Make sure we have all the data.
    db = SqliteDatabase('../res/gibbs.sqlite')
    G = GroupContribution(db=db, html_writer=NullHtmlWriter(),
                          transformed=args.transformed)
    G.init()
    
    print 'Exporting KEGG compounds to %s' % args.compounds_out_filename
    csv_writer = csv.writer(open(args.compounds_out_filename, 'w'))
    csv_writer.writerow(["KEGG ID", "nH", "CHARGE", "nMg", "dG0_f"])
    for cid in sorted(G.get_all_cids()):
        try:
            for nH, z, nMg, dG0 in G.cid2PseudoisomerMap(cid).ToMatrix():
                csv_writer.writerow(["C%05d" % cid, nH, z, nMg, "%.1f" % dG0])
        except MissingCompoundFormationEnergy as e:
            csv_writer.writerow(["C%05d" % cid, None, None, None, str(e)])
        
    print 'Exporting KEGG reactions to %s' % args.reactions_out_filename
    csv_writer = csv.writer(open(args.reactions_out_filename, 'w'))
    csv_writer.writerow(["KEGG ID", "dG'0_r (pH=%.1f, I=%.2f, pMg=%.1f, T=%.1f)" % 
                         (args.ph, args.i_s, args.pmg, args.temp)])
    for rid in sorted(G.kegg.get_all_rids()):
        reaction = G.kegg.rid2reaction(rid)
        try:
            reaction.Balance(balance_water=True)
            dG0_r = reaction.PredictReactionEnergy(G, pH=args.ph,
                        pMg=args.pmg, I=args.i_s, T=args.temp)
            csv_writer.writerow(["R%05d" % rid, "%.1f" % dG0_r])
        except (KeggParseException,
                MissingCompoundFormationEnergy, 
                KeggReactionNotBalancedException,
                MissingReactionEnergy,
                KeyError,
                OpenBabelError) as e:
            csv_writer.writerow(["R%05d" % rid, str(e)])
Esempio n. 12
0
def main():
    options, _ = flags.MakeOpts().parse_args(sys.argv)
    c_mid = options.c_mid
    pH = options.ph
    pMg = options.pmg
    I = options.i_s
    T = default_T

    db = SqliteDatabase('../res/gibbs.sqlite')
    kegg = Kegg.getInstance()
    G = GroupContribution(db)
    G.init()

    print(
        'Parameters: T=%f K, pH=%.2g, pMg=%.2g, '
        'I=%.2gM, Median concentration=%.2gM' % (T, pH, pMg, I, c_mid))

    cmap = {}
    if not options.ignore_cofactors:
        if options.full_metabolites:
            print 'Fixing concentrations of all known metabolites'
            cmap = reversibility.GetFullConcentrationMap(G)
        else:
            print 'Fixing concentrations of co-factors'
            cmap = reversibility.GetConcentrationMap(kegg)
    else:
        print 'Not fixing concentrations of co-factors'

    if options.report_mode:
        print 'Output used metabolites concentrations'

    while True:
        mid = GetModuleIdInput()

        rid_flux_list = kegg.mid2rid_map[mid]

        for rid, flux in rid_flux_list:
            try:
                reaction = kegg.rid2reaction(rid)
                print 'Reaction Name', reaction.name
                print '\tKegg Id', reaction.rid
                print '\tEC', reaction.ec_list
                rev = reversibility.CalculateReversability(
                    reaction.sparse,
                    G,
                    pH=pH,
                    I=I,
                    pMg=pMg,
                    T=T,
                    concentration_map=cmap)
                if rev == None:
                    dG = G.estimate_dG_reaction(reaction.sparse,
                                                pH=pH,
                                                pMg=pMg,
                                                I=I,
                                                T=T,
                                                c0=c_mid,
                                                media='glucose')
                    print '\tReversibility: No free compounds, dG = %.2g' % dG
                else:
                    corrected_reversibility = flux * rev
                    print '\tReversibility %.2g' % corrected_reversibility

                if options.report_mode:
                    for cid, s in reaction.sparse.iteritems():
                        if cid in cmap:
                            print '(%d C%05d) %s\t: %.2g' % (
                                s, cid, kegg.cid2name(cid), cmap[cid])
                        else:
                            print '(%d C%05d) %s\t: Free concentration' % (
                                s, cid, kegg.cid2name(cid))
            except Exception:
                print '\tCouldn\'t calculate irreversibility'
Esempio n. 13
0
def test_all_modules():
    """Compute pCr and MTDF for every KEGG module over a pH / ionic-strength grid.

    One CSV row (MID, module name, pH, I, T, pCr, MTDF) is written to
    ../res/feasibility.csv per module and per (pH, I) pair, with
    pH in [5, 6, 7, 8, 9] and I in [0.0, 0.1, 0.2, 0.3, 0.4], at T = 300.

    Modules that raise KeggMissingModuleException are skipped with a warning.
    Compounds lacking a formation energy are given NaN, reported on stderr
    the first time only. A LinProgNoSolutionException from either solver is
    reported, and None is written for that value.
    """
    from pygibbs.groups import GroupContribution
    gc = GroupContribution(sqlite_name="gibbs.sqlite", html_name="dG0_test")
    gc.init()
    c_range = (1e-6, 1e-2)
    c_mid = 1e-3
    T = 300
    map_cid = {
        201: 2,
        454: 8
    }  # CIDs that should be mapped to other CIDs because they are unspecific (like NTP => ATP)

    cids_with_missing_dG_f = set()

    # The context manager closes the CSV even when an unexpected exception
    # escapes the loops below.
    with open("../res/feasibility.csv", "w") as f:
        csv_output = csv.writer(f)
        csv_output.writerow(("MID", "module name", "pH", "I", "T", "pCr", "MTDF"))
        for mid in sorted(gc.kegg().mid2rid_map.keys()):
            module_name = gc.kegg().mid2name_map[mid]
            try:
                S, _rids, _fluxes, cids = gc.kegg().get_module(mid)
            except KeggMissingModuleException as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue
            _Nr, Nc = S.shape
            for pH in [5, 6, 7, 8, 9]:
                for I in [0.0, 0.1, 0.2, 0.3, 0.4]:
                    dG0_f = pylab.zeros((Nc, 1))
                    for c in range(Nc):
                        cid = map_cid.get(cids[c], cids[c])
                        try:
                            pmap = gc.cid2PseudoisomerMap(cid)
                            dG0_f[c] = gc.pmap_to_dG0(pmap, pH, I, T)
                        except MissingCompoundFormationEnergy as e:
                            if cid not in cids_with_missing_dG_f:
                                sys.stderr.write(
                                    "Setting the dG0_f of C%05d to NaN because: %s\n"
                                    % (cid, str(e)))
                                cids_with_missing_dG_f.add(cid)
                            dG0_f[c] = pylab.nan

                    # Bounds use the original (unmapped) compound ids.
                    bounds = [
                        gc.kegg().cid2bounds.get(cid, (None, None))
                        for cid in cids
                    ]

                    try:
                        _dG_f, _concentrations, pCr = find_pCr(S,
                                                               dG0_f,
                                                               c_mid=c_mid,
                                                               ratio=3.0,
                                                               bounds=bounds)
                    except LinProgNoSolutionException:
                        sys.stderr.write(
                            "M%05d: Pathway is theoretically infeasible\n" % mid)
                        pCr = None

                    try:
                        _dG_f, _concentrations, MTDF = find_mtdf(S,
                                                                 dG0_f,
                                                                 c_range=c_range,
                                                                 bounds=bounds)
                    except LinProgNoSolutionException:
                        sys.stderr.write(
                            "M%05d: Pathway is theoretically infeasible\n" % mid)
                        MTDF = None

                    csv_output.writerow([mid, module_name, pH, I, T, pCr, MTDF])
def CalculateThermo():
    parser = MakeOpts()
    options, _ = parser.parse_args(sys.argv)
    pH, I, pMg, T = options.pH, options.I, options.pMg, options.T

    db = SqliteDatabase('../res/gibbs.sqlite')
    G = GroupContribution(db=db)
    G.init()
    ignore_protonations = False

    list_of_mols = []
    if options.smiles:
        list_of_mols.append({'id':options.smiles, 'mol':options.smiles,
            'format':'smiles'})
    elif options.inchi:
        list_of_mols.append({'id':options.inchi, 'mol':options.inchi,
            'format':'inchi'})
    elif options.csv_input_filename:
        for row in csv.DictReader(open(options.csv_input_filename, 'r')):
            if "InChI" in row:
                list_of_mols.append({'id':row["ID"], 'mol':row["InChI"],
                                     'format':'inchi'})
            elif "smiles" in row:
                list_of_mols.append({'id':row["ID"], 'mol':row["smiles"],
                                     'format':'smiles'})
            else:
                raise Exception("There must be one molecular ID column: InChI or smiles")
    else:
        parser.error("must use either -s or -c option")
    
    if options.biochemical:
        print ("Calculating biochemical formation energies for %s compounds" 
               " at pH = %.1f, I = %.2f, pMg = %.1f, T = %.2f" %  
               (len(list_of_mols), pH, I, pMg, T))
    else:
        print ("Calculating chemical formation energies for %s compounds" % 
               len(list_of_mols))
    
    rowdicts = []
    for mol_dict in list_of_mols:
        mol_id = mol_dict['id']
        diss_table = Molecule._GetDissociationTable(mol_dict['mol'],
                                                    fmt=mol_dict['format'])
        try:
            mol = diss_table.GetMostAbundantMol(pH, I, pMg, T) or \
                  diss_table.GetAnyMol()
            if mol is None:
                raise Exception("Cannot convert input string to Molecule: " + 
                                mol_dict['mol'])
            
            decomposition = G.Mol2Decomposition(mol, 
                ignore_protonations=ignore_protonations)
            groupvec = decomposition.AsVector()
            dG0 = G.groupvec2val(groupvec)
            nH = decomposition.Hydrogens()
            nMg = decomposition.Magnesiums()
            diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
            pmap = diss_table.GetPseudoisomerMap()
            
            if options.biochemical:
                dG0_prime = pmap.Transform(pH, pMg, I, T)
                rowdicts.append({'ID':mol_id, 'pH':pH, 'I':I, 'pMg':pMg,
                                 'dG0\'':"%.1f" % dG0_prime, 'groupvec':str(groupvec)})
            else:
                for p_nH, p_z, p_nMg, p_dG0 in pmap.ToMatrix():
                    rowdicts.append({'ID':mol_id, 'nH':p_nH, 'charge':p_z, 'nMg':p_nMg,
                                     'dG0':"%.1f" % p_dG0, 'groupvec':str(groupvec)})
        except GroupDecompositionError:
            rowdicts.append({'ID':mol_id, 'error':"cannot decompose"})
        except GroupMissingTrainDataError:
            rowdicts.append({'ID':mol_id, 'groupvec':str(groupvec),
                             'error':"missing training data"})
        
    if options.csv_output_filename is not None:
        out_fp = open(options.csv_output_filename, 'w')
        print "writing results to %s ... " % options.csv_output_filename
    else:
        out_fp = sys.stdout
    
    if options.biochemical:
        titles = ['ID', 'error', 'pH', 'I', 'pMg', 'dG0\'', 'groupvec']
    else:
        titles = ['ID', 'error', 'nH', 'nMg', 'charge', 'dG0', 'groupvec'] 
    csv_writer = csv.DictWriter(out_fp, titles)
    csv_writer.writeheader()
    csv_writer.writerows(rowdicts)
Esempio n. 15
0
def CalculateThermo():
    parser = MakeOpts()
    options, _ = parser.parse_args(sys.argv)
    pH, I, pMg, T = options.pH, options.I, options.pMg, options.T

    db = SqliteDatabase('../res/gibbs.sqlite')
    G = GroupContribution(db=db)
    G.init()
    ignore_protonations = False

    list_of_mols = []
    if options.smiles:
        list_of_mols.append({
            'id': options.smiles,
            'mol': options.smiles,
            'format': 'smiles'
        })
    elif options.inchi:
        list_of_mols.append({
            'id': options.inchi,
            'mol': options.inchi,
            'format': 'inchi'
        })
    elif options.csv_input_filename:
        for row in csv.DictReader(open(options.csv_input_filename, 'r')):
            if "InChI" in row:
                list_of_mols.append({
                    'id': row["ID"],
                    'mol': row["InChI"],
                    'format': 'inchi'
                })
            elif "smiles" in row:
                list_of_mols.append({
                    'id': row["ID"],
                    'mol': row["smiles"],
                    'format': 'smiles'
                })
            else:
                raise Exception(
                    "There must be one molecular ID column: InChI or smiles")
    else:
        parser.error("must use either -s or -c option")

    if options.biochemical:
        print(
            "Calculating biochemical formation energies for %s compounds"
            " at pH = %.1f, I = %.2f, pMg = %.1f, T = %.2f" %
            (len(list_of_mols), pH, I, pMg, T))
    else:
        print("Calculating chemical formation energies for %s compounds" %
              len(list_of_mols))

    rowdicts = []
    for mol_dict in list_of_mols:
        mol_id = mol_dict['id']
        diss_table = Molecule._GetDissociationTable(mol_dict['mol'],
                                                    fmt=mol_dict['format'])
        try:
            mol = diss_table.GetMostAbundantMol(pH, I, pMg, T) or \
                  diss_table.GetAnyMol()
            if mol is None:
                raise Exception("Cannot convert input string to Molecule: " +
                                mol_dict['mol'])

            decomposition = G.Mol2Decomposition(
                mol, ignore_protonations=ignore_protonations)
            groupvec = decomposition.AsVector()
            dG0 = G.groupvec2val(groupvec)
            nH = decomposition.Hydrogens()
            nMg = decomposition.Magnesiums()
            diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
            pmap = diss_table.GetPseudoisomerMap()

            if options.biochemical:
                dG0_prime = pmap.Transform(pH, pMg, I, T)
                rowdicts.append({
                    'ID': mol_id,
                    'pH': pH,
                    'I': I,
                    'pMg': pMg,
                    'dG0\'': "%.1f" % dG0_prime,
                    'groupvec': str(groupvec)
                })
            else:
                for p_nH, p_z, p_nMg, p_dG0 in pmap.ToMatrix():
                    rowdicts.append({
                        'ID': mol_id,
                        'nH': p_nH,
                        'charge': p_z,
                        'nMg': p_nMg,
                        'dG0': "%.1f" % p_dG0,
                        'groupvec': str(groupvec)
                    })
        except GroupDecompositionError:
            rowdicts.append({'ID': mol_id, 'error': "cannot decompose"})
        except GroupMissingTrainDataError:
            rowdicts.append({
                'ID': mol_id,
                'groupvec': str(groupvec),
                'error': "missing training data"
            })

    if options.csv_output_filename is not None:
        out_fp = open(options.csv_output_filename, 'w')
        print "writing results to %s ... " % options.csv_output_filename
    else:
        out_fp = sys.stdout

    if options.biochemical:
        titles = ['ID', 'error', 'pH', 'I', 'pMg', 'dG0\'', 'groupvec']
    else:
        titles = ['ID', 'error', 'nH', 'nMg', 'charge', 'dG0', 'groupvec']
    csv_writer = csv.DictWriter(out_fp, titles)
    csv_writer.writeheader()
    csv_writer.writerows(rowdicts)
Esempio n. 16
0
def main():
    """Run the NIST verification experiment and write an HTML report.

    Opens the '../res/gibbs.sqlite' database, initialises a
    GroupContribution object (with ``override_gc_with_measurements`` set),
    loads the NIST data from the database and writes its output to
    '../res/nist/report.html'.

    Only the first branch of the ``if True / elif False`` chain is executed:
    it loads the NIST data using Alberty's table and calls
    ``verify_results`` for the "Alberty", "Hatzimanikatis" and "Milo"
    labels.  The remaining ``elif False`` branches are alternative
    experiments that are kept for manual toggling and never run as written.

    The report writer is closed in a ``finally`` clause, so it is closed
    even when one of the steps raises; the exception still propagates.
    """
    db = database.SqliteDatabase('../res/gibbs.sqlite')
    html_writer = HtmlWriter("../res/nist/report.html")
    try:
        gc = GroupContribution(db)
        gc.override_gc_with_measurements = True
        gc.init()
        grad = GradientAscent(gc)
        nist = Nist(db, html_writer, gc.kegg())
        nist.FromDatabase()
        alberty = Alberty()
        hatzi = Hatzi()

        if True:
            grad.load_nist_data(nist, alberty, skip_missing_reactions=False, T_range=(298, 314))
            grad.verify_results("Alberty", alberty, html_writer)

            #grad.write_pseudoisomers("../res/nist/nist_dG0_f.csv")

            #html_writer.write("<h2>Using Group Contribution (Hatzimanikatis' implementation)</h2>")
            #html_writer.write("<h3>Correlation with the reduced NIST database (containing only compounds that appear in Alberty's list)</h3>")
            #logging.info("calculate the correlation between Hatzimanikatis' predictions and the reduced NIST database")
            #grad.verify_results("Hatzimanikatis_Reduced", hatzi, html_writer)

            #grad.load_nist_data(nist, hatzi, skip_missing_reactions=True, T_range=(298, 314))
            grad.verify_results("Hatzimanikatis", hatzi, html_writer)

            #grad.load_nist_data(nist, gc, skip_missing_reactions=True, T_range=(298, 314))
            grad.verify_results("Milo", gc, html_writer)
        elif False:
            # Run the gradient ascent algorithm, where the starting point is the same file used for training the GC algorithm
            grad.load_dG0_data("../data/thermodynamics/dG0.csv")
            # load the data for the anchors (i.e. compounds whose dG0 should not be changed - usually their value will be 0).
            grad.anchors = grad.load_dG0_data("../data/thermodynamics/nist_anchors.csv")
            grad.load_nist_data(nist, grad, skip_missing_reactions=True)
            # Single-argument parenthesised print behaves the same as the
            # print statement on Python 2 and also parses on Python 3.
            print("Training %d compounds using %d reactions: " % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
            grad.hill_climb(max_i=20000)
            grad.save_energies(grad.gc.comm, "gradient_cid2prm")
            grad.verify_results("gradient1")

        elif False:
            # Run the gradient ascent algorithm, where the starting point is Alberty's table from (Mathematica 2006)
            grad.load_nist_data(nist, alberty, skip_missing_reactions=True)
            print("Training %d compounds using %d reactions: " % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
            grad.cid2pmap_dict = alberty.cid2pmap_dict
            grad.hill_climb(max_i=20000)
            grad.save_energies(grad.gc.comm, "gradient_cid2prm")
            grad.verify_results("gradient2")

        elif False:
            # Run the gradient ascent algorithm, where the starting point is Alberty's table from (Mathematica 2006)
            # Use DETERMINISTIC gradient ascent
            grad.load_nist_data(nist, alberty, skip_missing_reactions=True, T_range=(24 + 273.15, 40 + 273.15))
            print("Training %d compounds using %d reactions: " % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
            grad.cid2pmap_dict = alberty.cid2pmap_dict
            grad.deterministic_hill_climb(max_i=200)
            grad.save_energies(grad.gc.comm, "gradient_cid2prm")
            grad.verify_results("gradient_deterministic")

        elif False:
            # Run the gradient ascent algorithm, where the starting point arbitrary (predict all of the NIST compounds)
            grad = GradientAscent(gc)
            grad.load_nist_data(nist, skip_missing_reactions=False)
            print("Training %d compounds using %d reactions: " % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
            grad.hill_climb(max_i=20000)
            grad.save_energies(grad.gc.comm, "gradient_cid2prm")
            grad.verify_results("gradient3")

        elif False: # Use Alberty's table from (Mathematica 2006) to calculate the dG0 of all possible reactions in KEGG
            grad = GradientAscent(gc)
            grad.cid2pmap_dict = alberty.cid2pmap_dict
            (pH, I, T) = (7, 0, 300)
            counter = 0
            for rid in grad.kegg.get_all_rids():
                sparse_reaction = grad.kegg.rid2sparse_reaction(rid)
                try:
                    dG0 = grad.reaction_to_dG0(sparse_reaction, pH, I, T)
                    print("R%05d: dG0_r = %.2f [kJ/mol]" % (rid, dG0))
                    counter += 1
                except MissingCompoundFormationEnergy:
                    # Deliberately skipped: such reactions are simply not counted.
                    #print "R%05d: missing formation energy of C%05d" % (rid, e.cid)
                    pass
            print("Managed to calculate the dG0 of %d reactions" % counter)

        elif False:
            util._mkdir("../res/nist/fig")
            # The context manager closes the CSV file when the branch is done,
            # including when an exception escapes the loop below.
            with open("../res/nist/pseudoisomers.csv", "w") as csv_fp:
                csv_writer = csv.writer(csv_fp)

                # Collect every compound ID that appears in a NIST reaction.
                cid_set = set()
                for row in nist.data:
                    sparse_reaction = row['sparse']
                    cid_set.update(sparse_reaction.keys())

                html_writer.write("<table border=1>\n")
                for cid in sorted(cid_set):
                    html_writer.write("  <tr><td>C%05d</td><td>%s</td><td>" % (cid, grad.kegg.cid2name(cid)))
                    try:
                        mol = grad.kegg.cid2mol(cid)
                        img_fname = '../res/nist/fig/C%05d.png' % cid
                        # NOTE(review): the image is embedded before mol.draw()
                        # writes it - confirm embed_img does not need the file
                        # to exist at this point.
                        html_writer.embed_img(img_fname, "C%05d" % cid)
                        mol.draw(show=False, filename=img_fname)
                    except (AssertionError, KeggParseException) as e:
                        html_writer.write("WARNING: cannot draw C%05d - %s" % (cid, str(e)))
                    html_writer.write("</td><td>")
                    if cid in alberty.cid2pmap_dict:
                        for (nH, z) in alberty.cid2pmap_dict[cid].keys():
                            html_writer.write("(nH=%d, z=%d)<br>" % (nH, z))
                            csv_writer.writerow((cid, nH, z))
                    else:
                        # Not in Alberty's table: fall back to the KEGG values.
                        nH = grad.kegg.cid2num_hydrogens(cid)
                        z = grad.kegg.cid2charge(cid)
                        html_writer.write("unknown pseudoisomers<br>")
                        html_writer.write("(nH=%d, z=%d)" % (nH, z))
                        csv_writer.writerow((cid, nH, z))

                    html_writer.write("</td></tr>\n")
                html_writer.write("</table>\n")
    finally:
        html_writer.close()
Esempio n. 17
0
def main():
    """Run the NIST verification experiment and write an HTML report.

    Opens the '../res/gibbs.sqlite' database, initialises a
    GroupContribution object (with ``override_gc_with_measurements`` set),
    loads the NIST data from the database and writes its output to
    '../res/nist/report.html'.

    Only the first branch of the ``if True / elif False`` chain is executed:
    it loads the NIST data using Alberty's table and calls
    ``verify_results`` for the "Alberty", "Hatzimanikatis" and "Milo"
    labels.  The remaining ``elif False`` branches are alternative
    experiments that are kept for manual toggling and never run as written.

    The report writer is closed in a ``finally`` clause, so it is closed
    even when one of the steps raises; the exception still propagates.
    """
    db = database.SqliteDatabase('../res/gibbs.sqlite')
    html_writer = HtmlWriter("../res/nist/report.html")
    try:
        gc = GroupContribution(db)
        gc.override_gc_with_measurements = True
        gc.init()
        grad = GradientAscent(gc)
        nist = Nist(db, html_writer, gc.kegg())
        nist.FromDatabase()
        alberty = Alberty()
        hatzi = Hatzi()

        if True:
            grad.load_nist_data(nist,
                                alberty,
                                skip_missing_reactions=False,
                                T_range=(298, 314))
            grad.verify_results("Alberty", alberty, html_writer)

            #grad.write_pseudoisomers("../res/nist/nist_dG0_f.csv")

            #html_writer.write("<h2>Using Group Contribution (Hatzimanikatis' implementation)</h2>")
            #html_writer.write("<h3>Correlation with the reduced NIST database (containing only compounds that appear in Alberty's list)</h3>")
            #logging.info("calculate the correlation between Hatzimanikatis' predictions and the reduced NIST database")
            #grad.verify_results("Hatzimanikatis_Reduced", hatzi, html_writer)

            #grad.load_nist_data(nist, hatzi, skip_missing_reactions=True, T_range=(298, 314))
            grad.verify_results("Hatzimanikatis", hatzi, html_writer)

            #grad.load_nist_data(nist, gc, skip_missing_reactions=True, T_range=(298, 314))
            grad.verify_results("Milo", gc, html_writer)
        elif False:
            # Run the gradient ascent algorithm, where the starting point is the same file used for training the GC algorithm
            grad.load_dG0_data("../data/thermodynamics/dG0.csv")
            # load the data for the anchors (i.e. compounds whose dG0 should not be changed - usually their value will be 0).
            grad.anchors = grad.load_dG0_data(
                "../data/thermodynamics/nist_anchors.csv")
            grad.load_nist_data(nist, grad, skip_missing_reactions=True)
            # Single-argument parenthesised print behaves the same as the
            # print statement on Python 2 and also parses on Python 3.
            print("Training %d compounds using %d reactions: " % (len(
                grad.cid2pmap_dict.keys()), len(grad.data)))
            grad.hill_climb(max_i=20000)
            grad.save_energies(grad.gc.comm, "gradient_cid2prm")
            grad.verify_results("gradient1")

        elif False:
            # Run the gradient ascent algorithm, where the starting point is Alberty's table from (Mathematica 2006)
            grad.load_nist_data(nist, alberty, skip_missing_reactions=True)
            print("Training %d compounds using %d reactions: " % (len(
                grad.cid2pmap_dict.keys()), len(grad.data)))
            grad.cid2pmap_dict = alberty.cid2pmap_dict
            grad.hill_climb(max_i=20000)
            grad.save_energies(grad.gc.comm, "gradient_cid2prm")
            grad.verify_results("gradient2")

        elif False:
            # Run the gradient ascent algorithm, where the starting point is Alberty's table from (Mathematica 2006)
            # Use DETERMINISTIC gradient ascent
            grad.load_nist_data(nist,
                                alberty,
                                skip_missing_reactions=True,
                                T_range=(24 + 273.15, 40 + 273.15))
            print("Training %d compounds using %d reactions: " % (len(
                grad.cid2pmap_dict.keys()), len(grad.data)))
            grad.cid2pmap_dict = alberty.cid2pmap_dict
            grad.deterministic_hill_climb(max_i=200)
            grad.save_energies(grad.gc.comm, "gradient_cid2prm")
            grad.verify_results("gradient_deterministic")

        elif False:
            # Run the gradient ascent algorithm, where the starting point arbitrary (predict all of the NIST compounds)
            grad = GradientAscent(gc)
            grad.load_nist_data(nist, skip_missing_reactions=False)
            print("Training %d compounds using %d reactions: " % (len(
                grad.cid2pmap_dict.keys()), len(grad.data)))
            grad.hill_climb(max_i=20000)
            grad.save_energies(grad.gc.comm, "gradient_cid2prm")
            grad.verify_results("gradient3")

        elif False:  # Use Alberty's table from (Mathematica 2006) to calculate the dG0 of all possible reactions in KEGG
            grad = GradientAscent(gc)
            grad.cid2pmap_dict = alberty.cid2pmap_dict
            (pH, I, T) = (7, 0, 300)
            counter = 0
            for rid in grad.kegg.get_all_rids():
                sparse_reaction = grad.kegg.rid2sparse_reaction(rid)
                try:
                    dG0 = grad.reaction_to_dG0(sparse_reaction, pH, I, T)
                    print("R%05d: dG0_r = %.2f [kJ/mol]" % (rid, dG0))
                    counter += 1
                except MissingCompoundFormationEnergy:
                    # Deliberately skipped: such reactions are simply not counted.
                    #print "R%05d: missing formation energy of C%05d" % (rid, e.cid)
                    pass
            print("Managed to calculate the dG0 of %d reactions" % counter)

        elif False:
            util._mkdir("../res/nist/fig")
            # The context manager closes the CSV file when the branch is done,
            # including when an exception escapes the loop below.
            with open("../res/nist/pseudoisomers.csv", "w") as csv_fp:
                csv_writer = csv.writer(csv_fp)

                # Collect every compound ID that appears in a NIST reaction.
                cid_set = set()
                for row in nist.data:
                    sparse_reaction = row['sparse']
                    cid_set.update(sparse_reaction.keys())

                html_writer.write("<table border=1>\n")
                for cid in sorted(cid_set):
                    html_writer.write("  <tr><td>C%05d</td><td>%s</td><td>" %
                                      (cid, grad.kegg.cid2name(cid)))
                    try:
                        mol = grad.kegg.cid2mol(cid)
                        img_fname = '../res/nist/fig/C%05d.png' % cid
                        # NOTE(review): the image is embedded before mol.draw()
                        # writes it - confirm embed_img does not need the file
                        # to exist at this point.
                        html_writer.embed_img(img_fname, "C%05d" % cid)
                        mol.draw(show=False, filename=img_fname)
                    except (AssertionError, KeggParseException) as e:
                        html_writer.write("WARNING: cannot draw C%05d - %s" %
                                          (cid, str(e)))
                    html_writer.write("</td><td>")
                    if cid in alberty.cid2pmap_dict:
                        for (nH, z) in alberty.cid2pmap_dict[cid].keys():
                            html_writer.write("(nH=%d, z=%d)<br>" % (nH, z))
                            csv_writer.writerow((cid, nH, z))
                    else:
                        # Not in Alberty's table: fall back to the KEGG values.
                        nH = grad.kegg.cid2num_hydrogens(cid)
                        z = grad.kegg.cid2charge(cid)
                        html_writer.write("unknown pseudoisomers<br>")
                        html_writer.write("(nH=%d, z=%d)" % (nH, z))
                        csv_writer.writerow((cid, nH, z))

                    html_writer.write("</td></tr>\n")
                html_writer.write("</table>\n")
    finally:
        html_writer.close()
Esempio n. 18
0
                    v[0, i] = S[cids.index(cid), j]
            if (abs(v * var_P_N) > 1e-10).any():
                dG0_r_prime[0, j] = np.nan
        
        return dG0_r_prime
        
if __name__ == "__main__":

    from pygibbs.groups import GroupContribution
    
    db_public = SqliteDatabase('../data/public_data.sqlite')
    db_gibbs = SqliteDatabase('../res/gibbs.sqlite')
    alberty = PsuedoisomerTableThermodynamics.FromDatabase(\
                        db_public, 'alberty_pseudoisomers', name='alberty')
    
    pgc = GroupContribution(db=db_gibbs, transformed=False)
    pgc.init()
    pgc.name = "PGC"
    
    merged = BinaryThermodynamics(alberty, pgc)
    
    S = np.matrix([[-1,  1,  0,  0,  0,  0,  0,  0,  0], 
                   [ 0, -1, -1,  1,  0,  0, -1,  1,  1], 
                   [ 0,  0,  0, -1,  1,  1,  0,  0,  0], 
                   [ 0, -1, -1,  0,  1,  1, -1,  1,  1]]).T
    cids = [311, 158, 10, 566, 24, 36, 2, 8, 9]
    
    print alberty.GetTransformedFormationEnergies(cids)
    print alberty.GetTransfromedReactionEnergies(S, cids)
    print pgc.GetTransfromedReactionEnergies(S, cids)
    dG0_r_primes = merged.GetTransfromedReactionEnergies(S, cids)