Example 1
    def _Recalculate(self):
        S, cids = self.kegg.reaction_list_to_S(self.reactions)
        known_cids = self.formations.get_all_cids()
        fix_rows = [i for i, cid in enumerate(cids) if cid in known_cids]
        var_rows = [i for i, cid in enumerate(cids) if cid not in known_cids]
        fix_cids = [cids[i] for i in fix_rows]
        var_cids = [cids[i] for i in var_rows]
        
        # subtract the part of the dG0 which is fixed, and leave only the part
        # which is attributed to the NaN compounds.
        fix_S = S[fix_rows, :]
        var_S = S[var_rows, :]
        
        var_P_C, var_P_L = LinearRegression.ColumnProjection(var_S)
        var_P_R, var_P_N = LinearRegression.RowProjection(var_S)

        # take all the known dG0_primes from self.formations
        dG0_f_prime = self.formations.GetTransformedFormationEnergies(fix_cids, 
                                pH=self.pH, I=self.I, T=self.T, pMg=self.pMg)

        # project the dG0_r on the column-space of var_S to eliminate inconsistencies
        # between the dG0_r and the fixed formation energies.
        # then subtract the fixed part of the dG0_r.
        var_dG0_r_prime = np.matrix(self.dG0_r_primes) * var_P_C - dG0_f_prime * fix_S
        var_dG0_f_prime, _ = LinearRegression.LeastSquares(var_S, var_dG0_r_prime)
        
        return var_cids, var_dG0_f_prime, var_P_N
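Example 1 relies on LinearRegression.ColumnProjection and RowProjection to separate the part of the reaction energies explained by the known (fixed) compounds from the part attributed to the unknown ones. The pygibbs implementation is not shown on this page; the sketch below only illustrates the standard pseudo-inverse-based projection operators these names suggest, and the shape conventions are assumptions.

import numpy as np

def column_projection(A):
    # P_C projects onto the column space of A, P_L onto its orthogonal
    # complement (the left null space): P_C = A * pinv(A), P_L = I - P_C.
    A = np.matrix(A)
    P_C = A * np.linalg.pinv(A)
    P_L = np.matrix(np.eye(A.shape[0])) - P_C
    return P_C, P_L

def row_projection(A):
    # P_R projects onto the row space of A, P_N onto its null space:
    # P_R = pinv(A) * A, P_N = I - P_R.
    A = np.matrix(A)
    P_R = np.linalg.pinv(A) * A
    P_N = np.matrix(np.eye(A.shape[1])) - P_R
    return P_R, P_N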
Example 2
    def next(self):
        if self.dimension == len(self):
            raise StopIteration

        while True:
            self.emf_counter += 1

            g_plus, g_minus, coeffs = self.GetSolution()
            self.ExcludeSolutionVector(g_plus, g_minus,
                                       'avoid_%d_plus' % self.emf_counter)
            self.ExcludeSolutionVector(g_minus, g_plus,
                                       'avoid_%d_minus' % self.emf_counter)

            nonzero_indices = np.nonzero(
                g_plus > 0.5)[0].tolist() + np.nonzero(
                    g_minus > 0.5)[0].tolist()
            self.K[self.dimension, nonzero_indices] = coeffs[nonzero_indices]

            if LinearRegression.MatrixRank(self.K) < self.dimension + 1:
                self.K[self.dimension, :] = 0
            else:
                # normalize the kernel vector so that it will have nice coefficients
                g = min(abs(coeffs[nonzero_indices]))
                self.K[self.dimension, :] /= g
                #if sum(self.K[:, self.dimension] < 0.0):
                #    self.K[:, self.dimension] *= -1.0

                v = self.K[self.dimension, :]
                self.AddLinearConstraint(v)
                self.dimension += 1
                return v
Example 3
class SparseKernel(object):
    """
        Finds a sparse representation of the kernel matrix, using MILP
        to iterate the Fundamental Modes of the matrix.
    
        Input:
            a (n x m) matrix A, whose rank is r.
        Return:
            a (m x m-r) matrix K that will span the kernel of A, i.e.:
            span(K) = {x | Ax = 0}
    """
    class LinearProgrammingException(Exception):
        pass

    def __init__(self, A):
        self.upper_bound = 1000
        self.eps = 1e-10
        self.dimension = 0

        try:
            self.cpl = Cplex()
        except (NameError, CplexSolverError):  # Cplex not importable or failed to start
            raise CplexNotInstalledError()

        self.cpl.set_problem_name('find_kernel')
        self.cpl.set_log_stream(None)
        self.cpl.set_results_stream(None)
        self.cpl.set_warning_stream(None)

        self.n_variables = A.shape[1]
        self.CreateAllVariables()
        self.constraint_counter = 0
        for r in xrange(A.shape[0]):
            self.AddLinearConstraint(A[r, :])
        self.kernel_rank = self.n_variables - LinearRegression.MatrixRank(A)
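The constructor above only builds the MILP; the kernel rows themselves are produced one at a time by the next() method shown in Example 2, and Example 15 below obtains the full kernel via SparseKernel(S_red).Solve(). Solve() is not shown on this page; a plausible wrapper, hypothetical but consistent with how next() fills the rows of self.K, could look like this:

    def Solve(self):
        # Hypothetical sketch: allocate the kernel matrix and keep calling
        # next() (Example 2) until one row per kernel dimension has been found.
        self.K = np.matrix(np.zeros((self.kernel_rank, self.n_variables)))
        while self.dimension < self.kernel_rank:
            self.next()  # fills row self.dimension of self.K and advances it
        return self.K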
Example 4
    def AnalyzeResiduals(self):
        GS = np.dot(self.G.T, self.S)
        # Write the analysis of residuals:
        # I am not sure if this analysis should be done before "uniquing"
        # the columns of S or after. The observation residual is much smaller
        # in the latter case, since intra-reaction noise is averaged.
        _P_R1, P_N1 = LinearRegression.RowProjection(self.S)
        _P_R2, P_N2 = LinearRegression.RowProjection(GS)

        r_obs = np.linalg.norm(np.dot(self.gibbs_values, P_N1))
        r_est = np.linalg.norm(np.dot(self.gibbs_values, P_N2 - P_N1))
        r_tot = np.linalg.norm(np.dot(self.gibbs_values, P_N2))

        self.html_writer.write('</br><b>Analysis of residuals:</b>\n')
        self.html_writer.insert_toggle(start_here=True)
        residual_text = [
            'r<sub>observation</sub> = %.2f kJ/mol' % r_obs,
            'r<sub>estimation</sub> = %.2f kJ/mol' % r_est,
            'r<sub>total</sub> = %.2f kJ/mol' % r_tot
        ]
        self.html_writer.write_ul(residual_text)
        self.html_writer.div_end()
Example 5
def TestGroupMatrix():
    group_filename = '../data/thermodynamics/hatzimanikatis_groups.csv'
    all_group_names = []
    sparse_matrix = []
    dG_vector = []
    line_no = 0
    for row in csv.DictReader(open(group_filename)):
        line_no += 1
        if row['est_dG'] == "None":
            continue
        dG_vector.append(float(row['est_dG']))

        sparse_groupvec = []
        if row['groups'] != "":
            for token in row['groups'].split(' | '):
                try:
                    [group_name, coeff] = token.split(' : ', 1)
                except ValueError:
                    raise Exception("cannot parse this token (line %d): %s\n" %
                                    (line_no, token))
                coeff = float(coeff)
                if group_name not in all_group_names:
                    all_group_names.append(group_name)
                group_index = all_group_names.index(group_name)
                sparse_groupvec.append((group_index, coeff))
        sparse_matrix.append(sparse_groupvec)

    full_matrix = np.zeros((len(sparse_matrix), len(all_group_names)))
    for i in range(len(sparse_matrix)):
        for j, coeff in sparse_matrix[i]:
            full_matrix[i, j] = coeff
    dG_vector = np.array(dG_vector, ndmin=2).T

    #print full_matrix.shape
    #print LinearRegression.MatrixRank(full_matrix)
    #print dG_vector.shape

    #augmented_matrix = np.hstack([full_matrix, dG_vector])
    #_U, s, _V = np.linalg.svd(augmented_matrix, full_matrices=False)
    #print sorted(s)

    contributions, _K = LinearRegression.LeastSquares(full_matrix, dG_vector)
    for i, group_name in enumerate(all_group_names):
        print "%s,%.3f" % (group_name, contributions[i, 0])

    pyplot.plot(dG_vector, dG_vector - np.dot(full_matrix, contributions), '.')
    pyplot.show()
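TestGroupMatrix assumes that each row of hatzimanikatis_groups.csv carries an est_dG column and a groups column encoding the group vector as 'name : coefficient' tokens joined by ' | '; rows whose est_dG is the string "None" are skipped. A made-up row in that format (placeholder group names and numbers, not taken from the real file):

est_dG,groups
-137.5,-OH : 1 | -CH2- : 2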
Example 6
def stoichiometric_matrix2html(html_writer, A, cids, eps=1e-10):
    """
        Print a table in HTML format.
        A is a stoichiometric matrix where each row is a reaction and 
        each column is a compound, corresponding in position to the list "cids".
    """
    dict_list = []
    for i in xrange(A.shape[0]):
        sparse_reaction = dict([(cids[j], A[i, j]) for j in xrange(A.shape[1])
                                if abs(A[i, j]) > eps])
        r = Reaction("reaction%d" % i, sparse_reaction=sparse_reaction)
        dict_list.append({'reaction': r.to_hypertext()})
    html_writer.write_ul([
        '%d rows' % A.shape[0],
        '%d columns' % A.shape[1],
        '%d rank' % LinearRegression.MatrixRank(A)
    ])
    html_writer.write_table(dict_list, headers=['#', 'reaction'])
Example 7
    def GetTransfromedReactionEnergies(self, S, cids, pH=None, I=None, pMg=None, T=None, conc=1):
        """
            Find the set of reaction Gibbs energies that are completely
            consistent with thermo[0], and also close to the energies provided
            by thermo[1].
            To find this solution, we project the vector of Gibbs energies
            obtained using thermo[1] onto the subspace spanned by the columns
            of the stoichiometric matrix (where some of the values are fixed
            according to thermo[0]).
        """
        
        # first try to use thermo[0] to estimate all reaction energies.
        # note that this calculation already adds the effect of concentrations to dG_r.
        dGc_r0 = self.thermo[0].GetTransfromedReactionEnergies(S, cids, pH=pH, I=I, pMg=pMg, T=T, conc=conc)
        if np.all(np.isfinite(dGc_r0)):
            return dGc_r0
        
        # otherwise, estimate the reaction energies using thermo[1]. If thermo[1]
        # cannot estimate all the reactions either, fall back to thermo[0]'s
        # (partial) estimates. Note that here we leave out the effect of the
        # concentrations on dG_r, because we are going to use standard formation
        # energies from thermo[0] and fill the gaps using thermo[1].
        dG0_r1 = self.thermo[1].GetTransfromedReactionEnergies(S, cids, pH=pH, I=I, pMg=pMg, T=T)
        if np.isnan(dG0_r1).any():
            return dGc_r0

        dG0_f0 = self.thermo[0].GetTransformedFormationEnergies(cids, pH=pH, I=I, pMg=pMg, T=T)
        
        finite_cols = list(np.where(np.isfinite(dG0_f0))[1].flat)
        nan_cols = list(np.where(np.isnan(dG0_f0))[1].flat)
        fixed_dG0_r = dG0_f0[:, finite_cols] * S[finite_cols, :]

        P_R, P_N = LinearRegression.RowProjection(S[nan_cols, :])
        dG0_r = dG0_r1 * P_R + fixed_dG0_r * P_N
        
        # now add the effect of the concentrations
        if conc != 1:
            return dG0_r + AddConcentrationsToReactionEnergies(S, cids, T, conc)
        else:
            return dG0_r
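Example 7 splits reaction space into the directions affected by compounds whose formation energy is unknown (filled in from thermo[1]) and the directions fully determined by the known formation energies. A self-contained NumPy restatement of that gap-filling step (plain arrays; the names, shapes and the RowProjection convention are assumptions for illustration, not the pygibbs API):

import numpy as np

def fill_reaction_energy_gaps(S, dG0_f0, dG0_r1):
    # S: compounds x reactions; dG0_f0: formation energies with NaN for
    # unknown compounds; dG0_r1: fallback reaction energies (row vectors).
    finite = np.isfinite(dG0_f0)
    # part of every reaction energy that is fixed by the known formation energies
    fixed_dG0_r = np.dot(dG0_f0[finite].reshape(1, -1), S[finite, :])
    # project reaction space onto the directions spanned by the unknown-compound
    # rows (P_R) and onto their complement (P_N)
    S_nan = S[~finite, :]
    P_R = np.dot(np.linalg.pinv(S_nan), S_nan)
    P_N = np.eye(S.shape[1]) - P_R
    # directions touched by unknown compounds come from the fallback estimator,
    # the rest comes directly from the known formation energies
    return np.dot(dG0_r1.reshape(1, -1), P_R) + np.dot(fixed_dG0_r, P_N)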
Example 8
    def AnalyzeTrainingSet(self, skip_formations=True):
        n_obs = self.group_matrix.shape[1]
        rowdicts = []
        fit_results = np.dot(self.group_contributions, self.group_matrix)
        residuals = fit_results - self.obs_values

        if self.transformed:
            sym = symbol_d_G0_prime
        else:
            sym = symbol_d_G0
        for i in xrange(n_obs):
            if self.obs_types[i] in [
                    KeggObservation.TYPE_ACID_BASE, KeggObservation.TYPE_MG,
                    KeggObservation.TYPE_REDOX
            ]:
                continue
            if skip_formations and self.obs_types[
                    i] == KeggObservation.TYPE_FORMATION:
                continue

            rowdict = {'Observation': self.obs_ids[i]}
            rowdict[sym + ' (obs)'] = self.obs_values[0, i]
            rowdict[sym + ' (fit)'] = fit_results[0, i]
            rowdict[sym + ' (res)'] = residuals[0, i]
            rowdict['LOO ' + sym + ' (fit)'] = np.nan
            rowdict['LOO ' + sym + ' (res)'] = np.nan
            rowdict['sortkey'] = 0
            rowdicts.append(rowdict)
            logging.info('Fit Error = %.1f' % residuals[0, i])

            # leave out the row corresponding with observation 'i'
            logging.info('Cross validation, leaving-one-out: ' +
                         self.obs_ids[i])
            subset = range(n_obs)
            subset.pop(i)
            loo_group_contributions, loo_nullspace = LinearRegression.LeastSquares(
                self.group_matrix[:, subset], self.obs_values[:, subset])

            if loo_nullspace.shape[1] > self.group_nullspace.shape[1]:
                logging.warning(
                    'observation %d is not linearly dependent on the other observations'
                    % i)
                continue
            rowdict['LOO ' + sym + ' (fit)'] = float(
                np.dot(loo_group_contributions, self.group_matrix[:, i]))
            rowdict['LOO ' + sym + ' (res)'] = \
                rowdict['LOO ' + sym + ' (fit)'] - self.obs_values[0, i]
            rowdict['sortkey'] = abs(rowdict['LOO ' + sym + ' (res)'])
            logging.info('LOO Error = %.1f' % rowdict['LOO ' + sym + ' (res)'])

        logging.info(
            "writing the table of estimation errors for each observation")
        self.html_writer.write('</br><b>Cross validation table</b>')
        self.html_writer.insert_toggle(start_here=True)
        self.html_writer.write('<font size="1">\n')
        obs_vec = np.matrix([row[sym + ' (obs)'] for row in rowdicts])
        resid_vec = np.matrix([row[sym + ' (res)'] for row in rowdicts])
        rmse = rms_flat(resid_vec.flat)

        loo_resid_vec = np.matrix(
            [row['LOO ' + sym + ' (res)'] for row in rowdicts])
        loo_rmse = rms_flat(loo_resid_vec[np.isfinite(loo_resid_vec)].flat)

        self.html_writer.write_ul([
            'fit RMSE = %.1f [kJ/mol]' % rmse,
            'leave-one-out RMSE = %.1f [kJ/mol]' % loo_rmse
        ])
        logging.info("Goodness of fit: RMSE = %.1f [kJ/mol]" % rmse)
        logging.info("Leave-one-out test: RMSE = %.1f [kJ/mol]" % loo_rmse)

        headers = [
            'Observation', sym + ' (obs)', sym + ' (fit)', sym + ' (res)',
            'LOO ' + sym + ' (fit)', 'LOO ' + sym + ' (res)'
        ]
        rowdicts.sort(key=lambda (x): x['sortkey'], reverse=True)
        self.html_writer.write_table(rowdicts, headers, decimal=1)
        self.html_writer.write('</font>\n')
        self.html_writer.div_end()

        self.html_writer.write('</br><b>Cross-validation figure</b>')
        self.html_writer.insert_toggle(start_here=True)

        obs_vs_err_fig = plt.figure(figsize=[6.0, 6.0], dpi=100)
        plt.plot(obs_vec.T, resid_vec.T, '.')
        plt.xlabel('Observation')
        plt.ylabel('Estimated (PGC) Residuals')
        plt.hold(True)
        for row in rowdicts:
            if abs(row[sym + ' (res)']) > 2 * rmse:
                plt.text(row[sym + ' (obs)'],
                         row[sym + ' (res)'],
                         row['Observation'],
                         fontsize=4,
                         figure=obs_vs_err_fig)
        plt.title('Observed vs. Fitted (PGC) Residuals', figure=obs_vs_err_fig)
        self.html_writer.embed_matplotlib_figure(obs_vs_err_fig)
        self.html_writer.div_end()
Example 9
    def WriteRegressionReport(self, T=default_T, pH=default_pH):
        rowdicts = []
        for i in xrange(self.group_matrix.shape[1]):
            groupvec = GroupVector(self.groups_data, self.group_matrix[:, i])
            rowdict = {'#': i, 'ID': self.obs_ids[i]}
            rowdict[self.obs_collection.
                    gibbs_symbol] = '%.1f' % self.obs_values[0, i]
            rowdict['Group Vector'] = str(groupvec)
            rowdicts.append(rowdict)

        self.html_writer.write('</br><b>Regression report</b>')
        self.html_writer.insert_toggle(start_here=True)
        self.html_writer.write('<font size="1">\n')
        self.html_writer.write_ul([
            'observations: %d' % self.group_matrix.shape[1],
            'groups: %d' % self.group_matrix.shape[0],
            'rank: %d' % LinearRegression.MatrixRank(self.group_matrix)
        ])
        self.html_writer.write_table(rowdicts,
                                     headers=[
                                         '#', 'ID', 'Group Vector',
                                         self.obs_collection.gibbs_symbol
                                     ])
        self.html_writer.write('</font>\n')
        self.html_writer.div_end()

        self.html_writer.write('</br><b>Group Contributions</b>\n')
        div_id = self.html_writer.insert_toggle()
        self.html_writer.div_start(div_id)
        self.html_writer.write('</br><font size="1">\n')
        rowdicts = []
        if self.transformed:
            headers = [
                "#", "Group Name", self.obs_collection.gibbs_symbol,
                "acid-base", "formation", "reaction"
            ]
        else:
            headers = [
                "#", "Group Name", "nH", "charge", "nMg",
                self.obs_collection.gibbs_symbol, "acid-base", "formation",
                "reaction"
            ]
        group_names = self.groups_data.GetGroupNames()
        for j, dG0_gr in enumerate(self.group_contributions.flat):
            obs_lists_dict = defaultdict(list)
            for k in self.group_matrix[j, :].nonzero()[1].flat:
                obs_lists_dict[self.obs_types[k]].append(self.obs_ids[k])
            d = {
                "#": "%d" % j,
                "Group Name": group_names[j],
                self.obs_collection.gibbs_symbol: "%.1f" % dG0_gr
            }
            for k, v in obs_lists_dict.iteritems():
                d[k] = ' | '.join(v)
            if not self.transformed:
                group = self.groups_data.all_groups[j]
                d["nH"] = group.hydrogens
                d["charge"] = group.charge
                d["nMg"] = group.nMg
            rowdicts.append(d)
        self.html_writer.write_table(rowdicts, headers)
        self.html_writer.write('</font>\n')
        self.html_writer.div_end()
Example 10
    def Train(self):
        logging.info("Calculating the linear regression data")
        cids, S, b, anchored = self.obs_collection.GetStoichiometry()

        anchored_cols = list(np.where(anchored == 1)[1].flat)
        # now remove anchored data from S and leave only the data which will be
        # used for calculating the group contributions
        g, P_C, P_L = LinearRegression.LeastSquaresProjection(
            S[:, anchored_cols], b[:, anchored_cols])
        self.anchored_cids = cids
        self.anchored_contributions = g * P_C
        self.anchored_P_L = P_L
        self.anchored_P_L[abs(self.anchored_P_L) <= self.epsilon] = 0

        b -= self.anchored_contributions * S
        S = self.anchored_P_L * S

        # set epsilon-small values to absolute 0
        S[np.where(abs(S) <= self.epsilon)] = 0

        # remove zero rows (compounds) from S
        used_cid_indices = set(np.nonzero(np.sum(abs(S), 1))[0].flat)
        for i_cid, cid in enumerate(cids):
            if self.cid2groupvec[cid] is None:
                used_cid_indices.difference_update([i_cid])
                for i_obs in np.nonzero(S[i_cid, :])[1].flat:
                    logging.warning(
                        "%s is removed because C%05d has no group vector, "
                        "but is still part of the final stoichiometric matrix"
                        %
                        (self.obs_collection.observations[i_obs].obs_id, cid))
                    S[:, i_obs] = 0

        used_cid_indices = sorted(used_cid_indices)
        S = S[used_cid_indices, :]

        n_groups = len(self.groups_data.GetGroupNames())  # number of groups
        G = np.matrix(np.zeros((len(used_cid_indices), n_groups)))
        for i, i_cid in enumerate(used_cid_indices):
            G[i, :] = self.cid2groupvec[cids[i_cid]].Flatten()

        GS = G.T * S

        # 'unique' the columns of GS. For each set of columns that is merged,
        # the observed value of the new column is the average of the corresponding values.
        unique_GS, col_mapping = LinearRegression.ColumnUnique(
            GS, remove_zero=True)
        unique_b = np.matrix(np.zeros((1, unique_GS.shape[1])))
        unique_obs_types = []
        unique_obs_ids = []
        for i, old_indices in sorted(col_mapping.iteritems()):
            unique_b[0, i] = np.mean(b[0, old_indices])
            obs_list = [
                self.obs_collection.observations[j] for j in old_indices
            ]
            unique_obs_types.append(
                obs_list[0].obs_type
            )  # take the type of the first one (not perfect...)
            unique_obs_ids.append(', '.join([obs.obs_id for obs in obs_list]))

        self.group_matrix = unique_GS
        self.obs_values = unique_b
        self.obs_ids = unique_obs_ids
        self.obs_types = unique_obs_types

        logging.info("Performing linear regression")
        self.group_contributions, self.group_nullspace = \
            LinearRegression.LeastSquares(self.group_matrix, self.obs_values)

        logging.info("Storing the group contribution data in the database")
        self.SaveContributionsToDB()

    def _GetContributionData(self, obs_S, obs_cids, obs_b, obs_anchored):
        assert obs_S.shape[0] == len(obs_cids)
        assert obs_S.shape[1] == obs_b.shape[1]
        assert obs_S.shape[1] == obs_anchored.shape[1]

        # (1)
        # use the anchored reactions to directly estimate the part of est_S
        # which is in their column-span, and normalize that part out from all matrices.
        anchored_cols = list(obs_anchored.nonzero()[1].flat)
        if anchored_cols:
            g_anch, P_C_anch, P_L_anch = LinearRegression.LeastSquaresProjection(
                obs_S[:, anchored_cols], obs_b[:, anchored_cols])
            obs_b -= g_anch * P_C_anch * obs_S  # subtract the contribution of anchored reactions to obs_b
            obs_S = P_L_anch * obs_S  # project obs_S on the residual space
        else:
            g_anch = np.matrix(np.zeros((1, obs_S.shape[0])))
            P_C_anch = np.matrix(np.zeros((obs_S.shape[0], obs_S.shape[0])))
            P_L_anch = np.matrix(np.eye(obs_S.shape[0]))

        # (2)
        # calculate the reactant contributions from obs_S and obs_b, and use that
        # to estimate the part which is in the column-space of NIST.
        g_prc, P_C_prc, P_L_prc = LinearRegression.LeastSquaresProjection(
            obs_S, obs_b)

        # (3)
        # calculate the group contributions from obs_S and obs_b. Note that
        # some reactions involve compounds that don't have group vectors, and
        # therefore are discarded from this step.
        G, has_groupvec = self._GenerateGroupMatrix(obs_cids)
        bad_compounds = list(np.where(has_groupvec == False)[0].flat)
        reactions_with_groupvec = []
        for i in xrange(obs_S.shape[1]):
            if np.all(abs(obs_S[bad_compounds, i]) < self.epsilon):
                reactions_with_groupvec.append(i)
        obs_GS = G.T * obs_S[:, reactions_with_groupvec]
        g_pgc, P_C_pgc, P_L_pgc = LinearRegression.LeastSquaresProjection(
            obs_GS, obs_b[:, reactions_with_groupvec])

        # calculate the total contributions
        result_dict = {}
        result_dict['names'] = ['anchors', 'reactants', 'groups']
        result_dict['contributions'] = [g_anch, g_prc, g_pgc * P_C_pgc * G.T]
        result_dict['group_contributions'] = g_pgc
        result_dict['column_spaces'] = [P_C_anch, P_C_prc, P_C_pgc]
        result_dict['null_spaces'] = [P_L_anch, P_L_prc, P_L_pgc]
        result_dict['projections'] = [
            P_C_anch, P_C_prc * P_L_anch, P_L_prc * P_L_anch
        ]

        result_dict['total_contributions'] = np.matrix(
            np.zeros((1, len(obs_cids))))
        for g, S in zip(result_dict['contributions'],
                        result_dict['projections']):
            result_dict['total_contributions'] += g * S

        # conservation laws that check if we rely on compounds that have no groupvector
        P_L_bad = (P_L_prc * P_L_anch)[bad_compounds, :]

        # projection of reactions to the residual groupvector space
        G_resid = P_L_prc * P_L_anch * G

        result_dict['bad_conservations'] = P_L_bad
        result_dict['pgc_conservations'] = P_L_pgc
        result_dict['pgc_groupvectors'] = G_resid
        result_dict['conservations'] = np.vstack(
            [P_L_bad, (G_resid * P_L_pgc).T])

        return result_dict

    def LoadData(self, FromDatabase=False):
        if FromDatabase and self.db.DoesTableExist(
                self.STOICHIOMETRIC_TABLE_NAME):
            logging.info("Reading group matrices from database")
            self.S = self.db.LoadSparseNumpyMatrix(
                self.STOICHIOMETRIC_TABLE_NAME)
            self.G = self.db.LoadSparseNumpyMatrix(self.GROUP_TABLE_NAME)
            self.b = self.db.LoadNumpyMatrix(self.GIBBS_ENERGY_TABLE_NAME).T
            self.anchored = self.db.LoadNumpyMatrix(self.ANCHORED_TABLE_NAME).T
            self.has_groupvec = np.sum(self.G, 1) > 0
            self.cids = []
            for rowdict in self.db.DictReader(self.COMPOUND_TABLE_NAME):
                self.cids.append(int(rowdict['cid']))
            self.obs_ids = []
            self.obs_types = []
            self.obs_urls = []
            for rowdict in self.db.DictReader(
                    self.UNIQUE_OBSERVATION_TABLE_NAME):
                self.obs_ids.append(rowdict['id'])
                self.obs_types.append(rowdict['type'])
                self.obs_urls.append(rowdict['url'])
        else:
            logging.info("Calculating group matrices")
            self.cids, S, b, anchored = self.obs_collection.GetStoichiometry()
            if self.CollapseReactions:
                self.S, col_mapping = LinearRegression.ColumnUnique(S)
                self.b = np.matrix(
                    np.zeros((1, len(col_mapping)), dtype='float'))
                self.anchored = np.matrix(
                    np.zeros((1, len(col_mapping)), dtype='int'))
                self.obs_ids = []
                self.obs_types = []
                self.obs_urls = []
                for i, col_indices in col_mapping.iteritems():
                    self.b[0, i] = np.mean(b[0, col_indices])
                    self.anchored[0, i] = anchored[0, col_indices].max()
                    obs_list = [
                        self.obs_collection.observations[j]
                        for j in col_indices
                    ]
                    self.obs_ids.append(', '.join(
                        [obs.obs_id for obs in obs_list]))
                    self.obs_types.append(', '.join(
                        set([obs.obs_type for obs in obs_list])))
                    self.obs_urls.append(', '.join(
                        [obs.url for obs in obs_list]))
            else:
                self.S = S
                self.b = b
                self.anchored = anchored
                self.obs_ids = [
                    obs.obs_id for obs in self.obs_collection.observations
                ]
                self.obs_types = [
                    obs.obs_type for obs in self.obs_collection.observations
                ]
                self.obs_urls = [
                    obs.url for obs in self.obs_collection.observations
                ]

            self.G, self.has_groupvec = self._GenerateGroupMatrix(self.cids)

            # save everything to the database
            self.db.SaveSparseNumpyMatrix(self.STOICHIOMETRIC_TABLE_NAME,
                                          self.S)
            self.db.SaveSparseNumpyMatrix(self.GROUP_TABLE_NAME, self.G)
            self.db.SaveNumpyMatrix(self.GIBBS_ENERGY_TABLE_NAME, self.b.T)
            self.db.SaveNumpyMatrix(self.ANCHORED_TABLE_NAME, self.anchored.T)
            self.db.CreateTable(self.COMPOUND_TABLE_NAME, 'cid INT, name TEXT')
            for cid in self.cids:
                self.db.Insert(self.COMPOUND_TABLE_NAME,
                               [cid, self.kegg.cid2name(cid)])
            self.db.CreateTable(self.UNIQUE_OBSERVATION_TABLE_NAME,
                                'row INT, id TEXT, type TEXT, url TEXT')
            for i in xrange(len(self.obs_ids)):
                self.db.Insert(
                    self.UNIQUE_OBSERVATION_TABLE_NAME,
                    [i, self.obs_ids[i], self.obs_types[i], self.obs_urls[i]])
            self.db.Commit()
Example 13
    def ReverseTransform(self, cid2nH_nMg=None):
        """
            Performs the reverse Legendre transform on all the data in NIST where
            it is possible, i.e. where we have pKa data.
            
            Arguments:
                cid2nH_nMg - a dictionary mapping each compound ID to its chosen
                             pseudoisomer (described by nH and nMg).
        """
        logging.info("Reverse transforming the NIST data")
        nist_rows = self.nist.SelectRowsFromNist()
        logging.info("Selected %d NIST rows out of %d" %
                     (len(nist_rows), len(self.nist.data)))
        
        data = self.GetDissociation().ReverseTransformNistRows(
                                nist_rows, cid2nH_nMg=cid2nH_nMg)
                
        nist_rows_final = data['nist_rows']
        stoichiometric_matrix = data['S']
        cids_to_estimate = data['cids_to_estimate']
        n_cols = stoichiometric_matrix.shape[1]        
        
        logging.info("Only %d out of %d NIST measurements can be used" %
                     (n_cols, len(nist_rows)))

        # squeeze the regression matrix by leaving only unique columns (reactions)
        unique_cols_S, col_mapping = LinearRegression.ColumnUnique(stoichiometric_matrix)
        logging.info("There are %d unique reactions" % len(col_mapping))
        unique_rids = set([nist_row.reaction.rid for nist_row in nist_rows
                            if nist_row.reaction.rid is not None])
        logging.info("Out of which %d have KEGG reaction IDs" % len(unique_rids))
        
        # for every unique column, calculate the average dG0_r of all the columns that
        # are the same reaction
        
        # full_data_mat will contain these rows: dG0, dG0_tag, dG0 - E[dG0],
        # dG0_tag - E[dG0_tag], N
        # the averages are over the equivalence set of each reaction (i.e. the
        # average dG of all the rows in NIST with that same reaction).
        # 'N' is the index of the unique reaction (i.e. the ID of the equivalence set)
        full_data_mat = np.matrix(np.zeros((5, n_cols)))
        full_data_mat[0, :] = np.matrix(data['dG0_r'])
        full_data_mat[1, :] = np.matrix(data['dG0_r_tag'])
        
        # unique_data_mat will contain these rows: E[dG0], E[dG0_tag],
        # std(dG0), std(dG0_tag), no. rows
        # there is exactly one column for each equivalence set (i.e. unique reaction);
        # 'no. rows' holds the number of times this unique reaction appears in NIST
        unique_data_mat = np.matrix(np.zeros((5, len(col_mapping))))
        unique_sparse_reactions = []
        unique_nist_row_representatives = []
        for i, col_indices in col_mapping.iteritems():
            col_vector = unique_cols_S[:, i]
            
            # convert column i of unique_cols_S to a sparse reaction
            sparse = {}
            for j in col_vector.nonzero()[0].flat:
                sparse[cids_to_estimate[j]] = unique_cols_S[j, i]
            reaction = Reaction(names=['NIST%03d' % i], sparse_reaction=sparse)
            unique_sparse_reactions.append(reaction)

            # keep one representative NIST row for this unique reaction (column i)
            unique_nist_row_representatives.append(nist_rows_final[col_indices[0]])
            
            # take the mean and std of the dG0_r of these rows
            sub_data_mat  = full_data_mat[0:2, col_indices]
            unique_data_mat[0:2, i] = np.mean(sub_data_mat, 1)
            unique_data_mat[2:4, i] = np.std(sub_data_mat, 1)
            unique_data_mat[4, i]   = sub_data_mat.shape[1]
            full_data_mat[4, col_indices] = i
            full_data_mat[2:4, col_indices] = sub_data_mat
            for k in col_indices:
                # subtract the mean from each row with this reaction
                full_data_mat[2:4, k] -= unique_data_mat[0:2, i]
                    
        # write a table that lists the variances of each unique reaction
        # before and after the reverse transform
        self.WriteUniqueReactionReport(unique_sparse_reactions,
                                       unique_nist_row_representatives,
                                       unique_data_mat, full_data_mat)
        
        return unique_cols_S, unique_data_mat[0:1, :], cids_to_estimate
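The cid2nH_nMg argument documented above maps each KEGG compound ID to the (nH, nMg) pair of the pseudoisomer chosen for it. A usage sketch (the two entries are for illustration only; the nH values follow the fixed_cids table in Example 15, and nMg = 0 is an assumption):

cid2nH_nMg = {1: (2, 0),   # C00001 (H2O): pseudoisomer with 2 hydrogens, no Mg
              9: (1, 0)}   # C00009 (phosphate): pseudoisomer with 1 hydrogen, no Mg
unique_S, dG0_r, cids_to_estimate = nist_regression.ReverseTransform(
    cid2nH_nMg=cid2nH_nMg)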
Example 14
    def LinearRegression(self, S, obs_dG0_r, cids, cid2nH_nMg,
                         prior_thermodynamics=None):
        logging.info("Regression matrix is %d x %d" % \
                     (S.shape[0], S.shape[1]))

        cid2ref = dict((cid, 'PRC') for cid in cids)
        if prior_thermodynamics:
            # Normalize the contribution of compounds which have formation energies
            # given in the prior. Perform the regression only on the residuals
            # remaining after the normalization (note that the stoichiometric
            # matrix must also be trimmed).
            cid_index_prior = []
            dG0_prior = []
            for i, cid in enumerate(cids):
                nH, nMg = cid2nH_nMg[cid]
                try:
                    pmap_prior = prior_thermodynamics.cid2PseudoisomerMap(cid)
                except MissingCompoundFormationEnergy:
                    continue
                for p_nH, p_z, p_nMg, dG0 in pmap_prior.ToMatrix():
                    if nH == p_nH and p_nMg == nMg:
                        cid_index_prior.append(i)
                        dG0_prior.append(dG0)
                        cid2ref[cid] = pmap_prior.GetRef(p_nH, p_z, p_nMg)
                        break
            
            S_prior = np.matrix(np.zeros((len(cids), len(cid_index_prior))))
            for j, i in enumerate(cid_index_prior):
                S_prior[i, j] = 1
            dG0_prior = np.matrix(dG0_prior)
            g, _ = LinearRegression.LeastSquares(S_prior, dG0_prior)
            P_C, P_L = LinearRegression.ColumnProjection(S_prior)
            prior_dG0_r = g * P_C * S
            new_obs_dG0_r = obs_dG0_r - prior_dG0_r
            new_S = P_L * S
            
            # Find all reactions in new_S which are completely zero. This means that
            # they are completely determined by the prior.
            zero_cols = (abs(new_S).sum(0) < 1e-10).nonzero()[1]
            rowdicts = []
            for j in zero_cols.flat:
                rowdict = {}
                rowdict['reaction'] = NistRegression.row2hypertext(S[:, j], cids)
                rowdict['|error|'] = abs(new_obs_dG0_r[0, j])
                rowdict['error'] = new_obs_dG0_r[0, j]
                rowdict['NIST'] = obs_dG0_r[0, j]
                rowdict['prior'] = prior_dG0_r[0, j]
                rowdicts.append(rowdict)
            rowdicts.sort(key=lambda x:x['|error|'], reverse=True)
            self.html_writer.write('</br><b>Alberty Errors</b>\n')
            self.html_writer.write_table(rowdicts,
                                         headers=['reaction', 'error', 'NIST', 'prior'],
                                         decimal=1)
            
            est_dG0_f, _ = LinearRegression.LeastSquares(new_S, new_obs_dG0_r)
            for j, i in enumerate(cid_index_prior):
                est_dG0_f[0, i] = dG0_prior[0, j]
        else:
            est_dG0_f, _ = LinearRegression.LeastSquares(S, obs_dG0_r)
        
        est_dG0_r = est_dG0_f * S
        residuals = est_dG0_r - obs_dG0_r
        rmse = rms_flat(residuals.flat)
        logging.info("Regression results for reverse transformed data:")
        logging.info("N = %d, RMSE = %.1f" % (S.shape[1], rmse))
       
        self.html_writer.write('<p>RMSE = %.1f [kJ/mol]</p>\n' % rmse)
        rowdicts = []
        headers = ['#', 'Reaction',
                   symbol_dr_G0 + ' (obs)',
                   symbol_dr_G0 + ' (fit)',
                   symbol_dr_G0 + ' (res)']
        for i in xrange(S.shape[1]):
            rowdict = {}
            rowdict['Reaction'] = NistRegression.row2hypertext(S[:, i], cids)
            rowdict[symbol_dr_G0 + ' (obs)'] = obs_dG0_r[0, i]
            rowdict[symbol_dr_G0 + ' (fit)'] = est_dG0_r[0, i]
            rowdict[symbol_dr_G0 + ' (res)'] = residuals[0, i]
            rowdicts.append(rowdict)
        rowdicts.sort(key=lambda x:abs(x[symbol_dr_G0 + ' (res)']), reverse=True)
        self.html_writer.write_table(rowdicts, headers, decimal=1)

        # copy the solution into the diss_tables of all the compounds,
        # and then generate their PseudoisomerMaps.
        for i, cid in enumerate(cids):
            nH, nMg = cid2nH_nMg[cid]
            diss_table = self.GetDissociation().GetDissociationTable(cid)
            z = diss_table.min_charge + (nH - diss_table.min_nH)
            diss_table.SetFormationEnergyByNumHydrogens(est_dG0_f[0, i], nH, nMg)
            pmap = diss_table.GetPseudoisomerMap(nH, nMg)
            pmap.SetRef(nH, z, nMg, cid2ref[cid])
            self.cid2pmap_dict[cid] = pmap
Example 15
def main():
    kegg = Kegg.getInstance()
    prefix = '../res/prc_'

    fixed_cids = {}  # a dictionary from CID to pairs of (nH, dG0)

    # Alberty formation energies directly measured, linearly independent:
    fixed_cids[1] = (2, -237.19)  # H2O
    fixed_cids[9] = (1, -1096.1)  # HPO3(-2)
    fixed_cids[14] = (4, -79.31)  # NH4(+1)
    fixed_cids[59] = (0, -744.53)  # SO4(-2)
    fixed_cids[288] = (1, -586.77)  # HCO3(-1)

    # Alberty zeros:
    fixed_cids[3] = (26, 0.0)  # NAD(ox)
    fixed_cids[10] = (32, 0.0)  # CoA
    fixed_cids[127] = (30, 0.0)  # glutathione(ox)
    fixed_cids[376] = (28, 0.0)  # retinal(ox)

    # Directly measured values
    fixed_cids[4] = (27, 22.65)  # NAD(red) -- relative to NAD(ox)
    fixed_cids[212] = (13, -194.5)  # adenosine
    #fixed_cids[294] = (12, -409.2) # inosine - linearly dependent on other 'anchors'

    # Alberty zeros which are not in NIST:
    #fixed_cids[524] = ( 0, 0.0) # cytochrome c(ox)
    #fixed_cids[16]  = (31, 0.0) # FAD(ox)
    #fixed_cids[139] = ( 0, 0.0) # ferredoxin(ox)
    #fixed_cids[61]  = (19, 0.0) # FMN(ox)
    #fixed_cids[343] = ( 0, 0.0) # thioredoxin(ox)
    #fixed_cids[399] = (90, 0.0) # ubiquinone(ox)

    public_db = SqliteDatabase("../data/public_data.sqlite")
    alberty = PsuedoisomerTableThermodynamics.FromDatabase(
        public_db, 'alberty_pseudoisomers', label=None, name='Alberty')
    alberty_cid2dG0 = {}
    alberty_cid2nH = {}
    for cid in alberty.get_all_cids():
        pmap = alberty.cid2PseudoisomerMap(cid)
        dG0, _dG0_tag, nH, _z, _nMg = pmap.GetMostAbundantPseudoisomer(
            pH=default_pH, I=default_I, pMg=default_pMg, T=default_T)
        alberty_cid2nH[cid] = nH
        alberty_cid2dG0[cid] = dG0

    if not os.path.exists(prefix + 'S.txt'):
        db = SqliteDatabase("../res/gibbs.sqlite")
        nist_regression = NistRegression(db)

        cid2nH = {}
        for cid in nist_regression.nist.GetAllCids():
            if cid in fixed_cids:
                cid2nH[cid] = fixed_cids[cid][0]
            elif cid in alberty_cid2nH:
                cid2nH[cid] = alberty_cid2nH[cid]
            else:
                tmp = nist_regression.dissociation.GetMostAbundantPseudoisomer(
                    cid,
                    pH=default_pH,
                    I=default_I,
                    pMg=default_pMg,
                    T=default_T)
                if tmp is not None:
                    cid2nH[cid] = tmp[0]
                else:
                    logging.warning(
                        'The most abundant pseudoisomer of %s (C%05d) '
                        'cannot be resolved. Using nH = 0.' %
                        (kegg.cid2name(cid), cid))
                    cid2nH[cid] = 0

        #nist_regression.std_diff_threshold = 2.0 # the threshold over which to print an analysis of a reaction
        #nist_regression.nist.T_range = None#(273.15 + 24, 273.15 + 40)
        S, dG0, cids = nist_regression.ReverseTransform(cid2nH=cid2nH)

        # export the raw data matrices to text files

        C = np.array([[cid, cid2nH.get(cid, 0)] for cid in cids])
        np.savetxt(prefix + 'CID.txt', C, fmt='%d', delimiter=',')
        np.savetxt(prefix + 'S.txt', S, fmt='%g', delimiter=',')
        np.savetxt(prefix + 'dG0.txt', dG0, fmt='%.2f', delimiter=',')
    else:
        C = np.loadtxt(prefix + 'CID.txt', delimiter=',')
        cids = [int(cid) for cid in C[:, 0]]
        cid2nH = {}
        for i, cid in enumerate(cids):
            cid2nH[cid] = int(C[i, 1])
        S = np.loadtxt(prefix + 'S.txt', delimiter=',')
        dG0 = np.loadtxt(prefix + 'dG0.txt', delimiter=',')
        dG0 = np.reshape(dG0, (dG0.shape[0], 1))

    html_writer = HtmlWriter('../res/regression_fast.html')
    html_writer.write("<h1>Pseudoisomeric Reactant Contributions</h1>\n")
    html_writer.write("<p>The stoichiometric matrix (S):")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, S, cids)
    html_writer.div_end()
    html_writer.write('</p>')

    index2value = {}
    S_extended = S  # the stoichiometric matrix, extended with elementary basis vectors for the fixed compounds
    for cid in fixed_cids.keys():
        i = cids.index(cid)
        e_i = np.zeros((1, len(cids)))
        e_i[0, i] = 1.0
        S_extended = np.vstack([S_extended, e_i])
        nH, dG0_fixed = fixed_cids[cid]
        index2value[i] = dG0_fixed

    x, _K = LinearRegression.LeastSquaresWithFixedPoints(S, dG0, index2value)
    cid2dG0 = {}
    for i, cid in enumerate(cids):
        cid2dG0[cid] = x[i]

    # Calculate the Kernel of the reduced stoichiometric matrix (after removing
    # the columns of the fixed compounds).
    cids_red = [cid for cid in cids if cid not in fixed_cids]
    index_red = [i for i in xrange(len(cids)) if i not in index2value]
    S_red = S[:, index_red]
    K_red = LinearRegression.Kernel(S_red)

    #print "Reduced Stoichiometric Matrix:"
    #print matrix2string(S_red, cids_red, kegg)
    #print '-'*80

    # Find all CIDs that are completely determined and do not depend on any
    # free variable. In other words, the all-zero columns in K_red.
    dict_list = []

    determined_indices = np.where(
        np.sum(abs(K_red), 0) < 1e-10)[0]  # all zero-columns in K_red
    determined_cids = [cids_red[i] for i in determined_indices]
    plot_data = []
    for i, cid in enumerate(cids):
        d = {
            'CID': 'C%05d' % cid,
            'Compound': kegg.cid2name(cid),
            'nH': '%d' % cid2nH[cid],
            'dG0 (PRC)': '%.1f' % cid2dG0[cid]
        }
        if cid in alberty_cid2dG0:
            d['dG0 (Alberty)'] = '%.1f' % alberty_cid2dG0[cid]
            if cid not in fixed_cids:
                plot_data.append(
                    (alberty_cid2dG0[cid], cid2dG0[cid], kegg.cid2name(cid)))
        else:
            d['dG0 (Alberty)'] = ''

        if cid in fixed_cids:
            d['Depends on'] = 'anchored'
        elif cid in determined_cids:
            d['Depends on'] = 'fixed compounds'
        else:
            d['Depends on'] = 'kernel dimensions'

        dict_list.append(d)

    dict_list.sort(key=lambda (x): (x['Depends on'], x['CID']))
    html_writer.write(
        "<p>Formation energies determined by the linear constraints:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(dict_list,
                            headers=[
                                '#', 'Compound', 'CID', 'nH', 'dG0 (PRC)',
                                'dG0 (Alberty)', 'Depends on'
                            ])
    html_writer.write('</font>')
    html_writer.div_end()
    html_writer.write('</p>')

    # Plot a comparison between PRC and Alberty formation energies
    fig = plt.figure(figsize=(8, 8), dpi=80)
    plt.plot([x[0] for x in plot_data], [x[1] for x in plot_data],
             'b.',
             figure=fig)
    for x, y, name in plot_data:
        plt.text(x, y, name, fontsize=6)
    plt.xlabel('Alberty $\Delta_f G^\circ$')
    plt.ylabel('PRC $\Delta_f G^\circ$')
    html_writer.write("<p>Plot comparing PRC and Alberty results:")
    html_writer.insert_toggle(start_here=True)
    html_writer.embed_matplotlib_figure(fig)
    html_writer.div_end()
    html_writer.write("</p>")

    K_sparse = SparseKernel(S_red).Solve()
    html_writer.write(
        "<p>The sparse null-space of the reduced stoichiometric matrix:")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, K_sparse, cids_red)
    html_writer.div_end()
    html_writer.write("</p>")

    dict_list = []
    index2string_html = dict(
        (i, "V<sub>%02d</sub>" % i) for i in xrange(K_sparse.shape[0]))
    index2string = dict((i, "V%d" % i) for i in xrange(K_sparse.shape[0]))
    for i, cid in enumerate(cids_red):
        d = {}
        d['KEGG ID'] = '<a href="%s">C%05d</a>' % (kegg.cid2link(cid), cid)
        d['KEGG ID plain'] = 'C%05d' % cid
        d['Compound'] = kegg.cid2name(cid)
        d['nH'] = '%d' % cid2nH[cid]

        if cid in alberty_cid2dG0:
            d['dG0 (Alberty)'] = '%.1f' % alberty_cid2dG0[cid]
        else:
            d['dG0 (Alberty)'] = ''

        d['dG0 (PRC)'] = '%.1f' % cid2dG0[cid]
        d['dG0 (PRC) plain'] = '%.1f' % cid2dG0[cid]

        indic = np.where(abs(K_sparse[:, i]) > 1e-10, 1, 0).tolist()
        indic.reverse()
        d['order_key'] = indic
        if mlab.rms_flat(K_sparse[:, i]) > 1e-10:
            d['dG0 (PRC)'] += " + (" + vector2string(K_sparse[:, i],
                                                     index2string_html) + ")"
            d['dG0 (PRC) plain'] += " + (" + vector2string(
                K_sparse[:, i], index2string) + ")"
        dict_list.append(d)

    dict_list.sort(key=lambda (d): (d['order_key'], d['KEGG ID plain']))

    # Export the results to CSV
    csv_writer = csv.writer(open('../res/prc_results.csv', 'w'))
    csv_writer.writerow(
        ['KEGG ID', 'Compound', 'nH', 'dG0 (PRC)', 'dG0 (Alberty)'])
    for d in dict_list:
        csv_writer.writerow([
            d['KEGG ID plain'], d['Compound'], d['nH'], d['dG0 (PRC) plain'],
            d['dG0 (Alberty)']
        ])

    html_writer.write(
        "<p>All formation energies as a function of the free variables:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(dict_list,
                            headers=[
                                '#', 'KEGG ID', 'Compound', 'nH', 'dG0 (PRC)',
                                'dG0 (Alberty)'
                            ])
    html_writer.write('</font>')
    html_writer.div_end()
    html_writer.write('</p>')

    fp = open('../res/prc_latex.txt', 'w')
    fp.write(
        latex.table2LaTeX(dict_list,
                          headers=[
                              '#', 'KEGG ID plain', 'Compound', 'nH',
                              'dG0 (PRC) plain', 'dG0 (Alberty)'
                          ]))
    fp.close()