Exemple #1
0
    def Train(self):
        """Build the regression problem from the observations, solve it, save it.

        The stoichiometric data is taken from
        ``self.obs_collection.GetStoichiometry()``.  The anchored observations
        are fitted first: their fitted part is subtracted from the observed
        values and the stoichiometric matrix is left-multiplied by the
        accompanying ``P_L`` matrix.  Compounds without a group vector are
        dropped, together with every observation that still involves them.
        What remains is converted to group space, identical columns are
        merged, and the merged system is handed to
        ``LinearRegression.LeastSquares``.

        Side effects:
            Sets ``anchored_cids``, ``anchored_contributions``,
            ``anchored_P_L``, ``group_matrix``, ``obs_values``, ``obs_ids``,
            ``obs_types``, ``group_contributions`` and ``group_nullspace`` on
            ``self``, then calls ``self.SaveContributionsToDB()``.

        Returns:
            None.
        """
        logging.info("Calculating the linear regression data")
        cids, S, b, anchored = self.obs_collection.GetStoichiometry()

        anchored_cols = list(np.where(anchored == 1)[1].flat)
        if anchored_cols:
            # fit the anchored observations on their own
            # (presumably g is the fitted solution and P_C / P_L are the
            # complementary projection matrices -- TODO confirm against
            # LinearRegression.LeastSquaresProjection)
            g, P_C, P_L = LinearRegression.LeastSquaresProjection(
                S[:, anchored_cols], b[:, anchored_cols])
            self.anchored_contributions = g * P_C
            self.anchored_P_L = P_L
            self.anchored_P_L[abs(self.anchored_P_L) <= self.epsilon] = 0
        else:
            # No anchored observations: use the same neutral values that
            # _GetContributionData uses for this case (zero contributions,
            # identity P_L), so the two steps below leave b and S unchanged
            # instead of running the projection on empty matrices.
            n_cids = S.shape[0]
            self.anchored_contributions = np.matrix(np.zeros((1, n_cids)))
            self.anchored_P_L = np.matrix(np.eye(n_cids))
        self.anchored_cids = cids

        # now remove the anchored part from b and S and leave only the data
        # which will be used for calculating the group contributions
        # NOTE(review): this updates b in place -- if GetStoichiometry() hands
        # out its own internal matrix, that matrix is modified too; confirm
        # this is intended.
        b -= self.anchored_contributions * S
        S = self.anchored_P_L * S

        # set epsilon-small values to absolute 0
        S[np.where(abs(S) <= self.epsilon)] = 0

        # remove zero rows (compounds) from S
        used_cid_indices = set(np.nonzero(np.sum(abs(S), 1))[0].flat)
        for i_cid, cid in enumerate(cids):
            if self.cid2groupvec[cid] is None:
                used_cid_indices.discard(i_cid)
                # every observation that still involves this compound cannot
                # be expressed in group space, so its whole column is zeroed
                for i_obs in np.nonzero(S[i_cid, :])[1].flat:
                    logging.warning(
                        "%s is removed because C%05d has no group vector, "
                        "but is still part of the final stoichiometric matrix",
                        self.obs_collection.observations[i_obs].obs_id, cid)
                    S[:, i_obs] = 0

        used_cid_indices = sorted(used_cid_indices)
        S = S[used_cid_indices, :]

        n_groups = len(self.groups_data.GetGroupNames())  # number of groups
        G = np.matrix(np.zeros((len(used_cid_indices), n_groups)))
        for i, i_cid in enumerate(used_cid_indices):
            G[i, :] = self.cid2groupvec[cids[i_cid]].Flatten()

        GS = G.T * S

        # 'unique' the columns of GS. For each set of columns that is united,
        # the Y-value for the new column is the average of the corresponding
        # Y-values.
        unique_GS, col_mapping = LinearRegression.ColumnUnique(
            GS, remove_zero=True)
        unique_b = np.matrix(np.zeros((1, unique_GS.shape[1])))
        unique_obs_types = []
        unique_obs_ids = []
        # .items() (rather than .iteritems()) works on Python 2 and 3 alike
        for i, old_indices in sorted(col_mapping.items()):
            unique_b[0, i] = np.mean(b[0, old_indices])
            obs_list = [
                self.obs_collection.observations[j] for j in old_indices
            ]
            # take the type of the first one (not perfect...)
            unique_obs_types.append(obs_list[0].obs_type)
            unique_obs_ids.append(', '.join(obs.obs_id for obs in obs_list))

        self.group_matrix = unique_GS
        self.obs_values = unique_b
        self.obs_ids = unique_obs_ids
        self.obs_types = unique_obs_types

        logging.info("Performing linear regression")
        self.group_contributions, self.group_nullspace = \
            LinearRegression.LeastSquares(self.group_matrix, self.obs_values)

        logging.info("Storing the group contribution data in the database")
        self.SaveContributionsToDB()
    def _GetContributionData(self, obs_S, obs_cids, obs_b, obs_anchored):
        """Split the observations into anchor, reactant and group contributions.

        Three successive fits are made with
        ``LinearRegression.LeastSquaresProjection``: (1) on the anchored
        columns only, (2) on all columns after the anchored part has been
        removed, and (3) in group space, using only the columns that do not
        involve compounds lacking a group vector.

        Args:
            obs_S: matrix with one row per entry of ``obs_cids`` and one
                column per observation (checked by the asserts below).
            obs_cids: compound identifiers, one per row of ``obs_S``.
            obs_b: matrix with one column per observation; only its columns
                are checked, it is presumably a single row -- TODO confirm.
            obs_anchored: matrix with one column per observation; non-zero
                entries mark the anchored columns.

        Returns:
            A dict with the keys ``names``, ``contributions``,
            ``group_contributions``, ``column_spaces``, ``null_spaces``,
            ``projections``, ``total_contributions``, ``bad_conservations``,
            ``pgc_conservations``, ``pgc_groupvectors`` and
            ``conservations``.  The list-valued entries are ordered as in
            ``names``: anchors, reactants, groups.
        """
        assert obs_S.shape[0] == len(obs_cids)
        assert obs_S.shape[1] == obs_b.shape[1]
        assert obs_S.shape[1] == obs_anchored.shape[1]

        n_cids = obs_S.shape[0]

        # (1)
        # use the anchored reactions to directly estimate the part of est_S
        # which is in their column-span, and normalize that part out from all
        # matrices.
        anchored_cols = list(obs_anchored.nonzero()[1].flat)
        if anchored_cols:
            g_anch, P_C_anch, P_L_anch = LinearRegression.LeastSquaresProjection(
                obs_S[:, anchored_cols], obs_b[:, anchored_cols])
            # subtract the contribution of anchored reactions to obs_b.
            # Rebinding (instead of "-=") leaves the caller's array untouched,
            # just like obs_S on the next line, and does not depend on the
            # dtype of the array that was passed in.
            obs_b = obs_b - g_anch * P_C_anch * obs_S
            obs_S = P_L_anch * obs_S  # project obs_S on the residual space
        else:
            # no anchors: zero contribution, and the residual space is everything
            g_anch = np.matrix(np.zeros((1, n_cids)))
            P_C_anch = np.matrix(np.zeros((n_cids, n_cids)))
            P_L_anch = np.matrix(np.eye(n_cids))

        # (2)
        # calculate the reactant contributions from obs_S and obs_b, and use
        # that to estimate the part which is in the column-space of NIST.
        g_prc, P_C_prc, P_L_prc = LinearRegression.LeastSquaresProjection(
            obs_S, obs_b)

        # (3)
        # calculate the group contributions from obs_S and obs_b. Note that
        # some reactions involve compounds that don't have groupvectors, and
        # therefore are discarded from this step.
        G, has_groupvec = self._GenerateGroupMatrix(obs_cids)
        # "== False" is kept on purpose: has_groupvec is indexed like an array
        # and the comparison is meant to be element-wise
        bad_compounds = list(np.where(has_groupvec == False)[0].flat)
        # keep the columns whose entries for all bad compounds are negligible
        reactions_with_groupvec = [
            i_obs for i_obs in range(obs_S.shape[1])
            if np.all(abs(obs_S[bad_compounds, i_obs]) < self.epsilon)
        ]
        obs_GS = G.T * obs_S[:, reactions_with_groupvec]
        g_pgc, P_C_pgc, P_L_pgc = LinearRegression.LeastSquaresProjection(
            obs_GS, obs_b[:, reactions_with_groupvec])

        # what is left after both the anchor and the reactant steps;
        # used for the projections, P_L_bad and G_resid below
        P_L_resid = P_L_prc * P_L_anch

        # calculate the total contributions
        result_dict = {}
        result_dict['names'] = ['anchors', 'reactants', 'groups']
        result_dict['contributions'] = [g_anch, g_prc, g_pgc * P_C_pgc * G.T]
        result_dict['group_contributions'] = g_pgc
        result_dict['column_spaces'] = [P_C_anch, P_C_prc, P_C_pgc]
        result_dict['null_spaces'] = [P_L_anch, P_L_prc, P_L_pgc]
        result_dict['projections'] = [
            P_C_anch, P_C_prc * P_L_anch, P_L_resid
        ]

        total = np.matrix(np.zeros((1, len(obs_cids))))
        for contribution, projection in zip(result_dict['contributions'],
                                            result_dict['projections']):
            total += contribution * projection
        result_dict['total_contributions'] = total

        # conservation laws that check if we rely on compounds that have no
        # groupvector
        P_L_bad = P_L_resid[bad_compounds, :]

        # projection of reactions to the residual groupvector space
        G_resid = P_L_resid * G

        result_dict['bad_conservations'] = P_L_bad
        result_dict['pgc_conservations'] = P_L_pgc
        result_dict['pgc_groupvectors'] = G_resid
        result_dict['conservations'] = np.vstack(
            [P_L_bad, (G_resid * P_L_pgc).T])

        return result_dict