def test_basic(self):
    """Check khatri_rao against a hand-computed product and a shape-only case."""
    left = array([[1, 2], [3, 4]])
    right = array([[5, 6], [7, 8]])
    expected = array([[5, 12], [7, 16], [15, 24], [21, 32]])
    assert_array_equal(khatri_rao(left, right), expected)

    # The inputs are uninitialised, so only the (4, 2) result shape is checked.
    shape_only = khatri_rao(np.empty([2, 2]), np.empty([2, 2]))
    assert_array_equal(shape_only.shape, (4, 2))
def test_number_of_columns_equality(self):
    """khatri_rao must raise ValueError when the column counts differ."""
    three_columns = array([[1, 2, 3], [4, 5, 6]])
    two_columns = array([[1, 2], [3, 4]])
    with pytest.raises(ValueError):
        khatri_rao(three_columns, two_columns)
def ALS_solver(X, r, nmax=1000, err_tol=1e-4):
    """
    Fit a rank-``r`` three-factor model to ``X`` by alternating least squares.

    Parameters
    ----------
    X : tensor like B1
        Three-dimensional array; its shape is unpacked as ``(n3, n2, n1)``.
    r : tensor rank
    nmax : maximum number of iterations
        The default is 1000.
    err_tol : tolerance for relative residual error, optional
        The default is 1e-4.

    Returns
    -------
    A : column-normalised factor with r columns, solved from the mode-1 unfolding
    B : column-normalised factor with r columns, solved from the mode-2 unfolding
    C : factor with r columns, solved from the mode-3 unfolding (not normalised
        after the last update)
    X_hat : approximated tensor with same shape as X

    Notes
    -----
    The relative error is printed at every iteration.  If ``nmax <= 0`` the
    loop body never runs, so ``A`` and the reconstruction are never assigned
    and the function fails when it tries to use them.
    """
    n3, n2, _ = X.shape

    # Random start for the two factors needed by the first update (B drawn first).
    B = np.random.normal(0, 1, (n2, r))
    C = np.random.normal(0, 1, (n3, r))

    X1 = tensor2matrix(X, 1)
    X2 = tensor2matrix(X, 2)
    X3 = tensor2matrix(X, 3)
    X_norm = lin.norm(X1, 'fro')

    B = col_normalize(B)
    err = np.inf
    i = 0
    while i < nmax and err >= err_tol:
        C = col_normalize(C)

        # Each factor is the least-squares solution given the other two.
        A = col_normalize(lin.lstsq(lin.khatri_rao(C, B), X1.T)[0].T)
        B = col_normalize(lin.lstsq(lin.khatri_rao(C, A), X2.T)[0].T)
        C = lin.lstsq(lin.khatri_rao(B, A), X3.T)[0].T

        # Residual is measured on the mode-1 unfolding.
        X_hat1 = A.dot(lin.khatri_rao(C, B).T)
        err = lin.norm(X_hat1 - X1, 'fro') / X_norm
        i += 1
        print('Relative error at iteration ', i, ': ', err)

    X_hat = matrix2tensor(X_hat1, X.shape)
    print('Finished!')
    return A, B, C, X_hat
def compute_vec_tensor(U, V, W):
    """
    Return vectorized tensor from CP decomposition.

    For every column index ``i`` the Khatri-Rao product of the ``i``-th
    columns of ``U``, ``V`` and ``W`` (each kept as a one-column matrix) is
    formed; the per-column results are added up and singleton axes are
    squeezed away before returning.
    """
    n_components = U.shape[1]
    rank_one_terms = (
        lin.khatri_rao(
            lin.khatri_rao(U[:, i:i + 1], V[:, i:i + 1]),
            W[:, i:i + 1],
        )
        for i in range(n_components)
    )
    # Built-in sum starts from 0, matching the scalar seed of the accumulator.
    return np.squeeze(sum(rank_one_terms))
def perform_CMTF(tOrig=None, mOrig=None, r=10):
    """Perform CMTF decomposition.

    A tensor (``tOrig``) and a matrix (``mOrig``) are factored jointly with a
    shared first factor, using alternating least-squares updates.

    Parameters
    ----------
    tOrig : array-like, optional
        Tensor to decompose.  When ``None``, both ``tOrig`` and ``mOrig`` are
        taken from ``createCube()``.
    mOrig : array-like, optional
        Matrix coupled to the tensor along its first mode.
    r : int
        Number of components passed to ``initialize_cp``.

    Returns
    -------
    tFac, mFac : CPTensor
        Normalised and reoriented factorisations of the tensor and the matrix.
    R2X : float
        Last value returned by ``calcR2X``.
    """
    if tOrig is None:
        tOrig, mOrig = createCube()

    def init_factors(values):
        # Missing entries are replaced by the overall mean for initialisation only.
        filled = np.nan_to_num(values, nan=np.nanmean(values))
        return CPTensor(initialize_cp(filled, r, non_negative=True))

    tFac = init_factors(tOrig)
    mFac = init_factors(mOrig)

    # Pre-unfold: keep matrix rows without missing values, and tensor columns
    # (mode-0 unfolding) that contain no NaN.
    selPat = np.all(np.isfinite(mOrig), axis=1)
    unfolded = tl.unfold(tOrig, 0)
    missing = np.any(np.isnan(unfolded), axis=0)
    unfolded = unfolded[:, ~missing]

    def fit_matrix_factor():
        # Least-squares fit of the second matrix factor on the complete rows.
        solution = np.linalg.lstsq(
            mFac.factors[0][selPat, :], mOrig[selPat, :], rcond=None
        )[0]
        return solution.T

    R2X = -1.0
    mFac.factors[0] = tFac.factors[0]
    mFac.factors[1] = fit_matrix_factor()

    for ii in range(8000):
        # Solve for the subject matrix from the stacked tensor + matrix problem
        kr = khatri_rao(tFac.factors[1], tFac.factors[2])[~missing, :]
        stacked_design = np.vstack((kr, mFac.factors[1]))
        stacked_target = np.hstack((unfolded, mOrig))

        tFac.factors[0] = censored_lstsq(stacked_design, stacked_target.T)
        mFac.factors[0] = tFac.factors[0]

        # PARAFAC on other antigen modes
        for m in (1, 2):
            kr = khatri_rao(tFac.factors[0], tFac.factors[3 - m])
            tFac.factors[m] = censored_lstsq(kr, tl.unfold(tOrig, m).T)

        # Solve for the glycan matrix fit
        mFac.factors[1] = fit_matrix_factor()

        # The fit is only re-measured every 20th iteration.
        if ii % 20 == 0:
            R2X_last = R2X
            R2X = calcR2X(tOrig, mOrig, tFac, mFac)

        # Between measurements both values are unchanged, so this can only
        # trigger on an iteration where the fit was just re-measured.
        if R2X - R2X_last < 1e-6:
            break

    tFac.normalize()
    mFac.normalize()

    # Reorient the later tensor factors
    tFac.factors, mFac.factors = reorient_factors(tFac.factors, mFac.factors)

    return tFac, mFac, R2X
def eval(self, data, eval_env, encoding):
    """Evaluate the group specific term and build its design matrices.

    The grouping variable is looked up in ``data`` and, unless its dtype is
    already an ordered categorical, converted to one whose categories are the
    sorted unique values.  ``self.expr`` is evaluated with the same arguments
    and combined with the dummy matrix of the factor.

    Returns a dict with keys ``"type"``, ``"Xi"``, ``"Ji"``, ``"Zi"`` (sparse
    COO matrix) and ``"groups"``; categoric expressions additionally carry
    ``"reference"`` and, when available, ``"levels"`` and ``"encoding"``.

    Raises ``ValueError`` when ``self.factor`` is not a ``Term``.
    """
    # TODO: factor can't be a call or interaction yet.
    if not isinstance(self.factor, Term):
        raise ValueError(
            "Factor on right hand side of group specific term must be a single term."
        )

    factor = data[self.factor.variable]
    if not getattr(factor.dtype, "ordered", False):
        sorted_values = sorted(factor.unique().tolist())
        ordered_dtype = pd.api.types.CategoricalDtype(
            categories=sorted_values, ordered=True
        )
        factor = factor.astype(ordered_dtype)

    # Notation as in lme4 paper
    # note we don't use `drop_first=True`.
    Ji = pd.get_dummies(factor).to_numpy()
    Xi = self.expr.eval(data, eval_env, encoding)
    Zi = linalg.khatri_rao(Ji.T, Xi["value"].T).T

    out = {
        "type": Xi["type"],
        "Xi": Xi["value"],
        "Ji": Ji,
        "Zi": sparse.coo_matrix(Zi),
        "groups": factor.cat.categories.tolist(),
    }

    if Xi["type"] == "categoric":
        if "levels" in Xi:
            copied = ("levels", "reference", "encoding")
        else:
            copied = ("reference",)
        for key in copied:
            out[key] = Xi[key]
    return out
def eval_new_data(self, data):
    """Evaluates the term with new data.

    Calls ``.eval_new_data()`` on ``self.expr`` and on ``self.factor``,
    promotes any one-dimensional result to a single-column matrix, and
    combines both into the design matrix of the group specific effect.

    Parameters
    ----------
    data: pd.DataFrame
        The data frame where variables are taken from.

    Returns
    -------
    Zi: np.ndarray
    """

    def as_column_matrix(values):
        return values[:, np.newaxis] if values.ndim == 1 else values

    expr_values = self.expr.eval_new_data(data)
    factor_values = self.factor.eval_new_data(data)
    Xi = as_column_matrix(expr_values)
    Ji = as_column_matrix(factor_values)
    return linalg.khatri_rao(Ji.T, Xi.T).T
def test_equality_of_two_equations(self):
    """khatri_rao must equal the column-by-column Kronecker product."""
    a = array([[1, 2], [3, 4]])
    b = array([[5, 6], [7, 8]])

    from_khatri_rao = khatri_rao(a, b)
    # Reference construction: one Kronecker product per shared column index.
    per_column = [np.kron(a[:, k], b[:, k]) for k in range(b.shape[1])]
    from_kron = np.column_stack(per_column)

    assert_array_equal(from_khatri_rao, from_kron)
def findGraphLaplacian(self):
    """Estimate a weight matrix from the eigenvectors of the diffusion process.

    For each of the ``self.p`` eigenvector matrices of the diffusion process a
    ridge-regularised least-squares problem (weight ``self.beta_2``) is solved
    for the entries indexed by ``Dcomp``; the solution is reshaped into an
    ``L x L`` Laplacian and turned into ``I - Laplacian``.  The returned
    matrix is the mean of those per-matrix estimates.
    """
    # dimensions and hyperparams
    L = self.L
    p = self.p
    beta_2 = self.beta_2

    # find the diffusion process and its eigendecomposition
    H = self.findDiffusionProcess().squeeze()
    _, Vp = np.linalg.eig(H)

    # Start from an empty (0, L, L) stack so the result dtype/shape match
    # even when no estimate is produced.
    estimates = [np.empty((0, L, L))]
    for i in range(p):
        # eigenvectors of H_p, with the first column dropped
        V = Vp[i][:, 1:]
        U = khatri_rao(V, V)

        # get set D
        D, Dcomp = self.getSetDandDcomp()

        # compute A (kept in its transposed form Q) and b
        Q_full = np.identity(L**2) - U @ np.linalg.pinv(U)
        Q = Q_full[Dcomp]
        A = Q.T
        b = -(Q_full[D].T @ np.ones((L, 1)))

        # ridge solution: (A^T A + beta_2 I)^-1 A^T b
        gram = A.T @ A + beta_2 * np.identity(A.shape[1])
        l_Dcomp = np.linalg.inv(gram) @ (A.T @ b)

        # obtain W from L
        flat = np.zeros((L * L, ))
        flat[Dcomp] = l_Dcomp.squeeze()
        Laplacian = flat.reshape(L, L).T
        np.fill_diagonal(Laplacian, 1)
        Laplacian[np.abs(Laplacian) < 1e-5] = 0

        estimates.append((np.identity(L) - Laplacian).reshape(1, L, L))

    return np.mean(np.vstack(estimates), axis=0)
def _evaluate_new_data(self, data):
    """Build a new instance whose design matrix is computed from ``data``.

    Every term is re-evaluated with ``term.eval_new_data``; the metadata of
    each term is deep-copied from ``self.terms_info`` and only the ``"idxs"``
    slices are recomputed.  Raises ``ValueError`` if this matrix has not been
    evaluated yet.
    """
    if not self.evaluated:
        raise ValueError("Can't evaluate new data on unevaluated matrix.")

    new_instance = self.__class__(self.terms)
    blocks = []
    next_row = next_col = 0

    def register(term_name, block):
        """Store one diagonal block and the slices it occupies."""
        nonlocal next_row, next_col
        n_rows, n_cols = block.shape
        blocks.append(block)
        # All the info, except from the indexes, is copied.
        new_instance.terms_info[term_name] = deepcopy(self.terms_info[term_name])
        new_instance.terms_info[term_name]["idxs"] = (
            slice(next_row, next_row + n_rows),
            slice(next_col, next_col + n_cols),
        )
        next_row += n_rows
        next_col += n_cols

    for term in self.terms:
        d = term.eval_new_data(data)
        if d["type"] != "categoric":
            register(term.to_string(), d["Zi"])
            continue

        # Reduced encoding skips the first level.
        levels = d["levels"] if d["encoding"] == "full" else d["levels"][1:]
        Ji = d["Ji"]
        for idx, level in enumerate(levels):
            level_row = np.atleast_2d(d["Xi"][:, idx])
            Zi = linalg.khatri_rao(Ji.T, level_row).T
            register(term.to_string(level), Zi)

    new_instance.data = data
    new_instance.eval_env = self.eval_env

    # Stored in Compressed Sparse Column format
    if blocks:
        new_instance.design_matrix = sp.sparse.block_diag(blocks).tocsc()
    return new_instance
def recovery_matrix_RKKP_Strassen(C_tilda, saved_nodes, P, Q, l, p):
    """Recover the stacked result blocks from the outputs of the saved nodes.

    The rows of the transposed Khatri-Rao product of ``P.T`` and ``Q.T``
    selected by ``saved_nodes`` are pseudo-inverted and applied to
    ``C_tilda`` (reshaped to one row per saved node).  Every ``(m + 1)``-th
    row of the result is kept, where ``m`` is the number of columns of ``P``;
    each kept row is cut into ``l`` equal pieces and all pieces are stacked
    vertically.  ``p`` is accepted but not used.
    """
    _, m = P.shape
    encoding = khatri_rao(P.T, Q.T).T
    decoder = np.linalg.pinv(encoding[saved_nodes])
    per_node = C_tilda.reshape(len(saved_nodes), -1)

    kept_rows = (decoder @ per_node)[::m + 1]
    return np.vstack([np.split(row, l) for row in kept_rows])
def eval(self, data):
    """Evaluate the group specific term on ``data``.

    Returns a dict with the type of the ``expr`` side and the sparse (COO)
    design matrix ``"Zi"``; categoric expressions also carry ``"levels"`` and
    ``"reference"``.  Raises ``ValueError`` when ``self.factor`` is not a
    ``Term``.
    """
    if not isinstance(self.factor, Term):
        raise ValueError("Factor on right hand side of group specific term can only be a term.")
    factor = data[self.factor.variable]

    # Notation as in lme4 paper
    # note we don't use `drop_first=True`.
    Ji = pd.get_dummies(factor).to_numpy()
    Xi = self.expr.eval(data)
    Zi = linalg.khatri_rao(Ji.T, Xi["value"].T).T

    out = {"type": Xi["type"], "Zi": sparse.coo_matrix(Zi)}
    if Xi["type"] == "categoric":
        for key in ("levels", "reference"):
            out[key] = Xi[key]
    return out
def recovery_matrix_RKKP(C_tilda, saved_nodes, P, Q, l, p):
    """Recover the full block matrix from the outputs of the saved nodes.

    The rows of the transposed Khatri-Rao product of ``P.T`` and ``Q.T``
    selected by ``saved_nodes`` are pseudo-inverted and applied to
    ``C_tilda`` (reshaped to one row per saved node).  Each resulting row is
    reshaped into an ``l x p`` block; the ``m * n`` blocks are then laid out
    as ``m`` block rows, where ``m`` and ``n`` are the column counts of ``P``
    and ``Q``.
    """
    _, m = P.shape
    n = Q.shape[1]

    encoding = khatri_rao(P.T, Q.T).T
    decoded = np.linalg.pinv(encoding[saved_nodes]) @ C_tilda.reshape(len(saved_nodes), -1)

    # One (l, p) block per decoded row, stacked on top of each other.
    stacked_blocks = np.vstack([row.reshape(l, p) for row in decoded])
    # Put all blocks side by side, then fold them into m block rows.
    side_by_side = np.hstack(np.split(stacked_blocks, m * n))
    return np.vstack(np.split(side_by_side, m, 1))
def set_data(self, spans_intercept):
    """Set the data of both sides of the term and build its design matrix.

    Stores the group names in ``self.groups``, the design matrix of the group
    specific effect in ``self.data`` and the kind of ``expr`` in ``self.kind``.
    """
    self.expr.set_data(spans_intercept)
    # Factor is a categorical term that always spans the intercept
    self.factor.set_data(True)

    # Obtain group names. These are obtained from the labels of the contrast matrices
    label_sets = [component.contrast_matrix.labels for component in self.factor.components]
    self.groups = [":".join(combo) for combo in itertools.product(*label_sets)]

    def as_column_matrix(values):
        return values[:, np.newaxis] if values.ndim == 1 else values

    expr_values, factor_values = self.expr.data, self.factor.data
    Xi = as_column_matrix(expr_values)
    Ji = as_column_matrix(factor_values)
    self.data = linalg.khatri_rao(Ji.T, Xi.T).T  # Zi
    self.kind = self.expr.kind
def eval_new_data(self, data):
    """Evaluates the term with new data.

    Returns a dict with ``"type"``, ``"Xi"``, ``"Ji"``, ``"Zi"`` (sparse COO
    matrix) and ``"groups"``, plus the metadata entries that apply to
    categoric or interaction expressions.
    """
    # factor uses the same data type that is used in first evaluation.
    factor = data[self.factor.name].astype(self.factor_type)
    Xi = self.expr.eval_new_data(data)
    Ji = pd.get_dummies(factor).to_numpy()
    Zi = linalg.khatri_rao(Ji.T, Xi.T).T

    metadata = self.expr.metadata
    out = {
        "type": metadata["type"],
        "Xi": Xi,
        "Ji": Ji,
        "Zi": sparse.coo_matrix(Zi),
        "groups": factor.cat.categories.tolist(),
    }

    # Extra metadata entries copied over, by kind of expression.
    extra_keys = {
        "categoric": ("levels", "reference", "encoding"),
        "interaction": ("terms",),
    }.get(self.expr._type, ())  # pylint: disable = protected-access
    for key in extra_keys:
        out[key] = metadata[key]
    return out
def eval_new_data(self, data):
    """Evaluates the term with new data.

    Converts the variable in ``factor`` to the type remembered from the first
    evaluation and produces the design matrix for this grouping, calls
    ``.eval_new_data()`` on ``self.expr`` to obtain the design matrix for the
    ``expr`` side, then computes the design matrix corresponding to the group
    specific effect.

    Parameters
    ----------
    data: pd.DataFrame
        The data frame where variables are taken from.

    Returns
    -------
    out: dict
        Same rules as in :meth:`eval <GroupSpecificTerm.eval>`.
    """
    expr = self.expr

    # factor uses the same data type that is used in first evaluation.
    factor = data[self.factor.name].astype(self.factor_type)
    Xi = expr.eval_new_data(data)
    Ji = pd.get_dummies(factor).to_numpy()
    Zi = linalg.khatri_rao(Ji.T, Xi.T).T

    out = dict(
        type=expr.metadata["type"],
        Xi=Xi,
        Ji=Ji,
        Zi=sparse.coo_matrix(Zi),
        groups=factor.cat.categories.tolist(),
    )

    expr_kind = expr._type  # pylint: disable = protected-access
    if expr_kind == "categoric":
        copied = ("levels", "reference", "encoding")
    elif expr_kind == "interaction":
        copied = ("terms",)
    else:
        copied = ()
    out.update((key, expr.metadata[key]) for key in copied)
    return out
def eval(self, data, eval_env, encoding):
    """Evaluate the term and remember the categorical type of the factor.

    The factor must consist of a single ``Variable`` component, otherwise a
    ``ValueError`` is raised.  Its (ordered) categorical dtype is stored in
    ``self.factor_type``.  Returns a dict with ``"type"``, ``"Xi"``,
    ``"Ji"``, ``"Zi"`` (sparse COO matrix) and ``"groups"``, plus the
    metadata entries that apply to categoric or interaction expressions.
    """
    # Note: factor can't be a call or interaction yet.
    components = self.factor.components
    if len(components) != 1 or not isinstance(components[0], Variable):
        raise ValueError(
            "Factor on right hand side of group specific term must be a single term."
        )

    factor = data[self.factor.name]
    if getattr(factor.dtype, "ordered", False):
        type_ = factor.dtype
    else:
        # Build an ordered categorical from the sorted unique values.
        type_ = pd.api.types.CategoricalDtype(
            categories=sorted(factor.unique().tolist()), ordered=True
        )
        factor = factor.astype(type_)
    self.factor_type = type_

    # Notation as in lme4 paper
    # Note we don't use `drop_first=True` for factor.
    self.expr.set_type(data, eval_env)
    self.expr.set_data(encoding)
    Xi = self.expr.data
    Ji = pd.get_dummies(factor).to_numpy()
    Zi = linalg.khatri_rao(Ji.T, Xi.T).T

    metadata = self.expr.metadata
    out = {
        "type": metadata["type"],
        "Xi": Xi,
        "Ji": Ji,
        "Zi": sparse.coo_matrix(Zi),
        "groups": factor.cat.categories.tolist(),
    }

    extra_keys = {
        "categoric": ("levels", "reference", "encoding"),
        "interaction": ("terms",),
    }.get(self.expr._type, ())  # pylint: disable = protected-access
    for key in extra_keys:
        out[key] = metadata[key]
    return out
def test_to_assure_2d_array(self):
    """khatri_rao must reject any 1-D operand with ValueError."""
    invalid_pairs = [
        # both arrays are 1-D
        (array([1, 2, 3]), array([4, 5, 6])),
        # first array is 1-D
        (array([1, 2, 3]), array([[1, 2, 3], [4, 5, 6]])),
        # second array is 1-D
        (array([[1, 2, 3], [7, 8, 9]]), array([4, 5, 6])),
    ]
    for a, b in invalid_pairs:
        with pytest.raises(ValueError):
            khatri_rao(a, b)
import numpy as np
from scipy.linalg import khatri_rao
import tensorly as ts

# Sketch of a CP decomposition by alternating least squares (ALS).
# TODO: implement in a function.

# start off with an example
chi = (np.arange(24) + 1).reshape(2, 3, 4)
r_guess = 3
n_modes = np.ndim(chi)

# One random factor matrix per mode, each with r_guess columns.
a_guesses = np.array(
    [np.random.random((np.size(chi, n), r_guess)) for n in np.arange(n_modes)],
    dtype=object,
)

# loop until happy with the result
# TODO: set up a while loop here with an error condition; for now a fixed
# number of sweeps (one per mode) is run.
for sweep in np.arange(n_modes):
    for n in np.arange(n_modes):
        # Only the factors of the *other* modes enter the update of mode n.
        others = [a_guesses[k] for k in range(n_modes) if k != n]

        # Hadamard product of the Gram matrices of the other factors.  The
        # accumulator is rebuilt for every mode and seeded with ones: an
        # identity seed would zero every off-diagonal entry.
        v_total = np.ones((r_guess, r_guess))
        for factor in others:
            v_total = np.multiply(v_total, np.matmul(factor.T, factor))

        # Khatri-Rao product of the other factors in increasing mode order,
        # which is the column ordering produced by ts.unfold (row-major).
        a_khatri_prod = others[0]
        for factor in others[1:]:
            a_khatri_prod = khatri_rao(a_khatri_prod, factor)

        # Least-squares update: X_(n) . KR . pinv(V)
        a_guesses[n] = np.matmul(
            np.matmul(ts.unfold(chi, n), a_khatri_prod),
            np.linalg.pinv(v_total),
        )

# TODO: figure out a way to normalize a_guesses
def _evaluate_new_data(self, data):
    """Evaluates group specific terms with new data and return a new instance of
    ``GroupEffectsMatrix``.

    This method is intended to be used to obtain design matrices for new data and obtain
    out of sample predictions. Each term is re-evaluated through its own
    ``eval_new_data`` method; the per-term metadata is deep-copied from
    ``self.terms_info`` and only the ``"idxs"`` slices are recomputed.

    Parameters
    ----------
    data: pandas.DataFrame
        The data frame where variables are taken from

    Returns
    -------
    new_instance: GroupEffectsMatrix
        A new instance of ``GroupEffectsMatrix`` whose design matrix is obtained with the
        values in the new data set.

    Raises
    ------
    ValueError
        If this matrix has not been evaluated yet.
    """
    if not self.evaluated:
        raise ValueError("Can't evaluate new data on unevaluated matrix.")

    def iter_named_blocks():
        """Yield ``(term_name, Zi)`` for every diagonal block, in term order."""
        for term in self.terms:
            d = term.eval_new_data(data)
            if d["type"] == "categoric":
                # Reduced encoding skips the first level.
                levels = d["levels"] if d["encoding"] == "full" else d["levels"][1:]
                Ji = d["Ji"]
                for idx, level in enumerate(levels):
                    level_row = np.atleast_2d(d["Xi"][:, idx])
                    yield term.to_string(level), linalg.khatri_rao(Ji.T, level_row).T
            else:
                yield term.to_string(), d["Zi"]

    new_instance = self.__class__(self.terms)
    Z = []
    row_offset = col_offset = 0
    for term_name, Zi in iter_named_blocks():
        n_rows, n_cols = Zi.shape
        Z.append(Zi)
        # All the info, except from the indexes, is copied.
        new_instance.terms_info[term_name] = deepcopy(self.terms_info[term_name])
        new_instance.terms_info[term_name]["idxs"] = (
            slice(row_offset, row_offset + n_rows),
            slice(col_offset, col_offset + n_cols),
        )
        row_offset += n_rows
        col_offset += n_cols

    new_instance.data = data
    new_instance.eval_env = self.eval_env

    # Stored in Compressed Sparse Column format
    if Z:
        new_instance.design_matrix = sp.sparse.block_diag(Z).tocsc()
    return new_instance
def findGraph(self):
    """Estimate an adjacency matrix from the eigenvectors of the diffusion process.

    Steps, as performed below:

    1. Average the ``self.p`` eigenvector matrices of the diffusion process,
       flipping the sign of any matrix whose ``[0, 0]`` entry disagrees in
       sign with that of the first one.
    2. Form the Khatri-Rao product of the averaged matrix with itself and
       zero its two last singular values.
    3. Build the constraint system from the rows indexed by ``D`` / ``Dcomp``
       and solve it with ``l1regls``.
    4. Map the solution back to a matrix with ``self.unvectorize`` and zero
       entries smaller than ``1e-5`` in magnitude.

    A feasibility message based on the rank of the ``D`` rows is printed.

    Returns
    -------
    W : the thresholded matrix produced by ``self.unvectorize``.
    """
    # dimensions
    L = self.L

    # find the diffusion process
    H = self.findDiffusionProcess().squeeze()

    # get the eigendecomposition of the diffusion process (eigenvalues unused)
    _, V = np.linalg.eig(H)

    # Sign-aligned average of the eigenvector matrices.
    V_hat = np.zeros((L, L))
    for i in range(self.p):
        if np.sign(V[0, 0, 0]) == np.sign(V[i, 0, 0]):
            V_hat += V[i]
        else:
            V_hat -= V[i]
    V_hat /= self.p
    V = V_hat

    # get U
    U = khatri_rao(V, V)

    # Drop the two last singular values.  With the full SVD of the
    # (L*L, L) matrix U, Uh is (L*L, L*L) and Vh is (L, L), so the singular
    # values must be embedded in an (L*L, L) matrix.  The padding used to be
    # hard-coded as (56, 8), which only fits L == 8.
    Uh, S, Vh = np.linalg.svd(U)
    S[-1] = 0
    S[-2] = 0
    S_mod = np.concatenate((np.diag(S), np.zeros((L * L - L, L))))
    U = np.matmul(np.matmul(Uh, S_mod), Vh)

    # get set D
    D, Dcomp = self.getSetDandDcomp()

    # test the feasibility of the problem
    rank_U_D = np.linalg.matrix_rank(U[D])
    if rank_U_D <= L - 1:
        print("Rank of W_D : {}, Problem is Feasible".format(rank_U_D))
    else:
        print("Rank of W_D : {}, Problem is infeasible".format(rank_U_D))

    # compute R: projector onto the orthogonal complement of range(U),
    # restricted to the Dcomp rows, plus one extra column whose first
    # L - 1 entries are 1.
    M_full = np.identity(L**2) - np.matmul(U, np.linalg.pinv(U))
    M = M_full[Dcomp]
    ll = np.zeros((len(Dcomp), 1))
    ll[np.arange(0, self.L - 1, 1)] = 1
    R = np.hstack((M, ll))

    # get b: all zeros except a 1 in the row matching the extra column of R
    b = np.zeros((L**2 + 1, 1))
    b[-1] = 1

    # solve basis pursuit with noisy observations
    beta = 0.001
    A = R.T
    w_Dcomp = np.array(
        l1regls(matrix(A / np.sqrt(beta)), matrix(b / np.sqrt(beta))))

    # get adjacency matrix
    W = self.unvectorize(w_Dcomp.squeeze())
    W[abs(W) < 1e-5] = 0

    return W
def evaluate(self):
    """Evaluates `self.terms` inside the data mask provided by `data` and
    updates `self.design_matrix`.

    ``self.terms_info`` is rebuilt with one entry per block of the design
    matrix; each entry records, among other things, the row and column
    slices (``"idxs"``) the block occupies.
    """
    blocks = []
    next_row = next_col = 0
    self.terms_info = {}

    def claim_slices(block):
        """Store ``block`` and return the (row, column) slices it occupies."""
        nonlocal next_row, next_col
        n_rows, n_cols = block.shape[0], block.shape[1]
        blocks.append(block)
        idxs = (
            slice(next_row, next_row + n_rows),
            slice(next_col, next_col + n_cols),
        )
        next_row += n_rows
        next_col += n_cols
        return idxs

    for term in self.terms:
        # Full encoding, unless another term with the same factor already
        # contributes the intercept.
        encoding = isinstance(term.expr, InterceptTerm) or not any(
            other.factor == term.factor and isinstance(other.expr, InterceptTerm)
            for other in self.terms
        )
        d = term.eval(self.data, self.eval_env, encoding)

        if d["type"] == "categoric":
            levels = d["levels"] if d["encoding"] == "full" else d["levels"][1:]
            for idx, level in enumerate(levels):
                Xi = np.atleast_2d(d["Xi"][:, idx]).T
                Ji = d["Ji"]
                idxs = claim_slices(linalg.khatri_rao(Ji.T, Xi.T).T)
                term_name = term.to_string(level)
                self.terms_info[term_name] = {
                    "type": "categoric",
                    "Xi": Xi,
                    "Ji": Ji,
                    "groups": d["groups"],
                    "encoding": d["encoding"],
                    "levels": d["levels"],
                    "reference": d["reference"],
                    "full_names": [f"{term_name}[{group}]" for group in d["groups"]],
                    "idxs": idxs,
                }
        else:
            idxs = claim_slices(d["Zi"])
            term_name = term.to_string()
            info = {key: value for key, value in d.items() if key != "Zi"}
            info["idxs"] = idxs
            # Register the entry before asking for its full names.
            self.terms_info[term_name] = info
            info["full_names"] = self.get_term_full_names(term_name)

    # Stored in Compressed Sparse Column format
    if blocks:
        self.design_matrix = sp.sparse.block_diag(blocks).tocsc()
    else:
        self.design_matrix = np.zeros((0, 0))
def eval(self, data, eval_env, encoding):
    """Evaluates term.

    First, it evaluates the variable in ``self.factor``, creates an ordered categorical data
    type using its levels, and stores it in ``self.factor_type``. Then, it obtains the
    design matrix for ``self.expr`` to finally produce the matrix for the group specific
    effect.

    The output contains the following information

    * ``"type"``: The type of the ``expr`` term.
    * ``"Xi"``: The design matrix for the ``expr`` term.
    * ``"Ji"``: The design matrix for the ``factor`` term.
    * ``"Zi"``: The design matrix for the group specific term.
    * ``"groups"``: The groups present in ``factor``.

    If ``"type"`` is ``"categoric"``, the output dictionary also contains

    * ``"levels"``: Levels of the term in ``expr``.
    * ``"reference"``: The level taken as baseline.
    * ``"encoding"``: The encoding of the term, either ``"full"`` or ``"reduced"``

    If ``"type"`` is ``"interaction"``, the output dictionary also contains

    * ``"terms"``: Metadata for each of the components in the interaction in ``expr``.

    Parameters
    ----------
    data: pandas.DataFrame
        The data frame where variables are taken from.
    eval_env: EvalEnvironment
        The environment where values and functions are taken from.
    encoding: bool
        Whether to use full or reduced rank encoding when ``expr`` is categoric.

    Returns
    -------
    out: dict
        See above.

    Raises
    ------
    ValueError
        If ``factor`` is not made of exactly one ``Variable`` component.
    """
    # Evaluate factor and save type to self.factor_type.
    # Note: factor can't be a call or interaction yet.
    single_variable = len(self.factor.components) == 1 and isinstance(
        self.factor.components[0], Variable
    )
    if not single_variable:
        raise ValueError(
            "Factor on right hand side of group specific term must be a single term."
        )

    factor = data[self.factor.name]
    already_ordered = hasattr(factor.dtype, "ordered") and factor.dtype.ordered
    if already_ordered:
        type_ = factor.dtype
    else:
        categories = sorted(factor.unique().tolist())
        type_ = pd.api.types.CategoricalDtype(categories=categories, ordered=True)
        factor = factor.astype(type_)
    self.factor_type = type_

    # Note we don't use drop_first=True for the factor.
    expr = self.expr
    expr.set_type(data, eval_env)
    expr.set_data(encoding)
    Xi = expr.data
    Ji = pd.get_dummies(factor).to_numpy()
    Zi = linalg.khatri_rao(Ji.T, Xi.T).T

    out = dict(
        type=expr.metadata["type"],
        Xi=Xi,
        Ji=Ji,
        Zi=sparse.coo_matrix(Zi),
        groups=factor.cat.categories.tolist(),
    )

    expr_kind = expr._type  # pylint: disable = protected-access
    if expr_kind == "categoric":
        copied = ("levels", "reference", "encoding")
    elif expr_kind == "interaction":
        copied = ("terms",)
    else:
        copied = ()
    out.update((key, expr.metadata[key]) for key in copied)
    return out
def _evaluate(self, data, eval_env):
    """Evaluate group specific terms.

    This evaluates ``self.terms`` inside the data mask provided by ``data`` and the
    environment ``eval_env``. It updates ``self.design_matrix`` with the result from the
    evaluation of each term.

    This method also sets the values of ``self.data`` and ``self.eval_env``. It also
    populates the dictionary ``self.terms_info`` with information related to each term,
    such as the type, the columns and rows they occupy in the design matrix and the names
    of the columns. ``self.evaluated`` is set to ``True`` at the end.

    Parameters
    ----------
    data: pandas.DataFrame
        The data frame where variables are taken from
    eval_env: EvalEnvironment
        The environment where values and functions are taken from.
    """
    self.data = data
    self.eval_env = eval_env
    self.terms_info = {}

    Z = []
    row_offset = col_offset = 0

    def claim_slices(block):
        """Store ``block`` and return the (row, column) slices it occupies."""
        nonlocal row_offset, col_offset
        n_rows, n_cols = block.shape
        Z.append(block)
        idxs = (
            slice(row_offset, row_offset + n_rows),
            slice(col_offset, col_offset + n_cols),
        )
        row_offset += n_rows
        col_offset += n_cols
        return idxs

    def shares_factor_with_intercept(term):
        return any(
            other.factor == term.factor and isinstance(other.expr, Intercept)
            for other in self.terms
        )

    for term in self.terms:
        # If both (1|g) and (x|g) are in the model, then the encoding for x is False.
        encoding = isinstance(term.expr, Intercept) or not shares_factor_with_intercept(term)
        d = term.eval(self.data, self.eval_env, encoding)

        if d["type"] != "categoric":
            idxs = claim_slices(d["Zi"])
            term_name = term.to_string()
            info = {key: value for key, value in d.items() if key != "Zi"}
            info["idxs"] = idxs
            # Register the entry before asking for its full names.
            self.terms_info[term_name] = info
            info["full_names"] = self._term_full_names(term_name)
            continue

        # Reduced encoding skips the first level.
        levels = d["levels"] if d["encoding"] == "full" else d["levels"][1:]
        for idx, level in enumerate(levels):
            Xi = np.atleast_2d(d["Xi"][:, idx]).T
            Ji = d["Ji"]
            idxs = claim_slices(linalg.khatri_rao(Ji.T, Xi.T).T)
            term_name = term.to_string(level)
            self.terms_info[term_name] = {
                "type": "categoric",
                "Xi": Xi,
                "Ji": Ji,
                "groups": d["groups"],
                "encoding": d["encoding"],
                "levels": d["levels"],
                "reference": d["reference"],
                "full_names": [f"{term_name}[{group}]" for group in d["groups"]],
                "idxs": idxs,
            }

    # Stored in Compressed Sparse Column format
    if Z:
        self.design_matrix = sp.sparse.block_diag(Z).tocsc()
    self.evaluated = True