def testing(self, x):
    """
    Estimate the outputs corresponding to new input points
    :param x: new input data (nb_data, d1, ..., dM)
    :return: corresponding output predictions (nb_data, dim_y)
    """
    nb_data = x.shape[0]

    # Compute gate probabilities
    inProdEst = np.zeros((nb_data, self.nb_class))
    for n in range(0, nb_data):
        for c in range(0, self.nb_class):
            vTmp = khatriRaoProd(self.V[1][c], self.V[0][c])
            for i in range(2, self.nb_dim_x):
                vTmp = khatriRaoProd(self.V[i][c], vTmp)
            vVec = np.dot(vTmp, np.ones((self.rank_g, 1)))
            inProdEst[n, c] = self.beta[c] + np.dot(vVec[:, None].T, x[n].flatten())
    priors = softmax(inProdEst).T

    ytmp = []
    for c in range(0, self.nb_class):
        # Compute experts predictions
        alpha = self.alpha[c][:]

        # Compute vec(W) for each output dimension
        wVec = []
        for j in range(self.nb_dim):
            wmsTmp = self.W[c][j]
            wTmp = khatriRaoProd(wmsTmp[1], wmsTmp[0])
            for k in range(2, self.nb_dim_x):
                wTmp = khatriRaoProd(wmsTmp[k], wTmp)
            wVec.append(np.dot(wTmp, np.ones((self.rank_e[c], 1))))

        yhat_tmp = np.zeros((nb_data, self.nb_dim))
        for n in range(0, nb_data):
            for dim in range(0, self.nb_dim):
                yhat_tmp[n, dim] = alpha[dim] + np.dot(wVec[dim][:, None].T, x[n].flatten())

        # Append expert predictions weighted by the gate
        ytmp.append(yhat_tmp * priors[c][:, None])

    # Compute final predictions
    return np.sum(ytmp, axis=0)
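# The gate and expert predictions above rely on the identity vec(W) = (W_M ⊙ ... ⊙ W_1) 1_R
# for a rank-R CP decomposition. The standalone sketch below illustrates it in the matrix case,
# using scipy.linalg.khatri_rao as a stand-in for the repository's khatriRaoProd (an assumption
# about its column-wise Kronecker convention); with that convention the resulting vector matches
# the column-major (Fortran-order) flattening of W.
def _check_vec_trick():
    import numpy as np
    from scipy.linalg import khatri_rao  # column-wise Kronecker product

    rng = np.random.default_rng(0)
    d1, d2, R = 4, 3, 2
    B1 = rng.standard_normal((d1, R))
    B2 = rng.standard_normal((d2, R))

    W = B1.dot(B2.T)                                 # rank-R matrix
    wVec = khatri_rao(B2, B1).dot(np.ones(R))        # (B2 ⊙ B1) 1_R
    assert np.allclose(wVec, W.flatten(order='F'))   # column-major vec(W)

    # The Frobenius inner product <W, X> then becomes an ordinary dot product,
    # provided X is flattened with the same convention:
    X = rng.standard_normal((d1, d2))
    assert np.isclose(wVec.dot(X.flatten(order='F')), np.sum(W * X))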
def func(v):
    # Recover parameters from vector v
    # vector v is composed as [xhi_1, phi_1, psi_1, ..., xhi_c, phi_c, psi_c]
    V = np.reshape(v, (nb_dim * rank + 1, nb_class))
    xhi = V[0]
    phi = [np.reshape(V[1:nb_dim1 * rank + 1, i], (nb_dim1, rank)) for i in range(nb_class)]
    psi = [np.reshape(V[nb_dim1 * rank + 1:, i], (nb_dim2, rank)) for i in range(nb_class)]

    # Compute probabilities
    inProdEst = np.zeros((nb_data, nb_class))
    for n in range(0, nb_data):
        for dim in range(0, nb_class):
            phipsiVec = np.dot(khatriRaoProd(psi[dim], phi[dim]), np.ones((rank, 1)))
            inProdEst[n, dim] = xhi[dim] + np.dot(phipsiVec[:, None].T, x[n].flatten())
    est = softmax(inProdEst)
    est += 1e-308

    # Regularization term
    reg_term = 0.0
    for dim in range(0, nb_class):
        reg_term += np.linalg.norm(psi[dim]) + np.linalg.norm(phi[dim])

    return -np.sum(y * np.log(est)) + reg_fact * reg_term
def testing(self, x):
    """
    Estimate the outputs corresponding to new input points
    :param x: new input data (nb_data, dim_x)
    :return: corresponding output predictions (nb_data, dim_y)
    """
    nb_data = x.shape[0]

    # Compute gate probabilities
    inProdEst = np.zeros((nb_data, self.nb_class))
    for n in range(0, nb_data):
        for dim in range(0, self.nb_class):
            phipsiVec = np.dot(khatriRaoProd(self.psi[dim], self.phi[dim]),
                               np.ones((self.rank_g, 1)))
            inProdEst[n, dim] = self.xhi[dim] + np.dot(phipsiVec[:, None].T, x[n].flatten())
    priors = softmax(inProdEst).T

    ytmp = []
    for i in range(0, self.nb_class):
        # Compute experts predictions
        alpha = self.alpha[i][:]
        b1tmp = self.b1[i][:]
        b2tmp = self.b2[i][:]
        bVec = [np.dot(khatriRaoProd(b2tmp[j], b1tmp[j]), np.ones((self.rank_e[i], 1)))
                for j in range(self.nb_dim)]

        yhat_tmp = np.zeros((nb_data, self.nb_dim))
        for n in range(0, nb_data):
            for dim in range(0, self.nb_dim):
                yhat_tmp[n, dim] = alpha[dim] + np.dot(bVec[dim][:, None].T, x[n].flatten())

        # Append expert predictions weighted by the gate
        ytmp.append(yhat_tmp * priors[i][:, None])

    # Compute final predictions
    return np.sum(ytmp, axis=0)
def func(v):
    # Recover parameters from vector v
    # vector v is composed as [beta_1, v1_1, v2_1, ..., beta_c, v1_c, v2_c, ...]
    Vall = np.reshape(v, (nb_dim * rank + 1, nb_class))
    beta = Vall[0]
    V = []
    start = 1
    end = dims[0] * rank + 1
    for m in range(len(dims)):
        Vtmp = [np.reshape(Vall[start:end, i], (dims[m], rank)) for i in range(nb_class)]
        V.append(Vtmp)
        start = end
        if m < len(dims) - 1:
            end += dims[m + 1] * rank

    # Compute probabilities
    inProdEst = np.zeros((nb_data, nb_class))
    for n in range(0, nb_data):
        for c in range(0, nb_class):
            vTmp = khatriRaoProd(V[1][c], V[0][c])
            for i in range(2, len(dims)):
                vTmp = khatriRaoProd(V[i][c], vTmp)
            vVec = np.dot(vTmp, np.ones((rank, 1)))
            inProdEst[n, c] = beta[c] + np.dot(vVec[:, None].T, x[n].flatten())
    est = softmax(inProdEst)
    est += 1e-308

    # Regularization term
    reg_term = 0.0
    for c in range(0, nb_class):
        for m in range(len(dims)):
            reg_term += np.linalg.norm(V[m][c])

    return -np.sum(y * np.log(est)) + reg_fact * reg_term
def grad(v):
    # Recover parameters from vector v
    V = np.reshape(v, (nb_dim * rank + 1, nb_class))
    xhi = V[0]
    phi = [np.reshape(V[1:nb_dim1 * rank + 1, i], (nb_dim1, rank)) for i in range(nb_class)]
    psi = [np.reshape(V[nb_dim1 * rank + 1:, i], (nb_dim2, rank)) for i in range(nb_class)]

    # Compute probabilities
    inProdEst = np.zeros((nb_data, nb_class))
    for n in range(0, nb_data):
        for dim in range(0, nb_class):
            phipsiVec = np.dot(khatriRaoProd(psi[dim], phi[dim]), np.ones((rank, 1)))
            inProdEst[n, dim] = xhi[dim] + np.dot(phipsiVec[:, None].T, x[n].flatten())
    est = softmax(inProdEst)

    # Compute gradients
    grad_xhi = np.sum(est - y, axis=0).flatten()
    grad_phi = np.zeros((nb_dim1 * rank, nb_class))
    grad_psi = np.zeros((nb_dim2 * rank, nb_class))
    for dim in range(0, nb_class):
        xVecPsi = np.zeros((nb_data, nb_dim1 * rank))
        for n in range(0, nb_data):
            xVecPsi[n] = np.dot(x[n], psi[dim]).flatten()
        xVecPhi = np.zeros((nb_data, nb_dim2 * rank))
        for n in range(0, nb_data):
            xVecPhi[n] = np.dot(x[n].T, phi[dim]).flatten()

        grad_phi[:, dim] = np.dot(xVecPsi.T, (est[:, dim] - y[:, dim]))
        grad_psi[:, dim] = np.dot(xVecPhi.T, (est[:, dim] - y[:, dim]))

        # Regularization term
        grad_phi[:, dim] += 2 * reg_fact * phi[dim].flatten()
        grad_psi[:, dim] += 2 * reg_fact * psi[dim].flatten()

    return np.hstack((grad_xhi.flatten(), grad_phi.flatten(), grad_psi.flatten()))
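# A quick way to sanity-check the analytic gradient above is a finite-difference comparison
# against func. This is a minimal sketch only; it assumes func and grad are the closures defined
# above and share the same x, y, nb_dim1, nb_dim2, rank, nb_class and reg_fact. Note that func
# penalises unsquared Frobenius norms while grad adds 2 * reg_fact * param (the derivative of a
# squared norm), so the data term is best checked with reg_fact = 0.
def _check_gate_gradient():
    import numpy as np
    from scipy.optimize import check_grad

    v0 = 0.01 * np.random.randn((nb_dim * rank + 1) * nb_class)
    err = check_grad(func, grad, v0)  # finite-difference vs. analytic gradient
    print('gradient check error:', err)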
def training(self, x, y, y_class, reg_rr=1e-2, reg_lr=1e-2, maxiter=100,
             max_diff_ll=1e-5, optmethod='BFGS'):
    """
    Train the TME model
    :param x: input data (nb_data, d1, ..., dM)
    :param y: output data (nb_data, dim_y)
    :param y_class: class labels of the outputs (nb_data, nb_classes)
    :param reg_rr: regularization term of the experts
    :param reg_lr: regularization term of the gate
    :param maxiter: maximum number of iterations for the EM algorithm
    :param max_diff_ll: maximum difference of log-likelihood for the EM algorithm
    :param optmethod: optimization method for the logistic regression (gate)
    :return: log-likelihood at the last EM iteration
    """
    nb_data = x.shape[0]
    dX = x.shape[1:]
    self.nb_dim_x = len(dX)
    self.nb_class = y_class.shape[1]
    self.nb_dim = y.shape[1]

    if type(self.rank_e) is not list:
        self.rank_e = [self.rank_e for i in range(self.nb_class)]

    # Experts initialization
    self.alpha = []
    self.W = []
    for rank_e in self.rank_e:
        tensRR = TensorRidgeRegression(rank=rank_e)
        tensRR.training(x, y)
        self.alpha.append(tensRR.alpha[:])
        self.W.append(tensRR.W)
    self.sigma = [np.eye(self.nb_dim) for i in range(0, self.nb_class)]

    # Gating initialization
    tensRR = TensorRidgeRegression(rank=self.rank_g)
    tensRR.training(x, y_class)
    beta_init = np.array(tensRR.alpha).T
    Vall = beta_init
    for m in range(self.nb_dim_x):
        wTmpRR = []
        for c in range(self.nb_class):
            wTmpRR.append(tensRR.W[c][m])
        Vtmp = np.reshape(np.array(wTmpRR), (self.nb_class, dX[m] * self.rank_g)).T
        Vall = np.vstack((Vall, Vtmp))
    vinit = Vall.flatten()
    self.beta, self.V = optTensLogReg(x, y_class, rank=self.rank_g, v_init=vinit,
                                      optmethod=optmethod, reg_fact=reg_lr)

    # EM algorithm
    nb_min_steps = 2  # min num iterations
    nb_max_steps = maxiter  # max iterations
    LL = np.zeros(nb_max_steps)
    for it in range(nb_max_steps):
        # E-step
        Ltmp = np.zeros((self.nb_class, nb_data))

        # Compute gate probabilities
        inProdEst = np.zeros((nb_data, self.nb_class))
        for n in range(0, nb_data):
            for c in range(0, self.nb_class):
                vTmp = khatriRaoProd(self.V[1][c], self.V[0][c])
                for i in range(2, self.nb_dim_x):
                    vTmp = khatriRaoProd(self.V[i][c], vTmp)
                vVec = np.dot(vTmp, np.ones((self.rank_g, 1)))
                inProdEst[n, c] = self.beta[c] + np.dot(vVec[:, None].T, x[n].flatten())
        priors = softmax(inProdEst).T

        # Compute experts distributions weighted by gate probabilities
        for c in range(0, self.nb_class):
            alpha = self.alpha[c][:]

            # Compute vec(W) for each output dimension
            wVec = []
            for j in range(self.nb_dim):
                wmsTmp = self.W[c][j]
                wTmp = khatriRaoProd(wmsTmp[1], wmsTmp[0])
                for k in range(2, self.nb_dim_x):
                    wTmp = khatriRaoProd(wmsTmp[k], wTmp)
                wVec.append(np.dot(wTmp, np.ones((self.rank_e[c], 1))))

            # Compute predictions
            yhat_tmp = np.zeros((nb_data, self.nb_dim))
            for n in range(0, nb_data):
                for dim in range(0, self.nb_dim):
                    yhat_tmp[n, dim] = alpha[dim] + np.dot(wVec[dim][:, None].T, x[n].flatten())

            # Likelihood
            Ltmp[c] = priors[c] * multi_variate_normal(y, yhat_tmp, self.sigma[c], log=False)

        # Compute responsibilities
        GAMMA = Ltmp / (np.sum(Ltmp, axis=0) + 1e-100)
        LL[it] = np.sum(np.sum(GAMMA * np.log(Ltmp + 1e-100)))

        # M-step
        # Experts parameters update
        yhat = []
        for c in range(0, self.nb_class):
            r = np.diag(GAMMA[c])
            sqrGAMMA = np.sqrt(GAMMA[c])
            weighted_y = sqrGAMMA[:, None] * y
            for d in range(len(x.shape) - 1):
                sqrGAMMA = np.expand_dims(sqrGAMMA, axis=-1)
            weighted_x = sqrGAMMA * x

            tensRR = TensorRidgeRegression(rank=self.rank_e[c])
            tensRR.training(weighted_x, weighted_y, reg=reg_rr)
            self.alpha[c] = tensRR.alpha[:]
            self.W[c] = tensRR.W

            yhat_tmp = tensRR.testing_multiple(x)
            yhat.append(yhat_tmp * priors[c][:, None])
            self.sigma[c] = (np.dot(np.dot((y - yhat_tmp).T, r), (y - yhat_tmp))
                             / sum(GAMMA[c]) + 1e-6 * np.eye(self.nb_dim))

        # Gate parameters update
        beta_init = np.array(self.beta).T
        Vall = beta_init
        for m in range(self.nb_dim_x):
            Vtmp = np.reshape(np.array(self.V[m]), (self.nb_class, dX[m] * self.rank_g)).T
            Vall = np.vstack((Vall, Vtmp))
        vinit = Vall.flatten()
        self.beta, self.V = optTensLogReg(x, GAMMA.T, rank=self.rank_g, v_init=vinit,
                                          optmethod=optmethod, reg_fact=reg_lr)

        print(it)

        # Check for convergence
        if it > nb_min_steps:
            if LL[it] - LL[it - 1] < max_diff_ll:
                print('Converged after %d iterations: %.3e' % (it, LL[it]))
                print(LL)
                return LL[it]

    print("TME did not converge before reaching the maximum number of iterations. "
          "Consider increasing maxiter.")
    print(LL)
    return LL[-1]
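# Minimal usage sketch of the class above on random data. The shapes and the constructor
# arguments (a list of expert ranks and a gate rank) follow the experiment script further below;
# this is an illustration only, not part of the model code.
def _tme_usage_example():
    import numpy as np

    N, d1, d2, dY, C = 200, 5, 4, 2, 2
    X = np.random.randn(N, d1, d2)                    # matrix-valued (2nd-order) inputs
    y = np.random.randn(N, dY)                        # continuous outputs
    y_class = np.eye(C)[np.random.randint(0, C, N)]   # one-hot expert/class labels

    tme = TensorMixtureLinearExperts([1, 1], 1)       # expert ranks per class, gate rank
    tme.training(X, y, y_class, reg_rr=1e-1, reg_lr=1e-1, maxiter=20, optmethod='CG')
    y_pred = tme.testing(X)                           # (N, dY) output predictions
    return y_pred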
def training(self, x, y, reg=1e-2, maxDiffCrit=1e-4, maxIter=200):
    """
    Train the parameters of the MRR model
    :param x: input matrices (nb_data, dim1, dim2)
    :param y: output vectors (nb_data, dim_y)
    :param reg: regularization term
    :param maxDiffCrit: stopping criterion for the alternating least squares procedure
    :param maxIter: maximum number of iterations for the alternating least squares procedure
    """
    # Dimensions
    N = x.shape[0]
    d1 = x.shape[1]
    d2 = x.shape[2]
    self.dY = y.shape[1]

    for dim in range(0, self.dY):
        # Initialization
        # self.b1.append(np.random.randn(d1, self.rank))
        # self.b2.append(np.random.randn(d2, self.rank))
        # self.alpha.append(np.random.randn(1))
        self.b1.append(np.ones((d1, self.rank)))
        self.b2.append(np.ones((d2, self.rank)))
        self.alpha.append(np.zeros(1))
        self.bVec.append(np.random.randn(d1 * d2, 1))

        # Optimization of parameters (ALS procedure)
        nbIter = 1
        prevRes = 0
        while nbIter < maxIter:
            # Update b1
            zVec1 = np.zeros((N, d1 * self.rank))
            for n in range(0, N):
                zVec1[n] = np.dot(x[n], self.b2[-1]).flatten()
            b1 = np.linalg.solve(zVec1.T.dot(zVec1) + np.eye(d1 * self.rank) * reg,
                                 zVec1.T).dot(y[:, dim] - self.alpha[-1])
            self.b1[-1] = np.reshape(b1, (d1, self.rank))

            # Update b2
            zVec2 = np.zeros((N, d2 * self.rank))
            for n in range(0, N):
                zVec2[n] = np.dot(x[n].T, self.b1[-1]).flatten()
            b2 = np.linalg.solve(zVec2.T.dot(zVec2) + np.eye(d2 * self.rank) * reg,
                                 zVec2.T).dot(y[:, dim] - self.alpha[-1])
            self.b2[-1] = np.reshape(b2, (d2, self.rank))

            # Update alpha
            self.bVec[-1] = np.dot(khatriRaoProd(self.b2[-1], self.b1[-1]),
                                   np.ones((self.rank, 1)))
            alpha = 0
            for n in range(0, N):
                alpha += y[n, dim] - np.dot(self.bVec[-1][:, None].T, x[n].flatten())
            self.alpha[-1] = alpha[0] / N

            # Compute residuals
            res = 0
            for n in range(0, N):
                res += (y[n, dim] - self.alpha[-1]
                        - np.dot(self.bVec[-1][:, None].T, x[n].flatten())) ** 2
            resDiff = prevRes - res

            # Check convergence
            if resDiff < maxDiffCrit and nbIter > 1:
                print('MRR converged after %d iterations.' % nbIter)
                break
            nbIter += 1
            prevRes = res

        if resDiff > maxDiffCrit:
            print('MRR did not converge after %d iterations.' % nbIter)
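# The ALS updates above exploit the fact that, with one factor fixed, the bilinear term
# <B1 B2^T, X>_F is linear in the other factor: <B1 B2^T, X>_F = vec(X B2)^T vec(B1), with both
# sides vectorised row-major as in zVec1. Each factor update is therefore a standard ridge
# regression on those features. Standalone numerical check of the identity (illustration only):
def _check_als_linearisation():
    import numpy as np

    rng = np.random.default_rng(1)
    d1, d2, R = 5, 4, 2
    B1 = rng.standard_normal((d1, R))
    B2 = rng.standard_normal((d2, R))
    X = rng.standard_normal((d1, d2))

    lhs = np.sum(B1.dot(B2.T) * X)                # <B1 B2^T, X>_F
    rhs = X.dot(B2).flatten().dot(B1.flatten())   # vec(X B2)^T vec(B1), row-major
    assert np.isclose(lhs, rhs)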
def training(self, x, y, y_class, reg_rr=1e-2, reg_lr=1e-2, maxiter=100,
             max_diff_ll=1e-5, optmethod='BFGS'):
    """
    Train the MME model
    :param x: input data (nb_data, d1, d2)
    :param y: output data (nb_data, dim_y)
    :param y_class: class labels of the outputs (nb_data, nb_classes)
    :param reg_rr: regularization term of the experts
    :param reg_lr: regularization term of the gate
    :param maxiter: maximum number of iterations for the EM algorithm
    :param max_diff_ll: maximum difference of log-likelihood for the EM algorithm
    :param optmethod: optimization method for the logistic regression (gate)
    :return: log-likelihood at the last EM iteration
    """
    nb_data = x.shape[0]
    d1 = x.shape[1]
    d2 = x.shape[2]
    self.nb_class = y_class.shape[1]
    self.nb_dim = y.shape[1]

    if type(self.rank_e) is not list:
        self.rank_e = [self.rank_e for i in range(self.nb_class)]

    # Initialization with sklearn
    # ysk = np.sum(y.T * np.array(range(0, self.nb_class))[:, None], axis=0)
    # mul_lr = sk.linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(xvec, ysk)
    # self.V = mul_lr.coef_.T
    # self.V = optLogReg(xvec.T, y.T)
    # matRR = MatrixRidgeRegression(rank=self.rank_e)
    # matRR.training(x, y)
    # self.alpha = [matRR.alpha[:] for i in range(0, self.nb_class)]
    # self.b1 = [matRR.b1[:] for i in range(0, self.nb_class)]
    # self.b2 = [matRR.b2[:] for i in range(0, self.nb_class)]

    # Experts initialization
    self.alpha = []
    self.b1 = []
    self.b2 = []
    for rank_e in self.rank_e:
        matRR = MatrixRidgeRegression(rank=rank_e)
        matRR.training(x, y)
        self.alpha.append(matRR.alpha[:])
        self.b1.append(matRR.b1[:])
        self.b2.append(matRR.b2[:])
    self.sigma = [np.eye(self.nb_dim) for i in range(0, self.nb_class)]

    # Gating initialization
    matRR = MatrixRidgeRegression(rank=self.rank_g)
    matRR.training(x, y_class)
    xhi_init = np.array(matRR.alpha).T
    phi_init = np.reshape(np.array(matRR.b1), (self.nb_class, d1 * self.rank_g)).T
    psi_init = np.reshape(np.array(matRR.b2), (self.nb_class, d2 * self.rank_g)).T
    vinit = np.vstack((xhi_init, phi_init, psi_init)).flatten()
    self.xhi, self.phi, self.psi = optMatLogReg(x, y_class, rank=self.rank_g, v_init=vinit,
                                                optmethod=optmethod, reg_fact=reg_lr)

    # EM algorithm
    nb_min_steps = 2  # min num iterations
    nb_max_steps = maxiter  # max iterations
    LL = np.zeros(nb_max_steps)
    for it in range(nb_max_steps):
        # E-step
        Ltmp = np.zeros((self.nb_class, nb_data))

        # Compute gate probabilities
        inProdEst = np.zeros((nb_data, self.nb_class))
        for n in range(0, nb_data):
            for dim in range(0, self.nb_class):
                phipsiVec = np.dot(khatriRaoProd(self.psi[dim], self.phi[dim]),
                                   np.ones((self.rank_g, 1)))
                inProdEst[n, dim] = self.xhi[dim] + np.dot(phipsiVec[:, None].T, x[n].flatten())
        priors = softmax(inProdEst).T

        # Compute experts distributions weighted by gate probabilities
        for i in range(0, self.nb_class):
            alpha = self.alpha[i][:]
            b1tmp = self.b1[i][:]
            b2tmp = self.b2[i][:]
            bVec = [np.dot(khatriRaoProd(b2tmp[j], b1tmp[j]), np.ones((self.rank_e[i], 1)))
                    for j in range(self.nb_dim)]

            yhat_tmp = np.zeros((nb_data, self.nb_dim))
            for n in range(0, nb_data):
                for dim in range(0, self.nb_dim):
                    yhat_tmp[n, dim] = alpha[dim] + np.dot(bVec[dim][:, None].T, x[n].flatten())

            Ltmp[i] = priors[i] * multi_variate_normal(y, yhat_tmp, self.sigma[i], log=False)

        # Compute responsibilities
        GAMMA = Ltmp / (np.sum(Ltmp, axis=0) + 1e-100)
        LL[it] = np.sum(np.sum(GAMMA * np.log(Ltmp + 1e-100)))

        # M-step
        # Experts parameters update
        yhat = []
        for i in range(0, self.nb_class):
            r = np.diag(GAMMA[i])
            sqrGAMMA = np.sqrt(GAMMA[i])
            weighted_x = sqrGAMMA[:, None, None] * x
            weighted_y = sqrGAMMA[:, None] * y

            matRR = MatrixRidgeRegression(rank=self.rank_e[i])
            matRR.training(weighted_x, weighted_y, reg=reg_rr)
            self.alpha[i] = matRR.alpha[:]
            self.b1[i] = matRR.b1[:]
            self.b2[i] = matRR.b2[:]

            yhat_tmp = matRR.testing_multiple(x)
            yhat.append(yhat_tmp * priors[i][:, None])
            self.sigma[i] = (np.dot(np.dot((y - yhat_tmp).T, r), (y - yhat_tmp))
                             / sum(GAMMA[i]) + 1e-6 * np.eye(self.nb_dim))

        # Gate parameters update
        xhi_init = np.array(self.xhi).T
        phi_init = np.reshape(np.array(self.phi), (self.nb_class, d1 * self.rank_g)).T
        psi_init = np.reshape(np.array(self.psi), (self.nb_class, d2 * self.rank_g)).T
        vinit = np.vstack((xhi_init, phi_init, psi_init)).flatten()
        self.xhi, self.phi, self.psi = optMatLogReg(x, GAMMA.T, rank=self.rank_g, v_init=vinit,
                                                    optmethod=optmethod, reg_fact=reg_lr)

        print(it)

        # Check for convergence
        if it > nb_min_steps:
            if LL[it] - LL[it - 1] < max_diff_ll:
                print('Converged after %d iterations: %.3e' % (it, LL[it]))
                print(LL)
                return LL[it]

    print("MME did not converge before reaching the maximum number of iterations. "
          "Consider increasing maxiter.")
    print(LL)
    return LL[-1]
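# In the M-step above, the responsibilities GAMMA[i] enter the expert update by rescaling the
# data with their square roots: minimising sum_n gamma_n * ||r_n||^2 over the expert parameters
# equals an unweighted least-squares fit on sqrt(gamma_n)-scaled data, which is why the ridge
# regression is called on sqrGAMMA * x and sqrGAMMA * y. Standalone check of that weighting
# identity (illustration only):
def _check_sqrt_gamma_weighting():
    import numpy as np

    rng = np.random.default_rng(2)
    N, D = 50, 3
    gamma = rng.random(N)
    res = rng.standard_normal((N, D))           # some residuals y_n - yhat_n

    weighted = np.sum(gamma * np.sum(res ** 2, axis=1))
    sqrt_form = np.sum((np.sqrt(gamma)[:, None] * res) ** 2)
    assert np.isclose(weighted, sqrt_form)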
def training(self, x, y, reg=1e-2, maxDiffCrit=1e-4, maxIter=200):
    """
    Train the parameters of the TRR model
    :param x: input tensors (nb_data, dim1, dim2, ...)
    :param y: output vectors (nb_data, dim_y)
    :param reg: regularization term
    :param maxDiffCrit: stopping criterion for the alternating least squares procedure
    :param maxIter: maximum number of iterations for the alternating least squares procedure
    """
    # Dimensions
    N = x.shape[0]
    dX = x.shape[1:]
    self.dY = y.shape[1]

    for dim in range(0, self.dY):
        # Initialization
        wms = []
        for m in range(len(dX)):
            wms.append(np.ones((dX[m], self.rank)))
        self.alpha.append(np.zeros(1))
        self.wVec.append(np.reshape(np.zeros(dX), -1))

        # Optimization of parameters (ALS procedure)
        nbIter = 1
        prevRes = 0
        while nbIter < maxIter:
            for m in range(len(dX)):
                # Compute Wm complement (WM o ... o Wm+1 o Wm-1 o ... o W1)
                if m == 0:
                    wmComplement = wms[1]
                    for i in range(2, len(dX)):
                        wmComplement = khatriRaoProd(wms[i], wmComplement)
                else:
                    wmComplement = wms[0]
                    for i in range(1, len(dX)):
                        if i != m:
                            wmComplement = khatriRaoProd(wms[i], wmComplement)

                # Update Wm
                zVec = np.zeros((N, dX[m] * self.rank))
                for n in range(0, N):
                    zVec[n] = np.dot(tensor2mat(x[n], m), wmComplement).flatten()
                wm = np.linalg.solve(zVec.T.dot(zVec) + np.eye(dX[m] * self.rank) * reg,
                                     zVec.T).dot(y[:, dim] - self.alpha[-1])
                wms[m] = np.reshape(wm, (dX[m], self.rank))

            # Update alpha
            wTmp = khatriRaoProd(wms[1], wms[0])
            for i in range(2, len(dX)):
                wTmp = khatriRaoProd(wms[i], wTmp)
            self.wVec[-1] = np.dot(wTmp, np.ones((self.rank, 1)))
            alpha = 0
            for n in range(0, N):
                alpha += y[n, dim] - np.dot(self.wVec[-1][:, None].T, x[n].flatten())
            self.alpha[-1] = alpha[0] / N

            # Compute residuals
            res = 0
            for n in range(0, N):
                res += (y[n, dim] - self.alpha[-1]
                        - np.dot(self.wVec[-1][:, None].T, x[n].flatten())) ** 2
            resDiff = prevRes - res

            # Check convergence
            if resDiff < maxDiffCrit and nbIter > 1:
                print('TRR converged after %d iterations.' % nbIter)
                break
            nbIter += 1
            prevRes = res

        if resDiff > maxDiffCrit:
            print('TRR did not converge after %d iterations.' % nbIter)

        self.W.append(wms)
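# Minimal usage sketch of the tensor ridge regression above on random third-order inputs.
# Illustration only; it assumes the class and its helpers (tensor2mat, khatriRaoProd) are in
# scope and that testing_multiple returns stacked predictions, as used by the mixture-of-experts
# training code above.
def _trr_usage_example():
    import numpy as np

    N, d1, d2, d3, dY = 100, 4, 3, 2, 1
    x = np.random.randn(N, d1, d2, d3)        # third-order tensor inputs
    y = np.random.randn(N, dY)

    trr = TensorRidgeRegression(rank=2)
    trr.training(x, y, reg=1e-2, maxIter=100)
    # trr.W[d] holds the CP factor matrices [W1, ..., WM] for the d-th output dimension
    y_hat = trr.testing_multiple(x)           # (N, dY) predictions
    return y_hat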
# Tensor-valued mixture of experts
print('Tensor-valued mixture of experts...')
tme = TensorMixtureLinearExperts([tme_rank_e, tme_rank_e], tme_rank_g)
tme_LL = tme.training(X, y, y_class, reg_rr=1e-1, reg_lr=1e-1, maxiter=20,
                      max_diff_ll=5.0, optmethod='CG')

tme_coeffs = []
tme_coeffs_lr = []
for c in range(0, tme.nb_class):
    # TRR part
    alpha = tme.alpha[c][:]
    # Compute vec(W)
    tme_bVec = []
    for j in range(tme.nb_dim):
        wmsTmp = tme.W[c][j]
        wTmp = khatriRaoProd(wmsTmp[1], wmsTmp[0])
        for k in range(2, tme.nb_dim_x):
            wTmp = khatriRaoProd(wmsTmp[k], wTmp)
        tme_bVec.append(np.dot(wTmp, np.ones((tme.rank_e[c], 1))))
    tme_coeffs.append(np.reshape(tme_bVec[0], tuple_dim))

    # TLR part
    vTmp = khatriRaoProd(tme.V[1][c], tme.V[0][c])
    for j in range(2, tme.nb_dim_x):
        vTmp = khatriRaoProd(tme.V[j][c], vTmp)
    tme_vVec = np.dot(vTmp, np.ones((tme.rank_g, 1)))
    tme_coeffs_lr.append(np.reshape(tme_vVec, tuple_dim))

rmse_tme_coeffs_rr = np.sqrt(np.sum([(np.sum((bVec[i] - tme_coeffs[i].flatten()) ** 2))
                                     for i in range(Nclass)]) / (Nclass * d1 * d2))
tot_rmse = np.sum([(np.sum((bVec[i] - me_coeffs[i].flatten()) ** 2)) for i in range(Nclass)]) \
    + np.sum((np.sum((phipsiVec - me_coeffs_lr[0].flatten()) ** 2)))
tot_rmse /= Nclass * d1 * d2 + d1 * d2
rmse_me_coeffs = np.sqrt(tot_rmse)

# Matrix-valued mixture of experts
print('Matrix-valued mixture of experts...')
mme = MaxtrixMixtureLinearExperts([mme_rank_e, mme_rank_e], mme_rank_g)
mme_LL = mme.training(X, y, y_class, reg_rr=1e-1, reg_lr=1e-1, maxiter=20,
                      max_diff_ll=5.0, optmethod='CG')

mme_coeffs = []
mme_coeffs_lr = []
for i in range(0, mme.nb_class):
    alpha = mme.alpha[i][:]
    b1tmp = mme.b1[i][:]
    b2tmp = mme.b2[i][:]
    mme_bVec = [np.dot(khatriRaoProd(b2tmp[j], b1tmp[j]), np.ones((mme.rank_e[i], 1)))
                for j in range(mme.nb_dim)]
    mme_coeffs.append(np.reshape(mme_bVec[0], (d1, d2)))

    mme_phipsiVec = np.dot(khatriRaoProd(mme.psi[i], mme.phi[i]), np.ones((mme.rank_g, 1)))
    mme_coeffs_lr.append(np.reshape(mme_phipsiVec, (d1, d2)))

rmse_mme_coeffs_rr = np.sqrt(np.sum([(np.sum((bVec[i] - mme_coeffs[i].flatten()) ** 2))
                                     for i in range(Nclass)]) / (Nclass * d1 * d2))
rmse_mme_coeffs_lr = np.sqrt(np.sum((np.sum((phipsiVec - mme_coeffs_lr[0].flatten()) ** 2)))
                             / (d1 * d2))

tot_rmse = np.sum([(np.sum((bVec[i] - mme_coeffs[i].flatten()) ** 2)) for i in range(Nclass)]) \
    + np.sum((np.sum((phipsiVec - mme_coeffs_lr[0].flatten()) ** 2)))
tot_rmse /= Nclass * d1 * d2 + d1 * d2
rmse_mme_coeffs = np.sqrt(tot_rmse)

# Show recovered coefficients
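# One plausible way to visualise the recovered coefficients mentioned above (a sketch only, not
# the original plotting code): show the expert and gate coefficient matrices side by side, e.g.
# _show_coeffs_sketch(mme_coeffs, mme_coeffs_lr).
def _show_coeffs_sketch(coeffs, coeffs_lr):
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(2, max(len(coeffs), len(coeffs_lr)), squeeze=False)
    for i, c in enumerate(coeffs):
        axes[0, i].imshow(c)
        axes[0, i].set_title('expert %d' % i)
    for i, c in enumerate(coeffs_lr):
        axes[1, i].imshow(c)
        axes[1, i].set_title('gate %d' % i)
    plt.show()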
def grad(v):
    # Recover parameters from vector v
    Vall = np.reshape(v, (nb_dim * rank + 1, nb_class))
    beta = Vall[0]
    V = []
    start = 1
    end = dims[0] * rank + 1
    for m in range(len(dims)):
        Vtmp = [np.reshape(Vall[start:end, i], (dims[m], rank)) for i in range(nb_class)]
        V.append(Vtmp)
        start = end
        if m < len(dims) - 1:
            end += dims[m + 1] * rank

    # Compute probabilities
    inProdEst = np.zeros((nb_data, nb_class))
    for n in range(0, nb_data):
        for c in range(0, nb_class):
            vTmp = khatriRaoProd(V[1][c], V[0][c])
            for i in range(2, len(dims)):
                vTmp = khatriRaoProd(V[i][c], vTmp)
            vVec = np.dot(vTmp, np.ones((rank, 1)))
            inProdEst[n, c] = beta[c] + np.dot(vVec[:, None].T, x[n].flatten())
    est = softmax(inProdEst)

    # Compute gradients
    grad_beta = np.sum(est - y, axis=0).flatten()
    grad_vec = grad_beta.flatten()
    gradV = [np.zeros((dims[m] * rank, nb_class)) for m in range(len(dims))]
    for c in range(nb_class):
        for m in range(len(dims)):
            # Compute Vm complement (VM o ... o Vm+1 o Vm-1 o ... o V1)
            if m == 0:
                vmComplement = V[1][c]
                for i in range(2, len(dims)):
                    vmComplement = khatriRaoProd(V[i][c], vmComplement)
            else:
                vmComplement = V[0][c]
                for i in range(1, len(dims)):
                    if i != m:
                        vmComplement = khatriRaoProd(V[i][c], vmComplement)

            # Gradient
            zVec = np.zeros((nb_data, dims[m] * rank))
            for n in range(0, nb_data):
                zVec[n] = np.dot(tensor2mat(x[n], m), vmComplement).flatten()
            gradV[m][:, c] = np.dot(zVec.T, (est[:, c] - y[:, c]))

            # Regularization term
            gradV[m][:, c] += 2 * reg_fact * V[m][c].flatten()

    for m in range(len(dims)):
        grad_vec = np.hstack((grad_vec, gradV[m].flatten()))

    return grad_vec
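# func and grad above define the objective and gradient of the low-rank multinomial logistic
# gate; a wrapper such as optTensLogReg would typically hand them to a quasi-Newton solver.
# The call below is a plausible sketch of that step (an assumption, not the repository's actual
# implementation), with v_init and optmethod defined as in the training code above.
def _optimise_gate_sketch(v_init, optmethod='BFGS'):
    from scipy.optimize import minimize

    result = minimize(func, v_init, jac=grad, method=optmethod,
                      options={'maxiter': 200, 'disp': False})
    return result.x   # flattened [beta, V_1, ..., V_M], to be reshaped as in func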