def get_value(self, alphas, betas, gamma, gamma_nuc_norm=None):
    """Training objective: squared loss on the observed (training) entries, plus a
    nuclear norm penalty on gamma and a group lasso penalty on each alpha/beta group."""
    matrix_eval = make_column_major_flat(
        self.observed_matrix - get_matrix_completion_groups_fitted_values(
            self.row_features,
            self.col_features,
            alphas,
            betas,
            gamma,
        ))
    square_loss = 0.5 / self.num_train * get_norm2(
        matrix_eval[self.train_idx],
        power=2,
    )
    if gamma_nuc_norm is not None:
        nuc_norm = self.lambdas[0] * gamma_nuc_norm
    else:
        nuc_norm = self.lambdas[0] * np.linalg.norm(gamma, ord="nuc")
    alpha_pen = 0
    for i, a in enumerate(alphas):
        # group lasso penalties
        alpha_pen += self.lambdas[1 + i] * get_norm2(a, power=1)
    beta_pen = 0
    for i, b in enumerate(betas):
        # group lasso penalties
        beta_pen += self.lambdas[1 + self.num_row_groups + i] * get_norm2(b, power=1)
    return square_loss + nuc_norm + alpha_pen + beta_pen
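# The helpers get_norm2, make_column_major_flat, and make_column_major_reshape are
# defined elsewhere in the repo and are not shown in this section. A minimal sketch
# consistent with how they are called here (an assumption, not the repo's actual
# implementations):
import numpy as np

def get_norm2(x, power=1):
    # Euclidean norm of x, optionally raised to a power
    # (power=2 gives the squared norm used in the square loss)
    return np.power(np.linalg.norm(x), power)

def make_column_major_flat(m):
    # flatten a matrix into a single column in column-major (Fortran) order,
    # so that train_idx can index individual observed entries
    return np.reshape(m, (m.size, 1), order="F")

def make_column_major_reshape(v, shape):
    # inverse of make_column_major_flat: fold a flat column back into a matrix
    return np.reshape(v, shape, order="F")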
def _get_block_diag_component(idx):
    # rank-one piece of the Hessian of the group lasso penalty lambda * ||beta||_2:
    # -lambda * beta * beta^T / ||beta||^3
    beta = beta_minis[idx]
    if beta.size == 0:
        return np.matrix(np.zeros((0, 0))).T
    betabeta = beta * beta.T
    block_diag_component = -1 * self.fmodel.current_lambdas[idx] / get_norm2(beta, power=3) * betabeta
    return block_diag_component
def _get_dbeta_dlambda1(beta_minis, matrix_to_invert):
    if np.concatenate(beta_minis).size == 0:
        return np.zeros((matrix_to_invert.shape[0], 1))
    else:
        normed_betas = [beta / get_norm2(beta) for beta in beta_minis]
        all_normed_betas = np.concatenate(normed_betas)
        # solve matrix_to_invert * x = -all_normed_betas in the least-squares sense via LSMR
        dbeta_dlambda1 = sp.sparse.linalg.lsmr(matrix_to_invert, -1 * all_normed_betas.A1)
        return np.matrix(dbeta_dlambda1[0]).T
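# _get_dbeta_dlambda1 solves the linear system from implicit differentiation with
# scipy's LSMR (an iterative least-squares solver) rather than forming an explicit
# inverse, presumably because the system can be singular or ill-conditioned. A
# standalone toy example of the same call pattern (H and g are made up for illustration):
import numpy as np
import scipy as sp
import scipy.sparse.linalg

H = np.array([[4.0, 1.0], [1.0, 3.0]])
g = np.array([1.0, 2.0])
# lsmr returns a tuple; the first element is the least-squares solution of H x = -g
x = sp.sparse.linalg.lsmr(H, -g)[0]
print(np.allclose(H.dot(x), -g))  # True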
def _get_block_diag_component(idx):
    beta = beta_minis[idx]
    if beta.size == 0:
        return np.matrix(np.zeros((0, 0))).T
    # np.diagflat(beta) * repeat_hstacked_beta is just the outer product beta * beta.T
    repeat_hstacked_beta = np.tile(beta, (1, beta.size)).T
    block_diag_component = -1 * self.fmodel.current_lambdas[idx] / get_norm2(beta, power=3) * np.diagflat(beta) * repeat_hstacked_beta
    return block_diag_component
def _get_dbeta_dlambda1(beta, matrix_to_invert, num_features_before):
    if beta.size == 0:
        return np.zeros((matrix_to_invert.shape[0], 1))
    else:
        normed_beta = beta / get_norm2(beta)
        # zero-pad the normalized beta so it lines up with this group's block of
        # features; total_features is defined in the enclosing scope
        zero_normed_beta = np.concatenate([
            np.matrix(np.zeros(num_features_before)).T,
            normed_beta,
            np.matrix(np.zeros(total_features - normed_beta.size - num_features_before)).T,
        ])
        dbeta_dlambda1 = sp.sparse.linalg.lsmr(matrix_to_invert, -1 * zero_normed_beta.A1)[0]
        return np.matrix(dbeta_dlambda1).T
def get_value(self, alpha, beta, gamma, given_nuc_norm=None):
    """Training objective: squared loss on the observed (training) entries, plus a
    nuclear norm penalty on gamma and elastic-net (lasso + ridge) penalties on alpha and beta."""
    matrix_eval = make_column_major_flat(
        self.observed_matrix - get_matrix_completion_fitted_values(
            self.row_features,
            self.col_features,
            alpha,
            beta,
            gamma,
        ))
    square_loss = 0.5 / self.num_train * get_norm2(
        matrix_eval[self.train_idx],
        power=2,
    )
    if given_nuc_norm is None:
        nuc_norm = self.lambdas[0] * np.linalg.norm(gamma, ord="nuc")
    else:
        nuc_norm = self.lambdas[0] * given_nuc_norm
    alpha_norm1 = self.lambdas[1] * np.linalg.norm(alpha, ord=1)
    alpha_norm2 = 0.5 * self.lambdas[2] * get_norm2(alpha, power=2)
    beta_norm1 = self.lambdas[3] * np.linalg.norm(beta, ord=1)
    beta_norm2 = 0.5 * self.lambdas[4] * get_norm2(beta, power=2)
    return square_loss + nuc_norm + alpha_norm1 + alpha_norm2 + beta_norm1 + beta_norm2
def get_prox_l2(self, x_vector, scale_factor):
    # block soft-thresholding: the proximal operator of scale_factor * ||x||_2
    thres_x = max(1 - scale_factor / get_norm2(x_vector, power=1), 0) * x_vector
    return thres_x
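# get_prox_l2 is block soft-thresholding, i.e. the proximal operator of
# scale_factor * ||x||_2. A standalone numpy sketch (prox_l2 is a hypothetical name,
# and unlike the method above it guards against a zero input):
import numpy as np

def prox_l2(x, t):
    # prox_{t * ||.||_2}(x) = max(1 - t / ||x||_2, 0) * x
    norm_x = np.linalg.norm(x)
    if norm_x == 0:
        return np.zeros_like(x)
    return max(1.0 - t / norm_x, 0.0) * x

x = np.array([3.0, 4.0])   # ||x||_2 = 5
print(prox_l2(x, 2.0))     # [1.8, 2.4]: shrunk toward zero
print(prox_l2(x, 6.0))     # [0., 0.]: the whole group is zeroed out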
def _get_diagmatrix_component(idx):
    # diagonal piece of the Hessian of the group lasso penalty: lambda / ||beta|| * I
    beta = beta_minis[idx]
    if beta.size == 0:
        return np.matrix(np.zeros((0, 0))).T
    return self.fmodel.current_lambdas[idx] / get_norm2(beta) * np.identity(beta.size)
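# _get_diagmatrix_component and _get_block_diag_component appear to be the two pieces of
# lambda * (I / ||beta|| - beta beta^T / ||beta||^3), the Hessian of the group lasso
# penalty lambda * ||beta||_2 on its nonzero support. A quick finite-difference check of
# that identity (standalone illustration, not repo code):
import numpy as np

def penalty_grad(beta, lam):
    # gradient of lam * ||beta||_2 for beta != 0
    return lam * beta / np.linalg.norm(beta)

beta = np.array([1.0, -2.0, 0.5])
lam = 0.7
norm = np.linalg.norm(beta)
hessian = lam * (np.eye(beta.size) / norm - np.outer(beta, beta) / norm ** 3)

# numerical Jacobian of the gradient, built column by column
eps = 1e-6
num_hessian = np.column_stack([
    (penalty_grad(beta + eps * e, lam) - penalty_grad(beta - eps * e, lam)) / (2 * eps)
    for e in np.eye(beta.size)])
print(np.allclose(hessian, num_hessian, atol=1e-6))  # True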
def _get_dmodel_dlambda(
    self,
    lambda_idx,
    imp_derivs,
    alphas,
    betas,
    gamma,
    row_features,
    col_features,
    u_hat,
    sigma_hat,
    v_hat,
    lambdas,
):
    # this fcn accepts mini-fied model parameters - alpha, beta, and u/sigma/v
    # returns the gradient of the model parameters wrt lambda
    num_alphas = len(alphas)
    dd_square_loss_mini = self._get_dd_square_loss_mini(imp_derivs, row_features, col_features)
    sigma_mask = self._create_sigma_mask(sigma_hat)
    obj = 0
    lambda_offset = 1 if sigma_hat.size > 0 else 0

    # Constraint from implicit differentiation of the optimality conditions
    # that were defined by taking the gradient of the training objective wrt gamma
    if sigma_hat.size > 0:
        d_square_loss = self._get_d_square_loss(alphas, betas, gamma, row_features, col_features)
        d_square_loss_reshape = make_column_major_reshape(
            d_square_loss,
            (self.data.num_rows, self.data.num_cols))
        dd_square_loss = self._get_dd_square_loss(imp_derivs, row_features, col_features)
        dd_square_loss_reshape = reshape(
            dd_square_loss,
            self.data.num_rows,
            self.data.num_cols,
        )
        # left multiply U^T and implicit derivative
        dgamma_left_imp_deriv_dlambda = (
            imp_derivs.dU_dlambda.T * d_square_loss_reshape
            + u_hat.T * dd_square_loss_reshape
            + lambdas[0] * np.sign(sigma_hat) * imp_derivs.dV_dlambda.T)
        # right multiply V and implicit derivative
        dgamma_right_imp_deriv_dlambda = (
            d_square_loss_reshape * imp_derivs.dV_dlambda
            + dd_square_loss_reshape * v_hat
            + lambdas[0] * imp_derivs.dU_dlambda * np.sign(sigma_hat))
        if lambda_idx == 0:
            dgamma_left_imp_deriv_dlambda += np.sign(sigma_hat) * v_hat.T
            dgamma_right_imp_deriv_dlambda += u_hat * np.sign(sigma_hat)
        obj += sum_squares(dgamma_left_imp_deriv_dlambda) + sum_squares(dgamma_right_imp_deriv_dlambda)

    # Constraint from implicit differentiation of the optimality conditions
    # that were defined by taking the gradient of the training objective wrt
    # alpha and beta, respectively
    for i, a_tuple in enumerate(zip(row_features, alphas, imp_derivs.dalphas_dlambda)):
        row_f, alpha, da_dlambda = a_tuple
        for j in range(alpha.size):
            dalpha_imp_deriv_dlambda = (
                dd_square_loss_mini.T * vec(row_f[:, j] * self.onesT_row)[self.data.train_idx]
                + lambdas[1] * (
                    da_dlambda[j] / get_norm2(alpha, power=1)
                    - alpha[j] / get_norm2(alpha, power=3) * (alpha.T * da_dlambda)))
            if lambda_idx == 1:
                dalpha_imp_deriv_dlambda += alpha[j] / get_norm2(alpha, power=1)
            obj += sum_squares(dalpha_imp_deriv_dlambda)

    for i, b_tuple in enumerate(zip(col_features, betas, imp_derivs.dbetas_dlambda)):
        col_f, beta, db_dlambda = b_tuple
        for j in range(beta.size):
            dbeta_imp_deriv_dlambda = (
                dd_square_loss_mini.T * vec((col_f[:, j] * self.onesT_col).T)[self.data.train_idx]
                + lambdas[1] * (
                    db_dlambda[j] / get_norm2(beta, power=1)
                    - beta[j] / get_norm2(beta, power=3) * (beta.T * db_dlambda)))
            if lambda_idx == 1:
                dbeta_imp_deriv_dlambda += beta[j] / get_norm2(beta, power=1)
            obj += sum_squares(dbeta_imp_deriv_dlambda)

    return imp_derivs.solve(obj)
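# _get_dmodel_dlambda differentiates the zero-gradient (stationarity) conditions of the
# training objective with respect to lambda and solves for the implicit derivatives of the
# fitted parameters, posed above as a least-squares problem over those derivatives. The
# same implicit-differentiation idea on ridge regression, where the derivative has a
# closed form (standalone illustration with made-up data, not repo code):
import numpy as np

np.random.seed(0)
X = np.random.randn(50, 3)
y = np.random.randn(50)
lam = 0.5

def fit_ridge(lam):
    # theta(lam) solves the stationarity condition X^T (X theta - y) + lam * theta = 0
    return np.linalg.solve(X.T.dot(X) + lam * np.eye(3), X.T.dot(y))

theta = fit_ridge(lam)
# differentiating the stationarity condition wrt lam gives
#   (X^T X + lam I) dtheta/dlam + theta = 0
dtheta_dlam = np.linalg.solve(X.T.dot(X) + lam * np.eye(3), -theta)

# finite-difference check of the implicit derivative
eps = 1e-6
fd = (fit_ridge(lam + eps) - fit_ridge(lam - eps)) / (2 * eps)
print(np.allclose(dtheta_dlam, fd, atol=1e-6))  # True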
def _check_optimality_conditions(self, model_params, lambdas, opt_thres=1e-2):
    # sanity check function to see that cvxpy is solving to a good enough accuracy
    # check that the gradient is close to zero
    # can use this to check that our implicit derivative assumptions hold
    # lambdas must be an exploded lambda matrix
    print "check_optimality_conditions!"

    alphas = model_params["alphas"]
    betas = model_params["betas"]
    gamma = model_params["gamma"]
    u_hat, sigma_hat, v_hat = self._get_svd_mini(gamma)

    a = self.data.observed_matrix - get_matrix_completion_groups_fitted_values(
        self.data.row_features,
        self.data.col_features,
        alphas,
        betas,
        gamma)
    d_square_loss = self._get_d_square_loss(
        alphas,
        betas,
        gamma,
        self.data.row_features,
        self.data.col_features,
    )

    left_grad_at_opt_gamma = (
        make_column_major_reshape(d_square_loss, (self.data.num_rows, self.data.num_cols)) * v_hat
        + lambdas[0] * u_hat * np.sign(sigma_hat))
    right_grad_at_opt_gamma = (
        u_hat.T * make_column_major_reshape(d_square_loss, (self.data.num_rows, self.data.num_cols))
        + lambdas[0] * np.sign(sigma_hat) * v_hat.T)
    print "left grad_at_opt wrt gamma (should be zero)", get_norm2(left_grad_at_opt_gamma)
    print "right grad_at_opt wrt gamma (should be zero)", get_norm2(right_grad_at_opt_gamma)
    # assert(get_norm2(left_grad_at_opt_gamma) < opt_thres)
    # assert(get_norm2(right_grad_at_opt_gamma) < opt_thres)

    for i, a_f_tuple in enumerate(zip(alphas, self.data.row_features)):
        alpha, row_f = a_f_tuple
        if np.linalg.norm(alpha) > 1e-5:
            grad_at_opt_alpha = []
            for j in range(alpha.size):
                grad_at_opt_alpha.append(
                    (d_square_loss.T * make_column_major_flat(row_f[:, j] * self.onesT_row)
                     + lambdas[1 + i] * alpha[j] / np.linalg.norm(alpha, ord=None))[0, 0])
            print "grad_at_opt wrt alpha (should be zero)", get_norm2(grad_at_opt_alpha)
            # assert(np.linalg.norm(grad_at_opt_alpha) < opt_thres)

    for i, b_f_tuple in enumerate(zip(betas, self.data.col_features)):
        beta, col_f = b_f_tuple
        if np.linalg.norm(beta) > 1e-5:
            grad_at_opt_beta = []
            for j in range(beta.size):
                grad_at_opt_beta.append(
                    (d_square_loss.T * make_column_major_flat((col_f[:, j] * self.onesT_col).T)
                     + lambdas[1 + self.settings.num_row_groups + i] * beta[j] / np.linalg.norm(beta, ord=None))[0, 0])
            print "grad_at_opt wrt beta (should be zero)", get_norm2(grad_at_opt_beta)