def _pbcd_epoch_slow(P, X, y, loss, regularizer, lams, degree, beta, gamma,
                     eta0, indices_feature, kernel):
    sum_viol = 0
    n_features = X.shape[1]
    for j in indices_feature:
        # compute prediction
        y_pred = _poly_predict(X, P.T, lams, kernel, degree=degree)

        # compute grad and inv_step_size
        x = X[:, j]
        notj_mask = np.arange(n_features) != j
        X_notj = X[:, notj_mask]
        P_notj = P[notj_mask]
        if kernel == "anova":
            grad_kernel = anova_kernel(P_notj.T, X_notj, degree=degree-1)
        else:
            grad_kernel = all_subsets_kernel(P_notj.T, X_notj)
        grad_kernel *= x  # (n_components, n_samples)
        grad_y = grad_kernel * lams[:, None]
        l2_reg = beta
        inv_step_size = loss.mu * np.sum(grad_y*grad_y) + l2_reg
        dloss = loss.dloss(y_pred, y)
        step = np.sum(dloss*grad_y, axis=1) + l2_reg * P[j]
        step /= inv_step_size

        # update
        p_j_old = np.array(P[j])
        P[j] -= eta0 * step
        regularizer.prox_bcd(P, eta0*gamma/inv_step_size, degree, j)
        sum_viol += np.sum(np.abs(p_j_old - P[j]))
    return sum_viol

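# The slow epochs in this file only touch the loss object through its ``mu``,
# ``loss`` and ``dloss`` attributes. Below is a minimal sketch of that
# interface, assuming the usual squared-loss conventions (``mu`` bounds the
# second derivative of the loss; ``dloss`` is d loss / d y_pred). It is an
# illustrative stand-in, not the library's actual loss class.
class _SquaredLossSketch:
    mu = 1.0  # second derivative of 0.5 * (y_pred - y) ** 2 is 1

    def loss(self, y_pred, y):
        return 0.5 * (y_pred - y) ** 2

    def dloss(self, y_pred, y):
        return y_pred - y
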
def _pcd_epoch_slow(P, X, y, loss, regularizer, lams, degree, beta, gamma,
                    eta0, indices_component, indices_feature, kernel):
    sum_viol = 0
    n_features = X.shape[1]
    for s in indices_component:
        p_s = P[s]
        for j in indices_feature:
            # compute prediction
            y_pred = _poly_predict(X, P, lams, kernel, degree=degree)

            # compute grad and inv_step_size
            x = X[:, j]
            notj_mask = np.arange(n_features) != j
            X_notj = X[:, notj_mask]
            ps_notj = np.atleast_2d(p_s[notj_mask])
            if kernel == "anova":
                grad_kernel = anova_kernel(ps_notj, X_notj, degree=degree-1)
            else:
                grad_kernel = all_subsets_kernel(ps_notj, X_notj)
            grad_kernel *= x
            grad_y = lams[s] * grad_kernel.ravel()
            inv_step_size = loss.mu * np.dot(grad_y, grad_y) + beta
            dloss = loss.dloss(y_pred, y)
            step = np.dot(dloss, grad_y) + beta * p_s[j]
            step /= inv_step_size

            # update
            p_sj_new = regularizer.prox_cd(
                p_s[j] - eta0*step, p_s, eta0*gamma/inv_step_size, degree, j)
            sum_viol += np.abs(p_sj_new - P[s, j])
            P[s, j] = p_sj_new
    return sum_viol

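# Both epoch functions above hand the non-smooth part of the objective to the
# regularizer's prox operators: ``prox_cd`` maps a single scalar coordinate,
# ``prox_bcd`` shrinks the whole row P[j] in place, and ``prox`` (used by the
# psgd epoch below) shrinks all of P. Here is a hypothetical L1 sketch of the
# three signatures, assuming plain soft-thresholding; the library's actual
# regularizers may also use ``degree`` and the full row ``p_s``, which this
# toy version ignores.
class _L1RegularizerSketch:
    def prox_cd(self, value, p_s, strength, degree, j):
        # soft-threshold one coordinate and return the new value
        return np.sign(value) * max(abs(value) - strength, 0.0)

    def prox_bcd(self, P, strength, degree, j):
        # soft-threshold row P[j] in place
        P[j] = np.sign(P[j]) * np.maximum(np.abs(P[j]) - strength, 0.0)

    def prox(self, P, strength, degree):
        # soft-threshold every entry of P in place
        P[:] = np.sign(P) * np.maximum(np.abs(P) - strength, 0.0)
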
def _psgd_epoch_slow(P, w, X, y, loss, regularizer, lams, degree, alpha,
                     beta, gamma, indices_samples, fit_linear, eta0,
                     learning_rate, power_t, batch_size, it, kernel):
    n_samples = X.shape[0]
    n_features = X.shape[1]
    sum_loss = 0.0
    n_minibatches = math.ceil(n_samples / batch_size)
    for ii in range(n_minibatches):
        # pick a minibatch
        minibatch_indices = np.atleast_1d(
            indices_samples[ii * batch_size:(ii + 1) * batch_size])
        X_batch = X[minibatch_indices]
        y_batch = y[minibatch_indices]

        # compute prediction and loss
        y_pred_batch = _poly_predict(X_batch, P.T, lams, kernel,
                                     degree=degree)
        y_pred_batch += np.dot(X_batch, w)
        sum_loss += np.sum(
            loss.loss(np.atleast_1d(y_pred_batch), np.atleast_1d(y_batch)))

        # compute grad and inv_step_size
        dloss = loss.dloss(np.atleast_1d(y_pred_batch),
                           np.atleast_1d(y_batch))
        grad_P = np.zeros(P.shape)
        for j in range(n_features):
            notj_mask = np.arange(n_features) != j
            X_batch_notj = X_batch[:, notj_mask]
            P_notj = P[notj_mask]
            # grad_kernel: (n_components, n_samples)
            if kernel == "anova":
                grad_kernel = anova_kernel(P_notj.T, X_batch_notj,
                                           degree=degree - 1)
            else:
                grad_kernel = all_subsets_kernel(P_notj.T, X_batch_notj)
            grad_P[j] = np.dot(grad_kernel, dloss * X_batch[:, j])  # (n_components,)
        grad_P *= lams
        grad_P /= len(minibatch_indices)

        eta_P, eta_w = _get_eta(learning_rate, eta0, alpha, beta, power_t, it)
        P -= eta_P * grad_P
        P /= (1.0 + eta_P * beta)

        # update
        regularizer.prox(P, eta_P * gamma / (1.0 + eta_P * beta), degree)
        if fit_linear:
            grad_w = np.dot(X_batch.T, dloss) / len(minibatch_indices)
            w -= eta_w * grad_w
            w /= (1.0 + eta_w * alpha)
        it += 1
    return sum_loss, it

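# ``_get_eta`` is imported from the fast solver and is not reproduced in this
# file. The sketch below is a hypothetical reconstruction for illustration
# only, assuming scikit-learn-style schedules ("constant" keeps eta0,
# "invscaling" decays it as eta0 / (it + 1) ** power_t) and that alpha/beta
# would only enter an "optimal"-style schedule; the actual implementation may
# differ.
def _get_eta_sketch(learning_rate, eta0, alpha, beta, power_t, it):
    if learning_rate == "constant":
        eta = eta0
    else:  # assume "invscaling"
        eta = eta0 / (it + 1) ** power_t
    return eta, eta  # (eta_P, eta_w)
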
def test_all_subsets_same_as_slow_reg(mean, loss, regularizer):
    y = _poly_predict(X, P, lams, kernel="all-subsets")
    reg = SparseAllSubsetsRegressor(
        n_components=n_components, beta=1, gamma=1e-3,
        regularizer=regularizer, warm_start=False, tol=1e-3, max_iter=5,
        random_state=0, mean=mean, shuffle=False, solver="pbcd")
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        reg.fit(X, y)
    P_fit_slow = pbcd_slow(
        X, y, loss=loss, regularizer=regularizer, lams=reg.lams_, degree=-1,
        n_components=n_components, beta=1, gamma=1e-3, eta0=0.1, max_iter=5,
        tol=1e-3, random_state=0, mean=mean)
    assert_array_almost_equal(reg.P_, P_fit_slow, decimal=4)

def test_fm_same_as_slow_clf(degree, batch_size, learning_rate, fit_linear,
                             loss, regularizer):
    y = _poly_predict(X, P, lams, kernel="anova", degree=degree)
    y = np.sign(y)
    reg = SparseFactorizationMachineClassifier(
        degree=degree, n_components=n_components, fit_lower=None,
        fit_linear=fit_linear, alpha=1e-3, beta=1e-3, gamma=0.0,
        regularizer=regularizer, learning_rate=learning_rate, eta0=0.01,
        warm_start=False, tol=1e-3, max_iter=10, random_state=0,
        shuffle=False, solver="psgd", batch_size=batch_size, verbose=0,
        loss=loss)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        reg.fit(X, y)
    P_fit_slow, w_fit_slow = psgd_slow(
        X, y, loss=loss, regularizer=regularizer, lams=reg.lams_,
        degree=degree, n_components=n_components, alpha=1e-3, beta=1e-3,
        gamma=0.0, learning_rate=learning_rate, eta0=0.01, shuffle=False,
        max_iter=10, tol=1e-3, random_state=0, fit_linear=fit_linear,
        batch_size=batch_size, verbose=0)
    assert_array_almost_equal(reg.P_[0, :, :], P_fit_slow, decimal=4)
    assert_array_almost_equal(reg.w_, w_fit_slow, decimal=4)

def test_fm_same_as_slow_reg(degree, mean, loss, regularizer):
    y = _poly_predict(X, P, lams, kernel="anova", degree=degree)
    reg = SparseFactorizationMachineRegressor(
        degree=degree, n_components=n_components, fit_lower=None,
        fit_linear=False, beta=1, gamma=1e-3, regularizer=regularizer,
        warm_start=False, tol=1e-3, max_iter=5, random_state=0, mean=mean,
        shuffle=False)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        reg.fit(X, y)
    P_fit_slow = pcd_slow(
        X, y, loss=loss, regularizer=regularizer, lams=reg.lams_,
        degree=degree, n_components=n_components, beta=1, gamma=1e-3,
        max_iter=5, tol=1e-3, random_state=0, mean=mean)
    assert_array_almost_equal(reg.P_[0, :, :], P_fit_slow, decimal=4)