def sparsity_estimator0(X, y, n_boots=48, train_frac=0.75):
    """Estimate model sparsity via BIC over bootstraps.

    Assumes the ``GIC`` helper is defined elsewhere in this module.
    """
    sparsity_estimates = np.zeros(n_boots)
    n_samples, n_features = X.shape
    for boot in range(n_boots):
        # Draw bootstraps
        idxs_train, idxs_test = train_test_split(np.arange(X.shape[0]),
                                                 train_size=train_frac,
                                                 test_size=1 - train_frac)
        Xb = X[idxs_train]
        yb = y[idxs_train]
        Xb = StandardScaler().fit_transform(Xb)
        yb -= np.mean(yb)
        # Use the pycasso solver
        solver = pycasso.Solver(Xb, yb, penalty='l1')
        solver.train()
        # Coefficients across the regularization path: (n_lambdas, n_features)
        coefs = solver.result['beta']
        y_pred = Xb @ coefs.T
        # Assess BIC along the regularization path to estimate the sparsity.
        BIC_scores = np.array([
            GIC(yb.ravel(), y_pred[:, j].ravel(),
                np.count_nonzero(coefs[j, :]), np.log(n_samples))
            for j in range(coefs.shape[0])
        ])
        # Pick the path index minimizing BIC; beta is indexed
        # (lambda, feature), so we select a row, not a column.
        sparsity_estimates[boot] = float(
            np.count_nonzero(coefs[np.argmin(BIC_scores), :])) / float(n_features)
    return sparsity_estimates

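# A minimal usage sketch for sparsity_estimator0, not from the original
# source: the data here are synthetic, and numpy (as np) plus the function's
# dependencies (pycasso, the sklearn helpers, and GIC) are assumed in scope.
rng = np.random.default_rng(0)
X_demo = rng.standard_normal((200, 50))
beta_demo = np.concatenate([rng.random(10), np.zeros(40)])
y_demo = X_demo @ beta_demo + rng.standard_normal(200)
estimates = sparsity_estimator0(X_demo, y_demo, n_boots=10)
# Ideally the mean estimate lands near the true support fraction 10/50 = 0.2.
print(estimates.mean())
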
def fit(self, X, y): """Fit data according to the pycasso object. Parameters ---------- X : ndarray, (n_samples, n_features) The design matrix. y : ndarray, shape (n_samples,) Response vector. Will be cast to X's dtype if necessary. Currently, this implementation does not handle multiple response variables. """ if self.alphas is None: raise Exception('Set alphas before fitting.') self.solver = pycasso.Solver(X, y, family='gaussian', useintercept=self.fit_intercept, lambdas=self.alphas, penalty='l1', max_ite=self.max_iter) self.solver.train() # Coefs across the entire solution path self.coef_ = self.solver.result['beta'] self.intercept_ = self.solver.result['intercept'] self.isfitted = True return self
def main(args):
    raw_df = pd.read_csv(args.train_data_path)
    columns = args.features.split(",")
    features_values = raw_df[columns].values
    label = args.label
    y = raw_df[label].values
    # predict_df = pd.read_csv(args.predict_data_path)
    # predict_values = predict_df[columns].values
    s = pycasso.Solver(features_values, y, penalty=args.penalty)
    s.train()
    # Take the coefficients at the last (smallest) lambda on the path
    beta = s.coef()['beta'][-1]
    print("beta :", beta)
    intercept = s.coef()['intercept'][-1]
    print("intercept :", intercept)
    result = s.predict(features_values)
    print("result :", result)
    select_df = pd.DataFrame(np.array(beta).reshape(1, -1), columns=columns)
    select_df.to_csv(args.selection_path, index=False, sep=",")
    return

def init_solver(self, X, y, alphas):
    self.solver = pycasso.Solver(X, y,
                                 family='gaussian',
                                 useintercept=self.fit_intercept,
                                 lambdas=alphas,
                                 penalty='l1',
                                 max_ite=self.max_iter)

def init_solver(self, X, y, lambda1=None, lambda2=None):
    self.init_reg_params(X, y)
    # We solve for an entire elastic net path with a fixed lambda2.
    # For the given fixed lambda1, we modify the dataset to allow
    # for the solution of a lasso-like problem.
    xx, yy = augment_data(X, y, self.lambda2)
    # Augmented regularization parameters
    gamma = self.lambda1 / np.sqrt(1 + self.lambda2)
    self.solver = pycasso.Solver(xx, yy,
                                 family='gaussian',
                                 useintercept=self.fit_intercept,
                                 lambdas=gamma,
                                 penalty='l1',
                                 max_ite=self.max_iter)

def test_lognet(n, p, c, nlambda=100):
    X, Y, true_beta = generate_sim_lognet(n, p, c)

    time0 = time.time()
    picasso = pycasso.Solver(X, Y, lambdas=(nlambda, 0.01),
                             family='binomial', penalty='l1')
    picasso.train()
    time1 = time.time()
    picasso_time = time1 - time0

    idx = 50
    picasso_obj = lognet_obj(X, Y, picasso.result['beta'][idx, :],
                             picasso.result['intercept'][idx],
                             picasso.lambdas[idx])

    time0 = time.time()
    # 'liblinear' supports the l1 penalty (recent scikit-learn defaults to
    # lbfgs, which does not).
    clf = linear_model.LogisticRegression(penalty='l1', solver='liblinear',
                                          tol=1e-6, warm_start=True)
    coefs_ = []
    intcpt_ = []
    for lamb in picasso.lambdas:
        clf.set_params(C=1.0 / (n * lamb))
        clf.fit(X, Y)
        coefs_.append(clf.coef_.ravel().copy())
        intcpt_.append(clf.intercept_.ravel().copy())
    time1 = time.time()
    sklearn_obj = lognet_obj(X, Y, coefs_[idx], intcpt_[idx],
                             picasso.lambdas[idx])
    sklearn_time = time1 - time0

    print("Testing L1 penalized logistic regression, "
          "number of samples:%d, sample dimension:%d, correlation:%f"
          % (n, p, c))
    print("Picasso time:%f, Obj function value:%f" % (picasso_time, picasso_obj))
    print("Sklearn time:%f, Obj function value:%f" % (sklearn_time, sklearn_obj))
    return picasso_time, picasso_obj, sklearn_time, sklearn_obj

def _mcp_reg(y, x, n_sample, p):
    # p is the number of columns in x
    if p == 1:
        x = x.reshape((n_sample, 1))
    lambda_list = np.exp(np.arange(-5, 3, 0.1))
    # Center and scale each column of x
    for j in range(p):
        x[:, j] = x[:, j] - np.mean(x[:, j])
    std = np.sqrt(np.sum(x * x, axis=0)) / np.sqrt(n_sample)
    x = x / std
    mcp = pycasso.Solver(x, y - np.mean(y), penalty="mcp", gamma=1.25,
                         prec=1e-4, lambdas=lambda_list)
    mcp.train()
    # BIC over the path: residual sum of squares plus log(n) per active feature
    BIC = np.zeros(len(lambda_list))
    for k in range(len(lambda_list)):
        beta_k = mcp.coef()['beta'][k]
        BIC[k] = np.sum(np.square(y - np.mean(y) - x @ beta_k)) + \
            np.count_nonzero(beta_k) * np.log(n_sample)
    # Return the BIC-optimal coefficients, rescaled to the original units of x
    return mcp.coef()['beta'][np.argmin(BIC)] / std

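# A hypothetical call of _mcp_reg on synthetic data (not from the original
# source). Note the function centers and rescales x in place, hence the copy.
rng = np.random.default_rng(1)
n_sample, p = 150, 8
x_demo = rng.standard_normal((n_sample, p))
y_demo = 2.0 * x_demo[:, 0] - x_demo[:, 3] + rng.standard_normal(n_sample)
beta_hat = _mcp_reg(y_demo, x_demo.copy(), n_sample, p)
print(beta_hat)
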
def sparsity_estimator1(X, y, s0, n_boots=48, train_frac=0.75):
    sparsity_estimates = np.zeros(n_boots)
    for boot in range(n_boots):
        # Draw bootstraps
        idxs_train, idxs_test = train_test_split(np.arange(X.shape[0]),
                                                 train_size=train_frac,
                                                 test_size=1 - train_frac)
        Xb = X[idxs_train]
        yb = y[idxs_train]
        Xb = StandardScaler().fit_transform(Xb)
        yb -= np.mean(yb)
        n_samples, n_features = Xb.shape
        # Use the fast pycasso solver
        solver = pycasso.Solver(Xb, yb, penalty='l1')
        solver.train()
        # Coefficients across the regularization path: (n_lambdas, n_features)
        coefs = solver.result['beta']
        y_pred = Xb @ coefs.T
        # One mBIC score per lambda on the path (coefs.shape[0] of them)
        mBIC_scores = np.zeros(coefs.shape[0])
        for j in range(coefs.shape[0]):
            ll_, p1_, BIC_, BIC2_, BIC3_, M_k_, P_M_ = full_bayes_factor(
                yb, y_pred[:, j], n_features,
                np.count_nonzero(coefs[j, :]), s0, 0)
            mBIC_scores[j] = 2 * ll_ - BIC_ - BIC2_ + BIC3_ + P_M_
        # Pick the path index maximizing mBIC; beta is indexed
        # (lambda, feature), so we select a row, not a column.
        sparsity_estimates[boot] = float(
            np.count_nonzero(coefs[np.argmax(mBIC_scores), :])) / float(n_features)
    return sparsity_estimates

def test_elnet(n, p, c, nlambda=100):
    X, Y, true_beta = generate_sim_elnet(n, p, c)

    time0 = time.time()
    picasso = pycasso.Solver(X, Y, lambdas=(nlambda, 0.01),
                             family='gaussian', penalty='l1')
    picasso.train()
    time1 = time.time()
    picasso_time = time1 - time0

    idx = 50
    picasso_obj = elnet_obj(X, Y, picasso.result['beta'][idx, :],
                            picasso.result['intercept'][idx],
                            picasso.lambdas[idx])

    time0 = time.time()
    # Append a column of ones so the lasso path also fits an intercept-like term
    X_intcpt = np.concatenate((X, np.ones(n).reshape(-1, 1)), axis=1)
    alphas_lasso, coefs_lasso, _ = lasso_path(X_intcpt, Y,
                                              alphas=picasso.lambdas,
                                              eps=1e-3)
    time1 = time.time()
    sklearn_obj = elnet_obj(X, Y, coefs_lasso[0:p, idx], coefs_lasso[p, idx],
                            alphas_lasso[idx] * 2)
    sklearn_time = time1 - time0

    print("Testing L1 penalized linear regression, "
          "number of samples:%d, sample dimension:%d, correlation:%f"
          % (n, p, c))
    print("Picasso time:%f, Obj function value:%f" % (picasso_time, picasso_obj))
    print("Sklearn time:%f, Obj function value:%f" % (sklearn_time, sklearn_obj))
    return picasso_time, picasso_obj, sklearn_time, sklearn_obj

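# A hypothetical driver for the two benchmarks above; it assumes the
# generate_sim_* data generators and the *_obj objective helpers are defined
# alongside these functions.
if __name__ == '__main__':
    for c in (0.1, 0.5, 0.9):
        test_elnet(n=500, p=1000, c=c)
        test_lognet(n=500, p=1000, c=c)
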
## Generate the design matrix and regression coefficient vector
n = 100  # sample number
d = 80   # sample dimension
c = 0.5  # correlation parameter
s = 20   # support size of coefficient
X = scale(np.random.randn(n, d) + c * np.tile(np.random.randn(n), [d, 1]).T) / (n * (n - 1))**0.5
beta = np.append(np.random.rand(s), np.zeros(d - s))

## Generate response using Gaussian noise, and fit sparse linear models
noise = np.random.randn(n)
Y = np.matmul(X, beta) + noise

## l1 regularization solved with naive update
solver_l1 = pycasso.Solver(X, Y, lambdas=(100, 0.05), family="gaussian")
solver_l1.train()

## mcp regularization
solver_mcp = pycasso.Solver(X, Y, lambdas=(100, 0.05), penalty="mcp")
solver_mcp.train()

## scad regularization
solver_scad = pycasso.Solver(X, Y, lambdas=(100, 0.05), penalty="scad")
solver_scad.train()

## Obtain the result
result = solver_l1.coef()

## print out training time
print(result['total_train_time'])

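# A short follow-up sketch (not part of the original example): inspect the
# fitted path. coef() returns a dict whose 'beta' entry stacks one coefficient
# vector per lambda, and the lambda sequence itself lives on the solver.
print(result['beta'].shape)   # (100, 80): one row per lambda on the path
print(solver_l1.lambdas[:5])  # the first few (largest) lambda values
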
## Generate the design matrix and regression coefficient vector
n = 100  # sample number
d = 80   # sample dimension
c = 0.5  # correlation parameter
s = 20   # support size of coefficient
X = scale(np.random.randn(n, d) + c * np.tile(np.random.randn(n), [d, 1]).T) / (n * (n - 1))**0.5
beta = np.append(np.random.rand(s), np.zeros(d - s))

## Generate response using Gaussian noise, and fit sparse linear models
noise = np.random.randn(n)
Y = np.matmul(X, beta) + noise

## l1 regularization solved with naive update
solver_l1_naive = pycasso.Solver(X, Y, nlambda=100, family="gaussian",
                                 type_gaussian="naive")
solver_l1_naive.train()

## l1 regularization solved with covariance update
solver_l1_cov = pycasso.Solver(X, Y, nlambda=100, family="gaussian",
                               type_gaussian="covariance")
solver_l1_cov.train()

## mcp regularization
solver_mcp = pycasso.Solver(X, Y, nlambda=100, penalty="mcp")
solver_mcp.train()

## scad regularization
solver_scad = pycasso.Solver(X, Y, nlambda=100, penalty="scad")
solver_scad.train()

## Obtain the result
result = solver_l1_naive.coef()

def lasso(x, y):
    sol = pycasso.Solver(x, y)
    sol.train()
    # Coefficients at the end of the path (smallest lambda)
    return sol.coef()["beta"][-1]

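# Minimal usage sketch for the lasso helper above, on synthetic data
# (the rng, shapes, and coefficients here are illustrative assumptions).
rng = np.random.default_rng(2)
x_demo = rng.standard_normal((100, 20))
y_demo = x_demo[:, 0] - 0.5 * x_demo[:, 5] + 0.1 * rng.standard_normal(100)
print(lasso(x_demo, y_demo))  # coefficients at the smallest lambda on the path
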
def fit(self, X_train_mod, y_train_mod, X_train, y_train, X_test, y_test,
        alpha_list, threshold_list, max_features, force_features=True):
    """Hyperparameter/feature selection and fitting of the final model.

    Parameters
    ----------
    X_train_mod : array-like, shape (n_samples, n_features)
        Covariate matrix used to fit the regularization path
    y_train_mod : array-like, shape (n_samples,)
        Response vector used to fit the regularization path
    X_train : array-like, shape (n_samples, n_features)
        Covariate matrix with train data
    y_train : array-like, shape (n_samples,)
        Response vector with train data
    X_test : array-like, shape (n_samples, n_features)
        Covariate matrix with test data
    y_test : array-like, shape (n_samples,)
        Response vector with test data
    alpha_list : array-like
        List of alphas for gridsearch
    threshold_list : array-like
        List of thresholds (min abs value for coefficients to be selected
        as a feature) for gridsearch
    max_features : int
        Max number of features that can be selected
    force_features : bool
        If True, the hyperparameters chosen must select at least 1 feature
    """
    self.alpha_list = alpha_list
    self.threshold_list = threshold_list
    self.max_features = max_features

    regr = LinearRegression()
    s = pycasso.Solver(X_train_mod, y_train_mod, lambdas=alpha_list,
                       penalty=self.penalty)
    s.train()
    self.gridsearch_regression = s

    for i in range(len(self.alpha_list)):
        for j in range(len(self.threshold_list)):
            alpha = alpha_list[i]
            beta = s.coef()['beta'][i]
            threshold = self.threshold_list[j]
            feats = getFeatures(alpha, beta, threshold, self.max_features)
            score = getScoring(regr, X_train, y_train, X_test, y_test,
                               feats, alpha)
            self.gridsearch_results_raw.append(
                [alpha, threshold, beta, feats, score[0], score[1]])

    self.gridsearch_results_raw = pd.DataFrame(
        self.gridsearch_results_raw,
        columns=['alpha', 'threshold', 'beta', 'features',
                 'Train MSE', 'Test MSE'])
    self.gridsearch_results, self.alpha_, self.threshold_ = getBestParam(
        self.gridsearch_results_raw, force_features)
    self.y_avg = np.mean(y_train)

    # Grab the best beta
    if force_features:
        best_idx = self.gridsearch_results.sort_values(by='Test MSE').loc[
            self.gridsearch_results['num_features'] >= 1].index[0]
    else:
        best_idx = self.gridsearch_results['Test MSE'].idxmin()
    self.beta_ = self.gridsearch_results_raw.iloc[best_idx]['beta']
    self.feats_ = getFeatures(self.alpha_, self.beta_,
                              threshold=self.threshold_,
                              max_features=self.max_features)

    if len(self.feats_) == 0:
        self.regr_ = 'DID NOT CHOOSE ANY FEATURES'
    else:
        self.regr_ = clone(regr)
        self.regr_.fit(X_train[:, self.feats_], y_train)
        self.coef_ = self.regr_.coef_