Example #1
import numpy as np
import pycasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def sparsity_estimator0(X, y, n_boots=48, train_frac=0.75):

    sparsity_estimates = np.zeros(n_boots)

    n_samples, n_features = X.shape

    for boot in range(n_boots):
        # Draw bootstraps
        idxs_train, idxs_test = train_test_split(np.arange(X.shape[0]),
                                                 train_size=train_frac,
                                                 test_size=1 - train_frac)
        Xb = X[idxs_train]
        yb = y[idxs_train]

        Xb = StandardScaler().fit_transform(Xb)
        yb -= np.mean(yb)

        # Use the pycasso solver
        solver = pycasso.Solver(Xb, yb, penalty='l1')
        solver.train()

        coefs = solver.result['beta']
        y_pred = Xb @ coefs.T
        # Assess BIC along the regularization path to estimate the sparsity.
        BIC_scores = np.array([
            GIC(yb.ravel(), y_pred[:,
                                   j].ravel(), np.count_nonzero(coefs[j, :]),
                np.log(n_samples)) for j in range(coefs.shape[0])
        ])

        # coefs rows index lambdas, so select the best row, not column.
        best = np.argmin(BIC_scores)
        sparsity_estimates[boot] = np.count_nonzero(coefs[best, :]) / n_features

    return sparsity_estimates
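The GIC helper called above is not part of the snippet. A minimal sketch of what it might compute, consistent with the call signature and with the fact that a penalty factor of log(n) yields the BIC (an assumption, not the project's actual implementation):

import numpy as np

def GIC(y_true, y_pred, n_selected, penalty):
    # Hypothetical generalized information criterion: Gaussian
    # log-likelihood term plus `penalty` per selected coefficient;
    # penalty = log(n) recovers the BIC.
    n = y_true.size
    rss = np.sum((y_true - y_pred) ** 2)
    return n * np.log(rss / n) + penalty * n_selected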
Example #2
    def fit(self, X, y):
        """Fit data according to the pycasso object.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            The design matrix.
        y : ndarray, shape (n_samples,)
            Response vector. Will be cast to X's dtype if necessary.
            Currently, this implementation does not handle multiple response
            variables.
        """
        if self.alphas is None:
            raise ValueError('Set alphas before fitting.')

        self.solver = pycasso.Solver(X,
                                     y,
                                     family='gaussian',
                                     useintercept=self.fit_intercept,
                                     lambdas=self.alphas,
                                     penalty='l1',
                                     max_ite=self.max_iter)
        self.solver.train()
        # Coefs across the entire solution path
        self.coef_ = self.solver.result['beta']
        self.intercept_ = self.solver.result['intercept']
        self.isfitted = True
        return self
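This fit method belongs to a wrapper class that the example does not show. A hypothetical usage sketch (the class name PycassoLasso and its constructor arguments are assumptions inferred from the attributes the method reads):

import numpy as np

# Hypothetical usage; PycassoLasso stands in for the unshown wrapper
# class that defines alphas, fit_intercept, max_iter and this fit().
X, y = np.random.randn(50, 10), np.random.randn(50)
model = PycassoLasso(fit_intercept=True, max_iter=1000)
model.alphas = np.logspace(0, -2, 25)  # descending regularization path
model.fit(X, y)
print(model.coef_.shape)  # one row of coefficients per alpha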
Example #3
import numpy as np
import pandas as pd
import pycasso


def main(args):
    raw_df = pd.read_csv(args.train_data_path)
    columns = args.features.split(",")
    features_values = raw_df[columns].values
    label = args.label

    y = raw_df[label].values

    # predict_df = pd.read_csv(args.predict_data_path)
    # predict_values = predict_df[columns].values

    s = pycasso.Solver(features_values, y, penalty=args.penalty)
    s.train()
    beta = s.coef()['beta'][-1]
    print("beta :", beta)

    intercept = s.coef()['intercept'][-1]
    print("intercept :", intercept)

    result = s.predict(features_values)
    print("result :", result)

    select_df = pd.DataFrame(np.array(beta).reshape(1, -1), columns=columns)
    select_df.to_csv(args.selection_path, index=False, sep=",")

    return
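main() reads several attributes off args; a sketch of a matching command-line parser (the flag names are inferred from the attribute names, not taken from the original script):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_data_path")
    parser.add_argument("--features")  # comma-separated column names
    parser.add_argument("--label")
    parser.add_argument("--penalty", default="l1")
    parser.add_argument("--selection_path")
    main(parser.parse_args())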
Example #4
    def init_solver(self, X, y, alphas):

        self.solver = pycasso.Solver(X,
                                     y,
                                     family='gaussian',
                                     useintercept=self.fit_intercept,
                                     lambdas=alphas,
                                     penalty='l1',
                                     max_ite=self.max_iter)
Example #5
    def init_solver(self, X, y, lambda1=None, lambda2=None):

        self.init_reg_params(X, y)

        # We solve for an entire elastic net path with a fixed lambda2.
        # With lambda2 held fixed, we modify the dataset so that the
        # elastic net reduces to a lasso-like problem.
        xx, yy = augment_data(X, y, self.lambda2)

        # Augmented regularization parameters
        gamma = self.lambda1 / np.sqrt(1 + self.lambda2)
        self.solver = pycasso.Solver(xx, yy, family='gaussian',
                                     useintercept=self.fit_intercept,
                                     lambdas=gamma, penalty='l1',
                                     max_ite=self.max_iter)
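augment_data is not shown in the snippet. A minimal sketch, assuming it implements the standard elastic-net-to-lasso data augmentation of Zou & Hastie (2005), which matches the gamma = lambda1 / sqrt(1 + lambda2) rescaling above:

import numpy as np

def augment_data(X, y, lambda2):
    # Assumed implementation: stack sqrt(lambda2) * I below X and zeros
    # below y, then rescale by 1 / sqrt(1 + lambda2), so that an l1
    # solve on (xx, yy) corresponds to the elastic net on (X, y).
    n, p = X.shape
    xx = np.vstack([X, np.sqrt(lambda2) * np.eye(p)]) / np.sqrt(1 + lambda2)
    yy = np.concatenate([y, np.zeros(p)])
    return xx, yy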
Example #6
import time

import pycasso
from sklearn import linear_model


# generate_sim_lognet and lognet_obj are helpers defined elsewhere in
# this benchmark script (data simulation and objective evaluation).
def test_lognet(n, p, c, nlambda=100):
    X, Y, true_beta = generate_sim_lognet(n, p, c)
    time0 = time.time()
    picasso = pycasso.Solver(X,
                             Y,
                             lambdas=(nlambda, 0.01),
                             family='binomial',
                             penalty='l1')
    picasso.train()
    time1 = time.time()
    picasso_time = time1 - time0

    idx = 50
    picasso_obj = lognet_obj(X, Y, picasso.result['beta'][idx, :],
                             picasso.result['intercept'][idx],
                             picasso.lambdas[idx])

    time0 = time.time()
    # solver='liblinear' is required for l1 in recent sklearn versions.
    clf = linear_model.LogisticRegression(penalty='l1',
                                          tol=1e-6,
                                          solver='liblinear',
                                          warm_start=True)
    coefs_ = []
    intcpt_ = []
    for lamb in picasso.lambdas:
        clf.set_params(C=1.0 / (n * lamb))
        clf.fit(X, Y)
        coefs_.append(clf.coef_.ravel().copy())
        intcpt_.append(clf.intercept_.ravel().copy())

    time1 = time.time()

    sklearn_obj = lognet_obj(X, Y, coefs_[idx], intcpt_[idx],
                             picasso.lambdas[idx])
    sklearn_time = time1 - time0

    print(
        "Testing L1 penalized logistic regression, number of samples:%d, sample dimension:%d, correlation:%f"
        % (n, p, c))
    print("Picasso time:%f, Obj function value:%f" %
          (picasso_time, picasso_obj))
    print("Sklearn time:%f, Obj function value:%f" %
          (sklearn_time, sklearn_obj))
    return picasso_time, picasso_obj, sklearn_time, sklearn_obj
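lognet_obj is one of the unshown benchmark helpers; a plausible sketch, assuming it evaluates the average logistic negative log-likelihood plus the l1 penalty for labels in {0, 1}:

import numpy as np

def lognet_obj(X, y, beta, intercept, lamb):
    # Assumed objective: mean logistic loss + lambda * ||beta||_1.
    z = X @ beta + intercept
    nll = np.mean(np.log1p(np.exp(z)) - y * z)
    return nll + lamb * np.sum(np.abs(beta))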
Example #7
import numpy as np
import pycasso


def _mcp_reg(y, x, n_sample, p):  # p is the number of columns in x
    if p == 1:
        x = x.reshape((n_sample, 1))
    lambda_list = np.exp(np.arange(-5, 3, 0.1))
    for j in range(p):
        x[:, j] = x[:, j] - np.mean(x[:, j])
    std = np.sqrt(np.sum(x * x, axis=0)) / np.sqrt(n_sample)
    x = x / std
    mcp = pycasso.Solver(x,
                         y - np.mean(y),
                         penalty="mcp",
                         gamma=1.25,
                         prec=1e-4,
                         lambdas=lambda_list)
    mcp.train()
    # BIC along the path: RSS plus log(n_sample) times the model size.
    betas = mcp.coef()['beta']
    BIC = np.zeros(len(lambda_list))
    for k in range(len(lambda_list)):
        BIC[k] = np.sum(np.square(y - np.mean(y) - x @ betas[k])) + \
                 np.count_nonzero(betas[k]) * np.log(n_sample)
    return betas[np.argmin(BIC)] / std
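A small usage sketch on synthetic data. Note that _mcp_reg centers the columns of x in place, so pass a copy if the caller still needs the original array:

import numpy as np

rng = np.random.default_rng(0)
n, p = 200, 10
x = rng.standard_normal((n, p))
y = x[:, 0] - 2.0 * x[:, 1] + rng.standard_normal(n)
beta_hat = _mcp_reg(y, x.copy(), n, p)  # copy: x is modified in place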
Example #8
import numpy as np
import pycasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# full_bayes_factor is a helper defined elsewhere in the project.
def sparsity_estimator1(X, y, s0, n_boots=48, train_frac=0.75):

    sparsity_estimates = np.zeros(n_boots)

    for boot in range(n_boots):
        # Draw bootstraps
        idxs_train, idxs_test = train_test_split(np.arange(X.shape[0]),
                                                 train_size=train_frac,
                                                 test_size=1 - train_frac)
        Xb = X[idxs_train]
        yb = y[idxs_train]

        Xb = StandardScaler().fit_transform(Xb)
        yb -= np.mean(yb)

        n_samples, n_features = Xb.shape

        # Use fast pycasso solver
        solver = pycasso.Solver(Xb, yb, penalty='l1')
        solver.train()

        coefs = solver.result['beta']

        y_pred = Xb @ coefs.T

        mBIC_scores = np.zeros(coefs.shape[0])

        for j in range(coefs.shape[0]):

            ll_, p1_, BIC_, BIC2_, BIC3_, M_k_, P_M_ = full_bayes_factor(
                yb, y_pred[:, j], n_features, np.count_nonzero(coefs[j, :]),
                s0, 0)
            mBIC_scores[j] = 2 * ll_ - BIC_ - BIC2_ + BIC3_ + P_M_

        # coefs rows index lambdas, so select the best row, not column.
        best = np.argmax(mBIC_scores)
        sparsity_estimates[boot] = np.count_nonzero(coefs[best, :]) / n_features

    return sparsity_estimates
Example #9
import time

import numpy as np
import pycasso
from sklearn.linear_model import lasso_path


# generate_sim_elnet and elnet_obj are helpers defined elsewhere in
# this benchmark script (data simulation and objective evaluation).
def test_elnet(n, p, c, nlambda=100):
    X, Y, true_beta = generate_sim_elnet(n, p, c)
    time0 = time.time()
    picasso = pycasso.Solver(X,
                             Y,
                             lambdas=(nlambda, 0.01),
                             family='gaussian',
                             penalty='l1')
    picasso.train()
    time1 = time.time()
    picasso_time = time1 - time0

    idx = 50
    picasso_obj = elnet_obj(X, Y, picasso.result['beta'][idx, :],
                            picasso.result['intercept'][idx],
                            picasso.lambdas[idx])

    time0 = time.time()

    X_intcpt = np.concatenate((X, np.ones(n).reshape(-1, 1)), axis=1)

    alphas_lasso, coefs_lasso, _ = lasso_path(X_intcpt,
                                              Y,
                                              alphas=picasso.lambdas,
                                              eps=1e-3)
    time1 = time.time()

    sklearn_obj = elnet_obj(X, Y, coefs_lasso[0:p, idx], coefs_lasso[p, idx],
                            alphas_lasso[idx] * 2)
    sklearn_time = time1 - time0

    print(
        "Testing L1 penalized linear regression, number of samples:%d, sample dimension:%d, correlation:%f"
        % (n, p, c))
    print("Picasso time:%f, Obj function value:%f" %
          (picasso_time, picasso_obj))
    print("Sklearn time:%f, Obj function value:%f" %
          (sklearn_time, sklearn_obj))
    return picasso_time, picasso_obj, sklearn_time, sklearn_obj
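elnet_obj is likewise an unshown benchmark helper. A plausible sketch, assuming a mean-squared-residual term plus an l1 penalty; the factor of 2 applied to alphas_lasso above suggests picasso's penalty convention differs from sklearn's 1/(2n) scaling by exactly that factor:

import numpy as np

def elnet_obj(X, y, beta, intercept, lamb):
    # Assumed objective: mean squared residual + lambda * ||beta||_1.
    resid = y - X @ beta - intercept
    return np.mean(resid ** 2) + lamb * np.sum(np.abs(beta))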
Example #10
import numpy as np
import pycasso
from sklearn.preprocessing import scale

## Generate the design matrix and regression coefficient vector
n = 100  # sample number
d = 80   # sample dimension
c = 0.5  # correlation parameter
s = 20   # support size of coefficient

X = scale(np.random.randn(n, d) + c * np.tile(np.random.randn(n), [d, 1]).T) / (n * (n - 1))**0.5
beta = np.append(np.random.rand(s), np.zeros(d - s))

## Generate response using Gaussian noise, and fit sparse linear models
noise = np.random.randn(n)
Y = np.matmul(X, beta) + noise


## l1 regularization solved with naive update
solver_l1 = pycasso.Solver(X, Y, lambdas=(100, 0.05), family="gaussian")
solver_l1.train()

## mcp regularization
solver_mcp = pycasso.Solver(X, Y, lambdas=(100, 0.05), penalty="mcp")
solver_mcp.train()

## scad regularization
solver_scad = pycasso.Solver(X, Y, lambdas=(100, 0.05), penalty="scad")
solver_scad.train()

## Obtain the result
result = solver_l1.coef()

## print out training time
print(result['total_train_time'])
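The dict returned by coef() holds the entire path, one row of coefficients per lambda. A sketch for extracting the estimate closest to a target regularization level, using the solver's lambdas attribute (the same attribute the benchmark examples above rely on):

## e.g. coefficients at the lambda nearest 0.1
idx = int(np.argmin(np.abs(np.asarray(solver_l1.lambdas) - 0.1)))
beta_hat = result['beta'][idx]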
Example #11
import numpy as np
import pycasso
from sklearn.preprocessing import scale

## Generate the design matrix and regression coefficient vector
n = 100  # sample number
d = 80   # sample dimension
c = 0.5  # correlation parameter
s = 20   # support size of coefficient

X = scale(np.random.randn(n, d) + c * np.tile(np.random.randn(n), [d, 1]).T) / (n * (n - 1))**0.5
beta = np.append(np.random.rand(s), np.zeros(d - s))

## Generate response using Gaussian noise, and fit sparse linear models
noise = np.random.randn(n)
Y = np.matmul(X, beta) + noise


## l1 regularization solved with naive update
solver_l1_naive = pycasso.Solver(X, Y, nlambda=100, family="gaussian", type_gaussian="naive")
solver_l1_naive.train()

## l1 regularization solved with covariance update
solver_l1_cov = pycasso.Solver(X, Y, nlambda=100, family="gaussian", type_gaussian="covariance")
solver_l1_cov.train()

## mcp regularization
solver_mcp = pycasso.Solver(X, Y, nlambda=100, penalty="mcp")
solver_mcp.train()

## scad regularization
solver_scad = pycasso.Solver(X, Y, nlambda=100, penalty="scad")
solver_scad.train()

## Obtain the result
result = solver_l1_naive.coef()
Example #12
import pycasso


def lasso(x, y):
    sol = pycasso.Solver(x, y)
    sol.train()
    return sol.coef()["beta"][-1]

	def fit(self, X_train_mod, y_train_mod, X_train, y_train, X_test, y_test, alpha_list, threshold_list, max_features, force_features=True):
		
		"""
		Hyperparameter/feature selection and fitting final model

		Parameters:
		-----------
		X_train_mod : array-like, shape (n_samples, n_features)
			Modified covariate matrix used to fit the regularization path

		y_train_mod : array-like, shape (n_samples,)
			Modified response vector used to fit the regularization path

		X_train : array-like, shape (n_samples, n_features)
			Covariate matrix with train data

		y_train : array-like, shape (n_samples,)
				  Response vector with train data

		X_test : array-like, shape (n_samples, n_features)
				 Covariate matrix with test data

		y_test : array-like, shape (n_samples,)
				 Response vector with test data

		alpha_list : array-like,
					 list of alphas for gridsearch

		threshold_list : array-like
						 list of thresholds (min abs value for coefficients to be selected as feature) 
						 for gridsearch

		max_features : int
				   max number of features that can be selected

		force_features : bool
					 if True, the chosen hyperparameters must select at least 1 feature

		"""
		self.alpha_list = alpha_list
		self.threshold_list = threshold_list
		self.max_features = max_features
		
		
		regr = LinearRegression()
		
		s = pycasso.Solver(X_train_mod, y_train_mod, lambdas=alpha_list, penalty=self.penalty)
		s.train()
		
		self.gridsearch_regression = s
		
		for i in range(len(self.alpha_list)):
			for j in range(len(self.threshold_list)):
				alpha = alpha_list[i]
				beta = s.coef()['beta'][i]
				threshold = self.threshold_list[j]
				feats = getFeatures(alpha, beta, threshold, self.max_features)
				
				score = getScoring(regr, X_train, y_train, X_test, y_test, feats, alpha)
				self.gridsearch_results_raw.append([alpha, threshold, beta, feats, score[0], score[1]])
		self.gridsearch_results_raw = pd.DataFrame(self.gridsearch_results_raw,
							columns=['alpha', 'threshold', 'beta', 'features', 'Train MSE', 'Test MSE'])
		
		self.gridsearch_results, self.alpha_, self.threshold_ = getBestParam(self.gridsearch_results_raw, force_features)
		
		self.y_avg = np.mean(y_train)
		
		#grab best beta
		if force_features:
			best_idx = self.gridsearch_results.sort_values(by = 'Test MSE').loc[self.gridsearch_results['num_features'] >= 1].index[0]
		else:
			best_idx = self.gridsearch_results['Test MSE'].idxmin()
		
		self.beta_ = self.gridsearch_results_raw.iloc[best_idx]['beta']
		
		self.feats_ = getFeatures(self.alpha_, self.beta_, threshold = self.threshold_, max_features = self.max_features)
		
		if len(self.feats_) == 0:
			self.regr_ = 'DID NOT CHOOSE ANY FEATURES'
		else:    
			self.regr_ = clone(regr)
			self.regr_.fit(X_train[:,self.feats_],y_train)        
			self.coef_ = self.regr_.coef_