def fit(self, warm_start=True, label=None, save=False, use_glmnet=False, **kwargs):
    '''
    Fit the model, either through the parent class or through glmnet.

    Note: the glmnet path only works with CV weights (i.e. all 0 or 1),
    because ``example_weights`` is used as a boolean row mask.

    Args:
        warm_start, label, save, **kwargs: forwarded unchanged to the parent
            class's ``fit`` when ``use_glmnet`` is False; ignored otherwise.
        use_glmnet: when True, solve the L1-penalised gaussian problem with
            ``glmnet_py`` and store the solution via ``self.params.set_free``.

    Returns:
        None.
    '''
    if not use_glmnet:
        super(LinearRegressionModel, self).fit(warm_start, label, save, **kwargs)
        return

    from glmnet_py import glmnet

    # The np.bool / np.float aliases were removed in NumPy 1.24; the builtin
    # types select exactly the same dtypes.
    inds = self.example_weights.astype(bool)
    x = self.training_data.X[inds, :].copy()
    y = self.training_data.Y[inds].copy().astype(float)

    # Penalty is rescaled by twice the number of selected rows.
    lambdau = np.array([self.L1Lambda / (2 * x.shape[0])])

    # The last column of X is withheld from glmnet and glmnet's own intercept
    # (a0) is appended in its place below -- presumably that column is a
    # constant intercept feature; confirm against the data loader.
    result = glmnet(
        x=x[:, :-1],
        y=y,
        family='gaussian',
        standardize=True,
        lambdau=lambdau,
        thresh=1e-10,
        maxit=10e4,
        alpha=1.0,
    )
    self.params.set_free(np.append(np.squeeze(result['beta']), result['a0']))
def fit(self, warm_start=True, label=None, save=False, use_glmnet=False, **kwargs):
    '''
    Fit the model, either through the parent class or through glmnet.

    Note: use_glmnet only works with CV weights (i.e. all 0 or 1), because
    ``example_weights`` is used as a boolean row mask.

    Args:
        warm_start, label, save, **kwargs: forwarded unchanged to the parent
            class's ``fit`` when ``use_glmnet`` is False; ignored otherwise.
        use_glmnet: when True, solve the L1-penalised binomial problem with
            ``glmnet_py`` and store the solution via ``self.params.set_free``.

    Returns:
        None.
    '''
    if not use_glmnet:
        super(LogisticRegressionModel, self).fit(warm_start, label, save, **kwargs)
        return

    from glmnet_py import glmnet

    # NOTE(review): the penalty is scaled by the total number of rows in X,
    # not by the number of rows selected by the weights -- confirm intended.
    lambdau = np.array([self.L1Lambda / self.training_data.X.shape[0]])

    # The np.bool / np.float aliases were removed in NumPy 1.24; the builtin
    # types select exactly the same dtypes.
    inds = self.example_weights.astype(bool)
    x = self.training_data.X[inds, :-1].copy()
    y = self.training_data.Y[inds].copy().astype(float)

    # Labels equal to -1 are recoded as 0 before being passed to glmnet.
    y[y == -1] = 0.0

    result = glmnet(
        x=x,
        y=y,
        family='binomial',
        standardize=True,
        lambdau=lambdau,
        thresh=1e-10,
        maxit=10e3,
        alpha=1.0,
    )

    # NOTE(review): glmnet's fitted intercept is discarded and the final free
    # parameter is set to 0 -- confirm this is deliberate.
    self.params.set_free(np.append(result['beta'], 0))
    return
def fit(self, warm_start=True, label=None, save=False, use_glmnet=False, **kwargs):
    '''
    Fit the model, either through the parent class or through glmnet.

    Note: use_glmnet only works with CV weights (i.e. all 0 or 1), because
    ``example_weights`` is used as a boolean row mask.

    Args:
        warm_start, label, save, **kwargs: forwarded unchanged to the parent
            class's ``fit`` when ``use_glmnet`` is False; ignored otherwise.
        use_glmnet: when True, run an L1-penalised poisson fit with
            ``glmnet_py`` on the selected rows.

    Returns:
        None.
    '''
    if not use_glmnet:
        super(ExponentialPoissonRegressionModel, self).fit(warm_start, label, save, **kwargs)
        return

    from glmnet_py import glmnet

    lambdau = np.array([self.L1Lambda])

    # The np.bool / np.float aliases were removed in NumPy 1.24; the builtin
    # types select exactly the same dtypes.
    inds = self.example_weights.astype(bool)
    x = self.training_data.X[inds, :].copy()
    y = self.training_data.Y[inds].copy().astype(float)

    # NOTE(review): the glmnet result is computed but never written back to
    # self.params, so this branch currently leaves the model unchanged.
    # The call is kept as-is; decide how beta/a0 should map onto the free
    # parameters before relying on this path.
    fit = glmnet(
        x=x,
        y=y,
        family='poisson',
        standardize=False,
        lambdau=lambdau,
        thresh=1e-20,
        maxit=10e4,
        alpha=1.0,
    )
def fit(self, warm_start=True, label=None, save=False, use_glmnet=False,
        use_cvxpy=False, cvxpy_tol=1e-4, **kwargs):
    '''
    Fit the model through the parent class, glmnet, or cvxpy.

    Note: use_cvxpy or use_glmnet only works with CV weights (i.e. all 0 or 1)

    Args:
        warm_start, label, save, **kwargs: forwarded unchanged to the parent
            class's ``fit`` when neither alternative solver is requested.
        use_glmnet: run an L1-penalised poisson fit with ``glmnet_py``. Takes
            precedence over ``use_cvxpy`` when both are True.
        use_cvxpy: minimise the weighted objective with cvxpy's SCS solver and
            store the solution via ``self.params.set_free``.
        cvxpy_tol: passed to SCS as ``eps``.

    Returns:
        None.
    '''
    if not use_cvxpy and not use_glmnet:
        super(ExponentialPoissonRegressionModel, self).fit(warm_start, label, save, **kwargs)
        return

    if use_glmnet:
        from glmnet_py import glmnet

        lambdau = np.array([self.L1Lambda])

        # The np.bool / np.float aliases were removed in NumPy 1.24; the
        # builtin types select exactly the same dtypes.
        inds = self.example_weights.astype(bool)
        x = self.training_data.X[inds, :].copy()
        y = self.training_data.Y[inds].copy().astype(float)

        # NOTE(review): the glmnet result is computed but never written back
        # to self.params, so this branch currently leaves the model unchanged.
        fit = glmnet(
            x=x,
            y=y,
            family='poisson',
            standardize=False,
            lambdau=lambdau,
            thresh=1e-20,
            maxit=10e4,
            alpha=1.0,
        )
        return

    # Remaining case: use_cvxpy is True and use_glmnet is False.
    import cvxpy

    D = self.params.get_free().shape[0]
    Ntrain = self.training_data.X.shape[0]
    # The variable is initialised at the current free parameters.
    theta = cvxpy.Variable(D, value=self.params.get_free())
    weights = self.example_weights
    X = self.training_data.X
    Y = self.training_data.Y

    # Objective: -Y'(w . X theta) + sum(w . exp(X theta)) + L1 penalty on all
    # but the last coefficient.
    # NOTE(review): `*` between arrays/expressions relies on cvxpy treating it
    # as matrix multiplication; newer cvxpy releases recommend `@` -- confirm
    # against the installed version before changing.
    linear_term = X * theta
    obj = cvxpy.Minimize(
        -Y * cvxpy.multiply(weights, linear_term)
        + cvxpy.sum(cvxpy.multiply(weights, cvxpy.exp(linear_term)))
        + self.L1Lambda * cvxpy.sum(cvxpy.abs(theta[:-1]))
    )
    problem = cvxpy.Problem(obj)

    scs_options = dict(solver=cvxpy.SCS, normalize=True, eps=cvxpy_tol, verbose=False)
    problem.solve(max_iters=2000, **scs_options)
    if problem.status in ('infeasible_inaccurate', 'unbounded_inaccurate'):
        problem.solve(max_iters=2000, **scs_options)
    # A final, longer solve is always run, regardless of the status above.
    problem.solve(max_iters=10000, **scs_options)

    # Best effort: if the solver left no usable value, report the status
    # instead of raising. Narrowed from a bare `except:` so KeyboardInterrupt
    # and SystemExit are no longer swallowed.
    try:
        self.params.set_free(theta.value)
    except Exception:
        print('Bad problem?', problem.status)
def glmnetFit(self, X, y, offsets, numCodons, numGenes, varsNames, lambda_min):
    """
    Fit a poisson glmnet model (alpha=0) with offsets, export per-gene and
    per-codon effects to ``self.output``, and report a summary on stderr.

    :param X: design matrix passed to glmnet (copied before fitting)
    :param y: response vector passed to glmnet (copied before fitting)
    :param offsets: offset term passed to glmnet
    :param numCodons: number of codon coefficients, located directly after
        the gene coefficients in the coefficient vector
    :param numGenes: number of gene coefficients, located directly after the
        intercept in the coefficient vector
    :param varsNames: variable names aligned with the columns of X; the part
        after the first "_" is used as the gene/codon label
    :param lambda_min: penalty to evaluate the fit at; if falsy, the penalty
        is chosen by cross-validation (cvglmnet) and a CV plot is saved
    :return: None; writes genesTE.csv, codons.csv and (CV only) lambda_cv.pdf
    """
    use_cv = not lambda_min

    # fit the model
    if use_cv:
        fit = cvglmnet(x=X.copy(), y=y.copy(), family='poisson', offset=offsets,
                       alpha=0, parallel=True, lambda_min=np.array([0]))
        coefs = cvglmnetCoef(fit, s=fit['lambda_min'])  # lambda_min lambda_1se
    else:
        fit = glmnet(x=X.copy(), y=y.copy(), family='poisson', offset=offsets,
                     alpha=0, lambda_min=np.array([0]))
        # scipy.float64 was a deprecated alias of the NumPy type and is gone
        # from recent SciPy releases; build the float64 array with NumPy.
        coefs = glmnetCoef(fit, s=np.array([lambda_min], dtype=np.float64))

    # parse and scale coefficients: row 0 is the intercept, then genes, then
    # codons, then one further coefficient.
    intercept = coefs[0][0]

    gene_rows = [[varsNames[i - 1].split("_")[1], coefs[i][0]]
                 for i in range(1, numGenes + 1)]
    geneBetas = pd.DataFrame(gene_rows, columns=["gene", "beta"])
    # centre on the median and convert from natural log to log2
    geneBetas["log2_TE"] = (geneBetas["beta"] - np.median(geneBetas["beta"])) / np.log(2)
    geneBetas.drop(["beta"], inplace=True, axis=1)

    codon_rows = [[varsNames[i - 1].split("_")[1], coefs[i][0]]
                  for i in range(numGenes + 1, numGenes + numCodons + 1)]
    codonBetas = pd.DataFrame(codon_rows, columns=["codon", "beta"])
    # centre on the median, then exponentiate
    codonBetas["log_codon_dwell_time"] = codonBetas["beta"] - np.median(codonBetas["beta"])
    codonBetas["codon_dwell_time"] = np.exp(codonBetas["log_codon_dwell_time"])
    codonBetas.drop(["beta", "log_codon_dwell_time"], inplace=True, axis=1)

    downstreamSLBeta = coefs[numGenes + numCodons + 1][0]

    # export to local (tab-separated despite the .csv extension)
    geneBetas.to_csv(path_or_buf=self.output + '/genesTE.csv', sep='\t',
                     header=True, index=False, float_format='%.4f')
    codonBetas.to_csv(path_or_buf=self.output + '/codons.csv', sep='\t',
                      header=True, index=False, float_format='%.4f')

    # print results
    if lambda_min:
        sys.stderr.write("[results]\tpre-defined lambda: " + str(lambda_min) + "\n")
    else:
        sys.stderr.write("[results]\tlambda that gives minimum mean cv error: " + str(fit['lambda_min']) + "\n")
        sys.stderr.write("[results]\tlambda 1 se away: " + str(fit['lambda_1se']) + "\n")
    sys.stderr.write("[results]\tintercept: " + str(intercept) + "\n")
    sys.stderr.write("[results]\tbetas for 2' structure windows: " + str(downstreamSLBeta) + "\n")

    # plot the cross-validation curve (only exists for the CV fit)
    if use_cv:
        plt.figure()
        cvglmnetPlot(fit)
        plt.savefig(self.output + "/" + "lambda_cv.pdf")
        plt.clf()
def _process(self, raw_data, regression_var):
    """
    Take in pre-processed data, run a regression via glmnet and return
    the results.

    A grid of 100 geometrically decreasing penalties is fitted in one glmnet
    call; the grid is then walked from the largest penalty downwards until a
    coefficient vector differs too much (relative to the summed penalties)
    from an earlier one. The coefficients at the stopping index are returned.

    Args:
        raw_data: A pandas.DataFrame that contains both the design matrix
            and the response vector along with any labels.
        regression_var: Positional index (int or int-like) of the column in
            raw_data that contains the response vector.

    Returns:
        A JSON string (``DataFrame.to_json(orient='columns')``) holding the
        non-zero regression coefficients, indexed by predictor name, with
        the intercept labelled "Intercept".
    """
    reg_idx = int(regression_var)

    # DataFrame.ix and .as_matrix() were removed in pandas 1.0; use explicit
    # positional / boolean selection and .values instead.
    predictor_mask = raw_data.columns != raw_data.columns[reg_idx]
    response = raw_data.iloc[:, reg_idx]
    predictors = raw_data.loc[:, predictor_mask]

    # Names must come from the predictor columns only: the coefficient vector
    # has one entry per column of X (plus the intercept), so including the
    # response column here would shift every name after it.
    col_names = list(predictors.columns.values)

    Y = response.values.astype(np.float64)
    X = predictors.values.astype(np.float64)

    C = 0.75
    lamdbaMax = 2 * np.max(np.abs(np.dot(X.T, Y))) / Y.size
    # Descending grid: lamdbaMax / 1.3**0, ..., lamdbaMax / 1.3**99
    lambdaGrid = lamdbaMax / 1.3 ** np.arange(0, 100)

    betas = glmnet_py.glmnetCoef(
        glmnet_py.glmnet(x=X.copy(), y=Y.copy(), lambdau=lambdaGrid)
    )

    # Last usable grid index; also guards against glmnet returning fewer
    # solutions than penalties requested.
    last_j = min(lambdaGrid.size, betas.shape[1]) - 1

    j = 0
    consistent = True
    while consistent and j < last_j:
        j += 1
        beta_j = betas[:, j]
        lambda_j = lambdaGrid[j]
        # NOTE(review): k runs over 1 .. j-2, skipping index 0 and j-1. This
        # may be a leftover from 1-based indexing -- confirm before changing.
        consistent = all(
            np.max(np.abs(beta_j - betas[:, k])) / (lambda_j + lambdaGrid[k]) <= C
            for k in range(1, j - 1)
        )

    # NOTE: a hard-thresholding step (|beta| >= 3 * C * lambda) was present
    # only as commented-out code in the original and is not applied.
    coefficients = betas[:, j]
    nz_indices = coefficients.nonzero()[0]
    support_coefs = coefficients[nz_indices]

    # glmnetCoef places the intercept term in the first position of the
    # coefficients vector -- need to manually add Intercept name to the
    # vector of column names
    col_names.insert(0, "Intercept")
    col_names = [col_names[idx] for idx in nz_indices]

    return pd.DataFrame(data=support_coefs, index=col_names).to_json(orient='columns')
all_counts = count_vect.transform(pt_data['notes'].note_text) all_counts = scipy.sparse.csc_matrix(all_counts).astype('float64') note_sums = scipy.sparse.csc_matrix( pt_data['notes'].word_count).transpose().astype('float64') all_counts = all_counts.multiply(note_sums.power(-1)) feature_sums = scipy.sparse.csc_matrix(all_counts.sum(axis=0)) all_counts = all_counts.multiply(feature_sums.power(-1)) all_counts = scipy.sparse.csc_matrix(all_counts) from glmnet_py import glmnet from glmnetPrint import glmnetPrint from glmnetCoef import glmnetCoef from glmnetPredict import glmnetPredict fit = glmnet(x=all_counts[train_sample, :], y=pt_data['notes'].loc[train_sample].surv_12mo.values * 1.0, family='binomial', alpha=1.0) chunksize = 1000 num_s = fit['lambdau'].shape[0] predictions = np.zeros([len(test_sample), num_s]) for i in range(int(len(test_sample) / chunksize)): #looping avoids MemoryError predictions[(i * chunksize):((i + 1) * chunksize), :] = glmnetPredict( fit, all_counts[test_sample[(i * chunksize):((i + 1) * chunksize)], :], ptype='response') predictions[((i + 1) * chunksize):, :] = glmnetPredict( fit, all_counts[test_sample[((i + 1) * chunksize):], :], ptype='response') for i in range(num_s):