def comp_varexp(Y, S, nneuron):
    """Compute variance explained from the GLM results, with parameters set above."""
    y = np.squeeze(Y[nneuron, :])  # spike train of interest
    stimulus = S[:, None]  # same stimulus for all neurons
    X = build_convolved_matrix(stimulus, Y.T, Ks, couple)  # design matrix with features projected onto basis functions
    glm = GLMCV(distr="binomial", tol=1e-5, eta=1.0,
                score_metric="deviance", alpha=0.,
                learning_rate=1e-6, max_iter=1000, cv=3,
                verbose=True)  # important to use a very slow learning_rate
    glm.fit(X, y)
    yhat = simulate_glm('binomial', glm.beta0_, glm.beta_, X)  # simulate spike rate given the fitted parameters
    varexp = np.corrcoef(y, yhat)
    return varexp
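# A hedged usage sketch (not in the original script): compute variance explained for each
# neuron. It assumes Y (neurons x time activity), S (stimulus), and the module-level Ks
# and couple used inside comp_varexp are defined as in the surrounding code.
varexp_all = [comp_varexp(Y, S, nn) for nn in range(Y.shape[0])]
print("mean correlation(y, yhat):",
      np.mean([v[0, 1] for v in varexp_all]))  # off-diagonal of each 2x2 corrcoef matrix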
def test_glmcv():
    """Test GLMCV class."""
    raises(ValueError, GLM, distr='blah')
    raises(ValueError, GLM, distr='gaussian', max_iter=1.8)

    scaler = StandardScaler()
    n_samples, n_features = 100, 10

    # coefficients
    beta0 = 1. / (float(n_features) + 1.) * \
        np.random.normal(0.0, 1.0)
    beta = 1. / (float(n_features) + 1.) * \
        np.random.normal(0.0, 1.0, (n_features,))

    distrs = ['softplus', 'gaussian', 'poisson', 'binomial', 'probit', 'gamma']
    solvers = ['batch-gradient', 'cdfast']
    score_metric = 'pseudo_R2'
    learning_rate = 2e-1

    for solver in solvers:
        for distr in distrs:

            if distr == 'gamma' and solver == 'cdfast':
                continue

            glm = GLMCV(distr, learning_rate=learning_rate,
                        solver=solver, score_metric=score_metric, cv=2)
            assert repr(glm)

            np.random.seed(glm.random_state)
            X_train = np.random.normal(0.0, 1.0, [n_samples, n_features])
            y_train = simulate_glm(glm.distr, beta0, beta, X_train)

            X_train = scaler.fit_transform(X_train)
            glm.fit(X_train, y_train)

            beta_ = glm.beta_
            assert_allclose(beta, beta_, atol=0.5)  # check fit

            y_pred = glm.predict(scaler.transform(X_train))
            assert y_pred.shape[0] == X_train.shape[0]
def kernel_MSE(Y, S, nneuron, Ks):
    """Compute MSE from the GLM results, with simulated activity, stimulus, and the ground-truth kernel."""
    ### GLM
    y = np.squeeze(Y[nneuron, :])  # spike train of interest
    stimulus = S[:, None]  # same stimulus for all neurons
    X = build_convolved_matrix(stimulus, Y.T, Ks, couple)  # design matrix with features projected onto basis functions
    glm = GLMCV(distr="binomial", tol=1e-5, eta=1.0,
                score_metric="deviance", alpha=0.,
                learning_rate=0.1, max_iter=1000, cv=3,
                verbose=True)  # important to use a very slow learning_rate
    glm.fit(X, y)

    ### store kernel
    theta_rec = glm.beta_[1:]
    theta_rec = theta_rec.reshape(nbasis, N + 1)
    K_rec = np.zeros((N + 1, pad))
    for ii in range(N + 1):
        K_rec[ii, :] = np.dot(theta_rec[:, ii], Ks)

    ### normalize and shift kernels
    K_rec_norm = np.array([K_rec[ii, :] / np.linalg.norm(K_rec[ii, :])
                           for ii in range(N + 1)])
    K_tru_norm = np.array([allK[nneuron, ii, :] / np.linalg.norm(allK[nneuron, ii, :])
                           for ii in range(N + 1)])
    # K_rec_norm = K_rec_norm.T - K_rec_norm[:, -1]
    # K_tru_norm = K_tru_norm.T - K_tru_norm[:, -1]
    mses = np.sum((K_rec_norm - K_tru_norm)**2, axis=1)  # MSE for each kernel
    return mses
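# A hedged usage sketch (not in the original script): score kernel recovery for every
# neuron and summarize. Assumes Y, S, Ks, and the module-level constants used inside
# kernel_MSE (N, nbasis, pad, couple, allK) are defined by the surrounding simulation.
all_mses = np.array([kernel_MSE(Y, S, nn, Ks) for nn in range(N)])
print("mean MSE per kernel (stimulus + couplings):", all_mses.mean(axis=0))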
def test_random_state_consistency():
    """Test model's random_state."""
    # Generate the dataset
    n_samples, n_features = 1000, 10
    beta0 = 1. / (float(n_features) + 1.) * np.random.normal(0.0, 1.0)
    beta = 1. / (float(n_features) + 1.) * \
        np.random.normal(0.0, 1.0, (n_features,))
    Xtrain = np.random.normal(0.0, 1.0, [n_samples, n_features])
    ytrain = simulate_glm("gaussian", beta0, beta, Xtrain,
                          sample=False, random_state=42)

    # Test simple glm
    glm_a = GLM(distr="gaussian", random_state=1)
    ypred_a = glm_a.fit_predict(Xtrain, ytrain)
    glm_b = GLM(distr="gaussian", random_state=1)
    ypred_b = glm_b.fit_predict(Xtrain, ytrain)
    # Consistency between two different models
    assert_array_equal(ypred_a, ypred_b)

    # Test also cross-validation
    glm_cv_a = GLMCV(distr="gaussian", cv=3, random_state=1)
    ypred_a = glm_cv_a.fit_predict(Xtrain, ytrain)
    glm_cv_b = GLMCV(distr="gaussian", cv=3, random_state=1)
    ypred_b = glm_cv_b.fit_predict(Xtrain, ytrain)
    ypred_c = glm_cv_b.fit_predict(Xtrain, ytrain)
    assert_array_equal(ypred_a, ypred_b)
    assert_array_equal(ypred_b, ypred_c)
def test_glmcv():
    """Test GLMCV class."""
    scaler = StandardScaler()
    n_samples, n_features = 100, 10

    # coefficients
    beta0 = 1. / (float(n_features) + 1.) * \
        np.random.normal(0.0, 1.0)
    beta = 1. / (float(n_features) + 1.) * \
        np.random.normal(0.0, 1.0, (n_features,))

    distrs = ['softplus', 'gaussian', 'poisson', 'binomial', 'probit', 'gamma']
    solvers = ['batch-gradient', 'cdfast']
    score_metric = 'pseudo_R2'
    learning_rate = 2e-1

    for solver in solvers:
        for distr in distrs:

            if distr == 'gamma' and solver == 'cdfast':
                continue

            glm = GLMCV(distr, learning_rate=learning_rate,
                        solver=solver, score_metric=score_metric)
            assert_true(repr(glm))

            np.random.seed(glm.random_state)
            X_train = np.random.normal(0.0, 1.0, [n_samples, n_features])
            y_train = simulate_glm(glm.distr, beta0, beta, X_train)

            X_train = scaler.fit_transform(X_train)
            glm.fit(X_train, y_train)

            beta_ = glm.beta_
            assert_allclose(beta, beta_, atol=0.5)  # check fit

            y_pred = glm.predict(scaler.transform(X_train))
            assert_equal(y_pred.shape[0], X_train.shape[0])
def test_glmcv(distr, solver):
    """Test GLMCV class."""
    raises(ValueError, GLM, distr='blah')
    raises(ValueError, GLM, distr='gaussian', max_iter=1.8)

    scaler = StandardScaler()
    n_samples, n_features = 100, 10

    # coefficients
    beta0 = 1. / (float(n_features) + 1.) * \
        np.random.normal(0.0, 1.0)
    beta = 1. / (float(n_features) + 1.) * \
        np.random.normal(0.0, 1.0, (n_features,))

    score_metric = 'pseudo_R2'
    learning_rate = 2e-1

    if not (distr == 'gamma' and solver == 'cdfast'):
        glm = GLMCV(distr, learning_rate=learning_rate,
                    solver=solver, score_metric=score_metric, cv=2)
        assert repr(glm)

        np.random.seed(glm.random_state)
        X_train = np.random.normal(0.0, 1.0, [n_samples, n_features])
        y_train = simulate_glm(glm.distr, beta0, beta, X_train)

        X_train = scaler.fit_transform(X_train)
        glm.fit(X_train, y_train)

        beta_ = glm.beta_
        assert_allclose(beta, beta_, atol=0.5)  # check fit

        y_pred = glm.predict(scaler.transform(X_train))
        assert y_pred.shape[0] == X_train.shape[0]

    # test picky score_metric check within fit().
    glm.score_metric = 'bad_score_metric'  # reuse last glm
    raises(ValueError, glm.fit, X_train, y_train)
X, y = datasets.fetch_community_crime_data()
n_samples, n_features = X.shape

########################################################
# Split the data into training and test sets

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.33, random_state=0)

########################################################
# Fit a binomial distributed GLM with elastic net regularization

# use the default value for reg_lambda
glm = GLMCV(distr='binomial', alpha=0.05,
            score_metric='pseudo_R2', cv=3, tol=1e-4)

# fit model
glm.fit(X_train, y_train)

# score the test set prediction
y_test_hat = glm.predict_proba(X_test)
print("test set pseudo $R^2$ = %f" % glm.score(X_test, y_test))

########################################################
# Now use GridSearchCV to compare

import numpy as np  # noqa
from sklearn.model_selection import GridSearchCV  # noqa
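# A hedged sketch of how the GridSearchCV comparison might continue (assumed, not the
# original example's exact code): grid-search the single-lambda GLM estimator over a
# range of reg_lambda values, reusing the train/test split defined above.
from pyglmnet import GLM  # noqa

param_grid = [{'reg_lambda': np.logspace(np.log10(0.5), np.log10(0.01), 10)}]
glm_grid = GridSearchCV(GLM(distr='binomial', alpha=0.05,
                            score_metric='pseudo_R2', tol=1e-4),
                        param_grid, cv=3)
glm_grid.fit(X_train, y_train)
print("best reg_lambda from grid search: %f"
      % glm_grid.best_params_['reg_lambda'])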
# %%
###############################################################################
# %% inference method (single)
nneuron = 2
pad = 100    # window for kernel
nbasis = 7   # number of basis functions
couple = 1   # whether or not coupled cells are considered
Y = np.squeeze(rate[nneuron, :])  # spike train of interest
Ks = (np.fliplr(basis_function1(pad, nbasis).T).T).T  # basis functions used for kernel approximation
stimulus = stim[:, None]  # same stimulus for all neurons
X = build_convolved_matrix(stimulus, rate.T, Ks, couple)  # design matrix with features projected onto basis functions

### pyGLMnet function with optimal parameters
glm = GLMCV(distr="binomial", tol=1e-5, eta=1.0,
            score_metric="deviance", alpha=0.,
            learning_rate=1e-6, max_iter=1000, cv=3,
            verbose=True)  # important to use a very slow learning_rate
glm.fit(X, Y)

# %% direct simulation
yhat = simulate_glm('binomial', glm.beta0_, glm.beta_, X)  # simulate spike rate given the fitted parameters
plt.figure()
plt.plot(Y * 1.)      # ground truth
plt.plot(yhat, '--')

# %% reconstruct kernel
theta = glm.beta_
dc_ = theta[0]
theta_ = theta[1:]
if couple == 1:
    theta_ = theta_.reshape(nbasis, N + 1)  # nbasis times (stimulus + N neurons)
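# %% project weights back onto the basis
# A hedged follow-up sketch, mirroring the kernel_MSE logic above (not verbatim from the
# original script): project each column of the recovered basis weights theta_ onto the
# basis functions Ks to obtain one temporal kernel per input (stimulus + N neurons).
K_rec = np.zeros((N + 1, pad))
for ii in range(N + 1):
    K_rec[ii, :] = np.dot(theta_[:, ii], Ks)  # weighted sum of basis functions
plt.figure()
for ii in range(N + 1):
    plt.plot(K_rec[ii, :] / np.linalg.norm(K_rec[ii, :]))  # normalized recovered kernels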
# Set up the training and testing sets
from sklearn.model_selection import train_test_split  # noqa

X = df[df.columns.difference(["Label"])].values
y = df.loc[:, "Label"].values
Xtrain, Xtest, ytrain, ytest = \
    train_test_split(X, y, test_size=0.2, random_state=42)

##########################################################
# Setup the models

# set up the group lasso GLM model
gl_glm = GLMCV(distr="binomial", tol=1e-3,
               group=group_idxs, score_metric="pseudo_R2",
               alpha=1.0, cv=3)

# set up the lasso model
glm = GLMCV(distr="binomial", tol=1e-3,
            score_metric="pseudo_R2", alpha=1.0, cv=3)

print("gl_glm: ", gl_glm)
print("glm: ", glm)

##########################################################
# Fit models

gl_glm.fit(Xtrain, ytrain)
glm.fit(Xtrain, ytrain)
########################################################
# Fit models

from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = \
    train_test_split(features, spike_counts, test_size=0.2, random_state=42)

########################################################

from pyglmnet import utils
n_samples = Xtrain.shape[0]
Tau = utils.tikhonov_from_prior(prior_cov, n_samples)

glm = GLMCV(distr='poisson', alpha=0., Tau=Tau,
            score_metric='pseudo_R2', cv=3)
glm.fit(Xtrain, Ytrain)
print("train score: %f" % glm.score(Xtrain, Ytrain))
print("test score: %f" % glm.score(Xtest, Ytest))
weights = glm.beta_

########################################################
# Visualize

for time_bin_ in range(n_temporal_basis):
    RF = strf_model.make_image_from_spatial_basis(
        spatial_basis,
        weights[range(time_bin_, n_spatial_basis * n_temporal_basis, n_temporal_basis)])
    plt.subplot(1, n_temporal_basis, time_bin_ + 1)
# Download and preprocess data files
X, y = datasets.fetch_community_crime_data('/tmp/glm-tools')
n_samples, n_features = X.shape

########################################################
# Split the data into training and test sets

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.33, random_state=0)

########################################################
# Fit a gaussian distributed GLM with elastic net regularization

# use the default value for reg_lambda
glm = GLMCV(distr='gaussian', alpha=0.05, score_metric='pseudo_R2')

# fit model
glm.fit(X_train, y_train)

# score the test set prediction
y_test_hat = glm.predict(X_test)
print("test set pseudo $R^2$ = %f" % glm.score(X_test, y_test))

########################################################
# Plot the true and predicted test set target values

plt.plot(y_test[:50], 'ko-')
plt.plot(y_test_hat[:50], 'ro-')
plt.legend(['true', 'pred'], frameon=False)
plt.xlabel('Counties')
def fit_group_lasso(
    X,
    knockoffs,
    y,
    groups,
    use_pyglm=True,
    y_dist=None,
    group_lasso=True,
    **kwargs,
):
    """
    Fits a group lasso model.

    :param X: n x p design matrix
    :param knockoffs: n x p knockoff matrix
    :param groups: p-length numpy array of groups
    :param use_pyglm: If True, use the pyglmnet group lasso implementation;
        else use the group_lasso package.
    :param y_dist: Either "gaussian" or "binomial" (for logistic regression)
    :param group_lasso: If False, do not use group regularization.
    :param kwargs: kwargs for the group-lasso GroupLasso class.
        In particular includes reg_vals, a list of regularizations
        (lambda values) which defaults to [(0.05, 0.05)]. In each tuple
        of the list, the first value is the group regularization and the
        second value is the individual regularization.
    """
    warnings.filterwarnings("ignore")

    # Parse some kwargs/defaults
    if "max_iter" in kwargs:
        max_iter = kwargs.pop("max_iter")
    else:
        max_iter = 100
    if "tol" in kwargs:
        tol = kwargs.pop("tol")
    else:
        tol = 1e-2
    if "cv" in kwargs:
        cv = kwargs.pop("cv")
    else:
        cv = 5
    if "learning_rate" in kwargs:
        learning_rate = kwargs.pop("learning_rate")
    else:
        learning_rate = 2
    if y_dist is None:
        y_dist = parse_y_dist(y)

    # Bind data
    n = X.shape[0]
    p = X.shape[1]
    features = np.concatenate([X, knockoffs], axis=1)

    # By default, all variables are their own group
    if groups is None:
        groups = np.arange(1, p + 1, 1)
    m = np.unique(groups).shape[0]

    # If m == p, meaning each variable is its own group,
    # just fit a regular lasso
    if m == p or not group_lasso:
        return fit_lasso(X, knockoffs, y, y_dist, **kwargs)

    # Make sure variables and their knockoffs are in the same group.
    # This is necessary for antisymmetry
    doubled_groups = np.concatenate([groups, groups], axis=0)

    # Randomize coordinates to make sure everything is symmetric
    inds, rev_inds = random_permutation_inds(2 * p)
    features = features[:, inds]
    doubled_groups = doubled_groups[inds]

    # Standardize - important for pyglmnet performance,
    # highly detrimental for group_lasso performance
    if use_pyglm:
        features = (features - features.mean()) / features.std()
        if y_dist == "gaussian":
            y = (y - y.mean()) / y.std()

    # Get regularizations
    if "reg_vals" in kwargs:
        reg_vals = kwargs.pop("reg_vals")
    else:
        reg_vals = [(x, x) for x in DEFAULT_REG_VALS]

    # Fit pyglm model using warm starts
    if use_pyglm:
        l1_regs = [x[0] for x in reg_vals]
        gl = GLMCV(
            distr=y_dist,
            tol=tol,
            group=doubled_groups,
            alpha=1.0,
            learning_rate=learning_rate,
            max_iter=max_iter,
            reg_lambda=l1_regs,
            cv=cv,
            solver="cdfast",
        )
        gl.fit(features, y)

        # Pull score, rename
        best_score = -1 * calc_mse(gl, features, y)
        best_gl = gl

    # Fit model
    if not use_pyglm:
        best_gl = None
        best_score = -1 * np.inf
        for group_reg, l1_reg in reg_vals:

            # Fit logistic/gaussian group lasso
            if not use_pyglm:
                if y_dist.lower() == "gaussian":
                    gl = GroupLasso(
                        groups=doubled_groups,
                        tol=tol,
                        group_reg=group_reg,
                        l1_reg=l1_reg,
                        **kwargs,
                    )
                elif y_dist.lower() == "binomial":
                    gl = LogisticGroupLasso(
                        groups=doubled_groups,
                        tol=tol,
                        group_reg=group_reg,
                        l1_reg=l1_reg,
                        **kwargs,
                    )
                else:
                    raise ValueError(
                        f"y_dist must be one of gaussian, binomial, not {y_dist}"
                    )

                gl.fit(features, y.reshape(n, 1))
                score = -1 * calc_mse(gl, features, y.reshape(n, 1))

            # Score, possibly select
            if score > best_score:
                best_score = score
                best_gl = gl

    warnings.resetwarnings()
    return best_gl, inds, rev_inds
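# A minimal usage sketch (an illustration, not part of the original module): the data,
# shapes, and group labels below are made up, and it assumes fit_group_lasso and its
# module-level helpers (calc_mse, DEFAULT_REG_VALS, random_permutation_inds) are in scope.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n, p = 200, 10
    X_demo = rng.standard_normal((n, p))
    knockoffs_demo = rng.standard_normal((n, p))  # placeholder knockoff matrix
    beta_demo = np.zeros(p)
    beta_demo[:3] = 1.0
    y_demo = X_demo @ beta_demo + rng.standard_normal(n)
    groups_demo = np.repeat(np.arange(1, p // 2 + 1), 2)  # p-length group labels
    best_gl, inds, rev_inds = fit_group_lasso(
        X_demo, knockoffs_demo, y_demo, groups_demo, y_dist="gaussian"
    )
    print("selected model:", best_gl)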
                             normalize=True)
    else:
        model = LassoCV(max_iter=100000, n_jobs=-1, n_alphas=1000,
                        random_state=1, cv=5)
    model.fit(train, labels["incidence"].fillna(0))
    result = model.predict(test)
    if using_elastic:
        print("Best L1_ratio: {}".format(model.l1_ratio_))
        print("Best Alpha: {}".format(model.alpha_))
elif using_negbinomial:
    model = GLMCV(distr='neg-binomial', score_metric="pseudo_R2", cv=3)
    model.fit(train.values.copy(),
              labels["incidence"].fillna(0).copy().values)
    result = model.predict(test.values)
elif using_poisson:
    model = cvglmnet(x=train.values.copy(),
                     y=labels["incidence"].fillna(0).copy().values,
                     family='poisson', alpha=1.0, ptype="mse",
                     parallel=True, nfolds=10)
    result = cvglmnetPredict(model, test.values, ptype='response',
                             s="lambda_min")
def fit(self, X, Y, get_history_terms=True):
    """
    Fits the model to the data in X to predict the response Y.

    Imports models and creates model instance as well.

    Parameters
    ----------
    X: float, n_samples x n_features, features of interest
    Y: float, n_samples x 1, population activity
    get_history_terms: Boolean. Whether to compute the temporal features.
        Note that if spike_history and cov_history are False, no history
        will be computed anyways and the flag does nothing.
    """
    if self.default_params:
        warnings.warn(
            '\n Using default hyperparameters. Consider optimizing on'
            ' a held-out dataset using, e.g. hyperopt or random search')

    # make the covariate matrix. Include spike or covariate history?
    # The different methods here are to satisfy the needs of recurrent
    # keras models
    if get_history_terms:
        if self.tunemodel == 'lstm':
            X, Y = self.get_all_with_history_keras(X, Y)
        else:
            X, Y = self.get_all_with_history(X, Y)

    if self.tunemodel == 'glm':

        model = GLMCV(**self.params)
        model.fit(X, Y)

        # we want the last of the regularization path
        # self.model = model[-1]
        self.GLMCV = model
        self.model = model.glm_

    elif self.tunemodel == 'feedforward_nn':

        if np.ndim(X) == 1:
            X = np.transpose(np.atleast_2d(X))

        params = self.params
        model = Sequential()
        model.add(Dense(params['n1'], input_dim=np.shape(X)[1],
                        kernel_initializer='glorot_normal',
                        activation='relu',
                        kernel_regularizer=l2(params['l2'])))
        model.add(Dropout(params['dropout']))
        model.add(BatchNormalization())
        model.add(Dense(params['n2'],
                        kernel_initializer='glorot_normal',
                        activation='relu',
                        kernel_regularizer=l2(params['l2'])))
        model.add(BatchNormalization())
        model.add(Dense(1, activation='softplus'))
        optim = adam(lr=params['lr'], clipnorm=params['clipnorm'],
                     decay=params['decay'],
                     beta_1=1 - params['b1'], beta_2=1 - params['b2'])
        model.compile(loss='poisson', optimizer=optim)
        hist = model.fit(X, Y, batch_size=128, epochs=30,
                         verbose=self.verbose)
        self.model = model

    elif self.tunemodel == 'xgboost':

        dtrain = xgb.DMatrix(X, label=Y)
        num_round = 200
        self.model = xgb.train(self.params, dtrain, num_round)

    elif self.tunemodel == 'random_forest':

        self.model = RandomForestRegressor(**self.params)
        self.model.fit(X, Y)

    elif self.tunemodel == 'lstm':

        if np.ndim(X) == 1:
            X = np.transpose(np.atleast_2d(X))

        params = self.params
        model = Sequential()  # Declare model
        # Add recurrent layer; within it, include dropout
        model.add(LSTM(int(params['n_units']),
                       input_shape=(X.shape[1], X.shape[2]),
                       dropout_W=params['dropout'],
                       dropout_U=params['dropout']))
        # Dropout some units (recurrent layer output units)
        model.add(Dropout(params['dropout']))
        # Add dense connections to output layer
        model.add(Dense(1, activation='softplus'))

        # Fit model (and set fitting parameters)
        model.compile(loss='poisson', optimizer='rmsprop',
                      metrics=['accuracy'])
        model.fit(X, Y, epochs=int(params['epochs']),
                  batch_size=int(params['batch_size']),
                  verbose=self.verbose)  # Fit the model
        self.model = model

    else:  # using a predefined model
        self.model.fit(X, Y)
# Set up the training and testing sets
from sklearn.model_selection import train_test_split  # noqa

Xtrain, Xtest, ytrain, ytest = \
    train_test_split(X, y, test_size=0.2, random_state=42)

##########################################################
# Setup the models

# set up the group lasso GLM model
gl_glm = GLMCV(distr="binomial", tol=1e-3,
               group=group, score_metric="pseudo_R2",
               alpha=1.0, learning_rate=3, max_iter=100,
               cv=3, verbose=True)

# set up the lasso model
glm = GLMCV(distr="binomial", tol=1e-3,
            score_metric="pseudo_R2", alpha=1.0,
            learning_rate=3, max_iter=100, cv=3, verbose=True)
# Download and preprocess data files
X, y = datasets.fetch_community_crime_data('/tmp/glm-tools')
n_samples, n_features = X.shape

########################################################
# Split the data into training and test sets

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.33, random_state=0)

########################################################
# Fit a gaussian distributed GLM with elastic net regularization

# use the default value for reg_lambda
glm = GLMCV(distr='gaussian', alpha=0.05, score_metric='pseudo_R2')

# fit model
glm.fit(X_train, y_train)

# score the test set prediction
y_test_hat = glm.predict(X_test)
print("test set pseudo $R^2$ = %f" % glm.score(X_test, y_test))

########################################################
# Now use plain grid search cv to compare

import numpy as np  # noqa
from sklearn.model_selection import GridSearchCV  # noqa
from sklearn.model_selection import StratifiedKFold  # noqa
# %%
plt.figure()
plt.plot(K)
plt.plot(kernel(theta_infer, pad), '-o')
plt.plot(kernel(theta0, pad), '--')

# %%
###############################################################################
###############################################################################
# %% test pyGLM
### This is super-easy if we rely on built-in GLM fitting code
glm = GLMCV(distr="binomial", tol=1e-3,
            score_metric="pseudo_R2",
            alpha=1.0, learning_rate=3, max_iter=100,
            cv=3, verbose=True)
glm.fit(X, np.squeeze(spk))

# %%
plt.figure()
pyglm_infer = glm.beta_
plt.plot(pyglm_infer / np.linalg.norm(pyglm_infer))
plt.plot(K / np.linalg.norm(K), '--')

# %% Two-neuron circuit with pyGLMnet
###############################################################################
###############################################################################
                             normalize=True)
    else:
        model = LassoCV(max_iter=100000, n_jobs=-1, n_alphas=1000,
                        random_state=1, cv=5)
    model.fit(train, labels["incidence"].fillna(0))
    result = model.predict(test)
    if using_elastic:
        print("Best L1_ratio: {}".format(model.l1_ratio_))
        print("Best Alpha: {}".format(model.alpha_))
elif using_negbinomial:
    model = GLMCV(distr='neg-binomial', score_metric="pseudo_R2", cv=3)
    model.fit(train.values.copy(),
              labels["incidence"].fillna(0).copy().values)
    result = model.predict(test.values)
elif using_poisson:
    # Convert features into count-based values
    train_converted = train.values.copy()
    incidence_converted = labels["incidence"].fillna(1).copy().values
    model = GLMCV(distr='poisson', score_metric="deviance",
                  max_iter=5000, cv=3)
    model.fit(train_converted, incidence_converted)
    result = model.predict(test.values)

    # Get the feature coefficients
# Set up the training and testing sets
from sklearn.model_selection import train_test_split  # noqa

X = df[df.columns.difference(["Label"])].values
y = df.loc[:, "Label"].values
Xtrain, Xtest, ytrain, ytest = \
    train_test_split(X, y, test_size=0.2, random_state=42)

##########################################################
# Setup the models

# set up the group lasso GLM model
gl_glm = GLMCV(distr="binomial", tol=1e-2,
               group=group_idxs, score_metric="pseudo_R2",
               alpha=1.0)

# set up the lasso model
glm = GLMCV(distr="binomial", tol=1e-2,
            score_metric="pseudo_R2", alpha=1.0)

print("gl_glm: ", gl_glm)
print("glm: ", glm)

##########################################################
# Fit models

gl_glm.fit(Xtrain, ytrain)
glm.fit(Xtrain, ytrain)