def test_few_at_least_as_good_as_default():
    """test_few.py: few performs at least as well as the default ML

    Fits FEW (wrapping LassoLarsCV) on the first 300 shuffled Boston rows and
    asserts that its training score is not lower than that of a plain
    LassoLarsCV fitted on FEW's internal training split.
    """
    np.random.seed(1006987)
    boston = load_boston()

    # Shuffle features and target together so the rows stay aligned.
    shuffled = np.column_stack((boston.data, boston.target))
    np.random.shuffle(shuffled)
    features = shuffled[:, 0:-1]
    target = shuffled[:, -1]
    train_x, train_y = features[:300], target[:300]
    holdout_x, holdout_y = features[300:], target[300:]

    print("feature shape:", boston.data.shape)

    learner = FEW(generations=1,
                  population_size=5,
                  mutation_rate=1,
                  crossover_rate=1,
                  ml=LassoLarsCV(),
                  min_depth=1,
                  max_depth=3,
                  sel='tournament',
                  fit_choice='r2',
                  tourn_size=2,
                  random_state=0,
                  verbosity=0,
                  disable_update_check=False)
    learner.fit(train_x, train_y)
    few_score = learner.score(train_x, train_y)
    test_score = learner.score(holdout_x, holdout_y)

    # The baseline is fitted on the split FEW kept for its own training,
    # but scored on the same 300 rows as FEW.
    lasso = LassoLarsCV()
    lasso.fit(learner._training_features, learner._training_labels)
    lasso_score = lasso.score(train_x, train_y)

    print("few score:", few_score, "lasso score:", lasso_score)
    print("few test score:", test_score, "lasso test score:",
          lasso.score(holdout_x, holdout_y))
    assert few_score >= lasso_score
    print("lasso coefficients:", lasso.coef_)
def main():
    """Train LassoLarsCV on the full training set and write a submission CSV.

    Loads the train/test frames through ``data_util.load_dataset()``, removes
    the ``ID`` and ``y`` columns from the features, fits the model and writes
    the test predictions to ``lassolars_model_result.csv``.
    """
    # print() with a single pre-formatted string emits the same text on
    # Python 2 and Python 3 (the original used Python 2 print statements).
    print('load datas...')
    train, test = data_util.load_dataset()

    y_train_all = train['y']
    del train['ID']
    del train['y']
    id_test = test['ID']
    del test['ID']
    print('train: {} , test: {}'.format(train.shape, test.shape))

    model = LassoLarsCV(fit_intercept=True,
                        verbose=False,
                        max_iter=500,
                        normalize=True,
                        precompute='auto',
                        cv=5,
                        max_n_alphas=1000,
                        n_jobs=-1,
                        eps=2.2204460492503131e-16,
                        copy_X=True,
                        positive=False)
    model.fit(train.values, y_train_all)

    print('predict submit...')
    y_pred = model.predict(test.values)
    df_sub = pd.DataFrame({'ID': id_test, 'y': y_pred})
    # NOTE(review): the original carried the annotation "0.55827" here,
    # presumably the score this submission obtained -- confirm.
    df_sub.to_csv('lassolars_model_result.csv', index=False)
class OwnLassoLarsCV(
        AutoSklearnRegressionAlgorithm, ):
    """Regression component wrapping scikit-learn's ``LassoLarsCV`` (5-fold CV).

    The underlying estimator is created lazily in ``fit``; it has no tunable
    hyperparameters, so the search space is empty.
    """

    def __init__(self, random_state=None):
        # Keep the constructor argument readable as an attribute instead of
        # silently dropping it. NOTE(review): scikit-learn style parameter
        # introspection (get_params/clone) expects this -- confirm the base
        # class relies on it.
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        """Fit a ``LassoLarsCV(cv=5)`` on ``X``/``y`` and return ``self``."""
        from sklearn.linear_model import LassoLarsCV
        self.estimator = LassoLarsCV(cv=5)
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return predictions for ``X``.

        Raises:
            NotImplementedError: if ``fit`` has not been called yet.
        """
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static component description."""
        return {
            'shortname': 'LL',
            'name': 'LassoLarsCV',
            'handles_regression': True,
            'handles_classification': False,
            'handles_multiclass': False,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Return an empty ``ConfigurationSpace`` (nothing to tune)."""
        cs = ConfigurationSpace()
        return cs
def cv_train_lasso_lars_with_sparse_refit(x_train,
                                          y_train,
                                          pval_cutoff=0.001,
                                          do_sparse_refit=True):
    """Fit LassoLarsCV and optionally refit at a sparser, statistically tied alpha.

    Args:
        x_train: training feature matrix; its row count caps the number of
            CV folds at 10.
        y_train: training targets.
        pval_cutoff: a candidate alpha is rejected once a t-test between its
            per-fold MSEs and those of the CV-optimal alpha gives a p-value
            below this cutoff.
        do_sparse_refit: when False, the fitted ``LassoLarsCV`` is returned
            unchanged.

    Returns:
        The fitted ``LassoLarsCV`` model, or a ``LassoLars`` model refitted at
        the last alpha (walking upward through ``cv_alphas_`` from the optimum)
        whose CV error is not significantly different from the optimum.
    """
    model = LassoLarsCV(n_jobs=-1, cv=min(x_train.shape[0], 10))
    model.fit(x_train, y_train)
    if not do_sparse_refit:
        return model

    # Locate the selected alpha by nearest match. The original used
    # int(np.argwhere(alpha_ == cv_alphas_)), which depends on exact float
    # equality and on converting a 2-D array to a scalar.
    best_alpha_idx = int(np.argmin(np.abs(model.cv_alphas_ - model.alpha_)))

    n_alphas = len(model.cv_alphas_)
    # Default: no candidate was significantly worse, so take the last alpha.
    # NOTE(review): assumes cv_alphas_ is ascending so that a larger index
    # means a sparser solution -- confirm against the sklearn version in use.
    sparse_alpha_idx = n_alphas - 1
    for i in range(best_alpha_idx + 1, n_alphas):
        pval = ttest_ind(model.mse_path_[best_alpha_idx],
                         model.mse_path_[i]).pvalue
        if pval < pval_cutoff:
            # First significantly worse alpha: keep the one just before it.
            sparse_alpha_idx = i - 1
            break

    model_sparse = LassoLars(alpha=model.cv_alphas_[sparse_alpha_idx])
    model_sparse.fit(x_train, y_train)
    return model_sparse
def test_few_at_least_as_good_as_default():
    """test_few.py: few performs at least as well as the default ML

    Compares FEW (wrapping LassoLarsCV) with a plain LassoLarsCV, both fitted
    on the first 300 shuffled Boston rows; training scores are compared after
    rounding to 8 decimals.
    """
    np.random.seed(1006987)
    boston = load_boston()

    # Shuffle features and target as one matrix so the rows stay aligned.
    table = np.column_stack((boston.data, boston.target))
    np.random.shuffle(table)
    features = table[:, 0:-1]
    target = table[:, -1]
    fit_x, fit_y = features[:300], target[:300]
    rest_x, rest_y = features[300:], target[300:]

    print("feature shape:", boston.data.shape)

    learner = FEW(generations=1,
                  population_size=5,
                  ml=LassoLarsCV(),
                  min_depth=1,
                  max_depth=3,
                  sel='tournament')
    learner.fit(fit_x, fit_y)
    few_score = learner.score(fit_x, fit_y)
    few_test_score = learner.score(rest_x, rest_y)

    lasso = LassoLarsCV()
    lasso.fit(fit_x, fit_y)
    lasso_score = lasso.score(fit_x, fit_y)
    lasso_test_score = lasso.score(rest_x, rest_y)

    print("few score:", few_score, "lasso score:", lasso_score)
    print("few test score:", few_test_score, "lasso test score:",
          lasso_test_score)
    # Rounding absorbs floating-point noise when both models coincide.
    assert round(few_score, 8) >= round(lasso_score, 8)
    print("lasso coefficients:", lasso.coef_)
def lassolarscv():
    """Fit LassoLarsCV with a 5-iteration shuffle split and write test predictions.

    Reads the module-level ``base_X``, ``base_Y`` and ``X_test``; prints the
    score on the training data and writes the predictions for ``X_test`` to
    ``lassolars.csv`` via ``write_to_file``.
    """
    print("Doing cross-validated LassoLars")
    splitter = cross_validation.ShuffleSplit(len(base_X),
                                             n_iter=5,
                                             test_size=0.2,
                                             random_state=0)
    regressor = LassoLarsCV(cv=splitter)
    regressor.fit(base_X, base_Y)

    training_score = regressor.score(base_X, base_Y)
    print("Score = %f" % training_score)

    predictions = regressor.predict(X_test)
    write_to_file("lassolars.csv", predictions)
def train_test_lasso(input_data, output_data, train_key, test_key, n_cv=3):
    """Train a positive-constrained lasso regression and predict on test rows.

    Inputs and targets are standardised before fitting; predictions are
    mapped back to the original target scale.

    Args:
        input_data: 2-D feature array, indexed by row with the key arguments.
        output_data: 1-D target array aligned with ``input_data`` rows.
        train_key: row indices used for training.
        test_key: row indices used for prediction.
        n_cv: number of CV folds; capped at the number of training rows.

    Returns:
        Tuple ``(y_pred, p)``: ``y_pred`` is the 1-D array of de-normalised
        predictions and ``p`` is the coefficient vector (in the standardised
        space) with the intercept appended as its last element.
    """
    # Edge case: fewer training points than folds.
    n_cv = min(n_cv, len(train_key))

    # ---------------
    # Training
    # ---------------
    x_train = input_data[train_key, :]
    y_train = output_data[train_key]

    x_scaler = StandardScaler()
    y_scaler = StandardScaler()
    x_scaler.fit(x_train)
    y_scaler.fit(y_train.reshape(-1, 1))

    x_scaled = x_scaler.transform(x_train)
    y_scaled = y_scaler.transform(y_train.reshape(-1, 1)).reshape(-1)

    clf = LassoLarsCV(cv=n_cv, positive=True)
    try:
        clf.fit(x_scaled, y_scaled)
    except ValueError:
        # Fall back to BIC-based model selection when the CV fit is rejected.
        clf = LassoLarsIC(criterion='bic', positive=True)
        clf.fit(x_scaled, y_scaled)

    # Model parameters: coefficients followed by the intercept.
    p = np.append(clf.coef_, clf.intercept_)

    # ---------------
    # Prediction
    # ---------------
    x_test = input_data[test_key, :]
    # Edge case: a single test point comes back 1-D; restore the 2-D shape.
    if x_test.ndim == 1:
        x_test = x_test.reshape(1, -1)

    scaled_pred = clf.predict(x_scaler.transform(x_test))
    # The scaler was fitted on a column vector, so hand it a column vector
    # (the original passed a 1-D array, unlike every other scaler call here)
    # and flatten the result so callers still receive a 1-D array.
    y_pred = y_scaler.inverse_transform(
        np.asarray(scaled_pred).reshape(-1, 1)).reshape(-1)
    return y_pred, p
def LassoLarsTest(dataMat, labelMat):
    """Print the summed squared test error of LassoLars and LassoLarsCV.

    Both models are fitted on rows 0..98 and evaluated on rows 100..198.
    """
    # NOTE(review): row 99 is in neither slice -- possibly an off-by-one in
    # the intended 100/100 split; behavior is kept as-is.
    fit_rows = slice(0, 99)
    eval_rows = slice(100, 199)
    x_fit, y_fit = dataMat[fit_rows], labelMat[fit_rows]
    x_eval, y_eval = dataMat[eval_rows], labelMat[eval_rows]

    contenders = (
        ('LassoLars ', LassoLars(alpha=1, max_iter=100)),
        ('LassoLarsCV', LassoLarsCV(max_n_alphas=10, max_iter=100)),
    )
    for label, estimator in contenders:
        estimator.fit(x_fit, y_fit)
        predicted = estimator.predict(x_eval)
        print(label, ((predicted - y_eval)**2).sum())
def _LASSOLars_Regression(self):
    """Fit a 10-fold cross-validated LassoLars model on the training data.

    Reads ``self.X_train`` and ``self.y_train``, prints the selected alpha
    and returns the fitted estimator.
    """
    estimator = LassoLarsCV(cv=10)
    estimator.fit(self.X_train, self.y_train)

    # Regularisation strength chosen by cross-validation.
    chosen_alpha = estimator.alpha_
    print("The best alpha in LAR is: ", chosen_alpha)
    return estimator
def lasso(x_train, y_train, x_test):
    """Fit LassoLarsCV on the training data and return predictions for ``x_test``.

    Prints the training shapes, the fitted coefficients, the model, the alpha
    chosen by cross-validation and the shape of the prediction array.
    """
    # Earlier alternatives kept for reference:
    #   Lasso(alpha=0.01)       -- alpha tuned by hand to control the fit
    #   LassoCV(max_iter=3000)  -- alpha picked automatically (0.0295)
    # LassoLarsCV also selects the best alpha automatically.
    estimator = LassoLarsCV()
    for array in (x_train, y_train):
        print(array.shape)

    estimator.fit(x_train, y_train)  # build the linear regression model
    print('系数矩阵:\n', estimator.coef_)
    print('线性回归模型:\n', estimator)
    print('最佳的alpha:', estimator.alpha_)

    predicted = estimator.predict(x_test)
    print(predicted.shape)
    return predicted
def test():
    """Check that the resampled score bounds from ``Stats`` bracket the plain score.

    A LassoLarsCV model fitted on the generated data provides the reference
    score. A second model is fitted on the output of ``add_NFeature_to_X``
    and must score above 0.5; ``Stats.score_stat`` for that model must be a
    tuple whose first two entries enclose the reference score.
    """
    X, y = genClassificationData(n_features=10, n_strel=2, n_redundant=0)

    reference_model = LassoLarsCV(cv=5)
    normal_score = reference_model.fit(X, y).score(X, y)

    X_NF = add_NFeature_to_X(X, 1, np.random.RandomState())
    model = LassoLarsCV(cv=5)
    augmented_score = model.fit(X_NF, y).score(X_NF, y)
    assert augmented_score > 0.5

    stats = Stats(model,
                  X,
                  y,
                  n_resampling=50,
                  fpr=1e-3,
                  check_importances=False)
    bounds = stats.score_stat
    assert type(bounds) is tuple
    lower, upper = bounds[0], bounds[1]
    assert lower < normal_score < upper
def run_lasso_lars_cv(X_train, y_train, X_test, y_test):
    """Fit a 10-fold LassoLarsCV model and print its cross-validation results.

    Prints, in order: the selected alpha, the alphas evaluated during CV and
    the per-fold mean squared error for each of those alphas.

    :param X_train: training feature matrix
    :param y_train: training targets
    :param X_test: unused by this function
    :param y_test: unused by this function
    :return: None
    """
    model_lars_cv = LassoLarsCV(cv=10)
    model_lars_cv.fit(X_train, y_train)
    print(model_lars_cv.alpha_)
    print(model_lars_cv.cv_alphas_)
    # The attribute is ``mse_path_``; the original ``cv_mse_path_`` does not
    # exist on LassoLarsCV and raised AttributeError.
    print(model_lars_cv.mse_path_)
class _LassoLarsCVImpl:
    """Thin adapter that forwards ``fit``/``predict`` to a wrapped ``Op`` instance."""

    def __init__(self, **hyperparams):
        # The hyperparameters are stored and passed unchanged to the model.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when given. Returns ``self``."""
        fit_args = (X, ) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self._wrapped_model.predict(X)
def UFS(data, K, d):
    '''
    Unsupervised feature selection for multi-cluster data.

    Reference: Cai D, Zhang C, He X. Unsupervised feature selection for
    multi-cluster data. Proceedings of the 16th ACM SIGKDD international
    conference on Knowledge discovery and data mining. ACM, 2010: 333-342.

    data: N*M matrix; each row is a sample, each column a feature
    K: number of clusters (number of spectral embedding vectors used)
    d: number of features to select
    :return :
    data: the data restricted to the selected features, N*d
    seq: 1-D array with the indices of the d selected features, ordered by
         decreasing score
    '''
    data = np.array(data)
    N, M = data.shape  # number of samples, number of features
    k = int(max(0.2 * N, 10))  # number of nearest neighbours
    delta = 2  # heat-kernel parameter from the paper

    # Pairwise Euclidean distances between samples.
    dist = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            dist[i, j] = np.linalg.norm(data[i] - data[j], ord=2)

    # Symmetric heat-kernel affinities over each sample's k nearest
    # neighbours. Position 0 of the sorted distances is skipped because it
    # is the sample itself (distance 0).
    W = np.zeros((N, N))
    for i in range(N):
        neighbors = np.argsort(dist[i, :], axis=0)[1:k + 1]
        for j in neighbors:
            W[i, j] = math.exp(-math.pow(dist[i, j], 2) / delta)
            W[j, i] = W[i, j]

    # The degree matrix must be taken from the finished W. The original
    # filled D[i, i] while later rows could still add symmetric entries to
    # row i, so D did not match the row sums of the final W.
    D = np.diag(W.sum(axis=1))
    L = D - W  # graph Laplacian

    # Generalised eigenproblem L y = lambda D y (Eq. (1) in the paper).
    feature_values, vectors = scipy.linalg.eig(L, D)
    # eig returns complex arrays even for this symmetric problem; keep the
    # real parts so the regression below receives real targets.
    order = np.argsort(np.real(feature_values))
    order = order[1:K + 1]  # skip the smallest eigenvalue, take the next K
    Y = np.real(vectors[:, order])

    # Fit one L1-regularised regression (least-angle regression) per
    # embedding vector. A feature's score is its largest absolute
    # coefficient over all embedding vectors. The original wrote max(a) to
    # score[0, i], indexing by cluster rather than by feature.
    score = np.zeros(M)
    model = LassoLarsCV()
    for i in range(Y.shape[1]):
        model.fit(data, Y[:, i])
        score = np.maximum(score, np.abs(model.coef_))

    seq = np.argsort(-score)[0:d]  # d highest-scoring features
    return data[:, seq], seq
def lassolars():
    """Fit LassoLarsCV on a 90/10 split, print the hold-out MSE and dump predictions.

    Reads the module-level ``train_pca_value``, ``train_pro`` and
    ``test_pca_data``. Predictions for ``test_pca_data`` are written to
    ``data/lassolars.txt``, one value per record.
    """
    # Named ``model`` so the local no longer shadows this function's name.
    model = LassoLarsCV()
    X_train, X_test, Y_train, Y_test = train_test_split(train_pca_value,
                                                        train_pro,
                                                        test_size=0.1,
                                                        random_state=9)
    model.fit(X_train, Y_train)
    pre = model.predict(X_test)
    loss = mean_squared_error(pre, Y_test)
    print(loss)

    pre = model.predict(test_pca_data)
    # ``with`` closes the file even if a write fails.
    # NOTE(review): records end in "\r" (carriage return), not "\n" --
    # kept as in the original; confirm that this is intended.
    with open('data/lassolars.txt', 'w') as out_file:
        for value in pre:
            out_file.write("%f\r" % value)
def _lassolarscv(*, train, test, x_predict=None, metrics, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=2.220446049250313e-16, copy_X=True, positive=False): """For more info visit : https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLarsCV.html#sklearn.linear_model.LassoLarsCV """ model = LassoLarsCV(fit_intercept=fit_intercept, verbose=verbose, max_iter=max_iter, normalize=normalize, precompute=precompute, cv=cv, max_n_alphas=max_n_alphas, n_jobs=n_jobs, eps=eps, copy_X=copy_X, positive=positive) model.fit(train[0], train[1]) model_name = 'LassoLarsCV' y_hat = model.predict(test[0]) if metrics == 'mse': accuracy = _mse(test[1], y_hat) if metrics == 'rmse': accuracy = _rmse(test[1], y_hat) if metrics == 'mae': accuracy = _mae(test[1], y_hat) if x_predict is None: return (model_name, accuracy, None) y_predict = model.predict(x_predict) return (model_name, accuracy, y_predict)
def LASSO_EXAMPLE(inputfile):
    """Fit LassoLarsCV on a CSV file and print the coefficient of every feature.

    The last column of the CSV is the regression target and all preceding
    columns are features. Coefficients are printed as a one-row DataFrame
    with columns ``x1`` .. ``xN``.

    Args:
        inputfile: path of the CSV file to read.
    """
    # Modules used by this function
    import pandas as pd
    from sklearn.linear_model import LassoLarsCV

    data = pd.read_csv(inputfile)
    features = data.iloc[:, :-1]
    target = data.iloc[:, -1]

    # LASSO regression performs variable selection and complexity control
    # while fitting, dropping collinear variables. LassoLarsCV selects alpha
    # by cross-validation and accepts no ``alpha`` argument; the original
    # ``LassoLarsCV(alpha=1)`` raised TypeError.
    model = LassoLarsCV()
    model.fit(features, target)

    # One label per fitted coefficient (the original hard-coded 13 labels).
    labels = ['x%d' % i for i in range(1, len(model.coef_) + 1)]
    model_coef = pd.DataFrame([model.coef_], columns=labels)
    print(model_coef)
def polyomial():
    """Fit LassoLarsCV on degree-2 polynomial features and dump predictions.

    Reads the module-level ``train_pca_value``, ``train_pro`` and
    ``test_pca_data``. Prints the MSE on a 10% hold-out split and writes the
    predictions for ``test_pca_data`` to ``data/poly.txt``.
    """
    poly = PolynomialFeatures(degree=2)
    X_train, X_test, Y_train, Y_test = train_test_split(train_pca_value,
                                                        train_pro,
                                                        test_size=0.1,
                                                        random_state=9)
    # Fit the feature expansion on the training split only, then reuse it
    # for the other sets instead of refitting on each of them.
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    test_pca_data_poly = poly.transform(test_pca_data)

    regressor_poly = LassoLarsCV()
    regressor_poly.fit(X_train_poly, Y_train)
    pre = regressor_poly.predict(X_test_poly)
    loss = mean_squared_error(pre, Y_test)
    print(loss)

    pre = regressor_poly.predict(test_pca_data_poly)
    # ``with`` closes the file even if a write fails.
    # NOTE(review): records end in "\r" (carriage return), not "\n" --
    # kept as in the original; confirm that this is intended.
    with open('data/poly.txt', 'w') as out_file:
        for value in pre:
            out_file.write("%f\r" % value)
def select_features_lasso_lars(X, Y):
    """Select features with a fitted LassoLarsCV via ``SelectFromModel``.

    Prints the index of every selected feature, one per line.

    Returns:
        Tuple ``(new_features, values)``: ``X`` reduced to the selected
        columns, and the indices of those columns.
    """
    estimator = LassoLarsCV().fit(X, Y)
    # No explicit threshold is passed, so SelectFromModel uses its default.
    selector = SelectFromModel(estimator, prefit=True)
    values = selector.get_support(indices=True)
    for feature_index in values:
        print(feature_index)
    new_features = selector.transform(X)
    return new_features, values
def lassovar(data, lag=1, n_samples=None):
    """Score lagged parents of each variable with sequential lasso fits.

    For every target variable, LassoLarsCV is fitted one lag block at a time
    on the running residual. Regressors with a positive coefficient are kept,
    refitted by least squares, and the score matrix is scaled by the
    determinant of the residual correlation matrix when that matrix has full
    rank.

    Args:
        data: 2-D array whose transpose is indexed as (variable, time).
        lag: number of past time steps used as regressors.
        n_samples: if given, subsample this many rows without replacement.

    Returns:
        Array of shape ``(d, d * lag)`` holding the weighted coefficients.
    """
    series = data.T
    Y = series[:, lag:]
    d = Y.shape[0]
    Z = np.vstack([series[:, lag - k:-k] for k in range(1, lag + 1)])
    Y, Z = Y.T, Z.T
    if n_samples is not None:
        Y, Z = resample(Y, Z, replace=False, n_samples=n_samples)

    scores = np.zeros((d, d * lag))
    residuals = np.zeros(Y.shape)
    ls = LassoLarsCV(cv=10, n_jobs=1)

    # one variable after the other as target
    for j in range(d):
        target = np.copy(Y[:, j])
        selectedparents = np.full(d * lag, False)
        # include one lag block after the other, fitting on what the
        # previous blocks left unexplained
        for block_no in range(lag):
            block = slice(d * block_no, d * (block_no + 1))
            ls.fit(Z[:, block], target)
            selectedparents[block] = ls.coef_ > 0
            target -= ls.predict(Z[:, block])
        residuals[:, j] = np.copy(target)

        # refit on the selected regressors to get rid of the lasso bias
        ZZ = Z[:, selectedparents]
        gram = ZZ.T.dot(ZZ)
        moment = ZZ.T.dot(Y[:, j])
        scores[j, selectedparents] = np.linalg.lstsq(gram, moment,
                                                     rcond=None)[0]

    # the more uncorrelated the residuals, the higher the weight
    res = np.corrcoef(residuals.T)
    if np.linalg.matrix_rank(res) == res.shape[0]:
        weight = np.linalg.det(res)
    else:
        weight = 1
    return scores * weight
def train_lars(train_features, train_labels, num_alphas, skip_cross_validation,
               alpha, num_jobs):
    """Train a LassoLars model, optionally selecting alpha by 5-fold CV.

    Assumes the features are already scaled/normalized.

    Args:
        train_features: training feature matrix.
        train_labels: training targets.
        num_alphas: upper bound on the alphas tried during CV (capped at 2000).
        skip_cross_validation: when true, ``alpha`` is used as given.
        alpha: regularisation strength used when cross-validation is skipped.
        num_jobs: number of parallel jobs for the CV fit.

    Returns:
        Tuple ``(model, {'alpha': best_alpha})`` with the fitted ``LassoLars``
        model and the alpha it was trained with.
    """
    best_alpha = alpha
    max_iter = 10000
    if not skip_cross_validation:
        # use 5 fold cross validation
        cv_model = LassoLarsCV(max_iter=max_iter,
                               cv=5,
                               max_n_alphas=min(num_alphas, 2000),
                               n_jobs=num_jobs,
                               normalize=False)
        cv_model.fit(train_features, train_labels)
        best_alpha = cv_model.alpha_

    # Train the final model with the selected alpha. The original passed the
    # input ``alpha`` here, discarding the cross-validation result while
    # still reporting ``best_alpha`` to the caller.
    model = LassoLars(alpha=best_alpha, normalize=False, max_iter=max_iter)
    model.fit(train_features, train_labels)
    return (model, {'alpha': best_alpha})
def lassovar(data, maxlags=1, n_samples=None, cv=5):
    """Score lagged parents of each variable with sequential lasso fits.

    For every target variable, LassoLarsCV is fitted one lag block at a time
    on the running residual. Regressors with a positive coefficient are then
    refitted by least squares.

    Args:
        data: 2-D array whose transpose is indexed as (variable, time).
        maxlags: number of past time steps used as regressors.
        n_samples: if given, subsample this many rows via ``resample``.
        cv: cross-validation setting forwarded to ``LassoLarsCV``.

    Returns:
        Array of shape ``(d, d * maxlags)`` holding the refitted coefficients.
    """
    # Stack data to perform regression of present on past values
    transposed = data.T
    Y = transposed[:, maxlags:]
    d = Y.shape[0]
    past_blocks = []
    for k in range(1, maxlags + 1):
        past_blocks.append(transposed[:, maxlags - k:-k])
    Z = np.vstack(past_blocks)
    Y, Z = Y.T, Z.T

    # Subsample data
    if n_samples is not None:
        Y, Z = resample(Y, Z, n_samples=n_samples)

    n_regressors = d * maxlags
    scores = np.zeros((d, n_regressors))
    ls = LassoLarsCV(cv=cv, n_jobs=1)
    residuals = np.zeros(Y.shape)

    # Consider one variable after the other as target
    for j in range(d):
        target = np.copy(Y[:, j])
        selectedparents = np.full(n_regressors, False)
        # Include one lag after the other
        for lag_no in range(1, maxlags + 1):
            lo, hi = d * (lag_no - 1), d * lag_no
            lag_block = Z[:, lo:hi]
            ls.fit(lag_block, target)
            selectedparents[lo:hi] = ls.coef_ > 0
            target -= ls.predict(lag_block)
        residuals[:, j] = np.copy(target)

        # Refit OLS using the selected variables to get rid of the bias
        ZZ = Z[:, selectedparents]
        B = np.linalg.lstsq(ZZ.T.dot(ZZ), ZZ.T.dot(Y[:, j]), rcond=None)[0]
        scores[j, selectedparents] = B
    return scores
#encoding=utf-8
# Script: select features with LassoLarsCV, then fit XGBoost on the
# selected columns of train2.csv and apply the same selection to test2.csv.
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoLarsCV
from xgboost import XGBRegressor

data = pd.read_csv('train2.csv', index_col='id')

# Columns 0..51 are the features, column 52 is the target.
# ``.values`` replaces ``DataFrame.as_matrix()``, which was removed in
# pandas 1.0; it returns the same ndarray.
x = data.iloc[:, 0:52].values
y = data.iloc[:, 52].values

ll = LassoLarsCV()
ll.fit(x, y)
a = ll.coef_
# Keep only the features whose lasso coefficient is non-zero.
b = [i != 0 for i in a]
data1 = data.iloc[:, 0:52]
data1 = data1[data1.columns[b]]
x = data1.iloc[:, :].values

reg = XGBRegressor()
reg.fit(x, y)
y_pred = reg.predict(x)
print('有限特征%s' % data1.columns)

# NOTE(review): the same mask is applied by position to the test columns,
# which assumes test2.csv has the 52 feature columns in the same order.
testdata = pd.read_csv('test2.csv', index_col='id')
testdata = testdata[testdata.columns[b]].values
RMSE(lassolarscv, X_train, Y_train) #0.1154 # In[ ]: RMSE(elasticnetcv, X_train, Y_train) #0.1140 # What the hell ! Who could bielieve that a simple linear regression model performs so well ?! (Im behaving like i just discovered that : WOOOOAAAWW :p) # # The 4 models have very similar scores except for the ridge, if we average their predictions we could probably slightly reduce the error ! # In[ ]: lassocv.fit(X_train, Y_train) ridge.fit(X_train, Y_train) lassolarscv.fit(X_train, Y_train) elasticnetcv.fit(X_train, Y_train) # #### 6.1.2 Features coefficients # In[ ]: print("LassoCV regression has conserved %d features over %d" % (len(features[lassocv.coef_ != 0]), X_train.shape[1])) print("Ridge regression has conserved %d features over %d" % (len(features[ridge.coef_ != 0]), X_train.shape[1])) print("LassoLarsCV regression has conserved %d features over %d" % (len(features[lassolarscv.coef_ != 0]), X_train.shape[1])) print("ElasticNetCV regression has conserved %d features over %d" % (len(features[elasticnetcv.coef_ != 0]), X_train.shape[1]))
class FEW(SurvivalMixin, VariationMixin, EvaluationMixin, BaseEstimator): """FEW uses GP to find a set of transformations from the original feature space that produces the best performance for a given machine learner. """ update_checked = False def __init__(self, population_size=50, generations=100, mutation_rate=0.5, crossover_rate=0.5, ml=None, min_depth=1, max_depth=2, max_depth_init=2, sel='epsilon_lexicase', tourn_size=2, fit_choice=None, op_weight=False, seed_with_ml=True, erc=False, random_state=np.random.randint(9999999), verbosity=0, scoring_function=None, disable_update_check=False, elitism=True, boolean=False, classification=False, clean=False, track_diversity=False, mdr=False, otype='f'): # sets up GP. # Save params to be recalled later by get_params() self.params = locals( ) # Must be placed before any local variable definitions self.params.pop('self') # # Do not prompt the user to update during this session if they ever disabled the update check # if disable_update_check: # FEW.update_checked = True # # # Prompt the user if their version is out of date # if not disable_update_check and not FEW.update_checked: # update_check('FEW', __version__) # FEW.update_checked = True self._best_estimator = None self._training_features = None self._training_labels = None self._best_inds = None self.population_size = population_size self.generations = generations self.mutation_rate = mutation_rate self.crossover_rate = crossover_rate self.min_depth = min_depth self.max_depth = max_depth self.max_depth_init = max_depth_init self.sel = sel self.tourn_size = tourn_size self.fit_choice = fit_choice self.op_weight = op_weight self.seed_with_ml = seed_with_ml self.erc = erc self.random_state = random_state self.verbosity = verbosity self.scoring_function = scoring_function self.gp_generation = 0 self.elitism = elitism self.max_fit = 99999999.666 self.boolean = boolean self.classification = classification self.clean = clean self.ml = ml self.track_diversity = 
track_diversity self.mdr = mdr self.otype = otype # if otype is b, boolean functions must be turned on if self.otype == 'b': self.boolean = True # instantiate sklearn estimator according to specified machine learner if self.ml is None: if self.classification: self.ml = LogisticRegression(solver='sag') else: self.ml = LassoLarsCV() if not self.scoring_function: if self.classification: self.scoring_function = accuracy_score else: self.scoring_function = r2_score # set default fitness metrics for various learners if not self.fit_choice: self.fit_choice = { #regression type(LassoLarsCV()): 'mse', type(SVR()): 'mae', type(LinearSVR()): 'mae', type(KNeighborsRegressor()): 'mse', type(DecisionTreeRegressor()): 'mse', type(RandomForestRegressor()): 'mse', #classification type(SGDClassifier()): 'r2', type(LogisticRegression()): 'r2', type(SVC()): 'r2', type(LinearSVC()): 'r2', type(RandomForestClassifier()): 'r2', type(DecisionTreeClassifier()): 'r2', type(DistanceClassifier()): 'silhouette', type(KNeighborsClassifier()): 'r2', }[type(self.ml)] # Columns to always ignore when in an operator self.non_feature_columns = ['label', 'group', 'guess'] # function set self.func_set = [ node('+'), node('-'), node('*'), node('/'), node('sin'), node('cos'), node('exp'), node('log'), node('^2'), node('^3'), node('sqrt') ] # if boolean operators are included but the output type is set to float, then # # include the if and if-else operations that allow use of both stacks # if self.boolean and self.otype=='f': # self.func_set += [ # {'name:','if','arity':2,'in_type':} # ] # terminal set self.term_set = [] # diversity self.diversity = [] #@profile def fit(self, features, labels): """Fit model to data""" np.random.seed(self.random_state) # setup data # imputation if self.clean: features = self.impute_data(features) # Train-test split routine for internal validation #### train_val_data = pd.DataFrame(data=features) train_val_data['labels'] = labels # print("train val 
data:",train_val_data[::10]) new_col_names = {} for column in train_val_data.columns.values: if type(column) != str: new_col_names[column] = str(column).zfill(10) train_val_data.rename(columns=new_col_names, inplace=True) # internal training/validation split train_i, val_i = train_test_split(train_val_data.index, stratify=None, train_size=0.75, test_size=0.25) x_t = train_val_data.loc[train_i].drop('labels', axis=1).values x_v = train_val_data.loc[val_i].drop('labels', axis=1).values y_t = train_val_data.loc[train_i, 'labels'].values y_v = train_val_data.loc[val_i, 'labels'].values # Store the training features and classes for later use self._training_features = x_t self._training_labels = y_t #### # set population size if type(self.population_size) is str: if 'x' in self.population_size: # self.population_size = int( float(self.population_size[:-1]) * features.shape[1]) else: self.population_size = int(self.population_size) if self.verbosity > 0: print("population size:", self.population_size) # print few settings if self.verbosity > 1: for arg in self.get_params(): print('{}\t=\t{}'.format(arg, self.get_params()[arg])) print('') # initial model initial_estimator = copy.deepcopy(self.ml.fit(x_t, y_t)) # self._best_estimator = copy.deepcopy(self.ml.fit(x_t,y_t)) self._best_score = self.ml.score(x_v, y_v) initial_score = self._best_score if self.verbosity > 2: print("initial estimator size:", self.ml.coef_.shape) if self.verbosity > 0: print("initial ML CV: {:1.3f}".format(self._best_score)) # create terminal set for i in np.arange(x_t.shape[1]): # dictionary of node name, arity, feature column index, output type and input type self.term_set.append(node('x', loc=i)) # features # add ephemeral random constants if flag if self.erc: self.term_set.append(node( 'k', value=np.random.rand())) # ephemeral random constants # edit function set if boolean if self.boolean or self.otype == 'b': # include boolean functions self.func_set += [ node('!'), node('&'), node('|'), 
node('=='), node('>_f'), node('<_f'), node('>=_f'), node('<=_f'), node('>_b'), node('<_b'), node('>=_b'), node('<=_b'), node('xor_b'), node('xor_f') ] # add mdr if specified if self.mdr: self.func_set += [node('mdr2')] # Create initial population # for now, force seed_with_ml to be off if otype is 'b', since data types` # are assumed to be float if self.otype == 'b': self.seed_with_ml = False pop = self.init_pop(self._training_features.shape[0]) # check that uuids are unique in population uuids = [p.id for p in pop.individuals] if len(uuids) != len(set(uuids)): pdb.set_trace() # Evaluate the entire population # X represents a matrix of the population outputs (number os samples x population size) # single thread pop.X = self.transform(x_t, pop.individuals, y_t).transpose() # parallel: # pop.X = np.asarray(Parallel(n_jobs=-1)(delayed(out)(I,x_t,self.otype,y_t) for I in pop.individuals), order = 'F') # calculate fitness of individuals # fitnesses = list(map(lambda I: fitness(I,y_t,self.ml),pop.X)) fitnesses = self.calc_fitness(pop.X, y_t, self.fit_choice, self.sel) # max_fit = self.max_fit # while len([np.mean(f) for f in fitnesses if np.mean(f) < max_fit and np.mean(f)>=0])<self.population_size and max_count < 100: # pop = self.init_pop() # pop.X = self.transform(x_t,pop.individuals,y_t) # fitnesses = self.calc_fitness(pop.X,y_t,self.fit_choice,self.sel) # # max_count+= 1 # print("fitnesses:",fitnesses) # Assign fitnesses to inidividuals in population for ind, fit in zip(pop.individuals, fitnesses): if isinstance( fit, (list, np.ndarray)): # calc_fitness returned raw fitness values fit[fit < 0] = self.max_fit fit[np.isnan(fit)] = self.max_fit fit[np.isinf(fit)] = self.max_fit ind.fitness_vec = fit ind.fitness = np.mean(ind.fitness_vec) else: ind.fitness = np.nanmin([fit, self.max_fit]) #with Parallel(n_jobs=10) as parallel: #################### ### Main GP loop self.diversity = [] # progress bar pbar = tqdm(total=self.generations, disable=self.verbosity == 0, 
desc='Internal CV: {:1.3f}'.format(self._best_score)) # for each generation g for g in np.arange(self.generations): if self.track_diversity: self.get_diversity(pop.X) if self.verbosity > 1: print(".", end='') if self.verbosity > 1: print(str(g) + ".)", end='') # if self.verbosity > 1: print("population:",stacks_2_eqns(pop.individuals)) if self.verbosity > 2: print("pop fitnesses:", ["%0.2f" % x.fitness for x in pop.individuals]) if self.verbosity > 1: print("median fitness pop: %0.2f" % np.median([x.fitness for x in pop.individuals])) if self.verbosity > 1: print("best fitness pop: %0.2f" % np.min([x.fitness for x in pop.individuals])) if self.verbosity > 1 and self.track_diversity: print("feature diversity: %0.2f" % self.diversity[-1]) if self.verbosity > 1: print("ml fitting...") # fit ml model with warnings.catch_warnings(): warnings.simplefilter("ignore") try: # if len(self.valid_loc(pop.individuals)) > 0: if self.valid(pop.individuals): self.ml.fit( pop.X[self.valid_loc(pop.individuals), :]. transpose(), y_t) # else: # self.ml.fit(pop.X.transpose(),y_t) except ValueError as detail: # pdb.set_trace() print( "warning: ValueError in ml fit. 
X.shape:", pop.X[:, self.valid_loc(pop.individuals)].transpose( ).shape, "y_t shape:", y_t.shape) print( "First ten entries X:", pop.X[self.valid_loc(pop.individuals), :].transpose() [:10]) print("First ten entries y_t:", y_t[:10]) print("equations:", stacks_2_eqns(pop.individuals)) print("FEW parameters:", self.get_params()) if self.verbosity > 1: print("---\ndetailed error message:", detail) raise (ValueError) # if self.verbosity > 1: print("number of non-zero regressors:",self.ml.coef_.shape[0]) # keep best model tmp_score = 0 try: # if len(self.valid_loc(pop.individuals)) > 0: if self.valid(pop.individuals): tmp_score = self.ml.score( self.transform( x_v, pop.individuals)[:, self.valid_loc(pop.individuals)], y_v) # else: # tmp_score = 0 # tmp = self.ml.score(self.transform(x_v,pop.individuals),y_v) except Exception as detail: if self.verbosity > 1: print(detail) if self.verbosity > 1: print("current ml validation score:", tmp_score) if self.valid(pop.individuals) and tmp_score > self._best_score: self._best_estimator = copy.deepcopy(self.ml) self._best_score = tmp_score self._best_inds = copy.deepcopy(self.valid(pop.individuals)) if self.verbosity > 1: print("updated best internal validation score:", self._best_score) # Variation if self.verbosity > 2: print("variation...") offspring, elite, elite_index = self.variation(pop.individuals) # evaluate offspring if self.verbosity > 2: print("output...") X_offspring = self.transform(x_t, offspring).transpose() #parallel: # X_offspring = np.asarray(Parallel(n_jobs=-1)(delayed(out)(O,x_t,y_t,self.otype) for O in offspring), order = 'F') if self.verbosity > 2: print("fitness...") F_offspring = self.calc_fitness(X_offspring, y_t, self.fit_choice, self.sel) # F_offspring = parallel(delayed(f[self.fit_choice])(y_t,yhat) for yhat in X_offspring) # print("fitnesses:",fitnesses) # Assign fitnesses to inidividuals in population for ind, fit in zip(offspring, F_offspring): if isinstance( fit, (list, np.ndarray)): # calc_fitness 
returned raw fitness values fit[fit < 0] = self.max_fit fit[np.isnan(fit)] = self.max_fit fit[np.isinf(fit)] = self.max_fit ind.fitness_vec = fit ind.fitness = np.mean(ind.fitness_vec) else: ind.fitness = np.nanmin([fit, self.max_fit]) # Survival if self.verbosity > 2: print("survival..") survivors, survivor_index = self.survival(pop.individuals, offspring, elite, elite_index) pop.individuals[:] = survivors pop.X = np.vstack((pop.X, X_offspring))[survivor_index, :] if self.verbosity > 2: print("median fitness survivors: %0.2f" % np.median([x.fitness for x in pop.individuals])) if self.verbosity > 2: print( "best features:", stacks_2_eqns(self._best_inds) if self._best_inds else 'original') pbar.set_description('Internal CV: {:1.3f}'.format( self._best_score)) pbar.update(1) # end of main GP loop #################### if self.verbosity > 0: print('finished. best internal val score: {:1.3f}'.format( self._best_score)) if self.verbosity > 0: print("final model:\n", self.print_model()) if not self._best_estimator: self._best_estimator = initial_estimator return self def transform(self, x, inds=None, labels=None): """return a transformation of x using population outputs""" if inds: # return np.asarray(Parallel(n_jobs=10)(delayed(self.out)(I,x,labels,self.otype) for I in inds)).transpose() return np.asarray([ self.out(I, x, labels, self.otype) for I in inds ]).transpose() else: # return np.asarray(Parallel(n_jobs=10)(delayed(self.out)(I,x,labels,self.otype) for I in self._best_inds)).transpose() return np.asarray([ self.out(I, x, labels, self.otype) for I in self._best_inds ]).transpose() def impute_data(self, x): """Imputes data set containing Nan values""" imp = Imputer(missing_values='NaN', strategy='mean', axis=0) return imp.fit_transform(x) def clean(self, x): """remove nan and inf rows from x""" return x[~np.any(np.isnan(x) | np.isinf(x), axis=1)] def clean_with_zeros(self, x): """ set nan and inf rows from x to zero""" x[~np.any(np.isnan(x) | np.isinf(x), axis=1)] 
= 0 return x def predict(self, testing_features): """predict on a holdout data set.""" # print("best_inds:",self._best_inds) # print("best estimator size:",self._best_estimator.coef_.shape) if self.clean: testing_features = self.impute_data(testing_features) if self._best_inds: X_transform = self.transform(testing_features) try: return self._best_estimator.predict( self.transform(testing_features)) except ValueError as detail: pdb.set_trace() print('shape of X:', testing_features.shape) print('shape of X_transform:', X_transform.transpose().shape) print('best inds:', stacks_2_eqns(self._best_inds)) print('valid locs:', self.valid_loc(self._best_inds)) raise ValueError(detail) else: return self._best_estimator.predict(testing_features) def fit_predict(self, features, labels): """Convenience function that fits a pipeline then predicts on the provided features Parameters ---------- features: array-like {n_samples, n_features} Feature matrix labels: array-like {n_samples} List of class labels for prediction Returns ---------- array-like: {n_samples} Predicted labels for the provided features """ self.fit(features, labels) return self.predict(features) def score(self, testing_features, testing_labels): """estimates accuracy on testing set""" # print("test features shape:",testing_features.shape) # print("testing labels shape:",testing_labels.shape) yhat = self.predict(testing_features) return self.scoring_function(testing_labels, yhat) def export(self, output_file_name): """exports engineered features Parameters ---------- output_file_name: string String containing the path and file name of the desired output file Returns ------- None """ if self._best_estimator is None: raise ValueError( 'A model has not been optimized. 
Please call fit() first.') # Write print_model() to file with open(output_file_name, 'w') as output_file: output_file.write(self.print_model()) # if decision tree, print tree into dot file if 'DecisionTree' in type(self.ml).__name__: export_graphviz(self._best_estimator, out_file=output_file_name + '.dot', feature_names=stacks_2_eqns(self._best_inds) if self._best_inds else None, class_names=['True', 'False'], filled=False, impurity=True, rotate=True) def init_pop(self, num_features=1): """initializes population of features as GP stacks.""" pop = Pop(self.population_size, num_features) # make programs if self.seed_with_ml: # initial population is the components of the default ml model if type(self.ml) == type(LassoLarsCV()): # add all model components with non-zero coefficients for i, (c, p) in enumerate( it.zip_longest([c for c in self.ml.coef_ if c != 0], pop.individuals, fillvalue=None)): if c is not None and p is not None: p.stack = [node('x', loc=i)] elif p is not None: # make program if pop is bigger than model componennts make_program( p.stack, self.func_set, self.term_set, np.random.randint(self.min_depth, self.max_depth + 1), self.otype) p.stack = list(reversed(p.stack)) else: # seed with raw features # if list(self.ml.coef_): #pdb.set_trace() try: if self.population_size < self.ml.coef_.shape[0]: # seed pop with highest coefficients coef_order = np.argsort(self.ml.coef_[::-1]) for i, (c, p) in enumerate( zip(coef_order, pop.individuals)): p.stack = [node('x', loc=i)] else: raise (AttributeError) except Exception: # seed pop with raw features for i, p in it.zip_longest(range( self._training_features.shape[1]), pop.individuals, fillvalue=None): if p is not None: if i is not None: p.stack = [node('x', loc=i)] else: make_program( p.stack, self.func_set, self.term_set, np.random.randint(self.min_depth, self.max_depth + 1), self.otype) p.stack = list(reversed(p.stack)) # print initial population if self.verbosity > 2: print("seeded initial population:", 
stacks_2_eqns(pop.individuals)) else: for I in pop.individuals: depth = np.random.randint(self.min_depth, self.max_depth + 1) # print("hex(id(I)):",hex(id(I))) # depth = 2; # print("initial I.stack:",I.stack) make_program(I.stack, self.func_set, self.term_set, depth, self.otype) # print(I.stack) I.stack = list(reversed(I.stack)) # print(I.stack) return pop def print_model(self, sep='\n'): """prints model contained in best inds, if ml has a coefficient property. otherwise, prints the features generated by FEW.""" model = '' if self._best_inds: if type(self.ml).__name__ != 'SVC' and type( self.ml).__name__ != 'SVR': # this is need because svm has a bug that throws valueerror on attribute check: if hasattr(self.ml, 'coef_'): if self._best_estimator.coef_.shape[0] == 1 or len( self._best_estimator.coef_.shape) == 1: if self._best_estimator.coef_.shape[0] == 1: s = np.argsort( np.abs(self._best_estimator.coef_[0]))[::-1] scoef = self._best_estimator.coef_[0][s] else: s = np.argsort(np.abs( self._best_estimator.coef_))[::-1] scoef = self._best_estimator.coef_[s] bi = [self._best_inds[k] for k in s] model = (' +' + sep).join([ str(round(c, 3)) + '*' + stack_2_eqn(f) for i, (f, c) in enumerate(zip(bi, scoef)) if round(scoef[i], 3) != 0 ]) else: # more than one decision function is fit. print all. 
for j, coef in enumerate(self._best_estimator.coef_): s = np.argsort(np.abs(coef))[::-1] scoef = coef[s] bi = [self._best_inds[k] for k in s] model += sep + 'class' + str( j) + ' :' + ' + '.join([ str(round(c, 3)) + '*' + stack_2_eqn(f) for i, (f, c) in enumerate(zip(bi, coef)) if coef[i] != 0 ]) elif hasattr(self._best_estimator, 'feature_importances_'): s = np.argsort( self._best_estimator.feature_importances_)[::-1] sfi = self._best_estimator.feature_importances_[s] bi = [self._best_inds[k] for k in s] model = 'importance : feature\n' model += sep.join([ str(round(c, 3)) + '\t:\t' + stack_2_eqn(f) for i, (f, c) in enumerate(zip(bi, sfi)) if round(sfi[i], 3) != 0 ]) else: return stacks_2_eqns(self._best_inds) else: return stacks_2_eqns(self._best_inds) else: return 'original features' return model def representation(self): """return stacks_2_eqns output""" return stacks_2_eqns(self._best_inds) def valid_loc(self, individuals): """returns the indices of individuals with valid fitness.""" return [ index for index, i in enumerate(individuals) if i.fitness < self.max_fit and i.fitness >= 0 ] def valid(self, individuals): """returns the sublist of individuals with valid fitness.""" return [ i for i in individuals if i.fitness < self.max_fit and i.fitness >= 0 ] def get_params(self, deep=None): """Get parameters for this estimator This function is necessary for FEW to work as a drop-in feature constructor in, e.g., sklearn.model_selection.cross_val_score Parameters ---------- deep: unused Only implemented to maintain interface for sklearn Returns ------- params: mapping of string to any Parameter names mapped to their values """ return self.params def get_diversity(self, X): """compute mean diversity of individual outputs""" # diversity in terms of cosine distances between features feature_correlations = np.zeros(X.shape[0] - 1) for i in np.arange(1, X.shape[0] - 1): feature_correlations[i] = max(0.0, r2_score(X[0], X[i])) # pdb.set_trace() self.diversity.append(1 - 
np.mean(feature_correlations))
# (row, column) of the pixel that is outlined on every receptive-field image
marked_pixel = (4, 2)

from matplotlib import gridspec
from matplotlib.patches import Rectangle

fig = plt.figure(figsize=(12, 8))
fig.suptitle('Receptive fields of the marked voxels', fontsize=25)

# GridSpec allows us to do subplots with more control of the spacing
gs1 = gridspec.GridSpec(2, 3)

# we fit the Lasso for each of the three voxels of the upper row
for i, index in enumerate([1780, 1951, 2131]):
    ax = plt.subplot(gs1[0, i])
    # we reshape the coefficients into the form of the original images
    rf = lasso.fit(stimuli, fmri_data[:, index]).coef_.reshape((10, 10))
    # add a black background
    ax.imshow(np.zeros_like(rf), vmin=0., vmax=1., cmap='gray')
    # coefficients below 0.1 are masked out so the background shows through;
    # each voxel gets its own colormap
    ax_im = ax.imshow(np.ma.masked_less(rf, 0.1), interpolation="nearest",
                      cmap=['Blues', 'Greens', 'Reds'][i], vmin=0., vmax=0.75)
    # add the marked pixel (Rectangle takes x = column, y = row, hence the
    # swapped indices; -.5 puts the corner at the pixel's edge)
    ax.add_patch(Rectangle(
        (marked_pixel[1] - .5, marked_pixel[0] - .5), 1, 1,
        facecolor='none', edgecolor='r', lw=4))
    plt.axis('off')
    plt.colorbar(ax_im, ax=ax)

# and then for the voxel at the bottom
gs1.update(left=0., right=1., wspace=0.1)
ax = plt.subplot(gs1[1, 1])
parser.add_argument("--lat", help="Training Latitude", type=float)
parser.add_argument("--lon", help="Training Longitude", type=float)
args = parser.parse_args()

train_data = load_data.load_supervised(1950, 1985, args.lat, args.lon, 50,
                                       which='train')
test_data = load_data.load_supervised(1986, 1999, args.lat, args.lon, 50,
                                      which='test')

# The fitted model is cached per (lat, lon) next to this script.
lasso_file = os.path.join(os.path.dirname(__file__),
                          "models/lasso_%2.2f_%2.2f.pkl" % (args.lat, args.lon))
if os.path.exists(lasso_file):
    print("Reading Lasso from file")
    # pickles are binary data: open in 'rb'/'wb' and close the handle
    # SECURITY: pickle.load executes arbitrary code; only load trusted files.
    with open(lasso_file, 'rb') as fh:
        L = pickle.load(fh)
else:
    print("Fitting Lasso")
    L = LassoLarsCV(cv=5)
    L.fit(train_data.X, train_data.y[:,0])
    with open(lasso_file, 'wb') as fh:
        pickle.dump(L, fh)

## Print Fit stats
# Single-argument print(...) with %-formatting gives the same output under
# Python 2 and Python 3.
print("Alpha %s" % (L.alpha_,))
yhat_train = L.predict(train_data.X)  # predicted once, used by both stats
print("Training Pearson Corr: %s" % (pearsonr(train_data.y[:,0], yhat_train),))
print("Training Spearman Corr: %s" % (spearmanr(train_data.y[:,0], yhat_train),))

yhat = L.predict(test_data.X)
print("Pearson Corr %s" % (pearsonr(test_data.y[:,0], yhat),))
print("Spearman Corr %s" % (spearmanr(test_data.y[:,0], yhat),))
print("SSE %s" % (sum((yhat - test_data.y[:,0])**2),))

## Compute monthly data
block = delay * num_filter chan = num / block f = (num % block) / delay t = (num % block) % delay return (chan, f, t) if __name__ == "__main__": os.chdir(os.path.dirname(__file__)) subj = 'sub1' finger = 1 with h5py.File('ECoG_data.h5', 'r+') as f: u = f[subj]['unmixing_matrix'][:] X = f[subj]['train_data'][:] X -= X.mean(0) X = X.dot(u) Y = f[subj]['cleaned_train_dg'][:] X1, y1, _ = preprocessing(X, Y[:, finger]) ls = LassoLarsCV() ls.fit(X1, y1[:, 0]) pickle.dump(ls, open('linear_model_'+subj+'_'+str(finger), 'wb')) channel_count = Counter([num2info(c)[0] for c in ls.coef_.nonzero()[0]]) X2, _, yb = preprocessing(X[:, list(set(channel_count.keys()))], Y[:, finger]) ls2 = LogisticRegressionCV() ls2.fit(X2, yb[:, 0]) pickle.dump(ls2, open('logistic_model_'+subj+'_'+str(finger), 'wb')) with h5py.File('selected_channel.h5', 'w') as f: f.create_dataset('selected_channel', data=list(set(channel_count.keys())))
pl.savefig(os.path.join('miyawaki', 'encoding_scores.eps'))
pl.clf()

### Compute receptive fields
from sklearn.linear_model import LassoLarsCV

# a single estimator instance is refit for every voxel in the loop below
lasso = LassoLarsCV(max_iter=10, )

p = (4, 2)
# Mask for chosen pixel: a 10x10 boolean image that is True only at p
pixmask = np.zeros((10, 10), dtype=bool)
pixmask[p] = 1

for index in [1780, 1951, 2131, 1935]:
    # regress column `index` of X_train on y_train and view the fitted
    # coefficients as a 10x10 image
    rf = lasso.fit(y_train, X_train[:, index]).coef_.reshape(10, 10)
    pl.figure(figsize=(8, 8))
    # Black background
    pl.imshow(np.zeros_like(rf), vmin=0., vmax=1., cmap='gray')
    # exactly-zero coefficients are masked so the background shows through
    pl.imshow(np.ma.masked_equal(rf, 0.), vmin=0., vmax=0.75,
              interpolation="nearest", cmap=cm.bluegreen)
    plot_lines(pixmask, linewidth=6, color='r')
    pl.axis('off')
    pl.subplots_adjust(left=0., right=1., bottom=0., top=1.)
    # one PDF and one EPS per voxel
    pl.savefig(os.path.join('miyawaki', 'encoding_%d.pdf' % index))
    pl.savefig(os.path.join('miyawaki', 'encoding_%d.eps' % index))
    pl.clf()
def lasso(X, y, value):
    """Fit a 10-fold cross-validated LassoLars on (X, y) and predict `value`.

    The estimator is created with ``precompute=False`` and is discarded after
    the prediction; only the predicted values are returned.
    """
    model = LassoLarsCV(cv=10, precompute=False)
    return model.fit(X, y).predict(value)
y_trainset_001.append(1.0) num_2 += 1 print num_1, num_2 classify_model_001 = RandomForestClassifier(n_estimators=55, random_state=1) classify_model_001.fit(X_trainset_001, y_trainset_001) ### 构建0.003的回归模型 from sklearn.linear_model import LassoLarsCV, BayesianRidge X_trainset_0003 = [] y_trainset_0003 = [] for i in range(0, y_trainset.__len__(), 1): if y_trainset[i] < 0.003: X_trainset_0003.append(X_trainset[i]) y_trainset_0003.append(y_trainset[i]) reg_0003 = LassoLarsCV(max_n_alphas=100, positive=True) reg_0003.fit(X_trainset_0003, y_trainset_0003) ### 构建0.003-0.01的回归模型 from sklearn.linear_model import LassoLarsCV X_trainset_001 = [] y_trainset_001 = [] for i in range(0, y_trainset.__len__(), 1): if y_trainset[i] >= 0.003 and y_trainset[i] < 0.015: X_trainset_001.append(X_trainset[i]) y_trainset_001.append(y_trainset[i]) reg_001 = LassoLarsCV(max_n_alphas=100, cv=10) reg_001.fit(X_trainset_001, y_trainset_001) ### 构建大于0.01的回归模型 from sklearn.linear_model import BayesianRidge, RANSACRegressor, RidgeCV, Ridge, LassoLarsCV X_trainset_1 = []
def lasso_train(groups, varname='valence', arrayname='norm', alpha=None,
                use_lars=True, fit_intercept=True, normalize=True,
                cv_folds=None, cv_repeats=None, skip_cv=False,
                xmin=-np.inf, xmax=np.inf, _larch=None, **kws):
    """use a list of data groups to train a Lasso/LassoLars model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      alpha       alpha parameter for LassoLars (See Note 5) [None]
      use_lars    bool to use LassoLars instead of Lasso [True]
      cv_folds    None or number of Cross-Validation folds (See Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (See Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]

    Any extra keyword arguments are passed to the model constructors.

    Returns
    -------
      group with trained LassoLars model, to be used with lasso_predict

    Raises
    ------
      ValueError if a group has no attribute named `varname` (or it is None)

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  all groups must have an attribute (scalar value) for `varname`
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation:  if cv_folds is None, sqrt(len(groups)) will be
         used (rounded to integer).  if cv_repeats is None,
         sqrt(len(groups))-1 will be used (rounded).
     5.  alpha is the regularization parameter. if alpha is None it will
         be set using LassoLarsCV
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)

    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            # was `raise Value(...)`, which itself failed with a NameError
            raise ValueError("group '%s' does not have attribute '%s'"
                             % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws.update(dict(fit_intercept=fit_intercept, normalize=normalize))
    creator = LassoLars if use_lars else Lasso
    model = None

    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        if alpha is None:
            # max_n_alphas / max_iter are integer parameters; pass ints
            # rather than the float 1e7
            lcvmod = LassoLarsCV(cv=cv, max_n_alphas=int(1e7),
                                 max_iter=int(1e7), eps=1.e-12, **kws)
            lcvmod.fit(spectra, ydat)
            alpha = lcvmod.alpha_

        # out-of-fold residuals over every repeated split give rmse_cv
        model = creator(alpha=alpha, **kws)
        resid = []
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    if alpha is None:
        # only reached with skip_cv=True and no alpha given.
        # NOTE(review): relies on the fitted estimator exposing `alpha_` --
        # confirm that plain LassoLars / Lasso provide it.
        cvmod = creator(**kws)
        cvmod.fit(spectra, ydat)
        alpha = cvmod.alpha_

    if model is None:
        model = creator(alpha=alpha, **kws)

    # final fit without cross-validation
    model.fit(spectra, ydat)

    ypred = model.predict(spectra)

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    # `active_` is reported when the fitted model has it; None otherwise, so
    # that use_lars=False does not fail on a missing attribute here.
    active = getattr(model, 'active_', None)

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 alpha=alpha, active=active, coefs=model.coef_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats,
                 rmse_cv=rmse_cv, rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, fit_intercept=fit_intercept,
                 normalize=normalize, groupnames=groupnames, keywords=kws)
ivars2 = []
depvars = []
columns = []

# Training rows: one row per player-year, one column per projection system;
# the dependent variable is that player-year's 'actual' value.
for pyear in player_years:
    ivars.append([pt_projs[pyear][system] for system in proj_systems])
    depvars.append(pt_actuals[pyear]['actual'])

# Rows to be predicted, with columns in the same system order
for pyear in pt_projs_curr.keys():
    ivars2.append([pt_projs_curr[pyear][system] for system in proj_systems])

x = numpy.array(ivars)
x2 = numpy.array(ivars2)
y = numpy.array(depvars)

model_pt = LassoLarsCV(cv=cv_num)
model_pt.fit(x,y)

# Report one coefficient per projection system, then the intercept
print("Rough PT model, to choose sample")
for system, coef in zip(proj_systems, model_pt.coef_):
    print("%40s : %f" % (system, coef))
print("%40s : %f" % ('intercept', model_pt.intercept_))

sample_proj_pt_arr = model_pt.predict(x)
curr_proj_pt_arr = model_pt.predict(x2)

# Map each key back to its prediction (same iteration order as the rows
# were built in above)
sample_proj_pt = dict(zip(player_years,sample_proj_pt_arr))
curr_proj_pt = dict(zip(pt_projs_curr.keys(),curr_proj_pt_arr))

models = {}
final_projs = {}
y = sc_y.fit_transform(input_data['label'].values.reshape(-1,1)) else: X = input_data.drop('label', axis=1).values.astype(float) y = input_data['label'].values sss = ShuffleSplit(n_splits=50,train_size=0.7,test_size=0.3,random_state=63) for i,(train,test) in enumerate(sss.split(X,y)): # Create the pipeline for the model est = LassoLarsCV() #fit model # pdb.set_trace() t0 = time.time() est.fit(X[train],y[train]) #get fit time runtime = time.time()-t0 # print("training done") # pdb.set_trace() # predict on test set y_true = y[test] y_pred = est.predict(X[test]) if problem in scale_these: test_mse = mean_squared_error(sc_y.inverse_transform(y_true), sc_y.inverse_transform(y_pred)) test_r2 = r2_score(sc_y.inverse_transform(y_true), sc_y.inverse_transform(y_pred)) else:
pl.savefig(os.path.join('miyawaki', 'encoding_scores.eps'))
pl.clf()

### Compute receptive fields
from sklearn.linear_model import LassoLarsCV

# a single estimator instance is refit for every voxel in the loop below
lasso = LassoLarsCV(max_iter=10,)

p = (4, 2)
# Mask for chosen pixel: a 10x10 boolean image that is True only at p
pixmask = np.zeros((10, 10), dtype=bool)
pixmask[p] = 1

for index in [1780, 1951, 2131, 1935]:
    # regress column `index` of X_train on y_train and view the fitted
    # coefficients as a 10x10 image
    rf = lasso.fit(y_train, X_train[:, index]).coef_.reshape(10, 10)
    pl.figure(figsize=(8, 8))
    # Black background
    pl.imshow(np.zeros_like(rf), vmin=0., vmax=1., cmap='gray')
    # exactly-zero coefficients are masked so the background shows through
    pl.imshow(np.ma.masked_equal(rf, 0.), vmin=0., vmax=0.75,
              interpolation="nearest", cmap=cm.bluegreen)
    plot_lines(pixmask, linewidth=6, color='r')
    pl.axis('off')
    pl.subplots_adjust(left=0., right=1., bottom=0., top=1.)
    # one PDF and one EPS per voxel
    pl.savefig(os.path.join('miyawaki', 'encoding_%d.pdf' % index))
    pl.savefig(os.path.join('miyawaki', 'encoding_%d.eps' % index))
    pl.clf()

### Plot the colorbar #########################################################
import matplotlib as mpl
class LinearAll:
    """
    A repertoire of Linear Variable Selection and Prediction Models

    Variables are selected with a cross-validated LassoLars (optionally
    followed by a RandomizedLasso); PLS and Ridge models are then tuned by
    grid search on the selected variables and the better one is used for
    prediction.

    Parameters
    ----------
    cv : passed as ``cv`` to LassoLarsCV and to every GridSearchCV
    scoring : passed as ``scoring`` to every GridSearchCV
    n_jobs : int, optional
        Number of jobs to run in parallel (default 1). If -1 all CPUs are used.
        This will only provide speedup for n_targets > 1 and sufficient large
        problems
    refit : boolean
        Refit the best estimator with the entire dataset. If "False", it is
        impossible to make predictions using this GridSearchCV instance after
        fitting.
    iid : boolean, optional
        If True, the data is assumed to be identically distributed across the
        folds, and the score is computed from all samples individually, and
        not the mean loss across the folds.
        (If the number of data points is the same across folds, either
        returns the same thing)
    pre_pred : boolean
        Stored on the instance; see the note in ``fit`` about its effect.
    param_ridge_post : list, optional
        Ridge ``alpha`` grid used after variable selection. Defaults to
        ``list(np.arange(1, 3, 0.1))``.
    rlasso_selection_threshold : float
        0 selects variables from a LassoLars refit at the cross-validated
        alpha; any other value is used as the RandomizedLasso selection
        threshold.

    Attributes
    ----------
    lasso_cv : fitted LassoLarsCV used to choose alpha
    lasso_refit, active : LassoLars refit and boolean selection mask
        (only when rlasso_selection_threshold == 0)
    rlasso : fitted RandomizedLasso (only when the threshold is not 0)
    pls_post, ridge_post : grid searches fitted on the selected variables
    best_model : whichever of pls_post / ridge_post scored higher
        (set by ``predict``)
    """

    def __init__(self, cv=20, scoring='mean_squared_error',
                 n_jobs=1, refit=False, iid=False, pre_pred=True,
                 param_ridge_post=None, rlasso_selection_threshold=0.5):
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.iid = iid
        self.pre_pred = pre_pred
        # Build the default grid per instance instead of sharing one list
        # created at function-definition time.
        if param_ridge_post is None:
            param_ridge_post = list(np.arange(1, 3, 0.1))
        self.param_ridge_post = param_ridge_post
        self.rlasso_selection_threshold = rlasso_selection_threshold

    def run_models(self, X, y, param_ridge):
        """Grid-search the prediction models (PLS and Ridge) on (X, y).

        Parameters
        ----------
        X, y : training data handed to both grid searches
        param_ridge : candidate ``alpha`` values for Ridge

        Returns
        -------
        (pls_cv, ridge_cv) : the two fitted GridSearchCV objects
        """
        ##################################
        ## PLS CV
        ##################################
        tuned_parameters = [{'n_components': range(1, 5)}]
        pls = PLSRegression()
        pls_cv = GridSearchCV(pls, tuned_parameters,
                              cv=self.cv, scoring=self.scoring,
                              n_jobs=self.n_jobs,
                              refit=self.refit, iid=self.iid)
        pls_cv.fit(X, y)

        ##################################
        ## Ridge CV
        ##################################
        tuned_parameters = [{'alpha': param_ridge}]
        ridge = linear_model.Ridge(alpha=1)
        ridge_cv = GridSearchCV(ridge, tuned_parameters,
                                cv=self.cv, scoring=self.scoring,
                                n_jobs=self.n_jobs,
                                refit=self.refit, iid=self.iid)
        ridge_cv.fit(X, y)

        return (pls_cv, ridge_cv)

    def fit(self, X, y):
        """
        Variable Selection and Prediction.

        Variable Selection Model: lasso
        Prediction Models: see self.predict()

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values

        Returns
        -------
        self : returns an instance of self.
        """
        ##################################
        ## Pre Variable Selection Predictions
        ##################################
        # NOTE(review): this assignment overrides the constructor argument,
        # so the branch below never runs. Kept as-is to preserve behaviour;
        # it looks like a debugging leftover -- confirm.
        self.pre_pred = False
        if self.pre_pred:
            print("Computing ... ")
            param_ridge_pre = list(np.arange(1e9, 2e9, 1e8))
            self.pls_pre, self.ridge_pre = \
                self.run_models(X, y, param_ridge_pre)

        ##################################
        ## Lasso Variable Selection
        ##################################
        self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True,
                                    precompute='auto',
                                    max_iter=X.shape[1]+1000,
                                    max_n_alphas=X.shape[1]+1000,
                                    eps=2.2204460492503131e-16, copy_X=True,
                                    cv=self.cv, n_jobs=self.n_jobs)
        self.lasso_cv.fit(X, y)
        # original author's note: normalize=True, lasso seems to be able to
        # handle itself

        if self.rlasso_selection_threshold == 0:
            # keep the variables with a non-zero coefficient in a LassoLars
            # refit at the cross-validated alpha
            self.lasso_refit = linear_model.LassoLars(
                alpha=self.lasso_cv.alpha_,
                fit_intercept=True, normalize=True, precompute='auto',
                max_iter=X.shape[1]+1000,
                eps=2.2204460492503131e-16, copy_X=True,
                fit_path=False)
            self.lasso_refit.fit(X, y)
            self.active = self.lasso_refit.coef_ != 0
            # assumes coef_ is 2-D here (first row taken) -- TODO confirm
            self.active = self.active[0, :]
            X_selected = X[:, self.active]
        else:
            self.rlasso = RandomizedLasso(
                alpha=self.lasso_cv.alpha_, scaling=0.5,
                sample_fraction=0.75, n_resampling=200,
                selection_threshold=self.rlasso_selection_threshold,
                fit_intercept=True, verbose=False, normalize=True,
                precompute='auto', max_iter=500,
                eps=2.2204460492503131e-16, random_state=None,
                n_jobs=self.n_jobs, pre_dispatch='3*n_jobs',)
            self.rlasso.fit(X, y)
            X_selected = self.rlasso.transform(X)

        ##################################
        ## Post Variable Selection Predictions
        ##################################
        self.pls_post, self.ridge_post = \
            self.run_models(X_selected, y, self.param_ridge_post)

        return self

    def predict(self, X_test):
        """Predict with the better of the two post-selection models.

        The grid search with the higher ``best_score_`` is stored as
        ``self.best_model``; X_test is reduced to the selected variables
        before its ``best_estimator_`` predicts.

        Raises
        ------
        AssertionError : if the instance was created with a falsy ``refit``.
        """
        # Raised explicitly (instead of an `assert` statement) so the check
        # is not stripped under `python -O`; the exception type is unchanged.
        if not self.refit:
            raise AssertionError(
                "predict() needs refit=True so that best_estimator_ exists")

        if self.pls_post.best_score_ > self.ridge_post.best_score_:
            self.best_model = self.pls_post
            print("Chosen Model: pls")
        else:
            self.best_model = self.ridge_post
            print("Chosen Model: ridge")

        if self.rlasso_selection_threshold == 0:
            X_test_selected = X_test[:, self.active]
        else:
            X_test_selected = self.rlasso.transform(X_test)
        return self.best_model.best_estimator_.predict(X_test_selected)
from xgboost.sklearn import XGBClassifier
from xgboost import DMatrix

df = pd.read_csv("processed.csv", header=0, index_col="ID")
#df.TARGET.describe()

# `.loc` is the label-based indexer; `.ix` is no longer available in current
# pandas. The column slice "var3":"var38" is inclusive on both ends.
y = df["TARGET"].values
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values

lr = LassoLarsCV()
sfm = SelectFromModel(lr, threshold=1e-3)
X_std = StandardScaler().fit_transform(X, y)
sfm.fit(X_std,y)
lr.fit(X_std, y)

#feat_imp = pd.DataFrame(lr.coef_, index=X_labels)
#feat_imp.plot(kind="bar", title="Feature Importance", use_index=False)

# Fetch the selection mask once instead of once per column
support = sfm.get_support()
chosen_feat = [f for f, keep in zip(X_labels, support) if keep]
#chosen_feat = pickle.load(open("feat", "rb"))
print(len(chosen_feat))
chosen_feat

# kaggle forum
df.var3 = df.var3.replace(-999999,2)

# rebuild the arrays so they reflect the replaced var3 values
y = df["TARGET"].values
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values
[0.607492, 3.965162], [0.358622, 3.514900], [0.147846, 3.125947], [0.637820, 4.094115], [0.230372, 3.476039], [0.070237, 3.210610], [0.067154, 3.190612], [0.925577, 4.631504], [0.717733, 4.295890], [0.015371, 3.085028], [0.335070, 3.448080], [0.040486, 3.167440], [0.212575, 3.364266], [0.617218, 3.993482], [0.541196, 3.891471]] #生成X和y矩阵 dataMat = np.array(data) X = dataMat[:, 0:1] # 变量x y = dataMat[:, 1] #变量y # ========Lasso回归======== # model = Lasso(alpha=0.01) # 调节alpha可以实现对拟合的程度 # model = LassoCV() # LassoCV自动调节alpha可以实现选择最佳的alpha。 model = LassoLarsCV() # LassoLarsCV自动调节alpha可以实现选择最佳的alpha model.fit(X, y) # 线性回归建模 print('系数矩阵:\n', model.coef_) print('线性回归模型:\n', model) # print('最佳的alpha:',model.alpha_) # 只有在使用LassoCV、LassoLarsCV时才有效 # 使用模型预测 predicted = model.predict(X) # 绘制散点图 参数:x横轴 y纵轴 plt.scatter(X, y, marker='x') plt.plot(X, predicted, c='r') # 绘制x轴和y轴坐标 plt.xlabel("x") plt.ylabel("y") # 显示图形
N_SEG.append(X.shape[0]) # parameters search range #param_ridge_post = list(np.arange(200,400,10)) #param_ridge_post.append(0.5) param_ridge_post= np.concatenate((np.arange(0.1,1,0.1),np.arange(3,5,0.1))) #param_ridge_post = [330, 0.5] #p=24489 #param_ridge_post = [3.7, 0.5] #p=303 # fit from sklearn.linear_model import LassoLarsCV from sklearn import linear_model lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto', max_iter=X.shape[1]+1000, max_n_alphas=X.shape[1]+1000, eps= 2.2204460492503131e-16,copy_X=True, cv=5, n_jobs=2) lasso_cv.fit(X, y) """ normalize=True, lasso seems to be able to handle itself """ lasso_refit = linear_model.LassoLars(alpha=lasso_cv.alpha_, fit_intercept=True, normalize=True, precompute='auto', max_iter=X.shape[1]+1000, eps=2.2204460492503131e-16, copy_X=True, fit_path=False) lasso_refit.fit(X, y) active = lasso_refit.coef_ for i, x in enumerate(active[0]): if x != 0 and i > main.shape[1] - 1: collect[j].add(i)