def learn_for(self, i):
    """Cross-validated elastic-net rating predictions for user ``i``.

    The ratings given by every other user to the movies that user ``i``
    rated are the features (one row per rated movie); user ``i``'s own
    ratings of those movies are the targets.  Each row is centred on the
    mean of its positive entries (zeros are left untouched) and the same
    per-row mean is subtracted from the target before fitting.

    Args:
        i: Row index of the target user in ``AbstractEstimateBase.reviews``.

    Returns:
        1-D array with one entry per movie.  Movies rated by user ``i``
        hold the out-of-fold prediction (on the centred scale; the row mean
        is not added back), every other entry is 0.
    """
    reviews = AbstractEstimateBase.reviews
    reg = ElasticNetCV(fit_intercept=True,
                       alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])

    def centre_rows(rows):
        # Subtract each row's mean positive rating from its positive entries.
        rows = rows.copy()
        means = np.nan_to_num(
            np.array([row[row > 0].mean() for row in rows]))
        rows -= (rows > 0) * means[:, np.newaxis]
        return rows, means

    nusers, nmovies = reviews.shape
    u = reviews[i]
    others = np.delete(np.arange(nusers), i)
    ps, = np.where(u.ravel() > 0)
    x = reviews[others][:, ps].T
    # Fold indices address rows of ``x`` (i.e. positions inside ``ps``), so
    # the targets must be restricted to the rated movies as well.  Indexing
    # the full rating row with ``train`` picked unrelated movies.
    y = np.asarray(u).ravel()[ps]

    # NOTE(review): this is the pre-0.18 ``KFold(n, n_folds=...)`` API,
    # kept as in the original.
    kf = KFold(len(ps), n_folds=4)
    predictions = np.zeros(len(ps))
    for train, test in kf:
        xc, row_means = centre_rows(x[train])
        reg.fit(xc, y[train] - row_means)

        xc, _ = centre_rows(x[test])
        predictions[test] = reg.predict(xc).ravel()

    fill_preds = np.zeros(nmovies)
    fill_preds[ps] = predictions
    return fill_preds
def train_elasticnet(train_features, train_labels, test_features, num_alphas,
                     skip_cross_validation, alpha, l1_ratio, num_jobs):
    """Fit an elastic-net model, optionally choosing hyper-parameters by CV.

    With cross validation enabled, a 5-fold ``ElasticNetCV`` searches a
    fixed list of ``l1_ratio`` values and ``num_alphas`` alphas, then the
    chosen values are printed.  Otherwise a plain ``ElasticNet`` is fitted
    with the supplied ``alpha`` and ``l1_ratio``.

    ``test_features`` is accepted but not used here.

    Returns the fitted model.
    """
    run_cv = not skip_cross_validation

    if run_cv:
        candidate_ratios = [0.5, 0.7, 0.9, 0.95, 0.99, 0.995, 0.9995, 1]
        model = ElasticNetCV(l1_ratio=candidate_ratios,
                             max_iter=30000,
                             cv=5,
                             n_alphas=num_alphas,
                             n_jobs=num_jobs,
                             normalize=True,
                             tol=0.005)
    else:
        model = ElasticNet(alpha=alpha,
                           l1_ratio=l1_ratio,
                           normalize=True,
                           max_iter=30000,
                           tol=0.005)

    model.fit(train_features, train_labels)

    if run_cv:
        report = (
            ("Optimal alpha is {}", "alpha_"),
            ("Optimal l1_ratio is {}", "l1_ratio_"),
            ("number of iterations were {}", "n_iter_"),
        )
        for template, attribute in report:
            print(template.format(getattr(model, attribute)))

    return model
def data_preprocessing(X, Y):
    """Split, scale, project and feature-select a landmark data set.

    Steps, in order:
      1. Relabel class 0 as -1 (this modifies ``Y`` in place).
      2. 80/20 train/test split.
      3. Flatten every sample to ``68 * 2`` values.
      4. Standardise with statistics fitted on the training part only.
      5. PCA to 68 components, fitted on the training part only.
      6. Fit ``ElasticNetCV`` and drop the components whose coefficient is
         exactly zero.

    Args:
        X: Samples; each one must hold ``68 * 2`` values.
        Y: Labels; entries equal to 0 are overwritten with -1.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)`` after all transformations.
    """
    Y[Y == 0] = -1
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    # Take the sample count from the data instead of hard-coding 3840/960,
    # so the function works for any data-set size.
    X_train = X_train.reshape((X_train.shape[0], 68 * 2))
    X_test = X_test.reshape((X_test.shape[0], 68 * 2))

    # Fit the scaler on training data only, then apply it to both parts.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # PCA analysis
    pca = PCA(n_components=68)
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)

    # Feature selection: components with a zero elastic-net coefficient are
    # removed.  The local is not called ``ElasticNet`` so it cannot shadow
    # the scikit-learn class of that name.
    selector = ElasticNetCV(cv=10, random_state=0)
    selector.fit(X_train, Y_train)
    all_features = selector.coef_
    not_important_features_indices = np.where(all_features == 0)[0]
    X_train = np.delete(X_train, not_important_features_indices, axis=1)
    X_test = np.delete(X_test, not_important_features_indices, axis=1)
    return X_train, X_test, Y_train, Y_test
def fit(self, X, y):
    """Fit ``self.classifier_`` on ``(X, y)``.

    When ``self.cross_validate_`` is set, a 5-fold ``ElasticNetCV`` search
    runs first and ``self.classifier_`` is replaced by an ``ElasticNet``
    built from the selected ``alpha_``, ``l1_ratio_`` and ``n_iter_``.
    Otherwise the existing ``self.classifier_`` is fitted as is
    (presumably created elsewhere; verify against the class).
    """
    if self.cross_validate_:
        search_settings = dict(
            l1_ratio=[.1, .5, .7, .9, .95, .99, .995, 1],
            eps=0.001,
            n_alphas=100,
            fit_intercept=True,
            normalize=True,
            precompute='auto',
            max_iter=2000,
            tol=0.0001,
            cv=5,
            copy_X=True,
            verbose=0,
            n_jobs=-1,
            positive=False,
            selection='cyclic',
        )
        search = ElasticNetCV(**search_settings)
        search.fit(X, y)

        if self.verbose:
            print('Optimal alpha: %.8f' % search.alpha_)
            print('Optimal l1_ratio: %.3f' % search.l1_ratio_)
            print('Number of iterations %d' % search.n_iter_)

        # NOTE(review): the refit is capped at the iteration count the CV
        # search needed; confirm this is enough for a fit from scratch.
        self.classifier_ = ElasticNet(l1_ratio=search.l1_ratio_,
                                      alpha=search.alpha_,
                                      max_iter=search.n_iter_,
                                      fit_intercept=True,
                                      normalize=True)

    self.classifier_.fit(X, y)
def learn_for(reviews, i):
    """Cross-validated elastic-net rating predictions for user ``i``.

    ``reviews`` is indexed as a sparse user-by-movie matrix (``toarray`` and
    ``.data`` are used).  Features are the other users' ratings of the
    movies user ``i`` rated; targets are user ``i``'s stored ratings
    (``u.data``).  Rows are centred on the mean of their positive entries
    and the same mean is subtracted from the target.

    Args:
        reviews: Sparse rating matrix, one row per user.
        i: Row index of the target user.

    Returns:
        1-D array with one out-of-fold prediction per movie rated by user
        ``i`` (centred scale; the row mean is not added back).
    """
    reg = ElasticNetCV(fit_intercept=True,
                       alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])

    def centre_rows(rows):
        # Subtract each row's mean positive rating from its positive entries.
        means = np.nan_to_num(
            np.array([row[row > 0].mean() for row in rows]))
        rows -= (rows > 0) * means[:, np.newaxis]
        return rows, means

    u = reviews[i]
    # ``np.delete`` on an index array works on Python 2 and 3; ``del`` on a
    # ``range`` object fails on Python 3.
    us = np.delete(np.arange(reviews.shape[0]), i)
    ps, = np.where(u.toarray().ravel() > 0)
    x = reviews[us][:, ps].T
    y = u.data

    # NOTE(review): pre-0.18 ``KFold(n, n_folds=...)`` API, kept as is.
    kf = KFold(len(y), n_folds=4)
    predictions = np.zeros(len(ps))
    for train, test in kf:
        xc, row_means = centre_rows(x[train].copy().toarray())
        reg.fit(xc, y[train] - row_means)

        xc, _ = centre_rows(x[test].copy().toarray())
        # Predict the fold in one call: mapping ``predict`` over single
        # rows passes 1-D samples (rejected by current scikit-learn) and
        # yields a lazy iterator on Python 3.
        predictions[test] = reg.predict(xc).ravel()
    return predictions
def algor_ElasticNetCV():
    """Fit ``ElasticNetCV`` with settings taken from the submitted form.

    Reads the CSV path stored in the session under ``'file'``, encodes it
    with ``onehot``, fits the model and renders ``ElasticNetCV.html`` with
    the R^2 measured on the training data.  The classifier-only metrics in
    the context are filled with fixed placeholder strings.

    Form fields read: ``alpha`` (a number or the string ``'None'``),
    ``l1_rotio``, ``fit_intercept``, ``normalize``, ``max_iter``, ``tol``.
    """
    def as_bool(text):
        # Form values arrive as strings, and ``bool("False")`` is True, so
        # the common textual spellings of "false" are decoded explicitly.
        return str(text).strip().lower() not in (
            '', '0', 'false', 'no', 'off', 'none')

    request_content = request.form.to_dict()
    df = pd.read_csv(session.get('file'))
    X_train, Y_train = onehot(df)

    params = request_content
    if params['alpha'] != 'None':
        params['alpha'] = [float(params['alpha'])]
    else:
        params['alpha'] = None

    # NOTE(review): 'l1_rotio' looks like a misspelling of 'l1_ratio', but
    # it must match the field name sent by the form; confirm against the
    # template before renaming.
    model = ElasticNetCV(alphas=params['alpha'],
                         l1_ratio=float(params['l1_rotio']),
                         fit_intercept=as_bool(params['fit_intercept']),
                         normalize=as_bool(params['normalize']),
                         max_iter=int(params['max_iter']),
                         tol=float(params['tol']))
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_train)

    context = {
        'algor': '弹性网回归',
        'roc_AUC': 'None(仅用于分类器)',
        'ACC': 'None(仅用于分类器)',
        'Recall': 'None(仅用于分类器)',
        'F1_score': 'None(仅用于分类器)',
        'Precesion': 'None(仅用于分类器)',
        'R_2': round(metrics.r2_score(Y_train, y_pred), 2)
    }
    return render_template('ElasticNetCV.html', **context)
def l1_enet(ratio):
    """Fit a cross-validated elastic net for one ``l1_ratio``.

    Uses the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``,
    ``rkf`` (CV splitter), ``fit_int_flag`` and ``rs`` (random state).

    Args:
        ratio: Value passed to ``ElasticNetCV`` as ``l1_ratio``.

    Returns:
        Tuple ``(model, alpha, n_nonzero, rmse_test, rmse_train)`` where
        ``n_nonzero`` counts coefficients with magnitude >= 1e-7.
    """
    # ``max_iter`` is an iteration count; pass an integer rather than the
    # float literal ``1e7``.
    enet_cv = ElasticNetCV(cv=rkf, l1_ratio=ratio, max_iter=10000000,
                           tol=0.001, fit_intercept=fit_int_flag,
                           random_state=rs)
    enet_cv.fit(X_train, y_train)

    # The alpha selected by cross validation.
    enet_alpha = enet_cv.alpha_
    enet_coefs = enet_cv.coef_
    n_nonzero = int(np.count_nonzero(np.abs(enet_coefs) >= 1e-7))

    # Errors on the held-out and on the training data.
    y_predict_test = enet_cv.predict(X_test)
    y_predict_train = enet_cv.predict(X_train)
    enet_RMSE_test = np.sqrt(mean_squared_error(y_test, y_predict_test))
    enet_RMSE_train = np.sqrt(mean_squared_error(y_train, y_predict_train))

    return enet_cv, enet_alpha, n_nonzero, enet_RMSE_test, enet_RMSE_train
def calculateAccuracyWithModel(indbest, X_train, y_train, X_test, y_test):
    """Fit an elastic net on the evaluated individuals and print the model.

    The individuals in ``indbest`` are evaluated on the train and test data
    with ``evaluatedMatrix``; an ``ElasticNetCV`` is fitted on the train
    part.  The test R^2 is printed, followed by a textual formula made of
    the intercept and every non-zero ``coefficient*individual`` term.

    Nothing is returned.
    """
    indbest = list(indbest)
    evalTrain = evaluatedMatrix(indbest, X_train)
    evalTest = evaluatedMatrix(indbest, X_test)

    # Linear regression with elastic net
    regr = ElasticNetCV(random_state=0)
    regr.fit(evalTrain, y_train)
    y_pred = regr.predict(evalTest)
    print(r2_score(y_test, y_pred))

    indbest, regr.coef_ = sortCoef(indbest, regr.coef_)

    model = ""
    if regr.intercept_ != 0:
        model = str(coefStr(regr.intercept_))

    for ind, coef in zip(indbest, regr.coef_):
        if coef == 0:
            continue
        term = str(coefStr(coef)) + "*" + str(ind)
        # Decide on the numeric sign.  Searching the text for "-" also
        # matched small positive values such as 1e-05 and dropped their
        # "+" separator.  Assumes coefStr renders negative numbers with
        # their own minus sign - TODO confirm.
        if coef > 0 and model:
            term = "+" + term
        model = model + term
    print(model)
def score(inEval, X, y):
    """Return the elastic-net R^2 of the evaluated expressions against ``y``.

    Each element of ``inEval`` is turned into an expression string by
    ``updatedEvalString`` and evaluated with ``eval``; every successful
    result becomes one column of a DataFrame.  An ``ElasticNetCV`` (2-fold)
    is fitted on that frame and its ``score`` on the same data is returned.

    NOTE(review): ``eval`` runs in this function's scope, so the generated
    expressions can see the local names here (``X`` presumably being the
    one they rely on).  Do not rename locals without checking
    ``updatedEvalString``, and never feed it untrusted input.
    """
    indMatrix = pd.DataFrame()
    i = 0
    listEval = list(inEval)
    for ele in listEval:
        evalString = updatedEvalString(ele)
        # Skip expressions that raise ZeroDivisionError; the column counter
        # is only advanced for expressions that evaluated successfully.
        try:
            indMatrix[str.format('col{0}', i)] = eval(evalString)
        except ZeroDivisionError:
            continue
        i = i + 1
    # Replace +/- infinity with 1 so the regression receives finite values.
    indMatrix = indMatrix.replace([np.inf, -np.inf], 1)
    # Linear regression with elastic net.  The string statement below is an
    # earlier, disabled variant kept by the author; it has no effect.
    """ regr = ElasticNet(random_state=0, l1_ratio = 0.1) regr.fit(indMatrix,y_train) y_p = regr.predict(indMatrix) regr.score(indMatrix,y_train)"""
    regr = ElasticNetCV(cv=2, random_state=0, max_iter=5000)
    regr.fit(indMatrix, y)
    return (regr.score(indMatrix, y))
def main():
    """Train an elastic net on the competition data and write a submission.

    Loads the data through ``data_util.load_dataset``, removes the ``ID``
    and ``y`` columns from the features, selects ``alpha`` and ``l1_ratio``
    with a 5-fold ``ElasticNetCV``, refits a plain ``ElasticNet`` with the
    selected values and writes the test predictions to
    ``elasticnet_model_result.csv``.

    Every message is printed through a single-argument ``print(...)`` call,
    which produces the same text on Python 2 and Python 3; the original
    mixed Python-2-only print statements with the call form.
    """
    print('load datas...')
    train, test = data_util.load_dataset()

    y_train_all = train['y']
    del train['ID']
    del train['y']
    id_test = test['ID']
    del test['ID']
    print('train: {} , test: {}'.format(train.shape, test.shape))

    random_state = 420
    cv_model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, .995, 1],
                            eps=0.001, n_alphas=100, fit_intercept=True,
                            normalize=True, precompute='auto', max_iter=2000,
                            tol=0.0001, cv=5, copy_X=True, verbose=0,
                            n_jobs=-1, positive=False,
                            random_state=random_state, selection='cyclic')
    cv_model.fit(train.values, y_train_all)
    print('Optimal alpha: %.8f' % cv_model.alpha_)
    print('Optimal l1_ratio: %.3f' % cv_model.l1_ratio_)
    print('Number of iterations %d' % cv_model.n_iter_)

    print('train model with best parameters from CV...')
    # NOTE(review): the refit is capped at the iteration count the CV
    # search needed; confirm this is enough for a fit from scratch.
    model = ElasticNet(l1_ratio=cv_model.l1_ratio_,
                       alpha=cv_model.alpha_,
                       max_iter=cv_model.n_iter_,
                       fit_intercept=True,
                       normalize=True)
    model.fit(train.values, y_train_all)

    print('predict submit...')
    y_pred = model.predict(test.values)
    df_sub = pd.DataFrame({'ID': id_test, 'y': y_pred})
    df_sub.to_csv('elasticnet_model_result.csv', index=False)  # 0.55828
def creat_model1(x,y):
    """Print the 10-fold custom-loss scores of an elastic net on ``(x, y)``.

    An ``ElasticNetCV`` (``l1_ratio=0.7``, 10 inner folds) is fitted on the
    full data and then cross-validated with a scorer built from
    ``my_custom_loss_func``.  The scorer is declared with
    ``greater_is_better=False``, so the scores are negated before printing.

    Nothing is returned.  Earlier XGBoost / logistic-regression experiments
    that were kept here as commented-out code have been removed.
    """
    folds = 10

    enet = ElasticNetCV(l1_ratio=0.7, cv=folds)
    enet.fit(x, y)

    loss_scorer = make_scorer(my_custom_loss_func, greater_is_better=False)
    fold_losses = -cross_val_score(enet, x, y, cv=folds, scoring=loss_scorer)
    print(fold_losses)
def regression_NumMosquitos(Xtr, ytr, Xte):
    """Fit a default ``ElasticNetCV`` on ``(Xtr, ytr)`` and predict ``Xte``.

    Returns the predictions for ``Xte``.
    """
    from sklearn.linear_model import ElasticNetCV

    # A tuned variant that was tried before:
    # ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], max_iter=10000, cv=4)
    fitted = ElasticNetCV().fit(Xtr, ytr)
    return fitted.predict(Xte)
def Model(Encoding, Scores, Run_name, step_size, loop_dict, var_dict,
          round_data, ElasticNet_dict, l1_ratios, All_data):
    """Run amplitude tuning plus an elastic net over the CV folds.

    For each fold produced by ``CV_split(Data, 5)``: the scores are tuned
    by calling ``Amplitude_Tuning`` repeatedly until the relative change of
    the end-of-loop error is no longer below -0.001, then an
    ``ElasticNetCV`` is fitted on the score-encoded training data and the
    Pearson correlation between measured and predicted ``pMeas`` on the
    held-out fold is recorded.

    The dicts ``loop_dict``, ``round_data``, ``ElasticNet_dict`` and
    ``All_data`` are filled in place, and ``All_data`` is saved to
    ``"All Data.npy"``.

    NOTE(review): the same ``loop_dict`` / ``ElasticNet_dict`` objects are
    stored for every fold, so entries in ``round_data`` alias each other
    and later folds overwrite earlier values - confirm this is intended.

    Returns:
        Mean of the per-fold Pearson correlations.
    """
    Pearson_correlations = []
    Data = Encoding.copy()  # work on a copy so the caller's frame is untouched
    Data_sets = CV_split(Data, 5)  # folds; presumably five of them - verify CV_split
    for cv_round in range(len(Data_sets)):
        score_dict = Scores.copy()  # start every fold from the initial scores
        Test_set = Data_sets[cv_round]
        Train_set = exclude(Data_sets, cv_round)  # presumably every fold except cv_round
        Train_set = pd.concat(Train_set)  # all training folds in one DataFrame
        X = Train_set.iloc[:, :Train_set.shape[1] - 1]  # features: every column but the last
        X['Intercept'] = 1  # constant intercept column
        y = pd.DataFrame(Train_set['pMeas'])  # targets
        AM_EndOfLoopError = []
        AM_EndOfLoopError.append(Get_Error( X, y, score_dict))  # error before any tuning
        """AM Tuning Looping Starts Here and Adds a value to End of Loop Error"""
        Loop_num = 1
        AM_EndOfLoopError.append( Amplitude_Tuning(X, y, step_size, score_dict, Loop_num, Run_name, cv_round, loop_dict, var_dict))
        round_data[cv_round] = loop_dict
        # Keep tuning while the error still drops by more than 0.1 %
        # relative to the previous pass.
        while ((AM_EndOfLoopError[-1] - AM_EndOfLoopError[-2]) / (AM_EndOfLoopError[-2])) < -0.001:
            Loop_num += 1
            AM_EndOfLoopError.append( Amplitude_Tuning(X, y, step_size, score_dict, Loop_num, Run_name, cv_round, loop_dict, var_dict))
            round_data[cv_round] = loop_dict
        loop_dict['AM Time Series Data'] = AM_EndOfLoopError
        loop_dict['Final Scores'] = score_dict
        """ AM Tuning is now Finished for the CV_split, Elastic Net is Next """
        EN = ElasticNetCV(l1_ratio=l1_ratios, cv=5, copy_X=True, normalize=True, random_state=23)
        X_train = X.copy()
        X_train.replace(score_dict, inplace=True)  # substitute the tuned scores for the encoded values
        y_train = y.copy()
        X_test = Test_set.iloc[:, :Test_set.shape[1] - 1]
        X_test.replace(score_dict, inplace=True)
        X_test['Intercept'] = 1
        y_test = pd.DataFrame(Test_set['pMeas'])
        EN.fit(X_train, y_train)
        y_pred = pd.DataFrame(EN.predict(X_test))
        Pearson_correlations.append(np.corrcoef(y_test.T, y_pred.T)[0][1])
        """Save Everything """
        ElasticNet_dict["y_pred"] = y_pred
        ElasticNet_dict['y_test'] = y_test
        ElasticNet_dict['Alpha'] = EN.alpha_
        ElasticNet_dict['l1_ratio'] = EN.l1_ratio_
        ElasticNet_dict['Parameters'] = EN.get_params()
        ElasticNet_dict["AlphaSpace"] = EN.alphas_
        loop_dict['ElasticNet'] = ElasticNet_dict
        round_data[cv_round] = loop_dict
    All_data[Run_name] = round_data
    np.save("All Data.npy", All_data)
    return np.mean(Pearson_correlations)
def train_EN_model(train_x, train_y, _predict_x):
    """Pick the ``l1_ratio`` / ``alpha`` pair with the lowest CV error.

    For each candidate ``l1_ratio`` a 5-fold ``ElasticNetCV`` is fitted and
    the fold-averaged MSE at its selected alpha is compared with the best
    one seen so far.

    Args:
        train_x: Training features.
        train_y: Training targets.
        _predict_x: Unused; kept for interface compatibility.

    Returns:
        ``(best_l1_ratio, best_alpha)``.
    """
    l1_ratios = [0.9, 0.92, 0.95, 0.97, 0.99]

    # Start from +inf so the best candidate always wins.  With the former
    # start value of 1, data whose CV error never dropped below 1 silently
    # returned the fallback pair below instead of a fitted result.
    min_mse = float('inf')
    best_l1_ratio = 0.95
    best_alpha = 0.5

    for r in l1_ratios:
        reg_en = ElasticNetCV(l1_ratio=r, cv=5, n_jobs=4, verbose=1,
                              precompute=True)
        reg_en.fit(train_x, train_y)
        # Position of the selected alpha on the alpha grid; the MSE path is
        # averaged over the folds (axis 1).
        chosen = np.where(reg_en.alphas_ == reg_en.alpha_)[0][0]
        _mse = np.mean(reg_en.mse_path_, axis=1)[chosen]
        if _mse < min_mse:
            min_mse = _mse
            best_l1_ratio = r
            best_alpha = reg_en.alpha_
    return best_l1_ratio, best_alpha
def elastic_net_cv(problem, **kwargs):
    r"""Select the data series that an elastic net keeps for the goal.

    One feature column is built per entry of ``problem.data`` (from its
    ``['data']['values']``); the target is
    ``problem.goal['data']['values']``.

    Parameters
    ----------
    kwargs['elastic_net_reg_coefs'] : list of nonnegative float
        Candidate penalty multipliers, passed as ``alphas``.
    kwargs['elastic_net_ratio'] : float between 0 and 1
        Passed as ``l1_ratio``.
    kwargs['coef_tolerance'] : nonnegative float
        A series is kept when the magnitude of its coefficient exceeds
        this value.

    Returns
    -------
    output : tuple (optimum, maximum)
        ``optimum`` is the list of kept entries of ``problem.data``;
        ``maximum`` is the model's ``score`` on the fitting data.
    """
    series = [datum['data']['values'] for datum in problem.data]
    design = numpy.array(series).T

    elastic_net = ElasticNetCV(alphas=kwargs['elastic_net_reg_coefs'],
                               l1_ratio=kwargs['elastic_net_ratio'])
    elastic_net.fit(design, problem.goal['data']['values'])

    optimum = []
    for index, weight in enumerate(elastic_net.coef_):
        if abs(weight) > kwargs['coef_tolerance']:
            optimum.append(problem.data[index])

    maximum = elastic_net.score(design, problem.goal['data']['values'])
    return (optimum, maximum)
class _SkLearnElasticNetSolver(BaseSolver):
    """Solver backed by scikit-learn's cross-validated elastic net."""

    @ex.capture
    def __init__(self, data_features: Matrix, output_samples: ColumnVector,
                 n_alphas: int, cross_validation_folds: int,
                 elastic_net_factor: Scalar, _rnd):
        """
        Build the underlying ``ElasticNetCV`` model.

        Args:
            data_features(Matrix): The input data matrix ``nxd``.
            output_samples(ColumnVector): The output for the given inputs,
                ``nx1``.
            n_alphas(int): The number of regularization strengths tested.
            cross_validation_folds(int): The number of cross-validation
                folds.
            elastic_net_factor(Scalar): Passed to the model as ``l1_ratio``.
            _rnd: Passed to the model as ``random_state``.
        """
        super(_SkLearnElasticNetSolver, self).__init__(
            data_features, output_samples, n_alphas, cross_validation_folds)

        model_settings = {
            'cv': cross_validation_folds,
            'n_alphas': n_alphas,
            'random_state': _rnd,
            'l1_ratio': elastic_net_factor,
            'normalize': False,
        }
        self._model = ElasticNetCV(**model_settings)

    def fit(self) -> ColumnVector:
        """
        Fit the model to the stored data and return its coefficients.

        The coefficients are also kept in ``self._fitted_coefficients``.
        """
        model = self._model
        model.fit(self._data_features, self._output_samples)
        self._fitted_coefficients = model.coef_
        return self._fitted_coefficients
def elasticNetRegNT(self, X, Y, nCV, l1_weights=None):
    """Fit an elastic net, with or without cross validation.

    :param X: design matrix
    :param Y: true labels
    :param nCV: number of CV folds (used only when ``self.useCV`` is set)
    :param l1_weights: weights of the lasso term, forwarded to the model
    :return: tuple ``(coef_, intercept_)`` of the fitted model

    With ``self.useCV`` the alpha chosen by cross validation is stored in
    ``self.cv_alpha``; otherwise ``self.alpha`` is used directly.

    NOTE(review): ``l1_weights`` is not a keyword of stock scikit-learn, so
    a customised ElasticNet/ElasticNetCV is presumably in scope.  The
    non-CV model is also built without ``fit_intercept`` - confirm.
    """
    cross_validated = self.useCV

    if cross_validated:
        enet = ElasticNetCV(cv=nCV,
                            max_iter=self.maxItr,
                            l1_weights=l1_weights,
                            fit_intercept=self.fit_intercept,
                            alphas=self.alphas,
                            l1_ratio=self.l1_ratio)
    else:
        enet = ElasticNet(alpha=self.alpha,
                          l1_ratio=self.l1_ratio,
                          max_iter=self.maxItr,
                          l1_weights=l1_weights)

    enet.fit(X, Y)

    if cross_validated:
        self.cv_alpha = enet.alpha_

    if self.verbose:
        print("Num of iter: %d"%enet.n_iter_)

    return enet.coef_, enet.intercept_
def GLM(X_train, X_test, y_train):
    """Fit a 5-fold ``ElasticNetCV`` on the training data and predict.

    Returns the predictions for ``X_test``.
    """
    settings = dict(random_state=0, tol=0.01, cv=5, max_iter=20000)
    fitted = ElasticNetCV(**settings).fit(X_train, y_train)
    return fitted.predict(X_test)
def elastic_net(Xtrain, Ytrain, Xdev, Ydev, verbose=False):
    """
    Train a cross-validated elastic net and score it on the dev data.

    The selected hyperparameters and the dev R^2 are printed as indented
    JSON; with ``verbose`` the coefficients are included as well.

    Inputs:
        Xtrain, Ytrain: training features and targets
        Xdev, Ydev: development features and targets
        verbose: also report the fitted coefficients

    Returns:
        float: the R^2 on the dev data for the selected model.
        ElasticNetCV: the trained model.
    """
    import json

    print("\n========================\nTraining Elastic Net\n")

    # ``max_iter`` is an iteration count, so it is passed as an integer
    # rather than the float literal ``1e4``.
    enet = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                        max_iter=10000,
                        tol=1e-2)
    enet.fit(Xtrain, Ytrain)
    best_score = enet.score(Xdev, Ydev)

    results = {
        "R2": best_score,
        "alpha": enet.alpha_,
        "l1_ratio": enet.l1_ratio_
    }
    if verbose:
        results['coefficients'] = enet.coef_.tolist()

    # ``print`` has no ``indent`` keyword (the old call raised TypeError);
    # indentation is done by ``json.dumps``.  ``default=float`` converts
    # NumPy scalars that are not already Python floats.
    print(json.dumps(results, indent=4, default=float))
    return best_score, enet
def runsklelasticnetcv(alpha, x_data, y_data, descent_type):
    """
    Run scikit-learn's ``ElasticNetCV`` (coordinate descent with cross
    validation) and report the selected penalty and the coefficients.

    The model is fitted without an intercept, with ``tol=0.000001`` and at
    most 10000 iterations.

    :param alpha: float
        Passed as ``l1_ratio``: 0 is a pure L2 penalty, 1 a pure L1
        penalty, values in between mix the two.
    :param x_data: numpy array
        Data containing the predictors
    :param y_data: numpy array
        Data containing the response
    :param descent_type: str
        Coordinate-descent order, either 'random' or 'cyclic'
    :return betaskl: fitted coefficients (``coef_``)
    :return lambskl: selected regularization penalty (``alpha_``)
    """
    solver = ElasticNetCV(
        l1_ratio=alpha,
        fit_intercept=False,
        tol=0.000001,
        selection=descent_type,
        max_iter=10000,
    )
    solver.fit(x_data, y_data)
    return solver.coef_, solver.alpha_
def learn_for(reviews, i):
    """Cross-validated elastic-net rating predictions for user ``i``.

    The ratings given by every other user to the movies that user ``i``
    rated are the features (one row per rated movie); user ``i``'s own
    ratings of those movies are the targets.  Each row is centred on the
    mean of its positive entries (zeros are left untouched) and the same
    per-row mean is subtracted from the target before fitting.

    Args:
        reviews: Dense user-by-movie rating matrix.
        i: Row index of the target user.

    Returns:
        1-D array with one entry per movie.  Movies rated by user ``i``
        hold the out-of-fold prediction (on the centred scale; the row mean
        is not added back), every other entry is 0.
    """
    reg = ElasticNetCV(fit_intercept=True,
                       alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])

    def centre_rows(rows):
        # Subtract each row's mean positive rating from its positive entries.
        rows = rows.copy()
        means = np.nan_to_num(
            np.array([row[row > 0].mean() for row in rows]))
        rows -= (rows > 0) * means[:, np.newaxis]
        return rows, means

    nusers, nmovies = reviews.shape
    u = reviews[i]
    others = np.delete(np.arange(nusers), i)
    ps, = np.where(u.ravel() > 0)
    x = reviews[others][:, ps].T
    # Fold indices address rows of ``x`` (i.e. positions inside ``ps``), so
    # the targets must be restricted to the rated movies as well.  Indexing
    # the full rating row with ``train`` picked unrelated movies.
    y = np.asarray(u).ravel()[ps]

    # NOTE(review): this is the pre-0.18 ``KFold(n, n_folds=...)`` API,
    # kept as in the original.
    kf = KFold(len(ps), n_folds=4)
    predictions = np.zeros(len(ps))
    for train, test in kf:
        xc, row_means = centre_rows(x[train])
        reg.fit(xc, y[train] - row_means)

        xc, _ = centre_rows(x[test])
        predictions[test] = reg.predict(xc).ravel()

    fill_preds = np.zeros(nmovies)
    fill_preds[ps] = predictions
    return fill_preds
def predict(X_train, X_test, y_train, y_test, features, pic_name, dir):
    """
    Fit a 4-fold ``ElasticNetCV`` and report its predictions on ``X_test``.

    Prints the mean absolute error, R^2, the selected alpha, the alpha
    grid, the iteration count and one ``name: coefficient`` line per
    feature, then passes the results to ``Plot_output`` for plotting.

    :param X_train: training features
    :param X_test: test features
    :param y_train: training targets
    :param y_test: test targets
    :param features: feature names, in the column order of the data
    :param pic_name: forwarded to the ``Plot_output`` functions
    :param dir: forwarded to the ``Plot_output`` functions
    :return: None
    """
    model = ElasticNetCV(cv=4)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    print("mean absolute error: ", mean_absolute_error(y_test, predicted))
    print("r2 error: ", sklearn.metrics.r2_score(y_test, predicted))
    print("alpha: ", model.alpha_)
    print("alphas: ", model.alphas_)
    print("iter: ", model.n_iter_)

    # Join the lines instead of appending "\n" and cutting two characters:
    # the newline is one character, so the old ``[:-2]`` also removed the
    # last digit of the final coefficient.
    coefficients_str = "\n".join(
        str(name) + ": " + "%.4f" % coef
        for name, coef in zip(features, model.coef_))
    print("coef: ", coefficients_str)

    Plot_output.plot_coefficients(coefficients_str, pic_name=pic_name, dir=dir)
    Plot_output.plot_graph(X_test, y_test, predicted, pic_name, dir)
def learn_for(reviews, i):
    """Cross-validated elastic-net rating predictions for user ``i``.

    ``reviews`` is indexed as a sparse user-by-movie matrix (``toarray`` and
    ``.data`` are used).  Features are the other users' ratings of the
    movies user ``i`` rated; targets are user ``i``'s stored ratings
    (``u.data``).  Rows are centred on the mean of their positive entries
    and the same mean is subtracted from the target.

    Args:
        reviews: Sparse rating matrix, one row per user.
        i: Row index of the target user.

    Returns:
        1-D array with one entry per movie.  Movies rated by user ``i``
        hold the out-of-fold prediction (centred scale); all other entries
        stay 0 so the shape matches the other models.
    """
    reg = ElasticNetCV(fit_intercept=True,
                       alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])

    def centre_rows(rows):
        # Subtract each row's mean positive rating from its positive entries.
        means = np.nan_to_num(
            np.array([row[row > 0].mean() for row in rows]))
        rows -= (rows > 0) * means[:, np.newaxis]
        return rows, means

    u = reviews[i]
    us = np.delete(np.arange(reviews.shape[0]), i)
    dense_u = u.toarray().ravel()
    ps, = np.where(dense_u > 0)
    x = reviews[us][:, ps].T
    # Assumes ``u.data`` lists the ratings in the same order as ``ps``
    # (sorted column indices, no stored zeros) - TODO confirm.
    y = u.data

    kf = KFold(n_splits=4)
    # One slot per movie; movies without a rating keep a prediction of 0.
    predictions = np.zeros(len(dense_u))
    for train, test in kf.split(y):
        xc, row_means = centre_rows(x[train].copy().toarray())
        reg.fit(xc, y[train] - row_means)

        xc, _ = centre_rows(x[test].copy().toarray())
        # ``test`` indexes the rated movies, so map it through ``ps`` to get
        # the real movie columns.  Writing to ``predictions[test]`` put the
        # values in the first len(ps) slots regardless of which movies were
        # rated.
        predictions[ps[test]] = np.array(reg.predict(xc)).ravel()
    return predictions
def ElasticNet(dataX, dataY, pre_indice):
    '''
    Fit an elastic net and return the indices of the features it keeps.

    The model, its weight coefficients and the kept indices are printed.

    :param dataX: feature matrix
    :param dataY: target values
    :param pre_indice: not used by this function
    :return: list of column indices whose coefficient is non-zero
    '''
    alpha_grid = [0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10]
    ratio_grid = [.01, .1, .5, .9, .99]
    Encv = ElasticNetCV(alphas=alpha_grid, l1_ratio=ratio_grid,
                        max_iter=5000, random_state=0)
    Encv.fit(dataX, dataY)
    print(Encv)

    weight_coef = Encv.coef_
    print("弹性网络回归的权重系数", list(weight_coef))

    # A feature is kept when its coefficient is not exactly zero.
    remain_indice = [position
                     for position, weight in enumerate(weight_coef)
                     if weight != 0]
    print("弹性网络保留的特征序号", remain_indice)
    return remain_indice
def learn_for(reviews, i):
    """Cross-validated elastic-net rating predictions for user ``i``.

    ``reviews`` is indexed as a sparse user-by-movie matrix (``toarray`` and
    ``.data`` are used).  Features are the other users' ratings of the
    movies user ``i`` rated; targets are user ``i``'s stored ratings
    (``u.data``).  Rows are centred on the mean of their positive entries
    and the same mean is subtracted from the target.

    Args:
        reviews: Sparse rating matrix, one row per user.
        i: Row index of the target user.

    Returns:
        1-D array with one out-of-fold prediction per movie rated by user
        ``i`` (centred scale; the row mean is not added back).
    """
    reg = ElasticNetCV(fit_intercept=True,
                       alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])

    def centre_rows(rows):
        # Subtract each row's mean positive rating from its positive entries.
        means = np.nan_to_num(
            np.array([row[row > 0].mean() for row in rows]))
        rows -= (rows > 0) * means[:, np.newaxis]
        return rows, means

    u = reviews[i]
    # ``np.delete`` on an index array works on Python 2 and 3; ``del`` on a
    # ``range`` object fails on Python 3.
    us = np.delete(np.arange(reviews.shape[0]), i)
    ps, = np.where(u.toarray().ravel() > 0)
    x = reviews[us][:, ps].T
    y = u.data

    # NOTE(review): pre-0.18 ``KFold(n, n_folds=...)`` API, kept as is.
    kf = KFold(len(y), n_folds=4)
    predictions = np.zeros(len(ps))
    for train, test in kf:
        xc, row_means = centre_rows(x[train].copy().toarray())
        reg.fit(xc, y[train] - row_means)

        xc, _ = centre_rows(x[test].copy().toarray())
        # Predict the fold in one call: mapping ``predict`` over single
        # rows passes 1-D samples (rejected by current scikit-learn) and
        # yields a lazy iterator on Python 3.
        predictions[test] = reg.predict(xc).ravel()
    return predictions
def predict(self, X):
    """Fill in every user's unrated movies with an elastic-net estimate.

    For each user, the other users' ratings serve as features: the model
    is fitted on the movies the user rated (``X > 0`` before any
    normalisation) and then predicts the movies the user did not rate.

    When ``self.normalize`` equals True, ``X`` is passed through
    ``self.norm.fit_transform`` first and the result is mapped back with
    ``self.norm.inverse_transform``.

    Returns a copy of the (possibly normalised and restored) matrix with
    the unrated entries replaced by predictions.
    """
    binary = X > 0
    if self.normalize == True:
        X = self.norm.fit_transform(X)

    num_users, num_movies = X.shape
    clf = ElasticNetCV(alphas=[0.1])
    predicted = X.copy()

    user_ids = np.arange(num_users)
    for user in range(num_users):
        # Movies this user rated / did not rate.
        rated = binary[user]
        unrated = ~rated

        # Every user except the current one acts as a regression attribute.
        neighbors = user_ids != user
        others = X[neighbors]

        clf.fit(others[:, rated].T, X[user, rated])
        predicted[user, unrated] = clf.predict(others[:, unrated].T)

    if self.normalize == True:
        predicted = self.norm.inverse_transform(predicted)
    return predicted
class my_EN_Classifier(BaseEstimator):
    """Elastic net with standardisation and winsorisation in front.

    ``fit`` selects ``alpha`` and ``l1_ratio`` with a 10-fold
    ``ElasticNetCV`` and then refits a plain ``ElasticNet`` with those
    values; ``predict`` uses the refitted model.
    """

    def __init__(self):
        self.clf = ElasticNetCV(l1_ratio=np.linspace(0.1, 1.0, 15),
                                n_alphas=100,
                                max_iter=10000,
                                n_jobs=-1,
                                cv=10)
        self.sd = StandardScaler()
        # quantile is a fraction: 100 * quantile is the percentile.
        self.win = Winsorizer(quantile=0.01)

    def _prepare(self, X):
        # Apply the already fitted scaler and winsorizer, in that order.
        return self.win.transform(self.sd.transform(X))

    def fit(self, X, y):
        """Fit the preprocessing steps, run the CV search and refit."""
        scaled = self.sd.fit_transform(X)
        self.win.fit(scaled)
        clipped = self.win.transform(scaled)

        self.clf.fit(clipped, y)
        self.refit_estimator = ElasticNet(alpha=self.clf.alpha_,
                                          l1_ratio=self.clf.l1_ratio_,
                                          max_iter=10000)
        self.refit_estimator.fit(clipped, y)
        return self

    def predict(self, X):
        """Predict with the refitted model on preprocessed ``X``."""
        return self.refit_estimator.predict(self._prepare(X))

    def get_info(self):
        """Return the selected hyperparameters and refit coefficients."""
        return {
            'best_alpha': self.clf.alpha_,
            'best_l1': self.clf.l1_ratio_,
            'refit_coef': self.refit_estimator.coef_,
        }
def mutation_impact(data, attempt):
    """Estimate each mutation's effect on the baseline HAMD value.

    Builds one 0/1 indicator row per patient (one column per entry of
    ``MUTATIONS``, in that order), regresses the patients'
    ``baseline_hamd`` on it with a default ``ElasticNetCV`` and returns the
    coefficients.

    Args:
        data: Iterable of patient records offering ``get`` and
            ``iter('Mutation')``.
        attempt: Not used by this function.

    Returns:
        Dict mapping every mutation in ``MUTATIONS`` to its coefficient as
        a float.
    """
    impact = {m: None for m in MUTATIONS}
    # Begin CODE
    Y = []
    X = []
    for patient in data:
        # Convert explicitly so the regression receives numbers even when
        # the record stores the value as text.
        Y.append(float(patient.get('baseline_hamd')))

        # A set makes the per-column membership test below O(1).
        patient_mutations = set()
        for mutation in patient.iter('Mutation'):
            chromosome = mutation.get('chromosome')
            pos = mutation.get('pos')
            ref = mutation.get('ref')
            alt = mutation.get('alt')
            s = f'chrom_{chromosome}.pos_{pos}.ref_{ref}.alt_{alt}'
            patient_mutations.add(clean_name(s, MUTATIONS))

        X.append([1 if mutation in patient_mutations else 0
                  for mutation in MUTATIONS])

    reg = ElasticNetCV()
    reg.fit(X, Y)
    for i, m in enumerate(MUTATIONS):
        impact[m] = float(reg.coef_[i])
    # End CODE
    return impact
def use_ElasticNet():
    """Print the mean 16-fold R^2 of a fixed elastic net on ``X``, ``y``.

    ``X`` and ``y`` are module-level names.  An ``ElasticNetCV`` over small
    alpha / l1_ratio grids is fitted as well; its result is not reported.
    """
    fixed_model = ElasticNet(alpha=0.001, l1_ratio=0.8, normalize=True)
    en_scores = cross_val_score(fixed_model, X, y, cv=16, scoring='r2')

    alpha_grid = (0.1, 0.01, 0.005, 0.0025, 0.001)
    ratio_grid = (0.1, 0.25, 0.5, 0.75, 0.8)
    searched_model = ElasticNetCV(alphas=alpha_grid, l1_ratio=ratio_grid,
                                  normalize=True)
    searched_model.fit(X, y)

    mean_r2 = en_scores.mean()
    print("ElasticNet : ", mean_r2)
def elasticnet_reg(x, y):
    """Fit a 20-fold ``ElasticNetCV`` and return its coefficients.

    The R^2 on the fitting data and the selected alpha are printed.
    """
    model = ElasticNetCV(cv=20)
    model.fit(x, y)

    r_square = model.score(x, y)
    chosen_alpha = model.alpha_
    print('ElasticNet R square', r_square)
    print('ElasticNet Alpha', chosen_alpha)

    return model.coef_
def enetCV():
    """Fit an elastic net on ``base_X`` / ``base_Y`` and save predictions.

    Uses 5 shuffled 80/20 splits for the internal cross validation, prints
    the score on the fitting data and writes the predictions for the
    module-level ``X_test`` to ``elasticCV.csv`` through ``write_to_file``.
    """
    print ("Doing elastic net")

    # NOTE(review): ``cross_validation.ShuffleSplit(n, n_iter=...)`` is the
    # pre-0.18 scikit-learn API, kept as in the original.
    splitter = cross_validation.ShuffleSplit(len(base_X), n_iter=5,
                                             test_size=0.2, random_state=0)
    model = ElasticNetCV(cv=splitter)
    model.fit(base_X, base_Y)

    print ("Score = %f" % model.score(base_X, base_Y))
    write_to_file("elasticCV.csv", model.predict(X_test))
def fit_elastic_net_cv(X, y, nfolds=5):
    """Fit ``ElasticNetCV`` over fixed l1_ratio and alpha grids.

    Naming relative to the glmnet R package: ``l1_ratio`` here is glmnet's
    ``alpha``, and ``alpha`` here is glmnet's ``lambda``.

    Returns the fitted model.
    """
    from sklearn.linear_model import ElasticNetCV

    ratio_grid = np.linspace(0.01, 1.0, 20)
    penalty_grid = np.exp(np.linspace(-6, 5, 20))

    model = ElasticNetCV(l1_ratio=ratio_grid, alphas=penalty_grid, cv=nfolds)
    model.fit(X, y)
    return model
def _elasticnetcv(*, train, test, x_predict=None, metrics, l1_ratio=0.5,
                  eps=0.001, n_alphas=100, alphas=None, fit_intercept=True,
                  normalize=False, precompute='auto', max_iter=1000,
                  tol=0.0001, cv=None, copy_X=True, verbose=0, n_jobs=None,
                  positive=False, random_state=None, selection='cyclic'):
    """Fit ``ElasticNetCV``, score it on ``test`` and optionally predict.

    For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV

    Args:
        train: Pair ``(features, targets)`` used for fitting.
        test: Pair ``(features, targets)`` used for scoring.
        x_predict: Optional features to predict after scoring.
        metrics: One of ``'mse'``, ``'rmse'`` or ``'mae'``.
        Remaining keywords are forwarded unchanged to ``ElasticNetCV``.

    Returns:
        ``('ElasticNetCV', error, predictions)`` where ``predictions`` is
        ``None`` when ``x_predict`` is ``None``.

    Raises:
        ValueError: If ``metrics`` is not a supported name.  This is
            checked before fitting; previously an unknown name surfaced
            only after the fit, as an ``UnboundLocalError``.
    """
    if metrics == 'mse':
        error_fn = _mse
    elif metrics == 'rmse':
        error_fn = _rmse
    elif metrics == 'mae':
        error_fn = _mae
    else:
        raise ValueError(
            "metrics must be 'mse', 'rmse' or 'mae', got {!r}".format(metrics))

    model = ElasticNetCV(l1_ratio=l1_ratio, eps=eps, n_alphas=n_alphas,
                         alphas=alphas, fit_intercept=fit_intercept,
                         normalize=normalize, precompute=precompute,
                         max_iter=max_iter, tol=tol, cv=cv, copy_X=copy_X,
                         verbose=verbose, n_jobs=n_jobs, positive=positive,
                         random_state=random_state, selection=selection)
    model.fit(train[0], train[1])
    model_name = 'ElasticNetCV'

    y_hat = model.predict(test[0])
    accuracy = error_fn(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
def elasticNet(argv):
    """Drop the features an elastic net considers irrelevant.

    Reads the CSV at ``argv`` (first column is the index, ``target`` is the
    response), fits a normalised ``ElasticNetCV`` and removes every feature
    whose coefficient magnitude is below 1e-5.  The reduced table is
    written to ``argv + '.enet.csv'``.

    Args:
        argv: Path of the input CSV file.

    Returns:
        The fitted ``ElasticNetCV`` model.
    """
    data = pd.read_csv(argv, index_col=0)
    y = data['target']
    X = data.drop('target', axis=1)
    featureNames = X.columns.values

    enet = ElasticNetCV(n_jobs=-1, normalize=True)
    enet.fit(X, y)

    # Compare the magnitude: the plain ``coef_ < 1e-5`` test also discarded
    # every feature with a (possibly large) negative coefficient.
    dropIdx = featureNames[abs(enet.coef_) < 1e-5]
    # Single-argument print calls give the same output on Python 2 and 3.
    print("Elastic Net drop: %d" % len(dropIdx))
    print(dropIdx)

    data.drop(dropIdx, axis=1, inplace=True)
    data.to_csv(argv+'.enet.csv')
    return enet
def LCCB_coevo(fitness_fn, pop):
    """Score individuals by their weight in a linear combination.

    The semantics of all individuals form the columns of a design matrix;
    an ``ElasticNetCV`` is fitted against ``fitness_fn.train_y`` and each
    individual is rebuilt with fitness ``-abs(coefficient)``.  Individuals
    without a phenotype, with the sentinel fitness, or with non-finite
    semantics contribute a zero column.  The RMSE of the combined model is
    printed.

    NOTE(review): the rebuilt, sorted population is only bound to the local
    name ``pop``; nothing is returned in the code visible here - confirm
    whether a ``return pop`` is missing.
    NOTE(review): ``sys.maxint`` exists on Python 2 only.
    """
    y = fitness_fn.train_y

    # Collect one column per individual (semantics were computed earlier)
    # and stack them once.  Growing the matrix with ``np.c_`` on every
    # iteration was quadratic and left ``X`` one-dimensional for a
    # single-individual population.
    columns = []
    for ind in pop:
        if (ind.phenotype and ind.fitness != sys.maxint
            and all(np.isfinite(ind.semantics))):
            columns.append(ind.semantics)
        else:
            print("Omitting a column")
            columns.append(np.zeros(len(y)))
    X = np.column_stack(columns)

    # FIXME FFX processes the data so that has zero mean and unit
    # variance before applying the LR... should we do that?

    # ElasticNet with cross-validation picks the regularisation strength
    # automatically.
    model = ElasticNetCV()
    model.fit(X, y)
    coefs = model.coef_
    output = model.predict(X)
    rmse = fitness_fn.rmse(y, output)
    print("rmse", rmse)

    # Assign the magnitude of coefficients as individual fitness
    # values. Have to construct a new individual because tuples are
    # immutable. FIXME this is not a great method -- it's likely that
    # the population will converge on one or a few basis functions,
    # and then the performance of the ENet will decrease because there
    # won't be enough independent basis functions to work with.
    pop = [variga.Individual(genome=pop[i].genome,
                             used_codons=pop[i].used_codons,
                             fitness=-abs(coefs[i]),
                             phenotype=pop[i].phenotype,
                             readable_phenotype=pop[i].readable_phenotype,
                             semantics=pop[i].semantics)
           for i in range(len(pop))]
    pop.sort(key=variga.ind_compare)
def predict(train):
    """Fill in missing ratings with a per-user elastic-net regression.

    Entries of ``train`` that are > 0 count as rated.  After normalising
    with ``NormalizePositive``, each user with more than 5 ratings gets a
    model fitted on the other users' (normalised) ratings of the movies he
    or she rated; that model then predicts the user's unrated movies.

    Returns the filled matrix mapped back through the normaliser's
    ``inverse_transform``.
    """
    rated = (train > 0)
    reg = ElasticNetCV(fit_intercept=True,
                       alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    norm = NormalizePositive()
    train = norm.fit_transform(train)

    filled = train.copy()
    # Iterate over all users.
    for u in range(train.shape[0]):
        bu = rated[u]
        if np.sum(bu) <= 5:
            continue
        # Remove the current user from the training data.
        others = np.delete(train, u, axis=0)
        reg.fit(others[:, bu].T, train[u, bu])
        # Fill in only the values that were missing.
        missing = ~bu
        filled[u, missing] = reg.predict(others[:, missing].T)

    return norm.inverse_transform(filled)
def regress(x, y, title):
    """Fit an elastic net on ``(x, y)`` and plot prediction against target.

    Prints the model's score on the fitting data and shows a scatter plot
    with the predictions on the x axis and the targets on the y axis.  When
    ``title`` contains ``"Boston"`` a perfect-fit line is added.

    Args:
        x: Feature matrix.
        y: Target values.
        title: Text appended to the plot title.
    """
    clf = ElasticNetCV(max_iter=200, cv=10,
                       l1_ratio=[.1, .5, .7, .9, .95, .99, 1])
    clf.fit(x, y)
    # Single-argument print gives the same output on Python 2 and 3.
    print("Score {}".format(clf.score(x, y)))

    pred = clf.predict(x)
    plt.title("Scatter plot of prediction and " + title)
    plt.xlabel("Prediction")
    plt.ylabel("Target")
    # Put the predictions on the x axis so the data agrees with the axis
    # labels (the two arguments used to be swapped).
    plt.scatter(pred, y)

    # Show perfect fit line
    if "Boston" in title:
        plt.plot(y, y, label="Perfect Fit")
        plt.legend()

    plt.grid(True)
    plt.show()
def predict(train):
    """Fill in missing ratings with a per-user elastic-net regression.

    Entries of ``train`` that are > 0 count as rated.  After normalising
    with ``NormalizePositive``, each user with more than 5 ratings gets a
    model fitted on the other users' (normalised) ratings of the movies he
    or she rated; that model then predicts the user's unrated movies.

    Returns the filled matrix mapped back through the normaliser's
    ``inverse_transform``.
    """
    rated = (train > 0)
    reg = ElasticNetCV(fit_intercept=True,
                       alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    norm = NormalizePositive()
    train = norm.fit_transform(train)

    filled = train.copy()
    for u in range(train.shape[0]):
        bu = rated[u]
        if np.sum(bu) <= 5:
            # Too few ratings to fit a model for this user.
            continue
        # Train on every user except the current one.
        others = np.delete(train, u, axis=0)
        reg.fit(others[:, bu].T, train[u, bu])
        # Only the entries that had no rating are overwritten.
        missing = ~bu
        filled[u, missing] = reg.predict(others[:, missing].T)

    return norm.inverse_transform(filled)
def enet_granger_causality_cv(X_t, y_t, cv, alphas, top_num=None, top_perc=4,max_iter=100, lambdas=None):
    """Return the best (l1_ratio, penalty) pairs found by ``ElasticNetCV``.

    Naming follows glmnet: ``alphas`` are the ``l1_ratio`` candidates and
    ``lambdas`` are the penalty strengths (scikit-learn's ``alphas``).

    Args:
        X_t: Feature matrix.
        y_t: Target values.
        cv: Cross-validation setting passed to ``ElasticNetCV``.
        alphas: Sequence of ``l1_ratio`` candidates.
        top_num: Number of pairs to keep; derived from ``top_perc`` when
            ``None``.
        top_perc: Percentage of all pairs to keep when ``top_num`` is
            ``None``.
        max_iter: Iteration limit passed to ``ElasticNetCV``.
        lambdas: Optional sequence of penalty strengths; when ``None`` the
            model chooses its own grid.

    Returns:
        DataFrame with columns ``lambda.min``, ``alpha`` and ``error.min``
        for the pairs with the smallest summed CV error (as selected by
        ``get_min_k``).
    """
    # ``is not None`` instead of ``!= None``: comparing a NumPy array with
    # ``!=`` is element-wise and cannot be used as a condition.
    if lambdas is not None:
        use_lambdas = np.tile(lambdas, len(alphas)).reshape(len(alphas), len(lambdas))
        enet = ElasticNetCV(l1_ratio=alphas, alphas=use_lambdas, cv=cv, max_iter=max_iter)
        fit = enet.fit(X_t, y_t)
        use_lambdas = fit.alphas_
        use_lambdas = np.tile(use_lambdas, len(alphas)).reshape(len(alphas), len(lambdas))
        # Single-argument print calls keep the Python 2 output unchanged
        # and also run on Python 3.
        print("Used lambdas")
        print(use_lambdas)
    else:
        enet = ElasticNetCV(l1_ratio=alphas, cv=cv, max_iter=max_iter)
        fit = enet.fit(X_t, y_t)
        use_lambdas = fit.alphas_

    # use_lambdas is a matrix: one row per l1_ratio, one column per penalty.
    cv_mses = enet.mse_path_.sum(axis=2).flatten()
    cv_alphas = np.repeat(alphas, use_lambdas.shape[1])
    cv_lambdas = use_lambdas.flatten()

    if top_num is None:
        print("Num cv alphas:  {}".format(len(cv_alphas)))
        top_num = int(len(cv_alphas) * top_perc / 100.0)
        print("Top num  {}".format(top_num))

    # Keep the pairs with the smallest error.
    top_indices, top_mses = get_min_k(cv_mses, top_num)
    top_lambdas = cv_lambdas[top_indices]
    top_alphas = cv_alphas[top_indices]

    top_df = pd.DataFrame(data={"lambda.min": top_lambdas,
                                "alpha": top_alphas,
                                "error.min": top_mses})
    return top_df
def train_model(data, target, n_iter, rate):
    """Bootstrap ElasticNetCV fits, select features, train final models.

    ``n_iter`` bootstrap resamples are fitted with ``ElasticNetCV``; the
    collected coefficients are turned into per-feature rates by
    ``make_rates``.  Two candidate selections are formed - features whose
    rate is at least ``rate``, and the 50 highest-rated features - and the
    smaller one is used (the top-50 set on a tie).

    Returns:
        ``(model_full, model, selected, coefs)``: an ``ElasticNetCV`` fitted
        on all features, a ``LinearRegression`` fitted on the selected
        features, the selected column indices, and the stacked bootstrap
        coefficients.
    """
    def new_enet():
        # Every elastic net in this function shares the same settings.
        return ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                            max_iter=10000, n_jobs=4)

    n_samples = len(data)
    bootstrap_coefs = []
    for i in range(n_iter):
        print("bootstrap iter {}".format(i))
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        booted = new_enet()
        booted.fit(data[indices], target[indices])
        bootstrap_coefs.append(booted.coef_)
    coefs = np.vstack(bootstrap_coefs)

    rate_selected = make_rates(coefs)
    by_rate = np.nonzero(rate_selected >= rate)[0]
    top_fifty = np.argsort(rate_selected)[-50:]
    if len(by_rate) < len(top_fifty):
        selected = by_rate
    else:
        selected = top_fifty

    model = LinearRegression()
    model.fit(data[:, selected], target)

    model_full = new_enet()
    model_full.fit(data, target)
    return model_full, model, selected, coefs
def eNetModel(data, labels, featureNames, texts, documents, nFolds):
    """Cross-validate a thresholded elastic net and average its path coefficients.

    For every fold an ElasticNetCV is fitted on the training rows.  A test
    sample counts as correct when its label is 1 and the prediction is above
    0.5, or its label is 0 and the prediction is below 0.5.  Folds may hold
    any number of test samples (leave-one-out is the special case of one).

    Parameters
    ----------
    data, labels : indexable by the index arrays produced by ``KFold``.
    featureNames, documents : unused; kept for interface compatibility.
    texts : only its length is used, as the number of samples for ``KFold``.
    nFolds : number of folds.

    Returns
    -------
    (accuracy, mean_coefs) : ``accuracy`` is a ``Decimal`` (correct
    predictions divided by ``len(data)``); ``mean_coefs`` is the per-feature
    coefficient, averaged first over the regularisation path of each fold and
    then over folds.
    """
    kf = KFold(len(texts), n_folds=nFolds)
    acc = 0
    mean_coefs = []
    for train, test in kf:
        data_train, data_test = data[train], data[test]
        label_train, label_test = labels[train], labels[test]

        # An explicit alpha grid is supplied, so no n_alphas is passed
        # (it only matters when the grid is generated automatically).
        enet = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                            alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
        enet.fit(data_train, label_train)

        data_train = np.asarray(data_train, dtype=float)
        label_train = np.asarray(label_train, dtype=float)
        vals = enet.path(data_train, label_train)
        # vals[1] holds the coefficients along the path; average over the path.
        mean_coefs.append(np.mean(vals[1], axis=1))

        # Predict once and score every test sample of the fold.
        predictions = np.ravel(enet.predict(data_test))
        truth = np.ravel(label_test)
        hits = ((truth == 1) & (predictions > 0.5)) | ((truth == 0) & (predictions < 0.5))
        acc += int(np.sum(hits))  # plain int so Decimal() accepts it

        if len(mean_coefs) % 10 == 0:
            print('%s out of %s subs finished' % (len(mean_coefs), len(data)))

    mean_coefs = np.mean(np.array(mean_coefs), axis=0)
    return Decimal(acc) / Decimal(len(data)), mean_coefs
def elastic_net_cv(self, drug_name, l1_ratio=0.5, alphas=None, n_folds=10):
    """Fit a cross-validated elastic net for one drug and return its best alpha.

    The data come from ``self._get_one_drug_data(drug_name)``.  The fitted
    estimator is stored on ``self.encv`` and the selected ``alpha_`` is
    printed and returned.
    """
    features, response = self._get_one_drug_data(drug_name)
    estimator = ElasticNetCV(l1_ratio=l1_ratio, alphas=alphas, cv=n_folds)
    self.encv = estimator.fit(features, response)
    best_alpha = self.encv.alpha_
    print("Best alpha on %s folds : %s" % (n_folds, best_alpha))
    return best_alpha
# Fit on the full data set; `predicted` is only used by the scatter plot below.
lr.fit(X,y)
predicted=lr.predict(X)
'''validation'''
# 5-fold CV: refit `lr` per fold and collect out-of-fold predictions in `p`.
kf=KFold(len(X),n_folds=5)
# NOTE(review): zeros_like copies y's dtype, so an integer y would truncate
# the stored predictions - confirm y is floating point.
p=np.zeros_like(y)
for train,test in kf:
    lr.fit(X[train],y[train])
    p[test]=lr.predict(X[test])
rmse_cv=np.sqrt(mean_squared_error(p,y))
print "RMSE of 5-fold cv {:.2}".format(rmse_cv)
'''ElasticNet'''
from sklearn.linear_model import ElasticNetCV
met=ElasticNetCV(n_jobs=-1)
p=np.zeros_like(y)
for t,tst in kf:
    met.fit(X[t],y[t])
    p[tst]=met.predict(X[tst])
# R^2 of the out-of-fold predictions.
p2=r2_score(y,p)
# `met` is the estimator left over from the last fold, scored on all of X.
print met.score(X,y)
print p2,"Elastic"
# The script stops here, so the plotting code below never runs.
exit()
plt.scatter(predicted,y)
plt.xlabel("Predicted")
plt.ylabel("Actual ")
plt.plot([y.min(),y.max()],[[y.min()],[y.max()]])
plt.show()
plt.title('Linear Regression with sklearn of Housing prices')
# Save before showing: with interactive backends the figure can be gone once
# show() returns, which would leave image5.png blank.
plt.savefig('image5.png')
plt.show()
plt.close()

print('Linear Regression')
print(lr.rank_)
print(list(zip(names, lr.coef_)))

# Ridge regression; the penalty is picked from the listed alphas.
from sklearn.linear_model import RidgeCV
rcv = RidgeCV(
    alphas=np.array([0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0]),
    normalize=True)
rcv.fit(boston.data, boston.target)
print('RidgeCV')
print(rcv.alpha_)
print(list(zip(boston.feature_names, rcv.coef_)))

# Lasso over the same alpha grid.
from sklearn.linear_model import LassoCV
lcv = LassoCV(
    alphas=np.array([0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0]),
    normalize=True)
lcv.fit(boston.data, boston.target)
print('LassoCV')
print(lcv.alpha_)
print(list(zip(boston.feature_names, lcv.coef_)))

# Elastic net: grid over both alpha and l1_ratio.
from sklearn.linear_model import ElasticNetCV
encv = ElasticNetCV(alphas=np.array([0.0001, 0.0003, 0.01, 0.03, 0.1, 0.3, 1.0]),
                    l1_ratio=np.array([0.5, 0.8, 0.9, 0.95, 0.99, 0.995, 1.0]),
                    normalize=True)
encv.fit(boston.data, boston.target)
print('ElasticNetCV')
print(list(zip(boston.feature_names, encv.coef_)))
md=dnn_reg(X_train,y_train,X_test,y_test)
reg_eval(X_test,y_test,md)


###Lasso CV regression
def reg_eval2(y_test,model):
    """Print MSE, R2 and explained variance of ``model`` on the test split.

    ``model`` must already be fitted.  Predictions are made on the
    module-level ``X_test`` (it is not a parameter) and compared with
    ``y_test``.
    """
    y_pred=model.predict(X_test)
    print("evaluation the results for model:",model)
    print("MSE:",mean_squared_error(y_test,y_pred))
    print("R2:",r2_score(y_test,y_pred))
    print("EVS:",explained_variance_score(y_test,y_pred))


lasso = LassoCV(cv=5, random_state=0,max_iter=10000)
lasso.fit(X_train,y_train)
reg_eval2(y_test,lasso)

#ElasticNet Regression
ela = ElasticNetCV(l1_ratio=0.8,normalize=True,max_iter=5000,random_state=77)
ela.fit(X_train,y_train)
print("R square:",ela.score(X_test,y_test))
reg_eval2(y_test,ela)

#SVR Regression
from sklearn.svm import LinearSVR
LSVR=LinearSVR(epsilon=0.1,random_state=0, tol=1e-5,max_iter=10000)
# scaler=RobustScaler()
# pipe=Pipeline(steps=[("scaling",scaler),("rg",LSVR)])
LSVR.fit(X_train,y_train)
# The original call ended in an unbalanced ")" and did not parse.
reg_eval2(y_test,LSVR)
import numpy as np
# Benchmark: time ElasticNetCV.fit for every (n_jobs, n_alphas) combination.
X, y = make_sparse_data(n_samples=500, n_features=2000, n_informative=200)
n_cores = [1, 2, 4]
n_alpha = [5, 10, 50, 100]
# One accumulator per (core, alpha) pair: 3 cores x 4 alpha settings = 12 slots.
times = [0] * 12
counter = 0
# Three repetitions; counter % 12 maps each run back onto its slot, so
# `times` ends up holding the sum over the repetitions.
for _ in range(3):
    for core in n_cores:
        for alpha in n_alpha:
            clf = ElasticNetCV(n_jobs=core, n_alphas=alpha, l1_ratio=0.5, cv=10)
            print "core = %d, alpha = %d" % (core, alpha)
            t = time()
            clf.fit(X, y)
            times[counter%12] += (time() - t)
            print times
            counter += 1
# Got after doing the above. Just for future reference.
# NOTE(review): one list per n_cores entry, one value per n_alpha entry; the
# "core3" lists presumably belong to n_jobs=4 and the _mp/_t suffixes to two
# parallel backends - confirm.
core1_mp = [57.457534631093345, 72.31527137756348, 210.2204163869222, 379.9918119907379]
core2_mp = [55.89718206723531, 51.196732918421425, 138.35079900423685, 239.67310031255087]
core3_mp = [42.53018967310587, 49.97517212231954, 122.26631005605061, 204.76643363634744]
core1_t = [60.99967805544535, 75.41305232048035, 219.61244002978006, 390.601344982783]
core2_t = [46.21716833114624, 54.701584259668984, 144.06910300254822, 242.6696043809255]
core3_t = [43.21849703788757, 49.07820804913839, 122.74103697141011, 205.75086871782938]
# Three stacked axes sharing the x axis; `ind` gives four x positions,
# matching the four n_alpha settings.
_, [axis1, axis2, axis3] = pl.subplots(3, 1, sharex=True)
ind = np.arange(4)
#### assessing performance of the negative binomial regression model performance_negativebinomial = [] for x in [0.01,0.1,1,5,10]: cost = [] for a,b in cross_validation_object: resultingmodel = sm.NegativeBinomial(Y[a],X[a],loglike_method = 'geometric') #res = resultingmodel.fit(disp=False, maxiter = 200) res2 = resultingmodel.fit_regularized(alpha = x, maxiter = 200) cost.append(mean_squared_error(res2.predict(X[b]), Y[b])) performance_negativebinomial.append(np.mean(cost)) ##### Log linear model ########## not even close. from sklearn.linear_model import ElasticNetCV linear_fit = ElasticNetCV(cv = cross_validation_object, alphas = [0.01,0.1,1,5,10]) linear_fit.fit(X,np.log(Y+1)) mean_squared_error(np.exp(linear_fit.predict(X)) - 1, Y) ########## creating final model using train data + test data X_test,Y_test,junk = prepare_for_model('Dogs_Final_Test.csv',1) X,Y,junk = prepare_for_model('Dogs_Final_Train.csv',1) scaler = MinMaxScaler([0,1]) X_all = scaler.fit_transform(np.vstack((X_test,X))) Y_all = np.hstack((Y_test,Y)) Y_all = np.array([30 if i > 30 else i for i in Y_all]) final_model = sm.NegativeBinomial(Y_all,X_all,loglike_method = 'geometric') res2 = final_model.fit_regularized( alpha = 5, maxiter = 200)
from sklearn.linear_model import ElasticNetCV

# `.values` is used instead of `.as_matrix()`, which newer pandas no longer
# provides; both return the underlying ndarray.
features = sales_merged[['PMI_Portfolio_AVB_Boost', 'PMI_Portfolio_PFP_Boost',
                         'PMI_Portfolio_PPRP', 'PMI_Portfolio_SA',
                         'SubFam_Hostess', 'SubFam_PFP_Boost', 'SubFam_RAP',
                         'SubFam_SA', 'Fam_AVB_Boost', 'Fam_Hostess',
                         'Fam_PFP_Boost', 'Fam_RAP', 't', 'Affinity',
                         'Brand Character', 'Functional Performance']].values
target = sales_merged['Volume_Sales'].values

# Elastic net with the l1_ratio chosen by internal cross-validation.
met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99])

# Outer 5-fold CV: collect out-of-fold predictions for every sample.
kf = KFold(len(target), n_folds=5)
# Force a float buffer so an integer target column cannot truncate the
# predictions written into it.
pred = np.zeros_like(target, dtype=float)
for train, test in kf:
    met.fit(features[train], target[train])
    pred[test] = met.predict(features[test])

print('[EN CV] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
print('[EN CV] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred)))
print('')
def do_validation(data_path, steps=10):
    """Print the average hold-out ROC AUC of several estimators.

    For each of the first ``steps`` entries of ``initialize(data_path)`` the
    merged data set is shuffled, rows 0-99 are used for fitting and rows
    100-399 for scoring.  Every estimator is refitted per entry; its ROC AUC
    is accumulated and the mean over ``steps`` is printed.  Nothing is
    returned.

    The first group of models sees every column except the identifiers; the
    second group ("linear models") additionally drops a fixed list of summary
    columns.  (An earlier variant also dropped the speed1..speed80 columns
    for the first group.)
    """
    allfiles = initialize(data_path)

    # (display name, estimator, score with predict_proba instead of predict)
    full_feature_models = [
        ("GBM", GradientBoostingRegressor(n_estimators=100, learning_rate=0.05,
                                          max_depth=6, min_samples_leaf=5,
                                          subsample=0.5), False),
        ("AdaBoost", AdaBoostRegressor(n_estimators=200, learning_rate=1), False),
        ("Extra Trees", ExtraTreesRegressor(n_estimators=200, n_jobs=-1,
                                            min_samples_leaf=5), False),
        ("RF", RandomForestRegressor(n_estimators=200, max_features=4,
                                     min_samples_leaf=5), False),
        ("KN", KNeighborsRegressor(n_neighbors=25), False),
    ]

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    reduced_feature_models = [
        ("Logit", LogisticRegression(tol=0.05), True),
        # NOTE(review): `probability` may not be accepted by SVR in current
        # scikit-learn releases - confirm against the pinned version.
        ("SVR", SVR(kernel="linear", probability=True), False),
        ("Ridge", Ridge(alpha=18), False),
        ("BayesianRidge", BayesianRidge(n_iter=500), False),
        ("Elastic Net", ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05), False),
        ("Neural Networks", classifier, False),
    ]

    id_columns = ['driver', 'trip']
    linear_excluded = id_columns + ['distance', 'sd_acceleration', 'final_angle',
                                    'mean_acceleration', 'mean_avg_speed',
                                    'sd_inst_speed', 'sd_avg_speed',
                                    'mean_inst_speed', 'points']
    model_groups = [(full_feature_models, id_columns),
                    (reduced_feature_models, linear_excluded)]

    totals = {}
    for models, _ in model_groups:
        for name, _, _ in models:
            totals[name] = 0.0

    for i in range(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y
        # Shuffle DF.
        df = df.reindex(np.random.permutation(df.index))
        train = df[:100]
        test = df[100:400]
        label = train['label']
        Y = test['label']

        # Models are fitted in the same order as before so estimators that
        # draw from the global random state see the same sequence.
        for models, excluded in model_groups:
            # Drop the label together with the excluded columns instead of
            # deleting it from a slice of df.
            X_train = train.drop(excluded + ['label'], axis=1)
            X_test = test.drop(excluded + ['label'], axis=1)
            for name, model, use_proba in models:
                totals[name] += _holdout_auc(model, X_train, label, X_test, Y, use_proba)

    for models, _ in model_groups:
        print("")
        for name, _, _ in models:
            print("{}: {}".format(name, totals[name] / steps))
    print("")


def _holdout_auc(model, X_train, label, X_test, Y, use_proba=False):
    """Fit ``model`` on the training split and return its ROC AUC on the test split."""
    model.fit(X_train, label)
    if use_proba:
        # Probability of the second class.
        Y_hat = model.predict_proba(X_test)[:, 1]
    else:
        Y_hat = model.predict(X_test)
    fpr, tpr, _ = metrics.roc_curve(Y, Y_hat)
    return metrics.auc(fpr, tpr)
# Baseline: elastic net with a fixed alpha (defined earlier in the script)
# and a fixed l1_ratio.
l1_ratio=0.7
enet = ElasticNet(alpha = alpha, l1_ratio = l1_ratio)
enet_model = enet.fit(X_train, y_train)
y_pred_enet = enet_model.predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)
# r^2 on test data : 0.100723
# plt.plot(enet.coef_, label='Elastic net coefficients')
# plt.plot(coef, '--', label='original coefficients')
# plt.legend(loc='best')
# plt.title("R^2: %f" % (r2_score_enet))
# plt.show()

# set the parameters alpha and l1_ratio by cross-validation
from sklearn.linear_model import ElasticNetCV
enetcv = ElasticNetCV(l1_ratio=[.1,.2,.3,.4,.5,.6,.7,.8,.9])
enetcv_model = enetcv.fit(X_train, y_train)
y_pred_enetcv = enetcv_model.predict(X_test)
r2_score_enetcv = r2_score(y_test, y_pred_enetcv)
print(enetcv)
print("r^2 on test data : %f" % r2_score_enetcv)
# r^2 on test data : 0.22553
# Fails the script if the cross-validated model does not beat the fixed one
# on the test split.
assert(r2_score_enetcv > r2_score_enet)
def fit(self, raw_array, aux_data_a_d=None, diff=False, feature_s_l=None, holdout_col=0, lag=1, positive_control=False, regression_algorithm_s = 'elastic_net', **kwargs):
    """
    Performs an auto-regression of a given lag on the input array. Axis 0
    indexes observations (schools) and axis 1 indexes years. For
    holdout_col>0, the last holdout_col years of data will be withheld from
    the fitting, which is ideal for training the algorithm.

    Parameters
    ----------
    raw_array : 2-D array, observations x years.
    aux_data_a_d : optional mapping from feature name to a 2-D array; each
        feature named in ``feature_s_l`` contributes ``lag`` extra regressors.
    diff : if True, regress on first differences along the year axis.
    feature_s_l : names of the auxiliary features to use (default: none).
    holdout_col : number of trailing years withheld from the fit.
    lag : number of previous years used as regressors.
    positive_control : if True, the target itself is appended as a regressor.
    regression_algorithm_s : one of 'elastic_net', 'gaussian_process',
        'gradient_boosting', 'linear_regression', 'random_forest'.
    **kwargs : accepted and ignored.

    Returns
    -------
    The fitted model.  For 'elastic_net' and 'linear_regression' the
    coefficients are also appended to ``coeff_list.txt`` under
    ``config.plot_path``.

    Raises
    ------
    ValueError
        If ``regression_algorithm_s`` is not one of the supported names.
    """
    known_algorithm_l = ['elastic_net', 'gaussian_process', 'gradient_boosting',
                         'linear_regression', 'random_forest']
    # Fail early and clearly; previously an unknown name only surfaced later
    # as an unbound `model`.
    if regression_algorithm_s not in known_algorithm_l:
        raise ValueError('Unknown regression_algorithm_s {0!r}; expected one of {1}'
                         .format(regression_algorithm_s, known_algorithm_l))
    if feature_s_l is None:
        feature_s_l = []

    def _prepare(values):
        """Drop the held-out years and optionally first-difference the rest."""
        if holdout_col > 0:
            values = values[:, :-holdout_col]
        return np.diff(values, 1, axis=1) if diff else values

    def _lagged_columns(values):
        """Return one flattened regressor column per lag offset."""
        return [values[:, i:-lag+i].reshape(-1, 1) for i in range(lag)]

    # Create model and fit parameters
    array = _prepare(raw_array)
    Y = array[:, lag:].reshape(-1)
    # Y = X_t = A_1 * X_(t-lag) + A_2 * X_(t-lag+1)) + ... + A_lag * X_(t-1) + A_(lag+1)
    # The empty seed column keeps the original starting point (and dtype) of X.
    column_l = [np.ndarray((Y.shape[0], 0))]
    column_l.extend(_lagged_columns(array))
    if positive_control:
        column_l.append(array[:, lag:].reshape(-1, 1))
    if aux_data_a_d:
        for feature_s in feature_s_l:
            column_l.extend(_lagged_columns(_prepare(aux_data_a_d[feature_s])))
    X = np.concatenate(column_l, axis=1)

    # Fill missing values in the regressors and in the target.
    estimatorX = Imputer(axis=0)
    X = estimatorX.fit_transform(X)
    estimatorY = Imputer(axis=0)
    Y = estimatorY.fit_transform(Y.reshape(-1, 1)).reshape(-1)

    if regression_algorithm_s == 'elastic_net':
        l1_ratio_l = [.1, .5, .7, .9, .95, .99, 1]
        alpha_l = np.logspace(-15, 5, num=11).tolist()
        # It's too slow when I make it high, so I'll keep it low for now.
        # An int, not the float 1e5: max_iter is an iteration count.
        max_iter = 100000
        model = ElasticNetCV(l1_ratio=l1_ratio_l, alphas=alpha_l, max_iter=max_iter,
                             fit_intercept=True, normalize=True)
    elif regression_algorithm_s == 'gaussian_process':
        # This currently gives the following error: "Exception: Multiple input
        # features cannot have the same target value."
        model = GaussianProcess()
    elif regression_algorithm_s == 'gradient_boosting':
        model = GradientBoostingRegressor(max_features='sqrt')
    elif regression_algorithm_s == 'linear_regression':
        model = LinearRegression(fit_intercept=True, normalize=True)
    else:  # 'random_forest' (the name was validated above)
        model = RandomForestRegressor(max_features='auto')
    model.fit(X, Y)

    if regression_algorithm_s in ['elastic_net', 'linear_regression']:
        with open(os.path.join(config.plot_path, 'coeff_list.txt'), 'a') as f:
            f.write('Lag of {0:d}:\n'.format(lag))
            # f.write('\nElastic net: R^2 = %0.5f, l1_ratio = %0.2f, alpha = %0.1g' %
            #         (model.score(X, Y), model.l1_ratio_, model.alpha_))
            coeff_t = model.coef_
            assert(not positive_control) # The coefficients won't currently line up
            # The first `lag` coefficients belong to the main series ...
            for i_lag in range(lag):
                f.write('\ti_lag = {0:d}: {1:0.2g}\n'.format(lag-i_lag, coeff_t[i_lag]))
            # ... followed by `lag` coefficients per auxiliary feature.
            for i_feature, feature_s in enumerate(feature_s_l):
                for i_lag in range(lag):
                    f.write('\t{0}:\n\t\ti_lag = {1:d}: {2:0.2g}\n'.format(feature_s, lag-i_lag, coeff_t[lag*(i_feature+1) + i_lag]))
    return model
# NOTE(review): the `if` (and any enclosing loop) that this `else` belongs to
# is above this excerpt; its original indentation is unknown.
else:
    binary_y_pre.append(0)
binary_y = np.array(binary_y_pre)

# Estimators: regressors are fitted on y, classifiers on binary_y.
coef_path_linear_cv = LinearRegression(normalize=Normalize,fit_intercept=Fit_Intercept)
coef_path_lasso_cv = LassoCV(normalize=Normalize, max_iter=Max_Iter, copy_X=True, cv=CV, verbose=Verbose, fit_intercept=Fit_Intercept, tol=Tol)#, alphas=Alphas)
coef_path_elastic_cv = ElasticNetCV(normalize=Normalize,max_iter=Max_Iter, tol=Tol)#,alphas=Alphas)
coef_path_logistic_cv = LogisticRegression( tol=Tol)
coef_path_binary_x_logistic_cv = LogisticRegression( tol=Tol)
coef_path_forest_cv = RandomForestClassifier(n_estimators = N_Estimators, max_features=number_of_features)

# Second feature matrix built from the corpus by `vectorizer_binary`.
binary_X = vectorizer_binary.fit_transform(corpus)

# Fit each estimator on the full data (coef_path_linear_cv is not fitted here).
coef_path_forest_cv.fit(X,binary_y)
coef_path_lasso_cv.fit(X,y)
coef_path_binary_x_logistic_cv.fit(binary_X,binary_y)
coef_path_logistic_cv.fit(X,binary_y)
coef_path_elastic_cv.fit(X,y)

# Cross-validated scores: ROC AUC for the classifiers, `Scoring` for the regressors.
forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
lasso_cv_score = cross_validation.cross_val_score(coef_path_lasso_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
elastic_cv_score = cross_validation.cross_val_score(coef_path_elastic_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
logistic_cv_score = cross_validation.cross_val_score(coef_path_logistic_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
binary_x_logistic_cv_score = cross_validation.cross_val_score(coef_path_binary_x_logistic_cv, binary_X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')

# In-sample predictions plus fitted attributes.  `get_params` is stored as a
# bound method; it is not called here.
forest_results_parameters = [ coef_path_forest_cv.predict(X), coef_path_forest_cv.get_params, coef_path_forest_cv.feature_importances_, coef_path_forest_cv.classes_, coef_path_forest_cv.n_classes_]
forest_scores = [forest_cv_score, classification_report(binary_y, forest_results_parameters[0]), 'forest']
lasso_results_parameters = [coef_path_lasso_cv.predict(X), coef_path_lasso_cv.get_params, coef_path_lasso_cv.alphas_, coef_path_lasso_cv.coef_]
lasso_scores = [lasso_cv_score,
                r2_score(y,lasso_results_parameters[0]), 'lasso']
X_train,X_test,Y_train,Y_test = train_test_split(alldata,newY,test_size=0.3) #frequencies X_train,X_test,Y_train,Y_test = train_test_split(allfreqdata,newY,test_size=0.3) svr = SVR(cache_size=1500) svr_params = { 'C' : [1e-2,1,1e2] , 'epsilon' : [1e-3,1e-2,1e-1] } #fit without transforms 0.009 #fit with kld 0.017 #test with newy hier. interc. #takes looong enet_cv = ElasticNetCV(l1_ratio=[0.1,0.3,0.5,0.7,0.9],max_iter=2000) enet_cv.fit(X_tr_new,Y_train) rcv = RidgeCV(alphas=[1e-2,1e-1,1,10]) #rcv.fit(X_train,Y_train) svr_gs = GridSearchCV(svr,svr_params,verbose=1,n_jobs=-1) #svr_gs.fit(X_train,Y_train) #%% #visualization of posterior ERPs averaged over Pbs and epochs #for chan Fz posteriors = np.unique(np.round(bc_dict["01"],decimals=2)) avr_ERP_p_post_list = [get_average_ERPs_per_posterior(mat_dict[k],bc_dict[k],chan=4) for k in sorted(mat_dict.keys())]
plt.show() #RidgeCV from sklearn.linear_model import RidgeCV model = RidgeCV(cv=20) model_ridge = model.fit(ratings_ext_input_sim2[X_features], ratings_ext_input_sim2['rating']) rating_predicted = model_ridge.predict(ratings_ext_input_sim2[X_features]) error = (rating_predicted - ratings_ext_input_sim2['rating']) np.mean(error*error) # 4.77 (0.633 good?) score=model_ridge.score(ratings_ext_input_sim2[X_features], ratings_ext_input_sim2['rating']) model_ridge.coef_ # Elastic Net from sklearn.linear_model import ElasticNetCV enet = ElasticNetCV(l1_ratio=0.5,cv = 10) # 1 for LASSO model_enet = enet.fit(ratings_ext_input_sim2[X_features], ratings_ext_input_sim2['rating']) rating_predicted = model_enet.predict(ratings_ext_input_sim2[X_features]) error = (rating_predicted - ratings_ext_input_sim2['rating']) np.mean(error*error) # 4.168 # alpha = 1, l1_ration = 0: very high 4.67 # alpha = 0.1, l1_ration = 0: very high 4.57 # alpha = 0.5, l1_ration = 0: very high 4.64 # alpha = 0.7, l1_ration = 0: very high 4.65 from sklearn.linear_model import lasso_path, enet_path model_enet.mse_path_ plt.figure(1) ax = plt.gca() ax.set_color_cycle(2 * ['b', 'r', 'g', 'c', 'k']) #l1 = plt.plot(-np.log10(alphas_lasso), coefs_lasso.T) l1 = plt.plot(-np.log10(model_enet.alphas_), model_enet.coef_, linestyle='--')
#add intercept X = np.hstack((np.ones(X.shape[0])[:,None],X)) train_X,test_X,train_Y,test_Y = train_test_split(X,y,test_size=0.1) #%% #try elastic net #alpha equals lambda here lambda_grid = [0.01, 0.1 , 1, 10,100] l1_ratio_grid = [0.1,0.3,0.5,0.7,0.9] enet_CV = ElasticNetCV(l1_ratio=l1_ratio_grid,alphas=lambda_grid,cv=3,n_jobs=-1,verbose=True) enet_CV.fit(train_X,train_Y) #%% #show enet_CV.score(test_X,test_Y) plt.plot(enet_CV.predict(test_X),test_Y,'o') #%% #try svr svr = SVR(kernel = 'rbf',C=1,cache_size=2000) SVR_params = { 'C' : [1e-1,1.0,1e2,1e3,1e4] } svr_rs = grid_search.RandomizedSearchCV(svr,SVR_params,verbose=True,n_jobs=-1) svr.fit(train_X[:,whichones[0]],train_Y)
# Drop every column that contains a missing value.
features = features.dropna(axis=1)

# Candidate penalties 0.01, 0.02, ..., 100.00.  Dividing by 100.0 forces float
# division; under Python 2 `a / 100` truncates and the grid collapses to
# integers (including 0).
alpha_values = [a / 100.0 for a in range(1, 10001)]

print("Started at " + str(datetime.now()))

# For each model family: pick alpha with 3-fold CV, then score a plain
# estimator using that alpha with 5-fold CV.
estimator_ridge = RidgeCV(alphas=alpha_values, cv=3)
estimator_ridge.fit(features, goal)
scores = cross_val_score(Ridge(alpha=estimator_ridge.alpha_), features, goal, cv=5)
print("Ridge alpha " + str(estimator_ridge.alpha_))
print(str(np.mean(scores)))
print(scores)

estimator_lasso = LassoCV(alphas=alpha_values, cv=3)
estimator_lasso.fit(features, goal)
scores = cross_val_score(Lasso(alpha=estimator_lasso.alpha_), features, goal, cv=5)
print("Lasso alpha " + str(estimator_lasso.alpha_))
print(str(np.mean(scores)))
print(scores)

estimator_elastic_net = ElasticNetCV(alphas=alpha_values, cv=3, n_jobs=-1)
estimator_elastic_net.fit(features, goal)
scores = cross_val_score(ElasticNet(alpha=estimator_elastic_net.alpha_), features, goal, cv=5)
print("ElasticNet alpha " + str(estimator_elastic_net.alpha_))
print(str(np.mean(scores)))
print(scores)

print("Finished at " + str(datetime.now()))
met.fit(x[train], y[train]) p[test] = met.predict(x[test]) r2_cv = r2_score(y, p) print('Method: {}'.format(name)) print('R2 on training: {}'.format(r2_train)) print('R2 on 5-fold CV: {}'.format(r2_cv)) print() # Construct an ElasticNetCV object (use all available CPUs) met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99]) kf = KFold(len(x), n_folds=5) pred = np.zeros_like(y) for train, test in kf: met.fit(x[train], y[train]) pred[test] = met.predict(x[test]) print('[EN CV l1_ratio] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(y, p)))) print('[EN CV l1_ratio] R2 on testing (5 fold), {:.2}'.format(r2_score(y, p))) print('') ''' # unit version from time import time import numpy as np from step3_vectorize_text import preprocess_4 from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error, r2_score
# Collects [dataset path, correlation] entries for fits that pass the
# correlation threshold below.
l = []
with h5py.File("ECoG_big_data.h5", "r+") as f1:
    with h5py.File("selected.h5", "r+") as f2:
        # Groups "sub1", "sub2", "sub3".
        for i in range(1, 4):
            sid = "sub" + str(i)
            # Load the train/test arrays of this group fully into memory.
            X = f1[sid]["train_data"][:]
            Y = f1[sid]["train_clabel"][:]
            Yb = f1[sid]["train_blabel"][:]
            Xt = f1[sid]["test_data"][:]
            Yt = f1[sid]["test_clabel"][:]
            Ytb = f1[sid]["test_blabel"][:]
            for finger in range(5):
                for method in ["l1", "mcp", "scad"]:
                    # Stored column indices minus one (presumably 1-based in
                    # the file - confirm).
                    idxc = f2[sid]["finger" + str(finger + 1)][method][:] - 1
                    # idxb is not used below.
                    idxb = f2[sid]["finger" + str(finger + 1)]["l1_l"][:] - 1
                    # Regress the finger's column of Y on the selected columns.
                    en = ElasticNetCV()
                    en.fit(X[:, idxc].astype("float64"), Y[:, finger])
                    yp = en.predict(Xt[:, idxc])
                    corr = np.corrcoef(yp, Yt[:, finger])[0, 1]
                    # A correlation below 0.3 abandons the remaining methods
                    # for this finger, not just the current one.
                    if corr < 0.3:
                        break
                    else:
                        l.append([sid + "//" + "finger" + str(finger + 1), corr])
                    # Classifier on the Yb labels; its transformed predictions
                    # scale the regression output.
                    lr = LogisticRegressionCV()
                    lr.fit(X[:, idxc], Yb[:, finger])
                    tp = yp * fun(lr.predict(Xt[:, idxc]))
                    # 40-sample moving average of tp, thresholded at 0.5 into a 0/1 mask.
                    m = np.where(np.convolve(tp, np.ones((40,)) / 40, mode="same") < 0.5, 0, 1)
                    # Second-order low-pass filter applied forwards and
                    # backwards to the masked signal, then passed through relu.
                    b, a = butter(2, 9.0 / 25, "low")
                    yy = relu(filtfilt(b, a, tp * m))
                    print corr, np.corrcoef(Yt[:, finger], yy)[0, 1]