def l1_enet(ratio):
    """Train an ElasticNetCV model at a given l1_ratio.

    Relies on module-level globals: rkf (CV splitter), X_train, X_test,
    y_train, y_test, fit_int_flag and rs (random state).

    :param ratio: l1_ratio passed to ElasticNetCV (0 -> ridge, 1 -> lasso)
    :return: (fitted model, optimal alpha, number of non-zero coefficients,
              test RMSE, train RMSE)
    """
    enet_cv = ElasticNetCV(cv=rkf, l1_ratio=ratio,
                           max_iter=10000000,  # was 1e7 (float): sklearn requires an int
                           tol=0.001,
                           fit_intercept=fit_int_flag, random_state=rs)
    enet_cv.fit(X_train, y_train)
    # the optimal alpha selected by cross-validation
    enet_alpha = enet_cv.alpha_
    enet_coefs = enet_cv.coef_
    # count coefficients that are effectively non-zero (|coef| >= 1e-7)
    n_nonzero = len(np.where(abs(enet_coefs) >= 1e-7)[0])
    # Access the errors
    y_predict_test = enet_cv.predict(X_test)
    y_predict_train = enet_cv.predict(X_train)
    # RMSE on held-out and training data
    enet_RMSE_test = np.sqrt(mean_squared_error(y_test, y_predict_test))
    enet_RMSE_train = np.sqrt(mean_squared_error(y_train, y_predict_train))
    return enet_cv, enet_alpha, n_nonzero, enet_RMSE_test, enet_RMSE_train
def _elasticnetcv(*, train, test, x_predict=None, metrics, l1_ratio=0.5, eps=0.001,
                  n_alphas=100, alphas=None, fit_intercept=True, normalize=False,
                  precompute='auto', max_iter=1000, tol=0.0001, cv=None, copy_X=True,
                  verbose=0, n_jobs=None, positive=False, random_state=None,
                  selection='cyclic'):
    """Fit an ElasticNetCV model and score it on a held-out set.

    For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV

    :param train: (X, y) training pair
    :param test: (X, y) evaluation pair
    :param x_predict: optional matrix to predict on after fitting
    :param metrics: one of 'mse', 'rmse' or 'mae'
    :return: (model name, accuracy, predictions or None)
    :raises ValueError: if ``metrics`` is not a supported key
    """
    model = ElasticNetCV(l1_ratio=l1_ratio, eps=eps, n_alphas=n_alphas, alphas=alphas,
                         fit_intercept=fit_intercept, normalize=normalize,
                         precompute=precompute, max_iter=max_iter, tol=tol, cv=cv,
                         copy_X=copy_X, verbose=verbose, n_jobs=n_jobs,
                         positive=positive, random_state=random_state,
                         selection=selection)
    model.fit(train[0], train[1])
    model_name = 'ElasticNetCV'
    y_hat = model.predict(test[0])
    # Dispatch on the requested metric.  BUG FIX: an unknown key previously
    # left `accuracy` unbound and raised a confusing NameError — fail fast.
    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    elif metrics == 'mae':
        accuracy = _mae(test[1], y_hat)
    else:
        raise ValueError("metrics must be 'mse', 'rmse' or 'mae', got %r" % (metrics,))
    if x_predict is None:
        return (model_name, accuracy, None)
    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
def Regularized_Smap(abund, target_otu, theta, l_grid, iteration, cv, train_len):
    """Fit locally-weighted (S-map) elastic-net regressions for one OTU.

    Builds a block of (response at t+1, predictors at t), normalizes each
    series, then fits one distance-weighted ElasticNetCV per time point and
    writes coefficients / fit statistics to CSV.

    NOTE(review): assumes ``abund`` is matrix-like so column slices stay 2-D,
    and relies on project helpers make_weights / weight_data.
    """
    print('Process data for otu No. %s' % str(target_otu+1))
    # Make input for the elastic_net: target column shifted one step ahead,
    # joined with all predictors at the previous step.
    block = np.append(abund[1:, target_otu], abund[0:-1, ], axis=1)
    # Delete the uncontinuous states.  BUG FIX: the original used true
    # division, producing float indices which np.delete rejects in Python 3.
    block = np.delete(block, [abund.shape[0] // 3 - 1, abund.shape[0] // 3 * 2 - 1], axis=0)
    # Scaling: each time series is normalized to mean 0 / std 1 before S-maps
    block = (block - np.average(block, axis=0)) / np.std(block, axis=0)
    # Select data and fitting
    print('Start fitting.')
    lib = range(block.shape[0])
    coefs = np.empty(shape=(block.shape[0], block.shape[1] - 1))
    fit_results = np.empty(shape=(block.shape[0], 13))
    for ipred in lib:
        print('\r', 'Complete percentage: %.2f%%' % (ipred / len(lib) * 100), end="", flush=True)
        sub_block = np.delete(block, ipred, axis=0)
        q = block[lib[ipred], :]
        # Calculate weights from Euclidean distance to the target state
        E_dist = np.sqrt(np.sum(np.array(sub_block[:, 1:] - q[:, 1:]) ** 2, axis=1))
        w = make_weights(E_dist, theta)
        # Weighted predictors and responses
        X_wp = weight_data(sub_block[:, 1:], w)
        Y_wp = np.ravel(weight_data(sub_block[:, 0], w))
        X_target = block[ipred, 1:]
        Y_target = block[ipred, 0]
        # Split training and test data (train_len rows are drawn for the test set)
        pick_test = np.random.choice(range(X_wp.shape[0]), size=train_len, replace=False)
        X_train = np.append(np.delete(X_wp, pick_test, axis=0), X_target, axis=0)
        X_test = X_wp[pick_test, :]
        Y_train = np.append(np.delete(Y_wp, pick_test, axis=0), Y_target)
        Y_test = Y_wp[pick_test]
        # Fit function: CV over a uniform grid of l1 ratios
        regr = ElasticNetCV(cv=cv, random_state=0, max_iter=iteration,
                            l1_ratio=[(i + 1) * l_grid for i in range(int(1 / l_grid))])
        regr.fit(X_train, Y_train)
        rmse = np.sqrt(np.mean((regr.predict(X_train) - Y_train) ** 2))
        rmse_o = np.sqrt(np.mean((regr.predict(X_test) - Y_test) ** 2))
        coefs[ipred, :] = regr.coef_
        fit_results[ipred, :] = (regr.intercept_, regr.alpha_, regr.l1_ratio_, rmse,
                                 np.std(Y_train), rmse_o, np.std(Y_test),
                                 regr.score(X_test, Y_test), regr.score(X_train, Y_train),
                                 max(Y_train), min(Y_train), max(Y_test), min(Y_test))
        print('\r', 'Complete percentage: %.2f%%' % ((ipred + 1) / len(lib) * 100), end="", flush=True)
    # Output results
    coefs = pd.DataFrame(data=coefs)
    coefs.to_csv('../Output/test/0/coefs/%s_%s_coefs.csv' % (target_otu, theta))
    fit_results = pd.DataFrame(
        columns=['Intercept', 'Best alpha', 'Best l1_ratio', 'RMSE', 'Std', 'RMSE_o',
                 'Std_o', 'Test set score', 'Test set score_train', 'ymax_train',
                 'ymin_train', 'ymax_test', 'ymin_test'],
        data=fit_results)
    fit_results.to_csv('../Output/test/0/fit_result/%s_%s_fit_results.csv' % (target_otu, theta))
def train_elasticnet_model(self, mode, ffm):
    """Select an l1_ratio for an elastic net by validation MSE, refit and report.

    :param mode: feature-mode key (used only to build the storage key)
    :param ffm: target key into the y_train / y_val dictionaries
    :return: the refitted ElasticNetCV model (also stored in self.elasticnet)
    """
    # X_train = np.array(self.X_train[mode])
    X_train = np.array(self.X_train2)
    y_train = np.array(self.y_train[ffm])
    # X_val = np.array(self.X_val[mode])
    X_val = np.array(self.X_val2)
    y_val = np.array(self.y_val[ffm])
    l1ratios = np.linspace(0.1, 1, 10)
    mses = []
    alps = []
    verr = []
    # grid-search l1_ratio by validation MSE
    for l1 in l1ratios:
        print(l1)
        enet = ElasticNetCV(l1_ratio=l1, cv=10)
        enet.fit(X_train, y_train)
        y_pred = enet.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        v = enet.score(X_val, y_val)
        mses.append(mse)
        alps.append(enet.alpha_)
        verr.append(v)
    i_opt = np.argmin(mses)
    l1_opt = l1ratios[i_opt]
    alpha_opt = alps[i_opt]
    print("optimal l1", l1_opt)
    print("optimal alpha", alpha_opt)
    enet2 = ElasticNetCV(l1_ratio=l1_opt)
    enet2.fit(X_train, y_train)
    y_pred = enet2.predict(X_val)
    y_pred_train = enet2.predict(X_train)
    print("Training MSE", mean_squared_error(y_train, y_pred_train))
    print("Validation MSE", mean_squared_error(y_val, y_pred))
    print("Training Pearson R", pearsonr(y_train, y_pred_train))
    print("Validation Pearson R", pearsonr(y_val, y_pred))
    # BUG FIX: the R2 scores below previously used `enet` (the last model from
    # the search loop) instead of the refitted `enet2`.
    print("Training R2 score:", enet2.score(X_train, y_train))
    print("Validation R2 score:", enet2.score(X_val, y_val))
    # print(enet2.alpha_)
    key = tuple(mode + [ffm])
    self.elasticnet[key] = enet2
    return self.elasticnet[key]
def run_repeated(self, feature_prefix, n_trials=10, kfold_num=5):
    """Repeated K-fold evaluation of ElasticNetCV for each target column.

    :param feature_prefix: prefix string or list of prefixes for self.build
    :param n_trials: number of K-fold repeats
    :param kfold_num: number of splits per repeat
    :return: DataFrame with one row per (fold, target): train/test R^2 and
             the selected alpha / l1_ratio
    """
    if type(feature_prefix) == str:
        feature_prefix = [feature_prefix]
    X, Y = self.build(feature_prefix)
    X = X.values
    splitter = RepeatedKFold(n_splits=kfold_num, n_repeats=n_trials,
                             random_state=self.seed)
    records = []
    label = "+".join(feature_prefix)
    progress = tqdm(splitter.split(X), total=n_trials * kfold_num,
                    ncols=50, desc=label)
    for idx_train, idx_test in progress:
        for target in Y.columns:
            targets = Y[target].values
            X_tr, X_te = X[idx_train], X[idx_test]
            y_tr, y_te = targets[idx_train], targets[idx_test]
            estimator = ElasticNetCV(
                random_state=self.seed, n_alphas=50, cv=10, n_jobs=4,
                l1_ratio=[.01, .1, 0.3, .5, 0.7, 0.9, 0.99],
                selection='random', tol=5e-3, verbose=0)
            estimator.fit(X_tr, y_tr)
            r2_tr = r2_score(y_tr, estimator.predict(X_tr))
            r2_te = r2_score(y_te, estimator.predict(X_te))
            # live progress label shows the latest test R^2
            progress.set_description("{} {}: {:.2f}".format(label, target, r2_te))
            progress.refresh()
            sleep(0.01)
            records.append({
                "foundation": target,
                "test_r2": r2_te,
                "train_r2": r2_tr,
                "alpha": estimator.alpha_,
                "l1_ratio": estimator.l1_ratio_
            })
    return pd.DataFrame(records)
def elastic_net_reg(X_train_scaled, X_test_scaled, y_train, y_test):
    """Fit a cross-validated elastic net; return [accuracy, MSE, R2] on the test set.

    Accuracy is computed on the rounded regression output (the target is an
    integer-valued score).
    """
    from sklearn.linear_model import ElasticNetCV
    # number of alphas tried along the regularization path, per l1_ratio
    n_alphas = 300
    # mixing weights between the l1 and l2 penalties (0 < ratio <= 1)
    l1_ratio = [.1, .3, .5, .7, .9]
    # 10-fold cross-validation
    estimator = ElasticNetCV(n_alphas=n_alphas, l1_ratio=l1_ratio,
                             cv=10, random_state=0)
    estimator.fit(X_train_scaled, y_train)
    y_pred_train = estimator.predict(X_train_scaled)
    y_pred_test = estimator.predict(X_test_scaled)
    metrics_en = [accuracy_score(y_test, np.round(y_pred_test)),
                  mean_squared_error(y_test, y_pred_test),
                  r2_score(y_test, y_pred_test)]
    return metrics_en
def elasticnet():
    """Fit a default ElasticNetCV on the PCA features, report held-out MSE and
    write test-set predictions to data/elasticnet.txt.

    Relies on module-level globals: train_pca_value, train_pro, test_pca_data.
    """
    # renamed the local: the original shadowed the function name `elasticnet`
    model = ElasticNetCV()
    X_train, X_test, Y_train, Y_test = train_test_split(
        train_pca_value, train_pro, test_size=0.1, random_state=9)
    model.fit(X_train, Y_train)
    pre = model.predict(X_test)
    loss = mean_squared_error(pre, Y_test)
    print(loss)
    pre = model.predict(test_pca_data)
    # BUG FIX: use a context manager so the file is closed even if a write
    # fails (the original leaked the handle on error).
    with open('data/elasticnet.txt', 'w') as out:
        for i in range(len(pre)):
            out.write("%f\r" % pre[i])
def learn_for(reviews, i):
    """Leave-one-user-out elastic-net rating prediction (sparse input).

    :param reviews: sparse user x movie ratings matrix
    :param i: index of the user to predict
    :return: dense prediction vector; movies the user never rated stay 0
             (keeps the shape comparable with the other models)
    """
    reg = ElasticNetCV(fit_intercept=True,
                       alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    u = reviews[i]
    us = range(reviews.shape[0])
    us = np.delete(us, i)
    ps, = np.where(u.toarray().ravel() > 0)
    x = reviews[us][:, ps].T
    y = u.data
    kf = KFold(n_splits=4)
    predictions = np.zeros(len(u.toarray().ravel()))
    for train, test in kf.split(y):
        xc = x[train].copy().toarray()
        # per-row mean of the rated (positive) entries
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)
        # BUG FIX: the inner loops previously reused `i`, shadowing the
        # user-index parameter — renamed to `row`.
        for row in range(xc.shape[0]):
            xc[row] -= (xc[row] > 0) * x1[row]
        reg.fit(xc, y[train] - x1)
        xc = x[test].copy().toarray()
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)
        for row in range(xc.shape[0]):
            xc[row] -= (xc[row] > 0) * x1[row]
        p = np.array(reg.predict(xc)).ravel()
        predictions[test] = p
    return predictions
def main(file_name):
    """Train ElasticNetCV on pre-split CSV datasets, report RMSE and pickle the model.

    Ported from Python 2: `file(...)` and the `print` statement no longer
    exist; pickle bytes must be written in binary mode.
    """
    df_X_train = pd.read_csv("../datasets/{}_X_train.csv".format(file_name), index_col=0)
    df_X_test = pd.read_csv("../datasets/{}_X_test.csv".format(file_name), index_col=0)
    df_y_train = pd.read_csv("../datasets/{}_y_train.csv".format(file_name), index_col=0)
    df_y_test = pd.read_csv(
        "../datasets/{}_y_test.csv".format(file_name),
        index_col=0,
    )
    with open(os.path.join("../datasets/", "bg_genes.txt")) as f:
        bg_genes = f.read().split("\n")
    with open(os.path.join("../datasets/", "affecting_genes.txt")) as f:
        affecting_genes = f.read().split("\n")
    if affecting_genes[0] == '':
        affecting_genes = []
    # restrict both splits to the gene features plus the treatment column 'T'
    fs = bg_genes + affecting_genes + ['T']
    df_X_train = df_X_train.loc[:, fs]
    df_X_test = df_X_test.loc[:, fs]
    model = ElasticNetCV().fit(df_X_train, df_y_train)
    y_hat = model.predict(df_X_test)
    loss = np.sqrt(mean_squared_error(y_hat, df_y_test))
    # BUG FIX: 'w+' text mode corrupts pickle bytes — write binary via a
    # context manager instead of py2 file().write(pickle.dumps(...)).
    with open("../output/elastic_net_{}".format(file_name), 'wb') as f:
        pickle.dump(model, f)
    print("loss_elastic_net: {}".format(loss))
def regression_NumMosquitos(Xtr, ytr, Xte):
    """Predict mosquito counts with a default cross-validated elastic net."""
    from sklearn.linear_model import ElasticNetCV
    # Alternative grid previously considered:
    # ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], max_iter=10000, cv=4)
    estimator = ElasticNetCV()
    estimator.fit(Xtr, ytr)
    return estimator.predict(Xte)
def predict(X_train, X_test, y_train, y_test, features, pic_name, dir):
    """
    The function predicts the tags of X_test by the elastic net model
    :param X_train: training features
    :param X_test: test features
    :param y_train: training targets
    :param y_test: test targets
    :param features: feature names matching the model coefficients
    :param pic_name: output picture name
    :param dir: output directory
    :return: None (prints metrics and writes plots via Plot_output)
    """
    model = ElasticNetCV(cv=4)
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    print("mean absolute error: ", mean_absolute_error(y_test, predict))
    print("r2 error: ", sklearn.metrics.r2_score(y_test, predict))
    print("alpha: ", model.alpha_)
    print("alphas: ", model.alphas_)
    print("iter: ", model.n_iter_)
    # (removed unused locals x = len(features), y = len(model.coef_))
    coefficients = [(d, c) for d, c in zip(features, model.coef_)]
    coefficients_str = ""
    for name, coef in coefficients:
        coefficients_str += name + ": " + str("%.4f" % coef) + "\n"
    # BUG FIX: [:-2] chopped the final digit of the last coefficient along
    # with the newline; strip only the trailing newline.
    coefficients_str = coefficients_str.rstrip("\n")
    print("coef: ", coefficients_str)
    Plot_output.plot_coefficients(coefficients_str, pic_name=pic_name, dir=dir)
    Plot_output.plot_graph(X_test, y_test, predict, pic_name, dir)
def GLM(X_train, X_test, y_train):
    """Fit a 5-fold cross-validated elastic net and predict the test set."""
    estimator = ElasticNetCV(random_state=0, tol=0.01, cv=5, max_iter=20000)
    estimator.fit(X_train, y_train)
    return estimator.predict(X_test)
def learn_for(reviews, i):
    """Leave-one-user-out elastic-net prediction on a dense ratings matrix.

    Modernized: `KFold(len(ps), n_folds=4)` is the pre-0.18 scikit-learn API;
    current releases take n_splits and expose .split() — this now matches the
    sparse variant of learn_for elsewhere in this file.

    :param reviews: dense user x movie ratings array
    :param i: index of the user to predict
    :return: length-nmovies vector; unrated movies stay 0
    """
    reg = ElasticNetCV(fit_intercept=True, alphas=[
                       0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    nusers, nmovies = reviews.shape
    u = reviews[i]
    us = np.arange(reviews.shape[0])
    us = np.delete(us, i)
    ps, = np.where(u.ravel() > 0)
    x = reviews[us][:, ps].T
    kf = KFold(n_splits=4)
    predictions = np.zeros(len(ps))
    for train, test in kf.split(x):
        xc = x[train].copy()
        # per-row mean of the rated (positive) entries
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)
        # renamed loop index: the original shadowed the parameter `i`
        for row in range(xc.shape[0]):
            xc[row] -= (xc[row] > 0) * x1[row]
        reg.fit(xc, u[train] - x1)
        xc = x[test].copy()
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)
        for row in range(xc.shape[0]):
            xc[row] -= (xc[row] > 0) * x1[row]
        p = reg.predict(xc).ravel()
        predictions[test] = p
    fill_preds = np.zeros(nmovies)
    fill_preds[ps] = predictions
    return fill_preds
def Model(Encoding, Scores, Run_name, step_size, loop_dict, var_dict, round_data, ElasticNet_dict, l1_ratios, All_data):
    """Run 5-fold CV: amplitude-tune the score dictionary, then fit an elastic
    net per fold; return the mean Pearson correlation across folds.

    NOTE(review): relies on project helpers CV_split, exclude, Get_Error and
    Amplitude_Tuning, and mutates loop_dict / round_data / ElasticNet_dict /
    All_data in place — confirm callers expect the shared-dict side effects.
    """
    Pearson_correlations = []
    Data = Encoding.copy()  #copy, so it does not change#
    Data_sets = CV_split(Data, 5)  # The Big 5#
    for cv_round in range(len(Data_sets)):
        score_dict = Scores.copy()  #Randomized scores at the start each time#
        Test_set = Data_sets[cv_round]
        Train_set = exclude(Data_sets, cv_round)  #Keeps everything but the train set#
        Train_set = pd.concat(Train_set)  #All train sets into on dataframe#
        X = Train_set.iloc[:, :Train_set.shape[1] - 1]  #features#
        X['Intercept'] = 1  #add intercept#
        y = pd.DataFrame(Train_set['pMeas'])  #targets#
        AM_EndOfLoopError = []
        AM_EndOfLoopError.append(Get_Error(
            X, y, score_dict))  # The Error Before AM Tuning #
        """AM Tuning Looping Starts Here and Adds a value to End of Loop Error"""
        Loop_num = 1  #
        AM_EndOfLoopError.append(
            Amplitude_Tuning(X, y, step_size, score_dict, Loop_num, Run_name,
                             cv_round, loop_dict, var_dict))
        round_data[cv_round] = loop_dict
        # keep tuning while the relative error improvement exceeds 0.1%
        while ((AM_EndOfLoopError[-1] - AM_EndOfLoopError[-2]) /
               (AM_EndOfLoopError[-2])) < -0.001:
            Loop_num += 1
            AM_EndOfLoopError.append(
                Amplitude_Tuning(X, y, step_size, score_dict, Loop_num, Run_name,
                                 cv_round, loop_dict, var_dict))
            round_data[cv_round] = loop_dict
        loop_dict['AM Time Series Data'] = AM_EndOfLoopError
        loop_dict['Final Scores'] = score_dict
        """ AM Tuning is now Finished for the CV_split, Elastic Net is Next """
        EN = ElasticNetCV(l1_ratio=l1_ratios, cv=5, copy_X=True, normalize=True,
                          random_state=23)
        # replace symbolic entries with the tuned numeric scores
        X_train = X.copy()
        X_train.replace(score_dict, inplace=True)
        y_train = y.copy()
        X_test = Test_set.iloc[:, :Test_set.shape[1] - 1]
        X_test.replace(score_dict, inplace=True)
        X_test['Intercept'] = 1
        y_test = pd.DataFrame(Test_set['pMeas'])
        EN.fit(X_train, y_train)
        y_pred = pd.DataFrame(EN.predict(X_test))
        # fold score: Pearson correlation between predicted and measured
        Pearson_correlations.append(np.corrcoef(y_test.T, y_pred.T)[0][1])
        """Save Everything """
        ElasticNet_dict["y_pred"] = y_pred
        ElasticNet_dict['y_test'] = y_test
        ElasticNet_dict['Alpha'] = EN.alpha_
        ElasticNet_dict['l1_ratio'] = EN.l1_ratio_
        ElasticNet_dict['Parameters'] = EN.get_params()
        ElasticNet_dict["AlphaSpace"] = EN.alphas_
        loop_dict['ElasticNet'] = ElasticNet_dict
        round_data[cv_round] = loop_dict
    All_data[Run_name] = round_data
    np.save("All Data.npy", All_data)
    return np.mean(Pearson_correlations)
def elastic_net(name, cv=5):
    '''Outputs a fitted Elastic Net Regression Model with tuning parameters
    found through cross validation. Inputs must be standardized. l1_ratios
    are spread out on a log scale as recommended by package authors. Number
    of folds in cross validation is by default 5. n_jobs = -1 allows for all
    local processors to be utilized.
    '''
    # if np.any(X_train.mean(axis = 0) > 1):
    #     raise ValueError('Numerical features must be standardized')
    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    l1_ratios = np.geomspace(1e-8, 1, 50)
    # BUG FIX: the `cv` parameter was silently ignored — a hard-coded 5 was
    # passed to ElasticNetCV instead of the argument.
    model = ElasticNetCV(l1_ratio=l1_ratios, n_alphas=50, cv=cv, verbose=0,
                         n_jobs=-1, random_state=18).fit(X_train, y_train)
    performance = metrics.apply_metrics('{} Elastic Net'.format(display_name),
                                        y_test, model.predict(X_test), y_train)
    performance['Tuning Parameters'] = [{
        'Alpha': model.alpha_,
        'L1 Ratio': model.l1_ratio_
    }]
    return model, performance
def calculateAccuracyWithModel(indbest, X_train, y_train, X_test, y_test):
    """Fit an elastic net on the evaluated term matrix, print its R^2 on the
    test set, and print a human-readable symbolic model string.

    NOTE(review): depends on project helpers evaluatedMatrix, sortCoef and
    coefStr — their exact behavior is assumed, not visible here.
    """
    indbest = list(indbest)
    evalTrain = evaluatedMatrix(indbest, X_train)
    evalTest = evaluatedMatrix(indbest, X_test)
    # Linear regression with elastic net
    regr = ElasticNetCV(random_state=0)
    regr.fit(evalTrain, y_train)
    y_pred = regr.predict(evalTest)
    print(r2_score(y_test, y_pred))
    # sort the terms and their coefficients together for a stable printout
    indbest, regr.coef_ = sortCoef(indbest, regr.coef_)
    model = ""
    i = 0
    if regr.intercept_ not in [0, -0]:
        model = str(coefStr(regr.intercept_))
    for ind in indbest:
        # skip terms the elastic net zeroed out
        if regr.coef_[i] not in [0, -0]:
            # negative coefficients carry their own '-' sign; positive terms
            # need an explicit '+' when appended to a non-empty model string
            if "-" in str(regr.coef_[i]):
                indCoef = str(coefStr(regr.coef_[i])) + "*" + str(ind)
            elif len(model) > 0:
                indCoef = "+" + str(coefStr(regr.coef_[i])) + "*" + ind
            else:
                indCoef = str(coefStr(regr.coef_[i])) + "*" + ind
            model = model + indCoef
        i = i + 1
    print(model)
def algor_ElasticNetCV():
    """Flask view: fit ElasticNetCV on the uploaded CSV with form-supplied
    hyper-parameters and render the training-set R^2."""
    request_content = request.form.to_dict()
    df = pd.read_csv(session.get('file'))
    X_train, Y_train = onehot(df)
    params = request_content
    if params['alpha'] != 'None':
        params['alpha'] = [float(params['alpha'])]
    else:
        params['alpha'] = None
    # BUG FIX: form values are strings and bool('False') is True, so the
    # boolean flags were always True.  Compare the text explicitly
    # (assumes the form posts 'True'/'False' — confirm against the template).
    fit_intercept = params['fit_intercept'] == 'True'
    normalize = params['normalize'] == 'True'
    model = ElasticNetCV(alphas=params['alpha'],
                         l1_ratio=float(params['l1_rotio']),
                         fit_intercept=fit_intercept,
                         normalize=normalize,
                         max_iter=int(params['max_iter']),
                         tol=float(params['tol']))
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_train)
    context = {
        'algor': '弹性网回归',
        'roc_AUC': 'None(仅用于分类器)',
        'ACC': 'None(仅用于分类器)',
        'Recall': 'None(仅用于分类器)',
        'F1_score': 'None(仅用于分类器)',
        'Precesion': 'None(仅用于分类器)',
        'R_2': round(metrics.r2_score(Y_train, y_pred), 2)
    }
    return render_template('ElasticNetCV.html', **context)
def elasticnet_cv(self, nsplits: int, lam: float = None, l1_ratio: float = None):
    """
    runs a cross validation on the data set and returns the cross validation
    performance
    :param nsplits: number of cv splits
    :param lam: tuning parameter; chosen by ElasticNetCV when None
    :param l1_ratio: balance l1 and l2 penalization, 0 means ridge, 1 means
        lasso; chosen by ElasticNetCV when None
    :return: the cross-validated mse
    """
    if lam is None or l1_ratio is None:
        # let ElasticNetCV pick whichever hyper-parameter was not supplied
        selector = ElasticNetCV(
            cv=nsplits,
            l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.95, 0.99, 1]).fit(self.x, self.y)
        lam = selector.alpha_ if lam is None else lam
        l1_ratio = selector.l1_ratio_ if l1_ratio is None else l1_ratio
    fold_errors = []
    for train_idx, test_idx in KFold(n_splits=nsplits).split(self.x):
        fitted = ElasticNet(alpha=lam, l1_ratio=l1_ratio).fit(
            self.x[train_idx, :], self.y[train_idx])
        y_hat = fitted.predict(self.x[test_idx, :])
        fold_errors.append(mse(self.y[test_idx], y_hat))
    return np.mean(fold_errors)
def learn_for(self, i):
    """Leave-one-user-out elastic-net prediction for user ``i``.

    Modernized: `KFold(len(ps), n_folds=4)` is the pre-0.18 scikit-learn API;
    current releases take n_splits and expose .split().

    :param i: index of the user to predict
    :return: length-nmovies vector; unrated movies stay 0
    """
    reviews = AbstractEstimateBase.reviews
    reg = ElasticNetCV(fit_intercept=True, alphas=[
        0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    nusers, nmovies = reviews.shape
    u = reviews[i]
    us = np.arange(reviews.shape[0])
    us = np.delete(us, i)
    ps, = np.where(u.ravel() > 0)
    x = reviews[us][:, ps].T
    kf = KFold(n_splits=4)
    predictions = np.zeros(len(ps))
    for train, test in kf.split(x):
        xc = x[train].copy()
        # per-row mean of the rated (positive) entries
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)
        # renamed loop index: the original shadowed the parameter `i`
        for row in range(xc.shape[0]):
            xc[row] -= (xc[row] > 0) * x1[row]
        reg.fit(xc, u[train] - x1)
        xc = x[test].copy()
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)
        for row in range(xc.shape[0]):
            xc[row] -= (xc[row] > 0) * x1[row]
        p = reg.predict(xc).ravel()
        predictions[test] = p
    fill_preds = np.zeros(nmovies)
    fill_preds[ps] = predictions
    return fill_preds
def predict(self, X):
    """Fill each user's unrated entries with a per-user elastic net trained on
    all other users' ratings; optionally round-trips through self.norm."""
    binary = X > 0
    if self.normalize == True:
        X = self.norm.fit_transform(X)
    num_users, num_movies = X.shape
    clf = ElasticNetCV(alphas=[0.1])
    predicted = X.copy()
    for user in range(num_users):
        # mask of movies this user has rated
        rated = binary[user]
        # regress against every user except the current one
        others = np.ones((num_users), dtype=bool)
        others[user] = False
        train_feats = X[others][:, rated].T
        train_targets = X[user, rated]
        clf.fit(train_feats, train_targets)
        # predict the complement: the movies this user has NOT rated
        test_feats = X[others][:, ~rated].T
        predicted[user, ~rated] = clf.predict(test_feats)
    if self.normalize == True:
        predicted = self.norm.inverse_transform(predicted)
    return predicted
def Elastic_net_fitting(block, target_otu, interest_otu, theta, train_len, cv, iteration, l_grid, output_dir):
    """Locally-weighted (S-map) elastic-net fits, one per row of ``block``.

    Column 0 of ``block`` is the response; the remaining columns are the
    predictors.  Writes per-row coefficients and fit statistics to CSV under
    ``output_dir``.

    NOTE(review): depends on project helpers make_weights and weight_data;
    np.random is used unseeded, so the train/test split is not reproducible.
    """
    # Select data and fitting
    print('Start fitting.')
    lib = range(block.shape[0])
    coefs = np.empty(shape=(block.shape[0], block.shape[1] - 1))
    fit_results = np.empty(shape=(block.shape[0], 13))
    for ipred in lib:
        print('\r', 'Complete percentage: %.2f%%' % (ipred / len(lib) * 100), end="", flush=True)
        # leave the target row out of the library
        sub_block = np.delete(block, ipred, axis=0)
        q = block[lib[ipred], :]
        # Calculate weights from the Euclidean distance to the target state
        E_dist = np.sqrt(np.sum(np.array(sub_block[:, 1:] - q[:, 1:]) ** 2, axis=1))
        w = make_weights(E_dist, theta)
        # Weighted predictors and responses
        X_wp = weight_data(sub_block[:, 1:], w)
        Y_wp = np.ravel(weight_data(sub_block[:, 0], w))
        X_target = block[ipred, 1:]
        Y_target = block[ipred, 0]
        # Split training and test data (note: `train_len` rows are drawn for
        # the TEST set — presumably intentional; confirm with the caller)
        pick_test = np.random.choice(range(X_wp.shape[0]), size=train_len, replace=False)
        X_train = np.append(np.delete(X_wp, pick_test, axis=0), X_target, axis=0)
        X_test = X_wp[pick_test, :]
        Y_train = np.append(np.delete(Y_wp, pick_test, axis=0), Y_target)
        Y_test = Y_wp[pick_test]
        # Fit function: CV over a uniform grid of l1 ratios
        regr = ElasticNetCV(cv=cv, random_state=0, max_iter=iteration,
                            l1_ratio=[(i + 1) * l_grid for i in range(int(1 / l_grid))])
        regr.fit(X_train, Y_train)
        rmse = np.sqrt(np.mean((regr.predict(X_train) - Y_train) ** 2))
        rmse_o = np.sqrt(np.mean((regr.predict(X_test) - Y_test) ** 2))
        coefs[ipred, :] = regr.coef_
        fit_results[ipred, :] = regr.intercept_, regr.alpha_, regr.l1_ratio_, rmse, np.std(Y_train), rmse_o, np.std(
            Y_test), regr.score(X_test, Y_test), regr.score(X_train, Y_train), max(Y_train), min(Y_train), max(
            Y_test), min(Y_test)
        print('\r', 'Complete percentage: %.2f%%' % ((ipred + 1) / len(lib) * 100), end="", flush=True)
    # Output results
    coefs = pd.DataFrame(data=coefs)
    coefs.to_csv('/'.join([output_dir, 'coefs/%s_%s_%s_fit_results.csv' % (interest_otu, target_otu, theta)]))
    fit_results = pd.DataFrame(
        columns=['Intercept', 'Best alpha', 'Best l1_ratio', 'RMSE', 'Std',
                 'RMSE_o', 'Std_o', 'Test set score', 'Test set score_train',
                 'ymax_train', 'ymin_train', 'ymax_test', 'ymin_test'],
        data=fit_results)
    fit_results.to_csv('/'.join([output_dir, 'fit_result/%s_%s_%s_fit_results.csv' % (interest_otu, target_otu, theta)]))
class LinearModel:
    """Thin wrapper: fit ElasticNetCV and emit two-column pseudo-probabilities."""

    def fit(self, X, y):
        """Fit a 5-fold cross-validated elastic net on (X, y)."""
        self.clf = ElasticNetCV(cv=5, random_state=0).fit(X, y)

    def predict(self, X):
        """Return (None, None, prob_matrix): prob_matrix[:, 0] is the raw
        regression output, prob_matrix[:, 1] its complement (1 - output).

        BUG FIX: the original returned the undefined name ``_`` (NameError at
        runtime); the first two slots are now explicit None placeholders,
        preserving the 3-tuple shape callers unpack.
        """
        y_pred_prob = self.clf.predict(X)
        y_pred_prob_vec = np.array([[p, 1 - p] for p in y_pred_prob])
        return None, None, y_pred_prob_vec
def enetCV():
    """Fit ElasticNetCV with shuffle-split CV on module-level globals
    (base_X, base_Y, X_test) and write predictions to elasticCV.csv.

    NOTE(review): uses the pre-0.18 sklearn ``cross_validation`` module; on
    modern scikit-learn this is ``sklearn.model_selection.ShuffleSplit`` —
    confirm the pinned sklearn version before porting.
    """
    print ("Doing elastic net")
    cross_val = cross_validation.ShuffleSplit(len(base_X), n_iter=5, test_size=0.2, random_state=0)
    clf4 = ElasticNetCV(cv=cross_val)
    clf4.fit(base_X, base_Y)
    # in-sample R^2 — no held-out evaluation happens here
    print ("Score = %f" % clf4.score(base_X, base_Y))
    clf4_pred = clf4.predict(X_test)
    write_to_file("elasticCV.csv", clf4_pred)
def train_test_en(input_data, output_data, train_key, test_key, n_cv=3): """ elastic net回帰による学習/予測 """ # set parameter #alphas = 10 ** np.arange(-2, 1, 0.1) # 例外処理 : 学習データ点数が分割数より少ない場合 if len(train_key) < n_cv: n_cv = len(train_key) #------------- # 学習 #------------- x = input_data[train_key,:] y = output_data[train_key] # インスタンス x_scaler = StandardScaler() #正規化 y_scaler = StandardScaler() #正規化 clf = ElasticNetCV(l1_ratio=[.05, .15, .5, .7, .9, .95, .99, 1], n_jobs=8, n_alphas=20, cv=n_cv) # モデル構築 x_scaler.fit(x) #正規化 y_scaler.fit(y.reshape(-1,1)) #正規化 y_ = y_scaler.transform(y.reshape(-1,1)) y_ = y_.reshape(-1) #import pdb; pdb.set_trace() # モデル構築 with warnings.catch_warnings(): #警告無視 warnings.simplefilter("ignore") clf.fit(x_scaler.transform(x), y_) # モデルパラメータ取得 #alpha = clf.alpha_ #ハイパーパラメータ a = clf.coef_ #係数 b = clf.intercept_ #切片 p = np.append(a, b) #------------- # 予測 #------------- x = input_data[test_key,:] # 例外処理 : xのデータ点数 = 1の場合 ⇒配列を整形 if x.ndim == 1: x = x.reshape(1,-1) # 予測 tmp = clf.predict(x_scaler.transform(x)) y_pred = y_scaler.inverse_transform(tmp) #非正規化 return y_pred, p
def train(self, y, folds, l1_ratio=0.8, normalize=True):
    """Train one model per fold ('en' -> ElasticNetCV, 'lm' -> LinearRegression),
    pickle each fitted fold model, and write out-of-fold predictions to CSV.

    :param y: name of the target column in each fold's DataFrames
    :param folds: list of {'train': df, 'valid': df} dicts
    :param l1_ratio: elastic-net mixing parameter (used when self.model == 'en')
    :param normalize: passed to the estimator (NOTE(review): removed in
        sklearn >= 1.2 — confirm the pinned version still supports it)
    :return: mean validation RMSE across folds, rounded to 4 decimals
    """
    K = len(folds)
    yhats = []
    rmses = []
    for k in range(K):
        start = time.time()
        train = folds[k]['train']
        valid = folds[k]['valid']
        X_train = train.drop(columns=[y]).values
        y_train = train[y].values
        X_valid = valid.drop(columns=[y]).values
        y_valid = valid[y].values
        if self.model == 'en':
            # fold-specific seed keeps the inner CV splits reproducible
            kf = KFold(n_splits=10, random_state=100 + k, shuffle=True)
            lm = ElasticNetCV(cv=kf, random_state=k, normalize=normalize,
                              max_iter=5000, l1_ratio=l1_ratio)
        elif self.model == 'lm':
            lm = LinearRegression(normalize=normalize)
        lm.fit(X_train, y_train)
        y_pred = lm.predict(X_valid)
        # collect out-of-fold predictions keyed by the original index
        yhats.append(
            pd.DataFrame({
                'y': valid[y],
                'yhat': y_pred
            }, index=valid.index))
        # persist the fitted fold model
        pickle.dump(
            lm,
            open(
                os.path.join(self.model_path, '{}_{}.tar'.format(self.model, k)),
                'wb'))
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        rmses.append(rmse)
        if self.verbose:
            print(
                'done training with {} fold and rmse={}. took {}s'.format(
                    k, round(rmse, 4), int(time.time() - start)))
    rmse = round(np.mean(rmses), 4)
    # re-assemble the out-of-fold predictions in original row order
    yhats = pd.concat(yhats).sort_index()
    yhats.to_csv(
        os.path.join(self.output_path, '{}_{}'.format(self.model, rmse)))
    if self.verbose:
        print('avg rmse: {}'.format(rmse))
    return rmse
def calculateAccuracy(indbest, X_train, y_train, X_test, y_test):
    """Fit an elastic net on the evaluated term matrix and return its test R^2.

    :param indbest: iterable of selected terms (passed to evaluatedMatrix)
    :return: R^2 of the fitted model on the test set
    """
    indbest = list(indbest)
    evalTrain = evaluatedMatrix(indbest, X_train)
    evalTest = evaluatedMatrix(indbest, X_test)
    # Linear regression with elastic net
    regr = ElasticNetCV(random_state=0)
    regr.fit(evalTrain, y_train)
    y_pred = regr.predict(evalTest)
    # compute once instead of twice (was recomputed for the print and return)
    accuracy = r2_score(y_test, y_pred)
    print("Test Accuracy: ", accuracy)
    return accuracy
def local_elasticnet(dataset, cid_input):
    """Fit an elastic net and attach the exponentiated non-promotional
    baseline to the dataset.

    :param dataset: input DataFrame
    :param cid_input: config object carrying schema / feature-name attributes
    :return: DataFrame restricted to cid_input.LOCAL_SCHEMA_CID3 with an
             added 'initial_base_line' column
    """
    x_df = dataset[cid_input.X_SCHEMA_CID3]
    y_df = dataset[cid_input.Y_SCHEMA_CID3]
    #x_df = (x_df - x_df.mean()) / (x_df.max() - x_df.min())
    # Do NOT apply the min-max normalization above for elastic net.
    x_np = x_df.values
    y_np = y_df.values.ravel()
    lm = ElasticNetCV()
    lm.fit(x_np, y_np)
    coefficients = lm.coef_
    intercept = lm.intercept_
    # FIX: predict on the same ndarray the model was fitted on (x_np);
    # predicting on the DataFrame triggers feature-name mismatch warnings.
    initial_fitting = lm.predict(x_np)
    # outputs are on log scale — exponentiate back
    initial_fitting = np.exp(initial_fitting)
    # baseline: the same rows with promotion features neutralized
    x_non_promo = non_promo_inputs(x_df, cid_input.cid3_promotion_features).values
    s_initial_fitting = lm.predict(x_non_promo)
    s_initial_fitting = np.exp(s_initial_fitting)
    current_dt = dataset.copy()
    current_dt['initial_base_line'] = s_initial_fitting
    return current_dt[cid_input.LOCAL_SCHEMA_CID3]
def elastic_regression(X, y, X_test):
    """Fit ElasticNetCV over a small alpha / l1_ratio grid and predict X_test.

    :return: (predictions as a DataFrame, best alpha, best l1_ratio)
    """
    estimator = ElasticNetCV(alphas=[0.5, 1.0, 5.0, 10],
                             l1_ratio=[.1, .5, .9, 0.95, 0.99],
                             tol=0.001, max_iter=5000)
    estimator.fit(X, y)
    pred_elastic = pd.DataFrame(estimator.predict(X_test))
    best_alpha = estimator.alpha_
    best_l1_ratio = estimator.l1_ratio_
    print ("Best Alpha for ElasticNet:", best_alpha)
    print ("L1 Ratio for ElasticNet:", best_l1_ratio)
    return pred_elastic, best_alpha, best_l1_ratio
def eNetModel(data, labels, featureNames, texts, documents, nFolds):
    """K-fold CV of an elastic net used as a threshold classifier.

    Ported to Python 3 prints and the scikit-learn >= 0.18 KFold API
    (`KFold(len(texts), n_folds=nFolds)` no longer exists).

    :return: (accuracy as a Decimal, mean coefficient path per feature)
    """
    # run elastic net with K-fold cross validation
    kf = KFold(n_splits=nFolds)
    acc = 0
    mean_coefs = []
    for train, test in kf.split(texts):
        label_train = labels[train]
        #selected_feats = getSelectedFeatures(train, test, texts, featureNames, documents, label_train, nFeats)
        full_train_data, full_test_data, label_train, label_test = data[train], data[test], labels[train], labels[test]
        #data_train = sortBySelected(full_train_data, selected_feats, featureNames)
        #data_test = sortBySelected(full_test_data, selected_feats, featureNames)
        data_train = full_train_data
        data_test = full_test_data
        enet = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], n_alphas=1000,
                            alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
        enet.fit(data_train, label_train)
        data_train = np.asarray(data_train, dtype=float)
        label_train = np.asarray(label_train, dtype=float)
        # coefficient path across the regularization grid
        vals = enet.path(data_train, label_train)
        mean_coefs.append(np.mean(vals[1], axis=1))
        # regression output thresholded at 0.5 acts as the classifier
        if label_test == 1 and enet.predict(data_test) > 0.5:
            acc += 1
        elif label_test == 0 and enet.predict(data_test) < 0.5:
            acc += 1
        if len(mean_coefs) % 10 == 0:
            # was a Python 2 print statement
            print(str(len(mean_coefs)), 'out of %s subs finished' % (str(len(data))))
    mean_coefs = np.mean(np.array(mean_coefs), axis=0)
    return Decimal(acc) / Decimal(len(data)), mean_coefs
def regtsls(data, opts):
    """Regularized two-stage least squares.

    Stage 1: polynomial features of Z -> polynomial T (multi-task elastic net).
    Stage 2: elastic net of Y on the stage-1 fitted values.

    :param data: (T_test, Z, T, Y) tuple
    :param opts: options dict; 'lin_degree' sets the polynomial degree (default 1)
    :return: stage-2 predictions for T_test, reshaped to Y's trailing dims
    """
    T_test, Z, T, Y = data
    trans = PolynomialFeatures(degree=_get(opts, 'lin_degree', 1), include_bias=False)
    polyT = trans.fit_transform(T)
    first = Pipeline([('poly', PolynomialFeatures(degree=_get(opts, 'lin_degree', 1))),
                      ('elasticnet', MultiTaskElasticNetCV(cv=3))])
    first.fit(Z, polyT)
    second = ElasticNetCV(cv=3)
    second.fit(first.predict(Z), Y.ravel())
    # FIX: transform (not fit_transform) the test data — the transformer must
    # stay fitted on the training T.  Numerically identical for
    # PolynomialFeatures, but refitting on test data is an anti-pattern.
    polyT_test = trans.transform(T_test)
    return second.predict(polyT_test).reshape(T_test.shape[:1] + Y.shape[1:])
def train_elasticNetCV(self, data):
    """Train an ElasticNetCV on (train, validation) data, report validation MAE
    (after expm1 back-transform), pickle the model and return predictions for
    self.x_test.

    :param data: ((x_tr, y_tr), (x_val, y_val)) tuple
    :return: predictions for self.x_test (still in log1p space)
    """
    train, validacion = data
    x_tr, y_tr = train
    x_val, y_val = validacion
    #print("El set de train tiene {} filas y {} columnas".format(x_tr.shape[0],x_tr.shape[1]))
    #print("El set de validacion tiene {} filas y {} columnas".format(x_val.shape[0],x_val.shape[1]))
    print('Start training ElasticNetCV...')
    start_time = self.timer()
    enet = ElasticNetCV(normalize=True, n_alphas=2000, max_iter=2000, cv=10)
    enet.fit(x_tr, y_tr)
    print("The R2 is: {}".format(enet.score(x_tr, y_tr)))
    print("The alpha choose by CV is:{}".format(enet.alpha_))
    self.timer(start_time)
    print("Making prediction on validation data")
    # targets were log1p-transformed upstream; invert before computing MAE
    y_val = np.expm1(y_val)
    y_val_pred = np.expm1(enet.predict(x_val))
    mae = mean_absolute_error(y_val, y_val_pred)
    print("El mean absolute error de es {}".format(mae))
    print('Saving model into a pickle')
    # FIX: exist_ok replaces the bare try/except that swallowed every error
    # (permission problems included), not just "directory already exists".
    os.makedirs('pickles', exist_ok=True)
    with open('pickles/enetCV.pkl', 'wb') as f:
        pickle.dump(enet, f)
    print('Making prediction and saving into a csv')
    y_test = enet.predict(self.x_test)
    return y_test
def elastic_net_reg():
    """Fit ElasticNetCV on module-level scaled splits and collect test metrics.

    Side effect: rebinds the module-level ``metrics_en`` list.
    """
    from sklearn.linear_model import ElasticNetCV
    alpha_count = 300
    mix_ratios = [.1, .3, .5, .7, .9]
    model = ElasticNetCV(n_alphas=alpha_count, l1_ratio=mix_ratios, cv=10,
                         random_state=0)
    model.fit(X_train_scaled, y_train)
    y_pred_train = model.predict(X_train_scaled)
    #y_pred_train_round = np.round(y_pred_train)
    y_pred_test = model.predict(X_test_scaled)
    #y_pred_test_round = np.round(y_pred_test)
    print(model.alpha_, model.l1_ratio_)
    print(model.score(X_test_scaled, y_test))
    #plot_conf_mat(y_test, _pred_round)
    global metrics_en
    metrics_en = [
        accuracy_score(y_test, np.round(y_pred_test)),
        mean_squared_error(y_test, y_pred_test),
        r2_score(y_test, y_pred_test)
    ]
    return scores_results(y_train, y_test, y_pred_train, y_pred_test)
def LCCB_coevo(fitness_fn, pop):
    """Coevolutionary fitness assignment: regress the target on the population's
    semantics with an elastic net and score each individual by |coefficient|.

    Ported from Python 2: sys.maxint no longer exists — sys.maxsize is used as
    the 'invalid fitness' sentinel (NOTE(review): confirm producers of
    ind.fitness use the same sentinel).
    """
    y = fitness_fn.train_y
    # Make a new array composed of pop[i].semantics for all i
    # (pop[i].semantics has already been calculated)
    X = None
    for ind in pop:
        if (ind.phenotype and ind.fitness != sys.maxsize
                and all(np.isfinite(ind.semantics))):
            col = ind.semantics
        else:
            print("Omitting a column")
            col = np.zeros(len(y))
        if X is None:
            X = col
        else:
            X = np.c_[X, col]
    eps = 5e-3
    # FIXME FFX processes the data so that has zero mean and unit
    # variance before applying the LR... should we do that?
    # Use ElasticNet with cross-validation, which will automatically
    # get a good value for regularisation
    model = ElasticNetCV()
    model.fit(X, y)
    coefs = model.coef_
    output = model.predict(X)
    rmse = fitness_fn.rmse(y, output)
    print("rmse", rmse)
    # Assign the magnitude of coefficients as individual fitness
    # values. Have to construct a new individual because tuples are
    # immutable. FIXME this is not a great method -- it's likely that
    # the population will converge on one or a few basis functions,
    # and then the performance of the ENet will decrease because there
    # won't be enough independent basis functions to work with.
    pop = [variga.Individual(genome=pop[i].genome,
                             used_codons=pop[i].used_codons,
                             fitness=-abs(coefs[i]),
                             phenotype=pop[i].phenotype,
                             readable_phenotype=pop[i].readable_phenotype,
                             semantics=pop[i].semantics)
           for i in range(len(pop))]
    pop.sort(key=variga.ind_compare)
def predict(train):
    """Fill unrated entries of a user x item matrix with per-user elastic-net
    predictions trained on all remaining users."""
    binary = (train > 0)
    reg = ElasticNetCV(fit_intercept=True, alphas=[
        0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    norm = NormalizePositive()
    train = norm.fit_transform(train)
    filled = train.copy()
    # one regression per user
    for u in range(train.shape[0]):
        # the training matrix excludes the current user's own row
        curtrain = np.delete(train, u, axis=0)
        bu = binary[u]
        # only regress when the user has rated more than a handful of items
        if np.sum(bu) > 5:
            reg.fit(curtrain[:, bu].T, train[u, bu])
            # fill only the entries that were missing
            filled[u, ~bu] = reg.predict(curtrain[:, ~bu].T)
    return norm.inverse_transform(filled)
def regress(x, y, title): clf = ElasticNetCV(max_iter=200, cv=10, l1_ratio = [.1, .5, .7, .9, .95, .99, 1]) clf.fit(x, y) print "Score", clf.score(x, y) pred = clf.predict(x) plt.title("Scatter plot of prediction and " + title) plt.xlabel("Prediction") plt.ylabel("Target") plt.scatter(y, pred) # Show perfect fit line if "Boston" in title: plt.plot(y, y, label="Perfect Fit") plt.legend() plt.grid(True) plt.show()
def predict(train):
    """Per-user collaborative filtering via elastic-net regression.

    For every user, train on all remaining users and fill the entries the
    user has not rated; return values on the original rating scale.
    """
    has_rating = (train > 0)
    reg = ElasticNetCV(fit_intercept=True,
                       alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    norm = NormalizePositive()
    train = norm.fit_transform(train)
    filled = train.copy()
    # Loop over every user
    for u in range(train.shape[0]):
        # Drop the current user from the training rows
        rest = np.delete(train, u, axis=0)
        known = has_rating[u]
        if np.sum(known) > 5:
            reg.fit(rest[:, known].T, train[u, known])
            # Fill in the previously missing values
            filled[u, ~known] = reg.predict(rest[:, ~known].T)
    return norm.inverse_transform(filled)
def build_regression(dat, start, n):
    """Fit an ElasticNetCV on engineered features, then roll predictions
    forward row by row for the next quarter and the quarter after next,
    feeding each prediction back into the lagged feature columns.

    Returns (pred_next, pred_afternext) as numpy arrays.

    NOTE(review): uses pandas ``.ix`` indexing, removed in pandas 1.0 — this
    code requires an old pandas (switch to .loc/.iloc on upgrade).
    """
    print('Building linear regression...')
    from sklearn import datasets, linear_model
    from sklearn.linear_model import ElasticNetCV
    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_absolute_error
    # Keep only the first column and the last five columns of `dat`.
    dat = dat.drop(dat.columns[1:-5], axis=1)
    df = feature_engineer(dat,start,n)
    # # Split the targets into training/testing sets
    train = df[df['train']==1]
    test = df[df['train']==0]
    # Column 0 is the target; columns 1..-2 are the features.
    df_x_train = train.ix[:,1:-1]
    df_x_test = test.ix[:,1:-1]
    df_y_train = train.ix[:,0]
    df_y_test = test.ix[:,0]
    ##### The parameter l1_ratio corresponds to alpha in the glmnet R package
    # while alpha corresponds to the lambda parameter in glmnet. Specifically,
    # l1_ratio = 1 is the lasso penalty. Currently, l1_ratio <= 0.01 is not reliable,
    # unless you supply your own sequence of alpha.
    df_x_test= df_x_test.reset_index(drop=True)
    # Row counts of the next quarter (max qindex - 1) and the one after it.
    lenn = df_x_test[df_x_test['qindex']==max(df_x_test['qindex'])-1].shape[0]
    lena = df_x_test[df_x_test['qindex']==max(df_x_test['qindex'])].shape[0]
    ############### elasticnet cv ##########
    temp=[]  # NOTE(review): never used afterwards
    enetcv = ElasticNetCV(l1_ratio=[.01, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99, 1])
    enetcv.fit(df_x_train, df_y_train)
    ########## next quarter ###########
    pred_next=[]
    for i in range(lenn):
        # Predict one row, then propagate the prediction into the lagged
        # feature columns of later rows (same week next quarter, next week).
        y_pre = enetcv.predict(df_x_test.iloc[i,:].values.reshape(1,-1))[0]
        pred_next.append(y_pre)
        if i < lenn:  # NOTE(review): always true inside range(lenn) — confirm intent
            week = df_x_test.ix[i,'time']
            quarter = df_x_test.ix[i,'qindex']
            nextq = np.where((df_x_test['time']==week) & (df_x_test['qindex']==quarter+1))[0][0]
            df_x_test.ix[nextq,'lastq'] = y_pre
            df_x_test.ix[nextq,'avepnl'] = df_x_test.ix[(nextq-n+1):nextq,'lolastweek'].mean(axis=0)
            df_x_test.ix[nextq,'prop'] = df_x_test.ix[nextq,'avepnw']/df_x_test.ix[nextq,'avepnl']*df_x_test.ix[nextq,'lastq']
        if i < lenn-1:
            nextw = np.where((df_x_test['time']==week+1) & (df_x_test['qindex']==quarter))[0][0]
            nextwq = np.where((df_x_test['time']==week+1) & (df_x_test['qindex']==quarter+1))[0][0]
            df_x_test.ix[nextw,'lastw'] = y_pre
            df_x_test.ix[nextwq,'lolastweek'] = y_pre
            df_x_test.ix[nextw,'avepnw'] = df_x_test.ix[max(nextw-n+1,1):nextw,'lastw'].mean(axis=0)
            df_x_test.ix[nextw,'prop'] = df_x_test.ix[nextw,'avepnw']/df_x_test.ix[nextw,'avepnl']*df_x_test.ix[nextw,'lastq']
    ############ quarter after next ##########
    pred_afternext = []
    for i in range(lena):
        y_pre = enetcv.predict(df_x_test.iloc[(i+lenn),:].values.reshape(1,-1))[0]
        pred_afternext.append(y_pre)
        if i < lena-1:
            week = df_x_test.ix[i+lenn,'time']
            quarter = df_x_test.ix[i+lenn,'qindex']
            nextw = np.where((df_x_test['time']==week+1) & (df_x_test['qindex']==quarter))[0][0]
            df_x_test.ix[nextw,'lastw'] = y_pre
            df_x_test.ix[nextw,'avepnw'] = df_x_test.ix[max(nextw-n+1,1):nextw,'lastw'].mean(axis=0)
            df_x_test.ix[nextw, 'prop'] = df_x_test.ix[nextw,'avepnw']/df_x_test.ix[nextw,'avepnl']*df_x_test.ix[nextw,'lastq']
    # Combined predictions scored against the held-out targets.
    y_pred_enetcv = pred_next + pred_afternext
    #print(mean_absolute_error(df_y_test, y_pred_enetcv, sample_weight=None, multioutput='uniform_average'))
    r2_score_enetcv = r2_score(df_y_test, y_pred_enetcv)
    # print("r^2 on test data : %f" % r2_score_enetcv)
    return np.array(pred_next), np.array(pred_afternext)
#%% #try elastic net #alpha equals lambda here lambda_grid = [0.01, 0.1 , 1, 10,100] l1_ratio_grid = [0.1,0.3,0.5,0.7,0.9] enet_CV = ElasticNetCV(l1_ratio=l1_ratio_grid,alphas=lambda_grid,cv=3,n_jobs=-1,verbose=True) enet_CV.fit(train_X,train_Y) #%% #show enet_CV.score(test_X,test_Y) plt.plot(enet_CV.predict(test_X),test_Y,'o') #%% #try svr svr = SVR(kernel = 'rbf',C=1,cache_size=2000) SVR_params = { 'C' : [1e-1,1.0,1e2,1e3,1e4] } svr_rs = grid_search.RandomizedSearchCV(svr,SVR_params,verbose=True,n_jobs=-1) svr.fit(train_X[:,whichones[0]],train_Y) #%% #try bagging/boosting etc #rfr = RandomForestRegressor(n_estimators = 30,n_jobs = 2) #rfr.fit(train_X,train_Y)
# Grid-search the regularisation strength for a geometric negative-binomial
# model using the pre-built cross-validation folds (a, b) = (train, test).
performance_negativebinomial = []
for x in [0.01,0.1,1,5,10]:
    cost = []
    for a,b in cross_validation_object:
        resultingmodel = sm.NegativeBinomial(Y[a],X[a],loglike_method = 'geometric')
        #res = resultingmodel.fit(disp=False, maxiter = 200)
        res2 = resultingmodel.fit_regularized(alpha = x, maxiter = 200)
        cost.append(mean_squared_error(res2.predict(X[b]), Y[b]))
    performance_negativebinomial.append(np.mean(cost))

##### Log linear model ##########  not even close.
from sklearn.linear_model import ElasticNetCV
linear_fit = ElasticNetCV(cv = cross_validation_object, alphas = [0.01,0.1,1,5,10])
# log1p-transform the counts, fit, then invert the transform for the error.
linear_fit.fit(X,np.log(Y+1))
mean_squared_error(np.exp(linear_fit.predict(X)) - 1, Y)

########## creating final model using train data + test data
X_test,Y_test,junk = prepare_for_model('Dogs_Final_Test.csv',1)
X,Y,junk = prepare_for_model('Dogs_Final_Train.csv',1)
scaler = MinMaxScaler([0,1])
X_all = scaler.fit_transform(np.vstack((X_test,X)))
Y_all = np.hstack((Y_test,Y))
# Cap the target at 30 before fitting the final model.
Y_all = np.array([30 if i > 30 else i for i in Y_all])
final_model = sm.NegativeBinomial(Y_all,X_all,loglike_method = 'geometric')
res2 = final_model.fit_regularized( alpha = 5, maxiter = 200)
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.cross_validation import train_test_split

__doc__ = "See newcomparison.m"

# Experiment knobs.
l1_ratio = 0.5  # NOTE(review): defined but never passed to ElasticNetCV below
k_fold = 10
test_frac = 0.5
data_root = path.expanduser('~/data')

# Load MNIST data
mnist = fetch_mldata('MNIST original', data_home=data_root)
X = mnist.data
y = mnist.target

# Split into train/test_frac
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_frac, random_state=0)

# Construct and fit model
en = ElasticNetCV(cv=k_fold, n_jobs=-1, random_state=0)
en.fit(X_train, y_train)

# Evaluate performance: round the regression output to the nearest digit
# label and score it as a classification.
y_pred = np.round(en.predict(X_test))
conf_mat = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print(acc)
predicted=lr.predict(X)

'''validation'''
# 5-fold CV: collect out-of-fold predictions for the linear model.
kf=KFold(len(X),n_folds=5)
p=np.zeros_like(y)
for train,test in kf:
    lr.fit(X[train],y[train])
    p[test]=lr.predict(X[test])
rmse_cv=np.sqrt(mean_squared_error(p,y))
print "RMSE of 5-fold cv {:.2}".format(rmse_cv)

'''ElasticNet'''
# Same out-of-fold protocol with a cross-validated elastic net.
from sklearn.linear_model import ElasticNetCV
met=ElasticNetCV(n_jobs=-1)
p=np.zeros_like(y)
for t,tst in kf:
    met.fit(X[t],y[t])
    p[tst]=met.predict(X[tst])
p2=r2_score(y,p)
print met.score(X,y)
print p2,"Elastic"
exit()
# NOTE(review): everything below is unreachable because of exit() above.
plt.scatter(predicted,y)
plt.xlabel("Predicted")
plt.ylabel("Actual ")
plt.plot([y.min(),y.max()],[[y.min()],[y.max()]])
plt.show()
# Cross-validated scores: ROC-AUC for the classifiers (binary target),
# the configured `Scoring` metric for the regressors.
forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
lasso_cv_score = cross_validation.cross_val_score(coef_path_lasso_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
elastic_cv_score = cross_validation.cross_val_score(coef_path_elastic_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
logistic_cv_score = cross_validation.cross_val_score(coef_path_logistic_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
binary_x_logistic_cv_score = cross_validation.cross_val_score(coef_path_binary_x_logistic_cv, binary_X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')

# Bundle per-model predictions, params and fitted attributes for reporting.
# NOTE(review): `.get_params` is stored without calling it (missing
# parentheses), so the bound method object is saved, not the parameters.
# NOTE(review): the classification_report/r2_score entries below use
# predictions on the full X — training-set metrics, unlike the CV scores.
forest_results_parameters = [ coef_path_forest_cv.predict(X), coef_path_forest_cv.get_params, coef_path_forest_cv.feature_importances_, coef_path_forest_cv.classes_, coef_path_forest_cv.n_classes_]
forest_scores = [forest_cv_score, classification_report(binary_y, forest_results_parameters[0]), 'forest']
lasso_results_parameters = [coef_path_lasso_cv.predict(X), coef_path_lasso_cv.get_params, coef_path_lasso_cv.alphas_, coef_path_lasso_cv.coef_]
lasso_scores = [lasso_cv_score, r2_score(y,lasso_results_parameters[0]), 'lasso']
elastic_results_parameters = [ coef_path_elastic_cv.predict(X), coef_path_elastic_cv.get_params, coef_path_elastic_cv.alphas_ , coef_path_elastic_cv.coef_]
elastic_scores = [elastic_cv_score, r2_score(y,elastic_results_parameters[0]), 'elastic']
logistic_results_parameters = [coef_path_logistic_cv.predict(X), coef_path_logistic_cv.get_params, coef_path_logistic_cv.coef_]
logistic_scores = [logistic_cv_score, classification_report(binary_y, logistic_results_parameters[0]), 'logistic']
binary_x_logistic_results_parameters = [coef_path_binary_x_logistic_cv.predict(X), coef_path_binary_x_logistic_cv.get_params, coef_path_binary_x_logistic_cv.coef_]
binary_x_logistic_scores = [binary_x_logistic_cv_score, classification_report(binary_y, binary_x_logistic_results_parameters[0]), 'binary_logistic']

##LINEAR REGRESSION METHOD BEGIN
# Summarise logistic-regression feature counts (Python 2 prints).
reduced_feature_matrix_logistic = []
# logistic_results_parameters[2] holds coef_ (2-D), hence [2][0] = first row.
print "list of features from logistic regression:%d" % len(logistic_results_parameters[2][0])
print len(X[0])
# Elastic-net CV on the merged sales data: 5-fold out-of-fold predictions.
# (The redundant bare `met = ElasticNetCV()` that was immediately shadowed
# below has been removed.)
# NOTE(review): .as_matrix() was removed in pandas 1.0; use .values on upgrade.
features = sales_merged[['PMI_Portfolio_AVB_Boost', 'PMI_Portfolio_PFP_Boost',
                         'PMI_Portfolio_PPRP', 'PMI_Portfolio_SA',
                         'SubFam_Hostess', 'SubFam_PFP_Boost', 'SubFam_RAP',
                         'SubFam_SA', 'Fam_AVB_Boost', 'Fam_Hostess',
                         'Fam_PFP_Boost', 'Fam_RAP', 't', 'Affinity',
                         'Brand Character', 'Functional Performance']].as_matrix()
target = sales_merged['Volume_Sales'].as_matrix()

# Cross-validate the L1/L2 mix as well as the regularisation strength.
met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99])
kf = KFold(len(target), n_folds=5)
pred = np.zeros_like(target)
for train, test in kf:
    met.fit(features[train], target[train])
    pred[test] = met.predict(features[test])

print('[EN CV] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
print('[EN CV] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred)))
print('')
# Grid-searched SVM baseline (Python 2).
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, param_grid)
clf.fit(train, train_label)
predictions = clf.predict(test)
correct = isCorrect(predictions, False)
# NOTE(review): under Python 2, correct/len(predictions) is integer division
# when isCorrect returns an int — acc would truncate to 0; confirm.
acc = correct/len(predictions)
print 'SVM acc:', acc

#### eNet ####
# Elastic net over an explicit alpha list (per sklearn docs, n_alphas is
# ignored when alphas is supplied).
enet = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],n_alphas=1000,alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
enet.fit(train, train_label)
predictions = enet.predict(test)
correct = isCorrect(predictions, True)
acc = correct/len(predictions)
print 'eNet acc:', acc
#
# For each subject and finger, fit ElasticNetCV on the channel subset chosen
# by each selection method and post-process the prediction with a logistic
# gate, moving-average mask, and low-pass filter (Python 2 script).
# NOTE(review): the original indentation is lost (collapsed source); the
# if/else nesting below is the literal reading of the token stream — confirm
# against the upstream file whether a for/else was intended instead.
l = []
with h5py.File("ECoG_big_data.h5", "r+") as f1:
    with h5py.File("selected.h5", "r+") as f2:
        for i in range(1, 4):
            sid = "sub" + str(i)
            X = f1[sid]["train_data"][:]
            Y = f1[sid]["train_clabel"][:]   # continuous labels
            Yb = f1[sid]["train_blabel"][:]  # binary labels
            Xt = f1[sid]["test_data"][:]
            Yt = f1[sid]["test_clabel"][:]
            Ytb = f1[sid]["test_blabel"][:]
            for finger in range(5):
                for method in ["l1", "mcp", "scad"]:
                    # Stored indices are 1-based; shift to 0-based.
                    idxc = f2[sid]["finger" + str(finger + 1)][method][:] - 1
                    idxb = f2[sid]["finger" + str(finger + 1)]["l1_l"][:] - 1
                    en = ElasticNetCV()
                    en.fit(X[:, idxc].astype("float64"), Y[:, finger])
                    yp = en.predict(Xt[:, idxc])
                    corr = np.corrcoef(yp, Yt[:, finger])[0, 1]
                    # Skip to the next finger when correlation is too weak.
                    if corr < 0.3:
                        break
                    else:
                        l.append([sid + "//" + "finger" + str(finger + 1), corr])
                    # Gate the regression output by the classifier's on/off call.
                    lr = LogisticRegressionCV()
                    lr.fit(X[:, idxc], Yb[:, finger])
                    tp = yp * fun(lr.predict(Xt[:, idxc]))
                    # 40-sample moving average decides active segments.
                    m = np.where(np.convolve(tp, np.ones((40,)) / 40, mode="same") < 0.5, 0, 1)
                    # 2nd-order low-pass Butterworth at 9/25 of Nyquist.
                    b, a = butter(2, 9.0 / 25, "low")
                    yy = relu(filtfilt(b, a, tp * m))
                    print corr, np.corrcoef(Yt[:, finger], yy)[0, 1]
# It is made available under the MIT License import numpy as np from sklearn.datasets import load_svmlight_file from sklearn.cross_validation import KFold from sklearn.linear_model import ElasticNetCV from sklearn.metrics import mean_squared_error, r2_score data, target = load_svmlight_file('data/E2006.train') # Edit the lines below if you want to switch method: # met = LinearRegression(fit_intercept=True) met = ElasticNetCV() kf = KFold(len(target), n_folds=5) pred = np.zeros_like(target) for train, test in kf: met.fit(data[train], target[train]) pred[test] = met.predict(data[test]) print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred))) print('') met.fit(data, target) pred = met.predict(data) print('[EN 0.1] RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred)))) print('[EN 0.1] R2 on training, {:.2}'.format(r2_score(target, pred)))
    # (continuation of a cross-validation loop begun before this chunk)
    p[test] = met.predict(x[test])
r2_cv = r2_score(y, p)
print('Method: {}'.format(name))
print('R2 on training: {}'.format(r2_train))
print('R2 on 5-fold CV: {}'.format(r2_cv))
print()

# Construct an ElasticNetCV object (use all available CPUs)
met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99])
kf = KFold(len(x), n_folds=5)
pred = np.zeros_like(y)
for train, test in kf:
    met.fit(x[train], y[train])
    pred[test] = met.predict(x[test])
# NOTE(review): the two prints below report on `p` (the previous model's
# out-of-fold predictions), not the `pred` computed just above — almost
# certainly a bug; confirm and switch to `pred`.
print('[EN CV l1_ratio] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(y, p))))
print('[EN CV l1_ratio] R2 on testing (5 fold), {:.2}'.format(r2_score(y, p)))
print('')

'''
# unit version
from time import time
import numpy as np
from step3_vectorize_text import preprocess_4
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Lasso summary (Python 2 prints), then 2-D histogram of truth vs prediction.
print lasso_cv_score
plt.figure()
plt.hist2d(y, lasso_prediction)
plt.ylabel("Predicted Values")
plt.xlabel("Truth Values")
plt.title("Lasso Linear Regression")
plt.savefig("figures/lasso_predicted_truth.png")

print "#######ELASTIC#####"
# Fit the elastic net on the full data and dump its fitted attributes.
coef_path_elastic_cv.fit(X,y)
# NOTE(review): get_params is printed without parentheses — this prints the
# bound method, not the parameter dict.
print coef_path_elastic_cv.get_params
print "alphas:"
print coef_path_elastic_cv.alphas_
print "coef_:"
print coef_path_elastic_cv.coef_
print "length of elastic terms:%d" % len(coef_path_elastic_cv.coef_)
elastic_predict = coef_path_elastic_cv.predict(X)
# Training-set R^2 (regressor default scorer).
elastic_score = coef_path_elastic_cv.score(X,y)
print "elastic_score:%.3g" % elastic_score
elastic_cv_score = cross_validation.cross_val_score(coef_path_elastic_cv, X, y, n_jobs=2, cv=5)
print elastic_cv_score
#print "elastic precision:%.3g" % precision_score(y, elastic_predict, average='macro')
plt.figure()
plt.hist2d(y, elastic_predict)
plt.ylabel("Predicted Values")
plt.xlabel("Truth Values")
plt.title("Elastic Linear Regression")
plt.savefig("figures/elastic_predicted_truth.png")

print "#######Logistic#####"
coef_path_logistic_cv.fit(X,binary_y)
print coef_path_logistic_cv.get_params
print "coef_:"
def do_validation(data_path, steps=10):
    """Compare several models by mean per-driver ROC-AUC (Python 2).

    For each of the first `steps` drivers: build its merged dataset, shuffle,
    take 100 rows for training and the next 300 for testing, fit every model
    and accumulate its AUC. Tree/neighbour models use the full feature set;
    linear models use a reduced set. Prints the per-model averages.
    """
    allfiles = initialize(data_path)
    # Tree / neighbour models (full feature set).
    gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, min_samples_leaf=5, subsample=0.5)
    ada = AdaBoostRegressor(n_estimators=200, learning_rate=1)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5)
    rf = RandomForestRegressor(n_estimators=200, max_features=4, min_samples_leaf=5)
    kn = KNeighborsRegressor(n_neighbors=25)
    # Linear models (reduced feature set).
    logit = LogisticRegression(tol=0.05)
    enet = ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05)
    # NOTE(review): modern sklearn's SVR has no `probability` argument; this
    # only constructs on the old sklearn this script targets — confirm.
    svr = SVR(kernel="linear", probability=True)
    ridge = Ridge(alpha=18)
    bridge = BayesianRidge(n_iter=500)
    # Running AUC sums; divided by `steps` when printed at the end.
    gbm_metrics = 0.0
    ada_metrics = 0.0
    etree_metrics = 0.0
    rf_metrics = 0.0
    kn_metrics = 0.0
    logit_metrics = 0.0
    svr_metrics = 0.0
    ridge_metrics = 0.0
    bridge_metrics = 0.0
    enet_metrics = 0.0
    nnet_metrics = 0.0
    # "Neural network" entry: RBM features piped into logistic regression.
    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    for i in xrange(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y
        # Shuffle DF.
        df = df.reindex(np.random.permutation(df.index))
        train = df[:100]
        label = train['label']
        del train['label']
        test = df[100:400]
        Y = test['label']
        del test['label']
        #to_drop = ['driver', 'trip', 'speed1', 'speed2', 'speed3', 'speed4', 'speed5', 'speed6', 'speed7', 'speed8', 'speed9',
        #    'speed10', 'speed11', 'speed12', 'speed13', 'speed14', 'speed15', 'speed16', 'speed17', 'speed18', 'speed19',
        #    'speed20', 'speed21', 'speed22', 'speed23', 'speed24', 'speed25', 'speed26', 'speed27', 'speed28', 'speed29',
        #    'speed30', 'speed31', 'speed32', 'speed33', 'speed34', 'speed35', 'speed36', 'speed37', 'speed38', 'speed39',
        #    'speed40', 'speed41', 'speed42', 'speed43', 'speed44', 'speed45', 'speed46', 'speed47', 'speed48', 'speed49',
        #    'speed50', 'speed51', 'speed52', 'speed53', 'speed54', 'speed55', 'speed56', 'speed57', 'speed58', 'speed59',
        #    'speed60', 'speed61', 'speed62', 'speed63', 'speed64', 'speed65', 'speed66', 'speed67', 'speed68', 'speed69',
        #    'speed70', 'speed71', 'speed72', 'speed73', 'speed74', 'speed75', 'speed76', 'speed77', 'speed78', 'speed79', 'speed80']
        to_drop = ['driver', 'trip']
        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)
        # Fit each non-linear model and accumulate its ROC-AUC on this driver.
        gbm.fit(X_train, label)
        Y_hat = gbm.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        gbm_metrics += metrics.auc(fpr, tpr)
        ada.fit(X_train, label)
        Y_hat = ada.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ada_metrics += metrics.auc(fpr, tpr)
        etree.fit(X_train, label)
        Y_hat = etree.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        etree_metrics += metrics.auc(fpr, tpr)
        rf.fit(X_train, label)
        Y_hat = rf.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        rf_metrics += metrics.auc(fpr, tpr)
        kn.fit(X_train, label)
        Y_hat = kn.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        kn_metrics += metrics.auc(fpr, tpr)
        # Linear models.
        to_drop = ['driver', 'trip', 'distance', 'sd_acceleration', 'final_angle', 'mean_acceleration', 'mean_avg_speed', 'sd_inst_speed',
            'sd_avg_speed', 'mean_inst_speed', 'points']
        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)
        # Logistic regression scores with its positive-class probability.
        logit.fit(X_train, label)
        Y_hat = [i[1] for i in logit.predict_proba(X_test)]
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        logit_metrics += metrics.auc(fpr, tpr)
        svr.fit(X_train, label)
        Y_hat = svr.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        svr_metrics += metrics.auc(fpr, tpr)
        ridge.fit(X_train, label)
        Y_hat = ridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ridge_metrics += metrics.auc(fpr, tpr)
        bridge.fit(X_train, label)
        Y_hat = bridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        bridge_metrics += metrics.auc(fpr, tpr)
        enet.fit(X_train, label)
        Y_hat = enet.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        enet_metrics += metrics.auc(fpr, tpr)
        classifier.fit(X_train, label)
        Y_hat = classifier.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        nnet_metrics += metrics.auc(fpr, tpr)
    # Per-model mean AUC over the evaluated drivers.
    print ""
    print "GBM:", gbm_metrics/steps
    print "AdaBoost:", ada_metrics/steps
    print "Extra Trees:", etree_metrics/steps
    print "RF:", rf_metrics/steps
    print "KN:", kn_metrics/steps
    print ""
    print "Logit:", logit_metrics/steps
    print "SVR:", svr_metrics/steps
    print "Ridge:", ridge_metrics/steps
    print "BayesianRidge:", bridge_metrics/steps
    print "Elastic Net:", enet_metrics/steps
    print "Neural Networks:", nnet_metrics/steps
    print ""
#%% #try elastic net #alpha equals lambda here lambda_grid = [0.01, 0.1 , 1, 10,100] l1_ratio_grid = [0.1,0.3,0.5,0.7,0.9] enet_CV = ElasticNetCV(l1_ratio=l1_ratio_grid,cv=3,n_jobs=-1,verbose=True) enet_CV.fit(train_X,train_Y) #%% #show enet_CV.score(test_X,test_Y) plt.plot(enet_CV.predict(test_X),test_Y,'o') #%% #try svr svr = SVR(kernel = 'rbf',C=1,cache_size=2000) SVR_params = { 'C' : [1e-1,1.0,1e2,1e3,1e4] } svr_rs = grid_search.RandomizedSearchCV(svr,SVR_params,verbose=True,n_jobs=-1) svr.fit(train_X[:,whichones[0]],train_Y) #%% #try bagging/boosting etc #rfr = RandomForestRegressor(n_estimators = 30,n_jobs = 2) #rfr.fit(train_X,train_Y)
print '\n------------------------------------------------------------------------------'
#############################################################################################################
# 2. Elastic Net combines both L1 (Lasso) and L2 (Ridge) penalty estimators
#############################################################################################################
# Like Lasso, Elastic Net can be used for dimensionality reduction
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Cross-validated elastic net over a log-spaced alpha grid; inputs are
# normalized by the estimator itself (normalize=True, old sklearn API).
elastic = ElasticNetCV(alphas=np.logspace(-10, 10, 10), normalize=True, cv=10)
elastic.fit(xtrain, ytrain)

# Train dataset performance
elastic_train_pred = elastic.predict(xtrain)
elastic_train_r2 = r2_score((ytrain), elastic_train_pred)
elastic_train_error = np.sqrt(mean_squared_error(ytrain, elastic_train_pred))

# Test dataset performance
elastic_test_pred = elastic.predict(xtest)
elastic_test_r2 = r2_score((ytest), elastic_test_pred)
elastic_test_error = np.sqrt(mean_squared_error(ytest, elastic_test_pred))

# Build coefficients table
from pandas import DataFrame
elasticcoeff = DataFrame(data.columns, columns = ['Features'])
elasticcoeff['Coefficients'] = elastic.coef_

# NOTE(review): alpha_ is the overall regularisation strength selected by CV,
# not specifically an "L1 level" as the message below suggests.
print 'ELASTIC NET -------------------------------------------------------------------'
print '\nThe alpha (L1) level selected: {}' .format(elastic.alpha_)
# use same code as before r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) print('R2 score (binary movie neighbors): {:.1%}'.format(r2)) from sklearn.linear_model import ElasticNetCV # NOT IN BOOK reg = ElasticNetCV(alphas=[ 0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]) filled = train.copy() # iterate over all users: for u in range(train.shape[0]): curtrain = np.delete(train, u, axis=0) bu = binary[u] reg.fit(curtrain[:,bu].T, train[u, bu]) filled[u, ~bu] = reg.predict(curtrain[:,~bu].T) predicted = norm.inverse_transform(filled) r2 = metrics.r2_score(test[test > 0], predicted[test > 0]) print('R2 score (user regression): {:.1%}'.format(r2)) # SHOPPING BASKET ANALYSIS # This is the slow version of the code, which will take a long time to # complete. from collections import defaultdict from itertools import chain # File is downloaded as a compressed file import gzip