def lasso_regr(wine_set):
    """Fit a cross-validated LARS lasso for wine quality and report diagnostics.

    The eleven predictor columns of ``wine_set`` are standardised, split
    70/30 into training and test sets (``random_state=123``) and used to fit
    ``LassoLarsCV(cv=10)`` against ``wine_set.quality``.  The function prints
    the coefficients, the selected alpha and the MSE / R-square on both
    splits, and shows two matplotlib figures: the coefficient paths and the
    per-fold cross-validation error.

    Args:
        wine_set: DataFrame providing the predictor columns listed below and
            a ``quality`` column.

    Returns:
        None.
    """
    pred = wine_set[["density", 'alcohol', 'sulphates', 'pH',
                     'volatile_acidity', 'chlorides', 'fixed_acidity',
                     'citric_acid', 'residual_sugar', 'free_sulfur_dioxide',
                     'total_sulfur_dioxide']]
    targets = wine_set.quality

    # standardize predictors to have mean=0 and sd=1
    # (preprocessing.scale works on a copy, so ``pred`` is left untouched)
    predictors = pd.DataFrame(preprocessing.scale(pred))
    predictors.columns = pred.columns

    # split into training and testing sets
    pred_train, pred_test, tar_train, tar_test = train_test_split(
        predictors, targets, test_size=.3, random_state=123)

    # specify the lasso regression model
    model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

    print('Predictors and their regression coefficients:')
    for name, coef in zip(predictors.columns, model.coef_):
        print(name, ':', coef)

    # plot coefficient progression
    # np.log10(0) yields -inf together with a RuntimeWarning when the path
    # reaches alpha == 0; only the warning is silenced, values are unchanged.
    with np.errstate(divide='ignore'):
        m_log_alphas = -np.log10(model.alphas_)
    plt.plot(m_log_alphas, model.coef_path_.T)
    print('\nAlpha:', model.alpha_)
    plt.axvline(-np.log10(model.alpha_), linestyle="dashed", color='k',
                label='alpha CV')
    plt.ylabel("Regression coefficients")
    plt.xlabel("-log(alpha)")
    plt.title('Regression coefficients progression for Lasso paths')
    plt.show()

    # plot mean squared error for each fold
    # ``cv_mse_path_`` was renamed ``mse_path_`` in scikit-learn 0.18 and
    # removed in 0.20; use the new name and fall back for old installs.
    mse_path = getattr(model, 'mse_path_', None)
    if mse_path is None:
        mse_path = model.cv_mse_path_
    with np.errstate(divide='ignore'):
        m_log_alphascv = -np.log10(model.cv_alphas_)
    plt.plot(m_log_alphascv, mse_path, ':')
    plt.plot(m_log_alphascv, mse_path.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.show()

    # Mean squared error from training and test data
    train_error = mean_squared_error(tar_train, model.predict(pred_train))
    test_error = mean_squared_error(tar_test, model.predict(pred_test))
    print('\nMean squared error for training data:', train_error)
    print('Mean squared error for test data:', test_error)

    # R-square from training and test data
    rsquared_train = model.score(pred_train, tar_train)
    rsquared_test = model.score(pred_test, tar_test)
    print('\nR-square for training data:', rsquared_train)
    print('R-square for test data:', rsquared_test)
def lassolarscv():
    """Fit a shuffle-split cross-validated LassoLars and write its predictions.

    Uses the module-level ``base_X`` / ``base_Y`` for fitting, prints the
    score on that same data, and writes the predictions for ``X_test`` to
    ``lassolars.csv`` through ``write_to_file``.
    """
    print ("Doing cross-validated LassoLars")
    # NOTE(review): ``cross_validation.ShuffleSplit(n, n_iter=...)`` is the
    # pre-0.18 scikit-learn signature -- confirm the pinned version.
    splitter = cross_validation.ShuffleSplit(
        len(base_X), n_iter=5, test_size=0.2, random_state=0)
    estimator = LassoLarsCV(cv=splitter)
    estimator.fit(base_X, base_Y)
    fit_score = estimator.score(base_X, base_Y)
    print ("Score = %f" % fit_score)
    predictions = estimator.predict(X_test)
    write_to_file("lassolars.csv", predictions)
class LassoLarsCVImpl:
    """Adapter that forwards ``fit`` and ``predict`` to a wrapped ``Op`` instance."""

    def __init__(self, **hyperparams):
        # Keep the raw keyword arguments and build the wrapped estimator
        # from exactly those arguments.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        ``y`` is forwarded only when it is not ``None``; otherwise the
        wrapped ``fit`` is called with ``X`` alone.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self._wrapped_model.predict(X)
def lassovar(data, lag=1, n_samples=None):
    """Score lagged parents of each variable with a lag-by-lag lasso fit.

    Present values are regressed on past values one lag block at a time
    with ``LassoLarsCV(cv=10)``; every block is fitted on the residual left
    by the previous blocks.  The selected columns are then refitted by
    least squares on the original target, and the whole score matrix is
    multiplied by the determinant of the residual correlation matrix when
    that matrix has full rank.

    Args:
        data: 2-D array indexed as (time, variable).
        lag: number of lags to include.
        n_samples: when given, the stacked rows are subsampled to this many
            rows without replacement.

    Returns:
        Array of shape ``(d, d * lag)``; row ``j`` holds the refitted
        coefficients for target ``j``, column ``d * (l - 1) + i`` belonging
        to variable ``i`` at lag ``l``.
    """
    present = data.T[:, lag:]
    d = present.shape[0]
    past = np.vstack([data.T[:, lag - k:-k] for k in range(1, lag + 1)])
    Y, Z = present.T, past.T
    if n_samples is not None:
        Y, Z = resample(Y, Z, replace=False, n_samples=n_samples)

    scores = np.zeros((d, d * lag))
    ls = LassoLarsCV(cv=10, n_jobs=1)
    residuals = np.zeros(Y.shape)

    # one variable after the other as target
    for j in range(d):
        target = Y[:, j].copy()
        selected = np.zeros(d * lag, dtype=bool)

        # we include one lag after the other
        for block in range(lag):
            cols = slice(d * block, d * (block + 1))
            lagged = Z[:, cols]
            ls.fit(lagged, target)
            # NOTE(review): only strictly positive coefficients count as
            # selected; negative ones are ignored -- confirm this is intended.
            selected[cols] = ls.coef_ > 0
            target -= ls.predict(lagged)

        residuals[:, j] = target

        # refit to get rid of the bias
        ZZ = Z[:, selected]
        gram = np.dot(ZZ.T, ZZ)
        moment = np.dot(ZZ.T, Y[:, j])
        scores[j, selected] = np.linalg.lstsq(gram, moment, rcond=None)[0]

    # the more uncorrelated the residuals the higher the weight
    res = np.corrcoef(residuals.T)
    full_rank = np.linalg.matrix_rank(res) == res.shape[0]
    weight = np.linalg.det(res) if full_rank else 1
    return scores * weight
def lassovar(data, maxlags=1, n_samples=None, cv=5):
    """Score lagged parents of each variable with a lag-by-lag lasso fit.

    Present values are regressed on past values one lag block at a time
    with ``LassoLarsCV``; every block is fitted on the residual left by the
    previous blocks.  The selected columns are then refitted by least
    squares on the original target to remove the shrinkage bias.

    Args:
        data: 2-D array indexed as (time, variable).
        maxlags: number of lags to include.
        n_samples: when given, the stacked rows are passed through
            ``resample`` (with its default settings) to this many rows.
        cv: cross-validation setting forwarded to ``LassoLarsCV``.

    Returns:
        Array of shape ``(d, d * maxlags)``; row ``j`` holds the refitted
        coefficients for target ``j``, column ``d * (l - 1) + i`` belonging
        to variable ``i`` at lag ``l``.
    """
    # Stack data to perform regression of present on past values
    present = data.T[:, maxlags:]
    d = present.shape[0]
    past = np.vstack(
        [data.T[:, maxlags - k:-k] for k in range(1, maxlags + 1)])
    Y, Z = present.T, past.T

    # Subsample data
    if n_samples is not None:
        Y, Z = resample(Y, Z, n_samples=n_samples)

    scores = np.zeros((d, d * maxlags))
    ls = LassoLarsCV(cv=cv, n_jobs=1)
    residuals = np.zeros(Y.shape)

    # Consider one variable after the other as target
    for j in range(d):
        target = Y[:, j].copy()
        selected = np.zeros(d * maxlags, dtype=bool)

        # Include one lag after the other
        for block in range(maxlags):
            cols = slice(d * block, d * (block + 1))
            lagged = Z[:, cols]
            ls.fit(lagged, target)
            # NOTE(review): only strictly positive coefficients count as
            # selected; negative ones are ignored -- confirm this is intended.
            selected[cols] = ls.coef_ > 0
            target -= ls.predict(lagged)

        residuals[:, j] = target

        # Refit OLS using the selected variables to get rid of the bias
        ZZ = Z[:, selected]
        gram = np.dot(ZZ.T, ZZ)
        moment = np.dot(ZZ.T, Y[:, j])
        scores[j, selected] = np.linalg.lstsq(gram, moment, rcond=None)[0]

    return scores
[0.067154, 3.190612], [0.925577, 4.631504], [0.717733, 4.295890], [0.015371, 3.085028], [0.335070, 3.448080], [0.040486, 3.167440], [0.212575, 3.364266], [0.617218, 3.993482], [0.541196, 3.891471]] #生成X和y矩阵 dataMat = np.array(data) X = dataMat[:, 0:1] # 变量x y = dataMat[:, 1] #变量y # ========Lasso回归======== # model = Lasso(alpha=0.01) # 调节alpha可以实现对拟合的程度 # model = LassoCV() # LassoCV自动调节alpha可以实现选择最佳的alpha。 model = LassoLarsCV() # LassoLarsCV自动调节alpha可以实现选择最佳的alpha model.fit(X, y) # 线性回归建模 print('系数矩阵:\n', model.coef_) print('线性回归模型:\n', model) # print('最佳的alpha:',model.alpha_) # 只有在使用LassoCV、LassoLarsCV时才有效 # 使用模型预测 predicted = model.predict(X) # 绘制散点图 参数:x横轴 y纵轴 plt.scatter(X, y, marker='x') plt.plot(X, predicted, c='r') # 绘制x轴和y轴坐标 plt.xlabel("x") plt.ylabel("y") # 显示图形 plt.show()
plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: coordinate descent ' '(train time: %.2fs)' % t_lasso_cv) plt.axis('tight') plt.ylim(ymin, ymax) # LassoLarsCV: least angle regression from sklearn.linear_model import LassoLarsCV # Compute paths print("Computing regularization path using the Lars lasso...") LassoLarsCV_fit = LassoLarsCV(cv=20).fit(X, y) LassoLarsCV_pred = LassoLarsCV_fit.predict(X_test) R2_LassoLarsCV = metrics.r2_score(LassoLarsCV_pred, y_test) # 0.776 du coup un peu meilleur # Display results m_log_alphas = -np.log10(model.cv_alphas_ + EPSILON) plt.figure() plt.plot(m_log_alphas, model.mse_path_, ':') plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_), linestyle='--',
from sklearn.linear_model import LassoLarsCV
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Synthetic single-feature regression problem, split 67/33 into train/test.
X, y = make_regression(n_features=1, noise=4.0, random_state=0)
y = y.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=100)

# The estimator expects a 1-D target; ravel() avoids the
# DataConversionWarning the column vector would trigger inside fit().
reg = LassoLarsCV(cv=5).fit(X_train, y_train.ravel())
print(reg.score(X_train, y_train))
print(reg.score(X_test, y_test))
print(reg.alpha_)

y_pred = reg.predict(X)
print(X_train.shape, y_train.shape)

plt.scatter(X_train, y_train, label='train')
plt.scatter(X_test, y_test, label='test')
plt.plot(X, y_pred)
# Without a legend the 'train' / 'test' labels above are never displayed.
plt.legend()
plt.show()
class LassoPredictor(Persistent):
    '''
    Persistent wrapper around a single LassoLarsCV regressor.
    '''

    @contract(hypers='dict')
    def __init__(self, hypers):
        # Only the hyper-parameters accepted by extract_model_hypers are
        # forwarded to the estimator.
        modelHypers = self.extract_model_hypers(hypers)
        self.model = LassoLarsCV(**modelHypers)

    @timing
    def fit(self, df, features, targetCol, validationSplit=0.2):
        '''
        Fits the model on the data extracted from df by df2xy.

        validationSplit is accepted but not used by this method.

        :returns: True when the model was fitted; False when fewer than
                  three training rows are available or the estimator
                  raised ValueError (the traceback is printed).
        '''
        print("Running fit function:")
        print(df)
        XTrain, yTrain = df2xy(df, features, targetCol)
        if XTrain.shape[0] < 3:
            print("not enough data to form a model!")
            return False
        try:
            self.model.fit(XTrain, yTrain)
            #try:
            #Parallel(n_jobs=2, verbose=10, batch_size=20)(delayed(self.fit_helper)(date) for date in self.dates)
        except ValueError:
            traceback.print_exc()
            return False
        return True

    def predict(self, df, features, targetCol):
        '''
        Predicts the target for the rows of df.

        :returns: The model's predictions, or None when the estimator
                  raised ValueError (the traceback is printed).
        '''
        XPred, _ = df2xy(df, features, targetCol)
        try:
            yPred = self.model.predict(XPred)
        except ValueError:
            traceback.print_exc()
            return None
        #df['pred' + targetCol] = yPred
        return yPred

    #def score (self, userXTest):
    #    # *** Needs reworking!
    #    '''
    #    :returns: Score calculated by taking the last yTrain (all data)
    #        and comparing to predicted result.
    #    '''
    #    if self.modelScore is None:
    #        lastDate = self.dates[-1]
    #        actualY = self.yTrains[lastDate]
    #        #preddf = self.predict(userXTest)
    #        preddf = loads(preddf, preserve_order=True)
    #        preddf = pd.DataFrame(preddf['arr'], columns = [self.targetCol])
    #        predY = preddf[self.targetCol]
    #        predY = predY.shift(-self.batchSize)
    #        predY = predY.iloc[:-self.batchSize]
    #        score = metrics.r2_score(actualY, predY)
    #        self.modelScore = score
    #    else:
    #        score = self.modelScore
    #    return score

    def lc(self):
        '''
        Makes learning curve for a player

        The result is cached in self.lcScores as the dumps() of
        (trainSizes, trainScores, testScores).
        '''
        # NOTE(review): lcScores, dates, XTrains and yTrains are not assigned
        # anywhere in this class as shown -- presumably set elsewhere; confirm.
        if self.lcScores is None:
            self.lcModel = LassoLarsCV()
            lastDate = self.dates[-1]
            X = self.XTrains[lastDate]
            y = self.yTrains[lastDate]

            # Trim to a whole number of 7-row groups, then grow the training
            # window by 7 rows per split.
            N = len(X)
            chopOff = N - (N % 7)
            X = X.iloc[:chopOff]
            y = y.iloc[:chopOff]
            idxs = np.arange(chopOff)
            cvSplits = [(idxs[:i], idxs[i:]) for i in range(7, chopOff, 7)]

            # DataFrame.as_matrix() was removed in pandas 1.0; .values
            # returns the same ndarray on every pandas version.
            trainSizes, trainScores, testScores = \
                learning_curve(estimator=self.lcModel,
                               X=X.values,
                               y=np.array(y),
                               cv=cvSplits,
                               train_sizes=[7],
                               n_jobs=2,
                               )
            # Report the actual training-window length of each split.
            trainSizes = [len(t[0]) for t in cvSplits]
            self.lcScores = dumps((trainSizes, trainScores, testScores))
        return self.lcScores

    def get_params(self):
        '''
        Returns the ordered parameters of the first entry in self.models.
        '''
        # NOTE(review): self.models is not assigned in this class as shown,
        # and ``params`` stays unbound when it is empty -- confirm the source.
        for i, model in self.models.items():
            params = order_dict(model.get_params())
            break
        return params

    def extract_model_hypers(self, hypers):
        '''
        Extracts the parameters that are relevant to the model and are not
        other meta params
        '''
        params = ['verbose']
        modelHypers = {}
        for param in params:
            paramVal = hypers.get(param)
            if paramVal is not None:
                modelHypers[param] = paramVal
        modelHypers = order_dict(modelHypers)
        return modelHypers
y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred))
y_test_score = rd.score(x_test, y_test)
print('训练集RMSE: {0}, R方: {1}'.format(y_train_rmse, y_train_score))
print('测试集RMSE: {0}, R方: {1}'.format(y_test_rmse, y_test_score))

'''========9.Lasso回归========'''
import numpy as np
import matplotlib.pyplot as plt  # plotting
# Lasso regression; LassoCV selects alpha by cross-validation; LassoLarsCV
# selects alpha by cross-validation based on least-angle regression.
from sklearn.linear_model import Lasso, LassoCV, LassoLarsCV

# Alternatives kept for reference:
#model = Lasso(alpha=0.01)  # alpha is tuned by hand to control the fit
# model = LassoCV()  # LassoCV picks the best alpha automatically
model = LassoLarsCV()  # LassoLarsCV picks the best alpha automatically
model.fit(x_train, y_train)  # fit the linear model
print('系数矩阵:\n', model.coef_, model.intercept_)
print('线性回归模型:\n', model)
# alpha_ only exists on the cross-validated estimators (LassoCV, LassoLarsCV)
print('最佳的alpha:', model.alpha_)

# Predict the training data and the test data separately
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Root-mean-squared error of each split ...
y_train_rmse = sqrt(metrics.mean_squared_error(y_train, y_train_pred))
y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred))
# ... and goodness of fit (R-square) of each split
y_train_score = model.score(x_train, y_train)
y_test_score = model.score(x_test, y_test)

print('训练集RMSE: {0}, R方: {1}'.format(y_train_rmse, y_train_score))
print('测试集RMSE: {0}, R方: {1}'.format(y_test_rmse, y_test_score))
# ``cv_mse_path_`` was renamed ``mse_path_`` in scikit-learn 0.18 and removed
# in 0.20; use the new name and fall back for old installs.
mse_path = getattr(model, 'mse_path_', None)
if mse_path is None:
    mse_path = model.cv_mse_path_
plt.plot(m_log_alphascv, mse_path.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')

# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(training_target, model.predict(training_data))
test_error = mean_squared_error(test_target, model.predict(test_data))
print('training data MSE')
print(train_error)
print('test data MSE')
print(test_error)

# R-square from training and test data
rsquared_train = model.score(training_data, training_target)
rsquared_test = model.score(test_data, test_target)
print('training data R-square')
print(rsquared_train)
print('test data R-square')
print(rsquared_test)
# One feature row per current player-year, ordered like proj_systems.
for pyear, system_projs in pt_projs_curr.items():
    ivars2.append([system_projs[system] for system in proj_systems])

x = numpy.array(ivars)
x2 = numpy.array(ivars2)
y = numpy.array(depvars)

# Cross-validated lasso for the rough playing-time model.
model_pt = LassoLarsCV(cv=cv_num)
model_pt.fit(x, y)

print("Rough PT model, to choose sample")
for system, coef in zip(proj_systems, model_pt.coef_):
    print("%40s : %f" % (system, coef))
print("%40s : %f" % ('intercept', model_pt.intercept_))

# Predictions for the fitted sample and for the current projections,
# keyed by their player-year identifiers.
sample_proj_pt_arr = model_pt.predict(x)
curr_proj_pt_arr = model_pt.predict(x2)
sample_proj_pt = dict(zip(player_years, sample_proj_pt_arr))
curr_proj_pt = dict(zip(pt_projs_curr, curr_proj_pt_arr))

# Reset the containers for the steps that follow.
models = {}
final_projs = {}
ivars = {}
depvars = {}
ptvars = {}
player_lists = {}
model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV') plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean squared error') plt.title('Mean squared error on each fold') # MSE from training and test data from sklearn.metrics import mean_squared_error train_error = mean_squared_error(tar_train, model.predict(pred_train)) test_error = mean_squared_error(tar_test, model.predict(pred_test)) print('training data MSE') print(train_error) print('test data MSE') print(test_error) # R-square from training and test data rsquared_train = model.score(pred_train, tar_train) rsquared_test = model.score(pred_test, tar_test) print('training data R-square') print(rsquared_train) print('test data R-square') print(rsquared_test) #-------------------------------------------------------------------------------
# One feature row per current player-year, ordered like proj_systems.
for pyear, system_projs in pt_projs_curr.items():
    ivars2.append([system_projs[system] for system in proj_systems])

x = numpy.array(ivars)
x2 = numpy.array(ivars2)
y = numpy.array(depvars)

# Cross-validated lasso for the rough playing-time model, fitted without
# an intercept term.
model_pt = LassoLarsCV(cv=cv_num, fit_intercept=False)
model_pt.fit(x, y)

print("Rough PT model, to choose sample")
for system, coef in zip(proj_systems, model_pt.coef_):
    print("%40s : %f" % (system, coef))
print("%40s : %f" % ('intercept', model_pt.intercept_))

# Predictions for the fitted sample and for the current projections,
# keyed by their player-year identifiers.
sample_proj_pt_arr = model_pt.predict(x)
curr_proj_pt_arr = model_pt.predict(x2)
sample_proj_pt = dict(zip(player_years, sample_proj_pt_arr))
curr_proj_pt = dict(zip(pt_projs_curr, curr_proj_pt_arr))

# Reset the containers for the steps that follow.
models = {}
final_projs = {}
ivars = {}
depvars = {}
ptvars = {}
player_lists = {}
color='k', label='alpha CV') plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean squared error') plt.title('Mean squared error on each fold') #plt.savefig('Fig02') #print(pred_train.head()) #plt.show() print(model.alpha_) print(model.coef_) print(model.intercept_) print(pred_train.head()) np.unique(model.predict(pred_test)) model ######################################################################################## ########################### Part 1 RESPONSE ############################## ####################################################################################### #(1) #a. TABLE: #AGE, SYS, HRA, RACE_1, RACE_2, RACE_3, TYP_1, CPR_1S #0.01946696, -0.01645696, -0.00596813, -0.2566194, -0.23701148, -0.04399663, 1.12856158, 0.87772558 #b. Viewing the coefficient of CPR, we have 2.41 odds? or about 71% chance of survival TODO: is this right #c. Optimal alpha value from the Lasso section: 0.0013716207531124826 #d. #The coefficients are:
def lasso_single_prediction(city, state, lookback, horizon, predictors):
    """Fit one LassoLarsCV per forecast step for ``city`` using its cluster data.

    For every step ``d`` in ``1..horizon`` a model is trained on the first
    70% of the lagged feature rows (no shuffling) against the target series
    shifted ``d - 1`` rows ahead.  Metrics on the remaining rows are pickled
    under ``saved_models/lasso/<state>/`` and all predictions are handed to
    ``plot_prediction``.

    Args:
        city: geocode of the target city.
        state: state identifier used in the cluster and output file names.
        lookback: number of lags passed to ``build_lagged_features``.
        horizon: number of forecast steps.
        predictors: columns requested from ``get_cluster_data``.

    Returns:
        None.
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    data, group = get_cluster_data(geocode=city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=predictors)

    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    # casos_columns = ['casos_{}'.format(i) for i in group]
    # data = data_full.drop(casos_columns, axis=1)

    data_lag = build_lagged_features(data, lookback)
    # dropna() returns a new frame; the result was previously thrown away.
    data_lag = data_lag.dropna()

    # targets[d] is the target series moved d - 1 rows ahead, without the
    # trailing rows the shift leaves empty.
    targets = {}
    for d in range(1, horizon + 1):
        shifted = data_lag[target].shift(-(d - 1))
        targets[d] = shifted if d == 1 else shifted[:-(d - 1)]

    X_data = data_lag.drop(casos_est_columns, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, data_lag[target],
        train_size=0.7, test_size=0.3, shuffle=False)

    if sum(y_train) == 0:
        print('aaaah', city)
        return None

    city_name = get_city_names([city, 0])[0][1]
    # NaN-filled so that steps skipped by the ``break`` below do not leave
    # uninitialised memory in the plotted array.
    preds = np.full((len(data_lag), horizon), np.nan)
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error',
                                  'r2_score'))

    for d in range(1, horizon + 1):
        model = LassoLarsCV(max_iter=5, n_jobs=-1, normalize=False)

        tgt = targets[d][:len(X_train)]
        tgtt = targets[d][len(X_train):]
        try:
            model.fit(X_train, tgt)
        except ValueError:
            print('-----------------------------------------------------')
            print(city, 'ERRO')
            print('-----------------------------------------------------')
            break
        else:
            print(city, 'done')

        pred = model.predict(X_data[:len(targets[d])])
        # Pad with NaN up to the full series length so the column fits.
        dif = len(data_lag) - len(pred)
        if dif > 0:
            pred = list(pred) + ([np.nan] * dif)
        preds[:, (d - 1)] = pred

        pred_m = model.predict(X_test[:(len(tgtt))])
        metrics[d] = calculate_metrics(pred_m, tgtt)

    metrics.to_pickle('{}/{}/lasso_metrics_{}.pkl'.format(
        'saved_models/lasso', state, city))
    plot_prediction(preds, targets[1], city_name, len(X_train))
    return None
def lasso_single_state_prediction(state, lookback, horizon, predictors):
    """Fit per-step LassoLarsCV models for every city, without cluster series.

    Cities whose metrics file already exists are skipped.  For each
    remaining city and every step ``d`` in ``1..horizon`` a model is trained
    on the first 70% of the lagged feature rows (no shuffling) against
    ``casos_est`` shifted ``d - 1`` rows ahead; metrics are pickled under
    ``saved_models/lasso_no_cluster/<state>/`` and the predictions are
    handed to ``plot_prediction``.

    Args:
        state: state identifier used in the metrics file paths.
        lookback: number of lags passed to ``build_lagged_features``.
        horizon: number of forecast steps.
        predictors: columns kept from ``combined_data``; must include
            ``casos`` and ``casos_est``.

    Returns:
        None.
    """
    ##LASSO WITHOUT CLUSTER SERIES
    # NOTE(review): the city list is always taken from 'Ceará', whatever
    # ``state`` is -- confirm this is intended.
    cities = list(get_cities_from_state('Ceará'))

    for city in cities:
        if os.path.isfile(
                '/home/elisa/Documentos/InfoDenguePredict/infodenguepredict/models/saved_models/lasso_no_cluster/{}/lasso_metrics_{}.pkl'
                .format(state, city)):
            print(city, 'done')
            continue

        data = combined_data(city, DATA_TYPES)
        # Build a new frame instead of dropping in place on a column
        # selection of ``data``.
        data = data[predictors].drop('casos', axis=1)

        target = 'casos_est'
        data_lag = build_lagged_features(data, lookback)
        # dropna() returns a new frame; the result was previously thrown away.
        data_lag = data_lag.dropna()

        # targets[d] is the target series moved d - 1 rows ahead, without
        # the trailing rows the shift leaves empty.
        targets = {}
        for d in range(1, horizon + 1):
            shifted = data_lag[target].shift(-(d - 1))
            targets[d] = shifted if d == 1 else shifted[:-(d - 1)]

        X_data = data_lag.drop(target, axis=1)
        X_train, X_test, y_train, y_test = train_test_split(
            X_data, data_lag[target],
            train_size=0.7, test_size=0.3, shuffle=False)

        city_name = get_city_names([city, 0])[0][1]
        # NaN-filled so that steps skipped by the ``break`` below do not
        # leave uninitialised memory in the plotted array.
        preds = np.full((len(data_lag), horizon), np.nan)
        metrics = pd.DataFrame(index=('mean_absolute_error',
                                      'explained_variance_score',
                                      'mean_squared_error',
                                      'mean_squared_log_error',
                                      'median_absolute_error',
                                      'r2_score'))

        for d in range(1, horizon + 1):
            model = LassoLarsCV(max_iter=15, n_jobs=-1, normalize=False)

            tgt = targets[d][:len(X_train)]
            tgtt = targets[d][len(X_train):]
            try:
                model.fit(X_train, tgt)
            except ValueError:
                print('-----------------------------------------------------')
                print(city, 'ERRO')
                print('-----------------------------------------------------')
                break

            pred = model.predict(X_data[:len(targets[d])])
            # Pad with NaN up to the full series length so the column fits.
            dif = len(data_lag) - len(pred)
            if dif > 0:
                pred = list(pred) + ([np.nan] * dif)
            preds[:, (d - 1)] = pred

            pred_m = model.predict(X_test[:(len(tgtt))])
            metrics[d] = calculate_metrics(pred_m, tgtt)

        metrics.to_pickle('{}/{}/lasso_metrics_{}.pkl'.format(
            'saved_models/lasso_no_cluster', state, city))
        plot_prediction(preds, targets[1], city_name, len(X_train),
                        path='lasso_no_cluster')
        # plt.show()
    return None
# plot mean square error for each fold
# ``cv_mse_path_`` was renamed ``mse_path_`` in scikit-learn 0.18 and removed
# in 0.20; use the new name and fall back for old installs.
mse_path = getattr(model, 'mse_path_', None)
if mse_path is None:
    mse_path = model.cv_mse_path_
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, mse_path, ':')
plt.plot(m_log_alphascv, mse_path.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')

# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)

# R-square from training and test data
rsquared_train = model.score(pred_train, tar_train)
rsquared_test = model.score(pred_test, tar_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)

# Every column except 'target' is a feature.
features = tpot_data.drop('target', axis=1).values
(training_features,
 testing_features,
 training_target,
 testing_target) = train_test_split(
    features, tpot_data['target'].values, random_state=55)

# Average CV score on the training set was:-832843188.6270168
exported_pipeline = LassoLarsCV(normalize=True)
exported_pipeline.fit(training_features, training_target)

# Predictions for the held-out rows.
results = exported_pipeline.predict(testing_features)
class determine_attribute_quality(object):
    """Lasso-based analysis of which wine attributes relate to quality."""

    def __init__(self, red, white):
        # Stored as given; not read by the other methods of this class.
        self.red = red
        self.white = white

    def remove_column_spaces(self, wine_data):
        """Strip column names and replace inner spaces with underscores.

        The frame is modified in place and also returned.
        """
        wine_data.columns = [x.strip().replace(' ', '_')
                             for x in wine_data.columns]
        return wine_data

    def regression(self, wine_data):
        """Fit a 10-fold LassoLarsCV for ``quality`` and report diagnostics.

        The predictor columns are standardised and split 80/20
        (``random_state=123``).  The method prints the coefficients, the
        selected alpha and the MSE / R-square on both splits, and shows the
        coefficient-path and per-fold CV error plots.  Intermediate results
        are kept on the instance (``predictors``, ``model``,
        ``train_error``, ...).

        Args:
            wine_data: DataFrame with the predictor columns listed below
                and a ``quality`` column.
        """
        self.pred = wine_data[['density', 'alcohol', 'sulphates', 'pH',
                               'volatile_acidity', 'chlorides',
                               'fixed_acidity', 'citric_acid',
                               'residual_sugar', 'free_sulfur_dioxide',
                               'total_sulfur_dioxide']]
        self.targets = wine_data.quality

        # Normalization (preprocessing.scale works on a copy of its input)
        self.predictors = pd.DataFrame(preprocessing.scale(self.pred))
        self.predictors.columns = self.pred.columns

        # Split into Training and Testing sets
        (self.pred_train, self.pred_test,
         self.target_train, self.target_test) = train_test_split(
            self.predictors, self.targets, test_size=.2, random_state=123)

        # Lasso Regression Model
        self.model = LassoLarsCV(cv=10, precompute=False).fit(
            self.pred_train, self.target_train)

        print('Predictors and their Regression coefficients:')
        for name, coef in zip(self.predictors.columns, self.model.coef_):
            print(name, ':', coef)

        # Plot Coefficient Progression
        # np.log10(0) yields -inf together with a RuntimeWarning when the
        # path reaches alpha == 0; only the warning is silenced.
        with np.errstate(divide='ignore'):
            m_log_alphas = -np.log10(self.model.alphas_)
        plt.plot(m_log_alphas, self.model.coef_path_.T)
        print('\nAlpha:', self.model.alpha_)
        plt.axvline(-np.log10(self.model.alpha_), linestyle="dashed",
                    color='k', label='alpha CV')
        plt.ylabel("Regression coefficients")
        plt.xlabel("-log(alpha)")
        plt.title('Regression coefficients progression for Lasso paths')
        plt.show()

        # Plot MSE for each fold
        # ``cv_mse_path_`` was renamed ``mse_path_`` in scikit-learn 0.18
        # and removed in 0.20; use the new name, fall back for old installs.
        mse_path = getattr(self.model, 'mse_path_', None)
        if mse_path is None:
            mse_path = self.model.cv_mse_path_
        with np.errstate(divide='ignore'):
            m_log_alphascv = -np.log10(self.model.cv_alphas_)
        plt.plot(m_log_alphascv, mse_path, ':')
        plt.plot(m_log_alphascv, mse_path.mean(axis=-1), 'k',
                 label='Average across the folds', linewidth=2)
        plt.legend()
        plt.xlabel('-log(alpha)')
        plt.ylabel('Mean Squared Error')
        plt.title('Mean Squared Error on Each Fold')
        plt.show()

        # Mean Squared Error from Training and Test data
        self.train_error = mean_squared_error(
            self.target_train, self.model.predict(self.pred_train))
        self.test_error = mean_squared_error(
            self.target_test, self.model.predict(self.pred_test))
        print('\nMean squared error for training data:', self.train_error)
        print('Mean squared error for test data:', self.test_error)

        # R-square from Training and Test data
        self.rsquared_train = self.model.score(self.pred_train,
                                               self.target_train)
        self.rsquared_test = self.model.score(self.pred_test,
                                              self.target_test)
        print('\nR-square for training data:', self.rsquared_train)
        print('R-square for test data:', self.rsquared_test)
plt.title('Regression Coefficients Progression for Lasso Paths')

# plot mean square error for each fold
# ``cv_mse_path_`` was renamed ``mse_path_`` in scikit-learn 0.18 and removed
# in 0.20; use the new name and fall back for old installs.
mse_path = getattr(model, 'mse_path_', None)
if mse_path is None:
    mse_path = model.cv_mse_path_
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, mse_path, ':')
plt.plot(m_log_alphascv, mse_path.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')

# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(training_target, model.predict(training_data))
test_error = mean_squared_error(test_target, model.predict(test_data))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)

# R-square from training and test data
rsquared_train = model.score(training_data, training_target)
rsquared_test = model.score(test_data, test_target)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
) errors = pd.Series() if run_type == 'train': target_validate = target_train.copy() baseline_target_validate = baseline_target_train.copy() if run_type == 'test': target_validate = target_test.copy() baseline_target_validate = baseline_target_test.copy() for key in metrics.keys(): # Transfer Error X = np.asarray(target_validate['rep'].values.tolist()) yhat = model_trained_on_S.predict(X) np.save( f"./predictions/transfer__{d}__{s}__{rep}__{run_type}__predictions_S_T.npy", yhat) errors.loc[f'{key}_S_T'] = metrics[key]( target_validate['target'], yhat) # In-domain Error X = np.asarray(target_validate['rep'].values.tolist()) yhat = model_trained_on_T.predict(X) np.save( f"./predictions/transfer__{d}__{s}__{rep}__{run_type}__predictions_T_T.npy", yhat) errors.loc[f'{key}_T_T'] = metrics[key]( target_validate['target'], yhat)
g.set_ylabel("Features", fontsize=12) g.tick_params(labelsize=9) g.set_title(name + " regression coefs") nregressors += 1 plt.tight_layout() plt.show() plt.gcf().clear() # Here, the features coefficients (show only top 40 features). It appears that GrLivArea has an important weight in the 4 models. # # According to the RMSE scores, i choosed the Lassocv, LassolarsCV and the ElasticNetCV models. # In[ ]: Y_pred_lassocv = np.expm1(lassocv.predict(test)) Y_pred_lassolarscv = np.expm1(lassolarscv.predict(test)) Y_pred_elasticnetcv = np.expm1(elasticnetcv.predict(test)) # Don't forget to transform the log1p(SalePrice) to their real values using expm1. # ### 6.2 Tree based modeling # #### 6.2.1 Cross validate models # Next i wanted to combine the linear models to tree based models. I'hve tested the random forest it shows bad performances (~0.14 with hyperparameters tunning). # # I decided to focus on, the kaggle "darling" algorithm :p XGBoost, the LightGBM and the Gradient Boosting algorithm. # # Thanks to the excellent @Serigne kernel, i get near-optimal parameters for these 3 algorithms. # # This spare us a lot of hyperparameters tunning :D!
test_data = load_data.load_supervised(1986, 1999, args.lat, args.lon, 50, which='test') lasso_file = os.path.join(os.path.dirname(__file__), "models/lasso_%2.2f_%2.2f.pkl" % (args.lat, args.lon)) if os.path.exists(lasso_file): print "Reading PCA from file" L = pickle.load(open(lasso_file, 'r')) else: print "Fitting Lasso" L = LassoLarsCV(cv=5) L.fit(train_data.X, train_data.y[:,0]) pickle.dump(L, open(lasso_file, 'w')) ## Print Fit stats print "Alpha", L.alpha_ print "Training Pearson Corr:", pearsonr(train_data.y[:,0], L.predict(train_data.X)) print "Training Spearman Corr:", spearmanr(train_data.y[:,0], L.predict(train_data.X)) yhat = L.predict(test_data.X) print "Pearson Corr", pearsonr(test_data.y[:,0], yhat) print "Spearman Corr", spearmanr(test_data.y[:,0], yhat) print "SSE", sum((yhat - test_data.y[:,0])**2) ## Compute monthly data import datetime import pandas t0 = datetime.date(1986, 1, 1) t1 = datetime.date(1999, 12, 31)
# Create the pipeline for the model est = LassoLarsCV() #fit model # pdb.set_trace() t0 = time.time() est.fit(X[train],y[train]) #get fit time runtime = time.time()-t0 # print("training done") # pdb.set_trace() # predict on test set y_true = y[test] y_pred = est.predict(X[test]) if problem in scale_these: test_mse = mean_squared_error(sc_y.inverse_transform(y_true), sc_y.inverse_transform(y_pred)) test_r2 = r2_score(sc_y.inverse_transform(y_true), sc_y.inverse_transform(y_pred)) else: test_mse = mean_squared_error(y_true,y_pred) test_r2 = r2_score(y_true,y_pred) # print results out_text = '\t'.join([dataset.split('/')[-1][2:-4], 'lasso', str(i), str(test_mse),
def main():
    u"""Main function for assignment 03.

    Loads the prepared data set, standardises every column, fits a 10-fold
    ``LassoLarsCV`` predicting ``price`` from all columns whose name does
    not contain ``price``, saves the coefficient-path plot to
    ``result00.png`` and the per-fold CV error plot to ``result01.png``,
    and prints MSE and R-square for a 70/30 train/test split.

    Returns:
        dict: ``{'model': <fitted LassoLarsCV>, 'dataframe': <scaled df>}``.

    Raises:
        ValueError: if the data contains nulls, or if a column mean is not
            close to zero after scaling.
    """
    # Load prepared data.
    df = return_proc_and_transf_data_set()
    # Mass is already included as mass in SI units.
    df.drop(['carat'], inplace=True, axis=1)
    # Those are dummy variables not needed in our data set anymore.
    df.drop(['price_expensive', 'price_expensive_binary'],
            inplace=True, axis=1)

    # A bit of error checking.
    if df.isnull().sum().sum() != 0:
        raise ValueError('Your data has unintended nulls.')

    # Cast our dataframe into float type.
    df = df.astype('float64')

    # Scale our dataframe to avoid the sparsity control of our dataframe
    # biased against some variables.
    print('Prior to scaling:')
    print(df.describe())
    df = df.apply(preprocessing.scale)
    print('After scaling:')
    print(df.describe())
    print_separator()
    if (df.mean().abs() > 1e-3).sum() > 0:
        raise ValueError('Scaling of your dataframe went wrong.')

    # Split into training and testing sets
    # The predictors should not include any price variable since this was
    # used to create the output variable
    predictors = [x for x in df.columns.tolist() if 'price' not in x]
    print('Input variables:')
    pprint(predictors, indent=4)
    input_variables = df[predictors].copy()
    output_variable = df.price.copy()  # Categorized price
    print_separator()

    input_training, input_test, output_training, output_test = \
        train_test_split(input_variables, output_variable,
                         test_size=0.3, random_state=0)

    # A few words about the LassoLarsCV:
    #   LASSO: least absolute shrinkage and selection operator (discussed
    #       in the course material.
    #   LARS: least angle regression: algorithm for linear regression
    #       models to high-dimensional data (aka 'a lot of categories').
    #       Compared to simple LASSO this model uses the LARS algorithm
    #       instead of the 'vanilla' 'coordinate_descent' of simple LASSO.
    #   CV: cross validation: this sets the alpha parameter (referred to as
    #       lambda parameter in the course video) by cross validation.
    #       In the simple LARS this alpha (the penalty factor) is an input
    #       of the function.
    #       'The alpha parameter controls the degree of sparsity of the
    #       coefficients estimated.
    #       If alpha = zero then the method is the same as OLS.
    model = LassoLarsCV(
        cv=10,  # Number of folds.
        precompute=False,  # Do not precompute Gram matrix.
        # precompute=True,  # Precompute Gram matrix.
        # verbose=3,
    ).fit(input_training, output_training)

    dict_var_lin_coefs = dict(zip(predictors, model.coef_))
    print('Result of linear model:')
    pprint(sorted(dict_var_lin_coefs.items(), key=lambda x: abs(x[1])))
    print_separator()

    # Plot coefficient progression.
    # TODO: plot those on 4 different subplots.
    # np.log10(0) yields -inf together with a RuntimeWarning when the path
    # reaches alpha == 0; only the warning is silenced.
    with np.errstate(divide='ignore'):
        model_log_alphas = -np.log10(model.alphas_)
    plt.plot(model_log_alphas, model.coef_path_.T)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.ylabel('Regression Coefficients')
    plt.xlabel('-log(alpha)')
    plt.title('Regression Coefficients Progression for Lasso Paths')
    plt.legend(predictors, loc='best',)
    plt.tight_layout()
    plt.savefig('result00.png', dpi=600)
    plt.close()

    # TODO: why are the coefficients in the result very different than the
    # coefficient path?
    #
    # There seems to be a scaling of the coefficient paths with an arbitrary
    # almost the same constant (194 in this case)
    #
    # print('Resulting alpha is not different than path alpha (difference):')
    # difference = model.alpha_ - model.alphas_
    # pprint(model.alpha_ - model.alphas_)
    # print('Resulting coefficients are very different than path coefficients (difference):')
    # pprint(model.coef_ - model.coef_path_.T)
    # print_separator()

    # Plot mean square error for each fold.
    # A zero alpha is mapped to +inf so that -log10 gives -inf without a
    # divide-by-zero warning.  This is done on a local copy: the fitted
    # model is returned to the caller and keeps its original ``cv_alphas_``.
    cv_alphas = np.asarray(model.cv_alphas_, dtype=float)
    cv_alphas = np.where(cv_alphas == 0, np.inf, cv_alphas)
    model_log_alphas = -np.log10(cv_alphas)
    # ``cv_mse_path_`` was renamed ``mse_path_`` in scikit-learn 0.18 and
    # removed in 0.20; use the new name and fall back for old installs.
    mse_path = getattr(model, 'mse_path_', None)
    if mse_path is None:
        mse_path = model.cv_mse_path_
    plt.figure()
    plt.plot(model_log_alphas, mse_path, ':')
    plt.plot(model_log_alphas, mse_path.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.legend()
    plt.tight_layout()
    plt.savefig('result01.png', dpi=600)
    plt.close()

    # Mean squared error of our model.
    train_error = mean_squared_error(output_training,
                                     model.predict(input_training))
    test_error = mean_squared_error(output_test,
                                    model.predict(input_test))
    print ('Training data MSE')
    print(train_error)
    print ('Test data MSE')
    print(test_error)
    print_separator()

    # R-square from training and test data.
    rsquared_train = model.score(input_training, output_training)
    rsquared_test = model.score(input_test, output_test)
    print ('Training data R-square')
    print(rsquared_train)
    print ('Test data R-square')
    print(rsquared_test)
    print_separator()

    return {'model': model, 'dataframe': df}
def lasso(X, y, value):
    """Fit a cross-validated LassoLars model and predict for ``value``.

    A ``LassoLarsCV`` estimator is built with ``cv=10`` and
    ``precompute=False``, fitted on ``(X, y)``, and then asked to predict
    for ``value``.

    Args:
        X: Training inputs passed to ``LassoLarsCV.fit``.
        y: Training targets passed to ``LassoLarsCV.fit``.
        value: Inputs passed to ``LassoLarsCV.predict``.

    Returns:
        Whatever ``predict`` returns for ``value``.
    """
    model = LassoLarsCV(cv=10, precompute=False)
    # scikit-learn estimators return themselves from fit(), so the two
    # calls can be chained.
    return model.fit(X, y).predict(value)
doc = content[0] score = float(content[1]) test_set.setdefault(doc, score) if cnt == 0: X_test = np.array(weight[idx[doc]]).reshape(cnt + 1, d) Y_test = np.array([score]).reshape(cnt + 1, 1) else: X_test = np.concatenate((X_test, np.array(weight[idx[doc]]).reshape(1, d)), axis=0).reshape(cnt + 1, d) Y_test = np.concatenate((Y_test, np.array([score]).reshape(1, 1)), axis=0).reshape(cnt + 1, 1) cnt += 1 line = next(f) print('predicting...') Y_hat = clflars.predict(X_test) #predict MAE = np.mean(np.abs(Y_hat - Y_test)) print('MAE: %f' % MAE) # print(Y_hat) for idx, doc in enumerate(test_set.keys()): if idx >= cnt: break res.write(QID + ' ' + doc + ' ') res.write(str(float(Y_hat[idx]))) res.write('\n') MAE_TOTAL += MAE / 50 print(QID + ' MAE: %f' % MAE) print('===================================\n') res.write('\nMAE: %f' % MAE_TOTAL)
if y_trainset[i] >= 0.01 and y_trainset[i] < 1: X_trainset_1.append(X_trainset[i]) y_trainset_1.append(y_trainset[i]) reg_1 = LassoLarsCV(max_n_alphas=10, positive=True) reg_1.fit(X_trainset_1, y_trainset_1) ## 预测 mse = 0.0 for i in range(0, y_testset.__len__(), 1): predict_x = 0.0 test_x = X_testset[i] test_x = scaler.transform(test_x) one_classify_pro = classify_model_0003.predict_proba(test_x) probe = one_classify_pro[0] if probe[0] - probe[1] > 0.3: predict_x = reg_0003.predict(test_x) elif probe[1] - probe[0] > 0.3: two_classify_pro = classify_model_001.predict_proba(test_x) probe_two = two_classify_pro[0] if probe_two[0] - probe_two[1] > 0.3: predict_x = reg_001.predict(test_x) elif probe_two[1] - probe_two[0] > 0.3: predict_x = reg_1.predict(test_x) else: predict_x = probe_two[0] * reg_001.predict( test_x) + probe_two[1] * reg_1.predict(test_x) else: predict_x = probe[0] * reg_0003.predict( test_x) + probe[1] * reg_001.predict(test_x) print predict_x, y_testset[i] mse += abs(predict_x - y_testset[i])
# Load an svmlight-format file and, for every row, compute `predict_x` by
# routing the (scaled) row through two classifiers and up to three regressors.
# `scaler`, `classify_model_0003`, `classify_model_001`, `reg_0003`, `reg_001`
# and `reg_1` are defined outside this snippet.
lines = ''
from sklearn.datasets import load_svmlight_file
filename = "data/trainingset/oneThousandProperties.txt"
data = load_svmlight_file(filename)
X_testset, y_testset = data[0], data[1]
# Convert the loaded feature matrix to a dense array so rows can be indexed.
X_testset = X_testset.toarray()
for i in range(0, y_testset.__len__(), 1):
    predict_x = 0.0
    test_x = X_testset[i]
    test_x = scaler.transform(test_x)
    # First-stage classifier: `probe` is the first row of its predict_proba
    # output; the gap between probe[0] and probe[1] decides the route.
    one_classify_pro = classify_model_0003.predict_proba(test_x)
    probe = one_classify_pro[0]
    if probe[0] - probe[1] > 0.4:
        predict_x = reg_0003.predict(test_x)
    elif probe[1] - probe[0] > 0.4:
        # Second-stage classifier chooses between reg_001 and reg_1.
        two_classify_pro = classify_model_001.predict_proba(test_x)
        probe_two = two_classify_pro[0]
        # NOTE(review): if predict_proba yields values in [0, 1], a difference
        # greater than 1 can never occur, so the next two branches look
        # unreachable and the blended `else` always runs -- confirm intent.
        if probe_two[0] - probe_two[1] > 1:
            predict_x = reg_001.predict(test_x)
        elif probe_two[1] - probe_two[0] > 1:
            predict_x = reg_1.predict(test_x)
        else:
            if probe_two[1] > probe_two[0]:
                # The 0.000 factor zeroes out the reg_001 contribution here.
                predict_x = 0.000 * probe_two[0] * reg_001.predict(
                    test_x) + probe_two[1] * reg_1.predict(test_x)
            else:
                # The reg_1 contribution is down-weighted by 0.45.
                predict_x = probe_two[0] * reg_001.predict(
                    test_x) + 0.45 * probe_two[1] * reg_1.predict(test_x)
    # NOTE(review): when neither gap exceeds 0.4, predict_x stays 0.0 in the
    # code visible here -- confirm against the rest of the loop.
normalize=False, scoring=None, store_cv_values=False) RG.fit(trainx[k][0:int(len(trainx[k])/2)],trainy[k][1:int(len(trainy[k])/2)+1]) result=RG.predict(trainx[k]) acc = 0 for i in range(int(len(trainx[k])/2),len(result)): acc=acc+(result[i-1]/trainy[k][i]-trainy[k][i]/trainy[k][i])**2 acc=acc/int(len(result)/2) acc=acc**(1/2.0) print(1-acc) LL = LassoLarsCV(copy_X=True, cv=None, eps=2.2204460492503131e-16, fit_intercept=True, max_iter=500, max_n_alphas=1000, n_jobs=1, normalize=True, positive=False, precompute='auto', verbose=False) LL.fit(trainx[k][0:int(len(trainx[k])/2)],trainy[k][1:int(len(trainy[k])/2)+1]) result=LL.predict(trainx[k]) acc = 0 for i in range(int(len(trainx[k])/2),len(result)): acc=acc+(result[i-1]/trainy[k][i]-trainy[k][i]/trainy[k][i])**2 acc=acc/int(len(result)/2) acc=acc**(1/2.0) print(1-acc) LSC = LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True, max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False, precompute='auto', random_state=None, selection='cyclic', tol=0.0001, verbose=False) LSC.fit(trainx[k][0:int(len(trainx[k])/2)],trainy[k][1:int(len(trainy[k])/2)+1]) result=LSC.predict(trainx[k]) acc = 0
# Prediction for each cluster.
#
# For every k-means cluster a LassoLarsCV model is fitted on that cluster's
# training rows, with the target centred on the cluster's mean SalePrice.
# The model is applied to the test rows of the same cluster and the cluster
# mean is added back to the predictions. All per-cluster results are then
# combined and written to the submission CSV.
from sklearn.linear_model import LassoLarsCV

cluster_frames = []
for cluster in range(0, kmeans.n_clusters):
    train_clus = total[total["Cluster"] == cluster]
    X_clus = train_clus.drop("SalePrice", axis=1)
    mean_clus = np.mean(train_clus["SalePrice"])
    y_clus = train_clus["SalePrice"] - mean_clus
    model = LassoLarsCV(cv=3, max_iter=199999999).fit(X_clus, y_clus)

    X_test_clus = X_test[X_test["Cluster"] == cluster]
    pred = model.predict(X_test_clus.drop("Id", axis=1))
    # Build the result as a fresh frame instead of assigning into a frame
    # derived from a slice of X_test (avoids chained-assignment problems).
    X_test_id = X_test_clus[["Id"]].assign(SalePrice=pred + mean_clus)
    cluster_frames.append(X_test_id)

# Concatenate once rather than growing a frame inside the loop; fall back to
# an empty frame with the expected columns when there are no clusters.
if cluster_frames:
    results = pd.concat(cluster_frames)
else:
    results = pd.DataFrame(columns=["Id", "SalePrice"])
test_final = results

# Re-mean the prediction
#test_final["SalePrice"] += mean_y
test_final.head(5)
test_final.tail(5)

# Rename
submission = test_final
submission.to_csv(r'Submission_Daphne.csv', index=False)
# Finish labelling the coefficient-progression figure.
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths of Selected Variables')

# Plot mean squared error for each fold in a new figure.
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
_fold_average = model.cv_mse_path_.mean(axis=-1)
plt.plot(m_log_alphascv, _fold_average, 'k',
         linewidth=2, label='Average across the folds')
# Dashed vertical line at the alpha selected by cross-validation.
_chosen_alpha_x = -np.log10(model.alpha_)
plt.axvline(_chosen_alpha_x, color='k', linestyle='--', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
plt.xlim(1.95, 4.0)

# MSE from training and test data.
training_error = mean_squared_error(
    target_train, model.predict(predictors_train))
test_error = mean_squared_error(
    target_test, model.predict(predictors_test))
for _heading, _metric in (('Training data MSE', training_error),
                          ('Test data MSE', test_error)):
    print(_heading)
    print(_metric)

# R-squared from training and test data.
rsquared_train = model.score(predictors_train, target_train)
rsquared_test = model.score(predictors_test, target_test)
for _heading, _metric in (('Training data R**2', rsquared_train),
                          ('Test data R**2', rsquared_test)):
    print(_heading)
    print(_metric)
#train["outlier"] = LocalOutlierFactor().fit_predict(train) #test["outlier"] = LocalOutlierFactor().fit_predict(test) ######################################################################################################################## ################## LEAST ANGLE REGRESSION ################## ######################################################################################################################## from sklearn.linear_model import LassoLarsCV from sklearn.linear_model import Lasso from sklearn.model_selection import GridSearchCV #import xgboost as xgb y_LARS_train = train["SalePrice"] - np.mean(train["SalePrice"]) model = LassoLarsCV(cv=10, max_iter=199999999).fit(X_train, y_LARS_train) model.alpha_ pred = model.predict(X_test) pred += np.median(train["SalePrice"]) from modules.modelaccuracy import allyouneedtoknow allyouneedtoknow(pred, y_test) from sklearn import linear_model y_other_train = np.log1p(train["SalePrice"]) from sklearn.svm import SVR svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1) svr_lin = SVR(kernel='linear', C=100, gamma='auto') svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1,
# Evaluate several regressors on the same train/test split, printing the
# mean absolute percentage error (MAPE) of each one on the test set.

# `GB` is created outside this snippet.
GB.fit(x_train, y_train)
y_pred = GB.predict(x_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
print('GB', mape)

# ---- MLPRegressor (ANN) ----
from sklearn.neural_network import MLPRegressor
Neural_MLP = MLPRegressor(max_iter=250, hidden_layer_sizes=(50, ))
#Neural_MLP = MLPRegressor()
# Fit the model, then predict on the test data set.
y_pred = Neural_MLP.fit(x_train, y_train).predict(x_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
print('MAPE', mape)

# ---- LASSO ----
from sklearn.linear_model import LassoLarsCV
#from sklearn import preprocessing
Lasso_model = LassoLarsCV()
# Fit, then predict the values for the test data set.
y_pred = Lasso_model.fit(x_train, y_train).predict(x_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
print('MAPE', mape)

# ---- Linear regression ----
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
# Run the model, then predict on the test data set.
y_pred = lm.fit(x_train, y_train).predict(x_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
print('MAPE', mape)
model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV') plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean squared error') plt.title('Mean squared error on each fold') # There is variability across individual cv as variables area added in the same pattern # = Decrease rapidly and then level off to point where more prediction is not reducing MSE # 3.5 MSE from training and test data from sklearn.metrics import mean_squared_error train_error = mean_squared_error(tar_train, model.predict(pred_train)) test_error = mean_squared_error(tar_test, model.predict(pred_test)) print('training data MSE') print(train_error) print('test data MSE') print(test_error) #similar accuracy # 3.6 R-square from training and test data rsquared_train = model.score(pred_train, tar_train) rsquared_test = model.score(pred_test, tar_test) print('training data R-square') print(rsquared_train) print('test data R-square') print(rsquared_test) #more accurate than training data
# Finish and display the coefficient-progression figure.
plt.title('Regression Coefficients Progression for Lasso Paths')
plt.show()

# Plot the cross-validated mean squared error of each fold against
# -log10(alpha), together with the average across folds.
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
# Dashed vertical line at the alpha selected by cross-validation.
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
plt.show()

# Mean squared error on the training and test data.
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print('training data MSE %s' % (train_error))
print('test data MSE %s' % (test_error))

# model.score() on the training and test data, reported as R-square.
rsquared_train = model.score(pred_train, tar_train)
rsquared_test = model.score(pred_test, tar_test)
print('training data R-square %s' % (rsquared_train))
print('test data R-square %s' % (rsquared_test))