def test_few_at_least_as_good_as_default():
    """test_few.py: few performs at least as well as the default ML

    Shuffles the Boston housing table with a fixed seed, fits FEW (wrapping
    LassoLarsCV) and a plain LassoLarsCV on the first 300 rows, and asserts
    that FEW's score on those rows, rounded to 8 decimals, is not lower than
    the plain model's.  Scores on the remaining rows are printed only.
    """
    n_fit = 300

    np.random.seed(1006987)
    boston = load_boston()
    # Stack features and target so one shuffle keeps the rows aligned.
    table = np.column_stack((boston.data, boston.target))
    np.random.shuffle(table)
    X, y = table[:, :-1], table[:, -1]
    X_fit, y_fit = X[:n_fit], y[:n_fit]
    X_rest, y_rest = X[n_fit:], y[n_fit:]
    print("feature shape:", boston.data.shape)

    learner = FEW(
        generations=1,
        population_size=5,
        ml=LassoLarsCV(),
        min_depth=1,
        max_depth=3,
        sel='tournament',
    )
    learner.fit(X_fit, y_fit)
    few_score = learner.score(X_fit, y_fit)
    few_test_score = learner.score(X_rest, y_rest)

    # Baseline: the same learner without FEW's engineered features.
    lasso = LassoLarsCV()
    lasso.fit(X_fit, y_fit)
    lasso_score = lasso.score(X_fit, y_fit)
    lasso_test_score = lasso.score(X_rest, y_rest)

    print("few score:", few_score, "lasso score:", lasso_score)
    print("few test score:", few_test_score,
          "lasso test score:", lasso_test_score)
    assert round(few_score, 8) >= round(lasso_score, 8)
    print("lasso coefficients:", lasso.coef_)
def test_few_at_least_as_good_as_default():
    """test_few.py: few performs at least as well as the default ML

    Fits FEW (wrapping LassoLarsCV, tournament selection, fixed
    ``random_state``) on the first 300 shuffled Boston rows, then fits a plain
    LassoLarsCV on the data FEW kept in ``_training_features`` /
    ``_training_labels`` and asserts FEW's score on the 300 rows is not lower.
    """
    n_fit = 300

    np.random.seed(1006987)
    boston = load_boston()
    # Stack features and target so one shuffle keeps the rows aligned.
    table = np.column_stack((boston.data, boston.target))
    np.random.shuffle(table)
    X, y = table[:, :-1], table[:, -1]
    X_fit, y_fit = X[:n_fit], y[:n_fit]
    X_rest, y_rest = X[n_fit:], y[n_fit:]
    print("feature shape:", boston.data.shape)

    few_settings = dict(
        generations=1,
        population_size=5,
        mutation_rate=1,
        crossover_rate=1,
        ml=LassoLarsCV(),
        min_depth=1,
        max_depth=3,
        sel='tournament',
        fit_choice='r2',
        tourn_size=2,
        random_state=0,
        verbosity=0,
        disable_update_check=False,
    )
    learner = FEW(**few_settings)
    learner.fit(X_fit, y_fit)
    few_score = learner.score(X_fit, y_fit)
    test_score = learner.score(X_rest, y_rest)

    # Baseline is trained on exactly what FEW stored as its training data.
    lasso = LassoLarsCV()
    lasso.fit(learner._training_features, learner._training_labels)
    lasso_score = lasso.score(X_fit, y_fit)

    print("few score:", few_score, "lasso score:", lasso_score)
    print("few test score:", test_score,
          "lasso test score:", lasso.score(X_rest, y_rest))
    assert few_score >= lasso_score
    print("lasso coefficients:", lasso.coef_)
def lasso_regr(wine_set):
    """Fit a cross-validated LassoLars model of wine quality and report it.

    Standardizes eleven chemistry columns of ``wine_set``, holds out 30% of
    the rows (``random_state=123``), fits ``LassoLarsCV(cv=10)`` on the rest,
    prints the coefficients and chosen alpha, shows the coefficient-path and
    per-fold MSE figures, and prints MSE and R-square for both splits.
    Nothing is returned.
    """
    feature_names = ["density", 'alcohol', 'sulphates', 'pH',
                     'volatile_acidity', 'chlorides', 'fixed_acidity',
                     'citric_acid', 'residual_sugar', 'free_sulfur_dioxide',
                     'total_sulfur_dioxide']
    pred = wine_set[feature_names]
    targets = wine_set.quality

    # standardize predictors to have mean=0 and sd=1; scaling yields a bare
    # array, so the column names are put back afterwards
    predictors = pd.DataFrame(preprocessing.scale(pred.copy()))
    predictors.columns = pred.columns

    # split into training and testing sets
    pred_train, pred_test, tar_train, tar_test = train_test_split(
        predictors, targets, test_size=.3, random_state=123)

    # specify the lasso regression model
    model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

    print('Predictors and their regression coefficients:')
    coef_by_name = dict(zip(predictors.columns, model.coef_))
    for name, coef in coef_by_name.items():
        print(name, ':', coef)

    # plot coefficient progression
    path_x = -np.log10(model.alphas_)
    plt.plot(path_x, model.coef_path_.T)
    print('\nAlpha:', model.alpha_)
    plt.axvline(-np.log10(model.alpha_), linestyle="dashed", color='k',
                label='alpha CV')
    plt.ylabel("Regression coefficients")
    plt.xlabel("-log(alpha)")
    plt.title('Regression coefficients progression for Lasso paths')
    plt.show()

    # plot mean squared error for each fold
    cv_x = -np.log10(model.cv_alphas_)
    fold_mse = model.cv_mse_path_
    plt.plot(cv_x, fold_mse, ':')
    plt.plot(cv_x, fold_mse.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.show()

    splits = (('training', pred_train, tar_train),
              ('test', pred_test, tar_test))

    # Mean squared error from training and test data
    mse = {label: mean_squared_error(y, model.predict(X))
           for label, X, y in splits}
    print('\nMean squared error for training data:', mse['training'])
    print('Mean squared error for test data:', mse['test'])

    # R-square from training and test data
    r2 = {label: model.score(X, y) for label, X, y in splits}
    print('\nR-square for training data:', r2['training'])
    print('R-square for test data:', r2['test'])
def _show_lasso_coefficient_paths(model):
    """Show each coefficient along the LARS path against -log10(alpha)."""
    m_log_alphas = -np.log10(model.alphas_)
    plt.plot(m_log_alphas, model.coef_path_.T)
    print('\nAlpha:', model.alpha_)
    # dashed vertical line at the alpha picked by cross-validation
    plt.axvline(-np.log10(model.alpha_), linestyle="dashed", color='k',
                label='alpha CV')
    plt.ylabel("Regression coefficients")
    plt.xlabel("-log(alpha)")
    plt.title('Regression coefficients progression for Lasso paths')
    plt.show()


def _show_lasso_cv_mse(model):
    """Show the per-fold CV error curves and their average."""
    m_log_alphascv = -np.log10(model.cv_alphas_)
    plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
    average_mse = model.cv_mse_path_.mean(axis=-1)
    plt.plot(m_log_alphascv, average_mse, 'k',
             label='Average across the folds', linewidth=2)
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.show()


def lasso_regr(wine_set):
    """Run the wine-quality LassoLarsCV analysis and print/plot its results.

    The eleven predictor columns are standardized, split 70/30 with
    ``random_state=123``, and used to fit ``LassoLarsCV(cv=10)``.  The
    function prints coefficients, alpha, MSE and R-square, and shows two
    matplotlib figures.  It returns ``None``.
    """
    pred = wine_set[["density", 'alcohol', 'sulphates', 'pH',
                     'volatile_acidity', 'chlorides', 'fixed_acidity',
                     'citric_acid', 'residual_sugar', 'free_sulfur_dioxide',
                     'total_sulfur_dioxide']]
    targets = wine_set.quality

    # standardize predictors to have mean=0 and sd=1
    standardized = preprocessing.scale(pred.copy())
    predictors = pd.DataFrame(standardized)
    predictors.columns = pred.columns

    # split into training and testing sets
    split = train_test_split(predictors, targets,
                             test_size=.3, random_state=123)
    pred_train, pred_test, tar_train, tar_test = split

    # specify the lasso regression model
    model = LassoLarsCV(cv=10, precompute=False)
    model = model.fit(pred_train, tar_train)

    print('Predictors and their regression coefficients:')
    coefficients = dict(zip(predictors.columns, model.coef_))
    for column in coefficients:
        print(column, ':', coefficients[column])

    _show_lasso_coefficient_paths(model)
    _show_lasso_cv_mse(model)

    # Mean squared error from training and test data
    fitted_train = model.predict(pred_train)
    fitted_test = model.predict(pred_test)
    print('\nMean squared error for training data:',
          mean_squared_error(tar_train, fitted_train))
    print('Mean squared error for test data:',
          mean_squared_error(tar_test, fitted_test))

    # R-square from training and test data
    print('\nR-square for training data:', model.score(pred_train, tar_train))
    print('R-square for test data:', model.score(pred_test, tar_test))
def lassolarscv():
    """Fit LassoLarsCV on the module-level training data and save predictions.

    Uses five shuffled 80/20 splits (``random_state=0``) of ``base_X`` /
    ``base_Y`` for cross-validation, prints the score on that same training
    data, and writes the predictions for ``X_test`` to ``lassolars.csv`` via
    ``write_to_file``.
    """
    print("Doing cross-validated LassoLars")
    splitter = cross_validation.ShuffleSplit(
        len(base_X),
        n_iter=5,
        test_size=0.2,
        random_state=0,
    )
    estimator = LassoLarsCV(cv=splitter)
    estimator.fit(base_X, base_Y)

    training_score = estimator.score(base_X, base_Y)
    print("Score = %f" % training_score)

    write_to_file("lassolars.csv", estimator.predict(X_test))
def fit_linear_model(X, Y, verbose=True):
    """Fit a cross-validated LassoLars regression and optionally report it.

    The observations are divided by ``split_obs`` (defined elsewhere in the
    project), a ``LassoLarsCV(cv=20)`` is fitted on the training part and,
    when ``verbose`` is true, the score on the test part and the selected
    ``alpha_`` are printed.

    Args:
        X: predictor observations, passed straight to ``split_obs``.
        Y: target observations.  Its ``name`` attribute (as on a pandas
            Series) is printed when present; a target without one is
            reported as ``None`` instead of raising ``AttributeError``.
        verbose: print a short summary when true.

    Returns:
        The fitted ``LassoLarsCV`` estimator.
    """
    X_train, X_test, Y_train, Y_test = split_obs(X, Y)
    estimator = LassoLarsCV(cv=20).fit(X_train, Y_train)

    if verbose:
        # The held-out score is only needed for the report, so it is not
        # computed for silent callers.
        test_score = estimator.score(X_test, Y_test)
        target_name = getattr(Y, 'name', None)
        print('***** Linear Regression Stats *****')
        print(f'target: {target_name}')
        print(f'r-squared: {round(test_score, 4)}')
        print(f'alpha: {estimator.alpha_}\n')

    return estimator
class determine_attribute_quality(object):
    """LassoLarsCV analysis of wine quality for the red and white data sets.

    The two data sets are stored as given; ``regression`` runs on whichever
    frame is passed to it and keeps every intermediate result on ``self``.
    """

    # Columns used as predictors of ``quality``.
    _FEATURES = ['density', 'alcohol', 'sulphates', 'pH', 'volatile_acidity',
                 'chlorides', 'fixed_acidity', 'citric_acid',
                 'residual_sugar', 'free_sulfur_dioxide',
                 'total_sulfur_dioxide']

    def __init__(self, red, white):
        """Store the red and white wine data sets."""
        self.red = red
        self.white = white

    def remove_column_spaces(self, wine_data):
        """Strip column names and turn inner spaces into underscores.

        The frame is modified in place and also returned.
        """
        cleaned = []
        for column in wine_data.columns:
            cleaned.append(column.strip().replace(' ', '_'))
        wine_data.columns = cleaned
        return wine_data

    def regression(self, wine_data):
        """Fit, print and plot a cross-validated lasso model of ``quality``.

        Predictors are standardized, 20% of the rows are held out
        (``random_state=123``) and ``LassoLarsCV(cv=10)`` is fitted on the
        rest.  Coefficients, alpha, MSE and R-square are printed and two
        figures are shown.  Returns ``None``.
        """
        self.pred = wine_data[list(self._FEATURES)]
        self.predictors = self.pred.copy()
        self.targets = wine_data.quality

        # Normalization (scaling drops the column names, so restore them)
        self.predictors = pd.DataFrame(preprocessing.scale(self.predictors))
        self.predictors.columns = self.pred.columns

        # Split into Training and Testing sets
        split = train_test_split(self.predictors, self.targets,
                                 test_size=.2, random_state=123)
        (self.pred_train, self.pred_test,
         self.target_train, self.target_test) = split

        # Lasso Regression Model
        estimator = LassoLarsCV(cv=10, precompute=False)
        self.model = estimator.fit(self.pred_train, self.target_train)
        model = self.model

        print('Predictors and their Regression coefficients:')
        coefficients = dict(zip(self.predictors.columns, model.coef_))
        for name, value in coefficients.items():
            print(name, ':', value)

        # Plot Coefficient Progression
        path_x = -np.log10(model.alphas_)
        plt.plot(path_x, model.coef_path_.T)
        print('\nAlpha:', model.alpha_)
        plt.axvline(-np.log10(model.alpha_), linestyle="dashed", color='k',
                    label='alpha CV')
        plt.ylabel("Regression coefficients")
        plt.xlabel("-log(alpha)")
        plt.title('Regression coefficients progression for Lasso paths')
        plt.show()

        # Plot MSE for each fold
        cv_x = -np.log10(model.cv_alphas_)
        plt.plot(cv_x, model.cv_mse_path_, ':')
        plt.plot(cv_x, model.cv_mse_path_.mean(axis=-1), 'k',
                 label='Average across the folds', linewidth=2)
        plt.legend()
        plt.xlabel('-log(alpha)')
        plt.ylabel('Mean Squared Error')
        plt.title('Mean Squared Error on Each Fold')
        plt.show()

        # Mean Squared Error from Training and Test data
        train_fit = model.predict(self.pred_train)
        self.train_error = mean_squared_error(self.target_train, train_fit)
        test_fit = model.predict(self.pred_test)
        self.test_error = mean_squared_error(self.target_test, test_fit)
        print('\nMean squared error for training data:', self.train_error)
        print('Mean squared error for test data:', self.test_error)

        # R-square from Training and Test data
        self.rsquared_train = model.score(self.pred_train, self.target_train)
        self.rsquared_test = model.score(self.pred_test, self.target_test)
        print('\nR-square for training data:', self.rsquared_train)
        print('R-square for test data:', self.rsquared_test)
# plot mean square error for each fold m_log_alphascv = -np.log10(model.cv_alphas_) plt.figure() plt.plot(m_log_alphascv, model.cv_mse_path_, ':') plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV') plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean squared error') plt.title('Mean squared error on each fold') # MSE from training and test data from sklearn.metrics import mean_squared_error train_error = mean_squared_error(tar_train, model.predict(pred_train)) test_error = mean_squared_error(tar_test, model.predict(pred_test)) print ('training data MSE') print(train_error) print ('test data MSE') print(test_error) # R-square from training and test data rsquared_train=model.score(pred_train,tar_train) rsquared_test=model.score(pred_test,tar_test) print ('training data R-square') print(rsquared_train) print ('test data R-square') print(rsquared_test)
# Axis label and title for the current figure (presumably the coefficient
# progression plot drawn just before this chunk -- confirm against the file).
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths of Selected Variables')

# Plot the cross-validation mean squared error of every fold against
# -log10(alpha).
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
# Dotted curves: one per fold.
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
# Solid black curve: the error averaged over the folds (last axis).
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds',linewidth=2)
# Dashed vertical line at the alpha the fitted model selected.
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
# Hard-coded x window for this figure; data outside 1.95..4.0 is not shown.
plt.xlim(1.95,4.0)

# MSE of the model's predictions on the training and the test data.
training_error = mean_squared_error(target_train,model.predict(predictors_train))
test_error = mean_squared_error(target_test,model.predict(predictors_test))
print('Training data MSE')
print(training_error)
print('Test data MSE')
print(test_error)

# R-squared (the estimator's score) on the training and the test data.
rsquared_train=model.score(predictors_train,target_train)
rsquared_test=model.score(predictors_test,target_test)
print('Training data R**2')
print(rsquared_train)
print('Test data R**2')
print(rsquared_test)
# Axis label and title for the current figure (presumably the coefficient
# progression plot drawn just before this chunk -- confirm against the file).
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')

# Plot the cross-validation mean squared error of every fold against
# -log10(alpha).
m_log_alphascv = -np.log10(LassoRegModel.cv_alphas_)
plt.figure()
# Dotted curves: one per fold.
plt.plot(m_log_alphascv, LassoRegModel.cv_mse_path_, ':')
# Solid black curve: the error averaged over the folds (last axis).
plt.plot(m_log_alphascv, LassoRegModel.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
# Dashed vertical line at the alpha the fitted model selected.
plt.axvline(-np.log10(LassoRegModel.alpha_), linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')

# MSE of the model's predictions on the training and the test data.
train_error = mean_squared_error(tar_train, LassoRegModel.predict(pred_train))
test_error = mean_squared_error(tar_test, LassoRegModel.predict(pred_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)

# R-square (the estimator's score) on the training and the test data.
rsquared_train=LassoRegModel.score(pred_train,tar_train)
rsquared_test=LassoRegModel.score(pred_test,tar_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
# Select the predictor columns (`col`) and the variable of interest from the
# prepared train/test frames.
X_train, y_train = train_set[col], train_set[interest]
X_test, y_test = test_set[col], test_set[interest]

# NOTE(review): `score` and `tuned_params_lasso` are not used anywhere in the
# visible code (LassoLarsCV picks alpha itself) -- confirm whether later code
# needs them.  The alpha grid also spans negative values; verify that is
# intended before feeding it to any Lasso estimator.
score = 'mean_squared_error'
tuned_params_lasso = [{'alpha': np.linspace(-1, 1, 100), 'normalize': [True, False]}]

### ACROSS WHOLE DATASET
### With StratifiedKFold, we're stratifying according to the interest variable.
### This ensures that there will be an even proportion of RAVLT_DEL (or whatever
### the interest variable is) values across all folds.
skf = cross_validation.StratifiedKFold(y_aging, n_folds=6)
# Fit on X_aging / y_aging (not the X_train / y_train selected above), using
# the six stratified folds to choose alpha.
model = LassoLarsCV(max_iter=100000, cv=skf).fit( X_aging, y_aging )
# print("Best estimator for WHOLE DATASET: \n{0}\n".format(model.best_estimator_))
# The score is computed on the same data the model was fitted on.
# NOTE(review): for scikit-learn regressors score() is R^2, a fraction rather
# than a percentage -- confirm the wording of the message.
print("Percent variance explained: {0}".format(model.score( X_aging, y_aging)))
print("Coefficients found: \n{0}\n".format(prettyprint(model.coef_, col, sort=True)))

# Plot coefficient progression along the path against -log10(alpha).
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()  # not used again in the visible code
# coef_path_ is transposed so its first axis matches the alphas.
plt.plot(m_log_alphas, model.coef_path_.T)
# Dashed vertical line at the alpha selected by cross-validation.
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
plt.show()

### ACROSS SUPERAGERS
def _neg_log10_alphas(alphas):
    """Return ``-log10(alphas)`` as a float array, with NaN where alpha is 0.

    A zero alpha has no finite position on a ``-log10`` axis.  Masking it
    with NaN avoids the divide-by-zero warning and makes matplotlib skip that
    point, without touching the array that was passed in.
    """
    alphas = np.asarray(alphas, dtype=float)
    return -np.log10(np.where(alphas == 0, np.nan, alphas))


def main():
    """Main function for assignment 03.

    Loads the prepared data set, standardizes every column, fits a
    ``LassoLarsCV`` model of ``price`` on all non-price columns (70/30
    train/test split, ``random_state=0``), prints the coefficients, MSE and
    R-square, and saves the coefficient-path and CV-error figures to
    ``result00.png`` and ``result01.png``.

    Returns:
        dict: ``{'model': <fitted LassoLarsCV>, 'dataframe': <scaled df>}``.
        The fitted estimator is returned unmodified.

    Raises:
        ValueError: if the data contains nulls, or if any column mean is
            further than 1e-3 from zero after scaling.
    """
    # Load prepared data.
    df = return_proc_and_transf_data_set()
    # Mass is already included as mass in SI units.
    df.drop(['carat'], inplace=True, axis=1)
    # Those are dummy variables not needed in our data set anymore.
    df.drop(['price_expensive', 'price_expensive_binary'], inplace=True,
            axis=1)

    # A bit of error checking.
    if df.isnull().values.any():
        raise ValueError('Your data has unintended nulls.')

    # Cast our dataframe into float type.
    df = df.astype('float64')

    # Scale our dataframe to avoid the sparsity control of our dataframe
    # biased against some variables.
    print('Prior to scaling:')
    print(df.describe())
    df = df.apply(preprocessing.scale)
    print('After scaling:')
    print(df.describe())
    print_separator()
    if (df.mean().abs() > 1e-3).any():
        raise ValueError('Scaling of your dataframe went wrong.')

    # Split into training and testing sets.
    # The predictors should not include any price variable since this was
    # used to create the output variable.
    predictors = [x for x in df.columns.tolist() if 'price' not in x]
    print('Input variables:')
    pprint(predictors, indent=4)
    input_variables = df[predictors].copy()
    output_variable = df.price.copy()  # Categorized price
    print_separator()

    input_training, input_test, output_training, output_test = (
        train_test_split(input_variables, output_variable,
                         test_size=0.3, random_state=0))

    # A few words about the LassoLarsCV:
    #   LASSO: least absolute shrinkage and selection operator (discussed
    #       in the course material).
    #   LARS: least angle regression: algorithm for linear regression
    #       models on high-dimensional data (aka 'a lot of categories').
    #       Compared to simple LASSO this model uses the LARS algorithm
    #       instead of the 'vanilla' coordinate descent of simple LASSO.
    #   CV: cross validation: this sets the alpha parameter (referred to as
    #       lambda in the course video) by cross validation.  In the simple
    #       LARS this alpha (the penalty factor) is an input of the function.
    #       The alpha parameter controls the degree of sparsity of the
    #       coefficients estimated.  If alpha = zero then the method is the
    #       same as OLS.
    model = LassoLarsCV(
        cv=10,  # Number of folds.
        precompute=False,  # Do not precompute Gram matrix.
    ).fit(input_training, output_training)

    dict_var_lin_coefs = dict(zip(predictors, model.coef_))

    print('Result of linear model:')
    # Smallest absolute coefficient first.
    pprint(sorted(dict_var_lin_coefs.items(), key=lambda item: abs(item[1])))
    print_separator()

    # Plot coefficient progression.
    # TODO: plot those on 4 different subplots.
    model_log_alphas = _neg_log10_alphas(model.alphas_)
    plt.plot(model_log_alphas, model.coef_path_.T)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.ylabel('Regression Coefficients')
    plt.xlabel('-log(alpha)')
    plt.title('Regression Coefficients Progression for Lasso Paths')
    plt.legend(predictors, loc='best')
    plt.tight_layout()
    plt.savefig('result00.png', dpi=600)
    plt.close()

    # TODO: why are the coefficients in the result very different than the
    # coefficient path?
    #
    # There seems to be a scaling of the coefficient paths with an arbitrary
    # almost the same constant (194 in this case)
    #
    # print('Resulting alpha is not different than path alpha (difference):')
    # pprint(model.alpha_ - model.alphas_)
    # print('Resulting coefficients are very different than path '
    #       'coefficients (difference):')
    # pprint(model.coef_ - model.coef_path_.T)
    # print_separator()

    # Plot mean square error for each fold.
    # The x values are computed into a local array; the fitted model's
    # ``cv_alphas_`` is left untouched because the model is returned to the
    # caller.
    model_log_alphas = _neg_log10_alphas(model.cv_alphas_)
    plt.figure()
    plt.plot(model_log_alphas, model.cv_mse_path_, ':')
    plt.plot(model_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.legend()
    plt.tight_layout()
    plt.savefig('result01.png', dpi=600)
    plt.close()

    # Mean squared error of our model.
    train_error = mean_squared_error(output_training,
                                     model.predict(input_training))
    test_error = mean_squared_error(output_test, model.predict(input_test))
    print('Training data MSE')
    print(train_error)
    print('Test data MSE')
    print(test_error)
    print_separator()

    # R-square from training and test data.
    rsquared_train = model.score(input_training, output_training)
    rsquared_test = model.score(input_test, output_test)
    print('Training data R-square')
    print(rsquared_train)
    print('Test data R-square')
    print(rsquared_test)
    print_separator()

    return {'model': model, 'dataframe': df}
# Test-set RMSE and score of the previous section's model (`rd`), followed by
# that section's train/test report.
y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred))
y_test_score = rd.score(x_test, y_test)
print('训练集RMSE: {0}, R方: {1}'.format(y_train_rmse, y_train_score))
print('测试集RMSE: {0}, R方: {1}'.format(y_test_rmse, y_test_score))

# Section banner; a bare string expression with no runtime effect.
'''========9.Lasso回归========'''
import numpy as np
import matplotlib.pyplot as plt  # plotting / visualization
from sklearn.linear_model import Lasso, LassoCV, LassoLarsCV
# Lasso: plain lasso regression; LassoCV: selects alpha by cross-validation;
# LassoLarsCV: selects alpha by cross-validation based on least-angle
# regression.
#model = Lasso(alpha=0.01)  # tuning alpha controls the degree of fitting
# model = LassoCV()  # LassoCV tunes alpha automatically to pick the best one
model = LassoLarsCV()  # LassoLarsCV tunes alpha automatically to pick the best one
model.fit(x_train, y_train)  # fit the linear regression model
print('系数矩阵:\n', model.coef_, model.intercept_)
print('线性回归模型:\n', model)
print('最佳的alpha:', model.alpha_)  # only available when using LassoCV / LassoLarsCV

# Predict with the model:
# predict on the training data and on the test data separately.
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Compute the root mean squared error and the goodness of fit for each.
y_train_rmse = sqrt(metrics.mean_squared_error(y_train, y_train_pred))
y_train_score = model.score(x_train, y_train)
y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred))
y_test_score = model.score(x_test, y_test)
print('训练集RMSE: {0}, R方: {1}'.format(y_train_rmse, y_train_score))
print('测试集RMSE: {0}, R方: {1}'.format(y_test_rmse, y_test_score))
model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
# NOTE(review): the line above is the tail of a call that opens before this
# chunk (presumably the plt.plot of the fold-averaged CV error) -- confirm.

# Dashed vertical line at the alpha the fitted model selected.
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
plt.savefig('Fig02')

# MSE of the model's predictions on the training and the test data.
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(resp_train, model.predict(pred_train))
test_error = mean_squared_error(resp_test, model.predict(pred_test))
print('training data MSE')
print(train_error)
print('test data MSE')
print(test_error)

# R-square (the estimator's score) on the training and the test data.
rsquared_train = model.score(pred_train, resp_train)
rsquared_test = model.score(pred_test, resp_test)
print('training data R-square')
print(rsquared_train)
print('test data R-square')
print(rsquared_test)
#plot coefficient progrssion m_log_alphascv = -np.log10(model.alphas_) ax = pyplot.gca() pyplot.plot(m_log_alphascv, model.coef_path_.T) #.T is to transpose the coeff_path_attri matrix to match the first dim of array of alpha values pyplot.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha_CV') #print("Alpha Value:",m_log_alphascv, "Coefficients:",model.cv_mse_path_) pyplot.ylabel('Regression Coefficients') pyplot.xlabel('-log(alpha)') pyplot.title('Regression coefficients for lasso plots') pyplot.show() # Indicate the lasso parameter that minimizes the average MSE acrossfolds. lasso_fit = model.fit(x, y) lasso_path = model.score(x, y) pyplot.axvline(lasso_fit.alpha_, color = 'red') pyplot.title("Lasso parameter") print('Deg. Coefficient') print(lasso_fit.intercept_) print(dict(zip(X_train.columns, lasso_fit.coef_))) #print("Lasso parameter:",lasso_fit.alpha_) pyplot.show() #MSE for training and testing data train_error=mean_squared_error(y_train, model.predict(X_train)) test_error=mean_squared_error(y_test, model.predict(X_test)) print ("Traiing data MSE") print (train_error) print ("Testing data MSE") print (test_error)
# Cross-validation error against m_log_alphascv (defined before this chunk).
# Dotted curves: one per fold.
plt.plot(m_log_alphascv, lassomodel.cv_mse_path_, ':')
# Solid black curve: the error averaged over the folds (last axis).
plt.plot(m_log_alphascv, lassomodel.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
# Dashed vertical line at the alpha the fitted model selected.
plt.axvline(-np.log10(lassomodel.alpha_), linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')

# MSE of the model's predictions on the training and the test data.
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(y_train, lassomodel.predict(X_train))
test_error = mean_squared_error(y_test, lassomodel.predict(X_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)

# R-square (the estimator's score) on the training and the test data.
rsquared_train= lassomodel.score(X_train,y_train)
rsquared_test= lassomodel.score(X_test,y_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
## Performs Extremely Badly ##
# LassoLarsCV regression of US equity returns on four macro factors; prints
# the score on a 20% hold-out.
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split

USE = pd.read_csv(
    r"C:\Users\chias\source\repos\FFM-MA\US equities.csv"
)

# Factor columns are the predictors, the return column is the target.
x = USE[['growth', 'inflation', 'liquidity', 'risk app']]
y = USE['return']

# Fixed random_state keeps the 80/20 split reproducible.
use_train_x, use_test_x, use_train_y, use_test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=6,
)

lm = LassoLarsCV(max_iter=1000, precompute='auto', cv=10, eps=0.0001)
lm.fit(use_train_x, use_train_y)

print("USE SCORE:", lm.score(use_test_x, use_test_y), sep="\n")
# Tail of the previous section's report (ess / r2 are computed before this
# chunk).
print "ESS(Explained Sum of Squares): ", ess
print "R^2: ", r2

# ---- LassoLarsCV section ----
print "\n**********测试LassoLarsCV类**********"
lassoLarscv = LassoLarsCV(cv=5)
# Fit on the training set (the target is flattened to 1-D first).
lassoLarscv.fit(train_X, train_Y.values.ravel())
# Print the model's coefficients and intercept.
print "系数:", lassoLarscv.coef_
print "截距:", lassoLarscv.intercept_
print '训练集R2: ', r2_score(train_Y, lassoLarscv.predict(train_X))

# For linear regression models, the mean squared error (MSE) or the root mean
# squared error (RMSE) on the test set is generally used to judge how good
# the model is.
test_Y_pred = lassoLarscv.predict(test_X)
print "测试集得分:", lassoLarscv.score(test_X, test_Y)
print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
print "测试集R2:", r2_score(test_Y, test_Y_pred)

# Sums of squares from the project helper xss, using predictions for X
# against Y.
tss, rss, ess, r2 = xss(Y, lassoLarscv.predict(X))
print "TSS(Total Sum of Squares): ", tss
print "RSS(Residual Sum of Squares): ", rss
print "ESS(Explained Sum of Squares): ", ess
print "R^2: ", r2

# ---- LassoLarsIC section ----
print "\n**********测试LassoLarsIC类**********"
lassoLarsIC = LassoLarsIC()
# lassoLarsIC = LassoLarsIC(criterion='bic')
# Fit on the training set.
lassoLarsIC.fit(train_X, train_Y.values.ravel())
# Solid black curve: the cross-validation error averaged over the folds (last
# axis), against m_log_alphascv (defined before this chunk).
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
# Dashed vertical line at the alpha the fitted model selected.
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')

# MSE of the model's predictions on the training and the test data.
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(training_target, model.predict(training_data))
test_error = mean_squared_error(test_target, model.predict(test_data))
print('training data MSE')
print(train_error)
print('test data MSE')
print(test_error)

# R-square (the estimator's score) on the training and the test data.
rsquared_train = model.score(training_data, training_target)
rsquared_test = model.score(test_data, test_target)
print('training data R-square')
print(rsquared_train)
print('test data R-square')
print(rsquared_test)
#!/usr/bin/env python
# Fit LassoLarsCV on 90% of the shuffled Boston housing data and print the
# score on the remaining 10%.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

boston = datasets.load_boston()

# Shuffle features and target together with a fixed seed.
X, Y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)

# The first 90% of the rows train the model, the rest evaluate it.
offset = int(X.shape[0] * 0.9)
X_train, X_test = X[:offset], X[offset:]
Y_train, Y_test = Y[:offset], Y[offset:]

regressor = LassoLarsCV(cv=15)
regressor.fit(X_train, Y_train)

score = regressor.score(X_test, Y_test)
print(score)
#plot coefficient progrssion m_log_alphascv = -np.log10(model.alphas_) ax = pyplot.gca() pyplot.plot(m_log_alphascv, model.coef_path_.T) #.T is to transpose the coeff_path_attri matrix to match the first dim of array of alpha values pyplot.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha_CV') print("Alpha Value:",m_log_alphascv, "Coefficients:",model.cv_mse_path_) pyplot.ylabel('Regression Coefficients') pyplot.xlabel('-log(alpha)') pyplot.title('Regression coefficients for lasso plots') pyplot.show() # Indicate the lasso parameter that minimizes the average MSE acrossfolds. lasso_fit = model.fit(predictors, y) lasso_path = model.score(predictors, y) pyplot.axvline(lasso_fit.alpha_, color = 'red') pyplot.title("Lasso parameter") print('Deg. Coefficient') print(pd.Series(np.r_[lasso_fit.intercept_, lasso_fit.coef_])) print("Lasso parameter:",lasso_fit.alpha_) pyplot.show() #MSE for training and testing data train_error=mean_squared_error(y_train, model.predict(X_train)) test_error=mean_squared_error(y_test, model.predict(X_test)) print ("Traiing data MSE") print (train_error) print ("Testing data MSE") print (test_error)
class FEW(SurvivalMixin, VariationMixin, EvaluationMixin, BaseEstimator):
    """FEW uses GP to find a set of transformations from the original feature
    space that produces the best performance for a given machine learner.

    The estimator evolves a population of feature programs (GP stacks). Each
    generation the wrapped learner ``ml`` is refit on the outputs of the
    valid programs and scored on an internal validation split; the best
    scoring (estimator, feature set) pair is kept and used by ``predict``.
    """
    update_checked = False

    def __init__(self,
                 population_size=50,
                 generations=100,
                 mutation_rate=0.5,
                 crossover_rate=0.5,
                 ml=None,
                 min_depth=1,
                 max_depth=2,
                 max_depth_init=2,
                 sel='epsilon_lexicase',
                 tourn_size=2,
                 fit_choice=None,
                 op_weight=False,
                 seed_with_ml=True,
                 erc=False,
                 random_state=np.random.randint(9999999),
                 verbosity=0,
                 scoring_function=None,
                 disable_update_check=False,
                 elitism=True,
                 boolean=False,
                 classification=False,
                 clean=False,
                 track_diversity=False,
                 mdr=False,
                 otype='f'):
        """Store the GP settings and pick learner-dependent defaults.

        Parameters
        ----------
        population_size: int or str
            Number of individuals. A string containing 'x' (e.g. '2x') is
            resolved in ``fit`` as a multiple of the number of features; any
            other string is converted with ``int``.
        generations: int
            Number of iterations of the main GP loop in ``fit``.
        ml: estimator or None
            Learner to wrap. When None, ``LogisticRegression(solver='sag')``
            is used if ``classification`` is set, otherwise ``LassoLarsCV()``.
        min_depth, max_depth: int
            Bounds of the random depth drawn for generated programs.
        fit_choice: str or None
            Fitness metric name. When falsy it is looked up from the type of
            ``ml``; a learner type missing from that table raises KeyError.
        seed_with_ml: bool
            Seed the initial population from the fitted ``ml`` model (forced
            off in ``fit`` when ``otype`` is 'b').
        erc: bool
            Add one ephemeral random constant to the terminal set.
        random_state: int
            Seed passed to ``np.random.seed`` at the start of ``fit``.
            NOTE(review): the default is evaluated once, when the class is
            defined, so all instances created without an explicit seed in
            one process share the same value.
        verbosity: int
            Higher values print more progress information.
        scoring_function: callable or None
            ``f(y_true, y_pred)`` used by ``score``; defaults to
            ``accuracy_score`` for classification and ``r2_score`` otherwise.
        boolean: bool
            Add the boolean operators to the function set in ``fit``.
        classification: bool
            Selects the classification defaults described above.
        clean: bool
            Impute the features in ``fit`` and ``predict``.
        track_diversity: bool
            Record a diversity value every generation in ``self.diversity``.
        mdr: bool
            Add the 'mdr2' operator to the function set in ``fit``.
        otype: str
            Output type handed to the program builders; 'b' forces
            ``boolean`` on.

        The remaining arguments (mutation_rate, crossover_rate,
        max_depth_init, sel, tourn_size, op_weight, elitism,
        disable_update_check) are stored unchanged; presumably they are
        consumed by the mixin classes - confirm there.
        """
        # Save params to be recalled later by get_params().
        # Must be placed before any local variable definitions.
        self.params = locals()
        self.params.pop('self')

        # The version update check is disabled, so `disable_update_check`
        # currently has no effect beyond being reported by get_params().

        self._best_estimator = None
        self._training_features = None
        self._training_labels = None
        self._best_inds = None
        self.population_size = population_size
        self.generations = generations
        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.max_depth_init = max_depth_init
        self.sel = sel
        self.tourn_size = tourn_size
        self.fit_choice = fit_choice
        self.op_weight = op_weight
        self.seed_with_ml = seed_with_ml
        self.erc = erc
        self.random_state = random_state
        self.verbosity = verbosity
        self.scoring_function = scoring_function
        self.gp_generation = 0
        self.elitism = elitism
        # sentinel fitness assigned to invalid individuals
        self.max_fit = 99999999.666
        self.boolean = boolean
        self.classification = classification
        # NOTE(review): this instance attribute hides the `clean` method
        # defined further down, so `instance.clean(x)` is not callable.
        # Renaming either one would change the public interface.
        self.clean = clean
        self.ml = ml
        self.track_diversity = track_diversity
        self.mdr = mdr
        self.otype = otype

        # if otype is b, boolean functions must be turned on
        if self.otype == 'b':
            self.boolean = True

        # instantiate sklearn estimator according to specified machine learner
        if self.ml is None:
            if self.classification:
                self.ml = LogisticRegression(solver='sag')
            else:
                self.ml = LassoLarsCV()
        if not self.scoring_function:
            if self.classification:
                self.scoring_function = accuracy_score
            else:
                self.scoring_function = r2_score

        # set default fitness metrics for various learners; keyed by class so
        # that no throwaway estimator instances have to be constructed
        if not self.fit_choice:
            self.fit_choice = {
                # regression
                LassoLarsCV: 'mse',
                SVR: 'mae',
                LinearSVR: 'mae',
                KNeighborsRegressor: 'mse',
                DecisionTreeRegressor: 'mse',
                RandomForestRegressor: 'mse',
                # classification
                SGDClassifier: 'r2',
                LogisticRegression: 'r2',
                SVC: 'r2',
                LinearSVC: 'r2',
                RandomForestClassifier: 'r2',
                DecisionTreeClassifier: 'r2',
                DistanceClassifier: 'silhouette',
                KNeighborsClassifier: 'r2',
            }[type(self.ml)]

        # Columns to always ignore when in an operator
        self.non_feature_columns = ['label', 'group', 'guess']

        # function set
        self.func_set = [
            node('+'),
            node('-'),
            node('*'),
            node('/'),
            node('sin'),
            node('cos'),
            node('exp'),
            node('log'),
            node('^2'),
            node('^3'),
            node('sqrt')
        ]

        # terminal set (filled in by fit)
        self.term_set = []
        # diversity
        self.diversity = []

    def _assign_fitness(self, individuals, fitnesses):
        """Store fitness values on individuals, mapping invalid ones to max_fit."""
        for ind, fit in zip(individuals, fitnesses):
            if isinstance(fit, (list, np.ndarray)):
                # calc_fitness returned raw (per-sample) fitness values
                fit[fit < 0] = self.max_fit
                fit[np.isnan(fit)] = self.max_fit
                fit[np.isinf(fit)] = self.max_fit
                ind.fitness_vec = fit
                ind.fitness = np.mean(ind.fitness_vec)
            else:
                ind.fitness = np.nanmin([fit, self.max_fit])

    def fit(self, features, labels):
        """Fit model to data

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix
        labels: array-like {n_samples}
            Target values

        Returns
        -------
        self

        Raises
        ------
        ValueError
            Re-raised (after printing diagnostics) when the wrapped learner
            fails to fit on the population outputs.
        """
        np.random.seed(self.random_state)
        # setup data
        # imputation
        if self.clean:
            features = self.impute_data(features)
        # Train-test split routine for internal validation
        ####
        train_val_data = pd.DataFrame(data=features)
        train_val_data['labels'] = labels
        # give non-string column labels a zero-padded string name
        new_col_names = {}
        for column in train_val_data.columns.values:
            if type(column) != str:
                new_col_names[column] = str(column).zfill(10)
        train_val_data.rename(columns=new_col_names, inplace=True)
        # internal training/validation split
        train_i, val_i = train_test_split(train_val_data.index,
                                          stratify=None,
                                          train_size=0.75,
                                          test_size=0.25)
        x_t = train_val_data.loc[train_i].drop('labels', axis=1).values
        x_v = train_val_data.loc[val_i].drop('labels', axis=1).values
        y_t = train_val_data.loc[train_i, 'labels'].values
        y_v = train_val_data.loc[val_i, 'labels'].values

        # Store the training features and classes for later use
        self._training_features = x_t
        self._training_labels = y_t
        ####

        # set population size
        if type(self.population_size) is str:
            if 'x' in self.population_size:
                # e.g. '2x' means twice the number of features
                self.population_size = int(
                    float(self.population_size[:-1]) * features.shape[1])
            else:
                self.population_size = int(self.population_size)

        if self.verbosity > 0:
            print("population size:", self.population_size)

        # print few settings
        if self.verbosity > 1:
            for arg in self.get_params():
                print('{}\t=\t{}'.format(arg, self.get_params()[arg]))
            print('')

        # initial model
        initial_estimator = copy.deepcopy(self.ml.fit(x_t, y_t))
        self._best_score = self.ml.score(x_v, y_v)
        initial_score = self._best_score
        if self.verbosity > 2:
            print("initial estimator size:", self.ml.coef_.shape)
        if self.verbosity > 0:
            print("initial ML CV: {:1.3f}".format(self._best_score))

        # create terminal set; start from an empty list so that calling fit
        # more than once does not pile up duplicate terminals
        self.term_set = []
        for i in np.arange(x_t.shape[1]):
            self.term_set.append(node('x', loc=i))  # features
        # add ephemeral random constants if flag
        if self.erc:
            self.term_set.append(node(
                'k', value=np.random.rand()))  # ephemeral random constants

        # edit function set if boolean
        if self.boolean or self.otype == 'b':
            # include boolean functions
            self.func_set += [
                node('!'),
                node('&'),
                node('|'),
                node('=='),
                node('>_f'),
                node('<_f'),
                node('>=_f'),
                node('<=_f'),
                node('>_b'),
                node('<_b'),
                node('>=_b'),
                node('<=_b'),
                node('xor_b'),
                node('xor_f')
            ]

        # add mdr if specified
        if self.mdr:
            self.func_set += [node('mdr2')]

        # Create initial population
        # for now, force seed_with_ml to be off if otype is 'b', since data
        # types are assumed to be float
        if self.otype == 'b':
            self.seed_with_ml = False
        pop = self.init_pop(self._training_features.shape[0])

        # check that uuids are unique in population; warn instead of
        # dropping into a debugger, which would hang non-interactive runs
        uuids = [p.id for p in pop.individuals]
        if len(uuids) != len(set(uuids)):
            warnings.warn("FEW: duplicate individual ids in the initial "
                          "population")

        # Evaluate the entire population
        # X represents a matrix of the population outputs
        # (population size x number of samples after the transpose)
        pop.X = self.transform(x_t, pop.individuals, y_t).transpose()

        # calculate fitness of individuals
        fitnesses = self.calc_fitness(pop.X, y_t, self.fit_choice, self.sel)

        # Assign fitnesses to individuals in population
        self._assign_fitness(pop.individuals, fitnesses)

        ####################
        ### Main GP loop
        self.diversity = []
        # progress bar
        pbar = tqdm(total=self.generations,
                    disable=self.verbosity == 0,
                    desc='Internal CV: {:1.3f}'.format(self._best_score))
        # for each generation g
        for g in np.arange(self.generations):
            if self.track_diversity:
                self.get_diversity(pop.X)

            if self.verbosity > 1:
                print(".", end='')
            if self.verbosity > 1:
                print(str(g) + ".)", end='')
            if self.verbosity > 2:
                print("pop fitnesses:",
                      ["%0.2f" % x.fitness for x in pop.individuals])
            if self.verbosity > 1:
                print("median fitness pop: %0.2f" %
                      np.median([x.fitness for x in pop.individuals]))
            if self.verbosity > 1:
                print("best fitness pop: %0.2f" %
                      np.min([x.fitness for x in pop.individuals]))
            if self.verbosity > 1 and self.track_diversity:
                print("feature diversity: %0.2f" % self.diversity[-1])
            if self.verbosity > 1:
                print("ml fitting...")
            # fit ml model on the outputs of the valid individuals
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    if self.valid(pop.individuals):
                        self.ml.fit(
                            pop.X[self.valid_loc(pop.individuals), :].
                            transpose(), y_t)
                except ValueError as detail:
                    # rows of pop.X are individuals, so index the first axis
                    valid_rows = self.valid_loc(pop.individuals)
                    print("warning: ValueError in ml fit. X.shape:",
                          pop.X[valid_rows, :].transpose().shape,
                          "y_t shape:", y_t.shape)
                    print("First ten entries X:",
                          pop.X[valid_rows, :].transpose()[:10])
                    print("First ten entries y_t:", y_t[:10])
                    print("equations:", stacks_2_eqns(pop.individuals))
                    print("FEW parameters:", self.get_params())
                    if self.verbosity > 1:
                        print("---\ndetailed error message:", detail)
                    # re-raise the original error so its message is kept
                    raise

            # keep best model
            tmp_score = 0
            try:
                if self.valid(pop.individuals):
                    tmp_score = self.ml.score(
                        self.transform(
                            x_v,
                            pop.individuals)[:,
                                             self.valid_loc(pop.individuals)],
                        y_v)
            except Exception as detail:
                # best effort: a failed validation score leaves tmp_score at 0
                if self.verbosity > 1:
                    print(detail)

            if self.verbosity > 1:
                print("current ml validation score:", tmp_score)

            if self.valid(pop.individuals) and tmp_score > self._best_score:
                self._best_estimator = copy.deepcopy(self.ml)
                self._best_score = tmp_score
                self._best_inds = copy.deepcopy(self.valid(pop.individuals))
                if self.verbosity > 1:
                    print("updated best internal validation score:",
                          self._best_score)

            # Variation
            if self.verbosity > 2:
                print("variation...")
            offspring, elite, elite_index = self.variation(pop.individuals)

            # evaluate offspring
            if self.verbosity > 2:
                print("output...")
            X_offspring = self.transform(x_t, offspring).transpose()
            if self.verbosity > 2:
                print("fitness...")
            F_offspring = self.calc_fitness(X_offspring, y_t, self.fit_choice,
                                            self.sel)

            # Assign fitnesses to the offspring
            self._assign_fitness(offspring, F_offspring)

            # Survival
            if self.verbosity > 2:
                print("survival..")
            survivors, survivor_index = self.survival(pop.individuals,
                                                      offspring, elite,
                                                      elite_index)
            pop.individuals[:] = survivors
            pop.X = np.vstack((pop.X, X_offspring))[survivor_index, :]
            if self.verbosity > 2:
                print("median fitness survivors: %0.2f" %
                      np.median([x.fitness for x in pop.individuals]))
            if self.verbosity > 2:
                print(
                    "best features:",
                    stacks_2_eqns(self._best_inds)
                    if self._best_inds else 'original')
            pbar.set_description('Internal CV: {:1.3f}'.format(
                self._best_score))
            pbar.update(1)
        # end of main GP loop
        ####################
        if self.verbosity > 0:
            print('finished. best internal val score: {:1.3f}'.format(
                self._best_score))
        if self.verbosity > 0:
            print("final model:\n", self.print_model())
        # no generation beat the initial model: fall back to it
        if self._best_estimator is None:
            self._best_estimator = initial_estimator
        return self

    def transform(self, x, inds=None, labels=None):
        """return a transformation of x using population outputs

        Uses ``inds`` when given (and non-empty), otherwise the best
        individuals found by ``fit``. One output column per individual.
        """
        if inds:
            return np.asarray([
                self.out(I, x, labels, self.otype) for I in inds
            ]).transpose()
        else:
            return np.asarray([
                self.out(I, x, labels, self.otype) for I in self._best_inds
            ]).transpose()

    def impute_data(self, x):
        """Imputes data set containing Nan values"""
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        return imp.fit_transform(x)

    def clean(self, x):
        """remove nan and inf rows from x"""
        return x[~np.any(np.isnan(x) | np.isinf(x), axis=1)]

    def clean_with_zeros(self, x):
        """set nan and inf rows from x to zero (in place) and return x"""
        # select the rows that DO contain a nan/inf value
        x[np.any(np.isnan(x) | np.isinf(x), axis=1)] = 0
        return x

    def predict(self, testing_features):
        """predict on a holdout data set."""
        if self.clean:
            testing_features = self.impute_data(testing_features)

        if self._best_inds:
            X_transform = self.transform(testing_features)
            try:
                return self._best_estimator.predict(X_transform)
            except ValueError:
                print('shape of X:', testing_features.shape)
                print('shape of X_transform:', X_transform.transpose().shape)
                print('best inds:', stacks_2_eqns(self._best_inds))
                print('valid locs:', self.valid_loc(self._best_inds))
                raise
        else:
            # no engineered features were kept: predict on the raw features
            return self._best_estimator.predict(testing_features)

    def fit_predict(self, features, labels):
        """Convenience function that fits a pipeline then predicts on the
        provided features

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix
        labels: array-like {n_samples}
            List of class labels for prediction

        Returns
        ----------
        array-like: {n_samples}
            Predicted labels for the provided features

        """
        self.fit(features, labels)
        return self.predict(features)

    def score(self, testing_features, testing_labels):
        """estimates accuracy on testing set using self.scoring_function"""
        yhat = self.predict(testing_features)
        return self.scoring_function(testing_labels, yhat)

    def export(self, output_file_name):
        """exports engineered features

        Parameters
        ----------
        output_file_name: string
            String containing the path and file name of the desired output
            file

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If no model has been fit yet.
        """
        if self._best_estimator is None:
            raise ValueError(
                'A model has not been optimized. Please call fit() first.')

        # Write print_model() to file
        with open(output_file_name, 'w') as output_file:
            output_file.write(self.print_model())
        # if decision tree, print tree into dot file
        if 'DecisionTree' in type(self.ml).__name__:
            export_graphviz(self._best_estimator,
                            out_file=output_file_name + '.dot',
                            feature_names=stacks_2_eqns(self._best_inds)
                            if self._best_inds else None,
                            class_names=['True', 'False'],
                            filled=False,
                            impurity=True,
                            rotate=True)

    def init_pop(self, num_features=1):
        """initializes population of features as GP stacks."""
        pop = Pop(self.population_size, num_features)
        # make programs
        if self.seed_with_ml:
            # initial population is the components of the default ml model
            if type(self.ml) == type(LassoLarsCV()):
                # add all model components with non-zero coefficients; keep
                # the feature index of each one so the seeded terminal points
                # at the feature that actually carries the coefficient
                nonzero_locs = [
                    j for j, c in enumerate(self.ml.coef_) if c != 0
                ]
                for loc, p in it.zip_longest(nonzero_locs,
                                             pop.individuals,
                                             fillvalue=None):
                    if loc is not None and p is not None:
                        p.stack = [node('x', loc=loc)]
                    elif p is not None:
                        # make program if pop is bigger than model components
                        make_program(
                            p.stack, self.func_set, self.term_set,
                            np.random.randint(self.min_depth,
                                              self.max_depth + 1), self.otype)
                        p.stack = list(reversed(p.stack))
            else:  # seed with raw features
                try:
                    if self.population_size < self.ml.coef_.shape[0]:
                        # seed pop with highest coefficients
                        # NOTE(review): the loop seeds feature `i` (the
                        # position), not `c` (the argsort result), and the
                        # argsort is taken over the reversed, signed
                        # coefficients - this does not look like "highest
                        # coefficients"; left unchanged because coef_ may be
                        # 2-D for classifiers. Confirm intended behaviour.
                        coef_order = np.argsort(self.ml.coef_[::-1])
                        for i, (c, p) in enumerate(
                                zip(coef_order, pop.individuals)):
                            p.stack = [node('x', loc=i)]
                    else:
                        raise (AttributeError)
                except Exception:  # seed pop with raw features
                    for i, p in it.zip_longest(range(
                            self._training_features.shape[1]),
                                               pop.individuals,
                                               fillvalue=None):
                        if p is not None:
                            if i is not None:
                                p.stack = [node('x', loc=i)]
                            else:
                                make_program(
                                    p.stack, self.func_set, self.term_set,
                                    np.random.randint(self.min_depth,
                                                      self.max_depth + 1),
                                    self.otype)
                                p.stack = list(reversed(p.stack))

            # print initial population
            if self.verbosity > 2:
                print("seeded initial population:",
                      stacks_2_eqns(pop.individuals))

        else:
            for I in pop.individuals:
                depth = np.random.randint(self.min_depth, self.max_depth + 1)
                make_program(I.stack, self.func_set, self.term_set, depth,
                             self.otype)
                I.stack = list(reversed(I.stack))

        return pop

    def print_model(self, sep='\n'):
        """prints model contained in best inds, if ml has a coefficient
        property. otherwise, prints the features generated by FEW.

        Returns the string 'original features' when no engineered features
        were kept by fit.
        """
        model = ''
        if self._best_inds:
            if type(self.ml).__name__ != 'SVC' and type(
                    self.ml).__name__ != 'SVR':
                # this is needed because svm has a bug that throws valueerror
                # on attribute check
                if hasattr(self.ml, 'coef_'):
                    # view 1-D coefficients as a single row so that the
                    # single-model case is handled uniformly (including a
                    # model with exactly one feature)
                    coef_rows = np.atleast_2d(self._best_estimator.coef_)
                    if coef_rows.shape[0] == 1:
                        row = coef_rows[0]
                        s = np.argsort(np.abs(row))[::-1]
                        scoef = row[s]
                        bi = [self._best_inds[k] for k in s]
                        model = (' +' + sep).join([
                            str(round(c, 3)) + '*' + stack_2_eqn(f)
                            for f, c in zip(bi, scoef) if round(c, 3) != 0
                        ])
                    else:
                        # more than one decision function is fit. print all.
                        for j, coef in enumerate(coef_rows):
                            s = np.argsort(np.abs(coef))[::-1]
                            scoef = coef[s]
                            bi = [self._best_inds[k] for k in s]
                            # pair each feature with its own (sorted)
                            # coefficient
                            model += sep + 'class' + str(
                                j) + ' :' + ' + '.join([
                                    str(round(c, 3)) + '*' + stack_2_eqn(f)
                                    for f, c in zip(bi, scoef) if c != 0
                                ])
                elif hasattr(self._best_estimator, 'feature_importances_'):
                    s = np.argsort(
                        self._best_estimator.feature_importances_)[::-1]
                    sfi = self._best_estimator.feature_importances_[s]
                    bi = [self._best_inds[k] for k in s]
                    model = 'importance : feature\n'
                    model += sep.join([
                        str(round(c, 3)) + '\t:\t' + stack_2_eqn(f)
                        for f, c in zip(bi, sfi) if round(c, 3) != 0
                    ])
                else:
                    return stacks_2_eqns(self._best_inds)
            else:
                return stacks_2_eqns(self._best_inds)
        else:
            return 'original features'

        return model

    def representation(self):
        """return stacks_2_eqns output"""
        return stacks_2_eqns(self._best_inds)

    def valid_loc(self, individuals):
        """returns the indices of individuals with valid fitness."""
        return [
            index for index, i in enumerate(individuals)
            if i.fitness < self.max_fit and i.fitness >= 0
        ]

    def valid(self, individuals):
        """returns the sublist of individuals with valid fitness."""
        return [
            i for i in individuals
            if i.fitness < self.max_fit and i.fitness >= 0
        ]

    def get_params(self, deep=None):
        """Get parameters for this estimator

        This function is necessary for FEW to work as a drop-in feature
        constructor in, e.g., sklearn.model_selection.cross_val_score

        Parameters
        ----------
        deep: unused
            Only implemented to maintain interface for sklearn

        Returns
        -------
        params: mapping of string to any
            Parameter names mapped to their values
        """
        return self.params

    def get_diversity(self, X):
        """compute mean diversity of individual outputs

        Compares the first row of X with every other row (r2_score clipped
        at 0) and appends one minus the mean of those values to
        ``self.diversity``.
        """
        n_others = X.shape[0] - 1
        feature_correlations = np.zeros(n_others)
        # row i is stored in slot i - 1 so that every slot is filled and the
        # last row is compared as well
        for i in range(1, X.shape[0]):
            feature_correlations[i - 1] = max(0.0, r2_score(X[0], X[i]))
        self.diversity.append(1 - np.mean(feature_correlations))
plt.title('Regression Coefficients Progression for Lasso Paths')

# Plot the mean square error of every fold, then their average, against
# -log10 of the alphas tried during cross-validation.
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
per_fold_mse = model.cv_mse_path_
plt.plot(m_log_alphascv, per_fold_mse, ':')
plt.plot(m_log_alphascv, per_fold_mse.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
# dashed vertical line at the alpha selected by cross-validation
chosen_log_alpha = -np.log10(model.alpha_)
plt.axvline(chosen_log_alpha, linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')

# MSE from training and test data
from sklearn.metrics import mean_squared_error

predicted_train = model.predict(training_data)
predicted_test = model.predict(test_data)
train_error = mean_squared_error(training_target, predicted_train)
test_error = mean_squared_error(test_target, predicted_test)
for caption, metric in (('training data MSE', train_error),
                        ('test data MSE', test_error)):
    print(caption)
    print(metric)

# R-square from training and test data
rsquared_train = model.score(training_data, training_target)
rsquared_test = model.score(test_data, test_target)
for caption, metric in (('training data R-square', rsquared_train),
                        ('test data R-square', rsquared_test)):
    print(caption)
    print(metric)
# Fit a cross-validated LassoLars model on a noisy one-feature synthetic
# regression problem, print train/test R^2 and the selected alpha, and plot
# the fitted line over the train and test points.
from sklearn.linear_model import LassoLarsCV
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

X, y = make_regression(n_features=1, noise=4.0, random_state=0)
# y is kept as a column vector for the shape print-out and the scatter plots
y = y.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=100)

# The estimator expects a 1-D target of shape (n_samples,); flatten the
# column vector here instead of relying on an implicit conversion (which
# emits a DataConversionWarning).
reg = LassoLarsCV(cv=5).fit(X_train, y_train.ravel())
print(reg.score(X_train, y_train))
print(reg.score(X_test, y_test))
print(reg.alpha_)

y_pred = reg.predict(X)

print(X_train.shape, y_train.shape)

plt.scatter(X_train, y_train, label='train')
plt.scatter(X_test, y_test, label='test')
plt.plot(X, y_pred)
plt.show()
# NOTE(review): the next three statements read like the body of a loop over
# candidate alphas whose header is not part of this fragment - confirm
# against the complete script.
y_pred_lasso = lasso.fit(train_features, train_targets).predict(test_features)
r2_score_lasso = r2_score(test_targets, y_pred_lasso)
r2_values_store.append(r2_score_lasso)
# print(r2_values_store)

# R^2 on the test data for every alpha that was tried
plt.figure()
# plt.figure(figsize=(32,18), dpi=1200)  # used to expose the figure at higher resolution
plt.plot(m_log_alphas_modified, r2_values_store)
plt.xlabel('alpha values')
plt.ylabel('R^2 values')
plt.show()

# The estimator chose automatically its lambda; it is reported on the same
# -log10 scale that is used for the plots.
tune_parameter = -np.log10(model.alpha_)
print("Tuned parameter obtained using cross validation : %f" % tune_parameter)

# To evaluate the cross validation performance
Cross_validation_perfomance = model.score(test_features, test_targets)
print("Cross validation perfomance : %f" % Cross_validation_perfomance)

alpha = tune_parameter
# Refit a plain Lasso with the regularisation strength selected by
# cross-validation. `Lasso(alpha=...)` takes the penalty weight itself, so
# pass model.alpha_ and not its -log10 transform (which is only a plotting
# scale and can even be negative).
lasso = Lasso(alpha=model.alpha_)
y_pred_lasso = lasso.fit(train_features, train_targets).predict(test_features)
r2_score_lasso = r2_score(test_targets, y_pred_lasso)
print(lasso)
print("R^2 on test data : %f" % r2_score_lasso)

#plt.plot(lasso.coef_, label='Lasso coefficients')
#plt.xlabel('alpha values')
#plt.ylabel('R^2')
#plt.plot(m_log_alphas, r2_score_lasso)
#plt.legend(loc='best')
#plt.show()
# NOTE(review): this fragment opens with the trailing arguments of a call
# whose beginning is not present here (presumably a plt.plot(...) of the
# fold-averaged CV error) - confirm against the complete script.
model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
# dashed vertical line at -log10 of the alpha stored on the model
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
# There is variability across the individual CV folds, but the error follows
# the same pattern as variables are added:
# it decreases rapidly and then levels off to the point where adding more
# predictors no longer reduces the MSE.

# 3.5 MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print('training data MSE')
print(train_error)
print('test data MSE')
print(test_error)
# similar accuracy

# 3.6 R-square from training and test data
rsquared_train = model.score(pred_train, tar_train)
rsquared_test = model.score(pred_test, tar_test)
print('training data R-square')
print(rsquared_train)
print('test data R-square')
print(rsquared_test)
# more accurate than training data
# Models that performed substantially worse
# model = LinearSVC()
# model = KNeighborsClassifier(n_neighbors = 3)
# model = GaussianNB()
# model = LogisticRegression()
# model = SVC()

# ## Fit/Accuracy

# In[32]:

model.fit(train_X, train_y)

# Print the Training Set Accuracy and the Test Set Accuracy in order to
# understand overfitting
print(model.score(train_X, train_y), model.score(valid_X, valid_y))

# In[33]:

# Build the submission file: one row per test record with its predicted
# SalePrice. The ids are kept in `test_ids` so the builtin `id` is not
# shadowed.
test_ids = test_X.Id
result = model.predict(test_X)

# `columns=` fixes the column order in the same step as construction
output = pd.DataFrame({'id': test_ids, 'SalePrice': result},
                      columns=['id', 'SalePrice'])

output.to_csv("solution.csv", index=False)
output.head(10)

# ## Conclusion

#