def test_simple_vs_refined_algorithm(theta, fit_path):
    # Test the consistency of the results between the 2 versions of
    # the algorithm.

    # Simple Algorithm (2 steps of Lasso Lars)
    lasso1 = LassoLars(alpha=alpha)
    lasso1.fit(X_train, y_train)

    X1 = X_train.copy()
    X1[:, lasso1.coef_ == 0] = 0

    lasso2 = LassoLars(alpha=alpha * theta)
    lasso2.fit(X1, y_train)
    pred_simple = lasso2.predict(X_test)

    # Refined Algorithm
    relasso = RelaxedLassoLars(alpha=alpha, theta=theta, fit_path=fit_path)
    relasso.fit(X_train, y_train)
    pred_refined = relasso.predict(X_test)

    assert_array_almost_equal(pred_simple, pred_refined)
    assert_array_almost_equal(lasso2.coef_, relasso.coef_)
    assert_almost_equal(lasso2.score(X_test, y_test),
                        relasso.score(X_test, y_test), decimal=2)
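# The test above relies on a RelaxedLassoLars estimator and module-level
# fixtures (alpha, X_train, X_test, y_train, y_test) defined elsewhere;
# RelaxedLassoLars is not part of scikit-learn. For reference, the "simple
# algorithm" it is compared against can be written as a standalone helper
# using only stock LassoLars -- a minimal illustrative sketch, with the
# function name and signature chosen here rather than taken from the source.
from sklearn.linear_model import LassoLars


def relaxed_lasso_lars_predict(X_train, y_train, X_test, alpha, theta):
    # Step 1: variable selection with the full penalty alpha.
    selector = LassoLars(alpha=alpha)
    selector.fit(X_train, y_train)

    # Zero out the columns that were not selected in step 1.
    X_selected = X_train.copy()
    X_selected[:, selector.coef_ == 0] = 0

    # Step 2: refit on the selected variables with the relaxed penalty
    # alpha * theta (0 < theta <= 1) and predict.
    estimator = LassoLars(alpha=alpha * theta)
    estimator.fit(X_selected, y_train)
    return estimator.predict(X_test)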
# LassoLars Regression
# Least Angle Regression (LARS) can be used as an alternative method for
# computing the Least Absolute Shrinkage and Selection Operator (LASSO) fit.
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LassoLars

# load the diabetes dataset
dataset = datasets.load_diabetes()

# fit a LASSO-via-LARS model to the data
model = LassoLars(alpha=0.1)
model.fit(dataset.data, dataset.target)
print(model)

# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)

# summarize the fit of the model
mse = np.mean((predicted - expected) ** 2)
print(mse)
print(model.score(dataset.data, dataset.target))
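# Since LARS computes the exact Lasso solution path, a coordinate-descent
# Lasso and a LassoLars fitted with the same alpha should agree closely.
# A small sketch of that comparison on the same diabetes data (illustrative
# only; the size of the difference depends on solver tolerances):
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso, LassoLars

X, y = load_diabetes(return_X_y=True)

# Same objective, two different solvers: coordinate descent vs. LARS.
cd_lasso = Lasso(alpha=0.1).fit(X, y)
lars_lasso = LassoLars(alpha=0.1).fit(X, y)

print(cd_lasso.coef_)
print(lars_lasso.coef_)
# Largest coefficient difference; expected to be small relative to the
# coefficient scale.
print(np.max(np.abs(cd_lasso.coef_ - lars_lasso.coef_)))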
import datetime
import math

import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import (ARDRegression, BayesianRidge, Lasso,
                                  LassoLars, LinearRegression,
                                  OrthogonalMatchingPursuit, Ridge,
                                  SGDRegressor)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures


def task2(data):
    df = data
    dfreg = df.loc[:, ['Adj Close', 'Volume']].copy()
    dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    # Replace missing values with an outlier placeholder
    dfreg.fillna(value=-99999, inplace=True)

    # We want to set aside 1 percent of the data to forecast
    forecast_out = int(math.ceil(0.01 * len(dfreg)))

    # Separate the label here; we want to predict the Adj Close
    forecast_col = 'Adj Close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(columns=['label']))

    # Scale X so that all features have the same distribution for linear regression
    X = preprocessing.scale(X)

    # Split off the late X (to forecast) from the early X (for model fitting and evaluation)
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate the label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # Linear regression
    clfreg = LinearRegression(n_jobs=-1)
    clfreg.fit(X_train, y_train)

    # Quadratic Regression 2
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
    clfpoly2.fit(X_train, y_train)

    # Quadratic Regression 3
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
    clfpoly3.fit(X_train, y_train)

    # KNN Regression
    clfknn = KNeighborsRegressor(n_neighbors=2)
    clfknn.fit(X_train, y_train)

    # Lasso Regression
    clflas = Lasso()
    clflas.fit(X_train, y_train)

    # Multitask Lasso Regression
    # clfmtl = MultiTaskLasso(alpha=1.)
    # clfmtl.fit(X_train, y_train).coef_

    # Bayesian Ridge Regression
    clfbyr = BayesianRidge()
    clfbyr.fit(X_train, y_train)

    # Lasso LARS Regression
    clflar = LassoLars(alpha=.1)
    clflar.fit(X_train, y_train)

    # Orthogonal Matching Pursuit Regression
    clfomp = OrthogonalMatchingPursuit(n_nonzero_coefs=2)
    clfomp.fit(X_train, y_train)

    # Automatic Relevance Determination Regression
    clfard = ARDRegression(compute_score=True)
    clfard.fit(X_train, y_train)

    # Logistic Regression
    # clflgr = linear_model.LogisticRegression(penalty='l1', solver='saga',
    #                                          tol=1e-6, max_iter=int(1e6),
    #                                          warm_start=True)
    # coefs_ = []
    # for c in cs:
    #     clflgr.set_params(C=c)
    #     clflgr.fit(X_train, y_train)
    #     coefs_.append(clflgr.coef_.ravel().copy())

    # Stochastic Gradient Descent Regression
    clfsgd = SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)
    clfsgd.fit(X_train, y_train)

    # Compute confidence (R^2) scores on the test split
    confidencereg = clfreg.score(X_test, y_test)
    confidencepoly2 = clfpoly2.score(X_test, y_test)
    confidencepoly3 = clfpoly3.score(X_test, y_test)
    confidenceknn = clfknn.score(X_test, y_test)
    confidencelas = clflas.score(X_test, y_test)
    # confidencemtl = clfmtl.score(X_test, y_test)
    confidencebyr = clfbyr.score(X_test, y_test)
    confidencelar = clflar.score(X_test, y_test)
    confidenceomp = clfomp.score(X_test, y_test)
    confidenceard = clfard.score(X_test, y_test)
    confidencesgd = clfsgd.score(X_test, y_test)

    # Results
    print('The linear regression confidence is:', confidencereg * 100)
    print('The quadratic regression 2 confidence is:', confidencepoly2 * 100)
    print('The quadratic regression 3 confidence is:', confidencepoly3 * 100)
    print('The knn regression confidence is:', confidenceknn * 100)
    print('The lasso regression confidence is:', confidencelas * 100)
    # print('The multitask lasso regression confidence is:', confidencemtl * 100)
    print('The Bayesian Ridge regression confidence is:', confidencebyr * 100)
    print('The Lasso LARS regression confidence is:', confidencelar * 100)
    print('The OMP regression confidence is:', confidenceomp * 100)
    print('The ARD regression confidence is:', confidenceard * 100)
    print('The SGD regression confidence is:', confidencesgd * 100)

    # Create the forecast columns
    forecast_reg = clfreg.predict(X_lately)
    forecast_pol2 = clfpoly2.predict(X_lately)
    forecast_pol3 = clfpoly3.predict(X_lately)
    forecast_knn = clfknn.predict(X_lately)
    forecast_las = clflas.predict(X_lately)
    forecast_byr = clfbyr.predict(X_lately)
    forecast_lar = clflar.predict(X_lately)
    forecast_omp = clfomp.predict(X_lately)
    forecast_ard = clfard.predict(X_lately)
    forecast_sgd = clfsgd.predict(X_lately)

    # Append the linear-regression forecast, adding one new row per forecast day
    dfreg['Forecast_reg'] = np.nan
    last_date = dfreg.iloc[-1].name
    next_unix = last_date + datetime.timedelta(days=1)
    for i in forecast_reg:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))]
        dfreg.loc[next_date, 'Forecast_reg'] = i

    # Fill the remaining forecast columns into the rows created above.
    # Note: iloc[-26] assumes forecast_out == 26, as hard-coded in the original.
    other_forecasts = {
        'Forecast_pol2': forecast_pol2,
        'Forecast_pol3': forecast_pol3,
        'Forecast_knn': forecast_knn,
        'Forecast_las': forecast_las,
        'Forecast_byr': forecast_byr,
        'Forecast_lar': forecast_lar,
        'Forecast_omp': forecast_omp,
        'Forecast_ard': forecast_ard,
        'Forecast_sgd': forecast_sgd,
    }
    for column, forecast in other_forecasts.items():
        dfreg[column] = np.nan
        last_date = dfreg.iloc[-26].name
        next_unix = last_date + datetime.timedelta(days=1)
        for i in forecast:
            next_date = next_unix
            next_unix += datetime.timedelta(days=1)
            dfreg.loc[next_date, column] = i

    return ([d.strftime('%Y-%m-%d') for d in dfreg.index],
            dfreg['Adj Close'].to_list(),
            dfreg['Forecast_reg'].to_list(),
            dfreg['Forecast_pol2'].to_list(),
            dfreg['Forecast_pol3'].to_list(),
            dfreg['Forecast_knn'].to_list(),
            dfreg['Forecast_las'].to_list(),
            dfreg['Forecast_byr'].to_list(),
            dfreg['Forecast_lar'].to_list(),
            dfreg['Forecast_omp'].to_list(),
            dfreg['Forecast_ard'].to_list(),
            dfreg['Forecast_sgd'].to_list())
#!/usr/bin/env python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLars

data = pd.read_csv("dataset.csv", header=0)

X = data.loc[:, ["Commune", "Etage", "Superficie", "Piece"]].values
Y = data.loc[:, "Prix"].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

regressor = LassoLars(alpha=0.1)
regressor.fit(X_train, Y_train)

score = regressor.score(X_test, Y_test)
print(score)
from sklearn.linear_model import LinearRegression, LassoLars

l_reg = LinearRegression()
lasso_reg = LassoLars()

# In[380]:

l_reg.fit(X_train, y_train)

# In[382]:

l_reg.score(X_test, y_test)

# In[383]:

lasso_reg.fit(X_train, y_train)

# In[384]:

lasso_reg.score(X_test, y_test)
model_EN.fit(X_train, y_train)
model_EN.score(X_train, y_train)
print("Score of Elastic-net on train data: ", model_EN.score(X_train, y_train))
print("Score of Elastic-net on test data: ", model_EN.score(X_test, y_test))
print("L1 ratio: ", models.l1_ratio_)
print("Alpha: ", models.alpha_)

# At this point we should fit our final model on the entire dataset
# using the previously tuned parameters

# Lasso-Lars
models = LassoLarsCV(max_n_alphas=40, verbose=1, cv=folds)
models.fit(X_train, y_train)
model_LL = LassoLars(alpha=models.alpha_)
model_LL.fit(X_train, y_train)
print("Score of Lasso-Lars on train data: ", model_LL.score(X_train, y_train))
print("Score of Lasso-Lars on test data: ", model_LL.score(X_test, y_test))

'''
=============== PCA ==================
The first 3 components explain only ~33% of the variance when applied to the
non-normalized data. You can test it by copying the following code into
preprocessing.py:

from sklearn.decomposition import PCA
pca = PCA(3)
X = pca.fit_transform(data.iloc[:, 1:])
print(pca.explained_variance_ratio_)

I was actually surprised by how poorly it performed, so I even wrote a simple
MATLAB script because it didn't feel right, but it outputs exactly the same
numbers.
'''
mse = mean_squared_error(y_test, lasso_pred)
print("Root Mean Squared Error: ", np.sqrt(mse))

fig = plt.figure(figsize=[10, 8])
ax = plt.subplot(111)
ax.plot(y_test.index, lasso_pred, label='Predicted')
ax.plot(y_test, label='Test')
ax.legend()
plt.show()

# Evaluation
confidence_lr = lr.score(X_test, y_test)
confidence_poly2 = poly2.score(X_test, y_test)
confidence_poly3 = poly3.score(X_test, y_test)
confidence_knn = knn.score(X_test, y_test)
confidence_lasso = lasso.score(X_test, y_test)

print("Results: ", confidence_lr, confidence_poly2, confidence_poly3,
      confidence_knn, confidence_lasso)

# All predictions on one graph
fig = plt.figure(figsize=[10, 8])
ax = plt.subplot(111)
ax.plot(y_test.index, lasso_pred, label='Lasso', color='red')
ax.plot(y_test.index, knn_pred, label='KNN', color='blue')
ax.plot(y_test.index, poly2_pred, label='Poly2', color='green')
ax.plot(y_test.index, poly3_pred, label='Poly3', color='orange')
ax.plot(y_test.index, y_pred_lr, label='LR', color='cyan')
ax.plot(y_test, label='Test', color='magenta')
ax.legend()
from sklearn.datasets import load_boston
from sklearn.linear_model import LassoLars
from sklearn.model_selection import train_test_split

# Prepare the data
boston = load_boston()
X, Y = boston.data, boston.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3)

'''
Lasso regression via LARS:
Not sure what else to say here, since I am not familiar with it at all.
'''
rg = LassoLars(alpha=1.0, fit_intercept=True, verbose=False, normalize=True,
               precompute='auto', max_iter=500, eps=2.2204460492503131e-16,
               copy_X=True, fit_path=True, positive=False)
rg.fit(X_train, Y_train)
Y_pre = rg.predict(X_test)
rg.score(X_test, Y_test)
rg.coef_
rg.intercept_

'''
Parameters:
    alpha
    fit_intercept
    verbose
    normalize
    precompute
    max_iter
    eps
    copy_X
    fit_path
    positive
'''
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, scale
from sklearn.model_selection import train_test_split

poly = PolynomialFeatures(include_bias=False)
X_poly = poly.fit_transform(scale(X))
print(X_poly.shape)  # 104 features

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state=0)

################ Lasso-Lars ################
from sklearn.linear_model import LassoLars
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

lasso_lars = LassoLars(alpha=0.01)
lasso_lars.fit(X_train, y_train)
lasso_lars.score(X_test, y_test)
np.mean(cross_val_score(lasso_lars, X_train, y_train, cv=10))

###### Tuning the alpha parameter
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': np.logspace(-3, 3, 13)}
grid = GridSearchCV(lasso_lars, param_grid, cv=10)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
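# As an alternative to the grid search above, scikit-learn's LassoLarsCV
# selects alpha by cross-validation directly along the LARS path, which is
# usually cheaper because the whole path is computed anyway. A minimal
# sketch on the same train/test split (cv=10 simply mirrors the grid search):
from sklearn.linear_model import LassoLarsCV

lasso_lars_cv = LassoLarsCV(cv=10)
lasso_lars_cv.fit(X_train, y_train)

print(lasso_lars_cv.alpha_)               # alpha chosen by cross-validation
print(lasso_lars_cv.score(X_test, y_test))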
lm = lm.fit(X_final, y)

# Plot the regression coefficients
fig2, ax2 = plt.subplots()
plt.bar(ivs, lm.coef_)
plt.tight_layout()

# Visualize the dim-reduced data, pairwise, along the dimensions with the
# greatest coefficients
indices = np.argsort(np.abs(lm.coef_))

y_tmp = y - np.min(y)
y_tmp /= np.max(y_tmp)
cmap = plt.cm.inferno
colors = cmap(y_tmp)
# colors[:, 3] = y_tmp

fig, axes = plt.subplots(nrows=3)

plt.sca(axes[0])
plt.scatter(X_final[:, indices[-1]], X_final[:, indices[-2]], c=colors)
axes[0].set_xlabel(ivs[indices[-1]])
axes[0].set_ylabel(ivs[indices[-2]])

plt.sca(axes[1])
plt.scatter(X_final[:, indices[-1]], X_final[:, indices[-3]], c=colors)
axes[1].set_xlabel(ivs[indices[-1]])
axes[1].set_ylabel(ivs[indices[-3]])

plt.sca(axes[2])
plt.scatter(X_final[:, indices[-2]], X_final[:, indices[-3]], c=colors)
axes[2].set_xlabel(ivs[indices[-2]])
axes[2].set_ylabel(ivs[indices[-3]])

plt.tight_layout()

print(lm.score(X_final, y))
plt.show()
print "R^2: ", r2 print "\n**********测试LassoLars类**********" # 在初始化LassoLars类时, 指定超参数α, 默认值是1.0. lassoLars = LassoLars(alpha=0.005) # 拟合训练集 lassoLars.fit(train_X, train_Y) # 打印模型的系数 print "系数:", lassoLars.coef_ print "截距:", lassoLars.intercept_ print '训练集R2: ', r2_score(train_Y, lassoLars.predict(train_X)) # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者 # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏. test_Y_pred = lassoLars.predict(test_X) print "测试集得分:", lassoLars.score(test_X, test_Y) print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred) print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)) print "测试集R2:", r2_score(test_Y, test_Y_pred) tss, rss, ess, r2 = xss(Y, lassoLars.predict(X)) print "TSS(Total Sum of Squares): ", tss print "RSS(Residual Sum of Squares): ", rss print "ESS(Explained Sum of Squares): ", ess print "R^2: ", r2 print "\n**********测试LassoLarsCV类**********" lassoLarscv = LassoLarsCV(cv=5) # 拟合训练集 lassoLarscv.fit(train_X, train_Y.values.ravel()) # 打印模型的系数