import pandas as pd
from sklearn.linear_model import LassoLarsIC


def LassoLarsIC_df(X, y, criterion):
    """
    Passes the inputs into sklearn's LassoLarsIC model selection function.
    Returns the R-squared score, intercept and all coefficients as a DataFrame,
    as well as a list containing the features with non-zero coefficients.
    """
    model = LassoLarsIC(criterion=criterion)
    model.fit(X, y.iloc[:, 0])
    # .score() returns the R-squared of the fit (not the residual sum of squares)
    model_r2 = model.score(X, y.iloc[:, 0])

    results = [model_r2] + [model.intercept_] + list(model.coef_)
    results_cols = ['r2', 'intercept'] + list(X.columns)
    results_df = pd.DataFrame.from_dict({'results': results},
                                        orient="index",
                                        columns=results_cols)

    # Skip the score and intercept columns, then keep only features with non-zero coefficients
    coef_row = results_df.iloc[0, 2:]
    remaining_features = list(coef_row[coef_row != 0].index)

    return results_df, remaining_features
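# A minimal usage sketch for LassoLarsIC_df (illustrative only): the synthetic
# frames X_demo and y_demo below are assumptions, not data from the original project.
import numpy as np

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.normal(size=(100, 3)), columns=['f1', 'f2', 'f3'])
y_demo = pd.DataFrame({'target': 2.0 * X_demo['f1'] + rng.normal(size=100)})

results_df, nonzero_features = LassoLarsIC_df(X_demo, y_demo, criterion='bic')
print(results_df)
print(nonzero_features)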
import datetime

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
from sklearn import preprocessing
from sklearn.linear_model import LassoLarsIC
from sklearn.model_selection import train_test_split


def trainData(fileName):
    df = pd.read_csv(fileName, index_col='date')
    df = df.sort_index()
    df = df[[
        'open', 'high', 'close', 'low', 'volume', 'price_change', 'p_change',
        'ma5', 'ma10', 'ma20', 'v_ma5', 'v_ma10', 'v_ma20', 'turnover'
    ]]
    df = df[['open', 'high', 'low', 'close', 'volume']]
    df['HL_PCT'] = (df['high'] - df['low']) / df['close'] * 100.0
    df['PCT_change'] = (df['close'] - df['open']) / df['open'] * 100.0
    df = df[['close', 'HL_PCT', 'PCT_change', 'volume']]
    # print(df.head())

    forecast_col = 'close'
    df.fillna(value=-99999, inplace=True)
    # forecast_out = int(math.ceil(0.01 * len(df)))
    forecast_out = 1  # number of periods ahead to forecast
    df['label'] = df[forecast_col].shift(-forecast_out)
    print(df.shape)
    print(df)

    X = np.array(df.drop(columns=['label']))
    X = preprocessing.scale(X)
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]
    df.dropna(inplace=True)
    print(X)
    print(X_lately)

    y = np.array(df['label'])
    # print(y)
    print(X.shape)
    print(y.shape)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = LassoLarsIC(max_iter=100)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    joblib.dump(clf, "%s.m" % fileName)
    print(accuracy, "---------score------")

    forecast_set = clf.predict(X_lately)
    print(forecast_out)

    style.use('ggplot')
    df['Forecast'] = np.nan

    last_date = df.iloc[-1].name
    date_time = datetime.datetime.strptime(last_date, '%Y-%m-%d')
    last_unix = date_time.timestamp()
    one_day = 86400
    next_unix = last_unix + one_day
    print(forecast_set)

    for i in forecast_set:
        next_date = datetime.datetime.fromtimestamp(next_unix)
        next_unix += one_day
        df.loc[next_date] = [np.nan for _ in range(len(df.columns) - 1)] + [i]

    print(df.tail(forecast_out))
    df['close'].plot()
    df['Forecast'].plot()
    plt.show()
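# Usage sketch for trainData (illustrative): 'stock_data.csv' is a placeholder
# filename, not from the original code. The CSV is expected to have a 'date'
# column plus the OHLCV / moving-average columns selected above.
# trainData('stock_data.csv')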
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso, LassoLarsIC
from sklearn.model_selection import KFold, cross_val_score


def modeling(x_train,
             x_test,
             y_train,
             y_test,
             poly_order=1,
             criterion='bic',
             iterations=1000,
             lars_ic=False,
             lasso_alpha=None,
             kfold=True,
             k_n_splits=2,
             k_scoring='r2',
             var_name=None,
             scale=True):
    """
    Produces a tuple of linear regression results plus train and test data.
    Assumes that the data has already been split. The default arguments return
    a model trained on scaled data using LASSO linear regression with K-fold
    cross-validation.

    x_train - input variables for model training (expects pandas Series/DataFrame)
    x_test - input variables for model testing (expects pandas Series/DataFrame)
    y_train - target variable for model training (expects pandas Series)
    y_test - target variable for model testing (expects pandas Series)
    poly_order - order of the polynomial transform applied to x_train and x_test
    criterion - which information criterion is used to pick the best model; default is BIC
    iterations - maximum number of iterations for minimizing the cost function
    lars_ic - (bool) whether to use Sklearn's LassoLarsIC, which computes an
        information criterion and uses it to determine the optimal alpha
    lasso_alpha - alpha for the plain Lasso model used when lars_ic is False
    kfold - (bool) whether to use the Sklearn KFold object to cross-validate the training data
    k_n_splits - how many splits to use for K-fold validation
    k_scoring - which metric of model fit the KFold object should return; default is R2
    var_name - the name the tested variable will have in the resulting pandas DataFrame;
        if None, defaults to a name derived from the y_test series
    scale - (bool) whether to scale the data; uses StandardScaler

    Returns a tuple of objects. The printout for every option indicates the
    respective indices for accessing the different items.
    """
    if var_name is None:
        var_name = f'{y_test.name[0:4]}_polyO{str(poly_order)}_{k_n_splits}ksplits'
        if iterations != 1000:
            var_name = f'{y_test.name[0:4]}_polyO{str(poly_order)}_{k_n_splits}ksplits_iter{iterations}'

    # Scale features if requested (scale_features is defined elsewhere in this project)
    if scale:
        x_train_scaled, x_test_scaled = scale_features(x_train, x_test)
    else:
        x_train_scaled, x_test_scaled = x_train, x_test

    # Produce polynomial features (order 1 is plain linear regression)
    poly = PolynomialFeatures(poly_order)
    x_poly_train = poly.fit_transform(x_train_scaled)
    x_poly_test = poly.transform(x_test_scaled)

    if lars_ic:
        lars_poly = LassoLarsIC(criterion=criterion,
                                fit_intercept=True,
                                max_iter=iterations)
        lars_poly.fit(x_poly_train, y_train)
        score = lars_poly.score(x_poly_test, y_test)
        ic_score = np.mean(lars_poly.criterion_)
        optimal_alpha = lars_poly.alpha_

        if kfold:
            crossval = KFold(n_splits=k_n_splits, shuffle=True, random_state=42)
            cvs = cross_val_score(lars_poly, x_poly_train, y_train,
                                  scoring=k_scoring, cv=crossval)
            cvs_mean_score = np.mean(cvs)
            print(f'''The R-2 for a LASSO Least Angle Regression model with a Polynomial Order of {poly_order} is {score}.
The model with the lowest {criterion} of {ic_score} has a LASSO alpha of {optimal_alpha}.
Function returns a tuple indexed as follows:
0 - Sklearn lasso-regression object
1 - training X data (np array)
2 - testing X data (np array)
3 - Model results table (pandas DataFrame obj)
4 - training Y data (np array)
5 - testing Y data (np array)''')
            return lars_poly, x_poly_train, x_poly_test, pd.DataFrame(
                data=[[score, ic_score, optimal_alpha, cvs_mean_score]],
                columns=['R2', f'{criterion}', 'Optimal_alpha', 'Mean_cvs'],
                index=[var_name]), y_train, y_test
        else:
            print(f'''The R-2 for a LASSO Least Angle Regression model with a Polynomial Order of {poly_order} is {score}.
The model with the lowest {criterion} of {ic_score} has a LASSO alpha of {optimal_alpha}.
Function returns a tuple indexed as follows:
0 - Sklearn lasso-regression object
1 - training X data (np array)
2 - testing X data (np array)
3 - Model results table (pandas DataFrame obj)
4 - training Y data (np array)
5 - testing Y data (np array)''')
            return lars_poly, x_poly_train, x_poly_test, pd.DataFrame(
                data=[[score, ic_score, optimal_alpha]],
                columns=['R2', f'{criterion}', 'Optimal_alpha'],
                index=[var_name]), y_train, y_test
    else:
        lasso_reg = Lasso(alpha=lasso_alpha,
                          max_iter=iterations,
                          random_state=42)
        lasso_reg.fit(x_poly_train, y_train)
        score = lasso_reg.score(x_poly_test, y_test)

        if kfold:
            crossval = KFold(n_splits=k_n_splits, shuffle=True, random_state=42)
            cvs = cross_val_score(lasso_reg, x_poly_train, y_train,
                                  scoring=k_scoring, cv=crossval)
            cvs_mean_score = np.mean(cvs)
            print(f'''The R-2 for a model with a Polynomial Order of {poly_order} and a Lasso Alpha of {lasso_alpha} is {np.round(score, 4)}.
Function returns a tuple indexed as follows:
0 - Sklearn lasso-regression object
1 - training X data (np array)
2 - testing X data (np array)
3 - Model results table (pandas DataFrame obj)
4 - training Y data (np array)
5 - testing Y data (np array)''')
            return lasso_reg, x_poly_train, x_poly_test, pd.DataFrame(
                data=[[score, None, None, cvs_mean_score]],
                columns=['R2', f'{criterion}', 'Optimal_alpha', 'Mean_cvs'],
                index=[var_name]), y_train, y_test
        else:
            print(f'''The R-2 for a model with a Polynomial Order of {poly_order} and a Lasso Alpha of {lasso_alpha} is {np.round(score, 4)}.
Function returns a tuple indexed as follows:
0 - Sklearn lasso-regression object
1 - training X data (np array)
2 - testing X data (np array)
3 - Model results table (pandas DataFrame obj)
4 - training Y data (np array)
5 - testing Y data (np array)''')
            return lasso_reg, x_poly_train, x_poly_test, pd.DataFrame(
                data=[[score, None, None, None]],
                columns=['R2', f'{criterion}', 'Optimal_alpha', 'Mean_cvs'],
                index=[var_name]), y_train, y_test
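# A minimal usage sketch of modeling() with synthetic data (illustrative only):
# scale=False is used so the example does not depend on the project's
# scale_features helper, and all variable names below are assumptions.
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(1)
demo_X = pd.DataFrame(rng.normal(size=(200, 3)), columns=['a', 'b', 'c'])
demo_y = pd.Series(demo_X['a'] - 0.5 * demo_X['b'] + rng.normal(size=200), name='target')
xtr, xte, ytr, yte = train_test_split(demo_X, demo_y, test_size=0.25, random_state=42)

model, xtr_poly, xte_poly, results, _, _ = modeling(
    xtr, xte, ytr, yte, poly_order=2, criterion='bic', lars_ic=True, scale=False)
print(results)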
## 'OPEC': -1.0037125526070625,
## 'PRS International Country Risk Guide': 0.0,
## 'South_American': 1.1666702294227076,
## 'World Economic Forum EOS': -1.1639115442413683,
## 'Years_In_Nato': 0.0,
## 'alcconsumption': 0.59855758131369263,
## 'armedforcesrate': 0.0,
## 'employrate': -2.2695726938628469,
## 'femaleemployrate': 1.0671515028671372,
## 'incomeperperson': 1.191656220279911,
## 'internetuserate': -2.4535120774767076,
## 'lifeexpectancy': 0.0}

from sklearn.metrics import mean_squared_error

train_error_aic = mean_squared_error(tar_train, model_aic.predict(pred_train))
test_error_aic = mean_squared_error(tar_test, model_aic.predict(pred_test))
print('training data MSE')
print(train_error_aic)
print('test data MSE')
print(test_error_aic)

# R-square from training and test data
rsquared_train_aic = model_aic.score(pred_train, tar_train)
rsquared_test_aic = model_aic.score(pred_test, tar_test)
print('training data R-square')
print(rsquared_train_aic)
print('test data R-square')
print(rsquared_test_aic)
#!/usr/bin/env python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsIC
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

# Note: load_boston requires scikit-learn < 1.2 (the dataset was removed in 1.2);
# an alternative sketch using the California housing data follows below.
boston = datasets.load_boston()
X, Y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)

# Hold out the last 10% of the shuffled data as a test set
offset = int(X.shape[0] * 0.9)
X_train, Y_train = X[:offset], Y[:offset]
X_test, Y_test = X[offset:], Y[offset:]

regressor = LassoLarsIC(criterion="aic")
regressor.fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
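# Because load_boston is no longer available in recent scikit-learn releases,
# roughly the same check can be sketched with the California housing dataset;
# this substitution is illustrative and not part of the original snippet.
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
Xc, Yc = shuffle(housing.data, housing.target, random_state=13)
Xc = Xc.astype(np.float32)

offset_c = int(Xc.shape[0] * 0.9)
Xc_train, Yc_train = Xc[:offset_c], Yc[:offset_c]
Xc_test, Yc_test = Xc[offset_c:], Yc[offset_c:]

regressor_c = LassoLarsIC(criterion="aic")
regressor_c.fit(Xc_train, Yc_train)
print(regressor_c.score(Xc_test, Yc_test))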
print "R^2: ", r2 print "\n**********测试LassoLarsIC类**********" lassoLarsIC = LassoLarsIC() # lassoLarsIC = LassoLarsIC(criterion='bic') # 拟合训练集 lassoLarsIC.fit(train_X, train_Y.values.ravel()) # 打印模型的系数 print "系数:", lassoLarsIC.coef_ print "截距:", lassoLarsIC.intercept_ print '训练集R2: ', r2_score(train_Y, lassoLarsIC.predict(train_X)) # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者 # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏. test_Y_pred = lassoLarsIC.predict(test_X) print "测试集得分:", lassoLarsIC.score(test_X, test_Y) print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred) print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)) print "测试集R2:", r2_score(test_Y, test_Y_pred) tss, rss, ess, r2 = xss(Y, lassoLarsIC.predict(X)) print "TSS(Total Sum of Squares): ", tss print "RSS(Residual Sum of Squares): ", rss print "ESS(Explained Sum of Squares): ", ess print "R^2: ", r2 print "\n**********测试ElasticNet类**********" # 在初始化ElasticNet类时, 指定超参数α和ρ, 默认值分别是1.0和0.5. elasticNet = ElasticNet(alpha=1.0, l1_ratio=0.5) # 拟合训练集 elasticNet.fit(train_X, train_Y)