def build_best_prediction(self):
    print("Building LassoLarsIC linear regression vanilla model!")

    import pickle
    from sklearn.linear_model import LassoLarsIC
    from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error

    target_variable_names = self._configuration['model']['target'][0]
    data_provider = self.get_data_provider(
        self._configuration[target_variable_names]['data_provider'])
    input_features_names = self._configuration['model']['input_features']

    X_train = data_provider.train[input_features_names]
    y_train = data_provider.train[target_variable_names]
    X_test = data_provider.test[input_features_names]
    y_test = data_provider.test[target_variable_names]

    # Uncomment to inspect the dtypes and first rows of the train/test frames:
    # print(X_train.dtypes); print(X_train.head())
    # print(X_test.dtypes); print(X_test.head())
    # print(y_train.dtypes); print(y_train.head())
    # print(y_test.dtypes); print(y_test.head())

    my_model_aic = LassoLarsIC(criterion='aic')
    my_model_aic.fit(X_train, y_train)
    y_pred_aic = my_model_aic.predict(X_test)

    print("AIC Explained variance score: ", explained_variance_score(y_test, y_pred_aic))
    print("AIC Mean absolute error: ", mean_absolute_error(y_test, y_pred_aic))
    print("AIC Mean squared error: ", mean_squared_error(y_test, y_pred_aic))

    my_model_bic = LassoLarsIC(criterion='bic')
    my_model_bic.fit(X_train, y_train)
    y_pred_bic = my_model_bic.predict(X_test)

    print("BIC Explained variance score: ", explained_variance_score(y_test, y_pred_bic))
    print("BIC Mean absolute error: ", mean_absolute_error(y_test, y_pred_bic))
    print("BIC Mean squared error: ", mean_squared_error(y_test, y_pred_bic))

    self.fit_results = {'aic': my_model_aic, 'bic': my_model_bic}
    # The original dumped the undefined attribute `self.my_model`; persist the
    # fitted models that were actually stored above instead.
    with open(self._configuration['model']['output_filename'], 'wb') as f:
        pickle.dump(self.fit_results, f)
def _lassolarsic(*, train, test, x_predict=None, metrics, criterion='aic',
                 fit_intercept=True, verbose=False, normalize=True,
                 precompute='auto', max_iter=500,
                 eps=2.220446049250313e-16, copy_X=True, positive=False):
    """For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLarsIC.html#sklearn.linear_model.LassoLarsIC
    """
    # Note: the `normalize` parameter was deprecated in scikit-learn 1.0 and
    # removed in 1.2; drop it when running against newer versions.
    model = LassoLarsIC(criterion=criterion, fit_intercept=fit_intercept,
                        verbose=verbose, normalize=normalize,
                        precompute=precompute, max_iter=max_iter, eps=eps,
                        copy_X=copy_X, positive=positive)
    model.fit(train[0], train[1])
    model_name = 'LassoLarsIC'
    y_hat = model.predict(test[0])

    # _mse/_rmse/_mae are module-level helpers (thin wrappers around
    # sklearn.metrics) assumed to be defined elsewhere in this module.
    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    elif metrics == 'mae':
        accuracy = _mae(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)
    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
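# A minimal usage sketch for the `_lassolarsic` helper above, on synthetic
# data. It assumes scikit-learn < 1.2 (where LassoLarsIC still accepts
# `normalize`). The `_mse` wrapper below is an assumption about how the
# module's metric helpers look; only the 'mse' branch is exercised here.
import numpy as np
from sklearn.linear_model import LassoLarsIC
from sklearn.metrics import mean_squared_error

def _mse(y_true, y_pred):
    # Hypothetical helper matching the name used in _lassolarsic.
    return mean_squared_error(y_true, y_pred)

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10))
y = 3.0 * X[:, 0] - 2.0 * X[:, 1] + rng.normal(scale=0.1, size=200)

name, mse, preds = _lassolarsic(train=(X[:150], y[:150]),
                                test=(X[150:], y[150:]),
                                x_predict=X[:5], metrics='mse')
print(name, mse, preds)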
class HistogramClassifier:

    def __init__(self):
        X, y = make_dataframe(letter_list)
        self.columns = list(X.columns)
        self.classifier = LassoLarsIC()
        self.classifier.fit(X, y)

    def predict(self, X):
        counter = snippet_to_histogram(X, letter_list)
        # pandas removed DataFrame.append in 2.0; build the one-row frame
        # directly from the histogram counter instead.
        df = pd.DataFrame([counter], columns=self.columns).fillna(0)
        return self.classifier.predict(df)
class LassoLarsICImpl:
    # `Op` is assumed to be a module-level alias for
    # sklearn.linear_model.LassoLarsIC, following the wrapper pattern
    # this class is written in.

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
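# A minimal sketch of driving the wrapper above, assuming `Op` is bound to
# LassoLarsIC as noted in the class comment. Synthetic data only.
import numpy as np
from sklearn.linear_model import LassoLarsIC as Op

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 5))
y = X @ np.array([1.0, 0.0, -2.0, 0.0, 0.5]) + rng.normal(scale=0.1, size=100)

impl = LassoLarsICImpl(criterion='bic')
print(impl.fit(X, y).predict(X[:3]))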
class r07546035_ICRegression(regression):

    def trainAlgo(self):
        self.model = LassoLarsIC(criterion=self.param['criterion'],
                                 fit_intercept=self.param['fit_intercept'],
                                 normalize=self.param['normalize'],
                                 max_iter=self.param['max_iter'],
                                 eps=self.param['eps'],
                                 positive=self.param['positive'])
        self.model.fit(self.inputData['X'], self.outputData['Y'])

    def predictAlgo(self):
        self.result['Y'] = self.model.predict(self.inputData['X'])
def __lasso_selected(data, data_test, response):
    X = data.drop([response], axis=1).to_numpy()  # as_matrix() was removed from pandas
    y = np.array(data[response].tolist()).reshape((len(data), 1))
    # X = sm.add_constant(X)
    # model = sm.OLS(y, X)
    # m = model.fit_regularized(refit=True)
    # yp = m.predict(data_test)
    reg = LassoLarsIC(criterion='bic')
    print(y.shape, X.shape)
    reg.fit(X, y.ravel())  # ravel avoids sklearn's column-vector warning
    x = data_test.drop([response], axis=1).to_numpy().reshape(
        (len(data_test), len(data_test.keys()) - 1))
    yp = reg.predict(x)
    te = np.mean((yp - np.array(data_test[response].tolist())) ** 2)
    print(reg.coef_, te)
    return
class HistogramClassifier:

    def __init__(self):
        X, y = make_dataframe(letter_list)
        self.columns = list(X.columns)
        self.classifier = LassoLarsIC()
        self.classifier.fit(X, y)

    def predict(self, X):
        counter = snippet_to_histogram(X, letter_list)
        # Build the one-row frame directly (DataFrame.append was removed in
        # pandas 2.0).
        df = pd.DataFrame([counter], columns=self.columns).fillna(0)
        # The original looped len(X) times over the same one-row frame and
        # averaged the identical predictions; a single predict is equivalent.
        return round(self.classifier.predict(df)[0])
def fit_models_LassoCV(self, X, Y, bands=None):
    """ Try to fit models to training period time series """
    if bands is None:
        bands = self.fit_indices

    models = []
    for b in bands:
        # Alternatives tried:
        # lasso = LassoCV(n_alphas=100)
        # lasso = LassoLarsCV(max_n_alphas=100)
        lasso = LassoLarsIC(criterion='bic')
        lasso = lasso.fit(X, Y[b, :])

        # Attach diagnostics to the fitted estimator
        lasso.nobs = Y[b, :].size
        lasso.coef = np.copy(lasso.coef_)
        lasso.coef[0] += lasso.intercept_
        lasso.fittedvalues = lasso.predict(X)
        lasso.rss = np.sum((Y[b, :] - lasso.fittedvalues) ** 2)
        lasso.rmse = math.sqrt(lasso.rss / lasso.nobs)

        models.append(lasso)

    return np.array(models)
def trainData(fileName):
    df = pd.read_csv(fileName, index_col='date')
    df = df.sort_index()
    df = df[['open', 'high', 'close', 'low', 'volume', 'price_change',
             'p_change', 'ma5', 'ma10', 'ma20', 'v_ma5', 'v_ma10',
             'v_ma20', 'turnover']]
    df = df[['open', 'high', 'low', 'close', 'volume']]
    df['HL_PCT'] = (df['high'] - df['low']) / df['close'] * 100.0
    df['PCT_change'] = (df['close'] - df['open']) / df['open'] * 100.0
    df = df[['close', 'HL_PCT', 'PCT_change', 'volume']]
    # print(df.head())

    forecast_col = 'close'
    df.fillna(value=-99999, inplace=True)
    # forecast_out = int(math.ceil(0.01 * len(df)))
    forecast_out = 1  # number of rows to forecast ahead
    df['label'] = df[forecast_col].shift(-forecast_out)
    print(df.shape)
    print(df)

    X = np.array(df.drop(columns=['label']))  # the positional axis argument was removed from pandas
    X = preprocessing.scale(X)
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]
    df.dropna(inplace=True)
    print(X)
    print(X_lately)
    y = np.array(df['label'])
    # print(y)
    print(X.shape)
    print(y.shape)

    # sklearn.cross_validation was removed; use sklearn.model_selection instead
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2)

    clf = LassoLarsIC(max_iter=100)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    joblib.dump(clf, "%s.m" % fileName)
    print(accuracy, "---------score------")

    forecast_set = clf.predict(X_lately)
    print(forecast_out)

    style.use('ggplot')
    df['Forecast'] = np.nan
    last_date = df.iloc[-1].name
    date_time = datetime.datetime.strptime(last_date, '%Y-%m-%d')
    last_unix = date_time.timestamp()
    one_day = 86400
    next_unix = last_unix + one_day
    print(forecast_set)
    for i in forecast_set:
        next_date = datetime.datetime.fromtimestamp(next_unix)
        next_unix += one_day
        df.loc[next_date] = [np.nan for _ in range(len(df.columns) - 1)] + [i]
    print(df.tail(forecast_out))

    df['close'].plot()
    df['Forecast'].plot()
    plt.show()
def lasso(X, y):
    clf = LassoLarsIC(criterion='aic')
    clf.fit(X, y)
    y_pred = clf.predict(X)
    return y_pred, clf.alpha_, clf.coef_
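# A minimal sketch exercising lasso() above on synthetic data, assuming the
# usual module-level import of LassoLarsIC. The AIC criterion picks alpha_
# automatically and should zero out the irrelevant coefficients.
import numpy as np
from sklearn.linear_model import LassoLarsIC

rng = np.random.default_rng(2)
X = rng.normal(size=(300, 8))
y = 4.0 * X[:, 0] - 3.0 * X[:, 2] + rng.normal(scale=0.5, size=300)

y_pred, alpha, coefs = lasso(X, y)
print("chosen alpha:", alpha)
print("coefficients:", np.round(coefs, 2))  # mostly zeros except features 0 and 2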
# Coefficients reported by the AIC model on an earlier run (the listing is
# truncated in the source; it begins mid-dictionary):
## 'North_American': 0.0,
## 'OPEC': -1.0037125526070625,
## 'PRS International Country Risk Guide': 0.0,
## 'South_American': 1.1666702294227076,
## 'World Economic Forum EOS': -1.1639115442413683,
## 'Years_In_Nato': 0.0,
## 'alcconsumption': 0.59855758131369263,
## 'armedforcesrate': 0.0,
## 'employrate': -2.2695726938628469,
## 'femaleemployrate': 1.0671515028671372,
## 'incomeperperson': 1.191656220279911,
## 'internetuserate': -2.4535120774767076,
## 'lifeexpectancy': 0.0}

from sklearn.metrics import mean_squared_error

train_error_aic = mean_squared_error(tar_train, model_aic.predict(pred_train))
test_error_aic = mean_squared_error(tar_test, model_aic.predict(pred_test))
print('training data MSE')
print(train_error_aic)
print('test data MSE')
print(test_error_aic)

# R-square from training and test data
rsquared_train_aic = model_aic.score(pred_train, tar_train)
rsquared_test_aic = model_aic.score(pred_test, tar_test)
print('training data R-square')
print(rsquared_train_aic)
print('test data R-square')
print(rsquared_test_aic)
########################################################################################################################
##################                            BIC / AIC CRITERION                                     ##################
########################################################################################################################

from sklearn.linear_model import LassoLarsIC

# https://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html#sphx-glr-auto-examples-linear-model-plot-lasso-model-selection-py
EPSILON = 1e-4

X = np.array(X_train)
y = np.array(y_train)

model_bic = LassoLarsIC(criterion='bic')
model_bic.fit(X, y)
alpha_bic_ = model_bic.alpha_  # alpha of 159
BIC_pred = model_bic.predict(np.array(X_test))
# r2_score expects (y_true, y_pred); the original call had the arguments
# swapped, so the values recorded below were computed in the wrong order.
R2_BIC = r2_score(np.array(y_test), BIC_pred)  # originally reported 0.796

model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(X, y)
alpha_aic_ = model_aic.alpha_  # alpha 54.23 (really different from the BIC)
AIC_pred = model_aic.predict(np.array(X_test))
R2_AIC = r2_score(np.array(y_test), AIC_pred)  # originally reported 0.879


def plot_ic_criterion(model, name, color):
    alpha_ = model.alpha_ + EPSILON
    alphas_ = model.alphas_ + EPSILON
    criterion_ = model.criterion_
    # The plt.plot call was truncated in the source; completed after the
    # scikit-learn example linked above.
    plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
             linewidth=3, label='%s criterion' % name)
    plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                label='alpha: %s estimate' % name)
    plt.xlabel('-log(alpha)')
    plt.ylabel('criterion')
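# How plot_ic_criterion is typically driven, following the scikit-learn
# example linked above (the models fitted here stand in for the example's):
plt.figure()
plot_ic_criterion(model_aic, 'AIC', 'b')
plot_ic_criterion(model_bic, 'BIC', 'r')
plt.legend()
plt.title('Information-criterion for model selection')
plt.show()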
tss, rss, ess, r2 = xss(Y, lassoLarscv.predict(X))
print("TSS(Total Sum of Squares): ", tss)
print("RSS(Residual Sum of Squares): ", rss)
print("ESS(Explained Sum of Squares): ", ess)
print("R^2: ", r2)

print("\n********** Testing the LassoLarsIC class **********")
lassoLarsIC = LassoLarsIC()
# lassoLarsIC = LassoLarsIC(criterion='bic')
# Fit the training set
lassoLarsIC.fit(train_X, train_Y.values.ravel())
# Print the model coefficients
print("Coefficients:", lassoLarsIC.coef_)
print("Intercept:", lassoLarsIC.intercept_)
print("Training-set R2: ", r2_score(train_Y, lassoLarsIC.predict(train_X)))

# For linear regression models, goodness of fit is usually judged by the
# mean squared error (MSE) or root mean squared error (RMSE) on the test set.
test_Y_pred = lassoLarsIC.predict(test_X)
print("Test-set score:", lassoLarsIC.score(test_X, test_Y))
print("Test-set MSE:", mean_squared_error(test_Y, test_Y_pred))
print("Test-set RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)))
print("Test-set R2:", r2_score(test_Y, test_Y_pred))

tss, rss, ess, r2 = xss(Y, lassoLarsIC.predict(X))
print("TSS(Total Sum of Squares): ", tss)
print("RSS(Residual Sum of Squares): ", rss)
print("ESS(Explained Sum of Squares): ", ess)
print("R^2: ", r2)
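# The xss helper used above is not shown in the source; a plausible minimal
# reconstruction, assuming it returns the classic sum-of-squares
# decomposition (TSS = RSS + ESS for a linear fit with intercept):
import numpy as np

def xss(y_true, y_pred):
    # Hypothetical implementation: TSS = total, RSS = residual, ESS = explained
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    tss = np.sum((y_true - y_true.mean()) ** 2)
    rss = np.sum((y_true - y_pred) ** 2)
    ess = np.sum((y_pred - y_true.mean()) ** 2)
    return tss, rss, ess, 1.0 - rss / tss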
#%% y normalization
mY = y_train.mean()
sY = y_train.std()
y_train = (y_train - mY) / sY
# y_test = ( y_test - mY ) / sY

#%% lasso regression
"""
lassocv.alpha_ is different from the one chosen in R
"""
mName = 'lasso'
lassocv = LassoLarsIC()
lassocv.fit(X_train, y_train)
y_train_pred = lassocv.predict(X_train)
predictions = lassocv.predict(X_test)
predictions = predictions * sY + mY  # undo the target normalization
# predAll = np.append(predAll, predictions).reshape([-1, 1])
coef = lassocv.coef_
print(lassocv.alpha_)  # the bare expression in the original only displays in a REPL/cell

draw_prediction(predictions, y_test, mName)

## Alternatives tried:
# lassocv = LassoCV(random_state=0, eps=1e-9, cv=10, n_alphas=100)
# lassocv = LassoLarsCV()
# lassocv.fit(X_train, y_train)
# y_train_pred = lassocv.predict(X_train)
# predictions = lassocv.predict(X_test)
# np.mean(abs(y_train_pred - y_train)) / np.mean(y_train)
# np.mean(abs(predictions - y_test)) / np.mean(y_test)
print('naive MSE: ', mean_squared_error([m for _ in range(len(y_test))], y_test))
print()
print('-' * 100)

# linear model
lm = sm.OLS(y_train, X_train).fit()
print(lm.summary())
print('lm MSE: ', mean_squared_error(lm.predict(X_test), y_test))
print('lm AIC: ', lm.aic)
print('-' * 100)

# AIC
print("AIC")
aic = LassoLarsIC(criterion='aic')
aic.fit(X_train, y_train)
predictions = aic.predict(X_test)
print(mean_squared_error(y_test, predictions))
print(aic.coef_)
print('-' * 100)

# SGD
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
# SGDRegressor's n_iter parameter was renamed; use max_iter on modern scikit-learn
sgd = SGDRegressor(penalty='l2', alpha=0.15, max_iter=200)
sgd = sgd.fit(X_train_scaled, y_train)
predictions = sgd.predict(scaler.transform(X_test))
print('sgd: ', mean_squared_error(y_test, predictions))