def __modeling(self, data):
    """
    Fit Lasso models on min-max scaled data and rank the factors by influence.

    Every column of ``data`` is scaled to (0, 1). The ``OP_TIME`` column and
    the target column (``self.__class__.__target[0]``) are removed from the
    feature matrix. One Lasso model is trained per ``TimeSeriesSplit`` fold,
    with its alpha chosen by an inner ``LassoCV``; the model with the highest
    hold-out score supplies the coefficients.

    :param data: DataFrame containing ``OP_TIME``, the target column and the
        factor columns
    :return: dict mapping factor name to its influence factor, formatted as a
        string with two decimals (absolute coefficient divided by the sum of
        absolute coefficients)
    """
    heads = data.columns

    # (0,1) transformation
    scaler = MinMaxScaler(feature_range=(0, 1))
    data = pd.DataFrame(scaler.fit_transform(data))
    data.columns = heads

    # X,y
    target = self.__class__.__target[0]
    poly_X = data.drop(['OP_TIME', target], axis=1)
    y = data[target]

    kf = TimeSeriesSplit(n_splits=3)
    print("start trainning model...")

    # nested cross-validation: 3 time-series splits outside, LassoCV inside
    scores = []
    lasso_models = []
    for train_index, test_index in kf.split(poly_X):
        print("finding relatively better alpha...")
        X_train, X_test = poly_X.iloc[train_index], poly_X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        lassocv = linear_model.LassoCV(cv=10, max_iter=1500)
        lassocv.fit(X_train, y_train)

        lasso = linear_model.Lasso(alpha=lassocv.alpha_).fit(X_train, y_train)
        lasso_models.append(lasso)
        scores.append(lasso.score(X_test, y_test))

    best_model = lasso_models[np.asarray(scores).argmax()]

    cv_result = model_selection.cross_val_score(
        best_model, poly_X, y, cv=kf, scoring='neg_mean_squared_error')
    print('the mean neg_mse_score for LassoRegression is %s' %
          (np.mean(np.asarray(cv_result))))

    # absolute coefficients of the selected model
    factors = np.abs(np.asarray(best_model.coef_))

    # influence factors; Lasso may shrink every coefficient to zero, in which
    # case the normalisation would be 0/0 -> report zero influence instead
    total = np.sum(factors)
    if total > 0:
        influence = factors / total
    else:
        influence = np.zeros_like(factors, dtype=float)

    # format with two decimals
    formatted_influence = ['%.2f' % value for value in influence]
    return dict(zip(poly_X.columns, formatted_influence))
def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Make a single forecast with a Lasso Regression model

    The model is (re)trained only when no model is cached yet or when the
    cached one has served ``train_frequency`` forecasts; otherwise the cached
    pipeline is reused.

    Parameters
    ----------
    df : pandas DataFrame
        the training (streamed) data to model

    Returns
    -------
    predictions : pandas DataFrame
        the forecast -> (1 row, W columns) where W is the forecast_window
    """
    # preprocess the data for supervised machine learning
    X, Y, X_new = self.preprocessing(df, binary=True)

    # NOTE(review): attributes are written through object.__setattr__,
    # presumably because the owning class blocks normal assignment - confirm
    if self._counter >= self.train_frequency or self._model is None:
        object.__setattr__(self, "_counter", 0)

        # set up the machine learning model
        if self.tune_model:
            # Hand the splitter itself to LassoCV: an integer `cv` selects
            # plain K-fold, which would validate on data older than the
            # training rows and defeat the time-series split.
            tscv = TimeSeriesSplit(n_splits=3)
            model = LassoCV(cv=tscv, eps=1e-9, n_alphas=16, n_jobs=N_JOBS)
        else:
            model = Lasso(alpha=0.1, warm_start=True)

        if MULTI:
            # LassoCV already parallelises internally when tuning
            model = MultiOutputRegressor(
                model, n_jobs=1 if self.tune_model else N_JOBS)

        # set up a machine learning pipeline
        pipeline = Pipeline([
            ("var", VarianceThreshold()),
            # ('poly', PolynomialFeatures(2)),  # longer run time, potentially more accurate
            # ('var2', VarianceThreshold()),  # use this if 'poly' is used
            # ('shape', QuantileTransformer(output_distribution="normal")),  # make input variables normally distributed
            ("scale", MinMaxScaler()),
            ("model", model),
        ])

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # ignore common warning
            object.__setattr__(
                self, "_model", pipeline.fit(X, Y)  # train the model
            )

    predictions = self._model.predict(X_new)  # forecast
    predictions = pd.DataFrame(predictions)
    object.__setattr__(self, "_counter", self._counter + 1)
    return predictions
def test_time_series_cv():
    """Check TimeSeriesSplit ordering, fold-count validation and get_n_splits."""
    samples = [[2 * k + 1, 2 * k + 2] for k in range(7)]

    # Should fail if there are more folds than samples
    too_many_folds = TimeSeriesSplit(n_splits=7).split(samples)
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next, too_many_folds)

    tscv = TimeSeriesSplit(2)

    # Manually check that Time Series CV preserves the data
    # ordering on toy datasets (six samples)
    fold_iter = tscv.split(samples[:-1])
    train_idx, test_idx = next(fold_iter)
    assert_array_equal(train_idx, [0, 1])
    assert_array_equal(test_idx, [2, 3])

    train_idx, test_idx = next(fold_iter)
    assert_array_equal(train_idx, [0, 1, 2, 3])
    assert_array_equal(test_idx, [4, 5])

    # Same check on all seven samples
    fold_iter = TimeSeriesSplit(2).split(samples)
    train_idx, test_idx = next(fold_iter)
    assert_array_equal(train_idx, [0, 1, 2])
    assert_array_equal(test_idx, [3, 4])

    train_idx, test_idx = next(fold_iter)
    assert_array_equal(train_idx, [0, 1, 2, 3, 4])
    assert_array_equal(test_idx, [5, 6])

    # Check get_n_splits returns the correct number of splits
    all_folds = list(TimeSeriesSplit(2).split(samples))
    n_splits_actual = len(all_folds)
    assert_equal(n_splits_actual, tscv.get_n_splits())
    assert_equal(n_splits_actual, 2)
def test_time_series_cv():
    """Verify that TimeSeriesSplit yields ordered folds and the right fold count."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

    def check_next(fold_iter, expected_train, expected_test):
        # pull one fold and compare both index arrays
        train, test = next(fold_iter)
        assert_array_equal(train, expected_train)
        assert_array_equal(test, expected_test)

    # Should fail if there are more folds than samples
    assert_raises_regexp(
        ValueError,
        "Cannot have number of folds.*greater",
        next,
        TimeSeriesSplit(n_splits=7).split(X),
    )

    tscv = TimeSeriesSplit(2)

    # Manually check that Time Series CV preserves the data
    # ordering on toy datasets
    splits = tscv.split(X[:-1])
    check_next(splits, [0, 1], [2, 3])
    check_next(splits, [0, 1, 2, 3], [4, 5])

    splits = TimeSeriesSplit(2).split(X)
    check_next(splits, [0, 1, 2], [3, 4])
    check_next(splits, [0, 1, 2, 3, 4], [5, 6])

    # Check get_n_splits returns the correct number of splits
    splits = TimeSeriesSplit(2).split(X)
    n_splits_actual = len(list(splits))
    assert_equal(n_splits_actual, tscv.get_n_splits())
    assert_equal(n_splits_actual, 2)
def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Make a single forecast with a Neural Network model

    The model is (re)trained only when no model is cached yet or when the
    cached one has served ``train_frequency`` forecasts; otherwise the cached
    pipeline is reused.

    Parameters
    ----------
    df : pandas DataFrame
        the training (streamed) data to model

    Returns
    -------
    predictions : pandas DataFrame
        the forecast -> (1 row, W columns) where W is the forecast_window
    """
    # preprocess the data for supervised machine learning
    X, Y, X_new = self.preprocessing(df, binary=False)

    # NOTE(review): attributes are written through object.__setattr__,
    # presumably because the owning class blocks normal assignment - confirm
    if self._counter >= self.train_frequency or self._model is None:
        object.__setattr__(self, "_counter", 0)

        # set up a machine learning pipeline
        model = MLPRegressor(
            max_iter=25,
            hidden_layer_sizes=(64, 64),
            learning_rate_init=0.001,
            batch_size=16,
            alpha=0,
            learning_rate="adaptive",
            activation="relu",
            solver="adam",
            warm_start=True,
            shuffle=False,
            random_state=42,
            verbose=False,
        )
        if MULTI:
            model = MultiOutputRegressor(
                model,
                n_jobs=N_JOBS,
            )

        pipeline = Pipeline(
            [
                ("var", VarianceThreshold()),
                ("scale", MinMaxScaler()),
                ("model", model),
            ]
        )

        if self.tune_model:
            # Cross validation for time series. The splitter object itself is
            # passed as `cv`: an integer would make the search use plain
            # K-fold and validate on rows older than the training rows.
            tscv = TimeSeriesSplit(n_splits=3)

            # set up the tuner; parameters of a wrapped estimator need the
            # extra "estimator__" prefix
            str_ = "estimator__" if MULTI else ""
            parameters = {
                f"model__{str_}hidden_layer_sizes": (
                    (32, 32),
                    (64, 64),
                    (128, 128),
                ),
                f"model__{str_}batch_size": (16, 32),
                f"model__{str_}learning_rate_init": (0.0001, 0.001, 0.01),
            }
            grid = RandomizedSearchCV(
                pipeline,
                parameters,
                n_iter=16,
                cv=tscv,
                random_state=0,
                n_jobs=1 if MULTI else N_JOBS,
            )

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")  # ignore common warning
                object.__setattr__(
                    self,
                    "_model",
                    grid.fit(X, Y).best_estimator_,  # search for the best model
                )
        else:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")  # ignore common warning
                object.__setattr__(
                    self, "_model", pipeline.fit(X, Y)  # train the model
                )

    predictions = self._model.predict(X_new)  # forecast
    predictions = pd.DataFrame(predictions)
    object.__setattr__(self, "_counter", self._counter + 1)
    return predictions
def foward_chain_cv(self, scoring_metric, greater_is_better=False):
    """
    Run forward-chaining (time-series) cross-validation and plot the results.

    For every ``TimeSeriesSplit`` fold the data is optionally scaled, the
    best hyper-parameters are looked up with ``self.find_optimal_paramters``,
    ``self.regressor`` is refitted and scored on the hold-out part. One
    subplot is drawn per fold, plus a chart of all hold-out predictions
    against the actual values. The metrics are printed; nothing is returned.

    NOTE(review): ``X`` and ``y`` are not parameters or attributes here;
    presumably they are module-level DataFrames - confirm. The index labels
    are parsed with ``'%m/%d/%Y %H:%M'`` below, so they are assumed to be
    strings in that format.

    :param scoring_metric: forwarded to ``self.find_optimal_paramters``
    :param greater_is_better: forwarded to ``self.find_optimal_paramters``
    """
    i = 1
    MAE = []
    Exp_var = []
    MSE = []
    r_squared = []
    params_used = {}
    y_pred_cont = []
    y_test_cont = []
    y_pred_cont_index = []
    split_dates = []

    # Subplot grid: keep the historical rows x (rows + 1) layout, but add a
    # row when that grid has fewer cells than there are splits (e.g. 3, 7 or
    # 8 splits), otherwise add_subplot raises for the last folds.
    n_rows = int(sqrt(self.no_splits))
    n_cols = int(sqrt(self.no_splits) + 1)
    if n_rows * n_cols < self.no_splits:
        n_rows += 1

    fig = plt.figure()
    tscv = TimeSeriesSplit(n_splits=self.no_splits)
    for train_index, test_index in tqdm(tscv.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_test_index = X_test.index.values.tolist()

        if self.scalar is not None:
            # Scale Data (scalers are fitted on the training part only)
            scaler_X = self.scalar()
            scaler_y = self.scalar()
            scaler_X.fit(X_train)
            scaler_y.fit(y_train)
            X_train, X_test = scaler_X.transform(
                X_train), scaler_X.transform(X_test)
            y_train, y_test = scaler_y.transform(
                y_train), scaler_y.transform(y_test)
        else:
            X_train, X_test = np.asarray(X_train), np.asarray(X_test)
            y_train, y_test = np.asarray(y_train), np.asarray(y_test)

        # Find Best Params
        _, best_params = self.find_optimal_paramters(
            X_train, y_train, self.regressor, self.parameters,
            scoring_metric, greater_is_better)
        self.regressor.set_params(**best_params)
        self.regressor.fit(X_train, y_train.ravel())

        # predict y values
        y_pred = self.regressor.predict(X_test)
        if self.scalar is not None:
            # transform y values back to real scale for assessment; the
            # regressor was fitted on a flattened target, so its prediction
            # is made a single column before it goes back through the scaler
            y_pred = scaler_y.inverse_transform(
                np.asarray(y_pred).reshape(-1, 1))
            y_test = scaler_y.inverse_transform(y_test)

        # compute error metrics
        params_used[i] = best_params
        MAE.append(metrics.mean_absolute_error(y_test, y_pred))
        Exp_var.append(metrics.explained_variance_score(y_test, y_pred))
        MSE.append(metrics.mean_squared_error(y_test, y_pred))
        r_squared.append(metrics.r2_score(y_test, y_pred))

        # plot y_pred vs y_test
        y_df = pd.DataFrame(index=pd.to_datetime(X_test_index))
        y_pred = y_pred.reshape(len(y_pred), )
        y_test = y_test.reshape(len(y_test), )
        y_df['y_pred'] = y_pred
        y_df['y_test'] = y_test

        # plot the subplots
        ax = fig.add_subplot(n_rows, n_cols, i)
        ax.xaxis.set_major_formatter(DateFormatter('%m-%y'))
        y_df.plot(title='Split{}'.format(i), ax=ax, legend=False)
        ax.tick_params(axis='x', rotation=45, labelsize=8)
        if i == 1:
            fig.legend(loc=4)

        # convert arrays to list and append continuous y_pred vs y_test
        y_pred_cont_index = y_pred_cont_index + X_test_index
        split_dates.append(y_pred_cont_index[-1])
        y_pred_cont = y_pred_cont + y_pred.tolist()
        y_test_cont = y_test_cont + y_test.tolist()
        i += 1

    # Plot the continuous chart
    y_continuous_df = pd.DataFrame(index=pd.to_datetime(y_pred_cont_index))
    y_pred_cont = np.asarray(y_pred_cont)
    y_test_cont = np.asarray(y_test_cont)
    y_continuous_df['Model'] = y_pred_cont
    y_continuous_df['Actual'] = y_test_cont
    y_continuous_df.plot(title='Running Performance')

    # add vertical lines to the running total output; the end of the last
    # split is the end of the chart, so it gets no marker
    del split_dates[-1]
    for date in split_dates:
        date = datetime.strptime(date, '%m/%d/%Y %H:%M')
        plt.axvline(x=date, linestyle=':', color='red',
                    linewidth=1, alpha=.8)

    # Calculate average metrics
    no_splits = tscv.get_n_splits()
    avg_mae = sum(MAE) / no_splits
    avg_exp_var = sum(Exp_var) / no_splits
    avg_mse = sum(MSE) / no_splits
    avg_rsquared = sum(r_squared) / no_splits

    print('\nMAE:{} \nMSE:{} \nExp Var Explained: {}\nr^2: {}\nParams:{}'.
          format(MAE, MSE, Exp_var, r_squared, params_used))
    print('\nAvg MAE:', avg_mae,
          '\nAverage Explained Variance:', avg_exp_var,
          '\nAvg MSE:', avg_mse,
          '\nAvg r^2:', avg_rsquared)
    print('end')

    fig.tight_layout()
    plt.show()
def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Make a single forecast with a Decision Tree model

    The model is (re)trained only when no model is cached yet or when the
    cached one has served ``train_frequency`` forecasts; otherwise the cached
    pipeline is reused.

    Parameters
    ----------
    df : pandas DataFrame
        the training (streamed) data to model

    Returns
    -------
    predictions : pandas DataFrame
        the forecast -> (1 row, W columns) where W is the forecast_window
    """
    # preprocess the data for supervised machine learning
    X, Y, X_new = self.preprocessing(df, binary=False)

    # NOTE(review): attributes are written through object.__setattr__,
    # presumably because the owning class blocks normal assignment - confirm
    if self._counter >= self.train_frequency or self._model is None:
        object.__setattr__(self, "_counter", 0)

        # set up a machine learning pipeline
        # (a single decision tree accepts neither `n_jobs` nor `warm_start`;
        # passing them makes the constructor raise TypeError)
        model = DecisionTreeRegressor(
            max_depth=12,
            min_samples_leaf=1,
            max_features="sqrt",
            random_state=42,
        )
        if MULTI:
            model = MultiOutputRegressor(model, n_jobs=1)

        pipeline = Pipeline(
            [
                ("var", VarianceThreshold()),
                ("model", model),
            ]
        )

        if self.tune_model:
            # Cross validation for time series. The splitter object itself is
            # passed as `cv`: an integer would make the search use plain
            # K-fold and validate on rows older than the training rows.
            tscv = TimeSeriesSplit(n_splits=3)

            # set up the tuner; parameters of a wrapped estimator need the
            # extra "estimator__" prefix
            str_ = "estimator__" if MULTI else ""
            parameters = {
                f"model__{str_}max_depth": [6, 10, 14, 18],
                f"model__{str_}min_samples_leaf": [1, 3, 5, 10],
            }
            grid = RandomizedSearchCV(
                pipeline,
                parameters,
                n_iter=16,
                cv=tscv,
                random_state=0,
                n_jobs=1,
            )

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")  # ignore common warning
                object.__setattr__(
                    self,
                    "_model",
                    grid.fit(X, Y).best_estimator_,  # search for the best model
                )
        else:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")  # ignore common warning
                object.__setattr__(
                    self, "_model", pipeline.fit(X, Y)  # train the model
                )

    predictions = self._model.predict(X_new)  # forecast
    predictions = pd.DataFrame(predictions)
    object.__setattr__(self, "_counter", self._counter + 1)
    return predictions
def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Make a single forecast with a Extreme Gradient Boosting Tree model

    The model is (re)trained only when no model is cached yet or when the
    cached one has served ``train_frequency`` forecasts; otherwise the cached
    pipeline is reused.

    Parameters
    ----------
    df : pandas DataFrame
        the training (streamed) data to model

    Returns
    -------
    predictions : pandas DataFrame
        the forecast -> (1 row, W columns) where W is the forecast_window
    """
    # preprocess the data for supervised machine learning
    X, Y, X_new = self.preprocessing(df, binary=False)

    # NOTE(review): attributes are written through object.__setattr__,
    # presumably because the owning class blocks normal assignment - confirm
    if self._counter >= self.train_frequency or self._model is None:
        object.__setattr__(self, "_counter", 0)

        # set up a machine learning pipeline
        model = XGBRegressor(
            booster="gbtree",
            n_estimators=25,
            learning_rate=0.1,
            max_depth=7,
            min_child_weight=1,
            colsample_bytree=0.8,
            subsample=0.8,
            random_state=42,
            n_jobs=N_JOBS,
        )
        model = MultiOutputRegressor(model, n_jobs=1)

        pipeline = Pipeline([
            ("var", VarianceThreshold()),
            ("model", model),
        ])

        if self.tune_model:
            # Cross validation for time series. The splitter object itself is
            # passed as `cv`: an integer would make the search use plain
            # K-fold and validate on rows older than the training rows.
            tscv = TimeSeriesSplit(n_splits=3)

            # set up the tuner
            parameters = {
                "model__estimator__n_estimators": [25, 50, 100],
                "model__estimator__learning_rate": [0.001, 0.01, 0.1, 1],
                "model__estimator__max_depth": [3, 6, 9, 12],
                "model__estimator__min_child_weight": [1, 3, 5],
                "model__estimator__colsample_bytree": [0.8],
                "model__estimator__subsample": [0.8],
            }
            grid = RandomizedSearchCV(
                pipeline,
                parameters,
                n_iter=16,
                cv=tscv,
                random_state=0,
                n_jobs=1,
            )

            object.__setattr__(
                self,
                "_model",
                grid.fit(X, Y).best_estimator_,  # search for the best model
            )
        else:
            object.__setattr__(
                self, "_model", pipeline.fit(X, Y)  # train the model
            )

    predictions = self._model.predict(X_new)  # forecast
    predictions = pd.DataFrame(predictions)
    object.__setattr__(self, "_counter", self._counter + 1)
    return predictions
def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Make a single forecast with a Partial Least Squares Regression model

    The model is (re)trained only when no model is cached yet or when the
    cached one has served ``train_frequency`` forecasts; otherwise the cached
    pipeline is reused.

    Parameters
    ----------
    df : pandas DataFrame
        the training (streamed) data to model

    Returns
    -------
    predictions : pandas DataFrame
        the forecast -> (1 row, W columns) where W is the forecast_window
    """
    # preprocess the data for supervised machine learning
    X, Y, X_new = self.preprocessing(df, binary=False)

    # NOTE(review): attributes are written through object.__setattr__,
    # presumably because the owning class blocks normal assignment - confirm
    if self._counter >= self.train_frequency or self._model is None:
        object.__setattr__(self, "_counter", 0)

        # set up a machine learning pipeline
        # (at least one component, even for very narrow or very short data)
        model = PLSRegression(
            n_components=max(1, min(X.shape[1] - 1, int(X.shape[0] / 2))),
            scale=False,
        )
        pipeline = Pipeline([
            ("var", VarianceThreshold()),
            # ('poly', PolynomialFeatures(2)),  # longer run time, potentially more accurate
            # ('var2', VarianceThreshold()),  # use this if 'poly' is used
            # ('shape', QuantileTransformer(output_distribution="normal")),  # make input variables normally distributed
            ("scale", MinMaxScaler()),
            ("model", model),
        ])

        if self.tune_model:
            # Cross validation for time series. The splitter object itself is
            # passed as `cv`: an integer would make the search use plain
            # K-fold and validate on rows older than the training rows.
            tscv = TimeSeriesSplit(n_splits=3)

            # set up the tuner
            max_components = min(X.shape[1] - 1, int(X.shape[0] * 0.75))
            n_models = 16  # number of models to search for
            # the step must never be 0 (fewer candidate components than
            # models to search), otherwise np.arange fails
            step = max(1, max_components // n_models)
            candidates = np.arange(1, max_components, step=step).tolist()
            if not candidates:
                candidates = [1]  # too little data for a range: try one component
            parameters = {
                "model__n_components": candidates,
            }
            grid = RandomizedSearchCV(
                pipeline,
                parameters,
                n_iter=n_models,
                cv=tscv,
                random_state=0,
                n_jobs=N_JOBS,
            )

            object.__setattr__(
                self,
                "_model",
                grid.fit(X, Y).best_estimator_,  # search for the best model
            )
        else:
            object.__setattr__(
                self, "_model", pipeline.fit(X, Y)  # train the model
            )

    predictions = self._model.predict(X_new)  # forecast
    predictions = pd.DataFrame(predictions)
    object.__setattr__(self, "_counter", self._counter + 1)
    return predictions
# treat the categorical codes as strings so they get one-hot encoded below
X["season"] = X["season"].astype(str)
X["weather"] = X["weather"].astype(str)

# determine which columns are strings (for X)
x_columns = X.columns
x_dtypes = X.dtypes
x_str = np.where(x_dtypes == "object")[0]

# convert any string columns to binary columns
X = pd.get_dummies(X, columns=x_columns[x_str])

# In[2]: Model the data

# set up cross validation for time series
tscv = TimeSeriesSplit(n_splits=5)
folds = tscv.get_n_splits(X)

# set up a machine learning pipeline
# NOTE: LassoCV receives the splitter itself; passing the integer `folds`
# would select plain K-fold and validate on rows older than the training rows.
pipeline = Pipeline(
    [
        ("var1", VarianceThreshold()),
        # ('poly', PolynomialFeatures(2)),
        # ('var2', VarianceThreshold()),
        # ('shape', QuantileTransformer(output_distribution="normal"))
        ("scale", MinMaxScaler()),
        ("model", LassoCV(cv=tscv, eps=1e-9, n_alphas=16, n_jobs=-1)),
    ]
)

# train a model
pipeline.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :])
features.index = stocks[i].index features = features.dropna() #features = features.iloc[np.where(features.index=='1998-5-5')[0][0]:np.where(features.index=='2015-5-5')[0][0]] stocks_indicators[i] = features return stocks_indicators #create model stocks_indicators = get_indicators(stocks,5) for j in stocks: X = stocks_indicators[j].iloc[:, :-1].astype('float') y = stocks_indicators[j].iloc[:, -1].astype('float') tscv = TimeSeriesSplit(n_splits=2) tscv.get_n_splits(X) for train_index, test_index in tscv.split(X): # print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X.iloc[train_index], X.iloc[test_index] y_train, y_test = y.iloc[train_index], y.iloc[test_index] scaler = StandardScaler().fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) classifier = Sequential() classifier.add(Dense(units=128, kernel_initializer='uniform', activation='relu', input_dim=X.shape[1])) classifier.add(Dense(units=128, kernel_initializer='uniform', activation='relu')) classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid')) classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) classifier.fit(X_train, y_train, batch_size = 10, epochs = 100) y_pred = classifier.predict(X_test) y_pred[y_pred > 0.5] = 1
def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Make a single forecast with a Bayesian Ridge Regression model

    The model is (re)trained only when no model is cached yet or when the
    cached one has served ``train_frequency`` forecasts; otherwise the cached
    pipeline is reused.

    Parameters
    ----------
    df : pandas DataFrame
        the training (streamed) data to model

    Returns
    -------
    predictions : pandas DataFrame
        the forecast -> (1 row, W columns) where W is the forecast_window
    """
    # preprocess the data for supervised machine learning
    X, Y, X_new = self.preprocessing(df, binary=False)

    # NOTE(review): attributes are written through object.__setattr__,
    # presumably because the owning class blocks normal assignment - confirm
    if self._counter >= self.train_frequency or self._model is None:
        object.__setattr__(self, "_counter", 0)

        # set up a machine learning pipeline
        model = MultiOutputRegressor(BayesianRidge(), n_jobs=N_JOBS)
        pipeline = Pipeline(
            [
                ("var", VarianceThreshold()),
                # ('poly', PolynomialFeatures(2)),  # longer run time, potentially more accurate
                # ('var2', VarianceThreshold()),  # use this if 'poly' is used
                # ('shape', QuantileTransformer(output_distribution="normal")),  # make input variables normally distributed
                ("scale", MinMaxScaler()),
                ("model", model),
            ]
        )

        if self.tune_model:
            # Cross validation for time series. The splitter object itself is
            # passed as `cv`: an integer would make the search use plain
            # K-fold and validate on rows older than the training rows.
            tscv = TimeSeriesSplit(n_splits=3)

            # set up the tuner
            parameters = {
                "model__estimator__n_iter": [300],
                "model__estimator__tol": [1e-3],
                "model__estimator__alpha_1": [1e-2, 1e-6, 1e-10],
                "model__estimator__lambda_1": [1e-2, 1e-6, 1e-10],
                "model__estimator__alpha_2": [1e-2, 1e-6, 1e-10],
                "model__estimator__lambda_2": [1e-2, 1e-6, 1e-10],
            }
            grid = RandomizedSearchCV(
                pipeline,
                parameters,
                n_iter=16,
                cv=tscv,
                random_state=0,
                n_jobs=1,
            )

            object.__setattr__(
                self,
                "_model",
                grid.fit(X, Y).best_estimator_,  # search for the best model
            )
        else:
            object.__setattr__(
                self, "_model", pipeline.fit(X, Y)  # train the model
            )

    predictions = self._model.predict(X_new)  # forecast
    predictions = pd.DataFrame(predictions)
    object.__setattr__(self, "_counter", self._counter + 1)
    return predictions
# Scale every column into the (0, 1) range and restore the column labels
scaler = MinMaxScaler(feature_range=(0, 1))
raw_data = pd.DataFrame(scaler.fit_transform(raw_data))
raw_data.columns = heads

# Feature matrix and target
X = raw_data.drop(['OP_TIME', 'BILL_USER'], axis=1)
# poly = PolynomialFeatures(degree=2)
# poly_X = pd.DataFrame(poly.fit_transform(X))
# print(poly.get_feature_names(X.columns))
poly_X = X
y = raw_data['BILL_USER']

# Outer split: 3 forward-chaining folds
# kf = KFold(n_splits=10)
kf = TimeSeriesSplit(n_splits=3)
kf.get_n_splits(poly_X)

regressions = ['Lasso', 'Ridge', 'GradientBoostingRegression']
print("start trainning %s model..." % (regressions[0]))

# Nested cross-validation: each outer fold picks alpha with a 10-fold LassoCV
scores = []
lasso_models = []
mean_performance = []
for train_index, test_index in kf.split(poly_X):
    # print("TRAIN:", train_index, "TEST:", test_index)
    print("start training...")
    X_train = poly_X.iloc[train_index]
    X_test = poly_X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    lassocv = linear_model.LassoCV(cv=10, max_iter=1500)
    lassocv.fit(X_train, y_train)
    print("alpha is %s" % (lassocv.alpha_))