def do_time_ml(ticker):
    """Walk-forward evaluate a voting ensemble on two label columns of ``ticker``.

    Features and labels come from ``extract_featuresets(ticker)``. For each of
    the five ``TimeSeriesSplit`` folds the ensemble is fitted and scored first
    on ``'<ticker>_target'`` and then on ``'<ticker>_10d pred'``; accuracy and
    the distribution of predicted classes are printed for both. Nothing is
    returned.
    """
    features, labels = extract_featuresets(ticker)
    num_splits = 5
    splitter = TimeSeriesSplit(n_splits=num_splits)

    # Estimators must be instantiated (note the parentheses); passing the bare
    # classes fails with:
    # TypeError: get_params() missing 1 required positional argument: 'self'
    ensemble = VotingClassifier([
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
        ('gap', GaussianProcessClassifier()),
        ('bag', BaggingClassifier()),
        ('nn', MLPClassifier(max_iter=2000)),
    ])

    target_col = '{}_target'.format(ticker)
    pred_col = '{}_10d pred'.format(ticker)

    for fold, (train_pos, test_pos) in enumerate(splitter.split(features), start=1):
        X_train, X_test = features.iloc[train_pos], features.iloc[test_pos]
        y_train, y_test = labels.iloc[train_pos], labels.iloc[test_pos]

        ensemble.fit(X_train, y_train[target_col])
        accuracy = ensemble.score(X_test, y_test[target_col])
        predicted = ensemble.predict(X_test)
        print('Accuracy:', accuracy)
        print("Predicted Spread:", Counter(predicted))

        ensemble.fit(X_train, y_train[pred_col])
        accuracy = ensemble.score(X_test, y_test[pred_col])
        predicted = ensemble.predict(X_test)
        print('p Accuracy:', accuracy)
        print("p Predicted Spread:", Counter(predicted))

        # NOTE(review): the snapshot is written on the fold *before* the last
        # one (fold + 1 == num_splits), exactly as the original counter did.
        # Confirm whether the final fold was intended.
        if fold + 1 == num_splits:
            np.savetxt('p_aes_1020_pred.csv', predicted, delimiter=',')
            np.savetxt('t_aes_1020_pred.csv', X_test, delimiter=',')
def calc_density(self, df):
    """Flag a feature whose per-category target mean drifts between CV folds.

    ``df`` is expected to hold a ``'target'`` column plus the feature column
    under test (the first non-target column is used). For each of five
    ``TimeSeriesSplit`` folds, the per-category target mean and row count are
    computed on the training and validation parts (categories with at most
    ``thrs`` rows are ignored), the two tables are inner-joined on category,
    and every joined row is passed to ``self.t_test``.

    Returns:
        tuple: ``(col, True)`` when more than 10% of all compared categories
        produced a p-value below 0.01 or above 0.99, else ``(col, False)``.
    """
    n_splits = 5
    thrs = 3  # a category needs more than this many rows to be compared
    col = list(set(df.columns) - set(['target']))[0]
    tss = TimeSeriesSplit(n_splits=n_splits)

    def _target_stats(row_idx):
        """Target mean ('avg') and count ('cnt') per frequent category."""
        counts = df[col].iloc[row_idx].value_counts()
        frequent = counts.loc[counts > thrs].index
        rows = df.iloc[row_idx]
        rows = rows.loc[rows[col].isin(frequent)]
        # Dict-based renaming inside SeriesGroupBy.agg was removed in
        # pandas 1.0; aggregate by function name and rename afterwards so
        # the result has the same 'avg'/'cnt' columns on every version.
        stats = rows.groupby(col)['target'].agg(['mean', 'count'])
        return stats.rename(columns={'mean': 'avg', 'count': 'cnt'})

    cnt = 0
    total_cnt = 0
    for idx, (train_idx, valid_idx) in enumerate(tss.split(df)):
        if idx > 1:
            # From the third fold on, randomly subsample the training rows
            # so the training part does not keep growing with the fold index.
            train_len = int(len(valid_idx) * (idx + 1) / 2)
            train_idx = pd.Index(
                np.random.choice(train_idx, train_len, replace=False))

        train_mean = _target_stats(train_idx)
        valid_mean = _target_stats(valid_idx)

        # Compare the two distributions category by category.
        prop = pd.merge(train_mean, valid_mean, how='inner',
                        left_index=True, right_index=True)
        total_cnt += len(prop)
        for _, cat in prop.iterrows():
            p_value = self.t_test(cat)
            if p_value < 0.01 or p_value > 0.99:
                cnt += 1

    return (col, cnt > total_cnt / 10)
def do_cross_val(self, X, Y):
    """Run walk-forward cross-validation of ``self.model`` and record results.

    For every ``TimeSeriesSplit`` fold (``self.n_splits`` folds) the model is
    refitted on the training part and the following are appended to the
    instance lists: residuals (prediction minus truth), true values, index
    arrays and predictions for both train and test parts, the test MSE
    (``self.mses``) and the test mean absolute percentage error in percent
    (``self.mapes``).

    Args:
        X: feature array indexable by integer index arrays.
        Y: target array indexable by integer index arrays. No transform is
            applied here; any log transform must be done by the caller.
    """
    tscv = TimeSeriesSplit(n_splits=self.n_splits)
    for fold, (i_train, i_test) in enumerate(tscv.split(X), start=1):
        self.model.fit(X[i_train], Y[i_train])

        # Predict once per part instead of once per metric.
        train_pred = self.model.predict(X[i_train])
        test_pred = self.model.predict(X[i_test])
        train_true, test_true = Y[i_train], Y[i_test]

        self.train_resid_Y.append(
            [y - y_true for y_true, y in zip(train_true, train_pred)])
        self.train_true_Y.append(train_true)
        self.train_X.append(i_train)
        self.train_pred_Y.append(list(train_pred))

        self.test_resid_Y.append(
            [y - y_true for y, y_true in zip(test_pred, test_true)])
        self.test_true_Y.append(test_true)
        self.test_X.append(i_test)
        self.test_pred_Y.append(list(test_pred))

        self.mses.append(mean_squared_error(test_true, test_pred))
        # MAPE needs the absolute value; without it positive and negative
        # errors cancel and the figure is a signed mean percentage error.
        self.mapes.append(100 * np.mean(
            [abs((y - y_true) / y_true)
             for y, y_true in zip(test_pred, test_true)]))
        print(
            f"Fold {fold} complete with: {len(i_train)} training samples and {len(i_test)} test samples."
        )
def cv_fit(self,n_splits, X_train,_y_train,regressor, **kwargs):
    '''
    Cross validates a regressor with a Time Series Split.

    Fits the model on each training split, scores it on the matching
    validation split and stores the rounded mean score.

    INPUT:
        n_splits (int): number of Time Series splits
        X_train (arr): features, indexable by integer index arrays
        _y_train (arr): targets, indexable by integer index arrays
        regressor (class): regression model class, instantiated with kwargs
        kwargs: keyword arguments forwarded to the regressor constructor
    OUTPUT:
        None. Sets self.name (model class name), self.score (mean of the
        fold scores rounded to 3 decimals) and self.reg (the model as fitted
        on the last split).
    '''
    # The parameter is spelled `_y_train`, but the body used `y_train`, which
    # raised NameError (or silently picked up a global). Alias it locally so
    # the public signature stays untouched.
    y_train = _y_train

    tscv = TimeSeriesSplit(n_splits=n_splits)
    scores = []
    reg = regressor(**kwargs)
    for train_index, test_index in tscv.split(X_train, y_train):
        X_val_train, X_val_test = X_train[train_index], X_train[test_index]
        y_val_train, y_val_test = y_train[train_index], y_train[test_index]
        reg.fit(X_val_train, y_val_train)
        scores.append(reg.score(X_val_test, y_val_test))

    self.name = reg.__class__.__name__
    self.score = round(np.mean(scores), 3)
    self.reg = reg
def train_arma(self):
    """Fit an AR(18) model on each of ten walk-forward folds of ``self.dt``.

    Per fold: fit ``ARMA(order=(18, 0))`` on the training slice, forecast as
    many steps as the test slice is long, plot training data, test data and
    the forecast, then hand test values and forecast to
    ``self.report_metrics``.
    """
    splitter = TimeSeriesSplit(n_splits=10)
    for train_pos, test_pos in splitter.split(self.dt):
        # NOTE(review): plain [] indexing with integer arrays is kept as in
        # the original; confirm self.dt's type makes this positional.
        train = self.dt[train_pos]
        test = self.dt[test_pos]

        fitted = ARMA(train, order=(18, 0)).fit(disp=-1)

        plt.figure(figsize=(16, 6))
        plt.title('ARMA Model on Aggregate Data')
        plt.plot(train, label='Training Actual Occupancy Rate')
        plt.xlabel('Date')
        plt.ylabel('Percent Occupied')

        # forecast() returns a tuple; element 0 holds the point forecasts.
        point_forecast = fitted.forecast(steps=len(test))[0]
        y_pred_AR = pd.Series(point_forecast, index=test.index)

        plt.plot(test, label='Testing Actual Occupancy Rate')
        plt.plot(y_pred_AR, color='purple',
                 label='ARMA Predicted Occupancy Rate')
        plt.legend()
        plt.show()

        print('ARMA Model Metrics on Test Data')
        self.report_metrics(test.squeeze(), y_pred_AR.squeeze())
def feed_consumers(self, data, symbols, unprocessed_symbols):
    """Publish every walk-forward fold of every symbol to the message queue.

    Symbols listed in ``unprocessed_symbols`` are skipped. For each remaining
    symbol, ten ``TimeSeriesSplit`` folds are built from ``data[symbol]``;
    each fold is encoded with ``encode_data`` and published to the
    ``'tpot_data'`` routing key while holding ``self.lock``, and
    ``self.to_process`` is incremented. ``self.done`` is set when finished.
    """
    logger.info("Sending ready")
    splitter = TimeSeriesSplit(n_splits=10)

    for symbol in symbols:
        if symbol in unprocessed_symbols:
            continue

        features = data[symbol].data
        target = data[symbol].target
        fold_iter = enumerate(splitter.split(features), start=1)
        for folds_index, (train_pos, test_pos) in fold_iter:
            X_train = features.values[train_pos]
            X_test = features.values[test_pos]
            y_train = target[train_pos]
            y_test = target[test_pos]

            logger.info("processing %s %d", symbol, folds_index)
            payload = (symbol, X_train, X_test, y_train, y_test, folds_index)
            mq_body = encode_data(payload)

            with self.lock:
                self._data.basic_publish(exchange='',
                                         routing_key='tpot_data',
                                         body=mq_body)
                self.to_process += 1

    self.done = True
def split(self, df, y=None, groups=None):
    """Yield time-series CV folds built independently inside each group.

    Rows of ``df`` are grouped by ``self.groupby``; every group gets its own
    ``TimeSeriesSplit`` and the k-th fold yielded here is the concatenation
    of the k-th fold of every group, expressed as positions in ``df``.

    Args:
        df: DataFrame to split; validated with ``self._validate_df``.
        y: optional targets, indexed per group with the group's positions.
        groups: ignored; present only for scikit-learn API compatibility
            (groups are always derived from ``self.groupby``).

    Yields:
        tuple: ``(train_positions, test_positions)`` as numpy arrays.
    """
    self._validate_df(df)
    group_positions = df.groupby(self.groupby).indices

    fold_iters = {}
    for key, positions in group_positions.items():
        sub_df = df.iloc[positions]
        sub_y = y[positions] if y is not None else None
        # max_train_size is keyword-only in current scikit-learn, so it must
        # not be passed positionally.
        splitter = TimeSeriesSplit(n_splits=self.n_splits,
                                   max_train_size=self.max_train_size)
        fold_iters[key] = splitter.split(sub_df, sub_y)

    while True:
        train_parts, test_parts = [], []
        for key, positions in group_positions.items():
            try:
                sub_train, sub_test = next(fold_iters[key])
            except StopIteration:
                continue
            # `positions` already holds this group's row positions in df, so
            # map the within-group fold indices through it directly. This
            # replaces a label -> get_loc round-trip that was slow and
            # returned slices/masks for non-unique indexes.
            positions = np.asarray(positions)
            train_parts.append(positions[sub_train])
            test_parts.append(positions[sub_test])

        if not train_parts:
            break
        yield np.concatenate(train_parts), np.concatenate(test_parts)
def SVM_TimeSplit(X, y, n_splits, algoritmo, name):
    """Produce walk-forward out-of-sample predictions with ``algoritmo``.

    Args:
        X: feature array indexable by integer index arrays.
        y: target array; copied so the input is not modified.
        n_splits: number of ``TimeSeriesSplit`` folds.
        algoritmo: estimator instance; the same object is refitted per fold.
        name: label printed before the run.

    Returns:
        tuple: ``(yhat, clf)`` where ``yhat`` is a copy of ``y`` whose test
        positions were overwritten with predictions, or ``(0, clf)`` if
        fitting failed on any fold.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    print(name)
    print(tscv)

    acc = []
    yhat = y.copy()
    clf = algoritmo
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        try:
            clf.fit(X_train, y_train.ravel())
        except Exception:
            # Best-effort: report failure through the sentinel return value.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return 0, clf

        yhat[test_index] = clf.predict(X_test)
        # Count of correctly classified samples in this fold (kept for
        # parity with the original, which also computed it per fold).
        acc.append(
            metrics.accuracy_score(yhat[test_index].round(),
                                   y_test.round(),
                                   normalize=False))
    return yhat, clf
def nestedCV(model_params, train_dataset, freq, k=3):
    """Score one SARIMAX configuration with walk-forward cross-validation.

    Args:
        model_params: pair ``(order, seasonal_order)`` passed to SARIMAX as
            ``(p, d, q)`` and ``(sp, sd, sq, s)``.
        train_dataset: pandas object holding the series. Its index is
            replaced in place by a ``DatetimeIndex`` with frequency ``freq``.
        freq: frequency for the rebuilt index.
        k: number of ``TimeSeriesSplit`` folds.

    Returns:
        Mean over folds of the root mean squared error; folds whose
        predictions contain NaN are left out.
    """
    train_dataset.index = pd.DatetimeIndex(train_dataset.index.values,
                                           freq=freq)
    order, seasonal_order = model_params[0], model_params[1]
    splitter = TimeSeriesSplit(n_splits=k)

    fold_errors = []
    for train_pos, val_pos in splitter.split(train_dataset):
        cv_train = train_dataset.iloc[train_pos]
        cv_val = train_dataset.iloc[val_pos]

        fitted = sm.tsa.SARIMAX(endog=cv_train,
                                order=order,
                                seasonal_order=seasonal_order,
                                enforce_stationarity=False,
                                enforce_invertibility=False).fit()

        val_stamps = cv_val.index.values
        predictions = fitted.predict(val_stamps[0], val_stamps[-1])

        # The model occasionally produces NaN predictions; skip those folds.
        if np.isnan(predictions.values).any():
            continue
        fold_errors.append(
            math.sqrt(mean_squared_error(cv_val.values, predictions.values)))

    return np.mean(fold_errors)
def run_test(data):
    """One-step-ahead walk-forward test of the module-level ``model``.

    ``data`` becomes a column vector ``y``; the regressor input is an evenly
    spaced grid on [-2, 8] of the same length. With ``N - 1`` splits, each
    fold predicts a single next point, whose prediction is collected. Metrics
    come from ``eval_metrics`` and are printed.

    Returns:
        tuple: ``(rmse, mae, r2, profit, accuracy, parameters)`` where
        ``parameters`` is ``model.get_params()``.
    """
    y = np.array(data)[:, np.newaxis]
    N = len(y)
    X = np.linspace(-2, 8, N)[:, np.newaxis]
    splitter = TimeSeriesSplit(n_splits=N - 1)
    print("Starting model %s" % str(model))

    predictions = []
    for train_pos, test_pos in splitter.split(y):
        fitted = model.fit(X[train_pos], y[train_pos])
        # Keep only the scalar prediction for the first test point.
        predictions.append(fitted.predict(X[test_pos])[0][0])

    # NOTE(review): `predictions` has one entry per fold while y.flatten()
    # has N entries; presumably eval_metrics handles the alignment - confirm.
    scores = eval_metrics(y.flatten(), np.array(predictions))
    rmse, mae, r2, profit, accuracy = scores

    print(" Profit: %s" % profit)
    print(" Accuracy: %s" % accuracy)
    print(" RMSE: %s" % rmse)
    print(" MAE: %s" % mae)
    print(" R2: %s" % r2)

    return rmse, mae, r2, profit, accuracy, model.get_params()
def timeseriesCVscore(params, series, loss_function=mean_squared_error, slen=12):
    """
        Returns the mean cross-validation error of a Holt-Winters model

        params - (alpha, beta, gamma) smoothing parameters to evaluate
        series - dataset with timeseries (its .values are used)
        loss_function - called as loss_function(predictions, actual)
        slen - season length for Holt-Winters model
    """
    values = series.values
    alpha, beta, gamma = params
    splitter = TimeSeriesSplit(n_splits=3)

    def fold_error(train, test):
        # Fit on the training slice and forecast len(test) steps ahead; the
        # forecasts are the tail of model.result.
        horizon = len(test)
        model = HoltWinters(series=values[train], slen=slen,
                            alpha=alpha, beta=beta, gamma=gamma,
                            n_preds=horizon)
        model.triple_exponential_smoothing()
        return loss_function(model.result[-horizon:], values[test])

    errors = [fold_error(train, test)
              for train, test in splitter.split(values)]
    return np.mean(np.array(errors))
def _last_ols_residual(X, Y):
    """Regress Y on X plus an intercept (NaN rows dropped); return the last residual."""
    design = sm.add_constant(X)
    return OLS(Y, design, missing='drop').fit().resid.iloc[-1]


def reg2(T):
    """Rolling 50-observation OLS of ``bench`` on ``T``, keeping last residuals.

    Uses and increments the module-level counter ``i`` (printed as a progress
    indicator) and reads the module-level series ``bench``.

    For each walk-forward fold the most recent ``window`` training rows are
    regressed and the residual of the final row is stored at that row's
    position; the very last row is handled separately after the loop.

    Args:
        T: pandas Series of regressor values.

    Returns:
        A Series of residuals indexed like ``T`` (NaN where none was
        computed), or ``T`` itself when ``T`` - or its final window - is
        entirely NaN.
    """
    global i
    print(i)
    i += 1

    # Nothing to regress on if the whole series is NaN.
    if T.isnull().sum() == T.shape[0]:
        return T

    window = 50
    tscv = TimeSeriesSplit(n_splits=T.shape[0] - window + 1)
    # np.nan instead of the np.NAN alias, which NumPy 2.0 removed.
    new_dd = pd.Series(np.nan, index=T.index)

    for train_index, _ in tscv.split(T):
        recent = train_index[-window:]
        X, Y = T.iloc[recent], bench.iloc[recent]
        # Skip windows that are entirely NaN.
        if X.isnull().sum() != X.shape[0]:
            new_dd.iloc[train_index[-1]] = _last_ols_residual(X, Y)

    # The final observation is never the end of a training fold; do it here.
    X, Y = T.iloc[-window:], bench.iloc[-window:]
    if X.isnull().sum() == X.shape[0]:
        return T
    new_dd.iloc[-1] = _last_ols_residual(X, Y)
    return new_dd
def train_models(x, y, params, folds=5):
    """Train one ``LGBMRegressor`` per walk-forward fold and return them all.

    Args:
        x: feature DataFrame (indexed positionally with ``.iloc``).
        y: target Series/DataFrame (indexed positionally with ``.iloc``).
        params: mapping that must contain every hyper-parameter listed in
            ``hyper_keys`` below.
        folds: number of ``TimeSeriesSplit`` folds.

    Returns:
        list: fitted models, one per fold, in fold order. The RMSLE of each
        fold's test part is printed as it is computed.
    """
    hyper_keys = (
        "colsample_bytree", "learning_rate", "max_depth",
        "min_child_samples", "min_sum_hessian_in_leaf", "n_estimators",
        "num_leaves", "reg_alpha", "reg_lambda", "subsample",
        "tree_learner", "boosting_type", "objective",
    )
    splitter = TimeSeriesSplit(n_splits=folds, max_train_size=None)

    models = []
    for train_idx, test_idx in splitter.split(x):
        x_tr, x_te = x.iloc[train_idx], x.iloc[test_idx]
        y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]

        model = LGBMRegressor(**{key: params[key] for key in hyper_keys})
        model.fit(x_tr, y_tr)
        print(rmsle(y_true=y_te, y_pred=model.predict(x_te)))
        models.append(model)

        # Drop the fold's slices right away and report what gc reclaimed.
        del x_tr, x_te, y_tr, y_te, model
        print(gc.collect())
    return models
def fnGridSearchModel(trainX, trainY, nn_model, nb_epoch, batch_size, param_dict):
    """Grid-search layer count and width of a Keras regressor with time-series CV.

    The grid covers 3-6 layers, 1x-3x ``numFeatures`` neurons (module-level
    name) and a dropout of 0.2. Two ``TimeSeriesSplit`` folds are used, each
    index array first passed through ``fnSliceOffDataPerBatchSize``.
    ``param_dict`` is accepted but not used.

    Returns:
        tuple: ``(best_score, best_params)`` from the fitted grid search.
    """
    regressor = KerasRegressor(build_fn=nn_model, nb_epoch=nb_epoch,
                               batch_size=batch_size, verbose=2)
    param_grid = {
        'nLayers': [3, 4, 5, 6],
        'numNeurons': [multiple * numFeatures for multiple in (1, 2, 3)],
        'nDropout': [.2],
    }

    cv_folds = []
    for train_pos, test_pos in TimeSeriesSplit(n_splits=2).split(trainX):
        trimmed_train = fnSliceOffDataPerBatchSize(pFeatures=train_pos,
                                                   pBatch_Size=batch_size)[0]
        trimmed_test = fnSliceOffDataPerBatchSize(pFeatures=test_pos,
                                                  pBatch_Size=batch_size)[0]
        cv_folds.append((trimmed_train, trimmed_test))

    search = GridSearchCV(estimator=regressor, param_grid=param_grid,
                          n_jobs=1, cv=cv_folds)
    fitted = search.fit(trainX, trainY)
    print("Best: %f using %s" % (fitted.best_score_, fitted.best_params_))
    return fitted.best_score_, fitted.best_params_
def timeseriesCVscore(series, model, loss_function, n_splits=3):
    """
    Mean walk-forward cross-validation error of a forecasting model.

    :param series: pandas object holding the time series (its .values are used)
    :param model: object exposing fit(values) and predict(steps=...)
    :param loss_function: called as loss_function(actual, predictions)
    :param n_splits: number of TimeSeriesSplit folds
    :return: mean of the per-fold errors
    """
    values = series.values
    splitter = TimeSeriesSplit(n_splits=n_splits)

    def fold_error(train, test):
        # Refit on the training slice, then forecast the test horizon.
        # TODO add interface for learning
        model.fit(values[train])
        forecast = model.predict(steps=len(test))
        return loss_function(values[test], forecast)

    errors = [fold_error(train, test)
              for train, test in splitter.split(values)]
    return np.mean(np.array(errors))
def panel_split(n_folds, groups, grouping_var='date_of_transaction'):
    """
    Generate time series splits of a panel.

    Distinct values of ``grouping_var`` are ranked in sorted order and
    ``TimeSeriesSplit`` is applied to those ranks, so all rows sharing a
    date always fall on the same side of a split.

    Args:
        n_folds: number of folds.
        groups: DataFrame containing ``grouping_var``. Presumably it has a
            default (unnamed) index, since ``reset_index()`` is expected to
            create an ``'index'`` column - verify against callers.
        grouping_var: name of the date column used for ordering.

    Yields:
        tuple: ``(panel_train_indices, panel_test_indices)``, lists of the
        original index labels of ``groups``, for compliance with the
        scikit-learn CV-iterable API.
    """
    # Rank the distinct dates 0..k-1 in chronological order. The index must
    # be dropped before numbering: otherwise 'tsidx' would carry the original
    # row label of each date's first occurrence, which does not line up with
    # the positional indices TimeSeriesSplit produces below.
    date_idx = (groups[[grouping_var]]
                .drop_duplicates()
                .sort_values(grouping_var)
                .reset_index(drop=True)
                .reset_index()
                .rename({'index': 'tsidx'}, axis=1))

    by_ticker_index = groups.reset_index().rename({'index': 'panel_index'},
                                                  axis=1)
    by_ticker_index = (pd.merge(by_ticker_index, date_idx, on=grouping_var)
                       .sort_values('panel_index')
                       .set_index('panel_index'))

    ticker_range = sorted(by_ticker_index['tsidx'].unique().tolist())
    splits = TimeSeriesSplit(n_splits=n_folds)
    for train_indices, test_indices in splits.split(ticker_range):
        in_train = by_ticker_index['tsidx'].isin(train_indices)
        in_test = by_ticker_index['tsidx'].isin(test_indices)
        yield (by_ticker_index[in_train].index.tolist(),
               by_ticker_index[in_test].index.tolist())
def LGB_bayesian(self, learning_rate, num_leaves, bagging_fraction, feature_fraction, min_child_weight, min_data_in_leaf, max_depth, reg_alpha, reg_lambda):
    """Objective for Bayesian optimisation: validation AUC of a LightGBM model.

    Trains a binary LightGBM model on each of five ``TimeSeriesSplit`` folds
    of ``self.train`` / ``self.targetCol`` with the given hyper-parameters
    and returns the ROC AUC of the model trained on the final fold, scored
    on that fold's validation rows taken from ``self.train_df``.

    NOTE(review): only the last fold contributes to the returned score, and
    training reads ``self.train``/``self.targetCol`` while scoring reads
    ``self.train_df``/``self.target`` - confirm both are intended.
    """
    # The optimiser proposes floats; LightGBM needs these three as integers.
    num_leaves, min_data_in_leaf, max_depth = (
        int(num_leaves), int(min_data_in_leaf), int(max_depth))
    for int_param in (num_leaves, min_data_in_leaf, max_depth):
        assert type(int_param) == int

    BayesianParams = {
        'num_leaves': num_leaves,
        'min_data_in_leaf': min_data_in_leaf,
        'min_child_weight': min_child_weight,
        'bagging_fraction': bagging_fraction,
        'feature_fraction': feature_fraction,
        'learning_rate' : learning_rate,
        'max_depth': max_depth,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'objective': 'binary',
        'save_binary': True,
        'seed': 1337,
        'feature_fraction_seed': 1337,
        'bagging_seed': 1337,
        'drop_seed': 1337,
        'data_random_seed': 1337,
        'boosting_type': 'gbdt',
        'verbose': 1,
        'is_unbalance': False,
        'boost_from_average': True,
        'metric': 'auc',
    }

    folds = TimeSeriesSplit(n_splits=5)
    fold_iter = folds.split(self.train, self.targetCol)
    for fold_no, (bayesian_tr_idx, bayesian_val_idx) in enumerate(fold_iter, start=1):
        print('Training on fold {}'.format(fold_no))
        trn_data = lgb.Dataset(self.train.iloc[bayesian_tr_idx],
                               label=self.targetCol.iloc[bayesian_tr_idx])
        val_data = lgb.Dataset(self.train.iloc[bayesian_val_idx],
                               label=self.targetCol.iloc[bayesian_val_idx])
        clf = lgb.train(BayesianParams, trn_data, 10000,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=1000,
                        early_stopping_rounds=10,
                        categorical_feature=['TransactionHour'])

    # Score the last fold's model on the last fold's validation rows.
    features = list(self.train)
    val_rows = self.train_df.iloc[bayesian_val_idx]
    oof = np.zeros(len(self.train))
    oof[bayesian_val_idx] = clf.predict(val_rows[features].values,
                                        num_iteration=clf.best_iteration)
    return roc_auc_score(val_rows[self.target].values, oof[bayesian_val_idx])
def execute():
    """Grid-search ``CLF`` for every (lag, country) pair and save the winners.

    Relies on module-level names: ``LAGS``, ``COUNTRIES``, ``CLF``,
    ``parameters``, ``get_data``, ``result`` and ``config``. For each pair a
    10-fold time-series grid search is run, and the best value of every
    searched parameter is appended to ``result``. After all searches,
    ``result`` is written as JSON to ``config['gridsearch-parameters']``.
    """
    for lg in LAGS:
        for c in COUNTRIES:
            # get the data
            X, y = get_data(c, lg)

            # define the splits
            tscv = TimeSeriesSplit(n_splits=10)

            # Scoring follows the convention that higher values are better
            # (hence the `neg').
            grid_search = GridSearchCV(CLF, parameters,
                                       scoring='neg_mean_absolute_error',
                                       cv=tscv.split(X),
                                       n_jobs=-1, verbose=1)
            print("Performing grid search...")
            t0 = time()
            grid_search.fit(X, y)
            print("done in %0.3fs" % (time() - t0))

            # record best result, restricted to the parameters searched over
            par_res = grid_search.best_estimator_.get_params()
            result.append({
                'params': {pn: par_res[pn] for pn in parameters.keys()},
                'country': c,
                'lag': lg,
            })

    # Use a context manager so the file is flushed and closed even if
    # serialisation fails; the original left the handle to the GC.
    with open(config['gridsearch-parameters'], 'w') as out_file:
        json.dump(result, out_file)
    print("Done")
def get_cv_split(y, method, **init_params):
    """Build the list of CV folds for ``y`` using the requested strategy.

    Args:
        y: data to split. For ``method="storms"`` it must expose
            ``y.storms.index``, which is used as the group labels.
        method: ``"timeseries"`` (``TimeSeriesSplit``, no groups) or
            ``"storms"`` (``GroupKFold`` grouped by storm).
        **init_params: forwarded to the splitter's constructor.

    Returns:
        list: ``(train_indices, test_indices)`` tuples.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if method == "timeseries":
        splitter = TimeSeriesSplit(**init_params)
        groups = None
    elif method == "storms":
        splitter = GroupKFold(**init_params)
        # NOTE: y should be a pandas object with storm index, so the storm
        # index itself serves as the group labels.
        groups = y.storms.index
    else:
        # Previously an unknown method fell through and crashed later with
        # an UnboundLocalError on `splitter`; fail with a clear message.
        raise ValueError(
            f"Unknown CV method {method!r}; expected 'timeseries' or 'storms'")

    return list(splitter.split(y, groups=groups))
def wrapper_fit_pred_val(serie, order, s_order, **kwargs):
    """Fit, evaluate and validate a SARIMA configuration over three folds.

    Each ``TimeSeriesSplit`` fold (3 splits, 10 test points, gap of 10) runs
    the ``model_fit`` -> ``model_eval`` -> ``model_pred`` -> ``model_fcst`` ->
    ``model_val`` pipeline on a shared result dict. After the last fold every
    entry of ``dct_model['validation_score']`` is divided by the number of
    splits.

    Args:
        serie: pandas Series to model.
        order: non-seasonal order passed to ``model_fit``.
        s_order: seasonal order passed to ``model_fit``.
        **kwargs: forwarded to every pipeline step.

    Returns:
        dict with the accumulated results, or ``None`` if any step raised
        (the failing stage is printed).
    """
    dct_model = {}
    stop_at = 'fit'  # last stage reported by the pipeline; used in the error message
    tscv = TimeSeriesSplit(n_splits=3, test_size=10, gap=10)
    try:
        for train_idx, val_idx in tscv.split(serie):
            print(
                "ajustement:",
                f"[{serie.index[train_idx[0]]} --> {serie.index[train_idx[-1]]}]",
                "-- validation:",
                f"[{serie.index[val_idx[0]]} --> {serie.index[val_idx[-1]]}]")
            s_train = serie.iloc[train_idx]
            s_valid = serie.iloc[val_idx]

            model_fit(dct_model, s_train, order, s_order, **kwargs)
            stop_at = model_eval(dct_model, **kwargs)
            stop_at = model_pred(dct_model, s_train, **kwargs)
            stop_at = model_fcst(dct_model, s_valid, **kwargs)
            stop_at = model_val(dct_model, s_valid, **kwargs)

        # Turn the per-fold sums into averages.
        for key in dct_model["validation_score"].keys():
            dct_model['validation_score'][key] /= tscv.n_splits
    except Exception:
        # Deliberate best-effort: a failing configuration is reported and
        # skipped. Catching Exception rather than using a bare except keeps
        # Ctrl-C (KeyboardInterrupt) and SystemExit working.
        print(
            f"Une exception est survenue pour: SARIMA {order}{s_order} -- [{stop_at}]"
        )
        dct_model = None
    return dct_model
def testSplit(df):
    '''
    Check that train/test splitting happens on dates rather than row counts.

    Unique dates of ``df.date`` are split either at a fixed cutoff date
    (backtest mode, currently hard-wired on) or with ``TimeSeriesSplit``;
    the date range of each resulting train and test frame is printed.
    '''
    split = 2
    splits = TimeSeriesSplit(max_train_size=4025, n_splits=split)
    dates = np.unique(df.date)
    backtest = 1
    # cutoff_date = '2018-03-23T00:00:00.000000000'
    cutoff_date = np.datetime64('2016-01-04')

    if backtest == 1:
        # Single fold: everything before the cutoff trains, the rest tests.
        before = np.where(dates < cutoff_date)[0]
        from_cutoff = np.where(dates >= cutoff_date)[0]
        date_folds = [(before, from_cutoff)]
    else:
        date_folds = splits.split(dates)

    for train_date_index, test_date_index in date_folds:
        train = df[df.date.isin(dates[train_date_index])]
        test = df[df.date.isin(dates[test_date_index])]
        print("\ntrain", min(train.date), max(train.date))
        print("test ", min(test.date), max(test.date))
def grid_search(models, params, X_train, y_train, n_splits=3, n_iter=5):
    """
    Randomized-search CV over several models to get their best estimators.

    :param models: dictionary of models to be used, keyed by name
    :param params: dictionary of parameter distributions, keyed like models
    :param X_train: training explanatory features
    :param y_train: training target feature (array with a reshape method)
    :param n_splits: number of time-series cross validation folds
    :param n_iter: number of parameter combinations to sample per model
    :return: dictionary of best estimators; models whose search raised are
        logged and left out
    """
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    tscv = TimeSeriesSplit(n_splits=n_splits)
    best_estimators = {}
    for key, model in models.items():
        print(key, ' search')
        try:
            model_param_search = RandomizedSearchCV(
                estimator=model,
                param_distributions=params[key],
                scoring='neg_mean_squared_error',
                n_jobs=4,
                verbose=0,
                random_state=42,
                n_iter=n_iter,
                cv=list(tscv.split(X_train)))
            model_param_search.fit(X_train, y_train.reshape(len(y_train), ))
            best_estimators[key] = model_param_search.best_estimator_
        except Exception as e:
            # The original passed str(e) as a %-argument to a message with
            # no placeholder, so logging itself failed and the reason was
            # lost. Use lazy %-formatting with explicit placeholders.
            logger.error('Training for %s failed: %s', key, e)
    return best_estimators
def split(
    X: np.ndarray,
    horizon: int = 12,
    min_train_size: int = None,
    max_train_size: int = None,
) -> List[Tuple[List[int]]]:
    """
    Build train/test index pairs for time series cross-fold validation.

    The number of folds is ``len(X) // horizon - 1``. Folds whose training
    set is shorter than ``min_train_size`` are dropped and every test set is
    truncated to ``horizon`` points.

    Arguments:
        X {np.ndarray} -- values to split by index

    Keyword Arguments:
        horizon {int} -- length of prediction horizon (default: {12})
        min_train_size {int} -- minimum size for a single training set. If
            None, the prediction horizon is used (default: {None})
        max_train_size {int} -- maximum size for a single training set
            (default: {None})

    Returns:
        List[Tuple[List[int]]] -- list of train/test index tuples
    """
    if min_train_size is None:
        min_train_size = horizon

    fold_count = X.shape[0] // horizon - 1
    splitter = TimeSeriesSplit(n_splits=fold_count,
                               max_train_size=max_train_size)

    filtered_splits = [
        (train_index, test_index[:horizon])
        for train_index, test_index in splitter.split(X)
        if len(train_index) >= min_train_size
    ]
    print(
        f'The dataset has been split into {len(filtered_splits)} folds for CV')
    return filtered_splits
def train_model(train_files, labels, model):
    """Fit ``model`` on eight walk-forward folds and report errors per fold.

    For each fold the model is refitted on the training part; the value of
    ``rmse`` on the training and validation parts is printed (labelled RMSLE
    in the output). After all folds, the mean validation error and that mean
    divided by the standard deviation of all validation predictions are
    printed.

    Returns:
        The model, as fitted on the last fold.
    """
    splitter = TimeSeriesSplit(n_splits=8)
    scores, preds = [], []

    fold_iter = enumerate(splitter.split(train_files, labels), 1)
    for fold_no, (train_index, val_index) in fold_iter:
        # training and validation subsets
        X_train = train_files[train_index]
        y_train = np.take(labels, train_index, axis=0)
        X_val = train_files[val_index]
        y_val = np.take(labels, val_index, axis=0)

        model.fit(X_train, y_train)

        # error on the training and validation subsets of this fold
        t_score = rmse(y_train, model.predict(X_train))
        val_preds = model.predict(X_val)
        score = rmse(y_val, val_preds)

        scores.append(score)
        preds.append(val_preds)
        print(f'Fold-{fold_no}: Train_RMSLE: {t_score}, Validation_RMSLE: {score}')

    mean_score = np.mean(scores)
    print(f'Mean_RMSLE of validation set: {np.round(mean_score, 4)}')
    print(
        f'Normalized_RMSLE using Standard Deviation:{mean_score/np.std(preds)}'
    )
    print('\n===============Finished Training====================\n')
    return model
def cross_valid(model_dict_cv, df_train, target_col, feature_col):
    """Pick the best parameter set per model using 3-fold time-series CV.

    Args:
        model_dict_cv: mapping ``model_tuple -> param_grid`` where
            ``model_tuple[1]`` is the model class and ``param_grid`` is
            expanded by ``dict_product``.
        df_train: training DataFrame; must contain a ``'Date'`` column.
        target_col: name of the label column.
        feature_col: feature column name(s).

    Returns:
        dict: ``model_tuple -> best parameter dict``, "best" meaning the
        highest mean ``'ACC'`` value from ``error_metrics`` across folds.
    """
    scoring = 'ACC'
    splitter = TimeSeriesSplit(n_splits=3)
    model_dict = {}  # keep selected parameters for each model

    for model_tuple, param_grid in model_dict_cv.items():
        # all combinations of parameters for the given model
        all_grid = list(dict_product(param_grid))
        cv_res = []
        for param in all_grid:
            model = model_tuple[1](**param)
            fold_scores = []
            for train_index, test_index in splitter.split(df_train):
                X_train = df_train[feature_col].iloc[train_index]
                X_test = df_train[feature_col].iloc[test_index]
                y_train = df_train[target_col].iloc[train_index]
                y_test = df_train[target_col].iloc[test_index]
                date_test = df_train['Date'].iloc[test_index]

                model.fit(X_train, y_train)
                res_df = pd.DataFrame.from_dict({
                    'Date': date_test,
                    'Regime': y_test,
                    'crash_prob': model.predict_proba(X_test)[:, 1],
                    'crash_binary': model.predict(X_test),
                })
                fold_scores.append(error_metrics(res_df)[scoring])
            cv_res.append(np.mean(fold_scores))

        model_dict[model_tuple] = all_grid[np.argmax(cv_res)]
    return model_dict
class Predictor(LoggerMixin, object):
    """Walk-forward wrapper around a regression model.

    The wrapped ``model`` is refitted on every ``TimeSeriesSplit`` fold when
    predicting or scoring; ``fit`` itself is a no-op.
    """

    def __init__(self, model, fold=5, file='temp.log', loglevel=logging.INFO):
        """Store the model and build a ``fold``-split time-series splitter."""
        super().__init__(file, loglevel)
        self.logger.debug('Created an instance of %s', self.__class__.__name__)
        self.model = model
        self.splitor = TimeSeriesSplit(n_splits=fold)

    @staticmethod
    def reg_error(y_predicted, y_test):
        """Root of the mean squared logarithmic error between the two inputs."""
        return sqrt(mean_squared_log_error(y_predicted, y_test))

    def fit(self, X, y=None):
        """Do nothing and return ``self``; fitting happens per fold elsewhere."""
        return self

    def predict(self, X, y=None):
        """Return a DataFrame with one column of test predictions per fold."""
        predictions = pd.DataFrame([])
        for column, (train_idx, test_idx) in enumerate(self.splitor.split(X)):
            self.model.fit(X[train_idx], y[train_idx])
            predictions[column] = self.model.predict(X[test_idx])
        return predictions

    def score(self, X, y=None):
        """Return the average ``reg_error`` over all folds (also logged)."""
        self.logger.debug('--- score starts.')
        errors = []
        for train_idx, test_idx in self.splitor.split(X):
            self.model.fit(X[train_idx], y[train_idx])
            fold_pred = self.model.predict(X[test_idx])
            error = self.reg_error(fold_pred, y[test_idx])
            errors.append(error)
            self.logger.debug('test idx {}, error: {:.6f}'.format(
                test_idx[0], error))
            print("---- split: {}".format(error))
        avg_error = mean(errors)
        self.logger.debug('avg error: {:.4f}'.format(avg_error))
        return avg_error
def tuning_hyper_parameters():
    """Grid-search the SGD logistic-regression ``alpha`` and save the best model.

    Loads the training matrices from ``path_modeling_dataset``, searches four
    log-spaced ``alpha`` values with 5-fold time-series CV scored by log
    loss, prints the CV table and the log loss on the test set, and stores
    the best estimator as ``'sgd_lr.pkl'`` under ``path_model``.
    """
    # start the timer and announce the run
    start = time()
    print('\nStart tuning hyper parameters')

    # load the training set
    X_train = load_npz(path_modeling_dataset + npz_X_train)
    y_train = np.load(path_modeling_dataset + npy_y_train)

    from sklearn.metrics import make_scorer, log_loss
    loss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

    from sklearn.model_selection import TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    # GridSearch
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import SGDClassifier

    alphas = np.logspace(-4, -1, 4)
    param_grid = {'alpha': alphas}
    generator = tscv.split(X_train)
    # return_train_score must be requested explicitly: it defaults to False
    # in current scikit-learn, and 'mean_train_score' read below would
    # otherwise be missing from cv_results_ (KeyError).
    clf = GridSearchCV(SGDClassifier(loss='log', n_jobs=-1), param_grid,
                       cv=generator, scoring=loss, n_jobs=-1,
                       return_train_score=True)

    # train the model
    clf.fit(X_train, y_train)

    # print cv_results; scores are negated losses, so flip the sign back
    cv_results_df = \
        DataFrame(clf.cv_results_)[['rank_test_score', 'param_alpha', 'mean_train_score', 'mean_test_score']]
    cv_results_df.rename(
        columns={'mean_train_score': 'mean_train_loss',
                 'mean_test_score': 'mean_val_loss',
                 'rank_test_score': 'rank_val_loss'},
        inplace=True)
    cv_results_df[['mean_val_loss', 'mean_train_loss']] = \
        -cv_results_df[['mean_val_loss', 'mean_train_loss']]
    print('cv results: ')
    print(cv_results_df)

    # free memory manually
    del X_train
    del y_train
    gc.collect()

    # load the test set
    X_test = load_npz(path_modeling_dataset + npz_X_test)
    y_test = np.load(path_modeling_dataset + npy_y_test)

    # print the logloss on the test set
    print('logloss in testset: ', -clf.score(X=X_test, y=y_test))

    # free memory manually
    del X_test
    del y_test
    gc.collect()

    # persist the model
    util.safe_save(path_model, 'sgd_lr.pkl', clf.best_estimator_)

    # stop the timer and print the summary
    util.print_stop(start)
def fit(self, folds=3, thetas=(-2, -1, 0, 0.25, 0.5, 0.75, 1.25, 1.5, 1.75, 2)):
    """Select the best theta per series by walk-forward CV, then refit.

    Theta models based on Kevin Sheppard's code. For every column of
    ``self.data`` (rows up to ``self.train_ix[series] - 1``), each candidate
    theta is estimated on every ``TimeSeriesSplit`` training fold, combined
    with a simple-exponential-smoothing forecast plus a trend term, and
    scored with ``mse`` on the test fold. The theta with the lowest mean
    error is stored in ``self.best_theta[series]`` and the model re-estimated
    on the full training range is stored in ``self.fitted[series]``.

    Parameters
    ----------
    folds : int
        Number of time-series CV folds.
    thetas : tuple of float
        Theta values to evaluate.

    Returns
    ----------
    None (sets ``self.fit_success`` to True when done).
    """
    splitter = TimeSeriesSplit(n_splits=folds)
    ses_fit_args = {"disp": False, "iprint": -1, "low_memory": True}

    for series in self.data.columns:
        history = self.data.loc[:self.train_ix[series] - 1, series]
        fold_errors = {theta: np.empty((folds, 1)) for theta in thetas}
        # One frame shared by all folds, as in the original; each theta
        # column is written and read back immediately.
        shared_frame = pd.DataFrame(None, index=["a0", "b0"], dtype=np.double)
        fold_params = {fold: shared_frame for fold in range(folds)}

        for fold, (tr_ix, te_ix) in enumerate(splitter.split(history)):
            x_tr, x_te = history.iloc[tr_ix], history.iloc[te_ix]
            n_train = x_tr.shape[0]
            horizon = x_te.shape[0]

            for theta in thetas:
                # Estimate this theta model and pull out its slope.
                fold_params[fold][theta] = self.estimate(x_tr, theta)
                b0 = fold_params[fold][theta]["b0"]

                # Exponential smoothing component.
                ses = ExponentialSmoothing(x_tr).fit(**ses_fit_args)
                alpha = ses.params.smoothing_level
                ses_forecast = ses.forecast(horizon)

                # Trend component added on top of the SES forecast.
                trend = (np.arange(horizon) + 1 / alpha
                         - ((1 - alpha) ** n_train) / alpha)
                trend *= 0.5 * b0

                forecast = np.array(ses_forecast + trend)
                fold_errors[theta][fold] = mse(x_te, forecast)

        # Average over folds and keep the theta with the smallest error.
        mean_errors = {theta: np.mean(errs)
                       for theta, errs in fold_errors.items()}
        self.best_theta[series] = min(mean_errors, key=mean_errors.get)
        self.fitted[series] = self.estimate(history, self.best_theta[series])

    self.fit_success = True
def q02_data_splitter(path='data/elecdemand.csv'):
    """Return the two ``TimeSeriesSplit`` folds of the dataset at ``path``.

    Args:
        path: CSV location handed to ``q01_load_data``. Defaults to the
            electricity-demand file the function previously hard-coded.

    Returns:
        list: two ``(train_index, valid_index)`` tuples of index arrays.
    """
    # The argument used to be overwritten by a hard-coded path, so callers
    # could not choose the file; the old path is now only the default.
    _, df = q01_load_data(path)
    tscv = TimeSeriesSplit(n_splits=2)
    return list(tscv.split(df))
def do_stuff(X, y):
    """Return the mean SMAPE of the module-level ``estimator`` over 3 folds.

    For each ``TimeSeriesSplit`` fold the estimator is refitted on the
    training part and ``estimator.SMAPE`` is evaluated on the test part; each
    fold's value is printed as it is computed.

    Args:
        X: feature array indexable by integer index arrays.
        y: target array indexable by integer index arrays.
    """
    tscv = TimeSeriesSplit(n_splits=3)
    score = []
    for train_index, test_index in tscv.split(X):
        estimator.fit(X[train_index], y[train_index])
        score.append(estimator.SMAPE(X[test_index], y[test_index]))
        # Function-call form: the Python 2 `print score[-1]` statement is a
        # SyntaxError on Python 3, while this prints the same on both.
        print(score[-1])
    return np.mean(score)
def test_cv():
    """Check that date-based TimeSeriesSplit folds cover the AAPL frame.

    Loads the pickled frame, indexes it by date, and verifies that selecting
    by the unique dates reproduces the frame's shape and that the final
    fold's train and test rows together account for every row.
    """
    pickle_path = os.path.join(root, '..', 'data', 'ta', 'base1', 'AAPL.pkl')
    df = pd.read_pickle(pickle_path)
    assert isinstance(df, pd.DataFrame)

    npDates = df["date"].unique()
    df.set_index(["date"], drop=True, inplace=True)
    assert df.shape == df.loc[npDates.tolist()].shape

    splitter = TimeSeriesSplit(n_splits=5)
    for train, test in splitter.split(npDates):
        train_size = len(df.loc[npDates[train]])
        test_size = len(df.loc[npDates[test]])

    # Only the last fold spans the whole date range, so the coverage check
    # uses the sizes left over from the final iteration.
    assert len(df) == train_size + test_size
def test_cv(self):
    """Smoke-test ``lgb.cv`` with several fold and metric configurations.

    Covers: unshuffled folds with the metric overridden, shuffled folds with
    a learning-rate callback, user-supplied ``TimeSeriesSplit`` folds, and a
    lambdarank objective on the bundled ranking example. Passing means no
    call raises.
    """
    X, y = load_boston(True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1,
                                              random_state=42)
    params = {'verbose': -1}
    params_with_metric = {'metric': 'l2', 'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train)

    # Arguments shared by every lgb.cv call below.
    shared = dict(num_boost_round=10, stratified=False, verbose_eval=False)

    # shuffle = False, override metric in params
    lgb.cv(params_with_metric, lgb_train, nfold=3, shuffle=False,
           metrics='l1', **shared)

    # shuffle = True, callbacks
    decay = lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
    lgb.cv(params, lgb_train, nfold=3, shuffle=True, metrics='l1',
           callbacks=[decay], **shared)

    # self defined folds
    folds = TimeSeriesSplit(3).split(X_train)
    lgb.cv(params_with_metric, lgb_train, folds=folds, **shared)

    # lambdarank
    here = os.path.dirname(os.path.realpath(__file__))
    X_train, y_train = load_svmlight_file(
        os.path.join(here, '../../examples/lambdarank/rank.train'))
    q_train = np.loadtxt(
        os.path.join(here, '../../examples/lambdarank/rank.train.query'))
    params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
    lgb.cv(params_lambdarank, lgb_train, nfold=3, metrics='l2', **shared)
def test_time_series_cv():
    """Check fold contents, ordering and fold count of ``TimeSeriesSplit``."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

    # Should fail if there are more folds than samples
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next, TimeSeriesSplit(n_splits=7).split(X))

    tscv = TimeSeriesSplit(2)

    def check_folds(fold_iter, expected):
        # Pull one fold per expected (train, test) pair and compare both.
        for want_train, want_test in expected:
            train, test = next(fold_iter)
            assert_array_equal(train, want_train)
            assert_array_equal(test, want_test)

    # Manually check that Time Series CV preserves the data
    # ordering on toy datasets
    check_folds(tscv.split(X[:-1]),
                [([0, 1], [2, 3]),
                 ([0, 1, 2, 3], [4, 5])])
    check_folds(TimeSeriesSplit(2).split(X),
                [([0, 1, 2], [3, 4]),
                 ([0, 1, 2, 3, 4], [5, 6])])

    # Check get_n_splits returns the correct number of splits
    n_splits_actual = len(list(TimeSeriesSplit(2).split(X)))
    assert_equal(n_splits_actual, tscv.get_n_splits())
    assert_equal(n_splits_actual, 2)
def arima_gridsearch_cv(series, cv_splits=2,verbose=True,show_plots=True):
    """Run ``auto_arima`` on every walk-forward fold of ``series``.

    Args:
        series: pandas Series to model. Presumably integer-indexed, since
            the in-sample prediction starts at ``index.min() + 1`` - confirm
            against callers.
        cv_splits: number of ``TimeSeriesSplit`` folds.
        verbose: print fold indices and the best model's summary.
        show_plots: show residual, KDE, forecast and error plots per fold.

    Returns:
        dict with keys ``'cv_split_index'`` (list of train/test index dicts),
        ``'all_models'`` and ``'best_models'`` (one entry per fold, as
        returned by ``auto_arima``).
    """
    # prepare train-test split object
    tscv = TimeSeriesSplit(n_splits=cv_splits)

    splits = []
    best_models = []
    all_models = []

    # loop through each CV split
    for i, (train_index, test_index) in enumerate(tscv.split(series), start=1):
        print("*"*20)
        print("Iteration {} of {}".format(i, cv_splits))

        if verbose:
            print("TRAIN:", train_index, "TEST:", test_index)
        splits.append({'train': train_index, 'test': test_index})

        # TimeSeriesSplit yields positions, so select with .iloc; the `.ix`
        # indexer used previously was removed in pandas 1.0.
        train_series = series.iloc[train_index]
        test_series = series.iloc[test_index]
        print("Train shape:{}, Test shape:{}".format(train_series.shape,
                                                     test_series.shape))

        # perform auto arima
        _best_model, _all_models = auto_arima(series=train_series)
        best_models.append(_best_model)
        all_models.append(_all_models)

        # display summary for best fitting model
        if verbose:
            print(_best_model['model_obj'].summary())
        results = _best_model['model_obj']

        if show_plots:
            # show residual plots
            residuals = pd.DataFrame(results.resid)
            residuals.plot()
            plt.title('Residual Plot')
            plt.show()
            residuals.plot(kind='kde')
            plt.title('KDE Plot')
            plt.show()
            print(residuals.describe())

            # show forecast plot
            fig, ax = plt.subplots(figsize=(18, 4))
            fig.autofmt_xdate()
            ax = train_series.plot(ax=ax)
            test_series.plot(ax=ax)
            fig = results.plot_predict(test_series.index.min(),
                                       test_series.index.max(),
                                       dynamic=True, ax=ax,
                                       plot_insample=False)
            plt.title('Forecast Plot ')
            plt.legend()
            plt.show()

            # show error plot: in-sample fit starts at the second
            # observation, so drop the first training point to align.
            insample_fit = list(results.predict(train_series.index.min()+1,
                                                train_series.index.max(),
                                                typ='levels'))
            plt.plot((np.exp(train_series.iloc[1:].tolist()) -
                      np.exp(insample_fit)))
            plt.title('Error Plot')
            plt.show()

    return {'cv_split_index': splits,
            'all_models': all_models,
            'best_models': best_models}
def _read_price_and_demand(csv_path):
    """Read one PRICE_AND_DEMAND CSV and keep only its columns 1 and 2."""
    return pd.read_csv(csv_path).iloc[:, 1:3]


def _build_shuffled_windows(frame, sequence_length):
    """Normalise ``frame`` and cut it into shuffled rows of ``sequence_length``.

    The values are divided by their overall norm, trailing rows that do not
    fill a complete window are dropped, the rest is reshaped to
    ``(-1, sequence_length)`` and the rows are shuffled in place with
    ``np.random.shuffle``.
    """
    values = np.array(frame)
    values = values / np.linalg.norm(values)
    # Slice by an explicit end position: ``values[:-remainder]`` would
    # return an EMPTY array whenever the remainder is 0.
    usable = len(values) - len(values) % sequence_length
    windows = values[:usable].reshape(-1, sequence_length)
    np.random.shuffle(windows)
    return windows


def timeSeriesSplit(cso = False):
    """Train and evaluate one network per state and per TimeSeriesSplit fold.

    For each of the five states the monthly 2015-2017 training CSVs under
    ``./datasets/train/<state>/`` are concatenated, normalised and cut into
    windows of 84 values: the first 36 are the network input and the
    remaining 48 are the target. Each state's windows are split with
    ``TimeSeriesSplit(n_splits=5)``; a fresh ``nt.Network`` is trained with
    ``net.SGD`` on every fold and the monitored metrics are written as JSON
    to ``results_<state>_TS_<fold>GD`` (or ``...CSO``).

    Parameters
    ----------
    cso : bool, default False
        When true, the network's weights and biases are set from the result
        of ``net.cso`` before training. That result is cached per state in
        ``kernelBiasTimeSeries<state>.npy`` and reused if the file exists.

    Returns
    -------
    None
        Results are only written to disk and printed.

    Notes
    -----
    NOTE(review): the windows are shuffled before ``TimeSeriesSplit`` is
    applied, so the folds are not in chronological order. This is kept
    as-is to preserve existing results -- confirm it is intended.
    """
    states = ('NSW', 'QLD', 'SA', 'TAS', 'VIC')
    years = ('2015', '2016', '2017')

    # window layout: x_size inputs followed by y_size targets
    sequence_length = 84
    x_size = 36
    hidden = 10
    y_size = 48

    # Load and concatenate the monthly training files of every state.
    df = {}
    for st in states:
        monthly_frames = [
            _read_price_and_demand('./datasets/train/' + st
                                   + '/PRICE_AND_DEMAND_' + ye
                                   + '{:02d}'.format(mn) + '_' + st + '1.csv')
            for ye in years
            for mn in range(1, 13)
        ]
        # pd.concat replaces the repeated DataFrame.append calls
        # (DataFrame.append was removed in pandas 2.0).
        df[st] = pd.concat(monthly_frames).set_index('SETTLEMENTDATE')

    # The January 2018 test files are still read, as before, so a missing
    # file fails early; the frames are not used further in this function.
    df_test = {}
    for st in states:
        test_frame = _read_price_and_demand('./datasets/test/' + st
                                            + '/PRICE_AND_DEMAND_201801_'
                                            + st + '1.csv')
        df_test[st] = test_frame.set_index('SETTLEMENTDATE')

    # Build inputs and targets per state. States are processed in a fixed
    # order so the sequence of np.random calls stays the same.
    X = {}
    y = {}
    for st in states:
        windows = _build_shuffled_windows(df[st], sequence_length)
        X[st] = windows[:, :x_size]
        y[st] = windows[:, x_size:]

    tscv = TimeSeriesSplit(n_splits=5)

    for st in states:
        print("State: ", st)
        for i, (train_index, test_index) in enumerate(tscv.split(X[st]),
                                                      start=1):
            X_train, X_test = X[st][train_index], X[st][test_index]
            y_train, y_test = y[st][train_index], y[st][test_index]
            print("Train and validation from state ", st, " split ", i)

            net = nt.Network([x_size, hidden, y_size], nt.Activation.tanh,
                             nt.QuadraticCost)

            if cso:
                weights_file = "kernelBiasTimeSeries" + st + ".npy"
                if not path.exists(weights_file):
                    print("Weights and biases initialization for state ",st, " in progress...")
                    # run net.cso on one randomly chosen training sample
                    randInt = np.random.randint(X_train.shape[0])
                    net.cso(100,
                            X_train[randInt].reshape(x_size,1),
                            y_train[randInt].reshape(y_size,1),
                            net.multiObjectiveFunction,-0.6,0.6,net.dim ,100)
                    net.set_weight_bias(np.array(net.get_Gbest()))
                    np.save(weights_file, np.array(net.get_Gbest()))
                # always continue from the cached values
                net.set_weight_bias(np.load(weights_file))

            suffix = "CSO" if cso else "GD"
            results_file = "results_" + st + "_TS_" + str(i) + suffix

            num_epochs = 1500
            lmbda = 2

            (evaluation_cost, eval_mape, eval_rmse, eval_mae,
             training_cost, training_mape, training_rmse,
             training_mae) = net.SGD(
                X_train.transpose(), y_train.transpose(), num_epochs,
                10, 0.01,
                X_test.transpose(), y_test.transpose(),
                lmbda,
                monitor_evaluation_cost = True,
                monitor_evaluation_accuracy = True,
                monitor_training_cost = True,
                monitor_training_accuracy = True,
                output2D = True)

            # the context manager closes the file even if json.dump raises
            with open(results_file, "w") as f:
                json.dump([evaluation_cost, eval_mape, eval_rmse, eval_mae,
                           training_cost, training_mape, training_rmse,
                           training_mae], f)
train_val_sample.reset_index(drop=True,inplace=True) testing_sample.dropna(inplace=True) X_train=train_val_sample.drop(['y','date','device']+removal_list,1) y_train=train_val_sample['y'].astype(int) X_test=testing_sample.drop(['y','date','device']+removal_list,1) y_test=testing_sample['y'].astype(int) y_train.value_counts() #I create 3 training samples and 3 validation samples. tscv=TimeSeriesSplit(n_splits=3) print(tscv) for train,test in tscv.split(X_train): print('%s %s' %(train,test)) ################################################ #fit the model #Cross validation and hyper-parameter search print('running cross validation') ######################################## #XGBoost clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic') param_dist_xgb = {'n_estimators': stats.randint(150, 500),