Example #1
    def get_cv(self):
        """
        employ CV strategy
        """

        # return cv.split
        if self.cv_method == "KFold":
            cv = model_selection.KFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
            return cv.split(self.train_df)
        
        elif self.cv_method == "StratifiedKFold":
            cv = model_selection.StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
            return cv.split(self.train_df, self.train_df[self.target])
        
        elif self.cv_method == "TimeSeriesSplit":
            cv = model_selection.TimeSeriesSplit(max_train_size=None, n_splits=self.n_splits)
            return cv.split(self.train_df)
        
        elif self.cv_method == "GroupKFold":
            cv = GroupKFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
            return cv.split(self.train_df, self.train_df[self.target], self.group)
        
        elif self.cv_method == "StratifiedKFold2":
            cv = model_selection.StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
            return cv.split(self.train_df, self.train_df[self.group])

        elif self.cv_method == "StratifiedGroupKFold":
            cv = StratifiedGroupKFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
            return cv.split(self.train_df, self.train_df[self.target], self.group)
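
A minimal, self-contained sketch of how such a split generator is typically consumed (illustrative only; the wrapper class itself is not shown above):

import numpy as np
import pandas as pd
from sklearn import model_selection

# toy frame standing in for self.train_df
train_df = pd.DataFrame({"feature": np.arange(10), "y": np.arange(10) % 2})
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
for fold, (trn_idx, val_idx) in enumerate(cv.split(train_df)):
    print(f"fold {fold}: {len(trn_idx)} train rows, {len(val_idx)} valid rows")
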
Example #2
    def test_split(self):
        X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
        y = np.array([1, 2, 3, 4, 5, 6])

        # build folds with the two TimeSeriesSplit implementations imported under different aliases
        fold1 = model_selection.TimeSeriesSplit(n_splits=3).split(X)
        fold2 = sklearn_model_selection.TimeSeriesSplit(n_splits=3).split(X)

        self.assertFoldEqual(fold1, fold2)
Example #3
def do_CV_and_validation(model, modelAbbr, param_grid, feature_df, y_df,
                         endIndexTrain):
    label = y_df.columns[0]
    # remove validation set
    x_cv, y_cv = feature_df[:endIndexTrain], y_df[:endIndexTrain]
    # perform expanding window cross-validation on the train set (=x_cv)
    tscv = model_selection.TimeSeriesSplit(n_splits=5)
    cv_fit = model_selection.GridSearchCV(
        model,
        param_grid,
        cv=tscv,
        scoring=['r2', 'neg_mean_absolute_error'],
        return_train_score=True,
        refit='r2')
    cv_fit.fit(x_cv, y_cv.values.ravel())
    # results: model parameter, train error, test error
    model_parameters = cv_fit.best_estimator_.get_params()
    cv_train_score = {
        'R2': cv_fit.cv_results_['mean_train_r2'][cv_fit.best_index_],
        'MAE': -cv_fit.cv_results_['mean_train_neg_mean_absolute_error'][cv_fit.best_index_]
    }
    cv_test_score = {
        'R2': cv_fit.best_score_,
        'MAE': -cv_fit.cv_results_['mean_test_neg_mean_absolute_error'][cv_fit.best_index_]
    }

    # get validation score
    preds = cv_fit.best_estimator_.predict(feature_df)
    preds_df = pd.DataFrame(preds, index=y_df.index, columns=[modelAbbr])

    validation_perf = obtain_performance(
        y_df[label][endIndexTrain:].values,
        preds_df[modelAbbr][endIndexTrain:].values)
    validation_perf = {
        'R2': validation_perf['R2'],
        'MAE': validation_perf['MAE']
    }

    return {
        'optParams': model_parameters,
        'cv_train': cv_train_score,
        'cv_test': cv_test_score,
        'val': validation_perf,
        'preds': preds_df
    }
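
For reference, a self-contained sketch of the multi-metric GridSearchCV-over-TimeSeriesSplit pattern used above (the Ridge model and toy data are illustrative, not from the original):

import numpy as np
from sklearn import linear_model, model_selection

X = np.arange(40, dtype=float).reshape(-1, 1)
y = 0.5 * X.ravel() + np.random.RandomState(0).normal(size=40)

tscv = model_selection.TimeSeriesSplit(n_splits=5)
search = model_selection.GridSearchCV(
    linear_model.Ridge(),
    param_grid={'alpha': [0.1, 1.0, 10.0]},
    cv=tscv,
    scoring=['r2', 'neg_mean_absolute_error'],
    return_train_score=True,
    refit='r2')
search.fit(X, y)
print(search.best_params_)
# sign flipped because scikit-learn maximizes neg_mean_absolute_error
print(-search.cv_results_['mean_test_neg_mean_absolute_error'][search.best_index_])
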
Example #4
    def TimeSeriesCrossValidate(self,
                                df,
                                featureColLabels,
                                targetColLabel,
                                model,
                                n_splits,
                                scorings='accuracy'):
        splittingStrategy = model_selection.TimeSeriesSplit(n_splits=n_splits)
        result = self._crossValidate(df=df,
                                     featureColLabels=featureColLabels,
                                     targetColLabel=targetColLabel,
                                     model=model,
                                     splittingStrategy=splittingStrategy,
                                     scorings=scorings)
        return result
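
_crossValidate is not shown here; assuming it wraps scikit-learn's cross-validation, an equivalent stand-alone call would look roughly like this (model and data are illustrative):

import numpy as np
from sklearn import linear_model, model_selection

X = np.arange(30, dtype=float).reshape(-1, 1)
y = np.arange(30) % 2  # alternating labels so every training window sees both classes
scores = model_selection.cross_val_score(
    linear_model.LogisticRegression(),
    X, y,
    cv=model_selection.TimeSeriesSplit(n_splits=4),
    scoring='accuracy')
print(scores)
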
Example #5
def split_example():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])

    # flip exactly one of the flags below to True to exercise a different splitter;
    # only the final ShuffleSplit branch runs as written
    if False:
        # The entry test_fold[i] gives the index of the test set that sample i belongs to.
        # Setting test_fold[i] = -1 excludes sample i from every test set (it appears in every training set).
        test_fold = [0, 1, -1, 1]
        split = model_selection.PredefinedSplit(test_fold)
        print('#splits =', split.get_n_splits(X, y))
    elif False:
        # The stratified folds are made by preserving the percentage of samples for each class.
        split = model_selection.StratifiedShuffleSplit(n_splits=3,
                                                       test_size=0.25,
                                                       random_state=None)
        print('#splits =', split.get_n_splits(X, y))
    elif False:
        # The same group will not appear in two different folds.
        # The number of distinct groups has to be at least equal to the number of folds.
        split = model_selection.GroupShuffleSplit(n_splits=3,
                                                  test_size=0.25,
                                                  random_state=None)
        #print('#splits =', split.get_n_splits(X, y, groups))
        print('#splits =', split.get_n_splits(groups=groups))
    elif False:
        split = model_selection.TimeSeriesSplit(n_splits=3,
                                                max_train_size=None)
        print('#splits =', split.get_n_splits())
    else:
        split = model_selection.ShuffleSplit(n_splits=3,
                                             test_size=0.25,
                                             random_state=None)
        print('#splits =', split.get_n_splits(X))
    print('Split:', split)

    #for train_indices, test_indices in split.split():
    #for train_indices, test_indices in split.split(X, y):
    #for train_indices, test_indices in split.split(X, y, groups):
    for train_indices, test_indices in split.split(X):
        #print('TRAIN:', train_indices.shape, 'TEST:', test_indices.shape)
        print('TRAIN:', train_indices, 'TEST:', test_indices)

        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
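
In contrast to the shuffled splitters above, TimeSeriesSplit keeps temporal order: training windows expand and each test block follows its training data. A minimal illustration (toy data, not from the original):

import numpy as np
from sklearn import model_selection

X = np.arange(6).reshape(-1, 1)
for train_indices, test_indices in model_selection.TimeSeriesSplit(n_splits=3).split(X):
    print('TRAIN:', train_indices, 'TEST:', test_indices)
# TRAIN: [0 1 2] TEST: [3]
# TRAIN: [0 1 2 3] TEST: [4]
# TRAIN: [0 1 2 3 4] TEST: [5]
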
Example #6
def buildRandomForestRegression(train_data_path, test_data_path):
    print("\nBuilding Random Forest Regression Model ...")

    print("Preparing training dataset ...")
    df = pd.read_csv(train_data_path)
    df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])
    df.set_index('TIMESTAMP', inplace=True)
    df = df.resample('1M').mean()
    x_train, y_train = transformDataset(df)

    print("Preparing testing dataset ...")
    dt = pd.read_csv(test_data_path)
    dt['TIMESTAMP'] = pd.to_datetime(dt['TIMESTAMP'])
    dt.set_index('TIMESTAMP', inplace=True)
    x_test, y_test = transformDataset(dt)

    print("Searching for best regressor ...")
    model = ensemble.RandomForestRegressor()
    param_search = {
        'n_estimators': [100],
        'max_features': [1.0],  # 'auto' was removed in newer scikit-learn; 1.0 keeps the old regressor default
        'max_depth': [10]
    }
    tscv = model_selection.TimeSeriesSplit(n_splits=2)
    rmse_score = metrics.make_scorer(rmse_calc, greater_is_better=False)
    gsearch = model_selection.GridSearchCV(estimator=model,
                                           cv=tscv,
                                           param_grid=param_search,
                                           scoring=rmse_score)
    gsearch.fit(x_train, y_train)
    best_score = gsearch.best_score_
    best_model = gsearch.best_estimator_
    y_true = y_test.values
    print("Predicting with best regressor ...")
    y_pred = best_model.predict(x_test)
    mse = metrics.mean_squared_error(y_true, y_pred)
    rmse = sqrt(mse)
    print("Random Forest Regression RMSE: {:.2f}".format(rmse))
    return rmse
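
rmse_calc is defined elsewhere in the original; assuming it computes root-mean-squared error, a self-contained sketch of the scorer setup would be:

import numpy as np
from sklearn import ensemble, metrics, model_selection

def rmse_calc(y_true, y_pred):
    # root-mean-squared error (assumed behaviour of the original helper)
    return np.sqrt(metrics.mean_squared_error(y_true, y_pred))

rmse_score = metrics.make_scorer(rmse_calc, greater_is_better=False)

X = np.arange(40, dtype=float).reshape(-1, 1)
y = 0.3 * X.ravel() + np.random.RandomState(1).normal(size=40)
search = model_selection.GridSearchCV(
    estimator=ensemble.RandomForestRegressor(random_state=0),
    param_grid={'n_estimators': [50], 'max_depth': [5]},
    cv=model_selection.TimeSeriesSplit(n_splits=2),
    scoring=rmse_score)
search.fit(X, y)
print(-search.best_score_)  # flip the sign back: greater_is_better=False negates the score
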
Example #7
def predictRandomForestRegression(data_path, periods):
    print("\nTraining Random Forest Regression model with full dataset ...")
    df = pd.read_csv(data_path)
    df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])
    df.set_index('TIMESTAMP', inplace=True)
    dfmean = df.resample('1M').mean()
    dfmin = df.resample('1M').min()
    dfmax = df.resample('1M').max()
    x_train, y_train = transformDataset(dfmean)
    xmin_train, ymin_train = transformDataset(dfmin)
    xmax_train, ymax_train = transformDataset(dfmax)

    model = ensemble.RandomForestRegressor()
    model_min = ensemble.RandomForestRegressor()
    model_max = ensemble.RandomForestRegressor()
    param_search = {
        'n_estimators': [100],
        'max_features': [1.0],  # 'auto' was removed in newer scikit-learn; 1.0 keeps the old regressor default
        'max_depth': [10]
    }
    tscv = model_selection.TimeSeriesSplit(n_splits=2)
    rmse_score = metrics.make_scorer(rmse_calc, greater_is_better=False)
    gsearch = model_selection.GridSearchCV(estimator=model,
                                           cv=tscv,
                                           param_grid=param_search,
                                           scoring=rmse_score)
    gsearch_min = model_selection.GridSearchCV(estimator=model_min,
                                               cv=tscv,
                                               param_grid=param_search,
                                               scoring=rmse_score)
    gsearch_max = model_selection.GridSearchCV(estimator=model_max,
                                               cv=tscv,
                                               param_grid=param_search,
                                               scoring=rmse_score)
    gsearch.fit(x_train, y_train)
    gsearch_min.fit(xmin_train, ymin_train)
    gsearch_max.fit(xmax_train, ymax_train)
    best_score = gsearch.best_score_
    best_model = gsearch.best_estimator_
    best_model_min = gsearch_min.best_estimator_
    best_model_max = gsearch_max.best_estimator_

    print("\nPredicting with Random Forest regressor ...")
    prediction = pd.DataFrame(columns=[
        'TIMESTAMP', 'RENEWABLES_PCT_MEAN', 'RENEWABLES_PCT_LOWER', 'RENEWABLES_PCT_UPPER'
    ])
    l = len(x_train)
    x_pred = x_train.iloc[[l - 1]]
    y_pred = best_model.predict(x_pred)
    xmin_pred = xmin_train.iloc[[l - 1]]
    ymin_pred = best_model_min.predict(xmin_pred)
    xmax_pred = xmax_train.iloc[[l - 1]]
    ymax_pred = best_model_max.predict(xmax_pred)
    # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat instead
    prediction = pd.concat([
        prediction,
        pd.DataFrame([{
            'TIMESTAMP': x_pred.index[0],
            'RENEWABLES_PCT_MEAN': y_pred[0],
            'RENEWABLES_PCT_LOWER': ymin_pred[0],
            'RENEWABLES_PCT_UPPER': ymax_pred[0]
        }])
    ], ignore_index=True)
    for i in range(1, periods):
        ti = prediction.iloc[i - 1]['TIMESTAMP'] + pd.offsets.DateOffset(months=1)
        xi_pred = pd.DataFrame({
            'YESTERDAY': y_pred,
            'YESTERDAY_DIFF': y_pred - x_pred['YESTERDAY'],
            'YESTERDAY-1': x_pred['YESTERDAY'],
            'YESTERDAY-1_DIFF': x_pred['YESTERDAY_DIFF']
        })
        yi_pred = best_model.predict(xi_pred)
        xmini_pred = pd.DataFrame({
            'YESTERDAY': ymin_pred,
            'YESTERDAY_DIFF': ymin_pred - xmin_pred['YESTERDAY'],
            'YESTERDAY-1': xmin_pred['YESTERDAY'],
            'YESTERDAY-1_DIFF': xmin_pred['YESTERDAY_DIFF']
        })
        # lower band comes from the model fitted on the monthly minima
        ymini_pred = best_model_min.predict(xmini_pred)
        xmaxi_pred = pd.DataFrame({
            'YESTERDAY': ymax_pred,
            'YESTERDAY_DIFF': ymax_pred - xmax_pred['YESTERDAY'],
            'YESTERDAY-1': xmax_pred['YESTERDAY'],
            'YESTERDAY-1_DIFF': xmax_pred['YESTERDAY_DIFF']
        })
        # upper band comes from the model fitted on the monthly maxima
        ymaxi_pred = best_model_max.predict(xmaxi_pred)
        prediction = pd.concat([
            prediction,
            pd.DataFrame([{
                'TIMESTAMP': ti,
                'RENEWABLES_PCT_MEAN': yi_pred[0],
                'RENEWABLES_PCT_LOWER': ymini_pred[0],
                'RENEWABLES_PCT_UPPER': ymaxi_pred[0]
            }])
        ], ignore_index=True)
        x_pred = xi_pred
        y_pred = yi_pred
        xmin_pred = xmini_pred
        ymin_pred = ymini_pred
        xmax_pred = xmaxi_pred
        ymax_pred = ymaxi_pred

    prediction.set_index('TIMESTAMP', inplace=True)
    prediction = prediction.resample('1Y').mean()
    p = prediction.plot()
    p.set_title('CA Predicted Renewables % by Random Forest Regression')
    p.set_ylabel('Renewables %')
    wd = os.path.dirname(data_path) + '/../images'
    os.makedirs(wd, exist_ok=True)
    plt.savefig(wd + '/prediction-randomforest.png')

    return prediction
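
Note on the design: the loop forecasts recursively, feeding each month's predicted value back in as the next month's 'YESTERDAY' feature (and its change as 'YESTERDAY_DIFF'), so errors can compound over long horizons. The models trained on the monthly minima and maxima give a rough lower/upper band around the mean forecast, not a statistical prediction interval.
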