class RandomForestRegression(QuantileRegression):
    """Quantile regression backed by a ``RandomForestQuantileRegressor``.

    Features are one-hot encoded with ``pd.get_dummies`` before fitting, and
    the model is fitted as part of construction.
    """

    def __init__(self, qt, x, y, params=None):
        """
        Parameters
        ----------
        qt: float
            the quantile we want to estimate
        x: DataFrame
            feature dataset
        y: DataFrame
            target dataset
        params: dictionary or None
            a dictionary containing hyper-parameter key-value pairs of the
            model; ``None`` (the default) means no overrides

        Internal Attributes
        -------------------
        self.random_forest : RandomForestQuantileRegressor Object or None
            the fitted model
        """
        # Build a fresh dict per call: a ``{}`` default would be one object
        # shared by every instance created without explicit params.
        if params is None:
            params = {}
        super(RandomForestRegression, self).__init__(qt, x, y, params)
        self.random_forest = None
        self._dummy_columns = None
        self.fit_model()

    def _training_columns(self):
        """Return the one-hot encoded column labels of the training features."""
        columns = getattr(self, "_dummy_columns", None)
        if columns is None:
            # Fallback for instances whose columns were never recorded.
            columns = pd.get_dummies(self.x).columns
            self._dummy_columns = columns
        return columns

    def fit_model(self):
        """
        fit the random forest quantile regression model using the train
        dataset

        Returns
        -------
        output: RandomForestQuantileRegressor object
            the random forest quantile regression model
        """
        x_train_dummy = pd.get_dummies(self.x)
        # Remember the encoded layout so predict() can align new data to it.
        self._dummy_columns = x_train_dummy.columns
        self.random_forest = RandomForestQuantileRegressor()
        # NOTE(review): self.params / self.x / self.y are presumably stored by
        # QuantileRegression.__init__ -- confirm against the base class.
        self.random_forest.set_params(**self.params)
        self.random_forest = self.random_forest.fit(x_train_dummy, self.y)
        return self.random_forest

    def feature_importance(self):
        """
        Sort the features from the most important to the least important

        Returns
        -------
        output: Series
            the importance of every encoded feature, indexed by column name
            and sorted in descending order
        """
        feature_importances = pd.Series(
            self.random_forest.feature_importances_,
            index=self._training_columns(),
        )
        return feature_importances.sort_values(ascending=False)

    def predict(self, data):
        """
        predict the qt th quantile for new data

        Parameters
        ----------
        data: DataFrame
            new data

        Returns
        -------
        output: numpy.ndarray
            predicted quantile for new data
        """
        # Encoding new data independently can drop or add dummy columns when
        # its categories differ from the training set, so force the training
        # layout: absent categories become all-zero columns, unseen ones are
        # discarded, and column order matches what the forest was fitted on.
        data_dummy = pd.get_dummies(data).reindex(
            columns=self._training_columns(), fill_value=0)
        # The regressor takes the quantile as a percentage.
        return self.random_forest.predict(data_dummy, quantile=self.qt * 100)
# Flatten the target to a 1-D vector of length n_samples.
y = y.reshape(y.shape[0])

# shuffle keeps its default (False), so the folds are deterministic and no
# random_state is passed: scikit-learn >= 0.24 raises ValueError when a seed
# is given without shuffle=True, and older versions ignored it.
kf = KFold(n_splits=6)
rfqr = RandomForestQuantileRegressor(
    random_state=0, min_samples_split=10, n_estimators=1000)

# Out-of-fold targets and interval bounds, accumulated across all folds.
y_true_all = []
lower = []
upper = []

for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = (
        X[train_index], X[test_index], y[train_index], y[test_index])

    # Consider a third of the features at every split.
    rfqr.set_params(max_features=X_train.shape[1] // 3)
    rfqr.fit(X_train, y_train)
    y_true_all = np.concatenate((y_true_all, y_test))
    # NOTE(review): 98.5 / 2.5 is not symmetric around the median; if a
    # central 95% interval is intended the upper quantile should be 97.5.
    upper = np.concatenate((upper, rfqr.predict(X_test, quantile=98.5)))
    lower = np.concatenate((lower, rfqr.predict(X_test, quantile=2.5)))

# Order every sample by the width of its prediction interval.
interval = upper - lower
sort_ind = np.argsort(interval)
y_true_all = y_true_all[sort_ind]
upper = upper[sort_ind]
lower = lower[sort_ind]
mean = (upper + lower) / 2

# Center such that the mean of the prediction interval is at 0.0
# (all three series must be shifted, otherwise the interval is not centered).
y_true_all -= mean
upper -= mean
lower -= mean
class ComponentForecast:
    """Random-forest quantile forecaster for one column of a time series.

    Wraps a ``RandomForestQuantileRegressor`` whose inputs are built by the
    ``parse_data_for_training`` / ``parse_data_for_forecast`` helpers.
    Forecast timestamps are laid out on a fixed 15-minute grid.
    """

    def __init__(self, dependent_var_str: str, len_of_lag=48, len_of_forecast=48,
                 min_samples_split=2, len_of_test=48, n_estimators=1000, n_jobs=4):
        """
        initializing class
        :param dependent_var_str: sets variable to be fit
        :param len_of_lag: number of 15-minute steps of history preceding the
            last observation that the forecast index reaches back over; also
            forwarded to the parsing helpers
        :param len_of_forecast: number of 15-minute steps to forecast
        :param min_samples_split: minimum number of samples needed to generate a new branch
        :param len_of_test: number of trailing rows held out by ``test``; also
            forwarded to ``parse_data_for_training``
        :param n_estimators: number of estimators used
        :param n_jobs: number of parallel jobs handed to the regressor
        """
        self.model = RandomForestQuantileRegressor(
            min_samples_split=min_samples_split,
            n_estimators=n_estimators,
            bootstrap=True,
            # min_weight_fraction_leaf=0.01,
            max_leaf_nodes=1000,
            n_jobs=n_jobs)
        self.dependent_var = dependent_var_str
        self.length_of_lag = len_of_lag
        self.length_of_test = len_of_test
        self.length_of_forecast = len_of_forecast

    def train(self, df: pandas.DataFrame):
        """
        fit the model on the features/targets parsed from ``df``
        :param df: data containing the dependent variable column
        """
        x, y = parse_data_for_training(df, self.dependent_var,
                                       length_of_lag=self.length_of_lag,
                                       length_of_test=self.length_of_test)
        # Allow every split to consider all feature columns.
        self.model.set_params(max_features=x.shape[1])
        self.model.fit(x, y)

    def test(self, df: pandas.DataFrame):
        """
        forecast the trailing ``len_of_test`` rows and report the relative RMS error
        :param df: data containing the dependent variable column
        :return: root-mean-square of the relative forecast error (NaNs ignored)
        """
        holdout_start = df.index[-self.length_of_test]
        fcst = self.predict(df[:holdout_start])
        actual = df.loc[holdout_start:, self.dependent_var]
        # Series arithmetic aligns on the index; timestamps present on only
        # one side become NaN and are skipped by nanmean below.
        diff = (fcst - actual) / actual
        rms_err = numpy.sqrt(numpy.nanmean(diff**2))
        print(' RMS error: {}'.format(rms_err))
        return rms_err

    def predict(self, df: pandas.DataFrame, quantile=None):
        """
        forecast the dependent variable beyond the last timestamp of ``df``
        :param df: history to forecast from; its index must support
            subtracting a ``datetime.timedelta``
        :param quantile: quantile forwarded to the regressor's ``predict``
            (``None`` is passed through unchanged)
        :return: Series of rescaled forecasts starting 15 minutes after the
            last timestamp of ``df``
        """
        x = parse_data_for_forecast(df, self.dependent_var,
                                    length_of_lag=self.length_of_lag,
                                    length_of_forecast=self.length_of_forecast)
        values = self.model.predict(x, quantile=quantile)

        last_observed = df.index[-1]
        # The index starts ``length_of_lag`` steps before the last observation
        # so the fitted values overlap the known history. '15min' is the
        # non-deprecated spelling of the former '15T' alias.
        index = pandas.date_range(
            start=last_observed - datetime.timedelta(minutes=15 * self.length_of_lag),
            periods=self.length_of_forecast + self.length_of_lag,
            freq='15min')
        # NOTE(review): the first predicted value is dropped; presumably the
        # helper yields one more row than the index holds -- confirm.
        fcast = pandas.Series(values[1:], index=index)

        # Rescale so the fitted values over the overlapping history sum to the
        # observed total. The divisor is the overlap sum, so it must be guarded
        # too, not only the sum of the whole series.
        overlap_sum = numpy.nansum(fcast.loc[:last_observed])
        if numpy.nansum(fcast) == 0 or overlap_sum == 0:
            scale = 1
        else:
            scale = numpy.nansum(df.loc[index[0]:, self.dependent_var]) / overlap_sum
        return scale * fcast.loc[last_observed + datetime.timedelta(minutes=15):]