Example #1
    def __init__(self, params: Optional[dict] = None):
        """
        Implements Quantile Random Forests using skgarden.
        """
        from skgarden import RandomForestQuantileRegressor

        self.model = RandomForestQuantileRegressor(**(params or {}))  # tolerate the params=None default
Example #2
    def __init__(self, p, alpha, random_state=2020, verbose=True):
        # Parameters of the random forest
        self.alpha = 100 * alpha

        self.model = RandomForestQuantileRegressor(random_state=random_state,
                                                   min_samples_split=3,
                                                   n_estimators=100)
Example #3
    def __init__(self,
                 quantiles,
                 min_samples_leaf=5,
                 n_estimators=100,
                 n_jobs=1,
                 random_state=0,
                 verbose=False):
        """ Initialization
        Parameters
        ----------
        quantiles : numpy array of quantile levels (q), each in the range (0,1)
        random_state : integer, seed used in quantile random forests
        """

        self.device = 'cpu'
        # Store input (sort the quantiles)
        self.quantiles = torch.from_numpy(np.sort(quantiles)).float().to(
            self.device)
        # Define RF model
        self.model = RandomForestQuantileRegressor(
            random_state=random_state,
            min_samples_leaf=min_samples_leaf,
            n_estimators=n_estimators,
            n_jobs=n_jobs,
            verbose=verbose)
Example #4
 def __init__(self, quantiles=0.5, min_samples_split=10, n_estimators=100):
     self.quantiles = quantiles
     self.model = RandomForestQuantileRegressor(
         random_state=0,
         min_samples_split=min_samples_split,
         n_estimators=n_estimators)
     self.label = 'Quantile Forest'
     self.filename = 'rf'
Example #5
 def __init__(self, x, y, args):
     super(QuantileForest, self).__init__()
     self.alpha = args.alpha
     self.model_name = "QuantileForest"
     self.rfqr = RandomForestQuantileRegressor(n_estimators=args.n_learners)
     #min_samples_split=args.min_samples_split,
     #n_estimators=args.n_learners,
     #random_state=args.seed)
     # self.rfqr.set_params(max_features=x.shape[1] // args.max_features)
     self.rfqr.fit(x, y)
Example #6
class QRF:
    """ Fit a random forest (conditional quantile) to training data
    """
    def __init__(self,
                 quantiles,
                 min_samples_leaf=5,
                 n_estimators=100,
                 n_jobs=1,
                 random_state=0,
                 verbose=False):
        """ Initialization
        Parameters
        ----------
        quantiles : numpy array of quantile levels (q), each in the range (0,1)
        random_state : integer, seed used in quantile random forests
        """

        self.device = 'cpu'
        # Store input (sort the quantiles)
        self.quantiles = torch.from_numpy(np.sort(quantiles)).float().to(
            self.device)
        # Define RF model
        self.model = RandomForestQuantileRegressor(
            random_state=random_state,
            min_samples_leaf=min_samples_leaf,
            n_estimators=n_estimators,
            n_jobs=n_jobs,
            verbose=verbose)

    def fit(self, X, Y, return_loss=None):
        warnings.filterwarnings("ignore", category=FutureWarning)
        self.model.fit(X, Y)
        warnings.filterwarnings("default", category=FutureWarning)
        return 0

    def predict(self, X):
        """ Estimate the label given the features
        Parameters
        ----------
        x : numpy array of training features (nXp)
        Returns
        -------
        ret_val : numpy array of predicted labels (n)
        """
        quantiles = self.quantiles.cpu()
        ret_val = np.zeros((X.shape[0], len(quantiles)))
        print("Predicting RF quantiles:")
        for i in tqdm(range(len(quantiles))):
            # cast the torch scalar to a plain float before handing it to skgarden
            ret_val[:, i] = self.model.predict(X, quantile=100 * float(quantiles[i]))
        return ret_val

    def get_quantiles(self):
        return self.quantiles.cpu().numpy()
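
A minimal usage sketch of this class, assuming numpy, torch, tqdm and skgarden are importable in the surrounding module (as the code implies); the quantile levels and random data below are illustrative only.

import numpy as np

quantiles = np.array([0.05, 0.5, 0.95])
qrf = QRF(quantiles, n_estimators=100, random_state=0)
X = np.random.rand(300, 4)
Y = np.random.rand(300)
qrf.fit(X, Y)           # always returns 0; no loss is tracked for forests
preds = qrf.predict(X)  # shape (300, 3), one column per sorted quantile level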
Example #7
    def __init__(self,
                 model,
                 fit_params=None,
                 quantiles=[5, 95],
                 params=None):
        """ Initialization

        Parameters
        ----------
        model : None, unused parameter (for compatibility with nc class)
        fit_params : None, unused parameter (for compatibility with nc class)
        quantiles : numpy array, low and high quantile levels in range (0,100)
        params : dictionary of parameters
                params["random_state"] : integer, seed for splitting the data
                                         in cross-validation. Also used as the
                                         seed in quantile random forests (QRF)
                params["min_samples_leaf"] : integer, parameter of QRF
                params["n_estimators"] : integer, parameter of QRF
                params["max_features"] : integer, parameter of QRF
                params["CV"] : boolean, use cross-validation (True) or
                               not (False) to tune the two QRF quantile levels
                               to obtain the desired coverage
                params["test_ratio"] : float, ratio of held-out data, used
                                       in cross-validation
                params["coverage_factor"] : float, to avoid too conservative
                                            estimation of the prediction band,
                                            when tuning the two QRF quantile
                                            levels in cross-validation one may
                                            ask for prediction intervals with
                                            reduced average coverage, equal to
                                            coverage_factor*(q_high - q_low).
                params["range_vals"] : float, determines the lowest and highest
                                       quantile level parameters when tuning
                                       the quantile levels by cross-validation.
                                       The smallest value is equal to
                                       quantiles[0] - range_vals.
                                       Similarly, the largest is equal to
                                       quantiles[1] + range_vals.
                params["num_vals"] : integer, when tuning QRF's quantile
                                     parameters, sweep over a grid of length
                                     num_vals.

        """
        super(QuantileForestRegressorAdapter, self).__init__(model, fit_params)
        # Instantiate model
        self.quantiles = quantiles
        self.cv_quantiles = self.quantiles
        self.params = params
        self.rfqr = RandomForestQuantileRegressor(random_state=params["random_state"],
                                                  min_samples_leaf=params["min_samples_leaf"],
                                                  n_estimators=params["n_estimators"],
                                                  max_features=params["max_features"])
Example #8
    def fit_model(self):
        """
        fit the random forest quantile regression model using the train dataset

        Returns
        -------
        output: RandomForestQuantileRegressor object
            the random forest quantile regression model
        """
        x_train_dummy = pd.get_dummies(self.x)
        self.random_forest = RandomForestQuantileRegressor()
        self.random_forest.set_params(**self.params)
        self.random_forest = self.random_forest.fit(x_train_dummy, self.y)
        return self.random_forest
Example #9
class QRF:
    @validated()
    def __init__(self, params: Optional[dict] = None):
        """
        Implements Quantile Random Forests using skgarden.
        """
        from skgarden import RandomForestQuantileRegressor

        self.model = RandomForestQuantileRegressor(**(params or {}))  # tolerate the params=None default

    def fit(self, x_train, y_train):
        self.model.fit(np.array(x_train), np.array(y_train))

    def predict(self, x_test, quantile):
        return self.model.predict(x_test, quantile=100 * quantile)
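
A hedged end-to-end sketch of this wrapper; the params dict and toy data are illustrative, and skgarden must be installed.

import numpy as np

qrf = QRF(params={"n_estimators": 100, "min_samples_split": 10, "random_state": 0})
X_train, y_train = np.random.rand(200, 3), np.random.rand(200)
qrf.fit(X_train, y_train)
# predict() rescales from the (0, 1) range to skgarden's 0-100 quantile scale
y_q90 = qrf.predict(X_train, quantile=0.9)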
Example #10
def build_model(**kwargs):
    model = RandomForestQuantileRegressor(random_state=0,
                                          min_samples_split=10,
                                          n_estimators=1000,
                                          n_jobs=-1,
                                          warm_start=False)
    return model
Example #11
def train_qr_algo(model_obj, theta_mat, stats_mat, algo_name, learner_kwargs,
                  pytorch_kwargs, alpha, prediction_grid):
    # Train the regression quantiles algorithms
    if algo_name == 'xgb':
        model = GradientBoostingRegressor(loss='quantile',
                                          alpha=alpha,
                                          **learner_kwargs)
        model.fit(theta_mat.reshape(-1, model_obj.d), stats_mat.reshape(-1, ))
        pred_vec = model.predict(prediction_grid.reshape(-1, model_obj.d))
    elif algo_name == 'rf':
        model = RandomForestQuantileRegressor(**learner_kwargs)
        model.fit(theta_mat.reshape(-1, model_obj.d), stats_mat.reshape(-1, ))
        pred_vec = model.predict(prediction_grid.reshape(-1, model_obj.d),
                                 quantile=alpha * 100)
    elif algo_name == 'lgb':
        model = lgb.LGBMRegressor(objective='quantile',
                                  alpha=alpha,
                                  **learner_kwargs)
        model.fit(theta_mat.reshape(-1, model_obj.d), stats_mat.reshape(-1, ))
        pred_vec = model.predict(prediction_grid.reshape(-1, model_obj.d))
    elif algo_name == 'pytorch':
        model = q_model([alpha],
                        dropout=0.1,
                        in_shape=model_obj.d,
                        **pytorch_kwargs)
        loss_func = QuantileLoss(quantiles=[alpha])
        learner = Learner(model,
                          partial(torch.optim.Adam, weight_decay=1e-6),
                          loss_func,
                          device="cpu")
        learner.fit(theta_mat.reshape(-1, model_obj.d),
                    stats_mat.reshape(-1, ), **learner_kwargs)
        pred_vec = learner.predict(
            prediction_grid.reshape(-1, model_obj.d).astype(np.float32))
    elif algo_name == 'pytorch_3l':
        model = q_model_3l([alpha],
                           dropout=0.1,
                           in_shape=model_obj.d,
                           **pytorch_kwargs)
        loss_func = QuantileLoss(quantiles=[alpha])
        learner = Learner(model,
                          partial(torch.optim.Adam, weight_decay=1e-6),
                          loss_func,
                          device="cpu")
        learner.fit(theta_mat.reshape(-1, model_obj.d),
                    stats_mat.reshape(-1, ), **learner_kwargs)
        pred_vec = learner.predict(
            prediction_grid.reshape(-1, model_obj.d).astype(np.float32))
    elif algo_name == 'linear':
        pred_vec = QuantReg(theta_mat.reshape(-1, model_obj.d),
                            stats_mat.reshape(-1, )).fit(q=alpha).predict(
                                prediction_grid.reshape(-1, model_obj.d))
    else:
        raise ValueError('CDE Classifier not defined in the file.')

    return pred_vec
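
A hedged sketch of calling the 'rf' branch above; only the dimension attribute d of model_obj is read there, so a SimpleNamespace stands in for it, and all data is synthetic.

from types import SimpleNamespace
import numpy as np

model_obj = SimpleNamespace(d=2)  # only .d is used by the 'rf' branch
theta_mat = np.random.rand(500, 2)
stats_mat = np.random.rand(500)
grid = np.random.rand(50, 2)
pred_vec = train_qr_algo(model_obj, theta_mat, stats_mat, 'rf',
                         learner_kwargs={"n_estimators": 100},
                         pytorch_kwargs={}, alpha=0.9, prediction_grid=grid)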
Example #12
class QuantileForest:
    def __init__(self, quantiles=0.5, min_samples_split=10, n_estimators=100):
        self.quantiles = quantiles
        self.model = RandomForestQuantileRegressor(
            random_state=0,
            min_samples_split=min_samples_split,
            n_estimators=n_estimators)
        self.label = 'Quantile Forest'
        self.filename = 'rf'

    def fit(self, X, y):
        self.model.fit(X, y)

    def predict(self, X):
        if np.isscalar(self.quantiles):
            return self.model.predict(X, quantile=self.quantiles * 100)
        return np.array([
            self.model.predict(X, quantile=q * 100) for q in self.quantiles
        ]).T
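
A short sketch of the two calling conventions this predict method supports, a scalar quantile versus an array of levels; the data is illustrative.

import numpy as np

X, y = np.random.rand(100, 2), np.random.rand(100)
median_model = QuantileForest(quantiles=0.5)
median_model.fit(X, y)
med = median_model.predict(X)        # 1-D array of conditional medians

band_model = QuantileForest(quantiles=[0.05, 0.95])
band_model.fit(X, y)
band = band_model.predict(X)         # shape (100, 2): lower and upper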
Example #13
 def __init__(self, params, quantiles, verbose=False):
     self.regressor = RF(n_estimators=params['n_estimators'],
                         max_features=params['max_features'],
                         min_samples_leaf=params['min_samples_leaf'],
                         random_state=params['random_state'],
                         n_jobs=params['n_jobs'])
     self.quantiles = quantiles
     self.cv_quantiles = quantiles
     self.verbose = verbose
     self.cv = params["cv"]
Example #14
class ForestQuantileRegressor:
    def __init__(self, p, alpha, random_state=2020, verbose=True):
        # Parameters of the random forest
        self.alpha = 100 * alpha

        self.model = RandomForestQuantileRegressor(random_state=random_state,
                                                   min_samples_split=3,
                                                   n_estimators=100)

    def fit(self, X, y):
        # Reshape the data
        X = np.asarray(X)
        y = np.asarray(y)
        self.model.fit(X, y)

    def predict(self, X):
        lower = self.model.predict(X, quantile=self.alpha)
        y = np.concatenate(
            (lower[:, np.newaxis],
             self.model.predict(
                 X, quantile=100.0 - self.alpha)[:, np.newaxis]), 1)
        return y
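
A usage sketch: for a miscoverage level alpha, predict returns an n-by-2 array whose columns are the alpha and 1 - alpha conditional quantiles (here 5% and 95%); the data is illustrative.

import numpy as np

model = ForestQuantileRegressor(p=3, alpha=0.05, random_state=2020)
X, y = np.random.rand(150, 3), np.random.rand(150)
model.fit(X, y)
band = model.predict(X)  # band[:, 0] is the 5th percentile, band[:, 1] the 95th
coverage = np.mean((y >= band[:, 0]) & (y <= band[:, 1]))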
Example #15
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        X = dt.Frame(X)
        orig_cols = list(X.names)
        self.pre_get_model()
        from skgarden import RandomForestQuantileRegressor
        model = RandomForestQuantileRegressor(**self.params)
        X = self.basic_impute(X)
        X = X.to_numpy()

        model.fit(X, y)
        importances = np.array(model.feature_importances_)
        self.set_model_properties(
            model=model,
            features=orig_cols,
            importances=importances.tolist(),
            iterations=self.params["n_estimators"],
        )
Example #16
class QuantileForest:
    """
    Estimate conditional quantiles by Quantile Forest
    (fits one model for all quantiles)
    """
    def __init__(self, x, y, args):
        super(QuantileForest, self).__init__()
        self.alpha = args.alpha
        self.model_name = "QuantileForest"
        self.rfqr = RandomForestQuantileRegressor(n_estimators=args.n_learners)
        #min_samples_split=args.min_samples_split,
        #n_estimators=args.n_learners,
        #random_state=args.seed)
        # self.rfqr.set_params(max_features=x.shape[1] // args.max_features)
        self.rfqr.fit(x, y)

    def predict(self, x_te):
        preds_low = self.rfqr.predict(x_te, (self.alpha / 2) * 100)
        preds_high = self.rfqr.predict(x_te, (1 - self.alpha / 2) * 100)
        preds_mean = (preds_high + preds_low) / 2  # midpoint of the low/high band

        return torch.Tensor(preds_mean), torch.Tensor(preds_low), torch.Tensor(
            preds_high)
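
A hedged construction sketch; args is whatever configuration object the caller passes, so a SimpleNamespace carrying only the attributes the constructor reads is used here, with illustrative values.

from types import SimpleNamespace
import numpy as np

args = SimpleNamespace(alpha=0.1, n_learners=100)
x, y = np.random.rand(200, 3), np.random.rand(200)
qf = QuantileForest(x, y, args)  # the forest is fitted at construction time
mid, low, high = qf.predict(np.random.rand(50, 3))  # three torch.Tensors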
Example #17
 def __init__(self,
              dependent_var_str: str,
              len_of_lag=48,
              len_of_forecast=48,
              min_samples_split=2,
              len_of_test=48,
              n_estimators=1000,
              n_jobs=4):
     """
     initializing class
     :param dependent_var_str:  sets variable to be fit
     :param min_samples_split:  minimum number of samples needed to generate a new branch
     :param n_estimators:       number of estimators used
     """
     self.model = RandomForestQuantileRegressor(
         min_samples_split=min_samples_split,
         n_estimators=n_estimators,
         bootstrap=True,
         # min_weight_fraction_leaf=0.01, max_leaf_nodes=1000,
         n_jobs=n_jobs)
     self.dependent_var = dependent_var_str
     self.length_of_lag = len_of_lag
     self.length_of_test = len_of_test
     self.length_of_forecast = len_of_forecast
Example #18
    def __init__(self, switch, X_train, y_train, **RF_params):
        """
        The initialization includes instantiation and fitting of the random forest.

        :param switch: One of 'Classifier', 'Regressor' or 'RegressorQuantile', passed as a string.
        :param X_train: Features used for training, as a numpy array or pandas DataFrame.
        :param y_train: The target for regression or the labels for classification, also a numpy array or pandas DataFrame.
        :param RF_params: Any keyword options of the underlying sklearn/skgarden estimator, for instance n_estimators or max_depth.
        """
        if switch == 'Classifier':
            clf = RandomForestClassifier(**RF_params)
        elif switch == 'Regressor':
            clf = RandomForestRegressor(**RF_params)
        elif switch == 'RegressorQuantile':
            clf = RandomForestQuantileRegressor(**RF_params)
        else:
            print('specify Classifier, Regressor or RegressorQuantile (first argument)')
            return
        clf.fit(X_train, y_train)
        self.model = clf
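
A hedged usage sketch; the enclosing class is not shown in the snippet, so ForestWrapper below is a hypothetical stand-in for its name.

import numpy as np

X_train, y_train = np.random.rand(100, 4), np.random.rand(100)
wrapper = ForestWrapper('RegressorQuantile', X_train, y_train,  # hypothetical class name
                        n_estimators=100, random_state=0)
q90 = wrapper.model.predict(X_train, quantile=90)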
Example #19
    def run_random_forest(self):
        x_norm2, w = self.prep_train_data()
        x_train, x_test = train_test_split(x_norm2, test_size=self.__test_size)
        x_tr = x_train.values
        x_te = x_test.values
        x_tr, y_tr, x_te, y_te = split_time_series(x_tr, x_te,
                                                   self.__proportion)

        train_y = []
        for y in y_tr:
            train_y.append(y[-1])

        rfqr = RandomForestQuantileRegressor(random_state=0,
                                             min_samples_split=2,
                                             n_estimators=100,
                                             criterion='mae')
        rfqr.fit(x_tr, train_y)

        test_y = []
        for y in y_te:
            test_y.append(float(y[-1]))  # np.float was removed from NumPy

        y_mean_test = rfqr.predict(x_te)
        y_high_test = rfqr.predict(x_te, 85)
        y_low_test = rfqr.predict(x_te, 15)

        test_predictions = pd.DataFrame({
            'high': y_high_test * w[-1],
            'low': y_low_test * w[-1],
            'point': y_mean_test * w[-1],
            'actual': np.array(test_y) * w[-1]
        })
        test_predictions['id'] = np.arange(0, len(test_predictions))
        self.__logging.info(test_predictions)
        self.__model = rfqr
        return rfqr, w, test_predictions
Example #20
max_features = None  # default

##############################################################
##############################################################

####### TRAIN MODELS  ######

X = df[features]
X = np.array(X)

# def train_qrf():

# label = 'C' # quantity to predict

etaC_model = RandomForestQuantileRegressor(min_samples_split=min_samples_split,
                                           n_estimators=n_estimators,
                                           max_features=max_features)
etaW_model = RandomForestQuantileRegressor(min_samples_split=min_samples_split,
                                           n_estimators=n_estimators,
                                           max_features=max_features)
etaN_model = RandomForestQuantileRegressor(min_samples_split=min_samples_split,
                                           n_estimators=n_estimators,
                                           max_features=max_features)

etaC_model.fit(X, df['etaC'].values)
etaW_model.fit(X, df['etaW'].values)
etaN_model.fit(X, df['etaN'].values)

lon = ncres['lon'][:]
lat = ncres['lat'][:]
nlon = np.size(lon)
Example #21
def pred_error(dfr, label, features, seeds,*,
               min_samples_split=10, nshuffles=10,
               n_estimators=1000, test_size = 0.33, min_dist=100,
               max_features=None, plotmaps=True):
    ''' Predict error components using a quantile regression forest model '''

    # initialize arrays with prediction and test values:
    nX = np.size(features)
    Y_TEST   = []
    D_TEST   = []
    T_TEST   = []
    LOWER    = []
    UPPER    = []
    MEDIAN   = []
    EXPECTED = []
    LAT_TEST = []
    LON_TEST = []
    QRF_IMPS = np.zeros((nX, nshuffles))
    RF_IMPS = np.zeros((nX, nshuffles))

    for irs in range(nshuffles):

        # df = shuffle(df) # first reshuffle order of rows
        df0 = dfr.copy()
        df0 = df0.sample(frac=1, replace=False,
                       random_state=seeds[irs]).reset_index(drop=True)
        df = remove_neighbours(df0, min_dist=min_dist)



        X = df[features]
        # Xnames = list(X.columns)
        X = np.array(X)
        Yname = 'eta{}'.format(label) # label = error in the parameter C
        Y = df[Yname].values
        D = df['{}d'.format(label)].values # downscaled values to correct
        T = df['{}g'.format(label)].values # ground truth value
        Lat = df['clat'].values  # latitude value
        Lon = df['clon'].values  # longitiude value


        X_train, X_test, Y_train, Y_test, D_train, D_test, T_train, T_test, \
            Lat_train, Lat_test, Lon_train, Lon_test = train_test_split(
            X, Y, D, T, Lat, Lon, test_size=test_size, shuffle=False)


        if irs < 3 and plotmaps == True:
            plt.figure()
            plt.plot(dfr.clat, dfr.clon, '.')
            plt.plot(Lat_test, Lon_test, 'or')
            plt.plot(Lat_train, Lon_train, 'ob')
            plt.savefig(os.path.join(cfun.outplot, 'stats',
                                     'qrf_gen_{}_{}'.format(irs, label)))
            plt.close()

        # fit quantile regression forest
        rfqr = RandomForestQuantileRegressor(
                min_samples_split=min_samples_split,
                n_estimators=n_estimators, max_features=max_features)
        rfqr.fit(X_train, Y_train)
        upper = rfqr.predict(X_test, quantile=75)
        lower = rfqr.predict(X_test, quantile=25)
        median = rfqr.predict(X_test, quantile=50)
        qrf_imps = rfqr.feature_importances_
        # print(qrf_imps)

        # Fit random forest
        rfr = RandomForestRegressor(min_samples_split=min_samples_split,
                n_estimators=n_estimators, max_features=max_features)
        rfr.fit(X_train, Y_train)
        expected = rfr.predict(X_test)  # use the plain random forest fitted just above
        rf_imps = rfr.feature_importances_
        # print(rf_imps)

        Y_TEST   = np.concatenate((Y_TEST, Y_test))
        D_TEST   = np.concatenate((D_TEST, D_test))
        T_TEST   = np.concatenate((T_TEST, T_test))
        UPPER    = np.concatenate((UPPER, upper))
        LOWER    = np.concatenate((LOWER, lower))
        MEDIAN   = np.concatenate((MEDIAN, median))
        EXPECTED = np.concatenate((EXPECTED, expected))
        LAT_TEST   = np.concatenate((LAT_TEST, Lat_test))
        LON_TEST   = np.concatenate((LON_TEST, Lon_test))

        QRF_IMPS[:, irs] = qrf_imps
        RF_IMPS [:, irs] = rf_imps
        # print(QRF_IMPS)

    MEAN_QRF_IMPS = np.mean(QRF_IMPS, axis=1)
    MEAN_RF_IMPS = np.mean(RF_IMPS, axis=1)
    CORR_QRF = D_TEST/(MEDIAN + 1)
    CORR_RF = D_TEST/(EXPECTED + 1)

    res = {'pred_qrf': MEDIAN, 'pred_rf': EXPECTED,
           'upper': UPPER, 'lower': LOWER,
           'y_test': Y_TEST, 'd_test': D_TEST, 't_test': T_TEST,
           'lat_test': LAT_TEST, 'lon_test': LON_TEST,
           'corr_qrf': CORR_QRF, 'corr_rf': CORR_RF,
           'qrf_imps': QRF_IMPS,
           'rf_imps': RF_IMPS,
           'mean_qrf_imps': MEAN_QRF_IMPS,
           'mean_rf_imps': MEAN_RF_IMPS
           }
    return res
Example #22
class RandomForestQR:
    def __init__(self, params, quantiles, verbose=False):
        self.regressor = RF(n_estimators=params['n_estimators'],
                            max_features=params['max_features'],
                            min_samples_leaf=params['min_samples_leaf'],
                            random_state=params['random_state'],
                            n_jobs=params['n_jobs'])
        self.quantiles = quantiles
        self.cv_quantiles = quantiles
        self.verbose = verbose
        self.cv = params["cv"]

    def fit(self, X, y, cv=True):
        if self.cv and cv:
            self.tune(X, y)
        self.regressor.fit(X, y)

    def predict(self, X, quantiles=None):
        if quantiles is None:
            quantiles = self.cv_quantiles

        predictions = np.zeros((X.shape[0], len(quantiles)))
        for j in range(len(quantiles)):
            q = 100.0 * quantiles[j]
            predictions[:, j] = self.regressor.predict(X, q)

        predictions.sort(axis=1)
        return predictions

    def tune(self, X, y, test_ratio=0.2, random_state=1):
        "Tune using cross-validation"
        coverage_factor = 0.85
        target_coverage = round(self.quantiles[-1] - self.quantiles[0],
                                3) * coverage_factor
        range_vals = 0.3
        num_vals = 10

        print("  [CV] target coverage = %.3f" % (target_coverage))
        sys.stdout.flush()

        quantiles = np.array(self.quantiles)
        grid_q_low = np.linspace(quantiles[0], quantiles[0] + range_vals,
                                 num_vals).reshape(-1, 1)
        grid_q_median = np.repeat(0.5, num_vals).reshape(-1, 1)
        grid_q_high = np.linspace(quantiles[-1], quantiles[-1] - range_vals,
                                  num_vals).reshape(-1, 1)
        grid_q = np.concatenate((grid_q_low, grid_q_median, grid_q_high), 1)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_ratio, random_state=random_state)
        print("  [CV] Fitting random forest... ", end="")
        sys.stdout.flush()
        self.fit(X_train, y_train, cv=False)
        print("done.")
        sys.stdout.flush()

        best_avg_length = 1e10
        best_q = grid_q[0]
        for q in grid_q:
            print("  [CV] q = [%.3f,%.3f,%.3f], " % (q[0], q[1], q[-1]),
                  end="")
            sys.stdout.flush()
            y_predictions = self.predict(X_test, quantiles=q)
            lower = y_predictions[:, 0]
            upper = y_predictions[:, -1]
            coverage = np.mean((y_test >= lower) & (y_test <= upper))
            avg_length = np.mean(upper - lower)
            print("coverage = %.3f, length = %.3f" % (coverage, avg_length))
            sys.stdout.flush()
            if (coverage >= target_coverage) and (avg_length <
                                                  best_avg_length):
                best_avg_length = avg_length
                best_q = q
            else:
                break

        print("  [CV] Best q = [%.3f,%.3f,%.3f]" %
              (best_q[0], best_q[1], best_q[-1]))
        sys.stdout.flush()

        self.cv_quantiles = best_q
        return best_q
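
A sketch of driving this class end to end, assuming RF aliases skgarden's RandomForestQuantileRegressor (as the quantile-aware predict implies); the params dict and data are illustrative.

import numpy as np

params = {"n_estimators": 100, "max_features": "sqrt", "min_samples_leaf": 5,
          "random_state": 0, "n_jobs": 1, "cv": True}
model = RandomForestQR(params, quantiles=[0.05, 0.95])
X, y = np.random.rand(400, 3), np.random.rand(400)
model.fit(X, y)            # runs tune() first because params["cv"] is True
preds = model.predict(X)   # row-sorted columns at the tuned quantile levels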
Example #23
class QuantileForestRegressorAdapter(RegressorAdapter):
    """ Conditional quantile estimator, defined as quantile random forests (QRF)

    References
    ----------
    .. [1]  Meinshausen, Nicolai. "Quantile regression forests."
            Journal of Machine Learning Research 7.Jun (2006): 983-999.

    """
    def __init__(self, model, fit_params=None, quantiles=[5, 95], params=None):
        """ Initialization

        Parameters
        ----------
        model : None, unused parameter (for compatibility with nc class)
        fit_params : None, unused parameter (for compatibility with nc class)
        quantiles : numpy array, low and high quantile levels in range (0,100)
        params : dictionary of parameters
                params["random_state"] : integer, seed for splitting the data
                                         in cross-validation. Also used as the
                                         seed in quantile random forests (QRF)
                params["min_samples_leaf"] : integer, parameter of QRF
                params["n_estimators"] : integer, parameter of QRF
                params["max_features"] : integer, parameter of QRF
                params["CV"] : boolean, use cross-validation (True) or
                               not (False) to tune the two QRF quantile levels
                               to obtain the desired coverage
                params["test_ratio"] : float, ratio of held-out data, used
                                       in cross-validation
                params["coverage_factor"] : float, to avoid too conservative
                                            estimation of the prediction band,
                                            when tuning the two QRF quantile
                                            levels in cross-validation one may
                                            ask for prediction intervals with
                                            reduced average coverage, equal to
                                            coverage_factor*(q_high - q_low).
                params["range_vals"] : float, determines the lowest and highest
                                       quantile level parameters when tuning
                                       the quantile levels by cross-validation.
                                       The smallest value is equal to
                                       quantiles[0] - range_vals.
                                       Similarly, the largest is equal to
                                       quantiles[1] + range_vals.
                params["num_vals"] : integer, when tuning QRF's quantile
                                     parameters, sweep over a grid of length
                                     num_vals.

        """
        super(QuantileForestRegressorAdapter, self).__init__(model, fit_params)
        # Instantiate model
        self.quantiles = quantiles
        self.cv_quantiles = self.quantiles
        self.params = params
        self.rfqr = RandomForestQuantileRegressor(
            random_state=params["random_state"],
            min_samples_leaf=params["min_samples_leaf"],
            n_estimators=params["n_estimators"],
            max_features=params["max_features"])

    def fit(self, x, y):
        """ Fit the model to data

        Parameters
        ----------

        x : numpy array of training features (nXp)
        y : numpy array of training labels (n)

        """
        if self.params["CV"]:
            target_coverage = self.quantiles[1] - self.quantiles[0]
            coverage_factor = self.params["coverage_factor"]
            range_vals = self.params["range_vals"]
            num_vals = self.params["num_vals"]
            grid_q_low = np.linspace(self.quantiles[0],
                                     self.quantiles[0] + range_vals,
                                     num_vals).reshape(-1, 1)
            grid_q_high = np.linspace(self.quantiles[1],
                                      self.quantiles[1] - range_vals,
                                      num_vals).reshape(-1, 1)
            grid_q = np.concatenate((grid_q_low, grid_q_high), 1)

            self.cv_quantiles = tune_params_cv.CV_quntiles_rf(
                self.params, x, y, target_coverage, grid_q,
                self.params["test_ratio"], self.params["random_state"],
                coverage_factor)

        self.rfqr.fit(x, y)

    def predict(self, x):
        """ Estimate the conditional low and high quantiles given the features

        Parameters
        ----------
        x : numpy array of training features (nXp)

        Returns
        -------
        ret_val : numpy array of estimated conditional quantiles (nX2)

        """
        lower = self.rfqr.predict(x, quantile=self.cv_quantiles[0])
        upper = self.rfqr.predict(x, quantile=self.cv_quantiles[1])

        ret_val = np.zeros((len(lower), 2))
        ret_val[:, 0] = lower
        ret_val[:, 1] = upper
        return ret_val
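
A hedged construction sketch; the params dictionary simply mirrors the keys the docstring above describes, with illustrative values, and the quantile levels are on the 0-100 scale.

params = {"random_state": 0, "min_samples_leaf": 1, "n_estimators": 100,
          "max_features": 3, "CV": False, "test_ratio": 0.2,
          "coverage_factor": 0.9, "range_vals": 30, "num_vals": 5}
adapter = QuantileForestRegressorAdapter(model=None, fit_params=None,
                                         quantiles=[5, 95], params=params)
# adapter.fit(x, y) followed by adapter.predict(x) yields an n-by-2 quantile band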
Example #24
def CV_quntiles_rf(params,
                   X,
                   y,
                   target_coverage,
                   grid_q,
                   test_ratio,
                   random_state,
                   coverage_factor=0.9):
    """ Tune the low and high quantile level parameters of quantile random
        forests method, using cross-validation
    
    Parameters
    ----------
    params : dictionary of parameters
            params["random_state"] : integer, seed for splitting the data 
                                     in cross-validation. Also used as the
                                     seed in quantile random forest (QRF)
            params["min_samples_leaf"] : integer, parameter of QRF
            params["n_estimators"] : integer, parameter of QRF
            params["max_features"] : integer, parameter of QRF
    X : numpy array, containing the training features (nXp)
    y : numpy array, containing the training labels (n)
    target_coverage : desired coverage of prediction band. The output coverage
                      may be smaller if coverage_factor <= 1, in this case the
                      target will be modified to target_coverage*coverage_factor
    grid_q : numpy array, of low and high quantile levels to test
    test_ratio : float, test size of the held-out data
    random_state : integer, seed for splitting the data in cross-validation.
                   Also used as the seed in QRF.
    coverage_factor : float, when tuning the two QRF quantile levels one may
                      ask for prediction band with smaller average coverage,
                      equal to coverage_factor*(q_high - q_low) to avoid too
                      conservative estimation of the prediction band
    
    Returns
    -------
    best_q : numpy array of low and high quantile levels (length 2)
    
    References
    ----------
    .. [1]  Meinshausen, Nicolai. "Quantile regression forests."
            Journal of Machine Learning Research 7.Jun (2006): 983-999.
    
    """
    target_coverage = coverage_factor * target_coverage
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=random_state)
    best_avg_length = 1e10
    best_q = grid_q[0]

    rf = RandomForestQuantileRegressor(
        random_state=params["random_state"],
        min_samples_leaf=params["min_samples_leaf"],
        n_estimators=params["n_estimators"],
        max_features=params["max_features"])
    rf.fit(X_train, y_train)

    for q in grid_q:
        y_lower = rf.predict(X_test, quantile=q[0])
        y_upper = rf.predict(X_test, quantile=q[1])
        coverage, avg_length = helper.compute_coverage_len(
            y_test, y_lower, y_upper)
        if (coverage >= target_coverage) and (avg_length < best_avg_length):
            best_avg_length = avg_length
            best_q = q
        else:
            break
    return best_q
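
For reference, a sketch of building the grid_q argument this tuner sweeps over: rows of (low, high) levels on the 0-100 scale that move inward from the nominal quantiles, mirroring the construction in Example #23's fit.

import numpy as np

num_vals, range_vals = 5, 30
grid_q_low = np.linspace(5, 5 + range_vals, num_vals).reshape(-1, 1)
grid_q_high = np.linspace(95, 95 - range_vals, num_vals).reshape(-1, 1)
grid_q = np.concatenate((grid_q_low, grid_q_high), 1)  # shape (5, 2)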
Example #25
def rfqr_model(pos_dict, predict_year, sz):
    model_dict = {}
    predict_dict = {}
    outcomes = {}
    for pos in pos_dict:
        print(pos)
        target = copy.deepcopy(pos_dict[pos])

        # team dummy variables
        dum = pd.get_dummies(target.Tm)
        target = target.drop('Tm', axis=1)
        target = pd.concat([target, dum], axis=1)

        # save these values to evaluate predictions later
        outcomes[pos] = target.loc[target.Year == predict_year]\
            .reset_index(drop = True)[['Name', 'pts_next_year']]

        # set aside data to use for model prediction when the model is done
        predict_dict[pos] = target.loc[target.Year == predict_year]\
            .reset_index(drop = True)\
            .drop(['Year', 'pts_next_year', 'Name'], axis=1)

        # make sure new values aren't used in the modeling
        target = target.loc[target.Year < predict_year].reset_index(drop=True)
        # only use 'sz' years of data before prediction year
        target = target.loc[target.Year > predict_year - sz].drop(
            ['Year', 'Name'], axis=1)

        # separate labels, targets, features
        labels = np.array(target['pts_next_year'])
        target = target.drop(['pts_next_year'], axis=1)
        features = np.array(target)
        feature_list = list(target.columns)

        # run model
        rfqr = RandomForestQuantileRegressor(random_state=0, n_estimators=3000)
        rfqr.fit(features, labels)
        model_dict[pos] = rfqr

        #
        upper = np.concatenate(
            ([], rfqr.predict(predict_dict[pos], quantile=98.5)))
        lower = np.concatenate(
            ([], rfqr.predict(predict_dict[pos], quantile=2.5)))
        median = np.concatenate(
            ([], rfqr.predict(predict_dict[pos], quantile=50)))

        #interval = upper - lower
        #sort_ind = np.argsort(interval)
        y_true_all = outcomes[pos]['pts_next_year']  #[sort_ind]
        upper = upper  #[sort_ind]
        lower = lower  #[sort_ind]
        median = median  #[sort_ind]
        #mean = (upper + lower) / 2

        # Center such that the mean of the prediction interval is at 0.0
        #y_true_all -= mean
        #upper -= mean
        #lower -= mean

        plt.plot(y_true_all, "ro")
        plt.fill_between(np.arange(len(upper)),
                         lower,
                         upper,
                         alpha=0.2,
                         color="r",
                         label="Pred. interval")
        plt.plot(median)
        plt.xlabel("X variable")
        plt.ylabel("Points")
        plt.xlim([0, 100])
        plt.show()

        # Get numerical feature importances
        importances = list(rfqr.feature_importances_)
        feature_importances = [
            (feature, round(importance, 2))
            for feature, importance in zip(feature_list, importances)
        ]
        feature_importances = sorted(feature_importances,
                                     key=lambda x: x[1],
                                     reverse=True)
        [
            print('Variable: {:20} Importance: {}'.format(*pair))
            for pair in feature_importances
        ]

    ## get outputs
    final_dict = copy.deepcopy(predict_dict)
    for pos in predict_dict:
        model = model_dict[pos]
        final_dict[pos]['prediction'] = model.predict(predict_dict[pos])
        final_dict[pos]['Names'] = outcomes[pos]['Name']
        final_dict[pos]['pts_next_year'] = outcomes[pos]['pts_next_year']
    return final_dict
         label="Cross-validation score")

sns.despine()
plt.ylabel('R2 score')
plt.xlabel('Training examples')
plt.ylim((0, 1))
plt.legend(loc="best")
plt.show()


# ===============================================================
# RF QUANTILE REGRESSOR
# ===============================================================

# == fit
rfqr = RandomForestQuantileRegressor(**best_params)
rfqr.fit(X, y)
lower = rfqr.predict(X, quantile=2.5)
upper = rfqr.predict(X, quantile=97.5)
med = rfqr.predict(X, quantile=50)
ypred = reg.predict(X)

# plot confidence intervals
sort_ind = np.argsort(ypred)
plt.plot(np.arange(len(upper)), lower[sort_ind], label='lower')
plt.plot(np.arange(len(upper)), ypred[sort_ind], label='predicted')
plt.plot(np.arange(len(upper)), med[sort_ind], label='median')
plt.plot(np.arange(len(upper)), upper[sort_ind], label='upper')
plt.xlabel('ordered samples')
plt.ylabel('dropout rate')
plt.legend()
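
A natural follow-up the snippet stops short of: the in-sample coverage of the [2.5, 97.5] band, computed from the arrays already defined above.

coverage = np.mean((y >= lower) & (y <= upper))
print("In-sample coverage of the [2.5, 97.5] band: %.3f" % coverage)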
Example #27
def train_RandomForestQuantileRegressor(
        population, plpData, train, modelOutput, seed, quiet, n_estimators,
        criterion, max_features, max_depth, min_samples_split,
        min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, bootstrap,
        oob_score, warm_start):
    print("Training RandomForestQuantileRegressor ")
    y = population[:, 1]
    X = plpData[population[:, 0], :]
    trainInds = population[:, population.shape[1] - 1] > 0
    print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1]))
    print("population loaded- %s rows and %s columns" %
          (np.shape(population)[0], np.shape(population)[1]))
    ###########################################################################
    if train:
        pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0))
        print("Calculating prediction for train set of size %s" % (pred_size))
        test_pred = np.zeros(
            pred_size
        )  # zeros length sum(population[:,population.size[1]] ==i)
        for i in range(1,
                       int(np.max(population[:, population.shape[1] - 1]) + 1),
                       1):
            testInd = population[population[:, population.shape[1] - 1] > 0,
                                 population.shape[1] - 1] == i
            trainInd = (population[population[:, population.shape[1] - 1] > 0,
                                   population.shape[1] - 1] != i)
            train_x = X[trainInds, :][trainInd, :]
            train_y = y[trainInds][trainInd]
            test_x = X[trainInds, :][testInd, :]
            print("Fold %s split %s in train set and %s in test set" %
                  (i, train_x.shape[0], test_x.shape[0]))
            print("Train set contains %s outcomes " % (np.sum(train_y)))
            print("Training fold %s" % (i))
            start_time = timeit.default_timer()
            tmodel = RandomForestQuantileRegressor(
                n_estimators=n_estimators,
                criterion=criterion,
                max_features=max_features,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                min_weight_fraction_leaf=min_weight_fraction_leaf,
                max_leaf_nodes=max_leaf_nodes,
                bootstrap=bootstrap,
                oob_score=oob_score,
                warm_start=warm_start,
                random_state=seed,
                n_jobs=-1)
            tmodel = tmodel.fit(X=csr_matrix(train_x), y=train_y)
            end_time = timeit.default_timer()
            print("Training fold took: %.2f s" % (end_time - start_time))
            print("Calculating predictions on left out fold set...")
            ind = (population[:, population.shape[1] - 1] > 0)
            ind = population[ind, population.shape[1] - 1] == i
            test_pred[ind] = tmodel.predict(csr_matrix(test_x))
            print("Prediction complete: %s rows " %
                  (np.shape(test_pred[ind])[0]))
            print("Mean: %s prediction value" % (np.mean(test_pred[ind])))
        # merge pred with indexes[testInd,:]
        test_pred.shape = (
            population[population[:, population.shape[1] - 1] > 0, :].shape[0],
            1)
        prediction = np.append(
            population[population[:, population.shape[1] - 1] > 0, :],
            test_pred,
            axis=1)
        return prediction
    # train final:
    else:
        print("Training final adaBoost model on all train data...")
        print("X- %s rows and Y %s length" %
              (X[trainInds, :].shape[0], y[trainInds].shape[0]))
        start_time = timeit.default_timer()
        tmodel = RandomForestQuantileRegressor(
            n_estimators=n_estimators,
            criterion=criterion,
            max_features=max_features,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_leaf_nodes=max_leaf_nodes,
            bootstrap=bootstrap,
            oob_score=oob_score,
            warm_start=warm_start,
            random_state=seed,
            n_jobs=-1)
        tmodel = tmodel.fit(X=csr_matrix(X[trainInds, :]), y=y[trainInds])
        end_time = timeit.default_timer()
        print("Training final took: %.2f s" % (end_time - start_time))
        # save the model:
        if not os.path.exists(modelOutput):
            os.makedirs(modelOutput)
        print("Model saved to: %s" % (modelOutput))
        joblib.dump(tmodel,
                    os.path.join(modelOutput, "model.pkl"),
                    compress=True)
        pred = tmodel.predict(csr_matrix(X[trainInds, :]))[:, 0]
        pred.shape = (
            population[population[:, population.shape[1] - 1] > 0, :].shape[0],
            1)
        prediction = np.append(
            population[population[:, population.shape[1] - 1] > 0, :],
            pred,
            axis=1)
        return prediction, tmodel.feature_importances_
Example #28
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from skgarden import RandomForestQuantileRegressor

X = pd.read_csv(r'c:\test\2010pop-.csv',
                usecols=['slope', 'poi', 'dem', 'ndvi', 'dmsp'])
y = pd.read_csv(r'c:\test\2010pop-.csv', usecols=['log_pop'])
X = np.array(X)
y = np.array(y)
y = y.reshape(y.shape[0], )

kf = KFold(n_splits=6, shuffle=True, random_state=0)  # shuffle is required when random_state is set
rfqr = RandomForestQuantileRegressor(random_state=0,
                                     min_samples_split=10,
                                     n_estimators=1000)

y_true_all = []
lower = []
upper = []

for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = (X[train_index], X[test_index],
                                        y[train_index], y[test_index])

    rfqr.set_params(max_features=X_train.shape[1] // 3)
    rfqr.fit(X_train, y_train)
    y_true_all = np.concatenate((y_true_all, y_test))
    upper = np.concatenate((upper, rfqr.predict(X_test, quantile=98.5)))
    lower = np.concatenate((lower, rfqr.predict(X_test, quantile=2.5)))
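
The snippet stops just short of scoring the band; a one-line follow-up computing the empirical coverage of the cross-validated [2.5, 98.5] interval from the arrays built above.

coverage = np.mean((y_true_all >= lower) & (y_true_all <= upper))
print("Empirical coverage of the [2.5, 98.5] band: %.3f" % coverage)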
Example #29
with the more direct "just predict the maximum" approach.
"""
import numpy as np
import matplotlib.pyplot as plt
from skgarden import RandomForestQuantileRegressor
from sklearn.ensemble import RandomForestRegressor

_, y = simulate_ts(T=1500)

pasts, futures = windows(y)
samples = window_samples(pasts, futures, np.arange(0, 1.05, 0.05))

###############################################################################
# First the quantile forest idea
###############################################################################
model = RandomForestQuantileRegressor(n_estimators=1000)

# fit model using all the quantiles
y_p = np.array([v["x"] for v in samples])
y_f = np.array([v["y"] for v in samples])
model.fit(y_p, y_f)

# make predictions only for 0.9 quantiles
x_q = np.array([v["x"] for v in samples if v["quantile"] == 1])
q_hat = model.predict(x_q, quantile=1)

y_q = np.array([v["y"] for v in samples if v["quantile"] == 1])
plt.scatter(y_q, q_hat)
#plt.show()

###############################################################################
Example #30
def CV_quntiles_rf(params, X, y, target_coverage, grid_q, test_ratio, random_state, coverage_factor=1.0):
    """ Tune the low and high quantile level parameters of quantile random
        forests method, using cross-validation
    
    Parameters
    ----------
    params : dictionary of parameters
            params["random_state"] : integer, seed for splitting the data 
                                     in cross-validation. Also used as the
                                     seed in quantile random forest (QRF)
            params["min_samples_leaf"] : integer, parameter of QRF
            params["n_estimators"] : integer, parameter of QRF
            params["max_features"] : integer, parameter of QRF
    X : numpy array, containing the training features (nXp)
    y : numpy array, containing the training labels (n)
    target_coverage : desired coverage of prediction band. The output coverage
                      may be smaller if coverage_factor <= 1, in this case the
                      target will be modified to target_coverage*coverage_factor
    grid_q : numpy array, of low and high quantile levels to test
    test_ratio : float, test size of the held-out data
    random_state : integer, seed for splitting the data in cross-validation.
                   Also used as the seed in QRF.
    coverage_factor : float, when tuning the two QRF quantile levels one may
                      ask for prediction band with smaller average coverage,
                      equal to coverage_factor*(q_high - q_low) to avoid too
                      conservative estimation of the prediction band
    
    Returns
    -------
    best_q : numpy array of low and high quantile levels (length 2)
    
    References
    ----------
    .. [1]  Meinshausen, Nicolai. "Quantile regression forests."
            Journal of Machine Learning Research 7.Jun (2006): 983-999.
    
    """
    target_coverage = coverage_factor*target_coverage

    rf = RandomForestQuantileRegressor(random_state=params["random_state"],
                                       min_samples_leaf=params["min_samples_leaf"],
                                       n_estimators=params["n_estimators"],
                                       max_features=params["max_features"])

    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=random_state)
    n_folds = 10
    kf = KFold(n_splits=n_folds)
    folds = kf.split(X,y)

    coverage_values = np.zeros((len(grid_q), n_folds))
    length_values = np.zeros((len(grid_q), n_folds))

    fold_idx = 0
    for fold in folds:
        print("[CV DEBUG] fold " + str(fold_idx+1) + " of " + str(n_folds) + "... ", end="")
        sys.stdout.flush()

        idx_train = fold[0]
        idx_test = fold[1]
        X_train = X[idx_train,:]
        y_train = y[idx_train]
        X_test = X[idx_test,:]
        y_test = y[idx_test]

        rf.fit(X_train, y_train)

        for q_idx in range(len(grid_q)):
            q = grid_q[q_idx]
            y_lower = rf.predict(X_test, quantile=q[0])
            y_upper = rf.predict(X_test, quantile=q[-1])
            coverage, avg_length = helper.compute_coverage_len(y_test, y_lower, y_upper)
            coverage_values[q_idx,fold_idx] = coverage
            length_values[q_idx,fold_idx] = avg_length

        fold_idx = fold_idx+1

        print("done.")
        sys.stdout.flush()


    avg_coverage = coverage_values.mean(1)
    avg_length = length_values.mean(1)

    idx_under = np.where(avg_coverage<=target_coverage)[0]
    if len(idx_under)>0:
        best_idx = np.max(idx_under)
    else:
        best_idx = 0
    best_q = grid_q[best_idx]
    best_coverage = avg_coverage[best_idx]
    best_length = avg_length[best_idx]

    print("[CV DEBUG] best q " + str(best_q) + ", coverage " + str(best_coverage) + 
          ", length " + str(best_length))

    return best_q, best_coverage, best_length