Example #1
import numpy as np
from numpy.testing import assert_array_almost_equal
from skgarden import RandomForestQuantileRegressor  # scikit-garden

def test_max_depth_None_rfqr():
    # Since each leaf is pure and has just one unique value,
    # the mean equals any quantile.
    rng = np.random.RandomState(0)
    X = rng.randn(10, 1)
    y = np.linspace(0.0, 100.0, 10)

    rfqr = RandomForestQuantileRegressor(random_state=0,
                                         bootstrap=False,
                                         max_depth=None)
    rfqr.fit(X, y)

    for quantile in [20, 40, 50, 60, 80, 90]:
        assert_array_almost_equal(rfqr.predict(X, quantile=None),
                                  rfqr.predict(X, quantile=quantile), 5)
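
The API exercised here: predict(X, quantile=q) takes a percentile in
[0, 100], and quantile=None falls back to the mean prediction. A minimal
usage sketch of prediction intervals built on that API (an illustration
assuming scikit-garden's skgarden package and synthetic data, not part of
the test suite):

import numpy as np
from skgarden import RandomForestQuantileRegressor

rng = np.random.RandomState(0)
X = rng.uniform(-1.0, 1.0, size=(200, 1))
y = X.ravel() + rng.normal(scale=0.1, size=200)

rfqr = RandomForestQuantileRegressor(n_estimators=50, random_state=0)
rfqr.fit(X, y)

lower = rfqr.predict(X, quantile=5)   # 5th percentile
upper = rfqr.predict(X, quantile=95)  # 95th percentile
mean = rfqr.predict(X)                # quantile=None -> mean prediction
print("90%% interval coverage: %.2f" % np.mean((y >= lower) & (y <= upper)))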
Example #2
import numpy as np
from numpy.testing import assert_array_almost_equal
from skgarden import (DecisionTreeQuantileRegressor,
                      RandomForestQuantileRegressor)

def test_tree_forest_equivalence():
    """
    Test that a DecisionTree and a RandomForest give equal quantile
    predictions when bootstrap is set to False.
    """
    rfqr = RandomForestQuantileRegressor(random_state=0,
                                         bootstrap=False,
                                         max_depth=2)
    rfqr.fit(X_train, y_train)

    dtqr = DecisionTreeQuantileRegressor(random_state=0, max_depth=2)
    dtqr.fit(X_train, y_train)

    assert np.all(rfqr.y_train_leaves_ == dtqr.y_train_leaves_)
    assert_array_almost_equal(rfqr.predict(X_test, quantile=10),
                              dtqr.predict(X_test, quantile=10), 5)
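
Example #2 relies on module-level fixtures X_train, y_train and X_test that
are not shown here. One minimal way to construct them (an assumption; the
original test module may build its fixtures differently):

import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X_all = rng.randn(100, 3)
y_all = X_all.dot([1.0, 2.0, -1.0]) + rng.normal(scale=0.1, size=100)
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.3, random_state=0)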
Example #3
import numpy as np
from skgarden import (DecisionTreeQuantileRegressor,
                      ExtraTreeQuantileRegressor,
                      ExtraTreesQuantileRegressor,
                      RandomForestQuantileRegressor)

def test_base_forest_quantile():
    """
    Test that the base estimators belong to the correct class.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(10, 1)
    y = np.linspace(0.0, 100.0, 10)

    rfqr = RandomForestQuantileRegressor(random_state=0, max_depth=1)
    rfqr.fit(X, y)
    for est in rfqr.estimators_:
        assert isinstance(est, DecisionTreeQuantileRegressor)

    etqr = ExtraTreesQuantileRegressor(random_state=0, max_depth=1)
    etqr.fit(X, y)
    for est in etqr.estimators_:
        assert isinstance(est, ExtraTreeQuantileRegressor)
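
Each fitted base estimator in estimators_ is itself a quantile regressor
and can be queried individually. A small sketch (same skgarden API as
above; synthetic data):

import numpy as np
from skgarden import RandomForestQuantileRegressor

rng = np.random.RandomState(0)
X = rng.randn(50, 2)
y = rng.randn(50)

rfqr = RandomForestQuantileRegressor(n_estimators=5, random_state=0)
rfqr.fit(X, y)
per_tree_medians = np.stack(
    [est.predict(X, quantile=50) for est in rfqr.estimators_])
print(per_tree_medians.shape)  # (5, 50): one row of medians per base tree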
Example #4
import os
import timeit

import joblib
import numpy as np
from scipy.sparse import csr_matrix
from skgarden import RandomForestQuantileRegressor  # scikit-garden


def train_RandomForestQuantileRegressor(
        population, plpData, train, modelOutput, seed, quiet, n_estimators,
        criterion, max_features, max_depth, min_samples_split,
        min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, bootstrap,
        oob_score, warm_start):
    print("Training RandomForestQuantileRegressor")
    # population layout: column 0 indexes rows of plpData, column 1 is the
    # outcome, and the last column is the cross-validation fold (> 0 = train).
    y = population[:, 1]
    X = plpData[population[:, 0], :]
    trainInds = population[:, population.shape[1] - 1] > 0
    print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1]))
    print("population loaded- %s rows and %s columns" %
          (np.shape(population)[0], np.shape(population)[1]))
    ###########################################################################
    if train:
        pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0))
        print("Calculating prediction for train set of size %s" % (pred_size))
        test_pred = np.zeros(
            pred_size
        )  # one prediction slot per train-set row, filled fold by fold
        for i in range(1,
                       int(np.max(population[:, population.shape[1] - 1]) + 1),
                       1):
            testInd = population[population[:, population.shape[1] - 1] > 0,
                                 population.shape[1] - 1] == i
            trainInd = (population[population[:, population.shape[1] - 1] > 0,
                                   population.shape[1] - 1] != i)
            train_x = X[trainInds, :][trainInd, :]
            train_y = y[trainInds][trainInd]
            test_x = X[trainInds, :][testInd, :]
            print("Fold %s split %s in train set and %s in test set" %
                  (i, train_x.shape[0], test_x.shape[0]))
            print("Train set contains %s outcomes " % (np.sum(train_y)))
            print("Training fold %s" % (i))
            start_time = timeit.default_timer()
            tmodel = RandomForestQuantileRegressor(
                n_estimators=n_estimators,
                criterion=criterion,
                max_features=max_features,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                min_weight_fraction_leaf=min_weight_fraction_leaf,
                max_leaf_nodes=max_leaf_nodes,
                bootstrap=bootstrap,
                oob_score=oob_score,
                warm_start=warm_start,
                random_state=seed,
                n_jobs=-1)
            tmodel = tmodel.fit(X=csr_matrix(train_x), y=train_y)
            end_time = timeit.default_timer()
            print("Training fold took: %.2f s" % (end_time - start_time))
            print("Calculating predictions on left out fold set...")
            ind = (population[:, population.shape[1] - 1] > 0)
            ind = population[ind, population.shape[1] - 1] == i
            test_pred[ind] = tmodel.predict(csr_matrix(test_x))
            print("Prediction complete: %s rows " %
                  (np.shape(test_pred[ind])[0]))
            print("Mean: %s prediction value" % (np.mean(test_pred[ind])))
        # append the out-of-fold predictions as a new column to the
        # train-set rows of population
        test_pred.shape = (
            population[population[:, population.shape[1] - 1] > 0, :].shape[0],
            1)
        prediction = np.append(
            population[population[:, population.shape[1] - 1] > 0, :],
            test_pred,
            axis=1)
        return prediction
    # train final:
    else:
        print("Training final adaBoost model on all train data...")
        print("X- %s rows and Y %s length" %
              (X[trainInds, :].shape[0], y[trainInds].shape[0]))
        start_time = timeit.default_timer()
        tmodel = RandomForestQuantileRegressor(
            n_estimators=n_estimators,
            criterion=criterion,
            max_features=max_features,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_leaf_nodes=max_leaf_nodes,
            bootstrap=bootstrap,
            oob_score=oob_score,
            warm_start=warm_start,
            random_state=seed,
            n_jobs=-1)
        tmodel = tmodel.fit(X=csr_matrix(X[trainInds, :]), y=y[trainInds])
        end_time = timeit.default_timer()
        print("Training final took: %.2f s" % (end_time - start_time))
        # save the model:
        if not os.path.exists(modelOutput):
            os.makedirs(modelOutput)
        print("Model saved to: %s" % (modelOutput))
        joblib.dump(tmodel, os.path.join(modelOutput, "model.pkl"))
        pred = tmodel.predict(csr_matrix(X[trainInds, :]))  # returns a 1-D array
        pred.shape = (
            population[population[:, population.shape[1] - 1] > 0, :].shape[0],
            1)
        prediction = np.append(
            population[population[:, population.shape[1] - 1] > 0, :],
            pred,
            axis=1)
        return prediction, tmodel.feature_importances_
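
A hypothetical driver for the function above, illustrating the data layout
it expects: population carries plpData row indices in column 0, the outcome
in column 1, and a cross-validation fold index (> 0 for train rows) in the
last column. All names and values below are synthetic, and criterion="mse"
assumes the older scikit-learn versions that scikit-garden targets:

import numpy as np
from scipy.sparse import csr_matrix

n = 30
plpData = csr_matrix(np.random.RandomState(0).randn(n, 5))
population = np.column_stack([
    np.arange(n),                                  # col 0: plpData row index
    np.random.RandomState(1).binomial(1, 0.3, n),  # col 1: outcome
    np.tile([1, 2, 3], n // 3),                    # last col: CV fold (>0 = train)
])

prediction = train_RandomForestQuantileRegressor(
    population, plpData, train=True, modelOutput="model_dir", seed=0,
    quiet=True, n_estimators=10, criterion="mse", max_features="sqrt",
    max_depth=3, min_samples_split=2, min_samples_leaf=1,
    min_weight_fraction_leaf=0.0, max_leaf_nodes=None, bootstrap=True,
    oob_score=False, warm_start=False)
print(prediction.shape)  # (30, 4): train-fold rows plus a prediction column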