def test_max_depth_None_rfqr():
    """Check that every quantile equals the mean for fully grown trees.

    The forest is fit without bootstrapping and without a depth limit on
    ten samples with distinct targets, and the prediction made with
    ``quantile=None`` is compared against the prediction for several
    quantiles.
    """
    # Since each leaf is pure and has just one unique value,
    # the mean equals any quantile.
    rng = np.random.RandomState(0)
    X = rng.randn(10, 1)
    # `num` must be an integer; a float sample count is rejected by NumPy.
    y = np.linspace(0.0, 100.0, 10)
    rfqr = RandomForestQuantileRegressor(
        random_state=0, bootstrap=False, max_depth=None)
    rfqr.fit(X, y)

    # The quantile=None prediction does not depend on the loop variable,
    # so compute it once.
    mean_prediction = rfqr.predict(X, quantile=None)
    for quantile in [20, 40, 50, 60, 80, 90]:
        assert_array_almost_equal(
            mean_prediction, rfqr.predict(X, quantile=quantile), 5)
def test_tree_forest_equivalence():
    """
    A forest fit with bootstrap=False and a single decision tree, both built
    with the same seed and depth, must store identical training leaves and
    give the same 10th-quantile predictions on the test set.
    """
    shared_params = dict(random_state=0, max_depth=2)

    forest = RandomForestQuantileRegressor(bootstrap=False, **shared_params)
    forest.fit(X_train, y_train)

    tree = DecisionTreeQuantileRegressor(**shared_params)
    tree.fit(X_train, y_train)

    # The stored per-leaf training targets have to agree element-wise.
    leaves_match = np.all(forest.y_train_leaves_ == tree.y_train_leaves_)
    assert_true(leaves_match)

    forest_q10 = forest.predict(X_test, quantile=10)
    tree_q10 = tree.predict(X_test, quantile=10)
    assert_array_almost_equal(forest_q10, tree_q10, 5)
def train_RandomForestQuantileRegressor(
        population, plpData, train, modelOutput, seed, quiet, n_estimators,
        criterion, max_features, max_depth, min_samples_split,
        min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, bootstrap,
        oob_score, warm_start):
    """Cross-validate or fit a final RandomForestQuantileRegressor.

    Parameters
    ----------
    population : 2-D array
        Column 0 is used as the row index into ``plpData``, column 1 as the
        target ``y``, and the last column as the fold index. Rows whose fold
        index is > 0 form the train set.
    plpData : 2-D matrix
        Feature matrix, indexed by rows via ``population[:, 0]``.
    train : bool
        If truthy, run cross-validation over folds ``1..max(fold index)`` and
        return the out-of-fold predictions. Otherwise fit one model on the
        whole train set, save it, and return its predictions on that set.
    modelOutput : str
        Directory where ``model.pkl`` is written (final-model branch only);
        created if it does not exist.
    seed : int
        Passed to the estimator as ``random_state``.
    quiet
        Accepted for interface compatibility; not used by this function.
    n_estimators, criterion, max_features, max_depth, min_samples_split,
    min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, bootstrap,
    oob_score, warm_start
        Forwarded unchanged to ``RandomForestQuantileRegressor``.

    Returns
    -------
    prediction : 2-D array
        When ``train`` is truthy: the train-set rows of ``population`` with
        one extra column holding the predictions.
    (prediction, feature_importances) : tuple
        When ``train`` is falsy: the same kind of prediction array plus the
        fitted model's ``feature_importances_``.
    """
    def new_model():
        # Single place that builds the estimator so both branches are
        # guaranteed to use identical hyper-parameters.
        return RandomForestQuantileRegressor(
            n_estimators=n_estimators,
            criterion=criterion,
            max_features=max_features,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_leaf_nodes=max_leaf_nodes,
            bootstrap=bootstrap,
            oob_score=oob_score,
            warm_start=warm_start,
            random_state=seed,
            n_jobs=-1)

    print("Training RandomForestQuantileRegressor ")
    y = population[:, 1]
    X = plpData[population[:, 0], :]
    # The last column of `population` carries the fold index; only rows with
    # a positive index take part in training / cross-validation.
    fold_index = population[:, population.shape[1] - 1]
    trainInds = fold_index > 0
    print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1]))
    print("population loaded- %s rows and %s columns"
          % (np.shape(population)[0], np.shape(population)[1]))

    # Train-set views, computed once instead of on every use.
    train_population = population[trainInds, :]
    train_folds = fold_index[trainInds]
    X_in_train = X[trainInds, :]
    y_in_train = y[trainInds]
    ###########################################################################
    if train:
        pred_size = int(np.sum(trainInds))
        print("Calculating prediction for train set of size %s" % (pred_size))
        # One slot per train-set row, filled fold by fold below.
        test_pred = np.zeros(pred_size)
        for i in range(1, int(np.max(fold_index) + 1), 1):
            testInd = train_folds == i
            trainInd = train_folds != i
            train_x = X_in_train[trainInd, :]
            train_y = y_in_train[trainInd]
            test_x = X_in_train[testInd, :]
            print("Fold %s split %s in train set and %s in test set"
                  % (i, train_x.shape[0], test_x.shape[0]))
            print("Train set contains %s outcomes " % (np.sum(train_y)))
            print("Training fold %s" % (i))
            start_time = timeit.default_timer()
            tmodel = new_model()
            tmodel = tmodel.fit(X=csr_matrix(train_x), y=train_y)
            end_time = timeit.default_timer()
            print("Training fold took: %.2f s" % (end_time - start_time))
            print("Calculating predictions on left out fold set...")
            test_pred[testInd] = tmodel.predict(csr_matrix(test_x))
            print("Prediction complete: %s rows "
                  % (np.shape(test_pred[testInd])[0]))
            print("Mean: %s prediction value" % (np.mean(test_pred[testInd])))
        # Merge the predictions with the train-set rows as an extra column.
        test_pred.shape = (train_population.shape[0], 1)
        prediction = np.append(train_population, test_pred, axis=1)
        return prediction

    # Train the final model on the whole train set.
    print("Training final RandomForestQuantileRegressor model on all train "
          "data...")
    print("X- %s rows and Y %s length"
          % (X_in_train.shape[0], y_in_train.shape[0]))
    start_time = timeit.default_timer()
    tmodel = new_model()
    tmodel = tmodel.fit(X=csr_matrix(X_in_train), y=y_in_train)
    end_time = timeit.default_timer()
    print("Training final took: %.2f s" % (end_time - start_time))

    # Save the model, then report where it went.
    if not os.path.exists(modelOutput):
        os.makedirs(modelOutput)
    joblib.dump(tmodel, os.path.join(modelOutput, "model.pkl"))
    print("Model saved to: %s" % (modelOutput))

    # The cross-validation branch stores predict() output directly into a
    # 1-D buffer, so the result is 1-D; indexing it with [:, 0] would raise
    # IndexError. Reshape to a single column instead.
    pred = np.asarray(tmodel.predict(csr_matrix(X_in_train))).reshape(-1, 1)
    prediction = np.append(train_population, pred, axis=1)
    return prediction, tmodel.feature_importances_