def LearnGradientBoost(iX, iy, param_overrides=None, test_ratio=0.2):
  """Learns a gradient-boosted model to approximate iy from iX.

  Fits a GradientBoostingRegressor on a train split, then truncates it at the
  boosting step that scores best on the held-out test split.

  Args:
    iX: List of pandas.Series, one per feature; each must have the same
      length as iy.
    iy: pandas.Series of target values.
    param_overrides: Dict of param overrides for the GradientBoost algorithm
      (merged over the defaults below).
    test_ratio: Fraction of the data to put into the test set.

  Returns:
    The fitted GradientBoostingRegressor (cut at its best step by
    _CutAtBestStep).
  """
  assert all(len(x) == len(iy) for x in iX), "%s != %d" % (map(len, iX), len(iy))
  X, y = PrepareForTraining(iX, iy)
  X_test, X_train, y_test, y_train = _SplitIntoTrainAndTest(X, y, test_ratio)
  # Default hyperparameters; "learn_rate" is the parameter name used by the
  # sklearn version this file targets.
  params = {"n_estimators": 500, "max_depth": 1, "min_samples_split": 5,
            "learn_rate": 0.1, "loss": "ls"}
  if param_overrides:
    params.update(param_overrides)
  clf = ensemble.GradientBoostingRegressor(**params)
  clf.fit(X_train, y_train)
  # Truncate the ensemble at the step that performs best on the test split.
  best_func = _CutAtBestStep(clf, X_test, y_test)
  _PrintDebugInfo(best_func, X_test, X_train, y_test, y_train)
  return clf
def LearnGradientBoostInTwoHalves(iX, iy, param_overrides=None, min_steps=10): """Learns a model to approximate iy from iX. Args: iX: List of pandas.Series. iy: pandas.Series. param_overrides: Dict of param overrides for GradientBoost algorithm. test_ratio: Which part to put into test set. min_steps: int. If training for any part of the data took more than this number of steps, we retrain. Returns: Model. """ assert all(len(x) == len(iy) for x in iX), "%s != %d" % (map(len, iX), len(iy)) X, y = PrepareForTraining(iX, iy) L = len(y) # print '%d docs have y defined out of total %d docs' % (L, len(iy)) while True: X_1, X_2, y_1, y_2 = _SplitIntoTrainAndTest(X, y, 0.5) params = { "n_estimators": 500, "max_depth": 1, "min_samples_split": 5, "min_samples_leaf": 5, "learn_rate": 0.1, "loss": "ls", } if param_overrides: params.update(param_overrides) clf1 = ensemble.GradientBoostingRegressor(**params) clf1.fit(X_1, y_1) step1 = _CutAtBestStep(clf1, X_2, y_2) if step1 < min_steps: continue clf2 = ensemble.GradientBoostingRegressor(**params) clf2.fit(X_2, y_2) step2 = _CutAtBestStep(clf2, X_1, y_1) if step1 >= min_steps and step2 >= min_steps: print step1, step2 return base_predictor.AveragePredictor(clf1, clf2) """