Beispiel #1
0
def get_regressor_model(model_config):
    """Build the regressor selected by ``model_config.model_type``.

    Args:
        model_config: Object exposing ``model_type`` (tested with ``in`` for
            the markers ``'rf'`` / ``'random_forest'``, or ``'xgb'`` together
            with ``'tree'``) and ``model_params`` (keyword arguments forwarded
            to the regressor constructor).

    Returns:
        A ``RandomForestRegressor`` built from ``model_config.model_params``
        whose ``n_estimators`` has been reset to 0.

    Raises:
        NotImplementedError: If the xgboost tree model is requested; that
            branch is recognised but has no implementation yet.
        ValueError: If ``model_type`` matches none of the known markers.
    """
    model_type = model_config.model_type

    if 'rf' in model_type or 'random_forest' in model_type:
        from sklearn.ensemble import RandomForestRegressor
        regressor = RandomForestRegressor(**model_config.model_params)
        # temporarily set n_estimators to 0, will add estimators for each batch in iteration
        regressor.n_estimators = 0
        return regressor

    if 'xgb' in model_type and 'tree' in model_type:
        # TODO: add other options? Candidate implementation:
        #   from xgboost.sklearn_generic import XGBRegressor
        # Fail explicitly instead of falling through to an unbound return value.
        raise NotImplementedError(
            'model_type {} is recognised but not implemented yet.'.format(
                model_type))

    message = (
        'model_type {} provided in config file is not a valid model type.'
        .format(model_type))
    logger.error(message)
    raise ValueError(message)
Beispiel #2
0
        # NOTE(review): this fragment starts inside two enclosing loops whose
        # headers are outside this view; `x_`, `h`, `v`, `year`, `month`,
        # `day`, `masks`, `i` and `it` are all bound before this point.
        # Targets for the current tile/date, fetched through the `mds` helper.
        y_ = mds.get_y(h, v, year, month, day, masks)

        # Features and targets must have the same number of rows.
        assert x_.shape[0] == y_.shape[0]

        # The first iteration starts the accumulators; later iterations
        # append their rows underneath.
        if it == 0:
            x = x_
            y = y_
        else:
            x = np.vstack((x, x_))
            y = np.vstack((y, y_))

        print 'loading data: %d of %d tiles, h%.2dv%.2d, year:%d, month:%d, day:%d, x:%s, y:%s' % (i+1, len(tile_hv), \
                h, v, year, month, day, x.shape, y.shape)

    train_x = x
    # `set_rf_samples` is defined elsewhere; it is called with 70% of the
    # accumulated row count (presumably a per-tree sample cap -- TODO confirm).
    set_rf_samples(int(train_x.shape[0] * 0.7))

    # One forest per band: column `ib` of `y` is that band's target.
    for ib in band_ids:
        train_y = y[:, ib]

        rf = rf_models[ib]
        # Raise the tree budget before refitting. Presumably the forests were
        # created with warm_start so each pass adds `n_trees` trees -- verify
        # where `rf_models` is built.
        rf.n_estimators = (i + 1) * n_trees
        rf.fit(train_x, train_y)

        # R^2 on the training data itself, not on held-out data.
        score = rf.score(train_x, train_y)
        print '   (%d of %d bands):   h%.2dv%.2d: train r squared: %.5f' % (
            ib + 1, n_bands, h, v, score)

# Write each band's model to 'rf<ib>.model' (a path relative to the working directory).
for ib in band_ids:
    joblib.dump(rf_models[ib], 'rf%d.model' % ib)
def test_RF(X_tv, y_tv, dates_tv, day_test, day_valid_small, day_valid,
            m_f_opt, m_d_opt, n_e_opt):
    """Grid-search random forests per (year, month) and tabulate RMSLE scores.

    For every month of 2011 and 2012 and every combination of
    ``max_features`` / ``max_depth`` / ``n_estimators``, two forests are
    fitted (targets ``'lcasual'`` and ``'lregistered'``) and scored on the
    train and validation rows. A combined "total" prediction is scored
    against ``'ltotal'``.

    Args:
        X_tv: Feature container indexed with the selectors returned by
            ``ss.get_train`` / ``ss.get_valid`` (presumably boolean masks
            over rows -- TODO confirm).
        y_tv: Object supporting ``.loc[rows, column]`` with the columns
            ``'lcasual'``, ``'lregistered'`` and ``'ltotal'``.
        dates_tv: Dates aligned with the rows; used for the split and for
            the progress output.
        day_test: Forwarded unchanged to ``ss.get_train`` / ``ss.get_valid``.
        day_valid_small: Validation setting used for January-March 2011.
        day_valid: Validation setting used for every other month.
        m_f_opt: Candidate ``max_features`` values.
        m_d_opt: Candidate ``max_depth`` values.
        n_e_opt: Candidate ``n_estimators`` values. The forests use
            ``warm_start``, so these should be ascending (scikit-learn
            rejects shrinking a warm-started forest).

    Returns:
        pandas.DataFrame with one row per
        (year, month, max_features, max_depth, n_estimators) and the columns
        ``rmsle_tot``, ``rmsle_cas``, ``rmsle_reg`` plus their ``train_``
        counterparts.
    """
    # preparing data
    n_rows = 2 * 12 * len(m_f_opt) * len(m_d_opt) * len(n_e_opt)
    rf_results = pd.DataFrame(np.zeros([n_rows, 11]),
                              columns=[
                                  'year', 'month', 'max_features', 'max_depth',
                                  'n_estimators', 'rmsle_tot', 'rmsle_cas',
                                  'rmsle_reg', 'train_rmsle_tot',
                                  'train_rmsle_cas', 'train_rmsle_reg'
                              ])
    # The product order matches the loop nesting below, so row ``i`` already
    # carries the parameters of the i-th fit.
    rf_results.loc[:, [
        'year', 'month', 'max_features', 'max_depth', 'n_estimators'
    ]] = list(it.product([2011, 2012], range(1, 13), m_f_opt, m_d_opt,
                         n_e_opt))

    i = 0
    for year in [2011, 2012]:
        for month in range(1, 13):
            # Only the first three months of 2011 use the small setting.
            if year == 2012 or month >= 4:
                day_valid_curr = day_valid
            else:
                day_valid_curr = day_valid_small
            train_ind = ss.get_train(dates_tv, year, month, day_test,
                                     day_valid_curr)
            valid_ind = ss.get_valid(dates_tv, year, month, day_test,
                                     day_valid_curr)
            print('year {}, month {}'.format(year, month), flush=True)
            print('train size: {}, validation size: {}'.format(
                train_ind.sum(), valid_ind.sum()))
            print('learning from {} to {}'.format(dates_tv[train_ind].min(),
                                                  dates_tv[train_ind].max()))
            print('validation from {} to {}'.format(dates_tv[valid_ind].min(),
                                                    dates_tv[valid_ind].max()))
            for m in m_f_opt:
                for md in m_d_opt:
                    # warm_start=True: raising n_estimators and refitting
                    # adds trees to the existing forest instead of starting
                    # from scratch.
                    rf_c = RandomForestRegressor(n_jobs=-1,
                                                 max_features=m,
                                                 max_depth=md,
                                                 warm_start=True)
                    rf_r = RandomForestRegressor(n_jobs=-1,
                                                 max_features=m,
                                                 max_depth=md,
                                                 warm_start=True)
                    for n in n_e_opt:
                        # ``.loc`` replaces the removed ``DataFrame.ix``; the
                        # frame has a default integer index, so label-based
                        # access by ``i`` addresses the same row.

                        ## casual
                        rf_c.n_estimators = n
                        rf_c.fit(X_tv[train_ind],
                                 y_tv.loc[train_ind, 'lcasual'])
                        pred_cas = rf_c.predict(X_tv[valid_ind])
                        rf_results.loc[i, 'rmsle_cas'] = resf.rmsle_of_logs(
                            pred_cas, y_tv.loc[valid_ind, 'lcasual'])
                        pred_cas_train = rf_c.predict(X_tv[train_ind])
                        rf_results.loc[i, 'train_rmsle_cas'] = (
                            resf.rmsle_of_logs(
                                pred_cas_train,
                                y_tv.loc[train_ind, 'lcasual']))

                        ## registered
                        rf_r.n_estimators = n
                        rf_r.fit(X_tv[train_ind],
                                 y_tv.loc[train_ind, 'lregistered'])
                        pred_reg = rf_r.predict(X_tv[valid_ind])
                        rf_results.loc[i, 'rmsle_reg'] = resf.rmsle_of_logs(
                            pred_reg, y_tv.loc[valid_ind, 'lregistered'])
                        pred_reg_train = rf_r.predict(X_tv[train_ind])
                        rf_results.loc[i, 'train_rmsle_reg'] = (
                            resf.rmsle_of_logs(
                                pred_reg_train,
                                y_tv.loc[train_ind, 'lregistered']))

                        ## total
                        # Validation total is computed inline; the original
                        # author noted it as the expansion of
                        # np.log(resf.total_from_log(pred_cas, pred_reg) + 1)
                        # (presumably identical -- verify against resf).
                        pred_total = np.log(
                            np.exp(pred_cas) + np.exp(pred_reg) - 1)
                        pred_total_train = np.log(
                            resf.total_from_log(pred_cas_train, pred_reg_train)
                            + 1)
                        rf_results.loc[i, 'rmsle_tot'] = resf.rmsle_of_logs(
                            pred_total, y_tv.loc[valid_ind, 'ltotal'])
                        rf_results.loc[i, 'train_rmsle_tot'] = (
                            resf.rmsle_of_logs(
                                pred_total_train,
                                y_tv.loc[train_ind, 'ltotal']))
                        print('Done: ', flush=True)
                        print(rf_results.loc[i], flush=True)

                        i += 1

    return rf_results
Beispiel #4
0
#plt.tight_layout()

# Hyper-parameter values explored by the out-of-bag sweep below.
param_grid = {
    'n_estimators': [10, 20, 50, 100],
    'max_depth': list(range(3, 11)),
    'max_features': [None],
}

# Maps (max_depth, max_features, n_estimators) -> out-of-bag score.
scores = {}
for max_depth in param_grid['max_depth']:
    print(max_depth)
    for max_features in param_grid['max_features']:
        print(max_features)
        # One warm-started forest per (max_depth, max_features) pair; the
        # inner loop only changes n_estimators and refits the same object.
        rf = RandomForestRegressor(
            max_depth=max_depth,
            max_features=max_features,
            oob_score=True,
            warm_start=True,
        )
        for n_estimators in param_grid['n_estimators']:
            rf.n_estimators = n_estimators
            rf.fit(X, y)
            scores[max_depth, max_features, n_estimators] = rf.oob_score_

# Final model, fitted once with fixed settings.
modelFinal = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    max_features=None,
    oob_score=True,
)
modelFinal.fit(X, y)
modelFinal.oob_score_

# Error metrics computed on the out-of-bag predictions.
metrics.mean_squared_error(y, modelFinal.oob_prediction_)
metrics.mean_absolute_error(y, modelFinal.oob_prediction_)