Ejemplo n.º 1
0
    def test_cleanup(self):
        """cleanup() removes temp model files once, then becomes a no-op."""
        first = RGFRegressor()
        first.fit(self.X_train, self.y_train)

        second = RGFRegressor()
        second.fit(self.X_train, self.y_train)

        # The first cleanup deletes files; a repeat call has nothing left.
        self.assertNotEqual(first.cleanup(), 0)
        self.assertEqual(first.cleanup(), 0)

        pattern = os.path.join(_get_temp_path(), first._file_prefix + "*")
        self.assertFalse(glob.glob(pattern))

        # The cleaned-up estimator is unfit; the untouched one still predicts.
        self.assertRaises(NotFittedError, first.predict, self.X_test)
        second.predict(self.X_test)
Ejemplo n.º 2
0
 def test_regressor(self):
     """Dense fit/predict should reach an MSE below 6.0 on the test split."""
     model = RGFRegressor()
     model.fit(self.X_train, self.y_train)
     predictions = model.predict(self.X_test)
     err = mean_squared_error(self.y_test, predictions)
     print("MSE: {0:.5f}".format(err))
     self.assertLess(err, 6.0)
Ejemplo n.º 3
0
 def test_regressor(self):
     """Train on the dense train split and check test-set MSE stays under 6.0."""
     estimator = RGFRegressor()
     estimator.fit(self.X_train, self.y_train)
     mse = mean_squared_error(self.y_test, estimator.predict(self.X_test))
     print("MSE: {0:.5f}".format(mse))
     self.assertLess(mse, 6.0)
Ejemplo n.º 4
0
 def test_regressor_sparse_input(self):
     """The regressor must accept CSR, CSC and COO sparse matrices alike."""
     model = RGFRegressor(prefix='reg')
     for to_sparse in (csr_matrix, csc_matrix, coo_matrix):
         sparse_X = to_sparse(self.X)
         model.fit(sparse_X, self.y)
         predictions = model.predict(sparse_X)
         self.assertLess(mean_squared_error(self.y, predictions), 6.0)
Ejemplo n.º 5
0
 def test_regressor_sparse_input(self):
     """Every scipy sparse format should train and predict acceptably."""
     estimator = RGFRegressor()
     formats = (sparse.bsr_matrix, sparse.coo_matrix, sparse.csc_matrix,
                sparse.csr_matrix, sparse.dia_matrix, sparse.dok_matrix,
                sparse.lil_matrix)
     for fmt in formats:
         sp_X = fmt(self.X)
         estimator.fit(sp_X, self.y)
         err = mean_squared_error(self.y, estimator.predict(sp_X))
         self.assertLess(err, 6.0)
Ejemplo n.º 6
0
    def test_joblib_pickle(self):
        """Predictions must survive a joblib dump/load round trip."""
        model = RGFRegressor()
        model.fit(self.X_train, self.y_train)
        before = model.predict(self.X_test)
        joblib.dump(model, 'test_reg.pkl')

        # Delete the on-disk model files to prove the pickle is self-contained.
        _cleanup()

        restored = joblib.load('test_reg.pkl')
        after = restored.predict(self.X_test)

        np.testing.assert_allclose(before, after)
Ejemplo n.º 7
0
    def test_pickle(self):
        """Predictions must survive a pickle dumps/loads round trip."""
        model = RGFRegressor()
        model.fit(self.X_train, self.y_train)
        expected = model.predict(self.X_test)
        payload = pickle.dumps(model)

        # Delete the on-disk model files to prove the pickle is self-contained.
        _cleanup()

        clone = pickle.loads(payload)
        actual = clone.predict(self.X_test)

        np.testing.assert_allclose(expected, actual)
Ejemplo n.º 8
0
# Prepare the test set with the same encodings used at training time.

# Day of week (0=Monday) extracted from the raw travel date.
df_test_set["travel_date"] = pd.to_datetime(df_test_set["travel_date"],
                                            infer_datetime_format=True)
df_test_set["travel_date"] = df_test_set["travel_date"].dt.dayofweek

# Categorical codes aligned to the category lists captured from training data,
# so unseen categories map to -1 rather than a new code.
df_test_set["car_type"] = pd.Categorical(df_test_set["car_type"],
                                         categories=car_type_categories)
df_test_set["car_type"] = df_test_set.car_type.cat.codes

df_test_set["travel_from"] = pd.Categorical(df_test_set["travel_from"],
                                            categories=travel_from_categories)
df_test_set["travel_from"] = df_test_set.travel_from.cat.codes

# "HH:MM" string -> minutes since midnight.
df_test_set["travel_time"] = df_test_set["travel_time"].str.split(':').apply(
    lambda x: int(x[0]) * 60 + int(x[1]))
# Saturday (5) and Sunday (6) are flagged as weekend.
df_test_set['is_weekend'] = np.where(df_test_set['travel_date'] >= 5, 1, 0)

X_test = df_test_set.drop(['ride_id', 'max_capacity'], axis=1)

print(X_test.head(5))

# `model` is assumed to be fitted earlier in the script — TODO confirm.
test_set_predictions = model.predict(X_test)

# Build the submission frame with the expected column order.
d = {
    'ride_id': df_test_set["ride_id"],
    'number_of_ticket': test_set_predictions
}
df_predictions = pd.DataFrame(data=d)
df_predictions = df_predictions[['ride_id', 'number_of_ticket']]

df_predictions.to_csv('results.csv', index=False)
Ejemplo n.º 9
0
def rgf_state_prediction(state, lookback, horizon, predictors):
    """Train per-city RGF forecasting models for every cluster of *state*.

    For each city in each cluster: builds lagged features, fits one
    RGFRegressor per horizon step, saves the evaluation metrics under
    ``saved_models/rgf/<state>/`` and plots the predictions.

    Parameters
    ----------
    state : str
        State code used to locate the cluster pickle and output files.
    lookback : int
        Number of lagged time steps used as features.
    horizon : int
        Number of steps ahead to forecast (one model per step).
    predictors : list
        Column names passed through to ``get_cluster_data``.
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))

    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            # Skip cities whose metrics were already computed.
            if os.path.isfile('saved_models/rgf/{}/rgf_metrics_{}.pkl'.format(
                    state, city)):
                print(city, 'done')
                continue

            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            casos_columns = ['casos_{}'.format(i) for i in group]

            data = data_full.drop(casos_columns, axis=1)
            data_lag = build_lagged_features(data, lookback)
            # BUG FIX: dropna() returns a new frame; the original call
            # discarded the result, leaving the NaN rows introduced by
            # lagging in the training data.
            data_lag = data_lag.dropna()

            # One shifted target per horizon step d (d=1 keeps full length).
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

            X_data = data_lag.drop(casos_est_columns, axis=1)
            # shuffle=False preserves the temporal ordering of the series.
            X_train, X_test, y_train, y_test = train_test_split(
                X_data,
                data_lag[target],
                train_size=0.7,
                test_size=0.3,
                shuffle=False)

            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error', 'r2_score'))
            for d in range(1, horizon + 1):
                model = RGFRegressor(max_leaf=300,
                                     algorithm="RGF_Sib",
                                     test_interval=100,
                                     loss="LS",
                                     verbose=False)

                tgt = targets[d][:len(X_train)]
                tgtt = targets[d][len(X_train):]
                try:
                    model.fit(X_train, tgt)
                except ValueError as err:
                    # Log the reason (err was previously caught but unused)
                    # and give up on this city.
                    print(
                        '-----------------------------------------------------'
                    )
                    print(city, 'ERRO', err)
                    print(
                        '-----------------------------------------------------'
                    )
                    break
                pred = model.predict(X_data[:len(targets[d])])

                # Pad shorter horizons with NaN so all columns align.
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                pred_m = model.predict(X_test[:(len(tgtt))])
                metrics[d] = calculate_metrics(pred_m, tgtt)

            metrics.to_pickle('{}/{}/rgf_metrics_{}.pkl'.format(
                'saved_models/rgf', state, city))
            plot_prediction(preds, targets[1], city_name, len(X_train))
            # plt.show()
    return None
Ejemplo n.º 10
0
# Split features from target.
y_train = train[target_column].values
x_train = train.drop([target_column], axis=1)

# Drop irrelevant columns from train and test
x_train = x_train.drop([id_column], axis=1)
test = test.drop([id_column], axis=1)

# -----------------------------------------------------------------------------
# 			STEP 4 - TRAIN ML MODEL AND GENERATE PREDICTIONS
# -----------------------------------------------------------------------------
# XGBoost: needs its own DMatrix container for train and test data.
if ml_algorithm == 'XGBoost':
	d_train = xgb.DMatrix(x_train, label=y_train)
	d_test = xgb.DMatrix(test)
	model = xgb.train(params, d_train, num_rounds, verbose_eval=10)
	prediction = model.predict(d_test)
# LightGBM
elif ml_algorithm == 'LightGBM':
	d_train = lgb.Dataset(x_train, label=y_train)
	# NOTE(review): lgb Booster.predict normally takes the raw feature
	# matrix, not a Dataset — confirm d_test works here.
	d_test = lgb.Dataset(test)
	model = lgb.train(params, d_train, num_rounds, verbose_eval=10)
	prediction = model.predict(d_test)
# RGFRegressor, FastRGFRegressor, Ridge Regression, Lasso Regression:
# all share the sklearn fit/predict interface, so one branch covers them.
# `model` is assumed to be instantiated earlier in the script — TODO confirm.
else:
	model.fit(x_train, y_train)
	prediction = model.predict(test)

# -----------------------------------------------------------------------------
# 				STEP 5 - GENERATE KAGGLE SUBMISSION FILE
# -----------------------------------------------------------------------------
print('Generate Submission ...')
Ejemplo n.º 11
0
    def stacklearning(self):
        """Run a collection of stacking-regression experiments.

        Defines ad-hoc feature-selection transformers, builds pipelines
        around RGF/LGBM/kNN/SVR regressors, evaluates two-level
        StackingRegressors with 10-fold CV, and prints train/test RMSE
        and correlation for several configurations.  Relies on X, y,
        X_train, X_test, y_train, y_test from the enclosing scope.
        """
        # Transformers below select sub-tables of the input via sepTables(X).
        class extAll(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            # NOTE(review): transform/predict return `self`, not data —
            # looks like a placeholder; confirm before relying on extAll.
            def transform(self, X):
                return self

            def predict(self, X):
                return self

        # Selects only the Morgan-fingerprint columns.
        class extMorgan(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                _,morgan,_=sepTables(X)
                return morgan
        # Selects Morgan fingerprints concatenated with MACCS keys.
        class extMACCS(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                maccs,morgan,_=sepTables(X)
                maccs = pd.concat([morgan,maccs],axis=1)

                return maccs

        # Concatenates Morgan, MACCS and descriptor columns.
        class extDescriptor(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                maccs,morgan,descriptor=sepTables(X)
                descriptor = pd.concat([morgan,descriptor],axis=1)
                descriptor = pd.concat([maccs,descriptor],axis=1)
                return descriptor

        # Joins Morgan columns with a 64-component PCA of the full table.
        class extPCA(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                model = PCA(n_components=64)
                _,morgan,_=sepTables(X)
                morgan = morgan.reset_index().drop('index', axis=1)
                # NOTE(review): PCA is re-fit on every transform call, so
                # train and test get different projections — confirm intended.
                W = pd.DataFrame(model.fit_transform(X))
                W = pd.concat([morgan,W],axis=1)
                return W

        # Base regressors; rgf1..rgf4 are identical copies so each pipeline
        # owns its own estimator instance.
        lgbm = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
        rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf1 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf2 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf3 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf4 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)

        # Feature-view pipelines.  NOTE(review): pipe7 and pipe8 share the
        # same rgf4 instance, and pipe4/pipe6/pipe8/alldata plus xgb, svr,
        # sgd, pls, ext and meta are defined but never used below.
        pipe1 = make_pipeline(extMACCS(), rgf)
        pipe2 = make_pipeline(extMorgan(), rgf1)
        pipe3 = make_pipeline(extDescriptor(), rgf2)
        pipe4 = make_pipeline(extPCA(), rgf3)
        pipe7 =make_pipeline(extDescriptor(), rgf4)
        pipe8 =make_pipeline(extDescriptor(), rgf4)

        xgb = xgboost.XGBRegressor()
        nbrs = KNeighborsRegressor(2)
        svr = SVR(gamma='auto',kernel='linear')
        sgd = SGDRegressor(max_iter=1000)
        pls = PLSRegression(n_components=3)
        ext = ExtraTreesRegressor(n_estimators=30,max_features= 20,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)

        pipe5 = make_pipeline(extMorgan(), nbrs)
        pipe6 = make_pipeline(extMACCS(), rgf)
        alldata = make_pipeline(extAll())

        meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)

        # Two-level stack: stack1 feeds into stack2 as one of its regressors.
        stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3], meta_regressor=rgf, verbose=1)
        #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,pipe5,pipe7,pipe1], meta_regressor=rgf,verbose=1)

        # 10-fold CV scores (stack1_score and rgf_score are computed but
        # never reported — kept for parity with the original run).
        scores = cross_val_score(stack2, X, y, cv=10)
        print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), 'stacking'))
        stack1_score = cross_val_score(stack1,X,y, cv=10)
        rgf_score = cross_val_score(rgf,X,y,cv=10)

        # Fit the full stack and report train/test RMSE and correlation.
        stack2.fit(X_train, y_train)
        y_pred = stack2.predict(X_train)
        y_val = stack2.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        # Same report for the bare RGF regressor as a baseline.
        rgf.fit(X_train, y_train)
        y_pred = rgf.predict(X_train)
        y_val = rgf.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        # And for the single MACCS+RGF pipeline.
        pipe1.fit(X_train, y_train)
        y_pred = pipe1.predict(X_train)
        y_val = pipe1.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))


        # Second experiment: column-selected stacking.
        # NOTE(review): cols is overwritten twice; only [1,2,3] takes effect.
        cols = np.arange(1,550,1).tolist()
        cols = X.columns.tolist()
        cols = [1,2,3]
        # Initializing Classifiers
        reg1 = Ridge(random_state=1)
        #reg2 = ExtraTreesRegressor()
        reg2 = ExtraTreesRegressor(n_estimators=50,max_features= 50,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
        reg3 = SVR(gamma='auto',kernel='linear')
        reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
        pls = PLSRegression(n_components=3)
        pipe1 = make_pipeline(ColumnSelector(cols=cols), ExtraTreesRegressor(n_estimators=50))
        #linear =SGDRegressor(max_iter=1000)
        rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        nbrs = KNeighborsRegressor(2)
        pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))

        meta = ExtraTreesRegressor(n_estimators=50,max_features= 7,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)

        # Seven heterogeneous regressors stacked under an ExtraTrees meta.
        stackReg = StackingRegressor(regressors=[reg1,reg2, reg3,pipe1,pls,nbrs,rgf], meta_regressor=meta,verbose=1)
        stackReg.fit(X_train, y_train)
        y_pred = stackReg.predict(X_train)
        y_val = stackReg.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))

        # NOTE(review): rgf is fitted here but the predictions come from
        # reg4, which was never fitted — this likely raises or reports the
        # wrong model; confirm whether reg4.fit(...) was intended.
        rgf.fit(X_train, y_train)
        y_pred = reg4.predict(X_train)
        y_val = reg4.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
Ejemplo n.º 12
0
    # NOTE: this chunk sits inside a cross-validation loop (header not
    # shown); x_train/y_train/x_valid/y_valid and `i` come from that loop.
    x_valid = x_valid.drop(target_column, axis=1)

    print('Training ...', i + 1)
    # XGBoost: train with early stopping against the validation fold.
    if ml_algorithm == 'XGBoost':
        d_train = xgb.DMatrix(x_train, label=y_train)
        d_valid = xgb.DMatrix(x_valid, label=y_valid)
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        model = xgb.train(params,
                          d_train,
                          num_rounds,
                          watchlist,
                          early_stopping_rounds=100,
                          verbose_eval=10)
        # Custom scoring function (optional): gini on the validation fold.
        y_pred = model.predict(d_valid, model.best_iteration)
        custom_eval_score = gini_score(y_pred, y_valid)
    # LightGBM: same idea, with gini as the custom eval metric.
    elif ml_algorithm == 'LightGBM':
        d_train = lgb.Dataset(x_train, label=y_train)
        d_valid = lgb.Dataset(x_valid, label=y_valid, reference=d_train)
        model = lgb.train(params,
                          d_train,
                          num_rounds,
                          valid_sets=d_valid,
                          feval=gini_lgb,
                          early_stopping_rounds=100,
                          verbose_eval=10)
    # RGFRegressor, FastRGFRegressor, Ridge Regression, Lasso Regression:
    # sklearn-style estimators share fit(); `model` is assumed to be
    # instantiated earlier in the script — TODO confirm.
    else:
        model.fit(x_train, y_train)
Ejemplo n.º 13
0
def run(seed):
    """Train the age-prediction stack for one random *seed*.

    Pipeline:
      1. Load bias corrections, competition data and engineered features.
      2. Build five blended "score" features (fnc/agg/pca/im/dl), each from
         a small pack of sklearn-style regressors, and save them to disk.
      3. Train a zoo of models on several scaled views of the data.
      4. Stack their out-of-fold predictions with RGF + BayesianRidge and
         save the final per-seed age predictions.
    """
    # create folders for scores models and preds
    folder_models = './models/age/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/age/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Loading data...')

    # load site-effect biases (added to the test features below)
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')

    # load classifier output and add extra site2 ids
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')

    # load competition data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')

    ids_df = ids_df.append(extra_site)
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())

    # load engineered features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')

    # the 200-component PCA features are sharded into six files
    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)

    # remember the column group of every feature family
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])

    # merge everything on Id
    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')

    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()

    # split train and test: rows with labels are train
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1

    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['age'].copy().reset_index(drop=True)

    # apply biases (site2 rows get the site-specific correction)
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]

    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]

    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    def _make_score(label, prefix, cols, base_names, pack):
        """Fit *pack* on one feature family; save the blended oof score and
        test prediction as '<prefix>_score[_test]_seed<seed>.npy' plus the
        fitted models."""
        print('Creating {} score...'.format(label))
        train_fs, test_fs = scale_select_data(train, test, df_scale, cols)
        names = ['{}_{}_seed{}'.format(name, prefix, seed)
                 for name in base_names]
        zoo = TrendsModelSklearn(pack, seed=seed)
        zoo.fit([train_fs] * len(pack), y)
        score_blend = zoo.blend_oof()
        pred = zoo.predict([test_fs] * len(pack), names)
        np.save(folder_preds + '{}_score_seed{}.npy'.format(prefix, seed),
                score_blend)
        np.save(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed),
            pred)
        zoo.save_models(names, folder=folder_models)

    # I-V. One blended score per feature family (previously five
    # copy-pasted blocks; outputs and model packs are unchanged).
    _make_score('FNC', 'fnc', fnc_cols,
                ['RGF', 'ENet', 'BRidge', 'Huber', 'OMP'],
                [
                    RGFRegressor(max_leaf=1000, reg_depth=5, normalize=True),
                    ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
                    BayesianRidge(),
                    HuberRegressor(epsilon=2.5, alpha=1),
                    OrthogonalMatchingPursuit(n_nonzero_coefs=300)
                ])
    _make_score('AGG', 'agg', agg_cols, ['RGF', 'ENet', 'Huber'],
                [
                    RGFRegressor(max_leaf=1000,
                                 reg_depth=5,
                                 min_samples_leaf=100,
                                 normalize=True),
                    ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
                    HuberRegressor(epsilon=2.5, alpha=1)
                ])
    _make_score('PCA', 'pca', pca_cols, ['RGF', 'ENet', 'BRidge', 'OMP'],
                [
                    RGFRegressor(max_leaf=1000,
                                 reg_depth=5,
                                 min_samples_leaf=100,
                                 normalize=True),
                    ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
                    BayesianRidge(),
                    OrthogonalMatchingPursuit()
                ])
    _make_score('IM', 'im', im_cols, ['RGF', 'ENet', 'BRidge', 'OMP'],
                [
                    RGFRegressor(max_leaf=1000,
                                 reg_depth=5,
                                 min_samples_leaf=100,
                                 normalize=True),
                    ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
                    BayesianRidge(),
                    OrthogonalMatchingPursuit()
                ])
    _make_score('DL', 'dl', dl_cols, ['RGF', 'ENet', 'BRidge'],
                [
                    RGFRegressor(max_leaf=1000,
                                 reg_depth=5,
                                 min_samples_leaf=100,
                                 normalize=True),
                    ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
                    BayesianRidge()
                ])

    # VI. Training and predicting procedure
    print('Training has started...')
    print('Reading scores from ', folder_preds)

    # add the freshly created scores as features
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train[prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test[prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # create different datasets
    # linear
    linear_cols = sorted(
        list(
            set(ic_cols + fnc_cols + pca_cols + agg_cols + im_cols) -
            set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)

    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_cols=pca_cols)

    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)

    # dl
    dict_cols = sorted(
        list(
            set(ic_cols + fnc_cols + dl_cols + im_cols + agg_cols) -
            set(['IC_20'])))
    train_dl, test_dl = scale_select_data(train, test, df_scale, dict_cols)

    # learning process on different datasets
    names = ['MLP', 'RGF', 'SVM', 'BR', 'OMP', 'EN', 'KR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        MLPRegressor(activation='tanh', random_state=0),
        RGFRegressor(max_leaf=1500, loss='Abs'),
        NuSVR(C=10, nu=0.4, kernel='rbf'),
        BayesianRidge(),
        OrthogonalMatchingPursuitCV(),
        ElasticNet(alpha=0.5, l1_ratio=0.7, random_state=0),
        KernelRidge(kernel='poly', alpha=0.5)
    ]

    # dataset order matches the model order in `pack`
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2 +
            [train_dl] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2 +
                        [test_dl] * 2,
                        names,
                        is_blend=False)

    # rewrite folders for models and preds
    folder_models = './models/age/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/age/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)

    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)

    # stacking predictions: oof predictions become the stacker's features
    print('Stacking predictions...')
    folds = KFold(n_splits=10, shuffle=True, random_state=0)
    stack = pd.DataFrame(zoo.oof_preds).T
    stack.columns = names

    model_stacker_rgf = RGFRegressor(max_leaf=1000,
                                     reg_depth=25,
                                     verbose=False)
    rgf_pred = cross_val_predict(model_stacker_rgf,
                                 stack,
                                 y.dropna(),
                                 cv=folds,
                                 n_jobs=-1)

    model_stacker_br = BayesianRidge()
    br_pred = cross_val_predict(model_stacker_br,
                                stack,
                                y.dropna(),
                                cv=folds,
                                n_jobs=-1)

    model_stacker_rgf.fit(stack, y.dropna())
    model_stacker_br.fit(stack, y.dropna())

    # save stacker models
    save_pickle(model_stacker_br,
                folder_models + 'BRidge_stack_seed{}'.format(seed))
    save_pickle(model_stacker_rgf,
                folder_models + 'RGF_stack_seed{}'.format(seed))
    print('Final age NMAE: {:.5f}'.format(
        NMAE(y, 0.75 * br_pred + 0.25 * rgf_pred)))

    test_preds = pd.DataFrame(preds).T
    test_preds.columns = names

    # final prediction: weighted blend of the two stackers
    age_prediction = pd.DataFrame()
    age_prediction['Id'] = test['Id'].values
    age_prediction['pred'] = 0.25 * model_stacker_rgf.predict(
        test_preds) + 0.75 * model_stacker_br.predict(test_preds)
    age_prediction.to_csv(folder_preds + 'age_stack_seed{}.csv'.format(seed),
                          index=False)
    print('age seed pred is saved as',
          folder_preds + 'age_stack_seed{}.csv'.format(seed))
Ejemplo n.º 14
0
    def stacklearning(self):
        """Exploratory stacking experiments over molecular-fingerprint features.

        Notebook-style scratch code: builds many candidate regressors and
        ``StackingRegressor`` combinations, cross-validates them, and fits a
        few on the full data to produce ``valy`` predictions.  Relies on
        module-level globals (``X``, ``y``, ``X2``, ``y2``, ``X_train``,
        ``X_test``, ``y_train``, ``y_test``, ``calcACC``, ``myScoreFunc``,
        ``extractDf``, ``changeList``, the ``ext*`` transformer classes,
        ``preprocess``, ``calcRMSE``, ``calcCorr``) -- assumed defined
        elsewhere in this module; confirm before reuse.  Inline numeric
        comments record cross-validation scores observed by the author.
        Many names are rebound repeatedly; only the last binding before each
        use is live.
        """
        # Transformer that L2-normalizes a sparse view of a DataFrame.
        class sparseNorm(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                # Stateless transformer: nothing to learn.
                return self

            def transform(self, X):
                from sklearn import preprocessing
                # Expects X to be a DataFrame (uses .values) -- TODO confirm.
                Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values))
                return Y
        # First FMRegression is dead code: immediately overwritten below.
        fm = sgd.FMRegression(
            n_iter=4743,
            init_stdev=0.1,
            rank=100,
            l2_reg_w=0,
            l2_reg_V=0,
            step_size=0.1,
        )
        fm = sgd.FMRegression(
            n_iter=9943,
            init_stdev=0.1,
            rank=219,
            l2_reg_w=0,
            l2_reg_V=0.06454,
            step_size=0.1,
        )
        pipe = make_pipeline(sparseNorm(), fm)
        calcACC(pipe, X=X2)

        # Candidate base regressors with hand-tuned hyperparameters.
        xgb = xgboost.XGBRegressor(
                    n_estimators=100,
                    max_depth=7,
                    gamma=0,
                    colsample_bytree=0.1
                )
        lgbm = LGBMRegressor(
            boosting_type='gbdt', num_leaves=367,
            learning_rate=0.06,feature_fraction=0.14,
            max_depth=28, min_data_in_leaf=8
        )
        rgf = RGFRegressor(
            max_leaf=1211, algorithm="RGF", test_interval=100,
            loss="LS", verbose=False, l2=0.93,
            min_samples_leaf=2
        )
        # Tuned rf is dead code: replaced by the default-parameter one below.
        rf = RandomForestRegressor(
            max_depth=20, random_state=0,
            n_estimators=56,min_samples_split=2,
            max_features=0.21
        )
        rf = RandomForestRegressor()
        ext = ExtraTreesRegressor(
            n_estimators=384,max_features= 2228,
            min_samples_split= 0.01,max_depth= 856,
            min_samples_leaf= 1
        )
        svr = SVR(
            gamma=9.5367431640625e-07,
            epsilon=0.0009765625,
            C= 2048.0
        )

        #test combination
        # Each pipeline pairs one feature-extraction transformer with the
        # (default) random forest so feature sets can be compared fairly.
        desNew = make_pipeline(extdescriptorNew(),rf)
        morNew = make_pipeline(extMorganNew(),rf)
        kotNew = make_pipeline(extklekotaTothNew(),rf)
        macNew = make_pipeline(extMACCSNew(),rf)

        desMac = make_pipeline(extDescriptorMACCS(),rf)
        morMac = make_pipeline(extMorganMACCS(),rf)
        kotMac = make_pipeline(extKlekotaTothMACCS(),rf)

        morKotNew = make_pipeline(extMorganKlekotaTothNew(),rf)
        des = make_pipeline(extOnlyDescriptor(),rf)
        mor = make_pipeline(extOnlyMorgan(),rf)
        kot = make_pipeline(extOnlyklekotaToth(),rf)
        mac = make_pipeline(extOnlyMACCS(),rf)
        # NOTE(review): `all` shadows the builtin of the same name inside
        # this method.
        all = make_pipeline(extAll(),rf)
        allwithoutNew = make_pipeline(extAllwithoutNew(),rf)
        allwithoutMaccs = make_pipeline(extAllwithoutMaccs(),rf)
        allwithoutDes = make_pipeline(extAllwithoutDescriptor(),rf)

        testDic = {"Desc+New":desNew,"Mor+New":morNew,"kot+New":kotNew,"MACCS+New":macNew,"Des+MAC":desMac,"Morgan+Maccs":morMac,"Kot+MACCS":kotMac,"mor+kot+New":morKotNew,
        "descriptor":des,"morgan":mor,"kot":kot,"MACCS":mac,"All":all,"All without "
                                                                      "new":allwithoutNew,
                   "All without MACCS":allwithoutMaccs,"All without Des":allwithoutDes}

        #10fold
        cv = KFold(n_splits=10, shuffle=True, random_state=0)

        #Fingerprinttest
        # Cross-validate every feature-set pipeline and collect mean RMSE
        # and correlation per name.  resultDic2 is created but never used.
        resultDic={}
        resultDic2={}
        for name,model in testDic.items():
            #model = StackingRegressor(regressors=[name], meta_regressor=rf,verbose=1)
            #calcACC(model,X=X,y=y2,name=name)

            Scores = cross_validate(model, X2, y2, cv=cv,scoring=myScoreFunc)
            RMSETmp = Scores['test_RMSE'].mean()
            CORRTmP = Scores['test_Correlation coefficient'].mean()
            resultDic.update({name:[RMSETmp,CORRTmP]})
            print(name,RMSETmp,CORRTmP)

        #stacking
        alldata = make_pipeline(extAll())
        # random forest
        #1.1546 0.70905
        stack = StackingRegressor(regressors=[alldata], meta_regressor=rf,verbose=1)

        # Light Gradient boosting
        # 1.160732 0.703776
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=lgbm,verbose=1)

        # XGboost
        # 1.1839805 0.689571
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=xgb,verbose=1)

        # Regularized greedily forest
        # 1.17050 0.6992
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=rgf,verbose=1)

        #pls 22.808047774809697 0.6410026452910016 i=4
        # Sweep PLS component counts 3..10; i=4 was best per the note above.
        for i in np.arange(3,11,1):
            pls = PLSRegression(n_components=i)
            testmodel = StackingRegressor(regressors=[alldata], meta_regressor=pls,verbose=0)
            calcACC(testmodel)
        pls = PLSRegression(n_components=4)

        #SVR
        # First svr binding is dead code: overwritten on the next statement.
        svr = SVR(gamma=9.5367431640625/10000000,C=1559.4918100725592,
                  epsilon=0.0009765625,)
        svr = SVR(kernel='rbf',gamma=9.5367431640625e-07,epsilon=0.0009765625,C=2048.0)

        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=svr, verbose=1)
        calcACC(svr)

        #Extratree  1.157420824123527 0.7061010221224269
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=ext, verbose=1)
        calcACC(testmodel)

        #k-NN
        nbrs = KNeighborsRegressor(3)

        ##Linear regressions
        #Stochastic Gradient Descenta
        # NOTE(review): this rebinding shadows the `sgd` module used above
        # for sgd.FMRegression -- any later sgd.FMRegression call would fail.
        sgd = SGDRegressor(max_iter=1000)
        # Ridge
        for i in [1,10,100,1000]:
            ridge = Ridge(alpha=i)
            calcACC(ridge)
        ridge = Ridge(alpha=45.50940042350705)
        calcACC(ridge)
        # multiple linear
        lin = make_pipeline(forlinear(),LinearRegression(n_jobs=-1))
        calcACC(lin)



        #stacking
        # Successive stacking experiments; inline numbers are observed scores.
        #0.69
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all], meta_regressor=rf,verbose=1)
        #1.1532 0.70926
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,
                              verbose=1)
        #1.16420 0.7041
        testmodel = StackingRegressor(regressors=[alldata,alldata,all], meta_regressor=rf,verbose=1)
        #1.16379 0.7044
        stack1 = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,verbose=1)
        testmodel  = StackingRegressor(regressors=[alldata,stack1,stack1], meta_regressor=rf,verbose=1)
        #1.1535496740699531 0.7108839199109559
        pcaFeature = make_pipeline(extPCA())
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=rf,verbose=1)
        #1.181801005432221 0.6889745579620922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=lgbm,verbose=1)
        #0.70613
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=xgb,verbose=1)
        #0.71641717
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)
        #0.7146922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,ridge,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)

        #new features
        pcaFeature = make_pipeline(extPCA())

        #old
        pipe1 = make_pipeline(extMACCS(), rf)
        pipe2 = make_pipeline(extMorgan(), rf)
        pipe3 = make_pipeline(extDescriptor(), rf)

        pipe4 = make_pipeline(extPCA(), rgf)
        # pipe7 and pipe8 are identical pipelines -- presumably one was meant
        # to use a different extractor; verify against the original notebook.
        pipe7 =make_pipeline(extDescriptor(), rgf)
        pipe8 =make_pipeline(extDescriptor(), rgf)

        xgb = xgboost.XGBRegressor()
        nbrs = KNeighborsRegressor(2)
        svr = SVR(gamma='auto',kernel='linear')

        pls = PLSRegression(n_components=4)

        extMACCSdata = make_pipeline(extMACCS())

        nbrsPipe = make_pipeline(extMorgan(), nbrs)
        pipe6 = make_pipeline(extMACCS(), rgf)
        alldata = make_pipeline(extAll())
        ave = extAverage()
        withoutdesc =  make_pipeline(extMACCS())

        meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)
        #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1)

        #0.70
        stack = StackingRegressor(regressors=[pipe1,pipe2,pipe3,xgb,lgbm,rgf,rf], meta_regressor=ave, verbose=1)

        #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)

        #0.69######################
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        #0.70
        stack2 = StackingRegressor(regressors=[stack1,alldata,rgf,lgbm,xgb], meta_regressor=rf,verbose=1)

        #0.71
        stack3 = StackingRegressor(regressors=[stack2,pipe1], meta_regressor=ave, verbose=1)
        ###########################
        ###########################
        # The three stacks below overwrite the scored ones above; these final
        # bindings are the ones cross-validated and fitted further down.
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,withoutdesc,lgbm,rgf], meta_regressor=rf,verbose=1)
        stack3 = StackingRegressor(regressors=[stack2,pipe1,xgb], meta_regressor=ave, verbose=1)
        ###########################

        #stackingwithknn
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,nbrs,pipe1], meta_regressor=rf, verbose=1)


        #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1)

        # First cv binding is dead code: KFold below replaces ShuffleSplit.
        cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
        cv = KFold(n_splits=10, shuffle=True, random_state=0)
        # Bare `...mean()**(1/2)` expressions compute fold-mean RMSE but
        # discard it -- notebook-style inspection, values are not stored.
        St1Scores = cross_validate(stack1,X,y,cv=cv)
        St1Scores['test_score'].mean()**(1/2)

        St2Scores = cross_validate(stack2,X,y,cv=cv)
        St2Scores['test_score'].mean()**(1/2)

        St3Scores = cross_validate(stack3,X,y,cv=cv)
        St3Scores['test_score'].mean()**(1/2)

        stackScore = cross_validate(stack, X, y, cv=cv)
        stackScore['test_score'].mean()**(1/2)

        lgbmScores =cross_validate(lgbm,X,y,cv=cv)
        lgbmScores['test_score'].mean()**(1/2)

        rgfScores = cross_validate(rgf,X,y,cv=cv)
        rgfScores['test_score'].mean()**(1/2)

        RFScores = cross_validate(rf,X,y,cv=cv)
        RFScores['test_score'].mean()**(1/2)

        scores = cross_validate(stack2,X,y,cv=cv)
        scores['test_score'].mean()**(1/2)
        print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores['test_score'].mean(), scores['test_score'].std(), 'stacking'))

        # Final fits + predictions; targets appear to be log10-scaled, hence
        # the 10**pred inversion -- TODO confirm against preprocessing.
        stack3.fit(X, y)
        y_pred = stack3.predict(X_train)
        y_val = stack3.predict(X_test)
        #stack3.score(X_train, y_train)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(stack3.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        stack1.fit(X, y)
        valy =  (10 **(stack1.predict(exX))).tolist()

        sgd.fit(X,y)
        valy =  (10 **(sgd.predict(exX))).tolist()

        # rgfpipe is built but rgf itself (not the pipeline) is fitted below.
        rgfpipe = make_pipeline(extMACCS(), rf)
        rgf.fit(X,y)
        valy =  (10 **(rgf.predict(exX))).tolist()

        nbrs.fit(X,y)
        valy =  (10 **(nbrs.predict(exX))).tolist()

        pipe = make_pipeline(extMACCS(), rf)
        pipe.fit(X,y)
        valy =  (10 **(pipe.predict(exX))).tolist()


        rf.fit(X, y)
        y_pred = rf.predict(X_train)
        y_val = rf.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(rf.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        lgbm.fit(X, y)
        #y_pred = pipe1.predict(X_train)
        #y_val = pipe1.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(lgbm.predict(exX))).tolist()
        # NOTE(review): these metrics reuse y_pred/y_val from the rf block
        # above -- they do NOT reflect lgbm's fit; likely a copy-paste slip.
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
Ejemplo n.º 15
0
class RGF(ModelBase):
    """K-fold trainer/predictor built on ``RGFRegressor``.

    Loads per-fold train/test splits from ``InputDir``, fits one Regularized
    Greedy Forest per fold, and writes out-of-fold, holdout, and submission
    predictions under ``OutputDir``.  Relies on attributes provided by
    ``ModelBase`` (``kfold``, ``parameters``, ``InputDir``, ``OutputDir``,
    ``data_format``) -- presumably set by the base-class constructor;
    confirm against ``ModelBase``.
    """

    # Non-feature columns stripped from the training frame (target + row id).
    _l_drop_cols = ['Item_Outlet_Sales', 'index']

    ## training, parameter tuning for single L1
    def train(self, importance=False):
        """Fit one model per fold and return ``{fold: validation RMSE}``.

        The per-fold train/test frames are re-saved unchanged under
        ``OutputDir``.  ``importance`` is currently unused; kept for
        interface compatibility.
        """
        print('\n parameters %s \n' % self.parameters)
        d_fold_val = {}
        for fold in range(self.kfold):
            print('\n---- fold %s begins.\n' % fold)

            ## load data
            TrainFile = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                                  self.data_format)
            TestFile = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                                self.data_format)
            self.TrainData = DataUtil.load(TrainFile, format=self.data_format)
            self.TestData = DataUtil.load(TestFile, format=self.data_format)

            ## train and predict on valid
            self.__fit()
            # Renamed from `eval`, which shadowed the builtin of that name.
            fold_rmse = self.__predict()
            d_fold_val[fold] = fold_rmse

            ## save
            OutputDir = '%s/kfold/%s' % (self.OutputDir, fold)
            # exist_ok=True replaces the racy exists()-then-makedirs pattern.
            os.makedirs(OutputDir, exist_ok=True)
            DataUtil.save(self.TrainData,
                          '%s/train.%s' % (OutputDir, self.data_format),
                          format=self.data_format)
            DataUtil.save(self.TestData,
                          '%s/test.%s' % (OutputDir, self.data_format),
                          format=self.data_format)

            print('\n---- Fold %d done. ----\n' % fold)

        return d_fold_val

    ## inferring for fold data and holdout data
    def infer(self, head, HoldoutData, SubmitData, metric_pk=False):
        """Produce out-of-fold, holdout, and submission predictions.

        For each fold the model is re-fitted, then predictions on the fold's
        test split, on ``HoldoutData``, and on ``SubmitData`` are collected.
        Holdout/submit predictions are averaged across folds into column
        ``head``.  Results are written under ``OutputDir`` (``kfold/<n>``,
        ``holdout``, ``submit``).  When ``metric_pk`` is true, the holdout
        RMSE of each single-model feature column is printed next to the
        ensemble's RMSE.
        """
        ##
        l_pred_fold = []
        PredHoldout = pd.DataFrame(index=HoldoutData.index)
        PredHoldout['index'] = HoldoutData['index']
        PredHoldout['Item_Outlet_Sales'] = HoldoutData['Item_Outlet_Sales']
        PredSubmit = pd.DataFrame(index=SubmitData.index)
        for fold in range(self.kfold):
            ## load
            TrainFile = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                                  self.data_format)
            TestFile = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                                self.data_format)
            self.TrainData = DataUtil.load(TrainFile, format=self.data_format)
            self.TestData = DataUtil.load(TestFile, format=self.data_format)

            ## fit
            PredFold = pd.DataFrame(index=self.TestData.index)
            PredFold['index'] = self.TestData['index']
            PredFold['Item_Outlet_Sales'] = self.TestData['Item_Outlet_Sales']
            PredFold['fold'] = fold
            self.__fit()

            ## inferring
            PredFold[head] = self._model.predict(
                self.TestData[self._l_train_columns])
            PredHoldout['fold%s' % (fold)] = self._model.predict(
                HoldoutData[self._l_train_columns])
            PredSubmit['fold%s' % fold] = self._model.predict(
                SubmitData[self._l_train_columns])
            l_pred_fold.append(PredFold)
        ## aggregate folds data
        PredKFold = pd.concat(l_pred_fold, axis=0, ignore_index=True)
        ## save for folds data: each fold's test split becomes its holdout,
        ## the remaining folds' predictions form its training frame.
        for fold in range(self.kfold):
            FoldOutputDir = '%s/kfold/%s' % (self.OutputDir, fold)
            os.makedirs(FoldOutputDir, exist_ok=True)
            TrainFile = '%s/train.%s' % (FoldOutputDir, self.data_format)
            TestFile = '%s/test.%s' % (FoldOutputDir, self.data_format)

            TrainData = PredKFold[PredKFold['fold'] != fold]
            TestData = PredKFold[PredKFold['fold'] == fold]
            DataUtil.save(TrainData, TrainFile, format=self.data_format)
            DataUtil.save(TestData, TestFile, format=self.data_format)

        HoldCols = [
            col for col in PredHoldout.columns if col.startswith('fold')
        ]
        ## save for holdout data (ensemble = mean over per-fold predictions)
        PredHoldout[head] = PredHoldout[HoldCols].mean(axis=1)
        HoldoutOutputDir = '%s/holdout' % self.OutputDir
        os.makedirs(HoldoutOutputDir, exist_ok=True)
        DataUtil.save(PredHoldout,
                      '%s/test.%s' % (HoldoutOutputDir, self.data_format),
                      format=self.data_format)
        ## save for submit data
        PredSubmit[head] = PredSubmit[HoldCols].mean(axis=1)
        SubmitOutputDir = '%s/submit' % self.OutputDir
        os.makedirs(SubmitOutputDir, exist_ok=True)
        DataUtil.save(PredSubmit,
                      '%s/test.%s' % (SubmitOutputDir, self.data_format),
                      format=self.data_format)

        ## metric PK
        if (metric_pk):
            d_metric = {}
            for col in self._l_train_columns:
                diff = (HoldoutData[col] - HoldoutData['Item_Outlet_Sales'])
                rmse = np.sqrt(np.sum(diff * diff) / len(diff))
                d_metric[col] = rmse
            diff = PredHoldout[head] - PredHoldout['Item_Outlet_Sales']
            ensemble_metric = np.sqrt(np.sum(diff * diff) / len(diff))
            print('\n===== metric pk result ====\n')
            print('single model: %s, ensemble model %s: %s' %
                  (d_metric, head, ensemble_metric))
            print('\n===== metric pk result ====\n')

        return

    ## L1 fitting
    def __fit(self):
        """Fit an ``RGFRegressor`` on the current ``TrainData``.

        Sets ``self._model`` and ``self._l_train_columns`` (the feature
        columns used, so prediction can select the same ones).
        """
        start = time.time()
        ## Build the drop list locally.  The previous implementation called
        ## self._l_drop_cols.extend(id_cols), mutating the CLASS-level list:
        ## every call grew it with duplicates, shared across instances and
        ## folds -- a latent cross-run pollution bug.
        id_cols = [
            col for col in self.TrainData.columns
            if (col.startswith('Item_Identifier'))
        ]
        drop_cols = self._l_drop_cols + id_cols
        X = self.TrainData.drop(drop_cols, axis=1)
        Y = self.TrainData['Item_Outlet_Sales']
        ##
        self._l_train_columns = X.columns
        print('Size of feature space: %s' % len(self._l_train_columns))
        ##
        self._model = RGFRegressor(
            algorithm=self.parameters['algorithm'],
            loss=self.parameters['loss'],
            learning_rate=self.parameters['learning_rate'],
            n_iter=self.parameters['n_iter'],
            reg_depth=self.parameters['reg_depth'],
            l2=self.parameters['l2'],
            sl2=self.parameters['sl2'],
            #min_samples_leaf= self.parameters['min_samples_leaf'],
            max_leaf=self.parameters['max_leaf'],
            verbose=True)
        self._model.fit(X, Y)
        end = time.time()
        print('\nTraining is done. Time elapsed %ds' % (end - start))

        return

    ## predict
    def __predict(self):
        """Predict on ``TestData`` and return the RMSE against the target."""
        start = time.time()
        ##
        x_test = self.TestData[self._l_train_columns]
        pred_test = self._model.predict(x_test)
        truth_test = self.TestData['Item_Outlet_Sales']
        ## RMSE
        diff = (pred_test - truth_test)
        rmse = np.sqrt(np.sum(diff * diff) / len(diff))

        ##
        end = time.time()
        print('\n Prediction done. Time consumed %ds' % (end - start))

        return rmse
Ejemplo n.º 16
0
# In[9]:
#https://www.analyticsvidhya.com/blog/2018/02/introductory-guide-regularized-greedy-forests-rgf-python/

###############Classifier#####################
# Fit a Regularized Greedy Forest on the training set and predict on the
# validation/test sets.  Relies on module-level globals (train_all,
# validation_all, validation_set, test_all, features, mean_squared_error)
# assumed defined earlier in the file.
from rgf.sklearn import RGFRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.utils.validation import check_random_state
from sklearn.model_selection import StratifiedKFold, cross_val_score

rgf = RGFRegressor(max_leaf=400,
                    algorithm="RGF_Sib",
                    test_interval=100,
                    verbose=True)

rgf.fit(train_all[features], train_all['kda_ratio'])
valid_preds = list(rgf.predict(validation_all[features]))
test_preds = list(rgf.predict(test_all[features]))

# NOTE(review): `model` is not defined at this point (it is presumably the
# GridSearchCV object created further down) -- as written this line raises
# NameError; verify intended ordering.  Also note the RMSE line indexes
# `validation_set` while predictions come from `validation_all` -- confirm
# these refer to the same rows.
valid_preds = model.predict(validation_all[features])
print('The rmse of prediction using validation set is:', mean_squared_error(validation_set['kda_ratio'], valid_preds) ** 0.5)

# Duplicate of the test_preds assignment above.
test_preds = list(rgf.predict(test_all[features]))


##Using grid search
parameters = {'max_leaf':[1000,1200,1300,1400,1500,1600,1700,1800,1900,2000],
              'l2':[0.1,0.2,0.3],
              'min_samples_leaf':[5,10]}

model = GridSearchCV(estimator=rgf,
                   param_grid=parameters,