def test_regressor(self):
    # A default-configured RGFRegressor should reach a held-out MSE below 6.0.
    model = RGFRegressor()
    model.fit(self.X_train, self.y_train)
    predictions = model.predict(self.X_test)
    mse = mean_squared_error(self.y_test, predictions)
    print("MSE: {0:.5f}".format(mse))
    self.assertLess(mse, 6.0)
def test_attributes(self):
    # Fitted attributes must raise NotFittedError before fit() and hold the
    # documented derived values afterwards.
    reg = RGFRegressor()
    attributes = ('n_features_', 'fitted_', 'sl2_', 'min_samples_leaf_', 'n_iter_')
    for attr in attributes:
        self.assertRaises(NotFittedError, getattr, reg, attr)
    reg.fit(self.X_train, self.y_train)
    self.assertEqual(reg.n_features_, self.X_train.shape[-1])
    self.assertTrue(reg.fitted_)
    # sl2_ falls back to l2 when sl2 was not set explicitly.
    if reg.sl2 is None:
        self.assertEqual(reg.sl2_, reg.l2)
    else:
        self.assertEqual(reg.sl2_, reg.sl2)
    # A fractional min_samples_leaf is resolved to an absolute sample count.
    if reg.min_samples_leaf < 1:
        self.assertLessEqual(reg.min_samples_leaf_, 0.5 * self.X_train.shape[0])
    else:
        self.assertEqual(reg.min_samples_leaf_, reg.min_samples_leaf)
    # Default n_iter_ depends on the loss: 10 for "LS", otherwise 5.
    if reg.n_iter is None:
        if reg.loss == "LS":
            self.assertEqual(reg.n_iter_, 10)
        else:
            self.assertEqual(reg.n_iter_, 5)
    else:
        self.assertEqual(reg.n_iter_, reg.n_iter)
def test_regressor_sparse_input(self):
    # Fit/predict must accept CSR, CSC and COO inputs and stay accurate.
    model = RGFRegressor(prefix='reg')
    for to_sparse in (csr_matrix, csc_matrix, coo_matrix):
        X_sp = to_sparse(self.X)
        model.fit(X_sp, self.y)
        predictions = model.predict(X_sp)
        self.assertLess(mean_squared_error(self.y, predictions), 6.0)
def test_regressor_sparse_input(self):
    # Every scipy sparse matrix format must work for both fit and predict.
    model = RGFRegressor()
    sparse_formats = (sparse.bsr_matrix, sparse.coo_matrix, sparse.csc_matrix,
                      sparse.csr_matrix, sparse.dia_matrix, sparse.dok_matrix,
                      sparse.lil_matrix)
    for to_sparse in sparse_formats:
        X_sp = to_sparse(self.X)
        model.fit(X_sp, self.y)
        mse = mean_squared_error(self.y, model.predict(X_sp))
        self.assertLess(mse, 6.0)
def test_joblib_pickle(self):
    """A joblib dump/load round trip must reproduce identical predictions
    even after the estimator's temporary model files were removed.

    Fix: the dumped 'test_reg.pkl' file is now deleted in a ``finally``
    block so the test no longer leaves an artifact on disk (the original
    never removed it, even on success).
    """
    reg = RGFRegressor()
    reg.fit(self.X_train, self.y_train)
    y_pred1 = reg.predict(self.X_test)
    joblib.dump(reg, 'test_reg.pkl')
    # Remove model file
    _cleanup()
    try:
        reg2 = joblib.load('test_reg.pkl')
        y_pred2 = reg2.predict(self.X_test)
        np.testing.assert_allclose(y_pred1, y_pred2)
    finally:
        # Always clean up the pickle file, pass or fail.
        if os.path.exists('test_reg.pkl'):
            os.remove('test_reg.pkl')
def test_pickle(self):
    # A pickle round trip must yield a model with identical predictions.
    model = RGFRegressor()
    model.fit(self.X_train, self.y_train)
    expected = model.predict(self.X_test)
    payload = pickle.dumps(model)
    # Remove model file
    _cleanup()
    restored = pickle.loads(payload)
    actual = restored.predict(self.X_test)
    np.testing.assert_allclose(expected, actual)
def test_cleanup(self):
    # cleanup() must remove this estimator's temp files (first call returns a
    # non-zero count, second call 0), invalidate the fitted state, and leave
    # an independent fitted estimator untouched.
    reg1 = RGFRegressor()
    reg1.fit(self.X_train, self.y_train)
    reg2 = RGFRegressor()
    reg2.fit(self.X_train, self.y_train)
    self.assertNotEqual(reg1.cleanup(), 0)
    self.assertEqual(reg1.cleanup(), 0)
    # No files with reg1's prefix may remain in the temp directory.
    glob_file = os.path.join(_get_temp_path(), reg1._file_prefix + "*")
    self.assertFalse(glob.glob(glob_file))
    # A cleaned-up model can no longer predict ...
    self.assertRaises(NotFittedError, reg1.predict, self.X_test)
    # ... but the untouched model still can.
    reg2.predict(self.X_test)
def test_params(self):
    # All valid parameters together must fit cleanly; each invalid value,
    # applied on top of a valid configuration, must raise ValueError.
    model = RGFRegressor()
    valid_params = dict(max_leaf=300, test_interval=100, algorithm='RGF_Sib',
                        loss='Log', reg_depth=1.1, l2=0.1, sl2=None,
                        normalize=False, min_samples_leaf=9, n_iter=None,
                        n_tree_search=2, opt_interval=100, learning_rate=0.4,
                        verbose=True, prefix='rgf_regressor', inc_prefix=True,
                        clean=True)
    model.set_params(**valid_params)
    model.fit(self.X_train, self.y_train)
    non_valid_params = dict(max_leaf=0, test_interval=0, algorithm='RGF_Test',
                            loss=True, reg_depth=0.1, l2=11, sl2=-1.1,
                            normalize='False', min_samples_leaf=0.7,
                            n_iter=11.1, n_tree_search=0, opt_interval=100.1,
                            learning_rate=-0.5, verbose=-1, prefix='',
                            inc_prefix=1, clean=0)
    for name, bad_value in non_valid_params.items():
        model.set_params(**valid_params)  # Reset to valid params
        model.set_params(**{name: bad_value})  # Pick and set one non-valid parametr
        self.assertRaises(ValueError, model.fit, self.X_train, self.y_train)
def test_sample_weight(self):
    """Unit sample weights must reproduce unweighted predictions, and
    near-zero weights must effectively remove corrupted samples.

    Fix: outliers are injected into a *copy* of ``self.X_train`` instead of
    mutating the shared fixture in place. The original wrote
    ``self.X_train[idx] = -99999``, which leaked corrupted data into every
    test that ran after this one (test-order-dependent failures).
    """
    reg = RGFRegressor()
    y_pred = reg.fit(self.X_train, self.y_train).predict(self.X_test)
    y_pred_weighted = reg.fit(self.X_train, self.y_train,
                              np.ones(self.y_train.shape[0])
                              ).predict(self.X_test)
    # Weights of all ones must be equivalent to no weights at all.
    np.testing.assert_allclose(y_pred, y_pred_weighted)
    np.random.seed(42)
    idx = np.random.choice(400, 80, replace=False)
    X_corrupt = self.X_train.copy()
    X_corrupt[idx] = -99999  # Add some outliers
    y_pred_corrupt = reg.fit(X_corrupt, self.y_train).predict(self.X_test)
    mse_corrupt = mean_squared_error(self.y_test, y_pred_corrupt)
    weights = np.ones(self.y_train.shape[0])
    # Smallest positive float32 weight effectively eliminates the outliers.
    weights[idx] = np.nextafter(np.float32(0), np.float32(1))
    y_pred_weighted = reg.fit(X_corrupt, self.y_train,
                              weights).predict(self.X_test)
    mse_fixed = mean_squared_error(self.y_test, y_pred_weighted)
    # Down-weighting the corrupted rows must improve the error.
    self.assertLess(mse_fixed, mse_corrupt)
def test_params(self):
    # Variant with memory_policy: valid parameters fit cleanly; every
    # individually-invalid value must make fit() raise ValueError.
    model = RGFRegressor()
    valid_params = dict(max_leaf=300, test_interval=100, algorithm='RGF_Sib',
                        loss='Log', reg_depth=1.1, l2=0.1, sl2=None,
                        normalize=False, min_samples_leaf=9, n_iter=None,
                        n_tree_search=2, opt_interval=100, learning_rate=0.4,
                        memory_policy='conservative', verbose=True)
    model.set_params(**valid_params)
    model.fit(self.X_train, self.y_train)
    non_valid_params = dict(max_leaf=0, test_interval=0, algorithm='RGF_Test',
                            loss=True, reg_depth=0.1, l2=11, sl2=-1.1,
                            normalize='False', min_samples_leaf=0.7,
                            n_iter=11.1, n_tree_search=0, opt_interval=100.1,
                            learning_rate=-0.5, memory_policy='Generos',
                            verbose=-1)
    for name, bad_value in non_valid_params.items():
        model.set_params(**valid_params)  # Reset to valid params
        model.set_params(**{name: bad_value})  # Pick and set one non-valid parametr
        self.assertRaises(ValueError, model.fit, self.X_train, self.y_train)
# Drop irrelevant columns from train and test x_train = x_train.drop([id_column], axis=1) test = test.drop([id_column], axis=1) # ----------------------------------------------------------------------------- # STEP 4 - TRAIN ML MODEL AND GENERATE PREDICTIONS # ----------------------------------------------------------------------------- # XGBoost if ml_algorithm == 'XGBoost': d_train = xgb.DMatrix(x_train, label=y_train) d_test = xgb.DMatrix(test) model = xgb.train(params, d_train, num_rounds, verbose_eval=10) prediction = model.predict(d_test) # LightGBM elif ml_algorithm == 'LightGBM': d_train = lgb.Dataset(x_train, label=y_train) d_test = lgb.Dataset(test) model = lgb.train(params, d_train, num_rounds, verbose_eval=10) prediction = model.predict(d_test) # RGFRegressor, FastRGFRegressor, Ridge Regression, Lasso Regression else: model.fit(x_train, y_train) prediction = model.predict(test) # ----------------------------------------------------------------------------- # STEP 5 - GENERATE KAGGLE SUBMISSION FILE # ----------------------------------------------------------------------------- print('Generate Submission ...') submission = submission.append(pd.DataFrame( {id_column_label: test_id, target_column_label: prediction})) submission.to_csv(submission_file_path, index=False)
from rgf.sklearn import FastRGFRegressor, RGFRegressor

# Benchmark script: compare fit+score wall time and R^2 of RGF, FastRGF
# (and, continuing below this chunk, RandomForest) on the Boston dataset.
boston = load_boston()
rng = check_random_state(42)
perm = rng.permutation(boston.target.size)  # deterministic shuffle
boston.data = boston.data[perm]
boston.target = boston.target[perm]
# First 300 shuffled samples for training, the rest for testing.
train_x = boston.data[:300]
test_x = boston.data[300:]
train_y = boston.target[:300]
test_y = boston.target[300:]

start = time.time()
reg = RGFRegressor()
reg.fit(train_x, train_y)
score = reg.score(test_x, test_y)
end = time.time()
print("RGF: {} sec".format(end - start))
print("score: {}".format(score))

start = time.time()
reg = FastRGFRegressor()
reg.fit(train_x, train_y)
score = reg.score(test_x, test_y)
end = time.time()
print("FastRGF: {} sec".format(end - start))
print("score: {}".format(score))

start = time.time()
reg = RandomForestRegressor(n_estimators=100)
def stacklearning(self):
    # Stacked-regression experiment: fingerprint/descriptor extraction
    # transformers feed several RGF pipelines that are combined via
    # StackingRegressor, then evaluated with cross-validation and
    # train/test RMSE / correlation printouts.

    class extAll(BaseEstimator, TransformerMixin):
        # Placeholder transformer; returns self unchanged from every method.
        def __init__(self):
            pass
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            return self
        def predict(self, X):
            return self

    class extMorgan(BaseEstimator, TransformerMixin):
        # Keeps only the Morgan fingerprint table from sepTables(X).
        def __init__(self):
            pass
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            _,morgan,_=sepTables(X)
            return morgan

    class extMACCS(BaseEstimator, TransformerMixin):
        # Morgan + MACCS fingerprint tables concatenated column-wise.
        def __init__(self):
            pass
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            maccs,morgan,_=sepTables(X)
            maccs = pd.concat([morgan,maccs],axis=1)
            return maccs

    class extDescriptor(BaseEstimator, TransformerMixin):
        # Morgan + descriptor + MACCS tables concatenated column-wise.
        def __init__(self):
            pass
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            maccs,morgan,descriptor=sepTables(X)
            descriptor = pd.concat([morgan,descriptor],axis=1)
            descriptor = pd.concat([maccs,descriptor],axis=1)
            return descriptor

    class extPCA(BaseEstimator, TransformerMixin):
        # 64-component PCA of X appended to the Morgan table.
        def __init__(self):
            pass
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            model = PCA(n_components=64)
            _,morgan,_=sepTables(X)
            morgan = morgan.reset_index().drop('index', axis=1)
            W = pd.DataFrame(model.fit_transform(X))
            W = pd.concat([morgan,W],axis=1)
            return W

    lgbm = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
    # Separate RGF instances so each pipeline owns an independent model.
    rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    rgf1 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    rgf2 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    rgf3 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    rgf4 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    pipe1 = make_pipeline(extMACCS(), rgf)
    pipe2 = make_pipeline(extMorgan(), rgf1)
    pipe3 = make_pipeline(extDescriptor(), rgf2)
    pipe4 = make_pipeline(extPCA(), rgf3)
    pipe7 =make_pipeline(extDescriptor(), rgf4)
    pipe8 =make_pipeline(extDescriptor(), rgf4)
    xgb = xgboost.XGBRegressor()
    nbrs = KNeighborsRegressor(2)
    svr = SVR(gamma='auto',kernel='linear')
    sgd = SGDRegressor(max_iter=1000)
    pls = PLSRegression(n_components=3)
    ext = ExtraTreesRegressor(n_estimators=30,max_features= 20,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
    pipe5 = make_pipeline(extMorgan(), nbrs)
    pipe6 = make_pipeline(extMACCS(), rgf)
    alldata = make_pipeline(extAll())
    meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)
    # First-level stack of the three fingerprint pipelines; stack2 nests it.
    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3], meta_regressor=rgf, verbose=1)
    #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)
    stack2 = StackingRegressor(regressors=[stack1,pipe5,pipe7,pipe1], meta_regressor=rgf,verbose=1)
    # Cross-validated evaluation, then train/test diagnostics for several fits.
    scores = cross_val_score(stack2, X, y, cv=10)
    print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), 'stacking'))
    stack1_score = cross_val_score(stack1,X,y, cv=10)
    rgf_score = cross_val_score(rgf,X,y,cv=10)
    stack2.fit(X_train, y_train)
    y_pred = stack2.predict(X_train)
    y_val = stack2.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
    rgf.fit(X_train, y_train)
    y_pred = rgf.predict(X_train)
    y_val = rgf.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
    pipe1.fit(X_train, y_train)
    y_pred = pipe1.predict(X_train)
    y_val = pipe1.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
    # Column-selection experiments: only the last assignment to cols is used.
    cols = np.arange(1,550,1).tolist()
    cols = X.columns.tolist()
    cols = [1,2,3]
    # Initializing Classifiers
    reg1 = Ridge(random_state=1)
    #reg2 = ExtraTreesRegressor()
    reg2 = ExtraTreesRegressor(n_estimators=50,max_features= 50,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
    reg3 = SVR(gamma='auto',kernel='linear')
    reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
    pls = PLSRegression(n_components=3)
    pipe1 = make_pipeline(ColumnSelector(cols=cols), ExtraTreesRegressor(n_estimators=50))
    #linear =SGDRegressor(max_iter=1000)
    rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    nbrs = KNeighborsRegressor(2)
    pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))
    meta = ExtraTreesRegressor(n_estimators=50,max_features= 7,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
    stackReg = StackingRegressor(regressors=[reg1,reg2, reg3,pipe1,pls,nbrs,rgf], meta_regressor=meta,verbose=1)
    stackReg.fit(X_train, y_train)
    y_pred = stackReg.predict(X_train)
    y_val = stackReg.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
    # NOTE(review): rgf is fitted here but the never-fitted reg4 is used for
    # the predictions below — looks like a bug; confirm intended estimator.
    rgf.fit(X_train, y_train)
    y_pred = reg4.predict(X_train)
    y_val = reg4.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
def run(seed):
    """Train the age-prediction pipeline for one seed.

    Builds five intermediate 'score' features (fnc/agg/pca/im/dl) from
    blended base regressors, trains a second-level model zoo on several
    column subsets, stacks the out-of-fold predictions with RGF and
    BayesianRidge, and writes models and predictions to disk.
    """
    # create folders for scores models and preds
    folder_models = './models/age/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/age/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)
    print('Loading data...')
    # load biases
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')
    # load classifier and add extra sites2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')
    # load competiton data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')
    ids_df = ids_df.append(extra_site)
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())
    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')
    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    # Concatenate the 6 PCA feature chunks column-wise (dropping repeated Id).
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)
    # merge data
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])
    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')
    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()
    # split train and test: rows with labels are train, the rest test
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1
    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['age'].copy().reset_index(drop=True)
    # apply biases (site2 rows get the site-specific correction)
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]
    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]
    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]
    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)
    # I. Create fnc score
    print('Creating FNC score...')
    # prepare datasets for fnc score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, fnc_cols)
    # define models
    names = ['RGF', 'ENet', 'BRidge', 'Huber', 'OMP']
    names = [name + '_fnc_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000, reg_depth=5, normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
        BayesianRidge(),
        HuberRegressor(epsilon=2.5, alpha=1),
        OrthogonalMatchingPursuit(n_nonzero_coefs=300)
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 5, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 5, names)
    # save oof, pred, models
    np.save(folder_preds + 'fnc_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'fnc_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # II. Create agg score
    print('Creating AGG score...')
    # prepare datasets for agg score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, agg_cols)
    # define models
    names = ['RGF', 'ENet', 'Huber']
    names = [name + '_agg_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000, reg_depth=5, min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
        HuberRegressor(epsilon=2.5, alpha=1)
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'agg_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'agg_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # III. Create pca score
    print('Creating PCA score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, pca_cols)
    # define models
    names = ['RGF', 'ENet', 'BRidge', 'OMP']
    names = [name + '_pca_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000, reg_depth=5, min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 4, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 4, names)
    # save oof, pred, models
    np.save(folder_preds + 'pca_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'pca_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # IV. Create im score
    print('Creating IM score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, im_cols)
    # define models
    names = ['RGF', 'ENet', 'BRidge', 'OMP']
    names = [name + '_im_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000, reg_depth=5, min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 4, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 4, names)
    # save oof, pred, models
    np.save(folder_preds + 'im_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'im_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # V. Create dl score
    print('Creating DL score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, dl_cols)
    # define models
    names = ['RGF', 'ENet', 'BRidge']
    names = [name + '_dl_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000, reg_depth=5, min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'dl_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'dl_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # VI. Training and predicting procedure
    print('Training has started...')
    print('Reading scores from ', folder_preds)
    # add scores computed above as extra feature columns
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train[prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test[prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]
    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)
    # create differents datasets
    # linear
    linear_cols = sorted(
        list(
            set(ic_cols + fnc_cols + pca_cols + agg_cols + im_cols) -
            set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)
    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_cols=pca_cols)
    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)
    # dl
    dict_cols = sorted(
        list(
            set(ic_cols + fnc_cols + dl_cols + im_cols + agg_cols) -
            set(['IC_20'])))
    train_dl, test_dl = scale_select_data(train, test, df_scale, dict_cols)
    # learning process on different datasets
    names = ['MLP', 'RGF', 'SVM', 'BR', 'OMP', 'EN', 'KR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        MLPRegressor(activation='tanh', random_state=0),
        RGFRegressor(max_leaf=1500, loss='Abs'),
        NuSVR(C=10, nu=0.4, kernel='rbf'),
        BayesianRidge(),
        OrthogonalMatchingPursuitCV(),
        ElasticNet(alpha=0.5, l1_ratio=0.7, random_state=0),
        KernelRidge(kernel='poly', alpha=0.5)
    ]
    zoo = TrendsModelSklearn(pack, seed=seed)
    # Dataset list is aligned position-by-position with the model pack.
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2 +
            [train_dl] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2 +
                        [test_dl] * 2, names, is_blend=False)
    # rewrite folders for models and preds
    folder_models = './models/age/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/age/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)
    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)
    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)
    # stacking predictions
    print('Stacking predictions...')
    folds = KFold(n_splits=10, shuffle=True, random_state=0)
    stack = pd.DataFrame(zoo.oof_preds).T
    stack.columns = names
    model_stacker_rgf = RGFRegressor(max_leaf=1000, reg_depth=25, verbose=False)
    rgf_pred = cross_val_predict(model_stacker_rgf, stack, y.dropna(),
                                 cv=folds, n_jobs=-1)
    model_stacker_br = BayesianRidge()
    br_pred = cross_val_predict(model_stacker_br, stack, y.dropna(),
                                cv=folds, n_jobs=-1)
    model_stacker_rgf.fit(stack, y.dropna())
    model_stacker_br.fit(stack, y.dropna())
    # save models
    save_pickle(model_stacker_br,
                folder_models + 'BRidge_stack_seed{}'.format(seed))
    save_pickle(model_stacker_rgf,
                folder_models + 'RGF_stack_seed{}'.format(seed))
    # Final blend: 75% BayesianRidge + 25% RGF.
    print('Final age NMAE: {:.5f}'.format(
        NMAE(y, 0.75 * br_pred + 0.25 * rgf_pred)))
    test_preds = pd.DataFrame(preds).T
    test_preds.columns = names
    age_prediction = pd.DataFrame()
    age_prediction['Id'] = test['Id'].values
    age_prediction['pred'] = 0.25 * model_stacker_rgf.predict(
        test_preds) + 0.75 * model_stacker_br.predict(test_preds)
    age_prediction.to_csv(folder_preds + 'age_stack_seed{}.csv'.format(seed),
                          index=False)
    print('age seed pred is saved as',
          folder_preds + 'age_stack_seed{}.csv'.format(seed))
def stacklearning(self):
    # Exploratory experiment log kept as code: factorization machines,
    # boosted trees, RGF, linear models and many StackingRegressor
    # combinations, with historical CV results recorded in comments.

    class sparseNorm(BaseEstimator, TransformerMixin):
        # L2-normalizes a sparse (CSC) view of the input values.
        def __init__(self):
            pass
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            from sklearn import preprocessing
            Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values))
            return Y

    fm = sgd.FMRegression(
        n_iter=4743,
        init_stdev=0.1,
        rank=100,
        l2_reg_w=0,
        l2_reg_V=0,
        step_size=0.1,
    )
    # Second assignment overrides the first (tuned hyperparameters).
    fm = sgd.FMRegression(
        n_iter=9943,
        init_stdev=0.1,
        rank=219,
        l2_reg_w=0,
        l2_reg_V=0.06454,
        step_size=0.1,
    )
    pipe = make_pipeline(sparseNorm(), fm)
    calcACC(pipe, X=X2)
    xgb = xgboost.XGBRegressor(
        n_estimators=100, max_depth=7, gamma=0, colsample_bytree=0.1
    )
    lgbm = LGBMRegressor(
        boosting_type='gbdt', num_leaves=367,
        learning_rate=0.06,feature_fraction=0.14, max_depth=28, min_data_in_leaf=8
    )
    rgf = RGFRegressor(
        max_leaf=1211, algorithm="RGF", test_interval=100, loss="LS",
        verbose=False, l2=0.93, min_samples_leaf=2
    )
    rf = RandomForestRegressor(
        max_depth=20, random_state=0, n_estimators=56,min_samples_split=2,
        max_features=0.21
    )
    # Overridden below by a default RandomForestRegressor.
    rf = RandomForestRegressor()
    ext = ExtraTreesRegressor(
        n_estimators=384,max_features= 2228, min_samples_split= 0.01,max_depth= 856,
        min_samples_leaf= 1
    )
    svr = SVR(
        gamma=9.5367431640625e-07, epsilon=0.0009765625, C= 2048.0
    )
    #test combination
    desNew = make_pipeline(extdescriptorNew(),rf)
    morNew = make_pipeline(extMorganNew(),rf)
    kotNew = make_pipeline(extklekotaTothNew(),rf)
    macNew = make_pipeline(extMACCSNew(),rf)
    desMac = make_pipeline(extDescriptorMACCS(),rf)
    morMac = make_pipeline(extMorganMACCS(),rf)
    kotMac = make_pipeline(extKlekotaTothMACCS(),rf)
    morKotNew = make_pipeline(extMorganKlekotaTothNew(),rf)
    des = make_pipeline(extOnlyDescriptor(),rf)
    mor = make_pipeline(extOnlyMorgan(),rf)
    kot = make_pipeline(extOnlyklekotaToth(),rf)
    mac = make_pipeline(extOnlyMACCS(),rf)
    all = make_pipeline(extAll(),rf)
    allwithoutNew = make_pipeline(extAllwithoutNew(),rf)
    allwithoutMaccs = make_pipeline(extAllwithoutMaccs(),rf)
    allwithoutDes = make_pipeline(extAllwithoutDescriptor(),rf)
    testDic = {"Desc+New":desNew,"Mor+New":morNew,"kot+New":kotNew,"MACCS+New":macNew,"Des+MAC":desMac,"Morgan+Maccs":morMac,"Kot+MACCS":kotMac,"mor+kot+New":morKotNew, "descriptor":des,"morgan":mor,"kot":kot,"MACCS":mac,"All":all,"All without " "new":allwithoutNew, "All without MACCS":allwithoutMaccs,"All without Des":allwithoutDes}
    #10fold
    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    #Fingerprinttest
    resultDic={}
    resultDic2={}
    # Cross-validate every feature-combination pipeline and record results.
    for name,model in testDic.items():
        #model = StackingRegressor(regressors=[name], meta_regressor=rf,verbose=1)
        #calcACC(model,X=X,y=y2,name=name)
        Scores = cross_validate(model, X2, y2, cv=cv,scoring=myScoreFunc)
        RMSETmp = Scores['test_RMSE'].mean()
        CORRTmP = Scores['test_Correlation coefficient'].mean()
        resultDic.update({name:[RMSETmp,CORRTmP]})
        print(name,RMSETmp,CORRTmP)
    #stacking
    alldata = make_pipeline(extAll())
    # random forest
    #1.1546 0.70905
    stack = StackingRegressor(regressors=[alldata], meta_regressor=rf,verbose=1)
    # Light Gradient boosting
    # 1.160732 0.703776
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=lgbm,verbose=1)
    # XGboost
    # 1.1839805 0.689571
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=xgb,verbose=1)
    # Regularized greedily forest
    # 1.17050 0.6992
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=rgf,verbose=1)
    #pls 22.808047774809697 0.6410026452910016
    i=4
    # Sweep the PLS component count.
    for i in np.arange(3,11,1):
        pls = PLSRegression(n_components=i)
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=pls,verbose=0)
        calcACC(testmodel)
    pls = PLSRegression(n_components=4)
    #SVR
    svr = SVR(gamma=9.5367431640625/10000000,C=1559.4918100725592, epsilon=0.0009765625,)
    svr = SVR(kernel='rbf',gamma=9.5367431640625e-07,epsilon=0.0009765625,C=2048.0)
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=svr, verbose=1)
    calcACC(svr)
    #Extratree 1.157420824123527 0.7061010221224269
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=ext, verbose=1)
    calcACC(testmodel)
    #k-NN
    nbrs = KNeighborsRegressor(3)
    ##Linear regressions
    #Stochastic Gradient Descenta
    sgd = SGDRegressor(max_iter=1000)
    # Ridge
    for i in [1,10,100,1000]:
        ridge = Ridge(alpha=i)
        calcACC(ridge)
    ridge = Ridge(alpha=45.50940042350705)
    calcACC(ridge)
    # multiple linear
    lin = make_pipeline(forlinear(),LinearRegression(n_jobs=-1))
    calcACC(lin)
    #stacking
    #0.69
    testmodel = StackingRegressor(regressors=[alldata,nbrs,all], meta_regressor=rf,verbose=1)
    #1.1532 0.70926
    testmodel = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf, verbose=1)
    #1.16420 0.7041
    testmodel = StackingRegressor(regressors=[alldata,alldata,all], meta_regressor=rf,verbose=1)
    #1.16379 0.7044
    stack1 = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,verbose=1)
    testmodel = StackingRegressor(regressors=[alldata,stack1,stack1], meta_regressor=rf,verbose=1)
    #1.1535496740699531 0.7108839199109559
    pcaFeature = make_pipeline(extPCA())
    testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                  ,meta_regressor=rf,verbose=1)
    #1.181801005432221 0.6889745579620922
    testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                  ,meta_regressor=lgbm,verbose=1)
    #0.70613
    testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                  ,meta_regressor=xgb,verbose=1)
    #0.71641717
    testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                  ,meta_regressor=rf,verbose=1)
    #0.7146922
    testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,ridge,rf,xgb,lgbm,rgf,ext]
                                  ,meta_regressor=rf,verbose=1)
    #new features
    pcaFeature = make_pipeline(extPCA())
    #old
    pipe1 = make_pipeline(extMACCS(), rf)
    pipe2 = make_pipeline(extMorgan(), rf)
    pipe3 = make_pipeline(extDescriptor(), rf)
    pipe4 = make_pipeline(extPCA(), rgf)
    pipe7 =make_pipeline(extDescriptor(), rgf)
    pipe8 =make_pipeline(extDescriptor(), rgf)
    xgb = xgboost.XGBRegressor()
    nbrs = KNeighborsRegressor(2)
    svr = SVR(gamma='auto',kernel='linear')
    pls = PLSRegression(n_components=4)
    extMACCSdata = make_pipeline(extMACCS())
    nbrsPipe = make_pipeline(extMorgan(), nbrs)
    pipe6 = make_pipeline(extMACCS(), rgf)
    alldata = make_pipeline(extAll())
    ave = extAverage()
    withoutdesc = make_pipeline(extMACCS())
    meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)
    #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1)
    #0.70
    stack = StackingRegressor(regressors=[pipe1,pipe2,pipe3,xgb,lgbm,rgf,rf], meta_regressor=ave, verbose=1)
    #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)
    #0.69######################
    stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
    #0.70
    stack2 = StackingRegressor(regressors=[stack1,alldata,rgf,lgbm,xgb], meta_regressor=rf,verbose=1)
    #0.71
    stack3 = StackingRegressor(regressors=[stack2,pipe1], meta_regressor=ave, verbose=1)
    ###########################
    ###########################
    stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
    stack2 = StackingRegressor(regressors=[stack1,withoutdesc,lgbm,rgf], meta_regressor=rf,verbose=1)
    stack3 = StackingRegressor(regressors=[stack2,pipe1,xgb], meta_regressor=ave, verbose=1)
    ###########################
    #stackingwithknn
    stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
    stack2 = StackingRegressor(regressors=[stack1,nbrs,pipe1], meta_regressor=rf, verbose=1)
    #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1)
    # Second cv assignment wins: 10-fold KFold.
    cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    St1Scores = cross_validate(stack1,X,y,cv=cv)
    St1Scores['test_score'].mean()**(1/2)
    St2Scores = cross_validate(stack2,X,y,cv=cv)
    St2Scores['test_score'].mean()**(1/2)
    St3Scores = cross_validate(stack3,X,y,cv=cv)
    St3Scores['test_score'].mean()**(1/2)
    stackScore = cross_validate(stack, X, y, cv=cv)
    stackScore['test_score'].mean()**(1/2)
    lgbmScores =cross_validate(lgbm,X,y,cv=cv)
    lgbmScores['test_score'].mean()**(1/2)
    rgfScores = cross_validate(rgf,X,y,cv=cv)
    rgfScores['test_score'].mean()**(1/2)
    RFScores = cross_validate(rf,X,y,cv=cv)
    RFScores['test_score'].mean()**(1/2)
    scores = cross_validate(stack2,X,y,cv=cv)
    scores['test_score'].mean()**(1/2)
    print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores['test_score'].mean(), scores['test_score'].std(), 'stacking'))
    stack3.fit(X, y)
    y_pred = stack3.predict(X_train)
    y_val = stack3.predict(X_test)
    #stack3.score(X_train, y_train)
    exX = preprocess(extractDf, changeList)
    # Predictions are de-log-transformed (base 10) before export.
    valy = (10 **(stack3.predict(exX))).tolist()
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
    stack1.fit(X, y)
    valy = (10 **(stack1.predict(exX))).tolist()
    sgd.fit(X,y)
    valy = (10 **(sgd.predict(exX))).tolist()
    rgfpipe = make_pipeline(extMACCS(), rf)
    rgf.fit(X,y)
    valy = (10 **(rgf.predict(exX))).tolist()
    nbrs.fit(X,y)
    valy = (10 **(nbrs.predict(exX))).tolist()
    pipe = make_pipeline(extMACCS(), rf)
    pipe.fit(X,y)
    valy = (10 **(pipe.predict(exX))).tolist()
    rf.fit(X, y)
    y_pred = rf.predict(X_train)
    y_val = rf.predict(X_test)
    exX = preprocess(extractDf, changeList)
    valy = (10 **(rf.predict(exX))).tolist()
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
    lgbm.fit(X, y)
    #y_pred = pipe1.predict(X_train)
    #y_val = pipe1.predict(X_test)
    exX = preprocess(extractDf, changeList)
    valy = (10 **(lgbm.predict(exX))).tolist()
    # NOTE(review): these prints reuse y_pred/y_val from the rf fit above,
    # not lgbm predictions — confirm whether that is intended.
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
class RGF(ModelBase):
    """Level-1 model wrapping RGFRegressor for k-fold stacking.

    `train` fits/evaluates one model per fold; `infer` refits per fold and
    writes out-of-fold, holdout-mean and submit-mean predictions under a
    given column name (`head`).
    """

    # Columns never used as features. Class-level default; __fit copies it
    # before adding id columns (the original extended this shared list in
    # place, accumulating duplicates across folds and instances).
    _l_drop_cols = ['Item_Outlet_Sales', 'index']

    ## training, parameter tuning for single L1
    def train(self, importance=False):
        """Fit and evaluate on each k-fold split.

        Returns a dict mapping fold index -> validation RMSE.
        """
        print('\n parameters %s \n' % self.parameters)
        d_fold_val = {}
        for fold in range(self.kfold):
            print('\n---- fold %s begins.\n' % fold)
            ## load fold data
            TrainFile = '%s/kfold/%s/train.%s' % (self.InputDir, fold, self.data_format)
            TestFile = '%s/kfold/%s/test.%s' % (self.InputDir, fold, self.data_format)
            self.TrainData = DataUtil.load(TrainFile, format=self.data_format)
            self.TestData = DataUtil.load(TestFile, format=self.data_format)
            ## train and predict on valid
            self.__fit()
            # renamed from `eval`, which shadowed the builtin
            fold_rmse = self.__predict()
            d_fold_val[fold] = fold_rmse
            ## save fold data back out
            OutputDir = '%s/kfold/%s' % (self.OutputDir, fold)
            os.makedirs(OutputDir, exist_ok=True)  # race-free replacement for exists()+makedirs()
            DataUtil.save(self.TrainData, '%s/train.%s' % (OutputDir, self.data_format),
                          format=self.data_format)
            DataUtil.save(self.TestData, '%s/test.%s' % (OutputDir, self.data_format),
                          format=self.data_format)
            print('\n---- Fold %d done. ----\n' % fold)
        return d_fold_val

    ## inferring for fold data and holdout data
    def infer(self, head, HoldoutData, SubmitData, metric_pk=False):
        """Refit per fold and emit predictions.

        Writes per-fold out-of-fold predictions under kfold/<fold>/, and the
        fold-averaged predictions for the holdout and submit sets. When
        `metric_pk` is true, prints single-model vs ensemble RMSE on holdout.
        """
        l_pred_fold = []
        PredHoldout = pd.DataFrame(index=HoldoutData.index)
        PredHoldout['index'] = HoldoutData['index']
        PredHoldout['Item_Outlet_Sales'] = HoldoutData['Item_Outlet_Sales']
        PredSubmit = pd.DataFrame(index=SubmitData.index)
        for fold in range(self.kfold):
            ## load
            TrainFile = '%s/kfold/%s/train.%s' % (self.InputDir, fold, self.data_format)
            TestFile = '%s/kfold/%s/test.%s' % (self.InputDir, fold, self.data_format)
            self.TrainData = DataUtil.load(TrainFile, format=self.data_format)
            self.TestData = DataUtil.load(TestFile, format=self.data_format)
            ## fit on this fold's train split
            PredFold = pd.DataFrame(index=self.TestData.index)
            PredFold['index'] = self.TestData['index']
            PredFold['Item_Outlet_Sales'] = self.TestData['Item_Outlet_Sales']
            PredFold['fold'] = fold
            self.__fit()
            ## inferring: out-of-fold, holdout and submit predictions
            PredFold[head] = self._model.predict(
                self.TestData[self._l_train_columns])
            PredHoldout['fold%s' % (fold)] = self._model.predict(
                HoldoutData[self._l_train_columns])
            PredSubmit['fold%s' % fold] = self._model.predict(
                SubmitData[self._l_train_columns])
            l_pred_fold.append(PredFold)
        ## aggregate folds data
        PredKFold = pd.concat(l_pred_fold, axis=0, ignore_index=True)
        ## save for folds data
        for fold in range(self.kfold):
            FoldOutputDir = '%s/kfold/%s' % (self.OutputDir, fold)
            os.makedirs(FoldOutputDir, exist_ok=True)
            TrainFile = '%s/train.%s' % (FoldOutputDir, self.data_format)
            TestFile = '%s/test.%s' % (FoldOutputDir, self.data_format)
            TrainData = PredKFold[PredKFold['fold'] != fold]
            TestData = PredKFold[PredKFold['fold'] == fold]
            DataUtil.save(TrainData, TrainFile, format=self.data_format)
            DataUtil.save(TestData, TestFile, format=self.data_format)
        HoldCols = [
            col for col in PredHoldout.columns if col.startswith('fold')
        ]
        ## save for holdout data (ensemble = mean over folds)
        PredHoldout[head] = PredHoldout[HoldCols].mean(axis=1)
        HoldoutOutputDir = '%s/holdout' % self.OutputDir
        os.makedirs(HoldoutOutputDir, exist_ok=True)
        DataUtil.save(PredHoldout, '%s/test.%s' % (HoldoutOutputDir, self.data_format),
                      format=self.data_format)
        ## save for submit data
        PredSubmit[head] = PredSubmit[HoldCols].mean(axis=1)
        SubmitOutputDir = '%s/submit' % self.OutputDir
        os.makedirs(SubmitOutputDir, exist_ok=True)
        DataUtil.save(PredSubmit, '%s/test.%s' % (SubmitOutputDir, self.data_format),
                      format=self.data_format)
        ## metric PK: compare every single-model feature column vs the ensemble
        if (metric_pk):
            d_metric = {}
            for col in self._l_train_columns:
                diff = (HoldoutData[col] - HoldoutData['Item_Outlet_Sales'])
                rmse = np.sqrt(np.sum(diff * diff) / len(diff))
                d_metric[col] = rmse
            diff = PredHoldout[head] - PredHoldout['Item_Outlet_Sales']
            ensemble_metric = np.sqrt(np.sum(diff * diff) / len(diff))
            print('\n===== metric pk result ====\n')
            print('single model: %s, ensemble model %s: %s' %
                  (d_metric, head, ensemble_metric))
            print('\n===== metric pk result ====\n')
        return

    ## L1 fitting
    def __fit(self):
        """Fit an RGFRegressor on self.TrainData.

        Features are all columns minus the target, 'index' and any
        Item_Identifier dummy columns; sets self._model and
        self._l_train_columns.
        """
        start = time.time()
        id_cols = [
            col for col in self.TrainData.columns
            if (col.startswith('Item_Identifier'))
        ]
        # Build the drop list locally. BUG FIX: the original did
        # `self._l_drop_cols.extend(id_cols)`, mutating the class-level list
        # and growing it on every call.
        l_drop_cols = self._l_drop_cols + id_cols
        X = self.TrainData.drop(l_drop_cols, axis=1)
        Y = self.TrainData['Item_Outlet_Sales']
        self._l_train_columns = X.columns
        print('Size of feature space: %s' % len(self._l_train_columns))
        self._model = RGFRegressor(
            algorithm=self.parameters['algorithm'],
            loss=self.parameters['loss'],
            learning_rate=self.parameters['learning_rate'],
            n_iter=self.parameters['n_iter'],
            reg_depth=self.parameters['reg_depth'],
            l2=self.parameters['l2'],
            sl2=self.parameters['sl2'],
            #min_samples_leaf= self.parameters['min_samples_leaf'],
            max_leaf=self.parameters['max_leaf'],
            verbose=True)
        self._model.fit(X, Y)
        end = time.time()
        print('\nTraining is done. Time elapsed %ds' % (end - start))
        return

    ## predict
    def __predict(self):
        """Predict on self.TestData and return RMSE against the true target."""
        start = time.time()
        x_test = self.TestData[self._l_train_columns]
        pred_test = self._model.predict(x_test)
        truth_test = self.TestData['Item_Outlet_Sales']
        ## RMSE
        diff = (pred_test - truth_test)
        rmse = np.sqrt(np.sum(diff * diff) / len(diff))
        end = time.time()
        print('\n Prediction done. Time consumed %ds' % (end - start))
        return rmse
): # explain for regression convert y to bins and use that for split dev_X, val_X = train.iloc[dev_index, :], train.iloc[val_index, :] dev_y, val_y = y[dev_index], y[val_index] dev_X = dev_X[(dev_y > lbound) & (dev_y < ubound)] dev_y = dev_y[(dev_y > lbound) & (dev_y < ubound)] val_X2 = val_X[(val_y > lbound) & (val_y < ubound)] val_y2 = val_y[(val_y > lbound) & (val_y < ubound)] print(dev_X.shape) rgf = RGFRegressor(max_leaf=1000, algorithm="RGF_Sib", test_interval=100, loss="LS", learning_rate=0.01, verbose=False) model = rgf.fit(dev_X, dev_y) print("predicting..") preds = model.predict(val_X) oobval[val_index] += preds.reshape(-1, 1) valerr.append(mean_absolute_error(val_y, preds)) print(valerr, "mean:", np.mean(valerr), "std:", np.std(valerr)) oobtest += model.predict(test.values).reshape(-1, 1) val_scores.append(mean_absolute_error(model.predict(valid), yval)) del (rgf, model) gc.collect() print(val_scores, np.mean(val_scores), "---", np.std(val_scores)) pred2 = oobtest / (nbag * nfold) oobpred2 = oobval / (nbag) print(mean_absolute_error(y, oobpred2))
def rgf_state_prediction(state, lookback, horizon, predictors):
    """Train per-horizon RGF models for every city in every cluster of `state`.

    For each city: builds `lookback`-lagged features, fits one RGFRegressor
    per forecast step d in 1..horizon, writes a per-horizon metrics pickle to
    saved_models/rgf/<state>/ and plots the predictions. Cities whose metrics
    file already exists are skipped. Returns None.
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            # Skip cities already processed in a previous run.
            if os.path.isfile('saved_models/rgf/{}/rgf_metrics_{}.pkl'.format(
                    state, city)):
                print(city, 'done')
                continue
            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            casos_columns = ['casos_{}'.format(i) for i in group]
            data = data_full.drop(casos_columns, axis=1)
            data_lag = build_lagged_features(data, lookback)
            # BUG FIX: dropna() is not in place; the original discarded the
            # result, silently keeping rows with NaN lag values.
            data_lag = data_lag.dropna()
            # One target series per forecast step: shift the series back by
            # (d-1) and trim the trailing (d-1) now-invalid rows. d == 1 is
            # special-cased because [:-0] would produce an empty slice.
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
            X_data = data_lag.drop(casos_est_columns, axis=1)
            # shuffle=False keeps chronological order (time-series split).
            X_train, X_test, y_train, y_test = train_test_split(
                X_data,
                data_lag[target],
                train_size=0.7,
                test_size=0.3,
                shuffle=False)
            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error',
                                          'r2_score'))
            for d in range(1, horizon + 1):
                model = RGFRegressor(max_leaf=300,
                                     algorithm="RGF_Sib",
                                     test_interval=100,
                                     loss="LS",
                                     verbose=False)
                tgt = targets[d][:len(X_train)]
                tgtt = targets[d][len(X_train):]
                try:
                    model.fit(X_train, tgt)
                except ValueError:
                    print(
                        '-----------------------------------------------------'
                    )
                    print(city, 'ERRO')
                    print(
                        '-----------------------------------------------------'
                    )
                    break
                pred = model.predict(X_data[:len(targets[d])])
                # Pad with NaN so every horizon column spans all of data_lag.
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                pred_m = model.predict(X_test[:(len(tgtt))])
                metrics[d] = calculate_metrics(pred_m, tgtt)
            metrics.to_pickle('{}/{}/rgf_metrics_{}.pkl'.format(
                'saved_models/rgf', state, city))
            plot_prediction(preds, targets[1], city_name, len(X_train))
            # plt.show()
    return None
# In[9]: #https://www.analyticsvidhya.com/blog/2018/02/introductory-guide-regularized-greedy-forests-rgf-python/ ###############Classifier##################### from rgf.sklearn import RGFRegressor from sklearn.model_selection import GridSearchCV from sklearn.utils.validation import check_random_state from sklearn.model_selection import StratifiedKFold, cross_val_score rgf = RGFRegressor(max_leaf=400, algorithm="RGF_Sib", test_interval=100, verbose=True) rgf.fit(train_all[features], train_all['kda_ratio']) valid_preds = list(rgf.predict(validation_all[features])) test_preds = list(rgf.predict(test_all[features])) valid_preds = model.predict(validation_all[features]) print('The rmse of prediction using validation set is:', mean_squared_error(validation_set['kda_ratio'], valid_preds) ** 0.5) test_preds = list(rgf.predict(test_all[features])) ##Using grid serach parameters = {'max_leaf':[1000,1200,1300,1400,1500,1600,1700,1800,1900,2000], 'l2':[0.1,0.2,0.3], 'min_samples_leaf':[5,10]} model = GridSearchCV(estimator=rgf,