def test_attributes(self):
    """Fitted attributes must raise before ``fit`` and be consistent after it."""
    clf = RGFClassifier()
    fitted_only = ('estimators_', 'classes_', 'n_classes_', 'n_features_',
                   'fitted_', 'sl2_', 'min_samples_leaf_', 'n_iter_')

    # Before fitting, reading any of these attributes must raise.
    for name in fitted_only:
        with self.assertRaises(NotFittedError):
            getattr(clf, name)

    clf.fit(self.X_train, self.y_train)

    labels = np.unique(self.y_train)
    self.assertEqual(len(clf.estimators_), len(labels))
    np.testing.assert_array_equal(clf.classes_, sorted(labels))
    self.assertEqual(clf.n_classes_, len(clf.estimators_))
    self.assertEqual(clf.n_features_, self.X_train.shape[-1])
    self.assertTrue(clf.fitted_)

    # sl2_ falls back to l2 when sl2 was left unset.
    expected_sl2 = clf.l2 if clf.sl2 is None else clf.sl2
    self.assertEqual(clf.sl2_, expected_sl2)

    if clf.min_samples_leaf >= 1:
        self.assertEqual(clf.min_samples_leaf_, clf.min_samples_leaf)
    else:
        # A fractional setting is only checked against an upper bound.
        self.assertLessEqual(clf.min_samples_leaf_,
                             0.5 * self.X_train.shape[0])

    if clf.n_iter is not None:
        expected_n_iter = clf.n_iter
    elif clf.loss == "LS":
        expected_n_iter = 10
    else:
        expected_n_iter = 5
    self.assertEqual(clf.n_iter_, expected_n_iter)
def run_rgf():
    """Fit an RGF classifier and return positive-class probabilities.

    Reads the module-level ``X_train``, ``y_train``, ``X_valid`` and
    ``X_test``. After predicting, every entry matching ``/tmp/rgf/*`` is
    deleted on a best-effort basis.

    Returns:
        tuple: ``(pred, pred_test)`` -- column 1 of ``predict_proba`` for
        ``X_valid`` and ``X_test`` respectively.
    """
    import os
    import shutil

    model = RGFClassifier(max_leaf=1000,
                          algorithm="RGF",
                          loss="Log",
                          l2=0.01,
                          sl2=0.01,
                          normalize=False,
                          min_samples_leaf=10,
                          n_iter=None,
                          opt_interval=100,
                          learning_rate=.5,
                          calc_prob="sigmoid",
                          n_jobs=-1,
                          memory_policy="generous",
                          verbose=0)
    fit_model = model.fit(X_train, y_train)
    pred = fit_model.predict_proba(X_valid)[:, 1]
    pred_test = fit_model.predict_proba(X_test)[:, 1]

    # Delete in-process rather than via ``rm -rf`` in a shell: a failing
    # shell command does not raise, so the old code reported success even
    # when nothing had been removed.
    cleanup_failed = False
    for path in glob.glob("/tmp/rgf/*"):
        try:
            if os.path.isdir(path) and not os.path.islink(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
        except OSError as e:
            cleanup_failed = True
            print(str(e))
    if not cleanup_failed:
        print("Clean up is successfull")
    # Show whatever is left so a partial cleanup is visible.
    print(glob.glob("/tmp/rgf/*"))
    return pred, pred_test
def test_classifier_sparse_input(self):
    """Fit and score on iris in CSR, CSC and COO form; accuracy must exceed 0.8."""
    clf = RGFClassifier(prefix='clf', calc_prob='Softmax')
    labels = self.iris.target
    for to_sparse in (csr_matrix, csc_matrix, coo_matrix):
        features = to_sparse(self.iris.data)
        clf.fit(features, labels)
        score = clf.score(features, labels)
        failure_msg = "Failed with score = {0:.5f}".format(score)
        self.assertGreater(score, 0.8, failure_msg)
def run_rgf():
    """Fit an RGF classifier and return positive-class probabilities.

    Reads the module-level ``X_train``, ``y_train``, ``X_valid`` and
    ``X_test``. After predicting, every entry matching ``/tmp/rgf/*`` is
    deleted on a best-effort basis.

    Returns:
        tuple: ``(pred, pred_test)`` -- column 1 of ``predict_proba`` for
        ``X_valid`` and ``X_test`` respectively.
    """
    import os
    import shutil

    model = RGFClassifier(
        max_leaf=1000,
        algorithm="RGF",
        loss="Log",
        l2=0.01,
        sl2=0.01,
        normalize=False,
        min_samples_leaf=10,
        n_iter=None,
        opt_interval=100,
        learning_rate=.5,
        calc_prob="sigmoid",
        n_jobs=-1,
        memory_policy="generous",
        verbose=0,
    )
    fit_model = model.fit(X_train, y_train)
    pred = fit_model.predict_proba(X_valid)[:, 1]
    pred_test = fit_model.predict_proba(X_test)[:, 1]

    # Delete in-process rather than via ``rm -rf`` in a shell: a failing
    # shell command does not raise, so the old code reported success even
    # when nothing had been removed.
    cleanup_failed = False
    for path in glob.glob("/tmp/rgf/*"):
        try:
            if os.path.isdir(path) and not os.path.islink(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
        except OSError as e:
            cleanup_failed = True
            print(str(e))
    if not cleanup_failed:
        print("Clean up is successfull")
    # Show whatever is left so a partial cleanup is visible.
    print(glob.glob("/tmp/rgf/*"))
    return pred, pred_test
def test_classifier_sparse_input(self):
    """Fit and score on iris in each listed scipy sparse format; accuracy must exceed 0.8."""
    clf = RGFClassifier(calc_prob='softmax')
    sparse_formats = (
        sparse.bsr_matrix,
        sparse.coo_matrix,
        sparse.csc_matrix,
        sparse.csr_matrix,
        sparse.dia_matrix,
        sparse.dok_matrix,
        sparse.lil_matrix,
    )
    labels = self.iris.target
    for to_sparse in sparse_formats:
        features = to_sparse(self.iris.data)
        clf.fit(features, labels)
        score = clf.score(features, labels)
        failure_msg = "Failed with score = {0:.5f}".format(score)
        self.assertGreater(score, 0.8, failure_msg)
def test_softmax_classifier(self):
    """Softmax probabilities must sum to one per row and accuracy must exceed 0.8."""
    features, labels = self.iris.data, self.iris.target
    clf = RGFClassifier(calc_prob='softmax')
    clf.fit(features, labels)

    row_totals = clf.predict_proba(features).sum(axis=1)
    np.testing.assert_almost_equal(row_totals, np.ones(labels.shape[0]))

    score = clf.score(features, labels)
    print('Score: {0:.5f}'.format(score))
    failure_msg = "Failed with score = {0:.5f}".format(score)
    self.assertGreater(score, 0.8, failure_msg)
def test_bin_classifier(self):
    """Binary task (iris class 2 vs rest): probabilities sum to one, accuracy exceeds 0.8."""
    features = self.iris.data
    # Collapse the three iris classes into "is class 2" / "is not class 2".
    bin_target = (self.iris.target == 2).astype(int)

    clf = RGFClassifier()
    clf.fit(features, bin_target)

    row_totals = clf.predict_proba(features).sum(axis=1)
    np.testing.assert_almost_equal(row_totals, np.ones(bin_target.shape[0]))

    score = clf.score(features, bin_target)
    print('Score: {0:.5f}'.format(score))
    failure_msg = "Failed with score = {0:.5f}".format(score)
    self.assertGreater(score, 0.8, failure_msg)
def test_string_y(self):
    """The classifier must accept string labels; training accuracy must exceed 0.95."""
    clf = RGFClassifier()

    # Turn the integer targets into words, one class at a time.
    y_str = np.array(self.iris.target, dtype=str)
    for digit, word in (('0', 'Zero'), ('1', 'One'), ('2', 'Two')):
        y_str[y_str == digit] = word

    clf.fit(self.iris.data, y_str)
    predicted = clf.predict(self.iris.data)
    score = accuracy_score(y_str, predicted)
    failure_msg = "Failed with score = {0:.5f}".format(score)
    self.assertGreater(score, 0.95, failure_msg)
def test_joblib_pickle(self):
    """A classifier restored with joblib must predict the same as the original.

    The model is dumped to ``test_clf.pkl``, ``_cleanup()`` is called, and
    the reloaded copy is compared on ``self.X_test``. The dump file(s) are
    removed afterwards so the test leaves nothing in the working directory.
    """
    import os

    clf = RGFClassifier()
    clf.fit(self.X_train, self.y_train)
    y_pred1 = clf.predict(self.X_test)

    # joblib.dump returns the list of files it wrote; older releases may
    # write companion files next to the main pickle.
    dumped_files = ['test_clf.pkl']
    try:
        dumped_files = joblib.dump(clf, 'test_clf.pkl') or dumped_files

        # Remove model file
        _cleanup()

        clf2 = joblib.load('test_clf.pkl')
        y_pred2 = clf2.predict(self.X_test)
    finally:
        for path in dumped_files:
            try:
                os.remove(path)
            except OSError:
                # Best effort: the file may never have been created.
                pass

    np.testing.assert_allclose(y_pred1, y_pred2)
def test_pickle(self):
    """A classifier round-tripped through ``pickle`` must predict the same as before."""
    clf = RGFClassifier()
    clf.fit(self.X_train, self.y_train)
    expected = clf.predict(self.X_test)

    payload = pickle.dumps(clf)

    # Remove model file
    _cleanup()

    restored = pickle.loads(payload)
    actual = restored.predict(self.X_test)
    np.testing.assert_allclose(expected, actual)
def model_pred(trn_tmp_x, trn_tmp_y, val_tmp_x, val_tmp_y, tst_x):
    """Fit an RGF classifier and score the validation and test sets.

    ``val_tmp_y`` is accepted but not used by this function.

    Returns:
        tuple: ``(val_pred, tst_pred, best_iter)`` where the predictions are
        column 1 of ``predict_proba`` and ``best_iter`` is the ``max_leaf``
        value used (fixed at 1200).
    """
    best_iter = 1200
    rgf_params = dict(
        max_leaf=best_iter,  # Try increasing this as a starter
        algorithm="RGF",
        loss="Log",
        l2=0.01,
        normalize=False,
        min_samples_leaf=20,
        learning_rate=0.5,
        verbose=False,
    )
    fit_model = RGFClassifier(**rgf_params).fit(trn_tmp_x, trn_tmp_y)
    val_pred = fit_model.predict_proba(val_tmp_x)[:, 1]
    tst_pred = fit_model.predict_proba(tst_x)[:, 1]
    return val_pred, tst_pred, best_iter
def test_sample_weight(self):
    """Uniform weights must not change predictions; one dominant weight must decide them."""
    clf = RGFClassifier()
    n_train = self.y_train.shape[0]

    # All-ones weights must match the unweighted fit.
    unweighted = clf.fit(self.X_train, self.y_train).predict_proba(self.X_test)
    uniform = clf.fit(self.X_train,
                      self.y_train,
                      np.ones(n_train)).predict_proba(self.X_test)
    np.testing.assert_allclose(unweighted, uniform)

    # Give every sample the smallest positive float32 weight except the first.
    tiny = np.nextafter(np.float32(0), np.float32(1))
    weights = np.ones(n_train) * tiny
    weights[0] = 1
    dominated = clf.fit(self.X_train, self.y_train, weights).predict(self.X_test)
    expected = np.full(self.y_test.shape[0], self.y_test[0])
    np.testing.assert_equal(dominated, expected)
def rgf04(x_train, y_train, x_test, folds, max_round, n_splits=5):
    """Cross-validate an RGF classifier on features from ``feature_engineering_4``.

    ``max_round`` is accepted but not used by this function.

    Returns:
        tuple: ``(cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore)`` as exposed by
        the ``Cross_Validate`` helper after ``cross_validate`` has run.
    """
    rgf_params = dict(
        max_leaf=1000,
        algorithm="RGF",
        loss="Log",
        l2=0.01,
        sl2=0.01,
        normalize=False,
        min_samples_leaf=7,   # alternative noted by the author: 10
        n_iter=None,
        opt_interval=100,
        learning_rate=.45,    # alternative noted by the author: .3
        calc_prob="sigmoid",
        n_jobs=-2,
        memory_policy="generous",
        verbose=0,
    )
    clf = RGFClassifier(**rgf_params)

    # Additional processing of data
    x_train, x_test = feature_engineering_4(x_train, x_test, y_train)

    # Cross Validate
    n_train, n_test = x_train.shape[0], x_test.shape[0]
    cv = Cross_Validate(rgf04.__name__, n_splits, n_train, n_test, clf, -1, -1)
    cv.cross_validate(x_train, y_train, x_test, folds, verbose_eval=True)

    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
def set_model(self):
    """Store a freshly configured ``RGFClassifier`` on ``self.model``."""
    settings = dict(
        max_leaf=1000,
        algorithm="RGF",  # other options noted by the author: RGF_Sib, RGF_Opt
        loss="Log",
        l2=0.01,
        sl2=0.01,
        normalize=False,
        min_samples_leaf=10,
        n_iter=None,
        opt_interval=100,
        learning_rate=.5,
        calc_prob="sigmoid",
        n_jobs=-1,
        memory_policy="generous",
        verbose=0,
    )
    self.model = RGFClassifier(**settings)
def test_parallel_gridsearch(self):
    """A parallel grid search over ``max_leaf`` must give training accuracy above 0.95."""
    search = GridSearchCV(
        RGFClassifier(n_jobs=1),
        param_grid={'max_leaf': [100, 300]},
        refit=True,
        cv=2,
        verbose=0,
        n_jobs=-1,
    )
    search.fit(self.X_train, self.y_train)

    predicted = search.best_estimator_.predict(self.X_train)
    score = accuracy_score(self.y_train, predicted)
    failure_msg = "Failed with score = {0:.5f}".format(score)
    self.assertGreater(score, 0.95, failure_msg)
def test_params(self):
    """``fit`` must accept a valid parameter set and raise ``ValueError`` for each invalid value."""
    clf = RGFClassifier()

    valid_params = {
        'max_leaf': 300,
        'test_interval': 100,
        'algorithm': 'RGF_Sib',
        'loss': 'Log',
        'reg_depth': 1.1,
        'l2': 0.1,
        'sl2': None,
        'normalize': False,
        'min_samples_leaf': 9,
        'n_iter': None,
        'n_tree_search': 2,
        'opt_interval': 100,
        'learning_rate': 0.4,
        'verbose': True,
        'prefix': 'rgf_classifier',
        'inc_prefix': True,
        'calc_prob': 'Sigmoid',
        'clean': True,
    }
    clf.set_params(**valid_params)
    clf.fit(self.X_train, self.y_train)

    non_valid_params = {
        'max_leaf': 0,
        'test_interval': 0,
        'algorithm': 'RGF_Test',
        'loss': True,
        'reg_depth': 0.1,
        'l2': 11,
        'sl2': -1.1,
        'normalize': 'False',
        'min_samples_leaf': 0.7,
        'n_iter': 11.1,
        'n_tree_search': 0,
        'opt_interval': 100.1,
        'learning_rate': -0.5,
        'verbose': -1,
        'prefix': '',
        'inc_prefix': 1,
        'calc_prob': True,
        'clean': 0,
    }
    for name, bad_value in non_valid_params.items():
        # Start from the valid configuration, then break exactly one parameter.
        clf.set_params(**valid_params)
        clf.set_params(**{name: bad_value})
        with self.assertRaises(ValueError):
            clf.fit(self.X_train, self.y_train)
def model_selection(select, X):
    """Return ``(filename, model)`` for the model identified by ``select``.

    Any value of ``select`` that matches none of the known keys falls back
    to the LightGBM classifier. ``X`` is only passed on to ``keras_network``
    for the ``'kerasnn'`` choice.
    """
    # (key, output file stem, zero-argument factory). Factories keep
    # construction lazy so only the selected model is ever instantiated.
    choices = (
        # xgboost model
        ('xgb', "main_xgboost",
         lambda: XGBClassifier(num_round=1000,
                               nthread=25,
                               eta=0.02,
                               gamma=1,
                               max_depth=20,
                               min_child_weight=0.1,
                               subsample=0.9,
                               colsample_bytree=0.5,
                               objective='binary:logistic',
                               seed=1)),
        # regression greedy forest model
        ('rgf', "main_rgf",
         lambda: RGFClassifier(max_leaf=400,
                               algorithm="RGF",
                               test_interval=150,
                               loss="LS")),
        # LogisticRegression model
        ('logit', "main_logit",
         lambda: LogisticRegression(C=0.7, penalty="l2")),
        # KNearestNeighbor model
        ('knn', "main_knn",
         lambda: KNeighborsClassifier(n_neighbors=7)),
        # extratree model
        ('xtree', "main_xtratree",
         lambda: ExtraTreesClassifier(n_estimators=10000,
                                      criterion='entropy',
                                      max_depth=9,
                                      min_samples_leaf=1,
                                      n_jobs=30,
                                      random_state=1)),
        # RandomForest model
        ('rfc', "main_rfc",
         lambda: RandomForestClassifier(random_state=13)),
        # Catboost model
        ('cat', "main_catboost",
         lambda: CatBoostClassifier(iterations=80,
                                    depth=3,
                                    learning_rate=0.1,
                                    loss_function='Logloss')),
        # Support vector machine model
        ('svm', "main_svm",
         lambda: svm.SVC(kernel='linear', probability=True)),
        # Keras Neural network model
        ('kerasnn', "main_kerasnn",
         lambda: keras_network(X)),
    )

    for key, filename, build in choices:
        if select == key:
            return filename, build()

    # light gradient boosting model (default for every other value)
    filename = "main_lgmboost"
    model = lgb.LGBMClassifier(num_leaves=150,
                               objective='binary',
                               max_depth=6,
                               learning_rate=.01,
                               max_bin=400,
                               auc='binary_logloss')
    return filename, model
def objective(max_leaf, l2, min_samples_leaf, learning_rate):
    """Return the validation ROC AUC for one RGF hyper-parameter setting.

    ``max_leaf`` and ``min_samples_leaf`` are truncated to ``int`` before
    use, so callers may pass floats. Training and validation data come from
    the module-level ``train_m``/``label_m`` and ``train_val``/``label_val``.

    Returns:
        The ``roc_auc_score`` of the column-1 probabilities on the
        validation set.
    """
    # The int() casts already guarantee the type; the former
    # ``assert type(x) == int`` checks could never add information.
    max_leaf = int(max_leaf)
    min_samples_leaf = int(min_samples_leaf)

    model = RGFClassifier(
        max_leaf=max_leaf,
        l2=l2,
        min_samples_leaf=min_samples_leaf,
        learning_rate=learning_rate,
        algorithm="RGF_Sib",
        test_interval=100,
    )
    model.fit(train_m, label_m)
    pred_proba = model.predict_proba(train_val)
    score = roc_auc_score(label_val, pred_proba[:, 1])
    return score
def test_input_arrays_shape(self):
    """``fit`` must raise ``ValueError`` when ``y`` or the sample weights have the wrong shape."""
    clf = RGFClassifier()
    n_samples = self.y_train.shape[0]

    # y one element short.
    with self.assertRaises(ValueError):
        clf.fit(self.X_train, self.y_train[:(n_samples - 1)])

    # Sample weights one element short.
    with self.assertRaises(ValueError):
        clf.fit(self.X_train, self.y_train, np.ones(n_samples - 1))

    # Sample weights with an extra dimension.
    with self.assertRaises(ValueError):
        clf.fit(self.X_train, self.y_train, np.ones((n_samples, 2)))
def model(opt):
    """Return the mean 3-fold cross-validation score for the settings in ``opt``.

    ``opt`` must provide ``"max_leaf"``, ``"reg_depth"`` and
    ``"min_samples_leaf"``. The data are the module-level ``X`` and ``y``.
    """
    tuned = {name: opt[name]
             for name in ("max_leaf", "reg_depth", "min_samples_leaf")}
    rgf = RGFClassifier(algorithm="RGF_Sib",
                        test_interval=100,
                        verbose=False,
                        **tuned)
    return cross_val_score(rgf, X, y, cv=3).mean()
def test_params(self):
    """``fit`` must accept a valid parameter set and raise ``ValueError`` for each invalid value."""
    clf = RGFClassifier()

    valid_params = {
        'max_leaf': 300,
        'test_interval': 100,
        'algorithm': 'RGF_Sib',
        'loss': 'Log',
        'reg_depth': 1.1,
        'l2': 0.1,
        'sl2': None,
        'normalize': False,
        'min_samples_leaf': 9,
        'n_iter': None,
        'n_tree_search': 2,
        'opt_interval': 100,
        'learning_rate': 0.4,
        'calc_prob': 'sigmoid',
        'n_jobs': -1,
        'memory_policy': 'conservative',
        'verbose': True,
    }
    clf.set_params(**valid_params)
    clf.fit(self.X_train, self.y_train)

    non_valid_params = {
        'max_leaf': 0,
        'test_interval': 0,
        'algorithm': 'RGF_Test',
        'loss': True,
        'reg_depth': 0.1,
        'l2': 11,
        'sl2': -1.1,
        'normalize': 'False',
        'min_samples_leaf': 0.7,
        'n_iter': 11.1,
        'n_tree_search': 0,
        'opt_interval': 100.1,
        'learning_rate': -0.5,
        'calc_prob': True,
        'n_jobs': '-1',
        'memory_policy': 'Generos',
        'verbose': -1,
    }
    for name, bad_value in non_valid_params.items():
        # Start from the valid configuration, then break exactly one parameter.
        clf.set_params(**valid_params)
        clf.set_params(**{name: bad_value})
        with self.assertRaises(ValueError):
            clf.fit(self.X_train, self.y_train)
def test_softmax_classifier(self):
    """Softmax probabilities must sum to one per row and accuracy must exceed 0.8."""
    features, labels = self.iris.data, self.iris.target
    clf = RGFClassifier(prefix='clf', calc_prob='Softmax')
    clf.fit(features, labels)

    row_totals = clf.predict_proba(features).sum(axis=1)
    np.testing.assert_almost_equal(row_totals, np.ones(labels.shape[0]))

    score = clf.score(features, labels)
    print('Score: {0:.5f}'.format(score))
    failure_msg = "Failed with score = {0:.5f}".format(score)
    self.assertGreater(score, 0.8, failure_msg)
def test_bin_classifier(self):
    """Binary task (iris class 2 vs rest): probabilities sum to one, accuracy exceeds 0.8."""
    features = self.iris.data
    # Collapse the three iris classes into "is class 2" / "is not class 2".
    bin_target = (self.iris.target == 2).astype(int)

    clf = RGFClassifier(prefix='clf')
    clf.fit(features, bin_target)

    row_totals = clf.predict_proba(features).sum(axis=1)
    np.testing.assert_almost_equal(row_totals, np.ones(bin_target.shape[0]))

    score = clf.score(features, bin_target)
    print('Score: {0:.5f}'.format(score))
    failure_msg = "Failed with score = {0:.5f}".format(score)
    self.assertGreater(score, 0.8, failure_msg)
def __init__(self, task, fast=False):
    """Choose metric and model for ``task`` and reset all data/result slots.

    ``task == 'classification'`` selects a classifier scored with
    ``'roc_auc'``; every other value is treated as regression scored with
    ``'neg_mean_squared_error'``. ``fast`` switches to the FastRGF variant.
    """
    if task == 'classification':
        self.metric, self.task = 'roc_auc', "classification"
        self.model = FastRGFClassifier() if fast else RGFClassifier(loss="Log")
    else:
        self.metric, self.task = 'neg_mean_squared_error', "regression"
        self.model = (FastRGFRegressor() if fast
                      else RGFRegressor(loss="LS", normalize=True))

    # Data splits and results start out empty.
    for slot in ('X_test', 'X_train', 'y_test', 'y_train',
                 'grid_search', 'y_predict', 'test_score'):
        setattr(self, slot, None)
def train(params):
    """Train an RGF classifier on the preprocessed dataset and log results to MLflow.

    Args:
        params: Mapping of keyword arguments for ``RGFClassifier``; each
            entry is also logged as an MLflow parameter.

    Logs the metrics ``log_loss``, ``precision``, ``recall`` and ``f1``
    computed on a stratified 20% hold-out split, then prints the fitted
    model's ``feature_importances_``.
    """
    # log hyperparams for this run
    for k, v in params.items():
        mlflow.log_param(k, v)

    # load dataset files
    # NOTE: to get meta data, set allow_pickle=True for np.load, then index
    # into dataset object with key 'meta'
    # The archive is opened as a context manager so its file handle is
    # closed once both arrays have been read.
    with np.load('preprocessed/dataset.npz') as dataset:
        X_arr = dataset['X_arr']
        Y_arr = dataset['Y_arr']

    # split for train-test
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_arr, Y_arr, stratify=Y_arr, test_size=0.2)

    # instantiate model with params
    rgf_clf = RGFClassifier(**params)
    rgf_clf.fit(X_train, Y_train)

    # predict on test data
    Y_pred = rgf_clf.predict(X_test)
    Y_pred_proba = rgf_clf.predict_proba(X_test)

    # log logistic loss value
    logistic_loss = log_loss(Y_test, Y_pred_proba)
    mlflow.log_metric('log_loss', logistic_loss)

    # log precision, recall, f1
    p, r, f, _ = precision_recall_fscore_support(
        y_true=Y_test, y_pred=Y_pred, average='binary')
    mlflow.log_metric('precision', p)
    mlflow.log_metric('recall', r)
    mlflow.log_metric('f1', f)

    # which features matter the most
    print("========== FEATURE IMPORTANCES ==========")
    print(rgf_clf.feature_importances_)
def rgf(df: pd.DataFrame, target: pd.DataFrame, test: pd.DataFrame, parameters: Dict):
    """Produce out-of-fold and fold-averaged test probabilities with RGF.

    Runs 5-fold cross-validation (shuffled, ``random_state=42``). Validation
    probabilities fill the first ``len(target)`` rows of the output matrix;
    the test probabilities of every fold are averaged into the remaining
    rows. The full matrix is written to ``data/09_oof/rgf_3.csv``.

    The output matrix is hard-coded to 9 columns, so ``predict_proba`` is
    expected to return 9 classes. ``parameters`` is currently unused.

    Returns:
        The averaged test probabilities (rows after ``len(target)``) as a
        NumPy array.
    """
    n_splits = 5
    # n_neighbors = parameters["n_neighbors"]
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof = np.zeros((df.shape[0] + test.shape[0], 9))

    for trn_idx, val_idx in folds.split(df, target):
        train_x = df.iloc[trn_idx, :].values
        val_x = df.iloc[val_idx, :].values
        # KFold yields positional indices, so rows must be selected by
        # position. Plain ``target[idx]`` is label-based and breaks for a
        # non-default index (or selects columns on a DataFrame).
        train_y = target.iloc[trn_idx].values
        val_y = target.iloc[val_idx].values

        classifier = RGFClassifier(
            n_jobs=14,
            algorithm="RGF",
            loss="Log",
        )
        classifier.fit(train_x, train_y)

        y_hat = classifier.predict_proba(val_x)
        print(log_loss(val_y, y_hat))
        print(oof.shape, y_hat.shape)
        oof[val_idx] = y_hat

        # Each fold contributes 1/n_splits of the final test prediction.
        pred = classifier.predict_proba(test.values)
        oof[len(target):, :] += pred / n_splits

    print(oof.shape)
    # np.save("data/04_features/oof.npz", oof)
    # oof = np.load("data/04_features/oof.npy")
    oof = pd.DataFrame(oof)
    oof.to_csv("data/09_oof/rgf_{}.csv".format(3))
    return oof[len(target):].values
from sklearn import datasets from sklearn.utils.validation import check_random_state from sklearn.model_selection import StratifiedKFold, cross_val_score from sklearn.ensemble import GradientBoostingClassifier from rgf.sklearn import RGFClassifier iris = datasets.load_iris() rng = check_random_state(0) perm = rng.permutation(iris.target.size) iris.data = iris.data[perm] iris.target = iris.target[perm] rgf = RGFClassifier(max_leaf=400, algorithm="RGF_Sib", test_interval=100, verbose=True) gb = GradientBoostingClassifier(n_estimators=20, learning_rate=0.01, subsample=0.6, random_state=rng) n_folds = 3 rgf_scores = cross_val_score(rgf, iris.data, iris.target, cv=StratifiedKFold(n_folds)) gb_scores = cross_val_score(gb, iris.data, iris.target,
if __name__ == '__main__': print(sys.argv) if len(sys.argv) >= 2: model = model_set[sys.argv[1]] else: model = model_set['xgb1'] metric = model['metric'] if model['mdl_type'] == 'xgb': mdl = XGBClassifier(**model['param']) elif model['mdl_type'] == 'lgb': mdl = LGBMClassifier(**model['param']) elif model['mdl_type'] == 'rgf': mdl = RGFClassifier(**model['param']) elif model['mdl_type'] == 'lr': mdl = LogisticRegression(**model['param']) elif model['mdl_type'] == 'xgbl': mdl = XGBRegressor(**model['param']) elif model['mdl_type'] == 'mlp': mdl = MLPClassifier(**model['param']) train_pred, test_pred, mean, std, full_score = five_fold_with_baging( l2_train.values, y, l2_test.values, train_id, test_id, metric, mdl,
subsample=.8, min_child_weight=6, colsample_bytree=.8, scale_pos_weight=1.6, gamma=10, reg_alpha=8, reg_lambda=1.3, ) rgf = RGFClassifier( # See https://www.kaggle.com/scirpus/regularized-greedy-forest#241285 max_leaf=1200, # Parameters suggested by olivier in link above algorithm="RGF", loss="Log", l2=0.01, sl2=0.01, normalize=False, min_samples_leaf=10, n_iter=None, opt_interval=100, learning_rate=.5, calc_prob="sigmoid", n_jobs=-1, memory_policy="generous", verbose=0) gini_results = [] # Run CV for i, (train_index, test_index) in enumerate(kf.split(train_df)): # Create data for this fold y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
def train_classifiers(X_data, y):
    """Grid-search several classifier families on one train/test split.

    The data are split 70/30 with ``random_state=7`` and every estimator,
    together with its parameter grid, is handed to
    ``train_single_classifier_type``.

    :param X_data: feature matrix passed to ``train_test_split``
    :param y: target values passed to ``train_test_split``
    :return: flat tuple of ``(model, grid)`` pairs for SVM, XGBoost,
        Random Forest, Extra Trees, LGBM, RGF and FastRGF, in that order.
        The KNN search is run but its result is not returned.
    """
    # Split the dataset into Train and Test
    seed = 7
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y, test_size=test_size, random_state=seed)

    def run_search(estimator, label, grid):
        # Every family is searched on the same split.
        return train_single_classifier_type(
            estimator, label, grid, X_train, X_test, y_train, y_test)

    svm_model, svm_grid = run_search(SVC(), "SVM", {
        'C': [1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.001, 0.0001],
        'kernel': ['linear', 'rbf'],
    })

    knn_model, knn_grid = run_search(KNeighborsClassifier(), "KNN", {
        'n_neighbors': [5, 6, 7, 8, 9, 10],
        'leaf_size': [1, 2, 3, 5],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'n_jobs': [-1],
    })

    # XGBoost: exhaustive scan. Author's notes: max_depth is usually 6-8,
    # the learning rate sits around 0.05 but is sensitive to small changes,
    # min_child_weight / subsample / colsample_bytree fight overfitting,
    # n_estimators is the number of boosting rounds, and ensembling over
    # several seeds may reduce variance.
    train_model1, xgb_grid = run_search(xgb.XGBClassifier(), "XGBoost", {
        'nthread': [4],  # with hyperthreading xgboost may become slower
        'objective': ['binary:logistic'],
        'learning_rate': [0.05, 0.1],  # the so-called `eta` value
        'max_depth': [6, 7, 8],
        'min_child_weight': [1, 11],
        'silent': [1],
        'subsample': [0.8],
        'colsample_bytree': [0.7, 0.8],
        'n_estimators': [5, 100, 1000],  # number of trees; 1000 for better results
        'missing': [-999],
        'seed': [1337],
    })

    rfc_model, rfc_grid = run_search(RandomForestClassifier(), "Random Forest", {
        'max_depth': [4, 5, 6],
        'n_estimators': [100, 200],
        'criterion': ['gini', 'entropy'],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_leaf': [2, 4],
        'min_samples_split': [2, 5, 10],
    })

    ext_model, ext_grid = run_search(ExtraTreesClassifier(), "Extra Trees", {
        'n_estimators': [50, 100],
        'max_features': [5, 10, 25],
        'min_samples_leaf': [2, 5, 10],
        'min_samples_split': [2, 5, 10],
    })

    lgbm_estimator = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        n_jobs=-1,  # replaces the older 'nthread'
        silent=True)
    lgbm_model, lgbm_grid = run_search(lgbm_estimator, "LGBM", {
        'max_depth': [5, 6, 7, 8, 9, 10, 15, 20],
        'learning_rate': [0.005],
        'n_estimators': [100, 150, 500],
        'num_leaves': [6, 8, 12, 16],
        'boosting_type': ['gbdt'],
        'objective': ['binary'],
        'random_state': [501],  # replaces the older 'seed'
        'colsample_bytree': [0.65],
        'subsample': [0.7],
        'reg_alpha': [1, 10],
        'reg_lambda': [10, 100],
    })

    rgf_model, rgf_grid = run_search(RGFClassifier(), "RGF", {
        'max_leaf': [900],
        'l2': [0.1, 0.05, 1.0],
        'min_samples_leaf': [5, 4, 3],
        'algorithm': ["RGF", "RGF_Opt", "RGF_Sib"],
        'loss': ["Log"],
    })

    frgf_model, frgf_grid = run_search(FastRGFClassifier(), "FRGF", {
        'max_leaf': [100, 200, 900],
        'n_estimators': [100, 1000],
        'max_bin': [10, 100],
        'l2': [0.1, 100, 1000],
        'min_samples_leaf': [5, 6],
        'opt_algorithm': ['rgf'],
        'loss': ["LS"],
    })

    return (svm_model, svm_grid,
            train_model1, xgb_grid,
            rfc_model, rfc_grid,
            ext_model, ext_grid,
            lgbm_model, lgbm_grid,
            rgf_model, rgf_grid,
            frgf_model, frgf_grid)
def train_predict(train_df, test_df, params, model_name=None):
    """Run 5-fold stratified CV with RGF and write OOF and test predictions.

    Args:
        train_df: Training frame with ``id`` and ``target`` columns; all
            other columns are used as features.
        test_df: Test frame with an ``id`` column and the same features.
        params: Keyword arguments for ``RGFClassifier``.
        model_name: Stem for the log and output files; defaults to
            ``'l1_rgf'``.

    Out-of-fold probabilities are written to ``local_cv/<model_name>.csv.gz``
    and fold-averaged test probabilities to
    ``submission/<model_name>.csv.gz``. After every fold the entries matching
    ``/tmp/rgf/*`` are deleted on a best-effort basis.

    Returns:
        The mean of the per-fold ``gini_norm`` scores.
    """
    import shutil

    if model_name is None:
        #model_name = 'l1_rgf_%s'%datetime.now().strftime('%m%d%H%M')
        model_name = 'l1_rgf'

    log = Logger(os.path.join('log', '%s.log' % model_name))

    cols = [c for c in train_df.columns if c not in ['id', 'target']]

    log.info('Features:')
    for col in cols:
        log.info('- %s' % col)
    log.info('\n')

    log.info('Parameters:')
    for param_name, param_value in params.items():
        log.info('- %s: %s' % (param_name, str(param_value)))
    log.info('\n')

    X = train_df[cols].values
    y = train_df['target'].values
    X_test = test_df[cols].values

    prob_train = np.zeros(len(X))
    prob_test = np.zeros(len(X_test))

    kfold = 5
    scores = []
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=41)
    for i, (train_ind, valid_ind) in enumerate(skf.split(X, y)):
        X_train, X_valid = X[train_ind], X[valid_ind]
        y_train, y_valid = y[train_ind], y[valid_ind]

        model = RGFClassifier(**params)
        model.fit(X_train, y_train)

        prob = model.predict_proba(X_valid)[:, 1]
        prob_train[valid_ind] = prob
        score = gini_norm(prob, y_valid)
        scores.append(score)
        log.info('- Fold %d/%d score: %f' % (i + 1, kfold, score))

        # Each fold contributes 1/kfold of the final test prediction.
        prob = model.predict_proba(X_test)[:, 1]
        prob_test += prob / kfold

        # Delete in-process rather than via ``rm -rf`` in a shell: a failing
        # shell command does not raise, so the old code reported success
        # even when nothing had been removed.
        cleanup_failed = False
        for path in glob.glob("/tmp/rgf/*"):
            try:
                if os.path.isdir(path) and not os.path.islink(path):
                    shutil.rmtree(path)
                else:
                    os.remove(path)
            except OSError as e:
                cleanup_failed = True
                print(str(e))
        if not cleanup_failed:
            print("Clean up is successfull")
        print(glob.glob("/tmp/rgf/*"))

    mean_score = np.mean(scores)
    log.info('- Mean score: %f' % mean_score)

    prob_train_df = pd.DataFrame({'id': train_df['id'], 'target': prob_train})
    prob_train_df.to_csv(os.path.join('local_cv', '%s.csv.gz' % model_name),
                         index=False, compression='gzip')
    prob_test_df = pd.DataFrame({'id': test_df['id'], 'target': prob_test})
    prob_test_df.to_csv(os.path.join('submission', '%s.csv.gz' % model_name),
                        index=False, compression='gzip')

    return mean_score
# Set up folds K = Number_of_folds kf = comm_skf sl2_list = [0.08, 0.09, 0.11, 0.12] for sl2 in sl2_list: logging.info('test with sl2 : {0}'.format(sl2)) # Set up classifier model = RGFClassifier( max_leaf=1200, algorithm="RGF", loss="Log", l2=0.012, sl2=sl2, normalize=False, min_samples_leaf=10, n_iter=None, opt_interval=100, learning_rate=0.5, calc_prob="sigmoid", n_jobs=-1, memory_policy="generous", verbose=0 ) # Run CV logging.info('feature shape {0}'.format(X.shape)) for i, (train_index, test_index) in enumerate(kf.split(train_df, y)): # Create data for this fold y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
import time
from sklearn import datasets
from sklearn.utils.validation import check_random_state
from sklearn.ensemble import GradientBoostingClassifier
from rgf.sklearn import RGFClassifier, FastRGFClassifier

# Load iris and shuffle samples and targets with one permutation drawn
# from a fixed seed (0).
iris = datasets.load_iris()
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

# Time RGF: fit and score on the same (training) data.
start = time.time()
clf = RGFClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("RGF: {} sec".format(end - start))
print("score: {}".format(score))

# Time FastRGF the same way.
start = time.time()
clf = FastRGFClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("FastRGF: {} sec".format(end - start))
print("score: {}".format(score))

# Start timing the gradient-boosting baseline (the rest of this run is
# not visible in this excerpt).
start = time.time()
clf = GradientBoostingClassifier()
xgb_homeless_bag_kf = layer1.Layer1Train(XGBClassifier(**xgb_homeless_params), 'xgb_homeless_bag_kf', drop_stupid='default', cat_transform='smooth', recon_category=True, engineer_stats=True) xgb_18 = layer1.Layer1Train(XGBClassifier(**xgb_18_params), 'xgb_18', drop_stupid='default', cat_transform='smooth', recon_category=True, data_transform='round', engineer_stats=True) rgf_bojan_na_bag_kf = layer1.Layer1Train(RGFClassifier(**rgf_bojan_params), 'rgf_bojan_na_bag_kf', drop_stupid='default', cat_transform='smooth', recon_category=True, engineer_stats=True) # fm_sgd = layer1.Layer1Train(sgd.FMClassification(**fm_sgd_params), 'fm_sgd', # drop_stupid=True, cat_transform='onehot', recon_category=True, engineer_stats=True) # # fm_als = layer1.Layer1Train(sgd.FMClassification(**fm_als_params), 'fm_als', # drop_stupid=True, cat_transform='smooth', data_transform='log', recon_category=True, engineer_stats=True) cat_1 = layer1.Layer1Train(CatBoostClassifier(**cat_1_params), 'cat_1', drop_stupid='default',
def test_cleanup(self):
    """``cleanup`` must remove one model's files without affecting another fitted model."""
    clf1 = RGFClassifier()
    clf1.fit(self.X_train, self.y_train)
    clf2 = RGFClassifier()
    clf2.fit(self.X_train, self.y_train)

    # First call returns a non-zero value; a second call returns zero.
    self.assertNotEqual(clf1.cleanup(), 0)
    self.assertEqual(clf1.cleanup(), 0)

    # No file with any of clf1's estimator prefixes may remain.
    for est in clf1.estimators_:
        pattern = os.path.join(_get_temp_path(), est._file_prefix + "*")
        leftovers = glob.glob(pattern)
        self.assertFalse(leftovers)

    # clf1 now behaves as unfitted, while clf2 can still predict.
    with self.assertRaises(NotFittedError):
        clf1.predict(self.X_test)
    clf2.predict(self.X_test)