Example #1
    def test_attributes(self):
        clf = RGFClassifier()
        attributes = ('estimators_', 'classes_', 'n_classes_', 'n_features_', 'fitted_',
                      'sl2_', 'min_samples_leaf_', 'n_iter_')

        for attr in attributes:
            self.assertRaises(NotFittedError, getattr, clf, attr)
        clf.fit(self.X_train, self.y_train)
        self.assertEqual(len(clf.estimators_), len(np.unique(self.y_train)))
        np.testing.assert_array_equal(clf.classes_, sorted(np.unique(self.y_train)))
        self.assertEqual(clf.n_classes_, len(clf.estimators_))
        self.assertEqual(clf.n_features_, self.X_train.shape[-1])
        self.assertTrue(clf.fitted_)
        if clf.sl2 is None:
            self.assertEqual(clf.sl2_, clf.l2)
        else:
            self.assertEqual(clf.sl2_, clf.sl2)
        if clf.min_samples_leaf < 1:
            self.assertLessEqual(clf.min_samples_leaf_, 0.5 * self.X_train.shape[0])
        else:
            self.assertEqual(clf.min_samples_leaf_, clf.min_samples_leaf)
        if clf.n_iter is None:
            if clf.loss == "LS":
                self.assertEqual(clf.n_iter_, 10)
            else:
                self.assertEqual(clf.n_iter_, 5)
        else:
            self.assertEqual(clf.n_iter_, clf.n_iter)
Example #2
def run_rgf():
    model = RGFClassifier(max_leaf=1000,
                          algorithm="RGF",
                          loss="Log",
                          l2=0.01,
                          sl2=0.01,
                          normalize=False,
                          min_samples_leaf=10,
                          n_iter=None,
                          opt_interval=100,
                          learning_rate=.5,
                          calc_prob="sigmoid",
                          n_jobs=-1,
                          memory_policy="generous",
                          verbose=0)

    fit_model = model.fit(X_train, y_train)
    pred = fit_model.predict_proba(X_valid)[:, 1]
    pred_test = fit_model.predict_proba(X_test)[:, 1]
    try:
        subprocess.call('rm -rf /tmp/rgf/*', shell=True)
        print("Clean up is successfull")
        print(glob.glob("/tmp/rgf/*"))
    except Exception as e:
        print(str(e))

    return pred, pred_test
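The rm -rf /tmp/rgf/* shell call above only works on Unix-like systems. A minimal sketch of a more portable variant, assuming the fitted estimator's cleanup() method (exercised in Example #37 at the bottom of this page) is available in your rgf_python version:

def run_rgf_portable():
    # Same model setup as above, shortened for brevity.
    model = RGFClassifier(max_leaf=1000, algorithm="RGF", loss="Log")
    fit_model = model.fit(X_train, y_train)
    pred = fit_model.predict_proba(X_valid)[:, 1]
    pred_test = fit_model.predict_proba(X_test)[:, 1]
    # cleanup() deletes this model's temporary files and returns how many
    # were removed (see the assertions in Example #37). The estimator can
    # no longer predict afterwards, so call it last.
    n_removed = fit_model.cleanup()
    print("Removed {0} temporary model files".format(n_removed))
    return pred, pred_test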
Example #3
    def test_attributes(self):
        clf = RGFClassifier()
        attributes = ('estimators_', 'classes_', 'n_classes_', 'n_features_',
                      'fitted_', 'sl2_', 'min_samples_leaf_', 'n_iter_')

        for attr in attributes:
            self.assertRaises(NotFittedError, getattr, clf, attr)
        clf.fit(self.X_train, self.y_train)
        self.assertEqual(len(clf.estimators_), len(np.unique(self.y_train)))
        np.testing.assert_array_equal(clf.classes_,
                                      sorted(np.unique(self.y_train)))
        self.assertEqual(clf.n_classes_, len(clf.estimators_))
        self.assertEqual(clf.n_features_, self.X_train.shape[-1])
        self.assertTrue(clf.fitted_)
        if clf.sl2 is None:
            self.assertEqual(clf.sl2_, clf.l2)
        else:
            self.assertEqual(clf.sl2_, clf.sl2)
        if clf.min_samples_leaf < 1:
            self.assertLessEqual(clf.min_samples_leaf_,
                                 0.5 * self.X_train.shape[0])
        else:
            self.assertEqual(clf.min_samples_leaf_, clf.min_samples_leaf)
        if clf.n_iter is None:
            if clf.loss == "LS":
                self.assertEqual(clf.n_iter_, 10)
            else:
                self.assertEqual(clf.n_iter_, 5)
        else:
            self.assertEqual(clf.n_iter_, clf.n_iter)
Example #4
    def test_classifier_sparse_input(self):
        clf = RGFClassifier(prefix='clf', calc_prob='Softmax')
        for sparse_format in (csr_matrix, csc_matrix, coo_matrix):
            iris_sparse = sparse_format(self.iris.data)
            clf.fit(iris_sparse, self.iris.target)
            score = clf.score(iris_sparse, self.iris.target)
            self.assertGreater(score, 0.8, "Failed with score = {0:.5f}".format(score))
Example #5
def run_rgf():
    model = RGFClassifier(
        max_leaf=1000,
        algorithm="RGF",
        loss="Log",
        l2=0.01,
        sl2=0.01,
        normalize=False,
        min_samples_leaf=10,
        n_iter=None,
        opt_interval=100,
        learning_rate=.5,
        calc_prob="sigmoid",
        n_jobs=-1,
        memory_policy="generous",
        verbose=0
    )

    fit_model = model.fit(X_train, y_train)
    pred = fit_model.predict_proba(X_valid)[:, 1]
    pred_test = fit_model.predict_proba(X_test)[:, 1]
    try:
        subprocess.call('rm -rf /tmp/rgf/*', shell=True)
        print("Clean up is successful")
        print(glob.glob("/tmp/rgf/*"))
    except Exception as e:
        print(str(e))

    return pred, pred_test
Example #6
    def test_classifier_sparse_input(self):
        clf = RGFClassifier(calc_prob='softmax')
        for sparse_format in (sparse.bsr_matrix, sparse.coo_matrix, sparse.csc_matrix,
                              sparse.csr_matrix, sparse.dia_matrix, sparse.dok_matrix,
                              sparse.lil_matrix):
            iris_sparse = sparse_format(self.iris.data)
            clf.fit(iris_sparse, self.iris.target)
            score = clf.score(iris_sparse, self.iris.target)
            self.assertGreater(score, 0.8, "Failed with score = {0:.5f}".format(score))
Example #7
    def test_softmax_classifier(self):
        clf = RGFClassifier(calc_prob='softmax')
        clf.fit(self.iris.data, self.iris.target)

        proba_sum = clf.predict_proba(self.iris.data).sum(axis=1)
        np.testing.assert_almost_equal(proba_sum, np.ones(self.iris.target.shape[0]))

        score = clf.score(self.iris.data, self.iris.target)
        print('Score: {0:.5f}'.format(score))
        self.assertGreater(score, 0.8, "Failed with score = {0:.5f}".format(score))
Example #8
    def test_bin_classifier(self):
        clf = RGFClassifier()
        bin_target = (self.iris.target == 2).astype(int)
        clf.fit(self.iris.data, bin_target)

        proba_sum = clf.predict_proba(self.iris.data).sum(axis=1)
        np.testing.assert_almost_equal(proba_sum, np.ones(bin_target.shape[0]))

        score = clf.score(self.iris.data, bin_target)
        print('Score: {0:.5f}'.format(score))
        self.assertGreater(score, 0.8, "Failed with score = {0:.5f}".format(score))
Example #9
    def test_string_y(self):
        clf = RGFClassifier()

        y_str = np.array(self.iris.target, dtype=str)
        y_str[y_str == '0'] = 'Zero'
        y_str[y_str == '1'] = 'One'
        y_str[y_str == '2'] = 'Two'

        clf.fit(self.iris.data, y_str)
        y_pred = clf.predict(self.iris.data)
        score = accuracy_score(y_str, y_pred)
        self.assertGreater(score, 0.95, "Failed with score = {0:.5f}".format(score))
Example #10
    def test_joblib_pickle(self):
        clf = RGFClassifier()
        clf.fit(self.X_train, self.y_train)
        y_pred1 = clf.predict(self.X_test)
        joblib.dump(clf, 'test_clf.pkl')

        # Remove model file
        _cleanup()

        clf2 = joblib.load('test_clf.pkl')
        y_pred2 = clf2.predict(self.X_test)

        np.testing.assert_allclose(y_pred1, y_pred2)
Example #11
    def test_pickle(self):
        clf = RGFClassifier()
        clf.fit(self.X_train, self.y_train)
        y_pred1 = clf.predict(self.X_test)
        s = pickle.dumps(clf)

        # Remove model file
        _cleanup()

        reg2 = pickle.loads(s)
        y_pred2 = reg2.predict(self.X_test)

        np.testing.assert_allclose(y_pred1, y_pred2)
Example #12
def model_pred(trn_tmp_x, trn_tmp_y, val_tmp_x, val_tmp_y, tst_x):
    best_iter = 1200
    model = RGFClassifier(max_leaf=best_iter,  # Try increasing this as a starter
                          algorithm="RGF",
                          loss="Log",
                          l2=0.01,
                          normalize=False,
                          min_samples_leaf=20,
                          learning_rate=0.5,
                          verbose=False)

    fit_model = model.fit(trn_tmp_x, trn_tmp_y)

    return fit_model.predict_proba(val_tmp_x)[:, 1], fit_model.predict_proba(tst_x)[:, 1], best_iter
Example #13
    def test_sample_weight(self):
        clf = RGFClassifier()

        y_pred = clf.fit(self.X_train, self.y_train).predict_proba(self.X_test)
        y_pred_weighted = clf.fit(self.X_train,
                                  self.y_train,
                                  np.ones(self.y_train.shape[0])
                                  ).predict_proba(self.X_test)
        np.testing.assert_allclose(y_pred, y_pred_weighted)

        weights = np.ones(self.y_train.shape[0]) * np.nextafter(np.float32(0), np.float32(1))
        weights[0] = 1
        y_pred_weighted = clf.fit(self.X_train, self.y_train, weights).predict(self.X_test)
        np.testing.assert_equal(y_pred_weighted, np.full(self.y_test.shape[0], self.y_test[0]))
Example #15
def rgf04(x_train, y_train, x_test, folds, max_round, n_splits=5):
    clf = RGFClassifier(
        max_leaf=1000,
        algorithm="RGF",
        loss="Log",
        l2=0.01,
        sl2=0.01,
        normalize=False,
        min_samples_leaf=7,  # 10,
        n_iter=None,
        opt_interval=100,
        learning_rate=.45,  # .3,
        calc_prob="sigmoid",
        n_jobs=-2,
        memory_policy="generous",
        verbose=0)

    # Additional processing of data
    x_train, x_test = feature_engineering_4(x_train, x_test, y_train)

    # Cross Validate
    cv = Cross_Validate(rgf04.__name__, n_splits, x_train.shape[0],
                        x_test.shape[0], clf, -1, -1)
    cv.cross_validate(x_train, y_train, x_test, folds, verbose_eval=True)

    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
Example #16
    def set_model(self):
        self.model = RGFClassifier(max_leaf=1000,  # 1000,
                                   algorithm="RGF",  # RGF_Sib, RGF_Opt
                                   loss="Log",
                                   l2=0.01,
                                   sl2=0.01,
                                   normalize=False,
                                   min_samples_leaf=10,
                                   n_iter=None,
                                   opt_interval=100,
                                   learning_rate=.5,
                                   calc_prob="sigmoid",
                                   n_jobs=-1,
                                   memory_policy="generous",
                                   verbose=0)
Example #17
    def test_parallel_gridsearch(self):
        param_grid = dict(max_leaf=[100, 300])
        grid = GridSearchCV(RGFClassifier(n_jobs=1),
                            param_grid=param_grid, refit=True, cv=2, verbose=0, n_jobs=-1)
        grid.fit(self.X_train, self.y_train)
        y_pred = grid.best_estimator_.predict(self.X_train)
        score = accuracy_score(self.y_train, y_pred)
        self.assertGreater(score, 0.95, "Failed with score = {0:.5f}".format(score))
Example #18
    def test_params(self):
        clf = RGFClassifier()

        valid_params = dict(max_leaf=300,
                            test_interval=100,
                            algorithm='RGF_Sib',
                            loss='Log',
                            reg_depth=1.1,
                            l2=0.1,
                            sl2=None,
                            normalize=False,
                            min_samples_leaf=9,
                            n_iter=None,
                            n_tree_search=2,
                            opt_interval=100,
                            learning_rate=0.4,
                            verbose=True,
                            prefix='rgf_classifier',
                            inc_prefix=True,
                            calc_prob='Sigmoid',
                            clean=True)
        clf.set_params(**valid_params)
        clf.fit(self.X_train, self.y_train)

        non_valid_params = dict(max_leaf=0,
                                test_interval=0,
                                algorithm='RGF_Test',
                                loss=True,
                                reg_depth=0.1,
                                l2=11,
                                sl2=-1.1,
                                normalize='False',
                                min_samples_leaf=0.7,
                                n_iter=11.1,
                                n_tree_search=0,
                                opt_interval=100.1,
                                learning_rate=-0.5,
                                verbose=-1,
                                prefix='',
                                inc_prefix=1,
                                calc_prob=True,
                                clean=0)
        for key in non_valid_params:
            clf.set_params(**valid_params)  # Reset to valid params
            clf.set_params(**{key: non_valid_params[key]})  # Pick and set one invalid parameter
            self.assertRaises(ValueError, clf.fit, self.X_train, self.y_train)
Example #19
def model_selection(select, X):
    if select == 'xgb':
        filename = "main_xgboost"  # xgboost model
        model = XGBClassifier(num_round=1000,
                              nthread=25,
                              eta=0.02,
                              gamma=1,
                              max_depth=20,
                              min_child_weight=0.1,
                              subsample=0.9,
                              colsample_bytree=0.5,
                              objective='binary:logistic',
                              seed=1)
    elif select == 'rgf':
        filename = "main_rgf"  # regression greedy forest model
        model = RGFClassifier(max_leaf=400,
                              algorithm="RGF",
                              test_interval=150,
                              loss="LS")
    elif select == 'logit':
        filename = "main_logit"  # LogisticRegression model
        model = LogisticRegression(C=0.7, penalty="l2")
    elif select == 'knn':
        filename = "main_knn"  # KNearestNeighbor model
        model = KNeighborsClassifier(n_neighbors=7)
    elif select == 'xtree':
        filename = "main_xtratree"  # extratree model
        model = ExtraTreesClassifier(n_estimators=10000,
                                     criterion='entropy',
                                     max_depth=9,
                                     min_samples_leaf=1,
                                     n_jobs=30,
                                     random_state=1)
    elif select == 'rfc':
        filename = "main_rfc"  # RandomForest model
        model = RandomForestClassifier(random_state=13)
    elif select == 'cat':
        filename = "main_catboost"  # Catboost model
        model = CatBoostClassifier(iterations=80,
                                   depth=3,
                                   learning_rate=0.1,
                                   loss_function='Logloss')
    elif select == 'svm':
        filename = "main_svm"  # Support vector machine model
        model = svm.SVC(kernel='linear', probability=True)
    elif select == 'kerasnn':
        filename = "main_kerasnn"  # Keras Neural network model
        model = keras_network(X)
    else:
        filename = "main_lgmboost"  # light gradient boosting model
        model = lgb.LGBMClassifier(num_leaves=150,
                                   objective='binary',
                                   max_depth=6,
                                   learning_rate=.01,
                                   max_bin=400,
                                   auc='binary_logloss')
    return filename, model
Example #20
    def objective(max_leaf, l2, min_samples_leaf, learning_rate):
        max_leaf = int(max_leaf)
        min_samples_leaf = int(min_samples_leaf)

        assert type(max_leaf) == int
        assert type(min_samples_leaf) == int

        model = RGFClassifier(
            max_leaf=max_leaf,
            l2=l2,
            min_samples_leaf=min_samples_leaf,
            learning_rate=learning_rate,
            algorithm="RGF_Sib",
            test_interval=100,
        )
        model.fit(train_m, label_m)
        pred_proba = model.predict_proba(train_val)
        score = roc_auc_score(label_val, pred_proba[:, 1])
        return score
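The int() casts above hint that this objective is meant to be driven by a continuous-space tuner. A minimal sketch of wiring it into the bayes_opt package (an assumption: the original driver is not shown, the bounds are illustrative, and objective with its training data is taken to be in scope):

# Hypothetical driver for the objective above, using the bayes_opt package.
from bayes_opt import BayesianOptimization

pbounds = {
    "max_leaf": (500, 2000),          # cast to int inside objective
    "l2": (0.001, 1.0),
    "min_samples_leaf": (1, 20),      # cast to int inside objective
    "learning_rate": (0.05, 0.5),
}
optimizer = BayesianOptimization(f=objective, pbounds=pbounds, random_state=1)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)                  # best AUC and the parameters that achieved it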
Example #21
    def test_input_arrays_shape(self):
        clf = RGFClassifier()

        n_samples = self.y_train.shape[0]
        self.assertRaises(ValueError, clf.fit, self.X_train,
                          self.y_train[:(n_samples - 1)])
        self.assertRaises(ValueError, clf.fit, self.X_train, self.y_train,
                          np.ones(n_samples - 1))
        self.assertRaises(ValueError, clf.fit, self.X_train, self.y_train,
                          np.ones((n_samples, 2)))
Example #22
def model(opt):
    rgf = RGFClassifier(
        max_leaf=opt["max_leaf"],
        reg_depth=opt["reg_depth"],
        min_samples_leaf=opt["min_samples_leaf"],
        algorithm="RGF_Sib",
        test_interval=100,
        verbose=False,
    )
    scores = cross_val_score(rgf, X, y, cv=3)

    return scores.mean()
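A sketch of one way to drive model(opt) from a search loop, here with Optuna (an assumption: the original tuner is not shown, and the parameter ranges are illustrative):

# Hypothetical Optuna driver for the model(opt) objective above.
import optuna

def optuna_objective(trial):
    opt = {
        "max_leaf": trial.suggest_int("max_leaf", 500, 2000),
        "reg_depth": trial.suggest_float("reg_depth", 1.0, 10.0),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
    }
    return model(opt)  # mean 3-fold CV score, as defined above

study = optuna.create_study(direction="maximize")
study.optimize(optuna_objective, n_trials=25)
print(study.best_params)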
Example #23
    def test_params(self):
        clf = RGFClassifier()

        valid_params = dict(max_leaf=300,
                            test_interval=100,
                            algorithm='RGF_Sib',
                            loss='Log',
                            reg_depth=1.1,
                            l2=0.1,
                            sl2=None,
                            normalize=False,
                            min_samples_leaf=9,
                            n_iter=None,
                            n_tree_search=2,
                            opt_interval=100,
                            learning_rate=0.4,
                            calc_prob='sigmoid',
                            n_jobs=-1,
                            memory_policy='conservative',
                            verbose=True)
        clf.set_params(**valid_params)
        clf.fit(self.X_train, self.y_train)

        non_valid_params = dict(max_leaf=0,
                                test_interval=0,
                                algorithm='RGF_Test',
                                loss=True,
                                reg_depth=0.1,
                                l2=11,
                                sl2=-1.1,
                                normalize='False',
                                min_samples_leaf=0.7,
                                n_iter=11.1,
                                n_tree_search=0,
                                opt_interval=100.1,
                                learning_rate=-0.5,
                                calc_prob=True,
                                n_jobs='-1',
                                memory_policy='Generos',
                                verbose=-1)
        for key in non_valid_params:
            clf.set_params(**valid_params)  # Reset to valid params
            clf.set_params(**{key: non_valid_params[key]})  # Pick and set one invalid parameter
            self.assertRaises(ValueError, clf.fit, self.X_train, self.y_train)
Example #24
    def test_softmax_classifier(self):
        clf = RGFClassifier(prefix='clf', calc_prob='Softmax')
        clf.fit(self.iris.data, self.iris.target)

        proba_sum = clf.predict_proba(self.iris.data).sum(axis=1)
        np.testing.assert_almost_equal(proba_sum, np.ones(self.iris.target.shape[0]))

        score = clf.score(self.iris.data, self.iris.target)
        print('Score: {0:.5f}'.format(score))
        self.assertGreater(score, 0.8, "Failed with score = {0:.5f}".format(score))
Example #25
    def test_bin_classifier(self):
        clf = RGFClassifier(prefix='clf')
        bin_target = (self.iris.target == 2).astype(int)
        clf.fit(self.iris.data, bin_target)

        proba_sum = clf.predict_proba(self.iris.data).sum(axis=1)
        np.testing.assert_almost_equal(proba_sum, np.ones(bin_target.shape[0]))

        score = clf.score(self.iris.data, bin_target)
        print('Score: {0:.5f}'.format(score))
        self.assertGreater(score, 0.8, "Failed with score = {0:.5f}".format(score))
Example #26
    def __init__(self, task, fast=False):
        if task == 'classification':
            self.metric = 'roc_auc'
            self.task = "classification"
            if fast:
                self.model = FastRGFClassifier()
            else:
                self.model = RGFClassifier(loss="Log")
        else:
            self.metric = 'neg_mean_squared_error'
            self.task = "regression"
            if fast:
                self.model = FastRGFRegressor()
            else:
                self.model = RGFRegressor(loss="LS", normalize=True)
        self.X_test = None
        self.X_train = None
        self.y_test = None
        self.y_train = None
        self.grid_search = None
        self.y_predict = None
        self.test_score = None
Example #27
def train(params):
    # log hyperparams for this run
    for k, v in params.items():
        mlflow.log_param(k, v)

    # load dataset files
    # NOTE: to get meta data, set allow_pickle=True for np.load, then index into dataset object with key 'meta'
    dataset = np.load('preprocessed/dataset.npz')
    X_arr = dataset['X_arr']
    Y_arr = dataset['Y_arr']

    # split for train-test
    X_train, X_test, Y_train, Y_test = train_test_split(X_arr,
                                                        Y_arr,
                                                        stratify=Y_arr,
                                                        test_size=0.2)

    # instantiate model with params
    rgf_clf = RGFClassifier(**params)
    rgf_clf.fit(X_train, Y_train)

    # predict on test data
    Y_pred = rgf_clf.predict(X_test)
    Y_pred_proba = rgf_clf.predict_proba(X_test)

    # log logistic loss value
    logistic_loss = log_loss(Y_test, Y_pred_proba)
    mlflow.log_metric('log_loss', logistic_loss)

    # log precision, recall, f1
    p, r, f, _ = precision_recall_fscore_support(y_true=Y_test,
                                                 y_pred=Y_pred,
                                                 average='binary')
    mlflow.log_metric('precision', p)
    mlflow.log_metric('recall', r)
    mlflow.log_metric('f1', f)

    # which features matter the most
    print("========== FEATURE IMPORTANCES ==========")
    print(rgf_clf.feature_importances_)
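A minimal invocation sketch for the train() function above (an assumption: the parameter values are illustrative; an MLflow run is opened explicitly so the log_param/log_metric calls have an active run):

import mlflow

# Hypothetical hyperparameters; any valid RGFClassifier kwargs work here.
with mlflow.start_run():
    train({"max_leaf": 1000, "algorithm": "RGF", "loss": "Log", "l2": 0.01})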
Example #28
def rgf(df: pd.DataFrame, target: pd.DataFrame, test: pd.DataFrame,
        parameters: Dict):
    n_splits = 5
    # n_neighbors = parameters["n_neighbors"]
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    oof = np.zeros((df.shape[0] + test.shape[0], 9))

    for trn_idx, val_idx in folds.split(df, target):
        train_x = df.iloc[trn_idx, :].values
        val_x = df.iloc[val_idx, :].values
        train_y = target[trn_idx].values
        val_y = target[val_idx].values

        classifier = RGFClassifier(
            n_jobs=14,
            algorithm="RGF",
            loss="Log",
        )
        classifier.fit(train_x, train_y)

        y_hat = classifier.predict_proba(val_x)

        print(log_loss(val_y, y_hat))
        print(oof.shape, y_hat.shape)
        oof[val_idx] = y_hat
        pred = classifier.predict_proba(test.values)

        oof[len(target):, :] += pred / n_splits

    print(oof.shape)
    # np.save("data/04_features/oof.npz", oof)
    # oof = np.load("data/04_features/oof.npy")
    n_name = ["knn_{}".format(i) for i in range(9)]
    oof = pd.DataFrame(oof)
    oof.to_csv("data/09_oof/rgf_{}.csv".format(3))
    return oof[len(target):].values
Example #29
from sklearn import datasets
from sklearn.utils.validation import check_random_state
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from rgf.sklearn import RGFClassifier

iris = datasets.load_iris()
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

rgf = RGFClassifier(max_leaf=400,
                    algorithm="RGF_Sib",
                    test_interval=100,
                    verbose=True)
gb = GradientBoostingClassifier(n_estimators=20,
                                learning_rate=0.01,
                                subsample=0.6,
                                random_state=rng)

n_folds = 3

rgf_scores = cross_val_score(rgf,
                             iris.data,
                             iris.target,
                             cv=StratifiedKFold(n_folds))

gb_scores = cross_val_score(gb,
                            iris.data,
                            iris.target,
                            cv=StratifiedKFold(n_folds))
Example #30
if __name__ == '__main__':
    print(sys.argv)

    if len(sys.argv) >= 2:
        model = model_set[sys.argv[1]]
    else:
        model = model_set['xgb1']

    metric = model['metric']
    if model['mdl_type'] == 'xgb':
        mdl = XGBClassifier(**model['param'])
    elif model['mdl_type'] == 'lgb':
        mdl = LGBMClassifier(**model['param'])
    elif model['mdl_type'] == 'rgf':
        mdl = RGFClassifier(**model['param'])
    elif model['mdl_type'] == 'lr':
        mdl = LogisticRegression(**model['param'])
    elif model['mdl_type'] == 'xgbl':
        mdl = XGBRegressor(**model['param'])
    elif model['mdl_type'] == 'mlp':
        mdl = MLPClassifier(**model['param'])

    train_pred, test_pred, mean, std, full_score = five_fold_with_baging(
        l2_train.values,
        y,
        l2_test.values,
        train_id,
        test_id,
        metric,
        mdl,
Example #31
    subsample=.8,
    min_child_weight=6,
    colsample_bytree=.8,
    scale_pos_weight=1.6,
    gamma=10,
    reg_alpha=8,
    reg_lambda=1.3,
)
rgf = RGFClassifier(  # See https://www.kaggle.com/scirpus/regularized-greedy-forest#241285
    max_leaf=1200,  # Parameters suggested by olivier in link above
    algorithm="RGF",
    loss="Log",
    l2=0.01,
    sl2=0.01,
    normalize=False,
    min_samples_leaf=10,
    n_iter=None,
    opt_interval=100,
    learning_rate=.5,
    calc_prob="sigmoid",
    n_jobs=-1,
    memory_policy="generous",
    verbose=0)

gini_results = []

# Run CV
for i, (train_index, test_index) in enumerate(kf.split(train_df)):

    # Create data for this fold
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
Example #32
def train_classifiers(X_data, y):
    """
    Trains several classifiers and reporting model quality.
    :param X_data:
    :param y:
    :return: trained models
    """
    # Split the dataset into Train and Test
    seed = 7
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=seed)

    svm = SVC()
    svm_params = {
        'C': [1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.001, 0.0001],
        'kernel': ['linear', 'rbf']
    }
    svm_model, svm_grid = train_single_classifier_type(svm, "SVM", svm_params,
                                                       X_train, X_test,
                                                       y_train, y_test)

    knn = KNeighborsClassifier()
    knn_params = {
        'n_neighbors': [5, 6, 7, 8, 9, 10],
        'leaf_size': [1, 2, 3, 5],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'n_jobs': [-1]
    }
    knn_model, knn_grid = train_single_classifier_type(knn, "KNN", knn_params,
                                                       X_train, X_test,
                                                       y_train, y_test)

    # Train the XGboost Model for Classification
    xgb_model = xgb.XGBClassifier()

    # brute force scan for all parameters, here are the tricks
    # usually max_depth is 6,7,8
    # learning rate is around 0.05, but small changes may make big diff
    # tuning min_child_weight subsample colsample_bytree can have
    # much fun of fighting against overfit
    # n_estimators is how many round of boosting
    # finally, ensemble xgboost with multiple seeds may reduce variance
    xgb_parameters = {
        'nthread': [4],  # when use hyperthread, xgboost may become slower
        'objective': ['binary:logistic'],
        'learning_rate': [0.05, 0.1],  # so called `eta` value
        'max_depth': [6, 7, 8],
        'min_child_weight': [1, 11],
        'silent': [1],
        'subsample': [0.8],
        'colsample_bytree': [0.7, 0.8],
        'n_estimators':
        [5, 100,
         1000],  # number of trees, change it to 1000 for better results
        'missing': [-999],
        'seed': [1337]
    }

    train_model1, xgb_grid = train_single_classifier_type(
        xgb_model, "XGBoost", xgb_parameters, X_train, X_test, y_train, y_test)

    rfc = RandomForestClassifier()

    rfc_parameters = {
        'max_depth': [4, 5, 6],
        'n_estimators': [100, 200],
        'criterion': ['gini', 'entropy'],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_leaf': [2, 4],
        'min_samples_split': [2, 5, 10],
    }

    rfc_model, rfc_grid = train_single_classifier_type(rfc, "Random Forest",
                                                       rfc_parameters, X_train,
                                                       X_test, y_train, y_test)

    ext = ExtraTreesClassifier()

    ext_parameters = {
        'n_estimators': [50, 100],
        'max_features': [5, 10, 25],
        'min_samples_leaf': [2, 5, 10],
        'min_samples_split': [2, 5, 10],
    }

    ext_model, ext_grid = train_single_classifier_type(ext, "Extra Trees",
                                                       ext_parameters, X_train,
                                                       X_test, y_train, y_test)

    lgbm = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        n_jobs=-1,  # Updated from 'nthread'
        silent=True)
    # Create parameters to search
    lgbm_parameters = {
        'max_depth': [5, 6, 7, 8, 9, 10, 15, 20],
        'learning_rate': [0.005],
        'n_estimators': [100, 150, 500],
        'num_leaves': [6, 8, 12, 16],
        'boosting_type': ['gbdt'],
        'objective': ['binary'],
        'random_state': [501],  # Updated from 'seed'
        'colsample_bytree': [0.65],
        'subsample': [0.7],
        'reg_alpha': [1, 10],
        'reg_lambda': [10, 100],
    }
    lgbm_model, lgbm_grid = train_single_classifier_type(
        lgbm, "LGBM", lgbm_parameters, X_train, X_test, y_train, y_test)

    rgf = RGFClassifier()
    rgf_parameters = {
        'max_leaf': [900],
        'l2': [0.1, 0.05, 1.0],
        'min_samples_leaf': [5, 4, 3],
        'algorithm': ["RGF", "RGF_Opt", "RGF_Sib"],
        'loss': ["Log"],
    }

    rgf_model, rgf_grid = train_single_classifier_type(rgf, "RGF",
                                                       rgf_parameters, X_train,
                                                       X_test, y_train, y_test)

    frgf = FastRGFClassifier()
    frgf_parameters = {
        'max_leaf': [100, 200, 900],
        'n_estimators': [100, 1000],
        'max_bin': [10, 100],
        'l2': [0.1, 100, 1000],
        'min_samples_leaf': [5, 6],
        'opt_algorithm': ['rgf'],
        'loss': ["LS"],
    }

    frgf_model, frgf_grid = train_single_classifier_type(
        frgf, "FRGF", frgf_parameters, X_train, X_test, y_train, y_test)

    return svm_model, svm_grid, \
           train_model1, xgb_grid, \
           rfc_model, rfc_grid, \
           ext_model, ext_grid, \
           lgbm_model, lgbm_grid, \
           rgf_model, rgf_grid, \
           frgf_model, frgf_grid
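The train_single_classifier_type helper is not shown in this example. A plausible sketch, inferred from the call sites above (an assumption: the original may differ; this version runs a grid search and reports held-out accuracy):

from sklearn.model_selection import GridSearchCV

def train_single_classifier_type(estimator, name, param_grid,
                                 X_train, X_test, y_train, y_test):
    # Hypothetical reconstruction: grid-search the estimator, then score
    # the best model on the held-out split.
    grid = GridSearchCV(estimator, param_grid, cv=3, n_jobs=-1)
    grid.fit(X_train, y_train)
    print("{0}: best params = {1}".format(name, grid.best_params_))
    print("{0}: test accuracy = {1:.4f}".format(
        name, grid.best_estimator_.score(X_test, y_test)))
    return grid.best_estimator_, grid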
Example #33
def train_predict(train_df, test_df, params, model_name=None):
    if model_name is None:
        #model_name = 'l1_rgf_%s'%datetime.now().strftime('%m%d%H%M')
        model_name = 'l1_rgf'
    log = Logger(os.path.join('log', '%s.log' % model_name))

    cols = [c for c in train_df.columns if c not in ['id', 'target']]

    log.info('Features:')
    for col in cols:
        log.info('- %s' % col)
    log.info('\n')

    log.info('Parameters:')
    param_items = params.items()
    for param_item in param_items:
        log.info('- %s: %s' % (param_item[0], str(param_item[1])))
    log.info('\n')

    X = train_df[cols].values
    y = train_df['target'].values
    X_test = test_df[cols].values

    prob_train = np.zeros(len(X))
    prob_test = np.zeros(len(X_test))

    kfold = 5
    scores = []
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=41)
    for i, (train_ind, valid_ind) in enumerate(skf.split(X, y)):
        X_train, X_valid = X[train_ind], X[valid_ind]
        y_train, y_valid = y[train_ind], y[valid_ind]

        model = RGFClassifier(**params)
        model.fit(X_train, y_train)

        prob = model.predict_proba(X_valid)[:, 1]
        prob_train[valid_ind] = prob
        score = gini_norm(prob, y_valid)
        scores.append(score)
        log.info('- Fold %d/%d score: %f' % (i + 1, kfold, score))

        prob = model.predict_proba(X_test)[:, 1]
        prob_test += prob / kfold

        try:
            subprocess.call('rm -rf /tmp/rgf/*', shell=True)
            print("Clean up is successfull")
            print(glob.glob("/tmp/rgf/*"))
        except Exception as e:
            print(str(e))

    mean_score = np.mean(scores)
    log.info('- Mean score: %f' % mean_score)

    prob_train_df = pd.DataFrame({'id': train_df['id'], 'target': prob_train})
    prob_train_df.to_csv(os.path.join('local_cv', '%s.csv.gz' % model_name),
                         index=False,
                         compression='gzip')
    prob_test_df = pd.DataFrame({'id': test_df['id'], 'target': prob_test})
    prob_test_df.to_csv(os.path.join('submission', '%s.csv.gz' % model_name),
                        index=False,
                        compression='gzip')

    return mean_score
Example #34
# Set up folds
K = Number_of_folds
kf = comm_skf

sl2_list = [0.08, 0.09, 0.11, 0.12]
for sl2 in sl2_list:
    logging.info('test with sl2 : {0}'.format(sl2))
    # Set up classifier
    model = RGFClassifier(
        max_leaf=1200,
        algorithm="RGF",
        loss="Log",
        l2=0.012,
        sl2=sl2,
        normalize=False,
        min_samples_leaf=10,
        n_iter=None,
        opt_interval=100,
        learning_rate=0.5,
        calc_prob="sigmoid",
        n_jobs=-1,
        memory_policy="generous",
        verbose=0
    )

    # Run CV
    logging.info('feature shape {0}'.format(X.shape))

    for i, (train_index, test_index) in enumerate(kf.split(train_df, y)):

        # Create data for this fold
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
Example #35
import time

from sklearn import datasets
from sklearn.utils.validation import check_random_state
from sklearn.ensemble import GradientBoostingClassifier
from rgf.sklearn import RGFClassifier, FastRGFClassifier

iris = datasets.load_iris()
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

start = time.time()
clf = RGFClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("RGF: {} sec".format(end - start))
print("score: {}".format(score))

start = time.time()
clf = FastRGFClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("FastRGF: {} sec".format(end - start))
print("score: {}".format(score))

start = time.time()
clf = GradientBoostingClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("Gradient Boosting: {} sec".format(end - start))
print("score: {}".format(score))
Example #36
xgb_homeless_bag_kf = layer1.Layer1Train(XGBClassifier(**xgb_homeless_params),
                                         'xgb_homeless_bag_kf',
                                         drop_stupid='default',
                                         cat_transform='smooth',
                                         recon_category=True,
                                         engineer_stats=True)

xgb_18 = layer1.Layer1Train(XGBClassifier(**xgb_18_params),
                            'xgb_18',
                            drop_stupid='default',
                            cat_transform='smooth',
                            recon_category=True,
                            data_transform='round',
                            engineer_stats=True)

rgf_bojan_na_bag_kf = layer1.Layer1Train(RGFClassifier(**rgf_bojan_params),
                                         'rgf_bojan_na_bag_kf',
                                         drop_stupid='default',
                                         cat_transform='smooth',
                                         recon_category=True,
                                         engineer_stats=True)

# fm_sgd = layer1.Layer1Train(sgd.FMClassification(**fm_sgd_params), 'fm_sgd',
#         drop_stupid=True, cat_transform='onehot', recon_category=True, engineer_stats=True)
#
# fm_als = layer1.Layer1Train(sgd.FMClassification(**fm_als_params), 'fm_als',
#         drop_stupid=True, cat_transform='smooth', data_transform='log', recon_category=True, engineer_stats=True)

cat_1 = layer1.Layer1Train(CatBoostClassifier(**cat_1_params),
                           'cat_1',
                           drop_stupid='default',
Example #37
    def test_cleanup(self):
        clf1 = RGFClassifier()
        clf1.fit(self.X_train, self.y_train)

        clf2 = RGFClassifier()
        clf2.fit(self.X_train, self.y_train)

        self.assertNotEqual(clf1.cleanup(), 0)
        self.assertEqual(clf1.cleanup(), 0)

        for est in clf1.estimators_:
            glob_file = os.path.join(_get_temp_path(), est._file_prefix + "*")
            self.assertFalse(glob.glob(glob_file))

        self.assertRaises(NotFittedError, clf1.predict, self.X_test)
        clf2.predict(self.X_test)