Ejemplo n.º 1
0
    def test_cleanup(self):
        """Check that ``cleanup()`` on one classifier leaves another usable.

        Two classifiers are fitted on the same training data. The first
        ``cleanup()`` call on one of them must return a non-zero value and a
        repeated call must return zero (presumably the number of removed
        files). Afterwards no file matching any of its estimators' file
        prefixes may remain under the temp path, predicting with it must
        raise ``NotFittedError``, and ``predict`` is still called on the
        second classifier to confirm it was not affected.
        """
        cleaned = RGFClassifier()
        cleaned.fit(self.X_train, self.y_train)

        untouched = RGFClassifier()
        untouched.fit(self.X_train, self.y_train)

        # The order matters: the first call does the work, the second call
        # must find nothing left to do.
        first_result = cleaned.cleanup()
        self.assertNotEqual(first_result, 0)
        second_result = cleaned.cleanup()
        self.assertEqual(second_result, 0)

        for estimator in cleaned.estimators_:
            pattern = os.path.join(_get_temp_path(),
                                   estimator._file_prefix + "*")
            leftovers = glob.glob(pattern)
            self.assertFalse(leftovers)

        with self.assertRaises(NotFittedError):
            cleaned.predict(self.X_test)
        untouched.predict(self.X_test)
Ejemplo n.º 2
0
    def test_string_y(self):
        """Fit and score a classifier on iris with string class labels.

        The iris targets are converted to strings and the values '0', '1'
        and '2' are renamed to 'Zero', 'One' and 'Two'. The accuracy measured
        on the same data used for fitting must exceed 0.95.
        """
        model = RGFClassifier()

        labels = np.array(self.iris.target, dtype=str)
        for digit, word in (('0', 'Zero'), ('1', 'One'), ('2', 'Two')):
            labels[labels == digit] = word

        model.fit(self.iris.data, labels)
        predicted = model.predict(self.iris.data)

        score = accuracy_score(labels, predicted)
        failure_msg = "Failed with score = {0:.5f}".format(score)
        self.assertGreater(score, 0.95, failure_msg)
Ejemplo n.º 3
0
    def test_pickle(self):
        """Check that a fitted classifier survives a pickle round trip.

        The classifier is serialized with ``pickle.dumps``, ``_cleanup()`` is
        called, and the copy restored with ``pickle.loads`` must reproduce
        the predictions made before serialization.
        """
        original = RGFClassifier()
        original.fit(self.X_train, self.y_train)
        before = original.predict(self.X_test)
        payload = pickle.dumps(original)

        # Remove the model file before restoring, so the prediction below
        # presumably has to come from the pickled state alone.
        _cleanup()

        restored = pickle.loads(payload)
        after = restored.predict(self.X_test)

        np.testing.assert_allclose(before, after)
Ejemplo n.º 4
0
    def test_joblib_pickle(self):
        """Check that a fitted classifier survives a joblib dump/load round trip.

        The classifier is dumped to ``test_clf.pkl``, ``_cleanup()`` is
        called, and the reloaded copy must reproduce the predictions made
        before the dump. Every file written by the dump is deleted
        afterwards, whether the test passes or fails, so no artifact is left
        behind in the working directory.
        """
        import os  # local import keeps this method self-contained

        clf = RGFClassifier()
        clf.fit(self.X_train, self.y_train)
        y_pred1 = clf.predict(self.X_test)

        model_path = 'test_clf.pkl'
        # Fallback list, used if the dump fails part-way through and never
        # reports which files it created.
        dumped_files = [model_path]
        try:
            # joblib.dump returns the names of the files it wrote; depending
            # on the joblib version, arrays may be stored in side files.
            dumped_files = joblib.dump(clf, model_path) or dumped_files

            # Remove model file
            _cleanup()

            clf2 = joblib.load(model_path)
            y_pred2 = clf2.predict(self.X_test)

            np.testing.assert_allclose(y_pred1, y_pred2)
        finally:
            # Delete the dump even when an assertion above failed.
            for path in dumped_files:
                if os.path.exists(path):
                    os.remove(path)
Ejemplo n.º 5
0
def train(params):
    """Train an RGF classifier, evaluate it, and log the run to MLflow.

    The dataset is read from ``preprocessed/dataset.npz`` (arrays ``X_arr``
    and ``Y_arr``) and split 80/20 into train and test sets, stratified on
    the labels. The fitted model is evaluated on the test split and the
    metrics ``log_loss``, ``precision``, ``recall`` and ``f1`` are logged to
    MLflow. The model's feature importances are printed to stdout.

    Args:
        params: Mapping of hyperparameter names to values. Each entry is
            logged as an MLflow param and the whole mapping is passed as
            keyword arguments to ``RGFClassifier``.

    Returns:
        None.
    """
    # log hyperparams for this run
    for name, value in params.items():
        mlflow.log_param(name, value)

    # load dataset files
    # NOTE: to get meta data, set allow_pickle=True for np.load, then index
    # into dataset object with key 'meta'
    # np.load keeps the .npz archive open and reads members on access; the
    # with block closes the file handle once both arrays are in memory.
    with np.load('preprocessed/dataset.npz') as dataset:
        X_arr = dataset['X_arr']
        Y_arr = dataset['Y_arr']

    # split for train-test
    X_train, X_test, Y_train, Y_test = train_test_split(X_arr,
                                                        Y_arr,
                                                        stratify=Y_arr,
                                                        test_size=0.2)

    # instantiate model with params
    rgf_clf = RGFClassifier(**params)
    rgf_clf.fit(X_train, Y_train)

    # predict on test data
    Y_pred = rgf_clf.predict(X_test)
    Y_pred_proba = rgf_clf.predict_proba(X_test)

    # log logistic loss value
    logistic_loss = log_loss(Y_test, Y_pred_proba)
    mlflow.log_metric('log_loss', logistic_loss)

    # log precision, recall, f1 (support, the fourth value, is not needed)
    p, r, f, _ = precision_recall_fscore_support(y_true=Y_test,
                                                 y_pred=Y_pred,
                                                 average='binary')
    mlflow.log_metric('precision', p)
    mlflow.log_metric('recall', r)
    mlflow.log_metric('f1', f)

    # which features matter the most
    print("========== FEATURE IMPORTANCES ==========")
    print(rgf_clf.feature_importances_)