def test_cleanup(self):
    """Cleaning up one fitted model removes only its temp files.

    Fits two classifiers, cleans up the first, and checks that:
    the first cleanup reports removed files while a repeat reports
    none; no files matching the first model's prefixes remain in the
    temp directory; the cleaned model raises ``NotFittedError`` on
    predict while the untouched one still predicts.
    """
    first = RGFClassifier()
    first.fit(self.X_train, self.y_train)
    second = RGFClassifier()
    second.fit(self.X_train, self.y_train)

    # First call removes a non-zero number of files; the second call
    # finds nothing left to remove.
    self.assertNotEqual(first.cleanup(), 0)
    self.assertEqual(first.cleanup(), 0)

    # No leftover files for any estimator of the cleaned model.
    for estimator in first.estimators_:
        pattern = os.path.join(_get_temp_path(),
                               estimator._file_prefix + "*")
        self.assertFalse(glob.glob(pattern))

    # Cleaned model is unusable; the other model is unaffected.
    self.assertRaises(NotFittedError, first.predict, self.X_test)
    second.predict(self.X_test)
def test_string_y(self):
    """RGFClassifier must accept string-valued class labels.

    Maps the numeric iris targets to word labels, fits on them, and
    requires > 0.95 training-set accuracy.
    """
    clf = RGFClassifier()

    labels = np.array(self.iris.target, dtype=str)
    # Replace each numeric label with its word form.
    for digit, word in (('0', 'Zero'), ('1', 'One'), ('2', 'Two')):
        labels[labels == digit] = word

    clf.fit(self.iris.data, labels)
    predictions = clf.predict(self.iris.data)
    score = accuracy_score(labels, predictions)
    self.assertGreater(score, 0.95,
                       "Failed with score = {0:.5f}".format(score))
def test_pickle(self):
    """A pickled classifier must be self-contained.

    Pickles a fitted model, deletes its on-disk model files, then
    unpickles and checks the restored model reproduces the original
    predictions exactly.
    """
    clf = RGFClassifier()
    clf.fit(self.X_train, self.y_train)
    y_pred1 = clf.predict(self.X_test)
    s = pickle.dumps(clf)

    # Remove model file so the round-trip cannot lean on leftover state.
    _cleanup()

    # Renamed from `reg2` (a regressor-style name left over from a
    # copy-paste) to `clf2`, consistent with test_joblib_pickle.
    clf2 = pickle.loads(s)
    y_pred2 = clf2.predict(self.X_test)
    np.testing.assert_allclose(y_pred1, y_pred2)
def test_joblib_pickle(self):
    """A joblib-dumped classifier must be self-contained.

    Dumps a fitted model with joblib, deletes its on-disk model files,
    reloads it, and checks the restored model reproduces the original
    predictions exactly.
    """
    clf = RGFClassifier()
    clf.fit(self.X_train, self.y_train)
    y_pred1 = clf.predict(self.X_test)
    joblib.dump(clf, 'test_clf.pkl')
    try:
        # Remove model file so the round-trip cannot lean on leftover state.
        _cleanup()
        clf2 = joblib.load('test_clf.pkl')
        y_pred2 = clf2.predict(self.X_test)
        np.testing.assert_allclose(y_pred1, y_pred2)
    finally:
        # Bug fix: the dump file was never removed, so it leaked into the
        # working directory on every run (and survived test failures).
        if os.path.exists('test_clf.pkl'):
            os.remove('test_clf.pkl')
def train(params):
    """Train an RGFClassifier with ``params`` and log results to MLflow.

    Loads the preprocessed dataset, does a stratified 80/20 split,
    fits the model, then logs log-loss, precision, recall and F1 as
    MLflow metrics and prints the feature importances.
    """
    # Record this run's hyperparameters.
    for name, value in params.items():
        mlflow.log_param(name, value)

    # Load the preprocessed feature/label arrays.
    # NOTE: to get meta data, set allow_pickle=True for np.load, then
    # index into dataset object with key 'meta'
    data = np.load('preprocessed/dataset.npz')
    features = data['X_arr']
    labels = data['Y_arr']

    # Stratified train-test split (80/20).
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, stratify=labels, test_size=0.2)

    # Fit the model with the supplied hyperparameters.
    model = RGFClassifier(**params)
    model.fit(X_train, Y_train)

    # Predictions for metric computation.
    Y_pred = model.predict(X_test)
    Y_pred_proba = model.predict_proba(X_test)

    # Log logistic loss on the held-out set.
    logistic_loss = log_loss(Y_test, Y_pred_proba)
    mlflow.log_metric('log_loss', logistic_loss)

    # Log precision, recall and F1 (binary averaging).
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true=Y_test, y_pred=Y_pred, average='binary')
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('f1', f1)

    # Which features matter the most.
    print("========== FEATURE IMPORTANCES ==========")
    print(model.feature_importances_)