def test_predict_proba_in_regression(self):
    """`predict_proba()` on a model fitted for a regression task must raise AutoMLException."""
    automl = AutoML(
        explain_level=0,
        verbose=0,
        random_state=1,
        results_path=self.automl_dir,
    )
    automl.fit(boston.data, boston.target)

    # Try to call predict_proba in regression task
    with self.assertRaises(AutoMLException):
        automl.predict_proba(boston.data)
def test_too_small_time_limit(self):
    """Fitting a 100000 x 100 dataset with `total_time_limit=1` must raise AutoMLException."""
    n_rows = 100000
    n_features = 100
    features = np.random.uniform(size=(n_rows, n_features))
    target = np.random.randint(0, 2, size=(n_rows,))

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        train_ensemble=False,
    )

    with self.assertRaises(AutoMLException):
        automl.fit(features, target)
def test_new_directory(self):
    """Directory does not exist, create it."""
    # Precondition: the results directory must not exist yet
    self.assertFalse(os.path.exists(self.automl_dir))

    automl = AutoML(results_path=self.automl_dir)
    features, target = datasets.make_classification(n_samples=30)

    # AutoML only validates constructor params on `fit()` call
    automl.fit(features, target)

    # The directory must exist after fitting
    self.assertTrue(os.path.exists(self.automl_dir))
def test_different_input_types(self):
    """Test the different data input types for AutoML.

    A fresh model is fitted on the same regression data given as numpy
    arrays, pandas dataframes and plain Python lists. For every input type
    `predict()` must return a numpy array with one entry per input row.
    """

    def new_model():
        # Each input type gets its own, identically configured model
        return AutoML(
            total_time_limit=10,
            explain_level=0,
            start_random_models=1,
            algorithms=["Linear"],
            verbose=0,
        )

    X, y = datasets.make_regression()

    # First test - X and y as numpy arrays
    pred = new_model().fit(X, y).predict(X)
    self.assertIsInstance(pred, np.ndarray)
    self.assertEqual(len(pred), X.shape[0])

    # Second test - X and y as pandas dataframe
    X_pandas = pd.DataFrame(X)
    y_pandas = pd.DataFrame(y)
    pred_pandas = new_model().fit(X_pandas, y_pandas).predict(X_pandas)
    self.assertIsInstance(pred_pandas, np.ndarray)
    self.assertEqual(len(pred_pandas), X.shape[0])

    # Third test - X and y as lists
    # NOTE(review): the previous version built these lists but then fitted
    # on the pandas objects again, so list input was never exercised.
    X_list = X_pandas.values.tolist()
    y_list = y_pandas.values.tolist()
    pred_list = new_model().fit(X_list, y_list).predict(X_list)
    self.assertIsInstance(pred_list, np.ndarray)
    self.assertEqual(len(pred_list), X.shape[0])
def test_one_column_input_regression(self):
    """Fit and predict on a dataframe with a single feature column and a float target."""
    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=5,
        explain_level=0,
    )
    automl.set_advanced(start_random_models=1)

    features = pd.DataFrame({"feature_1": np.random.rand(100)})
    target = np.random.rand(100)

    automl.fit(features, target)
    predictions = automl.predict(features)

    # One "prediction" column with one row per input sample
    self.assertIn("prediction", predictions.columns)
    self.assertEqual(predictions.shape[0], 100)
def test_tune_only_default(self):
    """With `total_time_limit=1` and a single algorithm exactly one model must be trained."""
    column_names = [f"f{i}" for i in range(3)]
    features = pd.DataFrame(np.random.rand(self.rows, 3), columns=column_names)
    target = np.random.randint(0, 2, self.rows)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        tuning_mode="Insane",
        algorithms=["Xgboost"],
    )
    automl.fit(features, target)

    self.assertEqual(len(automl._models), 1)
def test_regression(self):
    """Predictions for a float target must come back as a single "prediction" column."""
    column_names = [f"f{i}" for i in range(3)]
    features = pd.DataFrame(np.random.rand(self.rows, 3), columns=column_names)
    target = np.random.rand(self.rows)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(features, target)

    predictions = automl.predict(features)
    self.assertEqual(len(predictions.columns.tolist()), 1)
    self.assertEqual(predictions.columns[0], "prediction")
def test_one_column_input_regression(self):
    """Fit and predict on regression data that has a single feature."""
    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=5,
        explain_level=0,
        start_random_models=1,
    )
    features, target = datasets.make_regression(n_features=1)

    automl.fit(features, target)
    predictions = automl.predict(features)

    # A numpy array with one prediction per input row is expected
    self.assertIsInstance(predictions, np.ndarray)
    self.assertEqual(len(predictions), features.shape[0])
def test_encoding_strange_characters(self):
    """Fitting must not fail when the class labels are unusual unicode characters."""
    column_names = [f"f{i}" for i in range(3)]
    features = pd.DataFrame(np.random.rand(self.rows, 3), columns=column_names)
    # Two labels, shuffled; each appears int(rows / 2) times
    labels = ["ɛ", "🂲"] * int(self.rows / 2)
    target = np.random.permutation(labels)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Baseline"],
        train_ensemble=False,
        explain_level=0,
        start_random_models=1,
    )
    # The test passes when fit() completes without raising
    automl.fit(features, target)
def test_fit_returns_self(self):
    """Tests if the `fit()` method returns `self`. This allows to quickly implement one-liners with AutoML"""
    automl = AutoML()
    fitted = automl.fit(iris.data, iris.target)
    self.assertIsInstance(
        fitted,
        AutoML,
        "`fit()` method must return 'self'",
    )
def test_breast_cancer_dataset(self):
    """Tests AutoML in the breast cancer (binary classification)"""
    features = breast_cancer.data
    target = breast_cancer.target

    automl = AutoML(explain_level=0, verbose=0, random_state=1)
    fitted = automl.fit(features, target)

    # Score on the training data itself
    self.assertGreater(fitted.score(features, target), 0.5)
def test_empty_directory(self):
    """Directory exists and is empty, use it."""
    # Precondition: the directory must not exist before the test creates it
    self.assertFalse(os.path.exists(self.automl_dir))

    os.mkdir(self.automl_dir)
    self.assertTrue(os.path.exists(self.automl_dir))

    automl = AutoML(results_path=self.automl_dir)
    features, target = datasets.make_classification(n_samples=30)

    # AutoML only validates constructor params on `fit()` call
    automl.fit(features, target)

    self.assertTrue(os.path.exists(self.automl_dir))
def test_disable_stack_models_adjusted_validation(self):
    """In "Compete" mode with `total_time_limit=5` stacking must end up disabled."""
    n_samples = 100
    features = np.random.uniform(size=(n_samples, 2))
    target = np.random.randint(0, 2, size=(n_samples,))
    # Both feature columns are derived directly from the target
    features[:, 0] = target
    features[:, 1] = -target

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=5,
        mode="Compete",
    )
    automl.fit(features, target)

    # the stacking should be disabled
    # because of small time limit
    self.assertFalse(automl._stack_models)
    self.assertFalse(automl.tuner._stack_models)
    self.assertFalse(automl._time_ctrl._is_stacking)
def test_no_constructor_args(self):
    """Tests the use of AutoML without passing any args. Should work without any arguments"""
    # Create model with no arguments
    automl = AutoML()

    # Fit on iris and score on the same data
    fitted = automl.fit(iris.data, iris.target)
    score = fitted.score(iris.data, iris.target)

    self.assertGreater(score, 0.5)
def test_one_column_input_bin_class(self):
    """Fit and predict a 0/1 target from a dataframe with a single feature column."""
    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=5,
        explain_level=0,
        start_random_models=1,
    )
    features = pd.DataFrame({"feature_1": np.random.rand(100)})
    # Random 0/1 labels, one per row
    above_half = np.random.rand(features.shape[0]) > 0.5
    target = above_half.astype(int)

    automl.fit(features, target)
    predictions = automl.predict(features)

    self.assertIsInstance(predictions, np.ndarray)
    self.assertEqual(len(predictions), features.shape[0])
def test_get_params(self):
    """
    Passes params in AutoML constructor and uses `get_params()` after fitting.
    Initial params must be equal to the ones returned by `get_params()`.
    """
    # Create model
    model = AutoML(hill_climbing_steps=3, start_random_models=1)
    # Get params before fit
    params_before_fit = model.get_params()
    # Generate data
    X, y = datasets.make_classification(n_samples=30)
    # Fit data
    model.fit(X, y)
    # Get params after fit
    params_after_fit = model.get_params()
    # Assert before and after params are equal.
    # `assertEqual` replaces the deprecated alias `assertEquals`, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(params_before_fit, params_after_fit)
def test_bin_class_01(self):
    """Binary 0/1 target: predictions must carry per-class columns and a label column."""
    column_names = [f"f{i}" for i in range(3)]
    features = pd.DataFrame(np.random.rand(self.rows, 3), columns=column_names)
    target = np.random.randint(0, 2, self.rows)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(features, target)

    predictions = automl.predict(features)
    for expected_column in ["prediction_0", "prediction_1", "label"]:
        self.assertIn(expected_column, predictions.columns.tolist())

    # Predicted labels must be drawn from {0, 1}
    unique_labels = np.unique(predictions["label"].values)
    self.assertTrue(0 in unique_labels or 1 in unique_labels)
    self.assertLessEqual(len(unique_labels), 2)
def test_disable_stack_models(self):
    """In "Compete" mode with "split" validation, stacking must be disabled after fit."""
    n_samples = 100
    features = np.random.uniform(size=(n_samples, 2))
    target = np.random.randint(0, 2, size=(n_samples,))
    # Both feature columns are derived directly from the target
    features[:, 0] = target
    features[:, 1] = -target

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=5,
        mode="Compete",
        validation_strategy={"validation_type": "split"},
    )
    automl.fit(features, target)

    # Every component that tracks stacking must report it as off
    self.assertFalse(automl._stack_models)
    self.assertFalse(automl.tuner._stack_models)
    self.assertFalse(automl._time_ctrl._is_stacking)
def test_category_data_type(self):
    """Fitting must not fail when a feature column has the pandas "category" dtype."""
    column_names = [f"f{i}" for i in range(3)]
    features = pd.DataFrame(np.random.rand(self.rows, 3), columns=column_names)
    target = np.random.randint(0, 2, self.rows)
    # Convert one column to the categorical dtype
    features["f1"] = features["f1"].astype("category")

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["CatBoost"],
        train_ensemble=False,
        explain_level=0,
        start_random_models=1,
    )
    # The test passes when fit() completes without raising
    automl.fit(features, target)
def test_repeated_kfold(self):
    """Repeated k-fold validation must write one learner and one training log per (repeat, fold).

    Fits a single Random Forest model with 2 folds repeated 3 times, then
    checks the model's result directory for the files of every learner.
    """
    REPEATS = 3
    FOLDS = 2

    a = AutoML(
        results_path=self.automl_dir,
        total_time_limit=10,
        algorithms=["Random Forest"],
        train_ensemble=False,
        validation_strategy={
            "validation_type": "kfold",
            "k_folds": FOLDS,
            "repeats": REPEATS,
            "shuffle": True,
            "stratify": True,
        },
        start_random_models=1,
    )

    X, y = datasets.make_classification(
        n_samples=100,
        n_features=5,
        n_informative=4,
        n_redundant=1,
        n_classes=2,
        n_clusters_per_class=3,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])

    a.fit(X, y)

    result_files = os.listdir(
        os.path.join(self.automl_dir, "1_Default_RandomForest")
    )

    cnt = 0
    for repeat in range(REPEATS):
        for fold in range(FOLDS):
            learner_name = construct_learner_name(fold, repeat, REPEATS)
            self.assertIn(f"{learner_name}.random_forest", result_files)
            self.assertIn(f"{learner_name}_training.log", result_files)
            cnt += 1

    # The previous `assertTrue(cnt, 6)` used 6 as the failure message and
    # passed for any non-zero count; compare the count explicitly instead.
    self.assertEqual(cnt, REPEATS * FOLDS)
def test_iris_dataset(self):
    """Tests AutoML in the iris dataset (Multiclass classification)"""
    features = iris.data
    target = iris.target

    automl = AutoML(
        explain_level=0,
        verbose=0,
        random_state=1,
        results_path=self.automl_dir,
    )
    fitted = automl.fit(features, target)

    # Score on the training data itself
    self.assertGreater(fitted.score(features, target), 0.5)
def test_custom_init(self):
    """A customised search configuration must leave more than 4 trained models."""
    n_samples = 30
    features = np.random.uniform(size=(n_samples, 2))
    target = np.random.randint(0, 2, size=(n_samples,))

    settings = dict(
        results_path=self.automl_dir,
        model_time_limit=1,
        algorithms=["Xgboost"],
        explain_level=0,
        train_ensemble=False,
        stack_models=False,
        validation_strategy={"validation_type": "split"},
        start_random_models=3,
        hill_climbing_steps=1,
        top_models_to_improve=1,
    )
    automl = AutoML(**settings)
    automl.fit(features, target)

    self.assertGreater(len(automl._models), 4)
def test_boston_dataset(self):
    """Tests AutoML in the boston dataset (Regression)"""
    features = boston.data
    target = boston.target

    automl = AutoML(
        explain_level=0,
        verbose=0,
        random_state=1,
        results_path=self.automl_dir,
    )
    fitted = automl.fit(features, target)

    # Score on the training data itself
    self.assertGreater(fitted.score(features, target), 0.5)
def test_regression(self):
    """Float target: `predict()` must return a numpy array with one value per row."""
    column_names = [f"f{i}" for i in range(3)]
    features = pd.DataFrame(np.random.rand(self.rows, 3), columns=column_names)
    target = np.random.rand(self.rows)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
        explain_level=0,
        start_random_models=1,
    )
    automl.fit(features, target)

    predictions = automl.predict(features)
    self.assertIsInstance(predictions, np.ndarray)
    self.assertEqual(len(predictions), features.shape[0])
def test_regression_missing_target(self):
    """Fit and predict must still work when one target value is missing."""
    column_names = [f"f{i}" for i in range(3)]
    features = pd.DataFrame(np.random.rand(self.rows, 3), columns=column_names)
    target = pd.Series(np.random.rand(self.rows), name="target")
    # Blank out a single target value
    target.iloc[1] = None

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
        explain_level=0,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(features, target)

    predictions = automl.predict(features)
    self.assertEqual(len(predictions.columns.tolist()), 1)
    self.assertEqual(predictions.columns[0], "prediction")
def test_bin_class_AB(self):
    """Binary classification with string labels "a" and "B".

    Predicted values must be drawn from the two training labels.
    """
    X = np.random.rand(self.rows, 3)
    X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
    y = np.random.permutation(["a", "B"] * int(self.rows / 2))

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
        explain_level=0,
        start_random_models=1,
    )
    automl.fit(X, y)

    # Predict once; the earlier version called predict() twice and
    # discarded the first result.
    pred = automl.predict(X)
    u = np.unique(pred)
    self.assertTrue("a" in u or "B" in u)
    self.assertLessEqual(len(u), 2)
def test_score_without_y(self):
    """Tests the use of `score()` without passing y. Should raise AutoMLException"""
    model = AutoML(explain_level=0, verbose=0, random_state=1)

    # Fit outside the `assertRaises` block so that only the `score()` call
    # can satisfy the expected exception.
    model.fit(breast_cancer.data, breast_cancer.target)

    # Assert that an Exception is raised
    with self.assertRaises(AutoMLException) as context:
        # Try to score without passing 'y'
        model.score(breast_cancer.data)

    # Checked after the `with` block: inside it, this line would never run
    # once `score()` raises.
    self.assertIn("y must be specified", str(context.exception))
def test_multi_class_abcd_mixed_int(self):
    """Multiclass classification where the labels mix an int with strings.

    Training labels are 1, "B", "CC" and "d"; predictions must be drawn
    from those labels.
    """
    X = np.random.rand(self.rows * 4, 3)
    X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
    y = pd.Series(
        np.random.permutation([1, "B", "CC", "d"] * self.rows), name="target"
    )

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
        explain_level=0,
        start_random_models=1,
    )
    automl.fit(X, y)
    pred = automl.predict(X)
    u = np.unique(pred)

    # The earlier expected list contained "a", which is never a training
    # label. NOTE(review): the int 1 is expected to become the string "1"
    # when numpy builds an array from the mixed list - confirm.
    expected_labels = ["1", "B", "CC", "d"]
    self.assertTrue(np.intersect1d(u, expected_labels).shape[0] > 0)
    self.assertLessEqual(len(u), 4)
def test_predict_on_empty_dataframe(self):
    """`predict()` on input without rows must raise AutoMLException."""
    column_names = [f"f{i}" for i in range(3)]
    features = pd.DataFrame(np.random.rand(self.rows, 3), columns=column_names)
    target = pd.Series(np.random.rand(self.rows), name="target")

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
        explain_level=0,
        start_random_models=1,
    )
    automl.fit(features, target)

    # Completely empty dataframe
    with self.assertRaises(AutoMLException):
        automl.predict(pd.DataFrame())

    # Array with the right number of columns but zero rows
    with self.assertRaises(AutoMLException):
        automl.predict(np.empty(shape=(0, 3)))
def test_multi_class_0123(self):
    """Multiclass classification with integer labels 0..3."""
    n_samples = self.rows * 4
    column_names = [f"f{i}" for i in range(3)]
    features = pd.DataFrame(np.random.rand(n_samples, 3), columns=column_names)
    target = np.random.randint(0, 4, n_samples)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
        explain_level=0,
        start_random_models=1,
    )
    automl.fit(features, target)

    predictions = automl.predict(features)
    unique_labels = np.unique(predictions)
    # At least one of the training labels must appear among the predictions
    self.assertTrue(any(label in unique_labels for label in (0, 1, 2, 3)))
    self.assertLessEqual(len(unique_labels), 4)