def test_multi_class_abcd_missing_target(self):
    """Fit on a four-class string target with one missing entry and check predictions.

    The target is a random permutation of the labels "a", "B", "CC" and "d"
    whose second element is replaced with None before fitting. The prediction
    frame must expose one ``prediction_<class>`` column per class plus a
    ``label`` column, and the predicted labels must be drawn from the four
    known classes.
    """
    class_names = ["a", "B", "CC", "d"]

    features = pd.DataFrame(
        np.random.rand(self.rows * 4, 3),
        columns=[f"f{i}" for i in range(3)],
    )
    target = pd.Series(
        np.random.permutation(class_names * self.rows), name="target"
    )
    # Knock out a single target value; fit() is expected to cope with it.
    target.iloc[1] = None

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(features, target)
    pred = automl.predict(features)

    expected_columns = [
        "prediction_a",
        "prediction_B",
        "prediction_CC",
        "prediction_d",
        "label",
    ]
    for expected in expected_columns:
        self.assertTrue(expected in pred.columns.tolist())

    predicted_labels = np.unique(pred["label"].values)
    # At least one predicted label is a known class, and no more than
    # four distinct labels are produced.
    overlap = np.intersect1d(predicted_labels, class_names)
    self.assertTrue(overlap.shape[0] > 0)
    self.assertTrue(len(predicted_labels) <= 4)
def test_bin_class_AB_missing_targets(self):
    """Fit on a binary "a"/"B" target with several missing entries.

    Three target positions are blanked out, once with ``None`` and twice
    with ``np.nan``, before fitting. The prediction frame must contain
    ``prediction_a``, ``prediction_B`` and ``label`` columns, and the
    predicted labels must include at least one of the two classes with no
    more than two distinct values.
    """
    X = pd.DataFrame(
        np.random.rand(self.rows, 3),
        columns=[f"f{i}" for i in range(3)],
    )
    y = pd.Series(
        np.random.permutation(["a", "B"] * (self.rows // 2)), name="target"
    )
    # Exercise both spellings of "missing". ``np.NaN`` was removed in
    # NumPy 2.0, so only the canonical ``np.nan`` is used here.
    # NOTE(review): index 13 assumes self.rows >= 14 -- confirm in setUp.
    y.iloc[1] = None
    y.iloc[3] = np.nan
    y.iloc[13] = np.nan

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(X, y)

    # A single predict call is enough; the previous duplicate result was
    # never used.
    pred = automl.predict(X)

    columns = pred.columns.tolist()
    for col in ["prediction_a", "prediction_B", "label"]:
        self.assertIn(col, columns)

    u = np.unique(pred["label"].values)
    self.assertTrue("a" in u or "B" in u)
    self.assertLessEqual(len(u), 2)
def test_save_load(self):
    """Check that a fresh AutoML on the same results path predicts the same labels.

    One instance is fitted on a synthetic binary classification dataset.
    A second instance is then created with only ``results_path`` and asked
    to predict without being fitted (presumably it restores the models
    written by the first instance -- confirm against AutoML). The ``label``
    columns of both predictions must match row for row.
    """
    trained = AutoML(
        results_path=self.automl_dir,
        total_time_limit=10,
        explain_level=0,
        mode="Explain",
        train_ensemble=True,
    )
    trained.set_advanced(start_random_models=1)

    # Fixed random_state keeps the dataset reproducible between runs.
    features, target = datasets.make_classification(
        n_samples=100,
        n_features=5,
        n_informative=4,
        n_redundant=1,
        n_classes=2,
        n_clusters_per_class=3,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    feature_names = [f"f_{i}" for i in range(features.shape[1])]
    features = pd.DataFrame(features, columns=feature_names)

    trained.fit(features, target)
    original_pred = trained.predict(features)

    # The second instance is never fitted; it only shares the results path.
    restored = AutoML(results_path=self.automl_dir)
    restored_pred = restored.predict(features)

    labels_match = original_pred["label"] == restored_pred["label"]
    self.assertTrue(labels_match.all())
def test_multi_class_0123(self):
    """Fit on a four-class integer target (0..3) and check the prediction frame.

    The prediction frame must expose ``prediction_0`` .. ``prediction_3``
    and a ``label`` column, and no more than four distinct labels may be
    predicted.
    """
    features = pd.DataFrame(
        np.random.rand(self.rows * 4, 3),
        columns=[f"f{i}" for i in range(3)],
    )
    target = np.random.randint(0, 4, self.rows * 4)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(features, target)
    pred = automl.predict(features)

    expected_columns = [
        "prediction_0",
        "prediction_1",
        "prediction_2",
        "prediction_3",
        "label",
    ]
    for expected in expected_columns:
        self.assertTrue(expected in pred.columns.tolist())

    predicted_labels = np.unique(pred["label"].values)
    # NOTE(review): labels are matched as strings here although the target
    # is built from integers, while test_bin_class_11 matches integer
    # labels -- confirm which dtype predict() returns for this case.
    self.assertTrue(
        any(label in predicted_labels for label in ("0", "1", "2", "3"))
    )
    self.assertTrue(len(predicted_labels) <= 4)
def test_explain_just_permutation_importance(self):
    """Check which explanation artifacts are written with ``explain_level=1``.

    After fitting on a synthetic regression dataset, the files in the
    ``model_1`` directory under the results path are inspected. The test
    expects a non-SHAP ``importance.csv`` file to be present and expects
    no SHAP importance, SHAP dependence or SHAP decision files.
    """
    a = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
        validation={
            "validation_type": "kfold",
            "k_folds": 2,
            "shuffle": True,
            "stratify": True,
        },
        explain_level=1,
    )
    a.set_advanced(start_random_models=1)

    X, y = datasets.make_regression(
        n_samples=100,
        n_features=5,
        n_informative=4,
        shuffle=False,
        random_state=0,
    )
    X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])

    a.fit(X, y)

    result_files = os.listdir(os.path.join(self.automl_dir, "model_1"))
    # Included in every failure message so a failing run shows what was
    # actually written to disk.
    listing = f"files in model_1: {result_files}"

    # Permutation importance is the one artifact that SHOULD be produced.
    self.assertTrue(
        any("importance.csv" in f and "shap" not in f for f in result_files),
        f"permutation importance file missing; {listing}",
    )

    # None of the SHAP artifacts should be produced:
    # - shap importance
    self.assertFalse(
        any("importance.csv" in f and "shap" in f for f in result_files),
        f"unexpected shap importance file; {listing}",
    )
    # - shap dependence
    self.assertFalse(
        any("dependence.png" in f for f in result_files),
        f"unexpected shap dependence plot; {listing}",
    )
    # - shap decisions
    self.assertFalse(
        any("decisions.png" in f for f in result_files),
        f"unexpected shap decisions plot; {listing}",
    )
def test_one_column_input_regression(self):
    """Fit and predict on a regression problem with a single feature column.

    The prediction frame must contain a ``prediction`` column and one row
    per input sample.
    """
    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=5,
        explain_level=0,
    )
    automl.set_advanced(start_random_models=1)

    # One random feature and an unrelated random continuous target.
    features = pd.DataFrame({"feature_1": np.random.rand(100)})
    target = np.random.rand(100)

    automl.fit(features, target)
    predictions = automl.predict(features)

    self.assertTrue("prediction" in predictions.columns)
    n_predicted_rows = predictions.shape[0]
    self.assertTrue(n_predicted_rows == 100)
def test_regression(self):
    """Fit on a random continuous target and check the prediction frame layout.

    The prediction frame must consist of exactly one column, named
    ``prediction``.
    """
    feature_names = [f"f{i}" for i in range(3)]
    features = pd.DataFrame(np.random.rand(self.rows, 3), columns=feature_names)
    target = np.random.rand(self.rows)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(features, target)
    predictions = automl.predict(features)

    column_names = predictions.columns.tolist()
    self.assertTrue(len(column_names) == 1)
    self.assertTrue(predictions.columns[0] == "prediction")
def test_bin_class_AB(self):
    """Fit on a binary string target ("a"/"B") and check the prediction frame.

    The prediction frame must contain ``prediction_a``, ``prediction_B``
    and ``label`` columns, and the predicted labels must include at least
    one of the two classes with no more than two distinct values.
    """
    X = pd.DataFrame(
        np.random.rand(self.rows, 3),
        columns=[f"f{i}" for i in range(3)],
    )
    y = np.random.permutation(["a", "B"] * (self.rows // 2))

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(X, y)

    # A single predict call is enough; the previous duplicate result was
    # never used.
    pred = automl.predict(X)

    columns = pred.columns.tolist()
    for col in ["prediction_a", "prediction_B", "label"]:
        self.assertIn(col, columns)

    u = np.unique(pred["label"].values)
    self.assertTrue("a" in u or "B" in u)
    self.assertLessEqual(len(u), 2)
def test_regression_missing_target(self):
    """Fit on a continuous target with one missing entry and check the output layout.

    The second target value is replaced with None before fitting. The
    prediction frame must consist of exactly one column, named
    ``prediction``.
    """
    features = pd.DataFrame(
        np.random.rand(self.rows, 3),
        columns=[f"f{i}" for i in range(3)],
    )
    target = pd.Series(np.random.rand(self.rows), name="target")
    # Knock out a single target value; fit() is expected to cope with it.
    target.iloc[1] = None

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
        explain_level=0,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(features, target)
    predictions = automl.predict(features)

    column_names = predictions.columns.tolist()
    self.assertTrue(len(column_names) == 1)
    self.assertTrue(predictions.columns[0] == "prediction")
def test_integration(self):
    """Smoke test: fit with mostly default settings on a synthetic binary dataset.

    Only ``results_path`` and ``model_time_limit`` are passed to the
    constructor. The prediction frame must contain a ``label`` column.
    """
    automl = AutoML(results_path=self.automl_dir, model_time_limit=1)
    automl.set_advanced(start_random_models=1)

    # Fixed random_state keeps the dataset reproducible between runs.
    features, target = datasets.make_classification(
        n_samples=100,
        n_features=5,
        n_informative=4,
        n_redundant=1,
        n_classes=2,
        n_clusters_per_class=3,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    feature_names = [f"f_{i}" for i in range(features.shape[1])]
    features = pd.DataFrame(features, columns=feature_names)

    automl.fit(features, target)
    predictions = automl.predict(features)

    self.assertTrue("label" in predictions.columns)
def test_bin_class_11(self):
    """Fit on a binary integer target with values -1 and 1.

    The prediction frame must contain ``prediction_-1``, ``prediction_1``
    and ``label`` columns, and the predicted labels must include at least
    one of the two classes with no more than two distinct values.
    """
    X = pd.DataFrame(
        np.random.rand(self.rows, 3),
        columns=[f"f{i}" for i in range(3)],
    )
    # randint yields 0/1; scaling and shifting maps that to -1/1.
    y = np.random.randint(0, 2, self.rows) * 2 - 1

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
        explain_level=0,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(X, y)

    # A single predict call is enough; the previous duplicate result was
    # never used.
    pred = automl.predict(X)

    columns = pred.columns.tolist()
    for col in ["prediction_-1", "prediction_1", "label"]:
        self.assertIn(col, columns)

    u = np.unique(pred["label"].values)
    self.assertTrue(-1 in u or 1 in u)
    self.assertLessEqual(len(u), 2)