def test_predict_proba_in_regression(self):
    """Check that predict_proba raises AutoMLException after fitting a regression target."""
    automl = AutoML(
        explain_level=0,
        verbose=0,
        random_state=1,
        results_path=self.automl_dir,
    )
    automl.fit(boston.data, boston.target)

    # Asking a regression model for class probabilities is expected to fail.
    with self.assertRaises(AutoMLException) as context:
        automl.predict_proba(boston.data)
def define_and_evaluate_mljar_pipeline(X, y, random_state=0):
    """Evaluate an MLJAR AutoML classifier with a 4-fold stratified outer CV.

    For every outer fold a fresh ``AutoML`` run in "Compete" mode is fitted on
    the training part and scored on the held-out part with weighted
    one-vs-rest ROC AUC.

    Args:
        X: Feature matrix; it is indexed as ``X[indices, :]``, so it must
            support 2-D positional indexing.
        y: Label vector; it is indexed as ``y[indices]`` and its distinct
            values decide between the binary and the multiclass setup.
        random_state: Seed passed to the shuffling ``StratifiedKFold``.

    Returns:
        A list with one ROC AUC score per outer fold.

    Notes:
        Relies on the module-level ``SEC`` constant for the per-fit time
        limit. The ``AutoML_1`` directory is removed before every fold and
        reused as the ``results_path`` of that fold's run.
    """
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)

    # The task type depends only on the full label vector, so it is decided
    # once here instead of being recomputed inside every fold.
    binary = len(set(y)) == 2
    eval_metric = "auc" if binary else "logloss"
    ml_task = "binary_classification" if binary else "multiclass_classification"

    nested_scores = []
    for train_inds, test_inds in outer_cv.split(X, y):
        X_train, y_train = X[train_inds, :], y[train_inds]
        X_test, y_test = X[test_inds, :], y[test_inds]

        # Drop the results of the previous fold so each run starts clean.
        shutil.rmtree("AutoML_1", ignore_errors=True)
        automl = AutoML(
            results_path="AutoML_1",
            mode="Compete",
            eval_metric=eval_metric,
            total_time_limit=SEC,
            ml_task=ml_task,
        )
        automl.fit(X_train, y_train)
        y_pred = automl.predict_proba(X_test)

        # Binary scoring uses only the second probability column; multiclass
        # scoring uses the full probability matrix.
        y_score = y_pred[:, 1] if binary else y_pred
        # same as roc_auc_ovr_weighted
        score = roc_auc_score(y_test, y_score, average="weighted", multi_class="ovr")
        nested_scores.append(score)
    return nested_scores
import pandas as pd
from supervised import AutoML

# Read the three CSV files from the current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

# Model inputs are all training columns from the third one onward.
feature_cols = train_df.columns[2:]
print(feature_cols)

# Compete mode, AUC as the evaluation metric, time limit of 4 * 3600.
automl = AutoML(mode="Compete", eval_metric="auc", total_time_limit=4 * 3600)
automl.fit(train_df[feature_cols], train_df["target"])

# The whole test frame is handed to predict_proba; the second probability
# column is written into every submission column after the first.
probabilities = automl.predict_proba(test_df)
submission[submission.columns[1:]] = probabilities[:, 1]
submission.to_csv("sub_1.csv", index=False)
import pandas as pd
from supervised import AutoML

# Load the train and test files of the March 2021 tabular playground series.
train_df = pd.read_csv("~/Downloads/tabular-playground-series-mar-2021/train.csv")
test_df = pd.read_csv("~/Downloads/tabular-playground-series-mar-2021/test.csv")

# Separate the identifier and the label from the model inputs.
train_features = train_df.drop(columns=["id", "target"])
train_labels = train_df["target"]
test_features = test_df.drop(columns=["id"])

automl = AutoML(
    mode="Optuna",
    eval_metric="auc",
    algorithms=["CatBoost"],
    # Tune each algorithm for 30 minutes.
    optuna_time_budget=1800,
    # Total time limit, set large enough to have time to compute all steps.
    total_time_limit=48 * 3600,
    features_selection=False,
)
automl.fit(train_features, train_labels)

# The submission pairs each test id with the second probability column.
probabilities = automl.predict_proba(test_features)
result = pd.DataFrame({"id": test_df["id"], "target": probabilities[:, 1]})
result.to_csv("1_submission.csv", index=False)
import pandas as pd
from supervised import AutoML

# Read the three CSV files from the current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

# Model inputs are all training columns from the third one onward.
feature_cols = train_df.columns[2:]

# Compete mode with model stacking switched on and feature selection off.
automl = AutoML(
    mode="Compete",
    total_time_limit=4 * 3600,
    stack_models=True,
    features_selection=False,
)
automl.fit(train_df[feature_cols], train_df["target"])

# The whole test frame is handed to predict_proba; its second probability
# column becomes the "PredictedProb" submission column.
probabilities = automl.predict_proba(test_df)
submission["PredictedProb"] = probabilities[:, 1]
submission.to_csv("sub_1.csv", index=False)
import pandas as pd
from supervised import AutoML

# Read the three CSV files from the current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

# Model inputs are all training columns from the third one onward.
feature_cols = train_df.columns[2:]

# Compete mode with AUC as the evaluation metric.
automl = AutoML(mode="Compete", total_time_limit=4 * 3600, eval_metric="auc")
automl.fit(train_df[feature_cols], train_df["target"])

# The whole test frame is handed to predict_proba; its second probability
# column becomes the "target" submission column.
probabilities = automl.predict_proba(test_df)
submission["target"] = probabilities[:, 1]
submission.to_csv("sub_1.csv", index=False)