def train_digits():
    """Fit AutoML on the sklearn digits set and report held-out accuracy.

    Trains in "Perform" mode with artifacts under ``AutoML_digits``, prints
    the head of the prediction frame and the test accuracy, then renders the
    predictions with ``plot_digits``.
    """
    digits = load_digits()
    features = pd.DataFrame(digits.data)
    # Stratified 75/25 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        digits.target,
        stratify=digits.target,
        test_size=0.25,
        random_state=123,
    )
    # Train models with AutoML.
    automl = AutoML(mode="Perform", results_path="AutoML_digits")
    automl.fit(X_train, y_train)
    # Score on the held-out split; "label" holds the predicted class.
    predictions = automl.predict_all(X_test)
    print(predictions.head())
    print("Test accuracy:", accuracy_score(y_test, predictions["label"].astype(int)))
    plot_digits(X_test, predictions)
import numpy as np
import pandas as pd
from supervised.automl import AutoML
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import log_loss
import warnings

# warnings.filterwarnings("error", category=RuntimeWarning) #pd.core.common.SettingWithCopyWarning)

# Adult census income data; the last column ("income") is the binary target.
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)
feature_columns = df.columns[:-1]
X = df[feature_columns]
y = df["income"]

# 80/20 split (no fixed seed — results vary run to run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

automl = AutoML(results_path="AutoML_7", mode="Compete")
automl.fit(X_train, y_train)

# predict_all returns per-class probability columns plus the label.
predictions = automl.predict_all(X_test)
print(predictions.head())
print(predictions.tail())
print(X_test.shape, predictions.shape)
print("LogLoss", log_loss(y_test, predictions["prediction_>50K"]))
def run(dataset, config):
    """Benchmark-harness entry point: train mljar-supervised AutoML on one
    dataset/config pair and return the framework result record.

    Builds train/test DataFrames from the benchmark dataset description,
    fits AutoML under the benchmark's time/seed constraints, collects
    predictions (and class probabilities for classification), optionally
    removes the results directory, and returns the harness `result(...)`.
    """
    log.info(f"\n**** mljar-supervised [v{supervised.__version__}] ****\n")
    save_metadata(config, version=supervised.__version__)
    # Mapping of benchmark metrics to MLJAR metrics; anything unmapped
    # falls back to "auto" so AutoML picks its own metric.
    metrics_mapping = dict(auc='auc', logloss='logloss', rmse='rmse')
    eval_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else "auto"
    # Mapping of benchmark task to MLJAR ML task
    problem_mapping = dict(
        binary="binary_classification",
        multiclass="multiclass_classification",
        regression="regression",
    )
    ml_task = problem_mapping.get(
        dataset.problem_type
    )  # if None the AutoML will guess about the ML task
    is_classification = config.type == "classification"
    results_path = output_subdir("results", config)
    # Framework params prefixed with "_" are harness-internal flags
    # (e.g. _save_artifacts) and are not forwarded to AutoML.
    training_params = {
        k: v for k, v in config.framework_params.items() if not k.startswith("_")
    }
    # Rebuild typed DataFrames from the raw benchmark arrays.
    column_names, _ = zip(*dataset.columns)
    column_types = dict(dataset.columns)
    label = dataset.target.name
    train = pd.DataFrame(dataset.train.data, columns=column_names).astype(
        column_types, copy=False)
    X_train = train.drop(columns=label)
    y_train = train[label]
    test = pd.DataFrame(dataset.test.data, columns=column_names).astype(
        column_types, copy=False)
    X_test = test.drop(columns=label)
    y_test = test[label]
    automl = AutoML(results_path=results_path,
                    total_time_limit=config.max_runtime_seconds,
                    random_state=config.seed,
                    ml_task=ml_task,
                    eval_metric=eval_metric,
                    **training_params)
    # Time training and prediction separately for the benchmark report.
    with utils.Timer() as training:
        automl.fit(X_train, y_train)
    with utils.Timer() as predict:
        preds = automl.predict_all(X_test)
    predictions, probabilities = None, None
    if is_classification:
        predictions = preds["label"].values
        # predict_all emits one "prediction_<class>" column per class label.
        cols = [f"prediction_{c}" for c in np.unique(y_train)]
        probabilities = preds[cols].values
    else:
        predictions = preds["prediction"].values
    # clean the results unless the run asked to keep artifacts
    if not config.framework_params.get("_save_artifacts", False):
        shutil.rmtree(results_path, ignore_errors=True)
    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  models_count=len(automl._models),
                  training_duration=training.duration,
                  predict_duration=predict.duration)
import numpy as np
import pandas as pd
from supervised.automl import AutoML
from sklearn.metrics import accuracy_score
import os

nrows = 100
ncols = 3

# Random features with balanced string class labels ("a" / "B").
X = np.random.rand(nrows, ncols)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(ncols)])
# y = np.random.randint(0, 2, nrows)
y = np.random.permutation(["a", "B"] * 50)

automl = AutoML(model_time_limit=10)  # , algorithms=["Decision Tree"])
automl.fit(X, y)

# Pure-noise data, so this mostly checks the pipeline runs end to end.
print("Train accuracy", accuracy_score(y, automl.predict_all(X)["label"]))

# X = np.random.rand(1000, 10)
# X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)])
# y = np.random.randint(0, 2, 1000)
# print("Test accuracy", accuracy_score(y, automl.predict(X)["label"]))
# NOTE(review): fragment — `df` and `X` are defined before this visible chunk,
# and the `test_cols` list below is cut off mid-literal; confirm against the
# full file.
y = df["Survived"]
# Print each column's dtype for a quick schema check.
for col in df.columns:
    print(col, df[col].dtype)
automl = AutoML(
    results_path="AutoML_39",
    # algorithms=["Xgboost"],
    # model_time_limit=20,
    # train_ensemble=True,
    mode="Explain"
)
# automl.set_advanced(start_random_models=3)
automl.fit(X, y)
pred = automl.predict_all(X)
print("Train accuracy", accuracy_score(y, pred["label"]))
test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
test_cols = [
    "Parch",
    "Ticket",
    "Fare",
    "Pclass",
    "Name",
    "Sex",
    "Age",
    "SibSp",
    "Cabin",
    "Embarked",
def run(dataset, config):
    """Benchmark-harness entry point: train mljar-supervised AutoML and
    return the framework result record.

    This variant takes X/y directly from the dataset object and includes a
    compatibility branch for boolean targets (see issue link below).
    """
    log.info(f"\n**** mljar-supervised [v{supervised.__version__}] ****\n")
    # Mapping of benchmark metrics to MLJAR metrics; anything unmapped
    # falls back to "auto" so AutoML picks its own metric.
    metrics_mapping = dict(auc='auc', logloss='logloss', rmse='rmse')
    eval_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else "auto"
    # Mapping of benchmark task to MLJAR ML task
    problem_mapping = dict(
        binary="binary_classification",
        multiclass="multiclass_classification",
        regression="regression",
    )
    ml_task = problem_mapping.get(
        dataset.problem_type
    )  # if None the AutoML will guess about the ML task
    is_classification = config.type == "classification"
    results_path = output_subdir("results", config)
    # Framework params prefixed with "_" are harness-internal flags
    # (e.g. _save_artifacts) and are not forwarded to AutoML.
    training_params = {
        k: v for k, v in config.framework_params.items() if not k.startswith("_")
    }
    X_train, y_train = dataset.train.X, dataset.train.y.squeeze()
    X_test, y_test = dataset.test.X, dataset.test.y.squeeze()
    automl = AutoML(results_path=results_path,
                    total_time_limit=config.max_runtime_seconds,
                    random_state=config.seed,
                    ml_task=ml_task,
                    eval_metric=eval_metric,
                    **training_params)
    # Time training and prediction separately for the benchmark report.
    with Timer() as training:
        automl.fit(X_train, y_train)
    with Timer() as predict:
        preds = automl.predict_all(X_test)
    predictions, probabilities, probabilities_labels = None, None, None
    if is_classification:
        # preds is a dataframe with columns ["prediction_LABEL", .., "label"]
        if y_train.dtype == bool and preds["label"].dtype == int:
            # boolean target produces integer predictions for mljar-supervised <= 0.10.6
            # https://github.com/mljar/mljar-supervised/issues/442
            preds = preds.rename(
                {
                    "prediction_0": "False",
                    "prediction_1": "True"
                }, axis=1)
            preds["label"] = preds["label"].astype(bool)
        else:
            # Strip the "prediction_" prefix so columns are the class labels.
            preds.columns = [
                c.replace("prediction_", "", 1) for c in preds.columns
            ]
        predictions = preds["label"].values
        # All columns except the trailing "label" are per-class probabilities.
        probabilities_labels = list(preds.columns)[:-1]
        probabilities = preds[probabilities_labels].values
    else:
        predictions = preds["prediction"].values
    # clean the results unless the run asked to keep artifacts
    if not config.framework_params.get("_save_artifacts", False):
        shutil.rmtree(results_path, ignore_errors=True)
    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  probabilities_labels=probabilities_labels,
                  models_count=len(automl._models),
                  training_duration=training.duration,
                  predict_duration=predict.duration)
import warnings

# Fix: `pd` was used below (filterwarnings category, read_csv) but pandas was
# never imported, so the script raised NameError before doing any work.
import pandas as pd
from sklearn import datasets
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from supervised import AutoML
from supervised.exceptions import AutoMLException

# warnings.filterwarnings('error')
# Escalate chained-assignment warnings to errors so they surface in the run.
# NOTE(review): newer pandas exposes this as pd.errors.SettingWithCopyWarning;
# pd.core.common.SettingWithCopyWarning only exists on older versions — confirm
# against the pinned pandas release.
warnings.filterwarnings(
    "error", category=pd.core.common.SettingWithCopyWarning)  # message="*ndarray*")

# Iris data with missing feature values AND missing targets; exercises
# AutoML's handling of incomplete training data.
df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv")
X = df[["feature_1", "feature_2", "feature_3", "feature_4"]]
y = df["class"]

automl = AutoML()
automl.fit(X, y)

predictions = automl.predict_all(X)
print(predictions.head())
print(predictions.tail())
print(X.shape)
print(predictions.shape)