Example #1: Random Forest (scikit-learn)
def run(dataset, config):
    log.info("\n**** Random Forest (sklearn %s) ****\n", sklearn.__version__)

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_jobs = config.framework_params.get('_n_jobs', config.cores)  # useful to disable multicore, regardless of the dataset config

    log.info("Running RandomForest with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs))
    log.warning("We completely ignore the requirement to stay within the time limit.")
    log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    rf = estimator(n_jobs=n_jobs,
                   random_state=config.seed,
                   **training_params)

    with utils.Timer() as training:
        rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)
    probabilities = rf.predict_proba(X_test) if is_classification else None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(rf),
                  training_duration=training.duration)
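
Note: nearly every example below times its calls with a `Timer` context manager from the benchmark's utility code (used as `utils.Timer()` or plain `Timer()`) and only reads its `.duration` attribute afterwards. The helper itself is not shown in these snippets; the following is a minimal stand-in consistent with that usage, not the benchmark's actual implementation:

import time

class Timer:
    """Minimal sketch: measure the wall-clock duration (seconds) of a with-block."""

    def __enter__(self):
        self._start = time.monotonic()
        return self

    def __exit__(self, *exc):
        self.duration = time.monotonic() - self._start
        return False  # never suppress exceptions raised inside the block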
Example #2: mljar-supervised
def run(dataset, config):
    log.info("\n**** mljar-supervised ****\n")

    column_names, _ = zip(*dataset.columns)
    column_types = dict(dataset.columns)
    X_train = pd.DataFrame(dataset.train.X,
                           columns=column_names).astype(column_types,
                                                        copy=False)
    X_test = pd.DataFrame(dataset.test.X,
                          columns=column_names).astype(column_types,
                                                       copy=False)

    y_train = dataset.train.y.flatten()
    y_test = dataset.test.y.flatten()

    problem_mapping = dict(
        binary="binary_classification",
        multiclass="multiclass_classification",
        regression="regression",
    )
    is_classification = config.type == "classification"
    ml_task = problem_mapping.get(
        dataset.problem_type
    )  # if None, AutoML will guess the ML task
    results_path = output_subdir("results", config)
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith("_")
    }

    automl = AutoML(results_path=results_path,
                    total_time_limit=config.max_runtime_seconds,
                    seed=config.seed,
                    ml_task=ml_task,
                    **training_params)

    with Timer() as training:
        automl.fit(X_train, y_train)

    preds = automl.predict(X_test)

    predictions, probabilities = None, None
    if is_classification:
        predictions = preds["label"].values
        probabilities = preds[preds.columns[:-1]].values
    else:
        predictions = preds["prediction"].values

    # clean the results
    if not config.framework_params.get("_save_artifacts", False):
        shutil.rmtree(results_path, ignore_errors=True)

    return result(
        output_file=config.output_predictions_file,
        predictions=predictions,
        truth=y_test,
        probabilities=probabilities,
        models_count=len(automl._models),
        training_duration=training.duration,
    )
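
`output_subdir("results", config)` above is another benchmark helper that these snippets call but do not define. Judging from how it is used (a writable directory tied to the run's output location, see also `config.output_dir` in Example #4), a hypothetical minimal version could be:

import os

def output_subdir(name, config):
    # hypothetical sketch: create <config.output_dir>/<name> if missing and return its path
    path = os.path.join(config.output_dir, name)
    os.makedirs(path, exist_ok=True)
    return path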
Example #3: FLAML
def run(dataset, config):
    log.info(f"\n**** FLAML [v{__version__}] ****\n")

    X_train, y_train = dataset.train.X, dataset.train.y.squeeze()
    X_test, y_test = dataset.test.X, dataset.test.y.squeeze()

    is_classification = config.type == 'classification'
    time_budget = config.max_runtime_seconds
    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    log.info("Running FLAML with {} number of cores".format(config.cores))
    aml = AutoML()

    # Mapping of benchmark metrics to flaml metrics
    metrics_mapping = dict(
        acc='accuracy',
        auc='roc_auc',
        f1='f1',
        logloss='log_loss',
        mae='mae',
        mse='mse',
        rmse='rmse',
        r2='r2',
    )
    perf_metric = metrics_mapping.get(config.metric, 'auto')
    if config.metric not in metrics_mapping:
        log.warning("Performance metric %s not supported, defaulting to 'auto'.", config.metric)

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    log_dir = output_subdir("logs", config)
    flaml_log_file_name = os.path.join(log_dir, "flaml.log")
    with Timer() as training:
        aml.fit(X_train,
                y_train,
                metric=perf_metric,
                task=config.type,
                n_jobs=n_jobs,
                log_file_name=flaml_log_file_name,
                time_budget=time_budget,
                **training_params)

    with Timer() as predict:
        predictions = aml.predict(X_test)
    probabilities = aml.predict_proba(X_test) if is_classification else None
    labels = aml.classes_ if is_classification else None
    return result(
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
        models_count=len(aml.config_history),
        training_duration=training.duration,
        predict_duration=predict.duration,
        probabilities_labels=labels,
    )
Example #4: GAMA (ARFF interface)
def run(dataset, config):
    log.info("\n**** GAMA  %s ****", __version__)
    log.info("sklearn == %s", sklearn.__version__)
    log.info("category_encoders == %s", category_encoders.__version__)

    is_classification = (config.type == 'classification')
    # Mapping of benchmark metrics to GAMA metrics
    metrics_mapping = dict(
        acc='accuracy',
        auc='roc_auc',
        f1='f1',
        logloss='neg_log_loss',
        mae='neg_mean_absolute_error',
        mse='neg_mean_squared_error',
        msle='neg_mean_squared_log_error',
        r2='r2',
        rmse='neg_root_mean_squared_error',
    )
    scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(config.metric))

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_jobs = config.framework_params.get('_n_jobs', config.cores)  # useful to disable multicore, regardless of the dataset config

    *_, did, fold = dataset.train_path.split('/')
    fold = fold.split('.')[0].split('_')[-1]
    log_file = os.path.join(config.output_dir, "logs", '{}_{}.log'.format(did, fold))
    utils.touch(log_file)

    log.info('Running GAMA with a maximum time of %ss on %s cores, optimizing %s.',
             config.max_runtime_seconds, n_jobs, scoring_metric)

    estimator = GamaClassifier if is_classification else GamaRegressor
    gama_automl = estimator(n_jobs=n_jobs,
                            max_total_time=config.max_runtime_seconds,
                            scoring=scoring_metric,
                            random_state=config.seed,
                            keep_analysis_log=log_file,
                            **training_params)

    with utils.Timer() as training:
        gama_automl.fit_arff(dataset.train_path, dataset.target, encoding='utf-8')

    log.info('Predicting on the test set.')
    predictions = gama_automl.predict_arff(dataset.test_path, dataset.target, encoding='utf-8')
    if is_classification:
        probabilities = gama_automl.predict_proba_arff(dataset.test_path, dataset.target, encoding='utf-8')
    else:
        probabilities = None

    return result(
        output_file=config.output_predictions_file,
        predictions=predictions,
        probabilities=probabilities,
        target_is_encoded=False,
        models_count=len(gama_automl._final_pop),
        training_duration=training.duration
    )
Example #5: TPOT
def run(dataset, config):
    log.info("\n**** TPOT ****\n")

    is_classification = config.type == 'classification'
    # Mapping of benchmark metrics to TPOT metrics
    metrics_mapping = dict(
        acc='accuracy',
        auc='roc_auc',
        f1='f1',
        logloss='neg_log_loss',
        mae='neg_mean_absolute_error',
        mse='neg_mean_squared_error',
        msle='neg_mean_squared_log_error',
        r2='r2'
    )
    scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(config.metric))

    X_train = dataset.train.X_enc
    y_train = dataset.train.y_enc

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_jobs = config.framework_params.get('_n_jobs', config.cores)  # useful to disable multicore, regardless of the dataset config

    log.info('Running TPOT with a maximum time of %ss on %s cores, optimizing %s.',
             config.max_runtime_seconds, n_jobs, scoring_metric)
    runtime_min = (config.max_runtime_seconds/60)

    estimator = TPOTClassifier if is_classification else TPOTRegressor
    tpot = estimator(n_jobs=n_jobs,
                     max_time_mins=runtime_min,
                     scoring=scoring_metric,
                     random_state=config.seed,
                     **training_params)

    with Timer() as training:
        tpot.fit(X_train, y_train)

    log.info('Predicting on the test set.')
    X_test = dataset.test.X_enc
    y_test = dataset.test.y_enc
    predictions = tpot.predict(X_test)
    try:
        probabilities = tpot.predict_proba(X_test) if is_classification else None
    except RuntimeError:
        # TPOT throws a RuntimeError if the optimized pipeline does not support `predict_proba`.
        probabilities = "predictions"  # encoding is handled by caller in `__init__.py`

    save_artifacts(tpot, config)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(tpot.evaluated_individuals_),
                  training_duration=training.duration)
Example #6: LightAutoML
def run(dataset, config):
    log.info(f"\n**** lightautoml (R) [{__version__}] ****\n")
    save_metadata(config, version=__version__)

    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=DeprecationWarning)

    is_classification = config.type == 'classification'

    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    column_names, _ = zip(*dataset.columns)
    column_types = dict(dataset.columns)
    label = dataset.target.name

    df_train = pd.DataFrame(dataset.train.data, columns=column_names).astype(column_types, copy=False)
    df_train[dataset.target.name] = y_train

    max_mem_size_gb = float(config.max_mem_size_mb) / 1024
    task = Task(dataset.problem_type if dataset.problem_type != 'regression' else 'reg')
    automl = TabularUtilizedAutoML(task=task, timeout=config.max_runtime_seconds, cpu_limit=config.cores,
                                   memory_limit=max_mem_size_gb, random_state=config.seed)

    log.info("Training...")
    with utils.Timer() as training:
        automl.fit_predict(train_data=df_train, roles={'target': label})

    df_test = pd.DataFrame(dataset.test.data, columns=column_names).astype(column_types, copy=False)
    df_x_test = df_test.drop(columns=label)

    log.info("Predicting on the test set...")
    with utils.Timer() as predict:
        preds = automl.predict(df_x_test).data

    if is_classification:
        probabilities = preds

        if dataset.problem_type == 'binary':
            probabilities = np.vstack([
                1 - probabilities[:, 0], probabilities[:, 0]
            ]).T

        predictions = np.argmax(probabilities, axis=1)

    else:
        probabilities = None
        predictions = preds

    log.debug(probabilities)
    log.debug(config.output_predictions_file)

    save_artifacts(automl, config)

    return result(
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
        target_is_encoded=is_classification,
        training_duration=training.duration,
        predict_duration=predict.duration,
    )
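
For binary tasks LightAutoML returns a single column with the positive-class probability, which the code above expands into a two-column [P(class 0), P(class 1)] array before taking the argmax. A small illustration with made-up numbers:

import numpy as np

preds = np.array([[0.9], [0.2], [0.55]])  # hypothetical P(positive class) for three test rows
probabilities = np.vstack([1 - preds[:, 0], preds[:, 0]]).T
# probabilities -> [[0.1, 0.9], [0.8, 0.2], [0.45, 0.55]]
predictions = np.argmax(probabilities, axis=1)
# predictions -> [1, 0, 1]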
Example #7: AutoGluon (legacy task API)
def run(dataset, config):
    log.info(f"\n**** AutoGluon [v{__version__}] ****\n")
    save_metadata(config, version=__version__)

    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        r2=metrics.r2,
        # rmse=metrics.root_mean_squared_error,  # metrics.root_mean_squared_error incorrectly registered in autogluon REGRESSION_METRICS
        rmse=metrics.mean_squared_error,  # for now we let autogluon optimize on mse; the final score is computed from the predictions anyway
    )

    perf_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    is_classification = config.type == 'classification'
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    column_names, _ = zip(*dataset.columns)
    column_types = dict(dataset.columns)
    train = pd.DataFrame(dataset.train.data,
                         columns=column_names).astype(column_types, copy=False)
    label = dataset.target.name
    print(f"Columns dtypes:\n{train.dtypes}")

    output_dir = output_subdir("models", config)
    with utils.Timer() as training:
        predictor = task.fit(train_data=train,
                             label=label,
                             problem_type=dataset.problem_type,
                             output_directory=output_dir,
                             time_limits=config.max_runtime_seconds,
                             eval_metric=perf_metric.name,
                             **training_params)

    test = pd.DataFrame(dataset.test.data,
                        columns=column_names).astype(column_types, copy=False)
    X_test = test.drop(columns=label)
    y_test = test[label]

    with utils.Timer() as predict:
        predictions = predictor.predict(X_test)

    probabilities = predictor.predict_proba(
        dataset=X_test, as_pandas=True,
        as_multiclass=True) if is_classification else None
    prob_labels = probabilities.columns.values.tolist() if probabilities is not None else None

    leaderboard = predictor._learner.leaderboard(X_test, y_test, silent=True)
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None, 'display.width', 1000):
        print(leaderboard)

    save_artifacts(predictor, leaderboard, config)

    num_models_trained = len(leaderboard)
    num_models_ensemble = len(
        predictor._trainer.get_minimum_model_set(
            predictor._trainer.model_best))

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  probabilities_labels=prob_labels,
                  target_is_encoded=False,
                  models_count=num_models_trained,
                  models_ensemble_count=num_models_ensemble,
                  training_duration=training.duration,
                  predict_duration=predict.duration)
Example #8: H2O AutoML
def run(dataset, config):
    log.info(f"\n**** H2O AutoML [v{h2o.__version__}] ****\n")
    save_metadata(config, version=h2o.__version__)
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           r2='r2',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.",
                    config.metric)

    try:
        training_params = {
            k: v
            for k, v in config.framework_params.items()
            if not k.startswith('_')
        }
        nthreads = config.framework_params.get('_nthreads', config.cores)
        jvm_memory = str(round(config.max_mem_size_mb * 2 / 3)) + "M"  # leaving 1/3rd of available memory for XGBoost

        log.info("Starting H2O cluster with %s cores, %s memory.", nthreads,
                 jvm_memory)
        max_port_range = 49151
        min_port_range = 1024
        rnd_port = os.getpid() % (max_port_range -
                                  min_port_range) + min_port_range
        port = config.framework_params.get('_port', rnd_port)

        init_params = config.framework_params.get('_init', {})
        if "logs" in config.framework_params.get('_save_artifacts', []):
            init_params['ice_root'] = output_subdir("logs", config)

        h2o.init(nthreads=nthreads,
                 port=port,
                 min_mem_size=jvm_memory,
                 max_mem_size=jvm_memory,
                 **init_params)

        import_kwargs = {}
        # Load train as an H2O Frame, but test as a Pandas DataFrame
        log.debug("Loading train data from %s.", dataset.train.path)
        train = None
        if version.parse(h2o.__version__) >= version.parse("3.32.0.3"):
            # previous versions may fail to correctly parse some rare arff files that use
            # single quotes as enum/string delimiters (pandas also fails on the same datasets)
            import_kwargs['quotechar'] = '"'
            train = h2o.import_file(dataset.train.path,
                                    destination_frame=frame_name('train', config),
                                    **import_kwargs)
            if not verify_loaded_frame(train, dataset):
                h2o.remove(train)
                train = None
                import_kwargs['quotechar'] = "'"

        if not train:
            train = h2o.import_file(dataset.train.path,
                                    destination_frame=frame_name('train', config),
                                    **import_kwargs)
            # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path,
                               destination_frame=frame_name('test', config),
                               **import_kwargs)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name,
                 config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        monitor = (
            BackendMemoryMonitoring(
                frequency_seconds=config.ext.monitoring.frequency_seconds,
                check_on_exit=True,
                verbosity=config.ext.monitoring.verbosity)
            if config.framework_params.get('_monitor_backend', False)
            # else contextlib.nullcontext  # Py 3.7+ only
            else contextlib.contextmanager(iter)([0]))
        with utils.Timer() as training:
            with monitor:
                aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise FrameworkError(
                "H2O could not produce any model in the requested time.")

        with utils.Timer() as predict:
            preds = aml.predict(test)

        preds = extract_preds(preds, test, dataset=dataset)
        save_artifacts(aml, dataset=dataset, config=config)

        return result(output_file=config.output_predictions_file,
                      predictions=preds.predictions,
                      truth=preds.truth,
                      probabilities=preds.probabilities,
                      probabilities_labels=preds.probabilities_labels,
                      models_count=len(aml.leaderboard),
                      training_duration=training.duration,
                      predict_duration=predict.duration)

    finally:
        if h2o.connection():
            # h2o.remove_all()
            h2o.connection().close()
        if h2o.connection() and h2o.connection().local_server:
            h2o.connection().local_server.shutdown()
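
The `contextlib.contextmanager(iter)([0])` expression used for `monitor` above is a workaround for Python versions that lack `contextlib.nullcontext` (added in 3.7): it builds a context manager that yields the single element of `[0]` on enter and does nothing on exit. A more explicit equivalent, sketched here only for clarity, would be:

import contextlib

@contextlib.contextmanager
def noop_context():
    # no-op context manager for Python < 3.7, where contextlib.nullcontext is unavailable
    yield

# e.g.: monitor = BackendMemoryMonitoring(...) if backend_monitoring_enabled else noop_context()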
Example #9: Tuned Random Forest (scikit-learn)
def run(dataset, config):
    log.info(
        f"\n**** Tuned Random Forest [sklearn v{sklearn.__version__}] ****\n")
    save_metadata(config, version=sklearn.__version__)

    is_classification = config.type == 'classification'

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    tuning_params = config.framework_params.get('_tuning', training_params)
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    log.info(
        "Running RandomForest with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, n_jobs))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    metric = dict(
        acc='accuracy',
        auc='roc_auc',
        f1='f1',
        logloss='neg_log_loss',
        mae='neg_mean_absolute_error',
        mse='neg_mean_squared_error',
        r2='r2',
        rmse='neg_root_mean_squared_error',
    )[config.metric]

    n_features = X_train.shape[1]
    default_value = max(1, int(math.sqrt(n_features)))
    below_default = pick_values_uniform(start=1,
                                        end=default_value,
                                        length=5 + 1)[:-1]  # 5 below
    above_default = pick_values_uniform(start=default_value,
                                        end=n_features,
                                        length=10 + 1 - len(below_default))[1:]  # 5 above
    # Mix up the order of `max_features` to try, so that a fair range is tried even if we have too little time
    # to try all possible values. Order: [sqrt(p), 1, p, random order for remaining values]
    # max_features_to_try = below_default[1:] + above_default[:-1]
    # max_features_values = ([default_value, 1, n_features]
    #                        + random.sample(max_features_to_try, k=len(max_features_to_try)))
    max_features_values = [default_value] + below_default + above_default
    # Define up to how much of the total time we spend 'optimizing' `max_features`.
    # (the remainder is used for fitting the final model).
    safety_factor = 0.85
    with stopit.ThreadingTimeout(seconds=int(config.max_runtime_seconds *
                                             safety_factor)):
        log.info("Evaluating multiple values for `max_features`: %s.",
                 max_features_values)
        max_feature_scores = []
        tuning_durations = []
        for i, max_features_value in enumerate(max_features_values):
            log.info("[{:2d}/{:2d}] Evaluating max_features={}".format(
                i + 1, len(max_features_values), max_features_value))
            imputation = SimpleImputer()
            random_forest = estimator(n_jobs=n_jobs,
                                      random_state=config.seed,
                                      max_features=max_features_value,
                                      **tuning_params)
            pipeline = Pipeline(steps=[('preprocessing', imputation),
                                       ('learning', random_forest)])
            with utils.Timer() as cv_scoring:
                try:
                    scores = cross_val_score(estimator=pipeline,
                                             X=dataset.train.X_enc,
                                             y=dataset.train.y_enc,
                                             scoring=metric,
                                             cv=5)
                    max_feature_scores.append(
                        (statistics.mean(scores), max_features_value))
                except stopit.utils.TimeoutException as toe:
                    log.error(
                        "Failed CV scoring for max_features=%s : Timeout",
                        max_features_value)
                    tuning_durations.append(
                        (max_features_value, cv_scoring.duration))
                    raise toe
                except Exception as e:
                    log.error("Failed CV scoring for max_features=%s :\n%s",
                              max_features_value, e)
                    log.debug("Exception:", exc_info=True)
            tuning_durations.append((max_features_value, cv_scoring.duration))

    log.info("Tuning scores:\n%s", sorted(max_feature_scores))
    log.info("Tuning durations:\n%s", sorted(tuning_durations))
    _, best_max_features_value = (max(max_feature_scores)
                                  if len(max_feature_scores) > 0
                                  else (math.nan, 'auto'))
    log.info("Training final model with `max_features={}`.".format(
        best_max_features_value))
    rf = estimator(n_jobs=n_jobs,
                   random_state=config.seed,
                   max_features=best_max_features_value,
                   **training_params)
    with utils.Timer() as training:
        rf.fit(X_train, y_train)

    with utils.Timer() as predict:
        predictions = rf.predict(X_test)
    probabilities = rf.predict_proba(X_test) if is_classification else None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(rf),
                  training_duration=training.duration +
                  sum(map(lambda t: t[1], tuning_durations)),
                  predict_duration=predict.duration)
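
`pick_values_uniform` is not defined in this snippet. From the calls above it appears to return `length` integer candidates spread evenly between `start` and `end` inclusive; a hypothetical implementation consistent with that usage might be:

import numpy as np

def pick_values_uniform(start, end, length):
    # hypothetical sketch: `length` evenly spaced integer values between start and end (inclusive)
    return [int(round(v)) for v in np.linspace(start, end, num=length)]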
Example #10: FEDOT
def run(dataset, config):
    log.info("\n**** FEDOT ****\n")

    is_classification = config.type == 'classification'
    # Mapping of benchmark metrics to FEDOT metrics
    metrics_mapping = dict(acc='accuracy',
                           auc='roc_auc',
                           f1='f1',
                           logloss='neg_log_loss',
                           mae='neg_mean_absolute_error',
                           mse='neg_mean_squared_error',
                           msle='neg_mean_squared_log_error',
                           r2='r2',
                           rmse='neg_mean_squared_error')
    scoring_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    if is_classification:
        metric = ClassificationMetricsEnum.ROCAUC
        task_type = TaskTypesEnum.classification
    else:
        metric = RegressionMetricsEnum.RMSE
        task_type = TaskTypesEnum.regression

    task = Task(task_type)

    x_train, y_train = shuffle(dataset.train.X_enc,
                               dataset.train.y_enc,
                               random_state=0)

    if len(y_train.shape) > 1 and y_train.shape[1] == 1:
        y_train = squeeze(y_train, axis=1)

    x_test = dataset.test.X_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    dataset_to_compose = \
        InputData(idx=[_ for _ in range(len(y_train))],
                  features=x_train,
                  target=y_train,
                  task=task,
                  data_type=DataTypesEnum.table)

    dataset_to_test = \
        InputData(idx=[_ for _ in range(len(x_test))],
                  features=x_test,
                  target=None,
                  task=task,
                  data_type=DataTypesEnum.table)

    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    log.info(
        'Running FEDOT with a maximum time of %ss on %s cores, optimizing %s.',
        config.max_runtime_seconds, n_jobs, scoring_metric)
    runtime_min = (config.max_runtime_seconds / 60)

    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=task.task_type)

    metric_function = MetricsRepository().metric_by_id(metric)

    if True:
        with utils.Timer() as training:
            # the choice and initialisation of the GP search
            composer_requirements = GPComposerRequirements(
                primary=available_model_types,
                secondary=available_model_types,
                max_arity=3,
                max_depth=2,
                max_lead_time=datetime.timedelta(minutes=runtime_min * 0.8))

            # GP optimiser parameters choice
            scheme_type = GeneticSchemeTypesEnum.parameter_free
            optimiser_parameters = GPChainOptimiserParameters(
                genetic_scheme_type=scheme_type)

            # Create builder for composer and set composer params
            builder = GPComposerBuilder(task=task).with_requirements(
                composer_requirements).with_metrics(
                    metric_function).with_optimiser_parameters(
                        optimiser_parameters)

            composer = builder.build()

            # the optimal chain generation by composition - the most time-consuming task
            chain_evo_composed = composer.compose_chain(
                data=dataset_to_compose, is_visualise=False)

    else:
        with utils.Timer() as training:
            if is_classification:
                chain_evo_composed = Chain(PrimaryNode('logit'))
            else:
                chain_evo_composed = Chain(PrimaryNode('lasso'))

    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=False)

    log.info('Predicting on the test set.')
    y_test = dataset.test.y_enc
    predictions = chain_evo_composed.predict(dataset_to_test,
                                             output_mode='labels').predict

    if not is_classification:
        probabilities = None
    else:
        probabilities = chain_evo_composed.predict(
            dataset_to_test, output_mode='full_probs').predict

    save_artifacts(chain_evo_composed, config)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=1,
                  training_duration=training.duration)
Example #11: mljar-supervised
def run(dataset, config):
    log.info(f"\n**** mljar-supervised [v{supervised.__version__}] ****\n")
    save_metadata(config, version=supervised.__version__)

    # Mapping of benchmark metrics to MLJAR metrics
    metrics_mapping = dict(auc='auc', logloss='logloss', rmse='rmse')
    eval_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else "auto"

    # Mapping of benchmark task to MLJAR ML task
    problem_mapping = dict(
        binary="binary_classification",
        multiclass="multiclass_classification",
        regression="regression",
    )
    ml_task = problem_mapping.get(
        dataset.problem_type
    )  # if None, AutoML will guess the ML task
    is_classification = config.type == "classification"
    results_path = output_subdir("results", config)
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith("_")
    }

    column_names, _ = zip(*dataset.columns)
    column_types = dict(dataset.columns)
    label = dataset.target.name

    train = pd.DataFrame(dataset.train.data,
                         columns=column_names).astype(column_types, copy=False)
    X_train = train.drop(columns=label)
    y_train = train[label]

    test = pd.DataFrame(dataset.test.data,
                        columns=column_names).astype(column_types, copy=False)
    X_test = test.drop(columns=label)
    y_test = test[label]

    automl = AutoML(results_path=results_path,
                    total_time_limit=config.max_runtime_seconds,
                    random_state=config.seed,
                    ml_task=ml_task,
                    eval_metric=eval_metric,
                    **training_params)

    with utils.Timer() as training:
        automl.fit(X_train, y_train)

    with utils.Timer() as predict:
        preds = automl.predict_all(X_test)

    predictions, probabilities = None, None
    if is_classification:
        predictions = preds["label"].values
        cols = [f"prediction_{c}" for c in np.unique(y_train)]
        probabilities = preds[cols].values
    else:
        predictions = preds["prediction"].values

    # clean the results
    if not config.framework_params.get("_save_artifacts", False):
        shutil.rmtree(results_path, ignore_errors=True)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  models_count=len(automl._models),
                  training_duration=training.duration,
                  predict_duration=predict.duration)
Example #12: GAMA
def run(dataset, config):
    log.info("\n**** GAMA [v%s] ****", __version__)
    log.info("sklearn == %s", sklearn.__version__)
    log.info("category_encoders == %s", category_encoders.__version__)

    is_classification = (config.type == 'classification')
    # Mapping of benchmark metrics to GAMA metrics
    metrics_mapping = dict(
        acc='accuracy',
        auc='roc_auc',
        f1='f1',
        logloss='neg_log_loss',
        mae='neg_mean_absolute_error',
        mse='neg_mean_squared_error',
        msle='neg_mean_squared_log_error',
        r2='r2',
        rmse='neg_mean_squared_error',
    )
    scoring_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    log.info(
        'Running GAMA with a maximum time of %ss on %s cores, optimizing %s.',
        config.max_runtime_seconds, n_jobs, scoring_metric)

    estimator = GamaClassifier if is_classification else GamaRegressor
    kwargs = dict(n_jobs=n_jobs,
                  max_total_time=config.max_runtime_seconds,
                  scoring=scoring_metric,
                  random_state=config.seed,
                  **training_params)
    version_leq_20_2_0 = version.parse(__version__) <= version.parse('20.2.0')
    if version_leq_20_2_0:
        log_file = touch(
            os.path.join(output_subdir('logs', config), 'gama.log'))
        kwargs['keep_analysis_log'] = log_file
    else:
        kwargs['max_memory_mb'] = config.max_mem_size_mb
        kwargs['output_directory'] = output_subdir('logs', config)

    gama_automl = estimator(**kwargs)

    X_train, y_train = dataset.train.X, dataset.train.y
    # data = file_to_pandas(dataset.train.path, encoding='utf-8')
    # X_train, y_train = data.loc[:, data.columns != dataset.target], data.loc[:, dataset.target]

    with Timer() as training_timer:
        gama_automl.fit(X_train, y_train)

    log.info('Predicting on the test set.')
    X_test, y_test = dataset.test.X, dataset.test.y
    # data = file_to_pandas(dataset.test.path, encoding='utf-8')
    # X_test, y_test = data.loc[:, data.columns != dataset.target], data.loc[:, dataset.target]

    with Timer() as predict_timer:
        predictions = gama_automl.predict(X_test)
    if is_classification:
        probabilities = gama_automl.predict_proba(X_test)
    else:
        probabilities = None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  probabilities=probabilities,
                  truth=y_test,
                  target_is_encoded=False,
                  models_count=len(gama_automl._final_pop),
                  training_duration=training_timer.duration,
                  predict_duration=predict_timer.duration)
Example #13: genens
def run(dataset, config):
    log.info("\n**** genens ****\n")

    is_classification = config.type == 'classification'

    if not is_classification:
        log.warning("Regression is not supported.")
        return None

    # Mapping of benchmark metrics to TPOT metrics
    metrics_mapping = {
        'acc': get_scorer('accuracy'),
        'auc': get_scorer('roc_auc'),
        'f1': get_scorer('f1'),
        'logloss': get_scorer('neg_log_loss'),
        'mae': get_scorer('neg_mean_absolute_error'),
        'mse': get_scorer('neg_mean_squared_error'),
        'msle': get_scorer('neg_mean_squared_log_error'),
        'r2': get_scorer('r2')
    }
    scoring_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    sample_size = config.framework_params.get('_sample_size', None)
    if sample_size is not None:
        evaluator = SampleCrossValEvaluator(sample_size=sample_size,
                                            per_gen=True,
                                            cv_k=5)
    else:
        evaluator = CrossValEvaluator(cv_k=5)

    print(f"Chosen sample size: {sample_size}.")
    print(f'cv_k: {evaluator.cv_k}')

    training_params['evaluator'] = evaluator

    runtime_s = config.max_runtime_seconds
    if runtime_s >= 600:
        runtime_s -= 5 * 60  # avoid premature process termination
    elif runtime_s > 10:
        runtime_s -= 5

    if not config.framework_params.get('disable_logging', True):
        log_path = os.path.join(output_subdir('logs', config),
                                'evo_log_file.txt')
    else:
        log_path = None

    print(f"Setting time limit to {runtime_s} seconds.")

    log.info(
        'Running genens with a maximum time of %ss on %s cores, optimizing %s.',
        runtime_s, n_jobs, scoring_metric)

    if config.seed is not None:
        # random state is yet to be unified in genens
        np.random.seed(config.seed)
        random.seed(config.seed)

    print(f'Training params: {training_params}')

    estimator = GenensClassifier if is_classification else GenensRegressor
    genens_est = estimator(n_jobs=n_jobs,
                           max_evo_seconds=runtime_s,
                           scorer=scoring_metric,
                           log_path=log_path,
                           **training_params)

    with utils.Timer() as training:
        genens_est.fit(X_train, y_train)

    log.info('Predicting on the test set.')

    best_pipe = genens_est.get_best_pipelines()[0]
    best_pipe.fit(X_train, y_train)

    predictions = best_pipe.predict(X_test)

    try:
        probabilities = best_pipe.predict_proba(
            X_test) if is_classification else None
    except AttributeError:
        target_values_enc = dataset.target.label_encoder.transform(
            dataset.target.values)
        probabilities = utils.Encoder(
            'one-hot', target=False,
            encoded_type=float).fit(target_values_enc).transform(predictions)

    save_artifacts(genens_est, config)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(genens_est.get_best_pipelines()),
                  training_duration=training.duration)
Example #14: Oboe
def run(dataset, config):
    log.info(f"\n**** Oboe [{config.framework_version}] ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        # regression currently fails (as of 26.02.2019: still under development by the oboe team)
        raise ValueError('Regression is not yet supported (under development).')

    X_train = dataset.train.X
    y_train = dataset.train.y

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_cores = config.framework_params.get('_n_cores', config.cores)

    log.info('Running oboe with a maximum time of {}s on {} cores.'.format(
        config.max_runtime_seconds, n_cores))
    log.warning(
        'We completely ignore the advice to optimize towards metric: {}.'.format(config.metric))

    aml = AutoLearner(
        p_type='classification' if is_classification else 'regression',
        n_cores=n_cores,
        runtime_limit=config.max_runtime_seconds,
        **training_params)

    def aml_models():
        return [aml.ensemble, *aml.ensemble.base_learners] if len(aml.ensemble.base_learners) > 0 else []

    with Timer() as training:
        try:
            aml.fit(X_train, y_train)
        except IndexError as e:
            # incorrect handling of some IndexError in oboe if ensemble is empty
            if len(aml_models()) == 0:
                raise ValueError("Oboe could not produce any model in the requested time.")
            raise e

    log.info('Predicting on the test set.')
    X_test = dataset.test.X
    y_test = dataset.test.y
    with Timer() as predict:
        predictions = aml.predict(X_test)
    predictions = predictions.reshape(len(X_test))

    if is_classification:
        probabilities = "predictions"  # encoding is handled by caller in `__init__.py`
    else:
        probabilities = None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(aml_models()),
                  training_duration=training.duration,
                  predict_duration=predict.duration)
Example #15: auto-sklearn
def run(dataset, config):
    log.info("\n**** AutoSklearn ****\n")
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=DeprecationWarning)

    is_classification = config.type == 'classification'

    # Mapping of benchmark metrics to autosklearn metrics
    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        rmse=metrics.mean_squared_error,  # autosklearn can optimize on mse, and we compute rmse independently on predictions
        r2=metrics.r2)
    perf_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    # Set resources based on datasize
    log.info(
        "Running auto-sklearn with a maximum time of %ss on %s cores with %sMB, optimizing %s.",
        config.max_runtime_seconds, config.cores, config.max_mem_size_mb,
        perf_metric)
    log.info("Environment: %s", os.environ)

    X_train = dataset.train.X_enc
    y_train = dataset.train.y_enc
    predictors_type = dataset.predictors_type
    log.info("predictors_type=%s", predictors_type)
    # log.info("finite=%s", np.isfinite(X_train))

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    ml_memory_limit = config.framework_params.get('_ml_memory_limit', 'auto')
    ensemble_memory_limit = config.framework_params.get(
        '_ensemble_memory_limit', 'auto')

    # when memory is large enough, we should have:
    # (cores - 1) * ml_memory_limit_mb + ensemble_memory_limit_mb = config.max_mem_size_mb
    total_memory_mb = system_memory_mb().total
    if ml_memory_limit == 'auto':
        ml_memory_limit = max(
            min(config.max_mem_size_mb, math.ceil(total_memory_mb / n_jobs)),
            3072)  # 3072 is autosklearn defaults
    if ensemble_memory_limit == 'auto':
        ensemble_memory_limit = max(
            math.ceil(ml_memory_limit -
                      (total_memory_mb - config.max_mem_size_mb)),
            math.ceil(ml_memory_limit / 3),  # default proportions
            1024)  # 1024 is autosklearn defaults
    log.info(
        "Using %sMB memory per ML job and %sMB for ensemble job on a total of %s jobs.",
        ml_memory_limit, ensemble_memory_limit, n_jobs)

    log.warning(
        "Using meta-learned initialization, which might be bad (leakage).")
    # TODO: do we need to set per_run_time_limit too?
    estimator = AutoSklearnClassifier if is_classification else AutoSklearnRegressor
    auto_sklearn = estimator(
        time_left_for_this_task=config.max_runtime_seconds,
        n_jobs=n_jobs,
        ml_memory_limit=ml_memory_limit,
        ensemble_memory_limit=ensemble_memory_limit,
        seed=config.seed,
        **training_params)
    with Timer() as training:
        auto_sklearn.fit(X_train,
                         y_train,
                         metric=perf_metric,
                         feat_type=predictors_type)

    # Convert output to strings for classification
    log.info("Predicting on the test set.")
    X_test = dataset.test.X_enc
    y_test = dataset.test.y_enc
    predictions = auto_sklearn.predict(X_test)
    probabilities = auto_sklearn.predict_proba(
        X_test) if is_classification else None

    save_artifacts(auto_sklearn, config)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(auto_sklearn.get_models_with_weights()),
                  training_duration=training.duration)
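
To make the 'auto' memory-limit arithmetic above concrete, here is one worked example with assumed numbers (not taken from any benchmark config): `config.max_mem_size_mb = 16000`, total system memory 32000 MB, `n_jobs = 4`.

import math

max_mem_size_mb = 16000   # assumed config.max_mem_size_mb
total_memory_mb = 32000   # assumed system_memory_mb().total
n_jobs = 4

ml_memory_limit = max(min(max_mem_size_mb, math.ceil(total_memory_mb / n_jobs)), 3072)
# max(min(16000, 8000), 3072) -> 8000 MB per ML job

ensemble_memory_limit = max(
    math.ceil(ml_memory_limit - (total_memory_mb - max_mem_size_mb)),
    math.ceil(ml_memory_limit / 3),
    1024)
# max(ceil(8000 - 16000), ceil(8000 / 3), 1024) = max(-8000, 2667, 1024) -> 2667 MB for the ensemble job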
Example #16: SNN (PyTorch neural network)
def run(dataset, config):

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    X_train = X_train.astype('float32')
    y_train = y_train.astype('float32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('float32')

    log.info("Running SNN")
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".format(config.metric))

    estimator = NNClassifier if is_classification else None
    (_, y_train_counts) = np.unique(y_train, return_counts=True)
    n_input = len(X_train[0])
    n_output = len(y_train_counts)
    n_hidden = [n_input * 20] * 9  # nine hidden layers, each 20x the input width

    print(n_input, len(X_train))

    if n_output > 2:
        loss = nn.CrossEntropyLoss()
    else:
        loss = nn.BCEWithLogitsLoss()
        n_output = 1
    # loss = nn.CrossEntropyLoss()

    mlp = MLP(n_input=n_input, n_hidden=n_hidden, n_output=n_output)

    batch_size = 5
    epochs = 500

    net = estimator(mlp,
                    batch_size=batch_size,
                    optimizer=torch.optim.SGD,
                    lr=0.01,
                    loss=loss,
                    device='cuda:0',
                    max_epochs=epochs)

    with utils.Timer() as training:
        net.fit(X_train, y_train)

    with utils.Timer() as predict:
        predictions = []
        idx = 0
        count = 0
        for x in X_test:
            x = torch.Tensor(x).to('cuda:0')
            pred_x = net.predict(x).to('cpu').tolist()
            predictions.append(pred_x)

            # if int(y_test[idx]) == int(pred_x):
            #   count += 1
            # idx += 1

    xx_test = torch.Tensor(X_test).to('cuda:0')
    probabilities = net.predict_proba(xx_test).detach().to('cpu').tolist() if is_classification else None

    res = [int(v) for row in y_test for v in row]  # flatten y_test into a list of int labels
    # print(res, predictions)
    if n_output == 1:
        auc = roc_auc_score(y_test, predictions)
        print('TEST AUC ', auc)
        print('TEST LOGLOSS ', log_loss(res, predictions))
    else:
        auc_ovo = roc_auc_score(res, probabilities, multi_class='ovo')
        auc_ovr = roc_auc_score(res, probabilities, multi_class='ovr')
        print('TEST AUC OVO {} OVR {}'.format(auc_ovo, auc_ovr))
        print('TEST LOGLOSS ', log_loss(res, probabilities))

    print('TRAIN AND TEST FINISHED')

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  training_duration=training.duration,
                  predict_duration=predict.duration)
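
The per-row prediction loop in this example is slow; since `net.predict_proba` is already called with the full test tensor, `net.predict` presumably accepts a batch as well (an assumption, not verified against the NNClassifier wrapper used here). A batched alternative would look like:

# sketch: batched prediction, assuming net.predict accepts a 2-D tensor like net.predict_proba does
with torch.no_grad():
    batch = torch.Tensor(X_test).to('cuda:0')
    predictions = net.predict(batch).to('cpu').tolist()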
Example #17: LightAutoML
def run(dataset, config):
    log.info(f"\n**** lightautoml (R) [{__version__}] ****\n")

    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=DeprecationWarning)

    is_classification = config.type == 'classification'

    label = dataset.target.name
    df_train = dataset.train.data

    max_mem_size_gb = float(config.max_mem_size_mb) / 1024
    task = Task(dataset.problem_type
                if dataset.problem_type != 'regression' else 'reg')
    automl = TabularUtilizedAutoML(task=task,
                                   timeout=config.max_runtime_seconds,
                                   cpu_limit=config.cores,
                                   memory_limit=max_mem_size_gb,
                                   random_state=config.seed)

    log.info("Training...")
    with Timer() as training:
        automl.fit_predict(train_data=df_train, roles={'target': label})

    X_test, y_test = dataset.test.X, dataset.test.y
    log.info("Predicting on the test set...")
    with Timer() as predict:
        preds = automl.predict(X_test).data

    probabilities_labels = None
    if is_classification:
        probabilities = preds

        if dataset.problem_type == 'binary':
            probabilities = np.vstack(
                [1 - probabilities[:, 0], probabilities[:, 0]]).T

        predictions = np.argmax(probabilities, axis=1)
        class_map = automl.outer_pipes[0].ml_algos[0].models[0][0].reader.class_mapping
        if class_map is None and df_train[label].dtype == bool:
            class_map = {False: 0, True: 1}
        if class_map:
            column_to_class = {
                col: class_
                for class_, col in class_map.items()
            }
            predictions = list(map(column_to_class.get, predictions))
            probabilities_labels = [
                column_to_class[col] for col in sorted(column_to_class)
            ]
    else:
        probabilities = None
        predictions = preds

    log.debug(probabilities)
    log.debug(config.output_predictions_file)

    save_artifacts(automl, config)

    return result(
        output_file=config.output_predictions_file,
        probabilities_labels=probabilities_labels,
        probabilities=probabilities,
        predictions=predictions,
        training_duration=training.duration,
        predict_duration=predict.duration,
    )
Example #18: AutoGluon (TabularPredictor)
def run(dataset, config):
    log.info(f"\n**** AutoGluon [v{__version__}] ****\n")

    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        r2=metrics.r2,
        rmse=metrics.root_mean_squared_error,
    )

    perf_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    is_classification = config.type == 'classification'
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    train, test = dataset.train.path, dataset.test.path
    label = dataset.target.name
    problem_type = dataset.problem_type

    models_dir = tempfile.mkdtemp() + os.sep  # passed to AG

    with Timer() as training:
        predictor = TabularPredictor(
            label=label,
            eval_metric=perf_metric.name,
            path=models_dir,
            problem_type=problem_type,
        ).fit(train_data=train,
              time_limit=config.max_runtime_seconds,
              **training_params)

    del train

    if is_classification:
        with Timer() as predict:
            probabilities = predictor.predict_proba(test, as_multiclass=True)
        predictions = probabilities.idxmax(axis=1).to_numpy()
    else:
        with Timer() as predict:
            predictions = predictor.predict(test, as_pandas=False)
        probabilities = None

    prob_labels = probabilities.columns.values.astype(str).tolist() if probabilities is not None else None

    _leaderboard_extra_info = config.framework_params.get(
        '_leaderboard_extra_info',
        False)  # whether to get extra model info (very verbose)
    _leaderboard_test = config.framework_params.get(
        '_leaderboard_test',
        False)  # whether to compute test scores in leaderboard (expensive)
    leaderboard_kwargs = dict(silent=True, extra_info=_leaderboard_extra_info)
    # Disabled leaderboard test data input by default to avoid long running computation, remove 7200s timeout limitation to re-enable
    if _leaderboard_test:
        leaderboard_kwargs['data'] = test

    leaderboard = predictor.leaderboard(**leaderboard_kwargs)
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None, 'display.width', 1000):
        log.info(leaderboard)

    num_models_trained = len(leaderboard)
    if predictor._trainer.model_best is not None:
        num_models_ensemble = len(
            predictor._trainer.get_minimum_model_set(
                predictor._trainer.model_best))
    else:
        num_models_ensemble = 1

    save_artifacts(predictor, leaderboard, config)
    shutil.rmtree(predictor.path, ignore_errors=True)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  probabilities=probabilities,
                  probabilities_labels=prob_labels,
                  target_is_encoded=False,
                  models_count=num_models_trained,
                  models_ensemble_count=num_models_ensemble,
                  training_duration=training.duration,
                  predict_duration=predict.duration)
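
A toy sketch (made-up probabilities and class labels) of how the classification branch above turns the predict_proba output into predictions and probability labels: idxmax over the class columns picks the most likely label per row, and the column names become probabilities_labels.

import pandas as pd

probabilities = pd.DataFrame(
    [[0.7, 0.2, 0.1],
     [0.1, 0.3, 0.6]],
    columns=["cat", "dog", "fish"],  # hypothetical class labels
)
predictions = probabilities.idxmax(axis=1).to_numpy()            # array(['cat', 'fish'], dtype=object)
prob_labels = probabilities.columns.values.astype(str).tolist()  # ['cat', 'dog', 'fish']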
Example #19
def run(dataset, config):
    log.info("\n**** Hyperopt-sklearn ****\n")

    is_classification = config.type == 'classification'

    default = lambda: 0
    metrics_to_loss_mapping = dict(
        acc=(default, False),  # lambda y, pred: 1.0 - accuracy_score(y, pred)
        auc=(lambda y, pred: 1.0 - roc_auc_score(y, pred), False),
        f1=(lambda y, pred: 1.0 - f1_score(y, pred), False),
        # logloss=(log_loss, True),
        mae=(mean_absolute_error, False),
        mse=(mean_squared_error, False),
        msle=(mean_squared_log_error, False),
        r2=(default, False),  # lambda y, pred: 1.0 - r2_score(y, pred)
        rmse=(mean_squared_error, False),
    )
    loss_fn, continuous_loss_fn = metrics_to_loss_mapping[
        config.metric] if config.metric in metrics_to_loss_mapping else (None,
                                                                         False)
    if loss_fn is None:
        log.warning("Performance metric %s not supported: defaulting to %s.",
                    config.metric, 'accuracy' if is_classification else 'r2')
    if loss_fn is default:
        loss_fn = None

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    log.warning("Ignoring cores constraint of %s cores.", config.cores)
    log.info(
        "Running hyperopt-sklearn with a maximum time of %ss on %s cores, optimizing %s.",
        config.max_runtime_seconds, 'all', config.metric)

    X_train = dataset.train.X_enc
    y_train = dataset.train.y_enc

    if is_classification:
        classifier = any_classifier('clf')
        regressor = None
    else:
        classifier = None
        regressor = any_regressor('rgr')

    estimator = HyperoptEstimator(classifier=classifier,
                                  regressor=regressor,
                                  algo=tpe.suggest,
                                  loss_fn=loss_fn,
                                  continuous_loss_fn=continuous_loss_fn,
                                  trial_timeout=config.max_runtime_seconds,
                                  seed=config.seed,
                                  **training_params)

    with InterruptTimeout(config.max_runtime_seconds * 4 / 3,
                          sig=signal.SIGQUIT):
        with InterruptTimeout(config.max_runtime_seconds,
                              before_interrupt=ft.partial(
                                  kill_proc_tree,
                                  timeout=5,
                                  include_parent=False)):
            with Timer() as training:
                estimator.fit(X_train, y_train)

    log.info('Predicting on the test set.')
    X_test = dataset.test.X_enc
    y_test = dataset.test.y_enc
    predictions = estimator.predict(X_test)

    if is_classification:
        probabilities = "predictions"  # encoding is handled by caller in `__init__.py`
    else:
        probabilities = None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(estimator.trials),
                  training_duration=training.duration)
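
The mapping above flips score-type metrics into losses because HyperoptEstimator minimizes loss_fn; error metrics pass through unchanged, and rmse simply reuses mean_squared_error since minimizing MSE also minimizes RMSE. A toy illustration with made-up labels and scores:

from sklearn.metrics import mean_absolute_error, roc_auc_score

y_true = [0, 1, 1, 0]
y_scores = [0.2, 0.9, 0.4, 0.1]

auc_loss = 1.0 - roc_auc_score(y_true, y_scores)  # higher-is-better score flipped into a loss
mae_loss = mean_absolute_error(y_true, y_scores)  # error metrics are already losses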
Example #20
def run(dataset, config):
    askl_method_version = 2 if config.framework_params.get('_askl2',
                                                           False) else 1
    askl_string = "Auto-sklearn2.0" if askl_method_version == 2 else "Auto-sklearn"

    log.info(f"\n**** {askl_string} [v{autosklearn.__version__}]****\n")
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=DeprecationWarning)

    is_classification = config.type == 'classification'
    dataset_name = config.name

    # Mapping of benchmark metrics to autosklearn metrics
    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        rmse=metrics.mean_squared_error if askl_version < version.parse("0.10")
        else metrics.root_mean_squared_error,
        r2=metrics.r2)
    perf_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    # Set resources based on datasize
    log.info(
        "Running %s for %s with a maximum time of %ss on %s cores with %sMB, optimizing %s.",
        askl_string,
        dataset_name,
        config.max_runtime_seconds,
        config.cores,
        config.max_mem_size_mb,
        perf_metric,
    )
    log.info("Environment: %s", os.environ)

    X_train = dataset.train.X
    y_train = dataset.train.y
    predictors_type = dataset.predictors_type
    log.debug("predictors_type=%s", predictors_type)

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    ml_memory_limit = config.framework_params.get('_ml_memory_limit', 'auto')

    constr_params = {}
    fit_extra_params = {'dataset_name': dataset_name}

    total_memory_mb = system_memory_mb().total
    if ml_memory_limit == 'auto':
        ml_memory_limit = max(
            min(config.max_mem_size_mb / n_jobs,
                math.ceil(total_memory_mb / n_jobs)),
            3072  # 3072 is the autosklearn default and we use it as a lower bound
        )
    if isinstance(
            askl_version,
            version.LegacyVersion) or askl_version >= version.parse("0.11"):
        log.info("Using %sMB memory per job and on a total of %s jobs.",
                 ml_memory_limit, n_jobs)
        constr_params["memory_limit"] = ml_memory_limit
    else:
        ensemble_memory_limit = config.framework_params.get(
            '_ensemble_memory_limit', 'auto')
        # when memory is large enough, we should have:
        # (cores - 1) * ml_memory_limit_mb + ensemble_memory_limit_mb = config.max_mem_size_mb
        if ensemble_memory_limit == 'auto':
            ensemble_memory_limit = max(
                math.ceil(ml_memory_limit -
                          (total_memory_mb - config.max_mem_size_mb)),
                math.ceil(ml_memory_limit / 3),  # default proportions
                1024)  # 1024 is the autosklearn default
        log.info(
            "Using %sMB memory per ML job and %sMB for the ensemble job on a total of %s jobs.",
            ml_memory_limit, ensemble_memory_limit, n_jobs)
        constr_params["ml_memory_limit"] = ml_memory_limit
        constr_params["ensemble_memory_limit"] = ensemble_memory_limit

    log.warning(
        "Using meta-learned initialization, which might be bad (leakage).")
    if is_classification:
        estimator = AutoSklearn2Classifier if askl_method_version == 2 else AutoSklearnClassifier
    else:
        if askl_method_version == 2:
            log.warning(
                '%s does not support regression, falling back to regular Auto-sklearn!',
                askl_string,
            )
        estimator = AutoSklearnRegressor

    if isinstance(
            askl_version,
            version.LegacyVersion) or askl_version >= version.parse("0.8"):
        constr_params['metric'] = perf_metric
    else:
        fit_extra_params['metric'] = perf_metric

    constr_params["time_left_for_this_task"] = config.max_runtime_seconds
    constr_params["n_jobs"] = n_jobs
    constr_params["seed"] = config.seed

    log.info("%s constructor arguments: %s", askl_string, constr_params)
    log.info("%s additional constructor arguments: %s", askl_string,
             training_params)
    log.info("%s fit() arguments: %s", askl_string, fit_extra_params)

    auto_sklearn = estimator(**constr_params, **training_params)
    with Timer() as training:
        auto_sklearn.fit(X_train,
                         y_train,
                         feat_type=predictors_type,
                         **fit_extra_params)

    # Class probabilities are only computed for classification tasks.
    log.info("Predicting on the test set.")
    X_test = dataset.test.X
    y_test = dataset.test.y
    with Timer() as predict:
        predictions = auto_sklearn.predict(X_test)
    probabilities = auto_sklearn.predict_proba(
        X_test) if is_classification else None

    save_artifacts(auto_sklearn, config)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(auto_sklearn.get_models_with_weights()),
                  training_duration=training.duration,
                  predict_duration=predict.duration)
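
A worked example of the memory split computed above for older auto-sklearn versions, using assumed numbers for the benchmark budget, the machine's physical memory, and the number of jobs:

import math

max_mem_size_mb = 32768  # benchmark memory budget (assumed)
total_memory_mb = 65536  # physical memory on the machine (assumed)
n_jobs = 8

ml_memory_limit = max(
    min(max_mem_size_mb / n_jobs, math.ceil(total_memory_mb / n_jobs)),
    3072)                                                # -> 4096.0
ensemble_memory_limit = max(
    math.ceil(ml_memory_limit - (total_memory_mb - max_mem_size_mb)),
    math.ceil(ml_memory_limit / 3),
    1024)                                                # -> 1366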
Example #21
def run(dataset, config):
    log.info(
        f"\n**** Stacking Ensemble [sklearn v{sklearn.__version__}] ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config
    estimators_params = {
        e: config.framework_params.get(f'_{e}_params', {})
        for e in ['rf', 'gbm', 'linear', 'svc', 'final']
    }

    log.info(
        "Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores."
        .format(config.max_runtime_seconds, n_jobs))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".
        format(config.metric))

    if is_classification:
        estimator = StackingClassifier(
            estimators=[
                ('rf',
                 RandomForestClassifier(n_jobs=n_jobs,
                                        random_state=config.seed,
                                        **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingClassifier(random_state=config.seed,
                                            **estimators_params['gbm'])),
                ('linear',
                 SGDClassifier(n_jobs=n_jobs,
                               random_state=config.seed,
                               **estimators_params['linear'])),
                # ('svc', LinearSVC(random_state=config.seed, **estimators_params['svc']))
            ],
            # final_estimator=SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']),
            final_estimator=LogisticRegression(n_jobs=n_jobs,
                                               random_state=config.seed,
                                               **estimators_params['final']),
            stack_method='predict_proba',
            n_jobs=n_jobs,
            **training_params)
    else:
        estimator = StackingRegressor(
            estimators=[
                ('rf',
                 RandomForestRegressor(n_jobs=n_jobs,
                                       random_state=config.seed,
                                       **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingRegressor(random_state=config.seed,
                                           **estimators_params['gbm'])),
                ('linear',
                 SGDRegressor(random_state=config.seed,
                              **estimators_params['linear'])),
                ('svc',
                 LinearSVR(random_state=config.seed,
                           **estimators_params['svc']))
            ],
            # final_estimator=SGDRegressor(random_state=config.seed, **estimators_params['final']),
            final_estimator=LinearRegression(n_jobs=n_jobs,
                                             random_state=config.seed,
                                             **estimators_params['final']),
            n_jobs=n_jobs,
            **training_params)

    with utils.Timer() as training:
        estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    probabilities = estimator.predict_proba(
        X_test) if is_classification else None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(estimator.estimators_) + 1,
                  training_duration=training.duration)
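
A minimal sketch, with hypothetical framework_params, of how the per-estimator parameters above are extracted: keys like '_rf_params' feed the matching base estimator, while keys without a leading underscore go to the stacking estimator itself.

framework_params = {
    '_n_jobs': 4,
    '_rf_params': {'n_estimators': 200},
    '_final_params': {'max_iter': 500},
    'cv': 5,
}
estimators_params = {
    e: framework_params.get(f'_{e}_params', {})
    for e in ['rf', 'gbm', 'linear', 'svc', 'final']
}
training_params = {k: v for k, v in framework_params.items() if not k.startswith('_')}
# estimators_params['rf'] -> {'n_estimators': 200}; estimators_params['gbm'] -> {}
# training_params -> {'cv': 5}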
Example #22
def run(dataset, config):
    log.info(f"\n**** AutoGluon [v{__version__}] ****\n")
    save_metadata(config, version=__version__)

    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        r2=metrics.r2,
        rmse=metrics.root_mean_squared_error,
    )

    label = dataset.target.name
    problem_type = dataset.problem_type

    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    is_classification = config.type == 'classification'
    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}

    load_raw = config.framework_params.get('_load_raw', False)
    if load_raw:
        train, test = load_data_raw(dataset=dataset)
    else:
        column_names, _ = zip(*dataset.columns)
        column_types = dict(dataset.columns)
        train = pd.DataFrame(dataset.train.data, columns=column_names).astype(column_types, copy=False)
        print(f"Columns dtypes:\n{train.dtypes}")
        test = pd.DataFrame(dataset.test.data, columns=column_names).astype(column_types, copy=False)

    del dataset
    gc.collect()

    output_dir = output_subdir("models", config)
    with utils.Timer() as training:
        predictor = TabularPredictor(
            label=label,
            eval_metric=perf_metric.name,
            path=output_dir,
            problem_type=problem_type,
        ).fit(
            train_data=train,
            time_limit=config.max_runtime_seconds,
            **training_params
        )

    del train

    y_test = test[label]
    test = test.drop(columns=label)

    if is_classification:
        with utils.Timer() as predict:
            probabilities = predictor.predict_proba(test, as_multiclass=True)
        predictions = probabilities.idxmax(axis=1).to_numpy()
    else:
        with utils.Timer() as predict:
            predictions = predictor.predict(test, as_pandas=False)
        probabilities = None

    prob_labels = probabilities.columns.values.tolist() if probabilities is not None else None

    leaderboard = predictor.leaderboard(silent=True)  # test data input removed to avoid long-running computation; remove the 7200s timeout limitation to re-enable it
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
        print(leaderboard)

    save_artifacts(predictor, leaderboard, config)

    num_models_trained = len(leaderboard)
    if predictor._trainer.model_best is not None:
        num_models_ensemble = len(predictor._trainer.get_minimum_model_set(predictor._trainer.model_best))
    else:
        num_models_ensemble = 1

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  probabilities_labels=prob_labels,
                  target_is_encoded=False,
                  models_count=num_models_trained,
                  models_ensemble_count=num_models_ensemble,
                  training_duration=training.duration,
                  predict_duration=predict.duration)
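
A toy sketch (hypothetical schema and data) of the DataFrame reconstruction above: dataset.columns is assumed to be a list of (name, dtype) pairs, so zip(*...) yields the column names and dict(...) the dtype mapping used for the astype cast.

import numpy as np
import pandas as pd

columns = [("age", "int64"), ("income", "float64")]  # hypothetical (name, dtype) pairs
data = np.array([[25, 50000.0], [40, 72000.0]])

column_names, _ = zip(*columns)
column_types = dict(columns)
train = pd.DataFrame(data, columns=column_names).astype(column_types, copy=False)
# train.dtypes -> age: int64, income: float64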
Example #23
def run(dataset, config):
    jar_file = glob.glob("{here}/lib/mlplan/mlplan-cli*.jar".format(
        here=os.path.dirname(__file__)))[0]
    version = re.match(r".*/mlplan-cli-(.*).jar", jar_file)[1]
    log.info(f"\n**** ML-Plan [v{version}] ****\n")

    is_classification = config.type == 'classification'

    # Mapping of benchmark metrics to Weka metrics
    metrics_mapping = dict(acc='ERRORRATE',
                           auc='AUC',
                           logloss='LOGLOSS',
                           f1='F1',
                           r2='R2',
                           rmse='ROOT_MEAN_SQUARED_ERROR',
                           mse='MEAN_SQUARED_ERROR',
                           rmsle='ROOT_MEAN_SQUARED_LOGARITHM_ERROR',
                           mae='MEAN_ABSOLUTE_ERROR')

    metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if metric is None:
        raise ValueError('Performance metric {} is not supported.'.format(
            config.metric))

    train_file = dataset.train.path
    test_file = dataset.test.path

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    backend = config.framework_params.get('_backend', 'weka')

    if backend == "weka":
        mem_limit = str(max(config.max_mem_size_mb - 1024, 2048))
    else:
        mem_limit = str(
            max(round((config.max_mem_size_mb - 1024) / config.cores), 2048))

    mode = backend
    if config.type == 'regression':
        mode += '-regression'

    log.info(
        "Running ML-Plan with backend %s in mode %s and a maximum time of %ss on %s cores with %sMB for the JVM, optimizing %s.",
        backend, mode, config.max_runtime_seconds, config.cores,
        config.max_mem_size_mb, metric)
    log.info("Environment: %s", os.environ)

    predictions_file = os.path.join(output_subdir('mlplan_out', config),
                                    'predictions.csv')
    statistics_file = os.path.join(output_subdir('mlplan_out', config),
                                   'statistics.json')
    #tmp_dir = output_subdir('mlplan_tmp', config)

    cmd_root = f"java -jar -Xmx{mem_limit}M {jar_file}"

    with tempfile.TemporaryDirectory() as tmp_dir:
        cmd_params = dict(
            f='"{}"'.format(train_file),
            p='"{}"'.format(test_file),
            t=config.max_runtime_seconds,
            ncpus=config.cores,
            l=metric,
            m=mode,
            s=config.seed,  # weka accepts only int16 as seeds
            ooab=predictions_file,
            os=statistics_file,
            tmp=tmp_dir,
            **training_params)

        cmd = cmd_root + ''.join(
            [" -{} {}".format(k, v) for k, v in cmd_params.items()])

        with utils.Timer() as training:
            utils.run_cmd(cmd, _live_output_=True)

    with open(statistics_file, 'r') as f:
        stats = json.load(f)

    predictions = stats["predictions"]
    truth = stats["truth"]
    numEvals = stats["num_evaluations"]

    # probabilities are only available for classification tasks, so check whether the JSON contains the respective fields
    if "probabilities" in stats and "probabilities_labels" in stats:
        probabilities = stats["probabilities"]
        probabilities_labels = stats["probabilities_labels"]
    else:
        probabilities = []
        probabilities_labels = []

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=truth,
                  probabilities=probabilities,
                  probabilities_labels=probabilities_labels,
                  target_is_encoded=is_classification,
                  models_count=numEvals,
                  training_duration=training.duration)
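
A small sketch, with assumed jar path, files, and parameter values, of how the ML-Plan command line above is assembled: every cmd_params entry becomes a " -key value" pair appended to the java invocation.

cmd_root = "java -jar -Xmx2048M mlplan-cli.jar"  # hypothetical jar and JVM memory limit
cmd_params = dict(f='"train.arff"', p='"test.arff"', t=3600, ncpus=4, l='AUC', m='weka', s=42)
cmd = cmd_root + ''.join(" -{} {}".format(k, v) for k, v in cmd_params.items())
# -> 'java -jar -Xmx2048M mlplan-cli.jar -f "train.arff" -p "test.arff" -t 3600 -ncpus 4 -l AUC -m weka -s 42'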
Example #24
def run(dataset, config):
    log.info(f"\n**** mljar-supervised [v{supervised.__version__}] ****\n")

    # Mapping of benchmark metrics to MLJAR metrics
    metrics_mapping = dict(auc='auc', logloss='logloss', rmse='rmse')
    eval_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else "auto"

    # Mapping of benchmark task to MLJAR ML task
    problem_mapping = dict(
        binary="binary_classification",
        multiclass="multiclass_classification",
        regression="regression",
    )
    ml_task = problem_mapping.get(
        dataset.problem_type
    )  # if None the AutoML will guess about the ML task
    is_classification = config.type == "classification"
    results_path = output_subdir("results", config)
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith("_")
    }

    X_train, y_train = dataset.train.X, dataset.train.y.squeeze()
    X_test, y_test = dataset.test.X, dataset.test.y.squeeze()

    automl = AutoML(results_path=results_path,
                    total_time_limit=config.max_runtime_seconds,
                    random_state=config.seed,
                    ml_task=ml_task,
                    eval_metric=eval_metric,
                    **training_params)

    with Timer() as training:
        automl.fit(X_train, y_train)

    with Timer() as predict:
        preds = automl.predict_all(X_test)

    predictions, probabilities, probabilities_labels = None, None, None
    if is_classification:
        # preds is a dataframe with columns ["prediction_LABEL", .., "label"]
        if y_train.dtype == bool and preds["label"].dtype == int:
            # boolean target produces integer predictions for mljar-supervised <= 0.10.6
            # https://github.com/mljar/mljar-supervised/issues/442
            preds = preds.rename(
                {
                    "prediction_0": "False",
                    "prediction_1": "True"
                }, axis=1)
            preds["label"] = preds["label"].astype(bool)
        else:
            preds.columns = [
                c.replace("prediction_", "", 1) for c in preds.columns
            ]

        predictions = preds["label"].values
        probabilities_labels = list(preds.columns)[:-1]
        probabilities = preds[probabilities_labels].values
    else:
        predictions = preds["prediction"].values

    # clean the results
    if not config.framework_params.get("_save_artifacts", False):
        shutil.rmtree(results_path, ignore_errors=True)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  probabilities_labels=probabilities_labels,
                  models_count=len(automl._models),
                  training_duration=training.duration,
                  predict_duration=predict.duration)
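
A toy sketch (made-up predictions) of the boolean-target workaround above for older mljar-supervised versions: probability columns come back as prediction_0/prediction_1 and the label as an integer, so they are renamed and cast back to booleans before the result is built.

import pandas as pd

preds = pd.DataFrame({
    "prediction_0": [0.8, 0.3],
    "prediction_1": [0.2, 0.7],
    "label": [0, 1],
})
preds = preds.rename({"prediction_0": "False", "prediction_1": "True"}, axis=1)
preds["label"] = preds["label"].astype(bool)

predictions = preds["label"].values              # array([False,  True])
probabilities_labels = list(preds.columns)[:-1]  # ['False', 'True']
probabilities = preds[probabilities_labels].values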