Example #1
def _assert_data_paths(dataset, definition):
    # Verify that data_path(fmt) returns the original file when the format matches,
    # and the same path with the extension swapped otherwise.
    assert dataset.train.path == definition.train
    assert dataset.test.path == definition.test
    sp = split_path(definition.train)
    fmt = sp.extension[1:]  # extension carries the leading dot, e.g. '.csv' -> 'csv'
    for f in ['arff', 'csv', 'parquet']:
        if f == fmt:
            assert dataset.train.data_path(f) == dataset.train.path
        else:
            s = copy.copy(sp)
            s.extension = f
            assert dataset.train.data_path(f) == path_from_split(s)
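
For reference, here is a minimal sketch of the path helpers this test relies on. It assumes split_path returns an object with dirname, basename and extension fields and that path_from_split re-adds a missing leading dot (inferred from the usages in these examples; the real implementations live in the benchmark's utils module):

import os
from dataclasses import dataclass

@dataclass
class _SplitPath:
    dirname: str
    basename: str
    extension: str  # '.csv' when produced by split_path; callers may also assign 'csv'

def split_path(path):
    dirname, filename = os.path.split(path)
    basename, extension = os.path.splitext(filename)  # extension keeps the leading dot
    return _SplitPath(dirname, basename, extension)

def path_from_split(sp):
    # Normalize so both 'csv' and '.weka_pred.csv' style assignments work (assumption).
    ext = sp.extension if not sp.extension or sp.extension.startswith('.') else '.' + sp.extension
    return os.path.join(sp.dirname, sp.basename + ext)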
Example #2
def run(dataset: Dataset, config: TaskConfig):
    log.info(f"\n**** AutoWEKA [v{config.framework_version}] ****\n")
    save_metadata(config)

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    # Mapping of benchmark metrics to Weka metrics
    metrics_mapping = dict(acc='errorRate',
                           auc='areaUnderROC',
                           logloss='kBInformation')
    metric = metrics_mapping.get(config.metric)
    if metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    train_file = dataset.train.path
    test_file = dataset.test.path
    # Weka requires the target to be the last attribute
    if dataset.target.index != len(dataset.predictors):
        train_file = reorder_dataset(dataset.train.path,
                                     target_src=dataset.target.index)
        test_file = reorder_dataset(dataset.test.path,
                                    target_src=dataset.target.index)

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    parallelRuns = config.framework_params.get('_parallelRuns', config.cores)

    memLimit = config.framework_params.get('_memLimit', 'auto')
    if memLimit == 'auto':
        memLimit = max(
            min(config.max_mem_size_mb,
                math.ceil(config.max_mem_size_mb / parallelRuns)),
            1024)  # AutoWEKA default memLimit
    log.info("Using %sMB memory per run on %s parallel runs.", memLimit,
             parallelRuns)

    f = split_path(config.output_predictions_file)
    f.extension = '.weka_pred.csv'
    weka_file = path_from_split(f)
    cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format(
        here=dir_of(__file__))
    cmd_params = dict(
        t='"{}"'.format(train_file),
        T='"{}"'.format(test_file),
        memLimit=memLimit,
        classifications='"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""'.format(weka_file),
        timeLimit=int(config.max_runtime_seconds / 60),
        parallelRuns=parallelRuns,
        metric=metric,
        seed=config.seed % (1 << 16),  # Weka accepts only 16-bit integer seeds
        **training_params)
    cmd = cmd_root + ' '.join(
        ["-{} {}".format(k, v) for k, v in cmd_params.items()])
    with Timer() as training:
        run_cmd(cmd, _live_output_=True)

    # If target values are not sorted alphabetically in the ARFF file, class probabilities are returned in the original order.
    # Interestingly, other frameworks seem to always sort the target values first.
    # That's why we need to pass the probabilities labels here: sorting and formatting are done in the saving function.
    probabilities_labels = dataset.target.values
    if not os.path.exists(weka_file):
        raise NoResultError("AutoWEKA failed to produce any predictions.")
    with open(weka_file, 'r') as preds_file:
        probabilities = []
        predictions = []
        truth = []
        for line in preds_file.readlines()[1:-1]:
            inst, actual, predicted, error, *distribution = line.split(',')
            pred_probabilities = [
                pred_probability.replace('*', '').replace('\n', '')
                for pred_probability in distribution
            ]
            _, pred = predicted.split(':')
            _, tru = actual.split(':')
            probabilities.append(pred_probabilities)
            predictions.append(pred)
            truth.append(tru)

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,
                     probabilities=probabilities,
                     predictions=predictions,
                     truth=truth,
                     probabilities_labels=probabilities_labels)

    return dict(training_duration=training.duration)
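
The parsing loop above expects Weka's CSV prediction output: one row per instance with the instance number, actual and predicted labels as index:label pairs, the error flag, and then the class distribution, where '*' marks the predicted class. A tiny self-contained illustration of that parsing, using made-up values:

sample = "1,2:Iris-versicolor,2:Iris-versicolor,,0.02,*0.95,0.03\n"
inst, actual, predicted, error, *distribution = sample.split(',')
probs = [p.replace('*', '').replace('\n', '') for p in distribution]
_, pred = predicted.split(':')
_, tru = actual.split(':')
assert pred == tru == 'Iris-versicolor'
assert probs == ['0.02', '0.95', '0.03']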
Example #3
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** AutoSklearn ****\n")
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=DeprecationWarning)

    is_classification = config.type == 'classification'

    # Mapping of benchmark metrics to autosklearn metrics
    metrics_mapping = dict(acc=metrics.accuracy,
                           auc=metrics.roc_auc,
                           f1=metrics.f1,
                           logloss=metrics.log_loss,
                           mae=metrics.mean_absolute_error,
                           mse=metrics.mean_squared_error,
                           r2=metrics.r2)
    perf_metric = metrics_mapping.get(config.metric)
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    # Set resources based on datasize
    log.info(
        "Running auto-sklearn with a maximum time of %ss on %s cores with %sMB, optimizing %s.",
        config.max_runtime_seconds, config.cores, config.max_mem_size_mb,
        perf_metric)
    log.info("Environment: %s", os.environ)

    X_train = dataset.train.X_enc
    y_train = dataset.train.y_enc
    # log.info("finite=%s", np.isfinite(X_train))
    predictors_type = [
        'Categorical' if p.is_categorical() else 'Numerical'
        for p in dataset.predictors
    ]

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    ml_memory_limit = config.framework_params.get('_ml_memory_limit', 'auto')
    ensemble_memory_limit = config.framework_params.get(
        '_ensemble_memory_limit', 'auto')

    # when memory is large enough, we should have:
    # (cores - 1) * ml_memory_limit_mb + ensemble_memory_limit_mb = config.max_mem_size_mb
    total_memory_mb = system_memory_mb().total
    if ml_memory_limit == 'auto':
        ml_memory_limit = max(
            min(config.max_mem_size_mb, math.ceil(total_memory_mb / n_jobs)),
            3072)  # 3072 MB is the auto-sklearn default
    if ensemble_memory_limit == 'auto':
        ensemble_memory_limit = max(
            math.ceil(ml_memory_limit -
                      (total_memory_mb - config.max_mem_size_mb)),
            math.ceil(ml_memory_limit / 3),  # default proportions
            1024)  # 1024 MB is the auto-sklearn default
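    # Worked example with illustrative numbers (not taken from any benchmark config):
    # max_mem_size_mb=32768, total_memory_mb=65536, n_jobs=8
    #   ml_memory_limit       = max(min(32768, ceil(65536 / 8)), 3072)                  = 8192
    #   ensemble_memory_limit = max(ceil(8192 - (65536 - 32768)), ceil(8192 / 3), 1024) = 2731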
    log.info(
        "Using %sMB memory per ML job and %sMB for ensemble job on a total of %s jobs.",
        ml_memory_limit, ensemble_memory_limit, n_jobs)

    log.warning(
        "Using meta-learned initialization, which might be bad (leakage).")
    # TODO: do we need to set per_run_time_limit too?
    estimator = AutoSklearnClassifier if is_classification else AutoSklearnRegressor
    auto_sklearn = estimator(
        time_left_for_this_task=config.max_runtime_seconds,
        n_jobs=n_jobs,
        ml_memory_limit=ml_memory_limit,
        ensemble_memory_limit=ensemble_memory_limit,
        seed=config.seed,
        **training_params)
    with Timer() as training:
        auto_sklearn.fit(X_train,
                         y_train,
                         metric=perf_metric,
                         feat_type=predictors_type)

    models_repr = auto_sklearn.show_models()
    log.debug("Trained Ensemble:\n%s", models_repr)
    models_file = split_path(config.output_predictions_file)
    models_file.extension = '.models.txt'
    models_file = path_from_split(models_file)
    with open(models_file, 'w') as f:
        f.write(models_repr)

    # Predictions stay label-encoded here; target_is_encoded=True below tells the saving function about it.
    log.info("Predicting on the test set.")
    X_test = dataset.test.X_enc
    y_test = dataset.test.y_enc
    predictions = auto_sklearn.predict(X_test)
    probabilities = auto_sklearn.predict_proba(
        X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(auto_sklearn.get_models_with_weights()),
                training_duration=training.duration)
Example #4
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** TPOT ****\n")

    is_classification = config.type == 'classification'
    # Mapping of benchmark metrics to TPOT metrics
    metrics_mapping = dict(acc='accuracy',
                           auc='roc_auc',
                           f1='f1',
                           logloss='neg_log_loss',
                           mae='neg_mean_absolute_error',
                           mse='neg_mean_squared_error',
                           msle='neg_mean_squared_log_error',
                           r2='r2')
    scoring_metric = metrics_mapping.get(config.metric)
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    # useful to disable multicore, regardless of the dataset config
    n_jobs = config.framework_params.get('_n_jobs', config.cores)

    log.info(
        'Running TPOT with a maximum time of %ss on %s cores, optimizing %s.',
        config.max_runtime_seconds, n_jobs, scoring_metric)
    runtime_min = (config.max_runtime_seconds / 60)

    estimator = TPOTClassifier if is_classification else TPOTRegressor
    tpot = estimator(n_jobs=n_jobs,
                     max_time_mins=runtime_min,
                     scoring=scoring_metric,
                     random_state=config.seed,
                     **training_params)

    with Timer() as training:
        tpot.fit(X_train, y_train)

    log.debug("All individuals :\n%s",
              list(tpot.evaluated_individuals_.items()))
    models = tpot.pareto_front_fitted_pipelines_
    # `keys` (fitness values) and `items` (individuals) are list attributes of DEAP's ParetoFront.
    hall_of_fame = list(
        zip(reversed(tpot._pareto_front.keys), tpot._pareto_front.items))
    models_file = split_path(config.output_predictions_file)
    models_file.extension = '.models.txt'
    models_file = path_from_split(models_file)
    with open(models_file, 'w') as f:
        for m in hall_of_fame:
            pprint.pprint(dict(
                fitness=str(m[0]),
                model=str(m[1]),
                pipeline=models[str(m[1])],
            ),
                          stream=f)

    log.info('Predicting on the test set.')
    predictions = tpot.predict(X_test)
    try:
        probabilities = tpot.predict_proba(
            X_test) if is_classification else None
    except RuntimeError:
        # TPOT throws a RuntimeError if the optimized pipeline does not support `predict_proba`.
        target_values_enc = dataset.target.label_encoder.transform(
            dataset.target.values)
        probabilities = Encoder(
            'one-hot', target=False,
            encoded_type=float).fit(target_values_enc).transform(predictions)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=is_classification)

    return dict(models_count=len(tpot.evaluated_individuals_),
                training_duration=training.duration)
Example #5
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping.get(config.metric)
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.",
                    config.metric)

    try:
        training_params = {
            k: v
            for k, v in config.framework_params.items()
            if not k.startswith('_')
        }
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads,
                 config.max_mem_size_mb)
        h2o.init(nthreads=nthreads,
                 min_mem_size=str(config.max_mem_size_mb) + "M",
                 max_mem_size=str(config.max_mem_size_mb) + "M",
                 log_dir=os.path.join(config.output_dir, 'logs', config.name,
                                      str(config.fold)))

        # Load the train and test sets as H2O Frames
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name,
                 config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError(
                "H2O could not produce any model in the requested time.")

        lb = aml.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        lbf = split_path(config.output_predictions_file)
        lbf.extension = '.leaderboard.csv'
        lbf = path_from_split(lbf)
        write_csv(lb, lbf)

        h2o_preds = aml.predict(test).as_data_frame(use_pandas=False)
        preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
        y_pred = preds.iloc[:, 0]

        h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False)
        y_truth = to_data_frame(h2o_truth)

        predictions = y_pred.values
        probabilities = preds.iloc[:, 1:].values
        truth = y_truth.values

        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=truth)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)

    finally:
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
        if h2o.connection() and h2o.connection().local_server:
            h2o.connection().local_server.shutdown()
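
For classification, the prediction frame returned by aml.predict(test) has the layout the slicing above relies on: the first column is the predicted label and the remaining columns are per-class probabilities. A small illustration with made-up values, using pandas directly instead of the benchmark's to_data_frame helper:

import pandas as pd

h2o_preds = [
    ['predict', 'p_no', 'p_yes'],   # header row, as returned by as_data_frame(use_pandas=False)
    ['yes', '0.18', '0.82'],
    ['no', '0.71', '0.29'],
]
preds = pd.DataFrame(h2o_preds[1:], columns=h2o_preds[0])
predictions = preds.iloc[:, 0].values     # predicted labels
probabilities = preds.iloc[:, 1:].values  # per-class probabilities (still strings here)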