Example #1
def save_predictions(model,
                     test,
                     dataset,
                     config,
                     predictions_file=None,
                     preview=True):
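    # with use_pandas=False, H2O returns the frame as a list of rows whose first row is the header,
    # hence the split below into column names (h2o_preds[0]) and values (h2o_preds[1:])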
    h2o_preds = model.predict(test).as_data_frame(use_pandas=False)
    preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
    y_pred = preds.iloc[:, 0]

    h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False,
                                                            header=False)
    y_truth = to_data_frame(h2o_truth)

    predictions = y_pred.values
    probabilities = preds.iloc[:, 1:].values
    prob_labels = h2o_preds[0][1:]
    truth = y_truth.values

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file
                             if predictions_file is None else predictions_file,
                             probabilities=probabilities,
                             probabilities_labels=prob_labels,
                             predictions=predictions,
                             truth=truth,
                             preview=preview)
Example #2
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** dabl AnyClassifier ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = pd.DataFrame(dataset.train.X).astype(str), pd.DataFrame(dataset.test.X).astype(str)
    y_train, y_test = pd.Series(dataset.train.y), pd.Series(dataset.test.y)

    # only classification is handled here: for a regression task `estimator` stays None and the call below fails
    estimator = AnyClassifier if is_classification else None
    predictor = estimator(**config.framework_params)

    with Timer() as training:
        predictor.fit(X_train, y_train)
    predictions = predictor.predict(X_test)
    probabilities = predictor.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(
        models_count=1,
        training_duration=training.duration
    )
Example #3
def save_predictions(model,
                     test,
                     dataset,
                     config,
                     predictions_file=None,
                     preview=True):
    h2o_preds = model.predict(test).as_data_frame(use_pandas=False)
    preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
    y_pred = preds.iloc[:, 0]

    h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False,
                                                            header=False)
    y_truth = to_data_frame(h2o_truth)

    predictions = y_pred.values
    probabilities = preds.iloc[:, 1:].values
    prob_labels = h2o_preds[0][1:]
    if all([re.fullmatch(r"p\d+", p) for p in prob_labels]):
        # for categories represented as numerical values, h2o prefixes the probabilities columns with p
        # in this case, we let the app setting the labels to avoid mismatch
        prob_labels = None
    truth = y_truth.values

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file
                             if predictions_file is None else predictions_file,
                             probabilities=probabilities,
                             probabilities_labels=prob_labels,
                             predictions=predictions,
                             truth=truth,
                             preview=preview)
Example #4
def run(dataset: Dataset, config: TaskConfig):
    validate_config(config)
    tuner = NNITuner(config)
    if config.framework_params['limit_type'] == 'time':
        log.info("Tuning {} with NNI {} with a maximum time of {}s\n".format(
            config.framework_params['arch_type'], tuner.description,
            config.max_runtime_seconds))
    elif config.framework_params['limit_type'] == 'ntrials':
        log.info(
            "Tuning {} with NNI {} with a maximum number of trials of {}\n".
            format(config.framework_params['arch_type'], tuner.description,
                   config.framework_params['trial_limit']))
        log.info("Note: any time constraints are ignored.")

    probabilities, predictions, train_timer, y_test, intermediate_scores, intermediate_best_scores = run_experiment(
        dataset, config, tuner, log)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    # store the intermediate score logs next to the predictions, under a sibling 'scorelogs' directory
    parts = config.output_predictions_file.split('/')
    scores_file = '/'.join(parts[:-3]) + '/scorelogs/' + parts[-1]
    assert len(intermediate_scores) == len(intermediate_best_scores)
    save_scores_to_file(intermediate_scores, intermediate_best_scores,
                        scores_file)

    return dict(models_count=1, training_duration=train_timer.duration)
Example #5
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Constant predictor (sklearn dummy) ****\n")

    is_classification = config.type == 'classification'
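    # constant baselines: DummyClassifier predicts from the class prior, DummyRegressor the training median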
    predictor = DummyClassifier(
        strategy='prior') if is_classification else DummyRegressor(
            strategy='median')

    encode = config.framework_params.get('encode', False)
    X_train = dataset.train.X_enc if encode else dataset.train.X
    y_train = dataset.train.y_enc if encode else dataset.train.y
    X_test = dataset.test.X_enc if encode else dataset.test.X
    y_test = dataset.test.y_enc if encode else dataset.test.y

    with Timer() as training:
        predictor.fit(X_train, y_train)
    predictions = predictor.predict(X_test)
    probabilities = predictor.predict_proba(
        X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=encode)

    return dict(models_count=1, training_duration=training.duration)
Example #6
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Gradient Boosting (sklearn %s) ****\n",
             sklearn.__version__)

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = GradientBoostingClassifier if is_classification else GradientBoostingRegressor
    predictor = estimator(random_state=config.seed, **config.framework_params)

    with Timer() as training:
        predictor.fit(X_train, y_train)
    predictions = predictor.predict(X_test)
    probabilities = predictor.predict_proba(
        X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(models_count=1, training_duration=training.duration)
Example #7
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Oboe ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        # regression currently fails (as of 26.02.2019, still under development by the oboe team)
        raise ValueError('Regression is not yet supported (under development).')

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_cores = config.framework_params.get('_n_cores', config.cores)

    log.info('Running oboe with a maximum time of {}s on {} cores.'.format(config.max_runtime_seconds, n_cores))
    log.warning('We completely ignore the advice to optimize towards metric: {}.'.format(config.metric))

    aml = AutoLearner(p_type='classification' if is_classification else 'regression',
                      n_cores=n_cores,
                      runtime_limit=config.max_runtime_seconds,
                      **training_params)

    aml_models = lambda: [aml.ensemble, *aml.ensemble.base_learners] if len(aml.ensemble.base_learners) > 0 else []
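    # evaluated lazily: the ensemble is only populated after fit(), and the same helper is reused below for models_count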

    with Timer() as training:
        try:
            aml.fit(X_train, y_train)
        except IndexError as e:
            if len(aml_models()) == 0:  # incorrect handling of some IndexError in oboe if ensemble is empty
                raise NoResultError("Oboe could not produce any model in the requested time.") from e
            raise e

    predictions = aml.predict(X_test).reshape(len(X_test))

    if is_classification:
        target_values_enc = dataset.target.label_encoder.transform(dataset.target.values)
        probabilities = Encoder('one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions)
    else:
        probabilities = None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(
        models_count=len(aml_models()),
        training_duration=training.duration
    )
Example #8
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Random Forest (sklearn) ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    log.info(
        "Running RandomForest with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, n_jobs))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".
        format(config.metric))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    rf = estimator(n_jobs=n_jobs, random_state=config.seed, **training_params)

    with Timer() as training:
        rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)
    probabilities = rf.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(rf), training_duration=training.duration)
Example #9
def run(dataset: Dataset, config: TaskConfig):
    if 'tuner_type' not in config.framework_params:
        raise RuntimeError(
            'framework.yaml does not have a "tuner_type" field.')

    tuner = NNITuner(config)
    log.info("Tuning {} with NNI {} with a maximum time of {}s\n".format(
        config.framework_params['arch_type'], tuner.description,
        config.max_runtime_seconds))

    probabilities, predictions, train_timer, y_test = run_experiment(
        dataset, config, tuner, log)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(models_count=1, training_duration=train_timer.duration)
Example #10
def run(dataset: Dataset, config: TaskConfig):
    with TmpDir() as tmpdir:
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)
        dataset.release()
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir
        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        output, err = run_cmd('{python} {here}/exec_proc.py'.format(python=PYTHON, here=dir_of(__file__)), _input_str_=params)
        out = io.StringIO(output)
        res = ns()
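        # scan the subprocess output for the result token; the JSON-encoded result is expected on the following line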
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        def load_data(path):
            return read_csv(path, as_data_frame=False, header=False)

        log.debug("Result from subprocess:\n%s", res)
        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 probabilities=load_data(res.probabilities) if res.probabilities is not None else None,
                                 predictions=load_data(res.predictions).squeeze(),
                                 truth=load_data(res.truth).squeeze(),
                                 target_is_encoded=res.target_is_encoded)
Example #11
def run_in_venv(caller_file,
                script_file: str,
                *args,
                input_data: Union[dict, ns],
                dataset: Dataset,
                config: TaskConfig,
                process_results=None,
                python_exec=None):

    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TmpDir() as tmpdir:

        def make_path(k, v, parents=None):
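            # numpy arrays are dumped to .npy files under tmpdir and replaced by their path,
            # keeping the whole input structure JSON-serializable for the subprocess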
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_token = str(uuid.uuid1())
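        # unique marker that the subprocess is expected to print right before its JSON result,
        # so the result can be located in the mixed stdout stream below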
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(
                cmd,
                *args,
                _input_str_=params,
                _live_output_=True,
                _error_level_=logging.DEBUG,
                _env_=dict(PATH=os.pathsep.join(
                    [venv_bin_path, os.environ['PATH']]),
                           PYTHONPATH=os.pathsep.join([
                               rconfig().root_dir,
                           ]),
                           AMLB_PATH=os.path.join(rconfig().root_dir, "amlb")),
            )

        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(
                res[name],
                allow_pickle=True) if res[name] is not None else None

        log.debug("Result from subprocess:\n%s", res)
        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions_to_file(
                dataset=dataset,
                output_file=res.output_file,
                predictions=res.predictions.reshape(-1)
                if res.predictions is not None else None,
                truth=res.truth.reshape(-1) if res.truth is not None else None,
                probabilities=res.probabilities,
                probabilities_labels=res.probabilities_labels,
                target_is_encoded=res.target_is_encoded)

        return dict(models_count=res.models_count
                    if res.models_count is not None else 1,
                    training_duration=res.training_duration if
                    res.training_duration is not None else proc_timer.duration,
                    **res.others.__dict__)
Example #12
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Hyperopt-sklearn ****\n")

    is_classification = config.type == 'classification'

    default = lambda: 0  # sentinel for metrics already covered by hyperopt-sklearn's default loss (reset to None below)
    metrics_to_loss_mapping = dict(
        acc=(default, False),  # lambda y, pred: 1.0 - accuracy_score(y, pred)
        auc=(lambda y, pred: 1.0 - roc_auc_score(y, pred), False),
        f1=(lambda y, pred: 1.0 - f1_score(y, pred), False),
        # logloss=(log_loss, True),
        mae=(mean_absolute_error, False),
        mse=(mean_squared_error, False),
        msle=(mean_squared_log_error, False),
        r2=(default, False),  # lambda y, pred: 1.0 - r2_score(y, pred)
    )
    loss_fn, continuous_loss_fn = metrics_to_loss_mapping[
        config.metric] if config.metric in metrics_to_loss_mapping else (None,
                                                                         False)
    if loss_fn is None:
        log.warning("Performance metric %s not supported: defaulting to %s.",
                    config.metric, 'accuracy' if is_classification else 'r2')
    if loss_fn is default:
        loss_fn = None

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    log.warning("Ignoring cores constraint of %s cores.", config.cores)
    log.info(
        "Running hyperopt-sklearn with a maximum time of %ss on %s cores, optimizing %s.",
        config.max_runtime_seconds, 'all', config.metric)

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    if is_classification:
        classifier = any_classifier('clf')
        regressor = None
    else:
        classifier = None
        regressor = any_regressor('rgr')

    estimator = HyperoptEstimator(classifier=classifier,
                                  regressor=regressor,
                                  algo=tpe.suggest,
                                  loss_fn=loss_fn,
                                  continuous_loss_fn=continuous_loss_fn,
                                  trial_timeout=config.max_runtime_seconds,
                                  seed=config.seed,
                                  **training_params)

    # soft stop at the time budget (killing child processes first), hard stop via SIGQUIT at 4/3 of the budget
    with InterruptTimeout(config.max_runtime_seconds * 4 / 3,
                          sig=signal.SIGQUIT):
        with InterruptTimeout(config.max_runtime_seconds,
                              before_interrupt=ft.partial(
                                  kill_proc_tree,
                                  timeout=5,
                                  include_parent=False)):
            with Timer() as training:
                estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)

    if is_classification:
        target_values_enc = dataset.target.label_encoder.transform(
            dataset.target.values)
        probabilities = Encoder(
            'one-hot', target=False,
            encoded_type=float).fit(target_values_enc).transform(predictions)
    else:
        probabilities = None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(estimator.trials),
                training_duration=training.duration)
Example #13
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** TPOT ****\n")

    is_classification = config.type == 'classification'
    # Mapping of benchmark metrics to TPOT metrics
    metrics_mapping = dict(acc='accuracy',
                           auc='roc_auc',
                           f1='f1',
                           logloss='neg_log_loss',
                           mae='neg_mean_absolute_error',
                           mse='neg_mean_squared_error',
                           msle='neg_mean_squared_log_error',
                           r2='r2')
    scoring_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    log.info(
        'Running TPOT with a maximum time of %ss on %s cores, optimizing %s.',
        config.max_runtime_seconds, n_jobs, scoring_metric)
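    # TPOT expresses its time budget in minutes (max_time_mins)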
    runtime_min = (config.max_runtime_seconds / 60)

    estimator = TPOTClassifier if is_classification else TPOTRegressor
    tpot = estimator(n_jobs=n_jobs,
                     max_time_mins=runtime_min,
                     scoring=scoring_metric,
                     random_state=config.seed,
                     **training_params)

    with Timer() as training:
        tpot.fit(X_train, y_train)

    log.info('Predicting on the test set.')
    predictions = tpot.predict(X_test)
    try:
        probabilities = tpot.predict_proba(
            X_test) if is_classification else None
    except RuntimeError:
        # TPOT throws a RuntimeError if the optimized pipeline does not support `predict_proba`.
        target_values_enc = dataset.target.label_encoder.transform(
            dataset.target.values)
        probabilities = Encoder(
            'one-hot', target=False,
            encoded_type=float).fit(target_values_enc).transform(predictions)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=is_classification)

    save_artifacts(tpot, config)

    return dict(models_count=len(tpot.evaluated_individuals_),
                training_duration=training.duration)
Example #14
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Tuned Random Forest (sklearn) ****\n")

    is_classification = config.type == 'classification'

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    tuning_params = config.framework_params.get('_tuning', training_params)
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    # Impute any missing data (can test using -t 146606)
    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    log.info(
        "Running RandomForest with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, n_jobs))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    metric = dict(auc='roc_auc', logloss='neg_log_loss',
                  acc='accuracy')[config.metric]

    n_features = X_train.shape[1]
    default_value = max(1, int(math.sqrt(n_features)))
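    # centre the candidate grid on sqrt(n_features), the usual random forest default for classification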
    below_default = pick_values_uniform(start=1,
                                        end=default_value,
                                        length=5 + 1)[:-1]  # 5 below
    above_default = pick_values_uniform(start=default_value,
                                        end=n_features,
                                        length=10 + 1 -
                                        len(below_default))[1:]  # 5 above
    # Mix up the order of `max_features` to try, so that a fair range is tried even if we have too little time
    # to try all possible values. Order: [sqrt(p), 1, p, random order for remaining values]
    # max_features_to_try = below_default[1:] + above_default[:-1]
    # max_features_values = ([default_value, 1, n_features]
    #                        + random.sample(max_features_to_try, k=len(max_features_to_try)))
    max_features_values = [default_value] + below_default + above_default
    # Define up to what fraction of the total time we spend 'optimizing' `max_features`
    # (the remainder is used for fitting the final model).
    safety_factor = 0.85
    with stopit.ThreadingTimeout(seconds=int(config.max_runtime_seconds *
                                             safety_factor)):
        log.info("Evaluating multiple values for `max_features`: %s.",
                 max_features_values)
        max_feature_scores = []
        tuning_durations = []
        for i, max_features_value in enumerate(max_features_values):
            log.info("[{:2d}/{:2d}] Evaluating max_features={}".format(
                i + 1, len(max_features_values), max_features_value))
            imputation = Imputer()
            random_forest = estimator(n_jobs=n_jobs,
                                      random_state=config.seed,
                                      max_features=max_features_value,
                                      **tuning_params)
            pipeline = Pipeline(steps=[('preprocessing',
                                        imputation), ('learning',
                                                      random_forest)])
            with Timer() as cv_scoring:
                try:
                    scores = cross_val_score(estimator=pipeline,
                                             X=dataset.train.X_enc,
                                             y=dataset.train.y_enc,
                                             scoring=metric,
                                             cv=5)
                    max_feature_scores.append(
                        (statistics.mean(scores), max_features_value))
                except stopit.utils.TimeoutException as toe:
                    log.error(
                        "Failed CV scoring for max_features=%s : Timeout",
                        max_features_value)
                    tuning_durations.append(
                        (max_features_value, cv_scoring.duration))
                    raise toe
                except Exception as e:
                    log.error("Failed CV scoring for max_features=%s :\n%s",
                              max_features_value, e)
                    log.debug("Exception:", exc_info=True)
            tuning_durations.append((max_features_value, cv_scoring.duration))

    log.info("Tuning scores:\n%s", sorted(max_feature_scores))
    log.info("Tuning durations:\n%s", sorted(tuning_durations))
    _, best_max_features_value = max(
        max_feature_scores) if len(max_feature_scores) > 0 else (math.nan,
                                                                 'auto')
    log.info("Training final model with `max_features={}`.".format(
        best_max_features_value))
    rf = estimator(n_jobs=n_jobs,
                   random_state=config.seed,
                   max_features=best_max_features_value,
                   **training_params)
    with Timer() as training:
        rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)
    probabilities = rf.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(rf),
                training_duration=training.duration +
                sum(map(lambda t: t[1], tuning_durations)))
Example #15
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.",
                    config.metric)

    try:
        training_params = {
            k: v
            for k, v in config.framework_params.items()
            if not k.startswith('_')
        }
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads,
                 config.max_mem_size_mb)
        h2o.init(nthreads=nthreads,
                 min_mem_size=str(config.max_mem_size_mb) + "M",
                 max_mem_size=str(config.max_mem_size_mb) + "M",
                 log_dir=os.path.join(config.output_dir, 'logs', config.name,
                                      str(config.fold)))

        # Load both train and test as H2O Frames
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name,
                 config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError(
                "H2O could not produce any model in the requested time.")

        lb = aml.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        lbf = split_path(config.output_predictions_file)
        lbf.extension = '.leaderboard.csv'
        lbf = path_from_split(lbf)
        write_csv(lb, lbf)

        h2o_preds = aml.predict(test).as_data_frame(use_pandas=False)
        preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
        y_pred = preds.iloc[:, 0]

        h2o_truth = test[:,
                         dataset.target.index].as_data_frame(use_pandas=False,
                                                             header=False)
        y_truth = to_data_frame(h2o_truth)

        predictions = y_pred.values
        probabilities = preds.iloc[:, 1:].values
        truth = y_truth.values

        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=truth)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)

    finally:
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
        if h2o.connection() and h2o.connection().local_server:
            h2o.connection().local_server.shutdown()
Example #16
def run(dataset: Dataset, config: TaskConfig):
    log.info(
        "\n**** Random Forest (sklearn) Tuned with NNI EvolutionTuner ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X, dataset.test.X)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    # model = estimator(random_state=config.seed, **config.framework_params)
    best_score, best_params, best_model = None, None, None
    score_higher_better = True

    log.info(
        "Tuning hyperparameters with NNI EvolutionTuner with a maximum time of {}s\n"
        .format(config.max_runtime_seconds))
    tuner = EvolutionTuner()
    tuner.update_search_space(SEARCH_SPACE)
    start_time = time.time()
    param_idx = 0
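    # manual tuning loop: ask the tuner for parameters, CV-score them, report the score back,
    # and stop once the time budget is exceeded or any step fails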
    while True:
        try:
            cur_params = tuner.generate_parameters(param_idx)
            cur_model = estimator(random_state=config.seed,
                                  **cur_params,
                                  **config.framework_params)
            # Here score is the output of score() from the estimator
            cur_score = cross_val_score(cur_model, X_train, y_train)
            cur_score = sum(cur_score) / float(len(cur_score))
            if best_score is None or (score_higher_better
                                      and cur_score > best_score) or (
                                          not score_higher_better
                                          and cur_score < best_score):
                best_score, best_params, best_model = cur_score, cur_params, cur_model

            log.info("Trial {}: \n{}\nScore: {}\n".format(
                param_idx, cur_params, cur_score))
            tuner.receive_trial_result(param_idx, cur_params, cur_score)
            param_idx += 1
            current_time = time.time()
            elapsed_time = current_time - start_time
            if elapsed_time > config.max_runtime_seconds:
                break
        except Exception:
            # stop tuning on any failure (e.g. the tuner has exhausted its search space or a trial errored)
            break

    log.info("Tuning done, the best parameters are:\n{}\n".format(best_params))

    # retrain on the whole dataset
    with Timer() as training:
        best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)
    probabilities = best_model.predict_proba(
        X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(models_count=1, training_duration=training.duration)
Example #17
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** AutoSklearn ****\n")
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=DeprecationWarning)

    is_classification = config.type == 'classification'

    # Mapping of benchmark metrics to autosklearn metrics
    metrics_mapping = dict(acc=metrics.accuracy,
                           auc=metrics.roc_auc,
                           f1=metrics.f1,
                           logloss=metrics.log_loss,
                           mae=metrics.mean_absolute_error,
                           mse=metrics.mean_squared_error,
                           r2=metrics.r2)
    perf_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    # Set resources based on datasize
    log.info(
        "Running auto-sklearn with a maximum time of %ss on %s cores with %sMB, optimizing %s.",
        config.max_runtime_seconds, config.cores, config.max_mem_size_mb,
        perf_metric)
    log.info("Environment: %s", os.environ)

    X_train = dataset.train.X_enc
    y_train = dataset.train.y_enc
    # log.info("finite=%s", np.isfinite(X_train))
    predictors_type = [
        'Categorical' if p.is_categorical() else 'Numerical'
        for p in dataset.predictors
    ]

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    ml_memory_limit = config.framework_params.get('_ml_memory_limit', 'auto')
    ensemble_memory_limit = config.framework_params.get(
        '_ensemble_memory_limit', 'auto')

    # when memory is large enough, we should have:
    # (cores - 1) * ml_memory_limit_mb + ensemble_memory_limit_mb = config.max_mem_size_mb
    total_memory_mb = system_memory_mb().total
    if ml_memory_limit == 'auto':
        ml_memory_limit = max(
            min(config.max_mem_size_mb, math.ceil(total_memory_mb / n_jobs)),
            3072)  # 3072 MB is the auto-sklearn default
    if ensemble_memory_limit == 'auto':
        ensemble_memory_limit = max(
            math.ceil(ml_memory_limit -
                      (total_memory_mb - config.max_mem_size_mb)),
            math.ceil(ml_memory_limit / 3),  # default proportions
            1024)  # 1024 MB is the auto-sklearn default
    log.info(
        "Using %sMB memory per ML job and %sMB for ensemble job on a total of %s jobs.",
        ml_memory_limit, ensemble_memory_limit, n_jobs)

    log.warning(
        "Using meta-learned initialization, which might be bad (leakage).")
    # TODO: do we need to set per_run_time_limit too?
    estimator = AutoSklearnClassifier if is_classification else AutoSklearnRegressor
    auto_sklearn = estimator(
        time_left_for_this_task=config.max_runtime_seconds,
        n_jobs=n_jobs,
        ml_memory_limit=ml_memory_limit,
        ensemble_memory_limit=ensemble_memory_limit,
        seed=config.seed,
        **training_params)
    with Timer() as training:
        auto_sklearn.fit(X_train,
                         y_train,
                         metric=perf_metric,
                         feat_type=predictors_type)

    log.info("Predicting on the test set.")
    X_test = dataset.test.X_enc
    y_test = dataset.test.y_enc
    predictions = auto_sklearn.predict(X_test)
    probabilities = auto_sklearn.predict_proba(
        X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    save_artifacts(auto_sklearn, config)

    return dict(models_count=len(auto_sklearn.get_models_with_weights()),
                training_duration=training.duration)
Example #18
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** AutoWEKA ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    # Mapping of benchmark metrics to Weka metrics
    metrics_mapping = dict(acc='errorRate',
                           auc='areaUnderROC',
                           logloss='kBInformation')
    metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    train_file = dataset.train.path
    test_file = dataset.test.path
    # Weka requires the target to be the last attribute
    if dataset.target.index != len(dataset.predictors):
        train_file = reorder_dataset(dataset.train.path,
                                     target_src=dataset.target.index)
        test_file = reorder_dataset(dataset.test.path,
                                    target_src=dataset.target.index)

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    parallelRuns = config.framework_params.get('_parallelRuns', config.cores)

    memLimit = config.framework_params.get('_memLimit', 'auto')
    if memLimit == 'auto':
        memLimit = max(
            min(config.max_mem_size_mb,
                math.ceil(config.max_mem_size_mb / parallelRuns)),
            1024)  # AutoWEKA default memLimit
    log.info("Using %sMB memory per run on %s parallel runs.", memLimit,
             parallelRuns)

    f = split_path(config.output_predictions_file)
    f.extension = '.weka_pred.csv'
    weka_file = path_from_split(f)
    cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format(
        here=dir_of(__file__))
    cmd_params = dict(
        t='"{}"'.format(train_file),
        T='"{}"'.format(test_file),
        memLimit=memLimit,
        classifications=
        '"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""'
        .format(weka_file),
        timeLimit=int(config.max_runtime_seconds / 60),
        parallelRuns=parallelRuns,
        metric=metric,
        seed=config.seed % (1 << 16),  # weka accepts only int16 as seeds
        **training_params)
    cmd = cmd_root + ' '.join(
        ["-{} {}".format(k, v) for k, v in cmd_params.items()])
    with Timer() as training:
        run_cmd(cmd, _live_output_=True)

    # if target values are not sorted alphabetically in the ARFF file, then class probabilities are returned in the original order
    # interestingly, other frameworks seem to always sort the target values first
    # that's why we need to specify the probabilities labels here: sorting+formatting is done in the saving function
    probabilities_labels = dataset.target.values
    if not os.path.exists(weka_file):
        raise NoResultError("AutoWEKA failed producing any prediction.")
    with open(weka_file, 'r') as weka_file:
        probabilities = []
        predictions = []
        truth = []
        for line in weka_file.readlines()[1:-1]:
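            # skip the first (header) and last lines of the Weka prediction CSV;
            # each remaining row is: inst#, actual, predicted, error, *class distribution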
            inst, actual, predicted, error, *distribution = line.split(',')
            pred_probabilities = [
                pred_probability.replace('*', '').replace('\n', '')
                for pred_probability in distribution
            ]
            _, pred = predicted.split(':')
            _, tru = actual.split(':')
            probabilities.append(pred_probabilities)
            predictions.append(pred)
            truth.append(tru)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=truth,
                             probabilities_labels=probabilities_labels)

    return dict(training_duration=training.duration)