コード例 #1
0
def run(dataset: Dataset, config: TaskConfig):
    """Fit sklearn's constant ("dummy") predictor and save its predictions.

    Classification tasks use ``DummyClassifier(strategy='prior')``; every
    other task type uses ``DummyRegressor(strategy='median')``. When the
    ``encode`` framework parameter is present and truthy, the encoded views
    of the data (``X_enc`` / ``y_enc``) are used instead of the raw ones.

    Returns:
        dict with ``models_count``, ``training_duration`` and
        ``predict_duration``.
    """
    log.info("\n**** Constant predictor (sklearn dummy) ****\n")
    save_metadata(config, version=sklearn.__version__)

    is_classification = config.type == 'classification'
    if is_classification:
        predictor = DummyClassifier(strategy='prior')
    else:
        predictor = DummyRegressor(strategy='median')

    params = config.framework_params
    encode = params['encode'] if 'encode' in params else False

    # Pick either the encoded or the raw view of both splits.
    if encode:
        X_train, y_train = dataset.train.X_enc, dataset.train.y_enc
        X_test, y_test = dataset.test.X_enc, dataset.test.y_enc
    else:
        X_train, y_train = dataset.train.X, dataset.train.y
        X_test, y_test = dataset.test.X, dataset.test.y

    with Timer() as training:
        predictor.fit(X_train, y_train)
    with Timer() as predict:
        predictions = predictor.predict(X_test)

    # Probabilities are computed outside the timed prediction section.
    probabilities = None
    if is_classification:
        probabilities = predictor.predict_proba(X_test)

    save_predictions(
        dataset=dataset,
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
        target_is_encoded=encode,
    )

    return dict(
        models_count=1,
        training_duration=training.duration,
        predict_duration=predict.duration,
    )
コード例 #2
0
def run(dataset: Dataset, config: TaskConfig):
    """Train a single sklearn decision tree and save its predictions.

    The encoded features and targets of both splits are passed through
    ``unsparsify(..., fmt='array')``; the features are additionally passed
    through ``impute_array``. All framework parameters are forwarded to the
    estimator constructor.

    Returns:
        dict with ``models_count``, ``training_duration`` and
        ``predict_duration``.
    """
    log.info(f"\n**** Decision Tree [sklearn v{sklearn.__version__}] ****\n")

    is_classification = config.type == 'classification'

    dense_features = unsparsify(dataset.train.X_enc, dataset.test.X_enc, fmt='array')
    X_train, X_test = impute_array(*dense_features)
    y_train, y_test = unsparsify(dataset.train.y_enc, dataset.test.y_enc, fmt='array')

    if is_classification:
        estimator = DecisionTreeClassifier
    else:
        estimator = DecisionTreeRegressor
    predictor = estimator(random_state=config.seed, **config.framework_params)

    with Timer() as training:
        predictor.fit(X_train, y_train)
    with Timer() as predict:
        predictions = predictor.predict(X_test)

    # Probabilities are computed outside the timed prediction section.
    probabilities = None
    if is_classification:
        probabilities = predictor.predict_proba(X_test)

    save_predictions(
        dataset=dataset,
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
        target_is_encoded=is_classification,
    )

    return dict(models_count=1,
                training_duration=training.duration,
                predict_duration=predict.duration)
コード例 #3
0
ファイル: exec.py プロジェクト: fmohr/automlbenchmark
def run(dataset: Dataset, config: TaskConfig):
    """Run autoxgboost by shelling out to ``Rscript`` with the sibling ``exec.R``.

    The R script receives the train/test file paths, the target index, the
    predictions output path, the core count and the time budget.

    Returns:
        dict with ``training_duration`` (wall time of the whole R call).

    Raises:
        ValueError: if the task is not a classification task.
    """
    # TODO: use rpy2 instead? not necessary here though as the call is very simple
    log.info("\n**** Autoxgboost (R) ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    r_script = os.path.join(dir_of(__file__), 'exec.R')
    cmd_template = r"""Rscript --vanilla -e "source('{script}'); run('{train}', '{test}', target.index = {target_index}, '{output}', {cores}, time.budget = {time_budget})" """

    with Timer() as training:
        command = cmd_template.format(
            script=r_script,
            train=dataset.train.path,
            test=dataset.test.path,
            # +1: presumably converts to R's 1-based indexing -- confirm in exec.R
            target_index=dataset.target.index + 1,
            output=config.output_predictions_file,
            cores=config.cores,
            time_budget=config.max_runtime_seconds,
        )
        run_cmd(command, _live_output_=True)

    log.info("Predictions saved to %s", config.output_predictions_file)

    return dict(training_duration=training.duration)
コード例 #4
0
def run(dataset: Dataset, config: TaskConfig):
    """Train an sklearn gradient boosting model and save its predictions.

    Features are the imputed encoded matrices; targets are the raw
    (non-encoded) ``y`` values. All framework parameters are forwarded to
    the estimator constructor.

    Returns:
        dict with ``models_count`` and ``training_duration``.
    """
    log.info(
        f"\n**** Gradient Boosting [sklearn v{sklearn.__version__}] ****\n")
    save_metadata(config, version=sklearn.__version__)

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train = dataset.train.y
    y_test = dataset.test.y

    if is_classification:
        estimator = GradientBoostingClassifier
    else:
        estimator = GradientBoostingRegressor
    predictor = estimator(random_state=config.seed, **config.framework_params)

    # Only the fit is timed; prediction happens outside the timer.
    with Timer() as training:
        predictor.fit(X_train, y_train)

    predictions = predictor.predict(X_test)
    probabilities = None
    if is_classification:
        probabilities = predictor.predict_proba(X_test)

    save_predictions(
        dataset=dataset,
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
    )

    return dict(models_count=1, training_duration=training.duration)
コード例 #5
0
def run(dataset: Dataset, config: TaskConfig):
    """Train dabl's ``AnyClassifier`` and save its predictions.

    Features of both splits are wrapped in DataFrames and cast to ``str``;
    targets are wrapped in Series. All framework parameters are forwarded to
    the ``AnyClassifier`` constructor.

    Returns:
        dict with ``models_count`` and ``training_duration``.

    Raises:
        ValueError: if the task is not a classification task.
    """
    log.info("\n**** dabl AnyClassifier ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        # Only a classifier is wired in. Fail early with a clear message
        # instead of calling ``None(...)`` further down.
        raise ValueError('Regression is not supported.')

    # Debug aid: goes through the logger rather than printing to stdout.
    log.debug("Training features:\n%s", dataset.train.X)

    X_train = pd.DataFrame(dataset.train.X).astype(str)
    X_test = pd.DataFrame(dataset.test.X).astype(str)
    y_train, y_test = pd.Series(dataset.train.y), pd.Series(dataset.test.y)

    predictor = AnyClassifier(**config.framework_params)

    # Only the fit is timed; prediction happens outside the timer.
    with Timer() as training:
        predictor.fit(X_train, y_train)
    predictions = predictor.predict(X_test)
    probabilities = predictor.predict_proba(X_test)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(
        models_count=1,
        training_duration=training.duration
    )
コード例 #6
0
def run(dataset: Dataset, config: TaskConfig):
    """Train a TabNet model with default hyperparameters and save predictions.

    Features are imputed, then ordinal-encoded with an encoder fitted on the
    concatenation of the train and test features. For non-classification
    tasks the targets are cast to float32 and reshaped to a single column.

    Returns:
        dict with ``models_count``, ``training_duration`` and
        ``predict_duration``.
    """
    log.info("****TabNet****")
    save_metadata(config)

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X, dataset.test.X)

    # The encoder sees the categories of both splits.
    encoder = OrdinalEncoder()
    encoder.fit(np.concatenate((X_train, X_test), axis=0))
    X_train = encoder.transform(X_train)
    X_test = encoder.transform(X_test)

    y_train, y_test = dataset.train.y, dataset.test.y

    if is_classification:
        estimator = TabNetClassifier
    else:
        estimator = TabNetRegressor
    predictor = estimator()  # you can change hyperparameters

    if not is_classification:
        y_train = np.reshape(y_train.astype(np.float32), (-1, 1))
        y_test = np.reshape(y_test.astype(np.float32), (-1, 1))

    # NOTE(review): the test split is passed as an evaluation set during fit;
    # confirm this is intended, since it exposes test data to training.
    evaluation_sets = [(X_train, y_train), (X_test, y_test)]
    with Timer() as training:
        predictor.fit(X_train, y_train, eval_set=evaluation_sets)
    with Timer() as predict:
        predictions = predictor.predict(X_test)

    # Probabilities are computed outside the timed prediction section.
    probabilities = None
    if is_classification:
        probabilities = predictor.predict_proba(X_test)

    save_predictions(
        dataset=dataset,
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
    )
    return dict(
        models_count=1,
        training_duration=training.duration,
        predict_duration=predict.duration,
    )
コード例 #7
0
ファイル: exec.py プロジェクト: shashank0117/automlbenchmark
def run(dataset: Dataset, config: TaskConfig):
    """Run Oboe's ``AutoLearner`` on a classification task and save predictions.

    Framework parameters whose name starts with ``_`` are not forwarded to
    ``AutoLearner``; ``_n_cores`` overrides the core count from ``config``.
    Class "probabilities" are a one-hot encoding of the predicted labels.

    Returns:
        dict with ``models_count`` and ``training_duration``.

    Raises:
        ValueError: if the task is not a classification task.
        NoResultError: if fitting fails with an ``IndexError`` and the
            ensemble has no base learner.
    """
    log.info("\n**** Oboe ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        # regression currently fails (as of 26.02.2019: still under development state by oboe team)
        raise ValueError('Regression is not yet supported (under development).')

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train = dataset.train.y_enc
    y_test = dataset.test.y_enc

    training_params = {}
    for key, value in config.framework_params.items():
        if not key.startswith('_'):
            training_params[key] = value
    n_cores = config.framework_params.get('_n_cores', config.cores)

    log.info('Running oboe with a maximum time of {}s on {} cores.'.format(config.max_runtime_seconds, n_cores))
    log.warning('We completely ignore the advice to optimize towards metric: {}.'.format(config.metric))

    problem_type = 'classification' if is_classification else 'regression'
    aml = AutoLearner(p_type=problem_type,
                      n_cores=n_cores,
                      runtime_limit=config.max_runtime_seconds,
                      **training_params)

    def aml_models():
        """Return the ensemble followed by its base learners, or [] if there are none."""
        if len(aml.ensemble.base_learners) > 0:
            return [aml.ensemble, *aml.ensemble.base_learners]
        return []

    with Timer() as training:
        try:
            aml.fit(X_train, y_train)
        except IndexError as e:
            # incorrect handling of some IndexError in oboe if ensemble is empty
            if len(aml_models()) == 0:
                raise NoResultError("Oboe could not produce any model in the requested time.") from e
            raise

    predictions = aml.predict(X_test).reshape(len(X_test))

    probabilities = None
    if is_classification:
        target_values_enc = dataset.target.label_encoder.transform(dataset.target.values)
        one_hot = Encoder('one-hot', target=False, encoded_type=float).fit(target_values_enc)
        probabilities = one_hot.transform(predictions)

    save_predictions_to_file(
        dataset=dataset,
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
        target_is_encoded=True,
    )

    return dict(models_count=len(aml_models()),
                training_duration=training.duration)
コード例 #8
0
def run_random_forest(dataset, config, tuner, log):
    """Tune a random forest with ``tuner`` and refit the best candidate.

    Each trial asks the tuner for parameters, scores the resulting estimator
    with ``cross_val_score`` on the training split (mean over folds) and
    reports the score back to the tuner. Tuning stops once the elapsed time
    exceeds ``config.max_runtime_seconds`` or as soon as a trial raises.

    Args:
        dataset: provides ``train.X``/``train.y`` and ``test.X``/``test.y``.
        config: provides ``type``, ``seed`` and ``max_runtime_seconds``.
        tuner: object exposing ``update_search_space``,
            ``generate_parameters``, ``receive_trial_result`` and
            ``handle_terminate``.
        log: logger used for progress messages.

    Returns:
        Tuple ``(probabilities, predictions, training, y_test)`` where
        ``probabilities`` is ``None`` for non-classification tasks and
        ``training`` is the ``Timer`` wrapped around the final fit.

    Raises:
        RuntimeError: if no trial completed, so there is no model to refit.
    """
    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X, dataset.test.X)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor

    best_score, best_params, best_model = None, None, None
    score_higher_better = True

    tuner.update_search_space(SEARCH_SPACE)
    start_time = time.time()
    while True:
        try:
            param_idx, cur_params = tuner.generate_parameters()
            cur_model = estimator(random_state=config.seed, **cur_params)
            # Here score is the output of score() from the estimator
            fold_scores = cross_val_score(cur_model, X_train, y_train)
            cur_score = sum(fold_scores) / float(len(fold_scores))

            if score_higher_better:
                improved = best_score is None or cur_score > best_score
            else:
                improved = best_score is None or cur_score < best_score
            if improved:
                best_score, best_params, best_model = cur_score, cur_params, cur_model

            log.info("Trial {}: \n{}\nScore: {}\n".format(
                param_idx, cur_params, cur_score))
            tuner.receive_trial_result(param_idx, cur_params, cur_score)

            if time.time() - start_time > config.max_runtime_seconds:
                break
        except Exception:
            # A failing trial still ends the tuning (best effort), but the
            # reason is logged, and KeyboardInterrupt / SystemExit are no
            # longer swallowed.
            log.warning("Tuning loop stopped by an exception.", exc_info=True)
            break

    # This line is required to fully terminate some advisors
    tuner.handle_terminate()

    log.info("Tuning done, the best parameters are:\n{}\n".format(best_params))

    if best_model is None:
        raise RuntimeError(
            "No trial completed successfully: there is no model to retrain.")

    # retrain on the whole dataset
    with Timer() as training:
        best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)
    probabilities = best_model.predict_proba(
        X_test) if is_classification else None

    return probabilities, predictions, training, y_test
コード例 #9
0
def run(dataset: Dataset, config: TaskConfig):
    """Train an sklearn random forest and save its predictions.

    Framework parameters whose name starts with ``_`` are not forwarded to
    the estimator; ``_n_jobs`` overrides the core count from ``config``.
    Both features and targets come from the encoded views of the dataset.

    Returns:
        dict with ``models_count`` (``len`` of the fitted forest) and
        ``training_duration``.
    """
    log.info("\n**** Random Forest (sklearn) ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train = dataset.train.y_enc
    y_test = dataset.test.y_enc

    training_params = {}
    for key, value in config.framework_params.items():
        if not key.startswith('_'):
            training_params[key] = value
    # useful to disable multicore, regardless of the dataset config
    n_jobs = config.framework_params.get('_n_jobs', config.cores)

    log.info(
        "Running RandomForest with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, n_jobs))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".
        format(config.metric))

    if is_classification:
        estimator = RandomForestClassifier
    else:
        estimator = RandomForestRegressor
    rf = estimator(n_jobs=n_jobs, random_state=config.seed, **training_params)

    # Only the fit is timed; prediction happens outside the timer.
    with Timer() as training:
        rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)
    probabilities = None
    if is_classification:
        probabilities = rf.predict_proba(X_test)

    save_predictions_to_file(
        dataset=dataset,
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
        target_is_encoded=True,
    )

    return dict(models_count=len(rf), training_duration=training.duration)
コード例 #10
0
def run(dataset: Dataset, config: TaskConfig):
    """Train a single sklearn decision tree and save its predictions.

    Features are the imputed encoded matrices; targets are the raw
    (non-encoded) ``y`` values. All framework parameters are forwarded to
    the estimator constructor.

    Returns:
        dict with ``models_count`` and ``training_duration``.
    """
    log.info("\n**** Decision Tree (sklearn) ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train = dataset.train.y
    y_test = dataset.test.y

    if is_classification:
        estimator = DecisionTreeClassifier
    else:
        estimator = DecisionTreeRegressor
    predictor = estimator(random_state=config.seed, **config.framework_params)

    # Only the fit is timed; prediction happens outside the timer.
    with Timer() as training:
        predictor.fit(X_train, y_train)

    predictions = predictor.predict(X_test)
    probabilities = None
    if is_classification:
        probabilities = predictor.predict_proba(X_test)

    save_predictions_to_file(
        dataset=dataset,
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
    )

    return dict(models_count=1, training_duration=training.duration)
コード例 #11
0
def run_in_venv(caller_file,
                script_file: str,
                *args,
                input_data: Union[dict, ns],
                dataset: Dataset,
                config: TaskConfig,
                process_results=None,
                python_exec=None):
    """Run ``script_file`` in a subprocess and collect its results from a file.

    The script is resolved relative to the directory of ``caller_file`` and,
    unless ``python_exec`` is given, executed with the interpreter found in
    the ``venv/bin`` directory next to it. Numpy arrays found in
    ``input_data`` are saved as ``.npy`` files in a temporary directory and
    replaced by their paths; the resulting structure and ``config`` are
    serialized to JSON and passed to the subprocess as its input string.

    Args:
        caller_file: file whose directory contains the script and the venv.
        script_file: script path relative to that directory.
        *args: extra positional arguments forwarded to ``run_cmd``.
        input_data: data to hand over to the subprocess.
        dataset: released before the subprocess starts; also passed to
            ``save_predictions``.
        config: task configuration; ``result_dir`` and ``result_file`` are
            overwritten on it before it is serialized.
        process_results: optional callable applied to the loaded result.
        python_exec: interpreter command; defaults to the local venv python.

    Returns:
        dict with ``models_count``, ``training_duration``,
        ``predict_duration`` and every entry of ``res.others``.

    Raises:
        NoResultError: if no result could be loaded, or if the result
            carries an ``error_message``.
    """

    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        # The command-line flag is part of the string: `cmd` below is a
        # command line, not a plain path.
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TemporaryDirectory() as tmpdir:

        # Callback for ns.walk: numpy arrays are dumped to
        # "<parents>.<key>.npy" in tmpdir and replaced by that path; every
        # other value is returned unchanged.
        # NOTE(review): `parents + [...]` fails if parents is None --
        # presumably ns.walk always supplies a list; confirm.
        def make_path(k, v, parents=None):
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                # Keys matching `vector_keys` (defined elsewhere) are saved
                # as a single column.
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        # Presumably frees the in-memory data now that it is on disk --
        # confirm against Dataset.release.
        dataset.release()

        # These assignments must happen before `config` is serialized below.
        config.result_dir = tmpdir
        # mktemp only generates a file name inside tmpdir; presumably the
        # subprocess writes its result there -- confirm in the called script.
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(
                cmd,
                *args,
                _input_str_=params,
                _live_output_=True,
                _error_level_=logging.DEBUG,
                # venv binaries come first on PATH; PYTHONPATH holds the
                # project root directory.
                _env_=dict(PATH=os.pathsep.join(
                    [venv_bin_path, os.environ['PATH']]),
                           PYTHONPATH=os.pathsep.join([
                               rconfig().root_dir,
                           ]),
                           AMLB_PATH=os.path.join(rconfig().root_dir, "amlb")),
            )

        # Fallback used when the result file was never written.
        # NOTE(review): presumably a namespace whose missing attributes
        # default to None -- confirm against the ns implementation.
        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", res)

        if not res:
            raise NoResultError(f"Process crashed:\n{err}")

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        # These entries hold paths to .npy files; replace them with the
        # loaded arrays. Must run while tmpdir still exists.
        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(
                res[name],
                allow_pickle=True) if res[name] is not None else None

        if callable(process_results):
            res = process_results(res)

        # Predictions are only saved when the result names an output file.
        if res.output_file:
            save_predictions(
                dataset=dataset,
                output_file=res.output_file,
                predictions=res.predictions.reshape(-1)
                if res.predictions is not None else None,
                truth=res.truth.reshape(-1) if res.truth is not None else None,
                probabilities=res.probabilities,
                probabilities_labels=res.probabilities_labels,
                target_is_encoded=res.target_is_encoded)

        # Missing values fall back to one model and to the wall time of the
        # whole subprocess.
        return dict(models_count=res.models_count
                    if res.models_count is not None else 1,
                    training_duration=res.training_duration if
                    res.training_duration is not None else proc_timer.duration,
                    predict_duration=res.predict_duration,
                    **res.others.__dict__)
コード例 #12
0
def run(dataset: Dataset, config: TaskConfig):
    """Run AutoWEKA through its Java command line and save the predictions.

    The benchmark metric is mapped to a Weka metric, the train/test files
    are reordered when the target is not the last attribute, and the
    predictions are parsed back from the CSV file written by Weka.

    Returns:
        dict with ``training_duration`` (wall time of the Java call).

    Raises:
        ValueError: for non-classification tasks or unsupported metrics.
        NoResultError: if Weka did not write its predictions file.
    """
    log.info(f"\n**** AutoWEKA [v{config.framework_version}]****\n")
    save_metadata(config)

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    # Mapping of benchmark metrics to Weka metrics
    metrics_mapping = dict(acc='errorRate',
                           auc='areaUnderROC',
                           logloss='kBInformation')
    metric = metrics_mapping.get(config.metric)
    if metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    # Weka requires the target as the last attribute
    if dataset.target.index != len(dataset.predictors):
        train_file = reorder_dataset(dataset.train.path,
                                     target_src=dataset.target.index)
        test_file = reorder_dataset(dataset.test.path,
                                    target_src=dataset.target.index)
    else:
        train_file = dataset.train.path
        test_file = dataset.test.path

    training_params = {}
    for key, value in config.framework_params.items():
        if not key.startswith('_'):
            training_params[key] = value
    parallelRuns = config.framework_params.get('_parallelRuns', config.cores)

    memLimit = config.framework_params.get('_memLimit', 'auto')
    if memLimit == 'auto':
        per_run_mb = min(config.max_mem_size_mb,
                         math.ceil(config.max_mem_size_mb / parallelRuns))
        memLimit = max(per_run_mb, 1024)  # AutoWEKA default memLimit
    log.info("Using %sMB memory per run on %s parallel runs.", memLimit,
             parallelRuns)

    # Weka writes its predictions next to the benchmark predictions file.
    split = split_path(config.output_predictions_file)
    split.extension = '.weka_pred.csv'
    weka_file = path_from_split(split)

    cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format(
        here=dir_of(__file__))
    classifications = '"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""'.format(weka_file)
    cmd_params = dict(
        t='"{}"'.format(train_file),
        T='"{}"'.format(test_file),
        memLimit=memLimit,
        classifications=classifications,
        timeLimit=int(config.max_runtime_seconds / 60),
        parallelRuns=parallelRuns,
        metric=metric,
        seed=config.seed % (1 << 16),  # weka accepts only int16 as seeds
        **training_params)
    options = ["-{} {}".format(name, value) for name, value in cmd_params.items()]
    cmd = cmd_root + ' '.join(options)
    with Timer() as training:
        run_cmd(cmd, _live_output_=True)

    # if target values are not sorted alphabetically in the ARFF file, then class probabilities are returned in the original order
    # interestingly, other frameworks seem to always sort the target values first
    # that's why we need to specify the probabilities labels here: sorting+formatting is done in saving function
    probabilities_labels = dataset.target.values
    if not os.path.exists(weka_file):
        raise NoResultError("AutoWEKA failed producing any prediction.")

    probabilities = []
    predictions = []
    truth = []
    with open(weka_file, 'r') as pred_stream:
        # The first and the last line of the file are skipped.
        rows = pred_stream.readlines()[1:-1]
    for row in rows:
        inst, actual, predicted, error, *distribution = row.split(',')
        # "actual" and "predicted" are "<prefix>:<label>" pairs.
        _, pred = predicted.split(':')
        _, tru = actual.split(':')
        probabilities.append([
            cell.replace('*', '').replace('\n', '') for cell in distribution
        ])
        predictions.append(pred)
        truth.append(tru)

    save_predictions(
        dataset=dataset,
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=truth,
        probabilities_labels=probabilities_labels,
    )

    return dict(training_duration=training.duration)
コード例 #13
0
def run_in_venv(caller_file,
                script_file: str,
                *args,
                input_data: Union[dict, ns],
                dataset: Dataset,
                config: TaskConfig,
                process_results=None,
                python_exec=None):
    """Run ``script_file`` in a subprocess and parse its result from stdout.

    The script is resolved relative to the directory of ``caller_file`` and,
    unless ``python_exec`` is given, executed with ``venv/bin/python`` next
    to it. Numpy arrays found in ``input_data`` are saved as ``.npy`` files
    in a temporary directory and replaced by their paths; the resulting
    structure and ``config`` are serialized to JSON and passed to the
    subprocess as its input string. The result is the JSON line that follows
    the line equal to ``config.result_token`` in the subprocess output.

    Args:
        caller_file: file whose directory contains the script and the venv.
        script_file: script path relative to that directory.
        *args: extra positional arguments forwarded to ``run_cmd``.
        input_data: data to hand over to the subprocess.
        dataset: released before the subprocess starts; also passed to
            ``save_predictions_to_file``.
        config: task configuration; ``result_token`` and ``result_dir`` are
            overwritten on it before it is serialized.
        process_results: optional callable applied to the parsed result.
        python_exec: interpreter command; defaults to the local venv python.

    Returns:
        dict with ``models_count`` and ``training_duration``.

    Raises:
        NoResultError: if the result carries an ``error_message``.
    """

    here = dir_of(caller_file)
    if python_exec is None:  # use local virtual env by default
        # The command-line flag is part of the string: `cmd` below is a
        # command line, not a plain path.
        python_exec = os.path.join(here, 'venv/bin/python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TmpDir() as tmpdir:

        # Callback for ns.walk: numpy arrays are dumped to
        # "<parents>.<key>.npy" in tmpdir and replaced by that path; every
        # other value is returned unchanged.
        # NOTE(review): `parents + [...]` fails if parents is None --
        # presumably ns.walk always supplies a list; confirm.
        def make_path(k, v, parents=None):
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                # Keys matching `vector_keys` (defined elsewhere) are saved
                # as a single column.
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        # Presumably frees the in-memory data now that it is on disk --
        # confirm against Dataset.release.
        dataset.release()

        # These assignments must happen before `config` is serialized below.
        # The token marks the line preceding the JSON result in the output.
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(cmd,
                                  *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _env_=dict(PYTHONPATH=os.pathsep.join([
                                      rconfig().root_dir,
                                      os.path.join(rconfig().root_dir, "amlb"),
                                  ])))

        # Scan the captured output for the token line; the line right after
        # it is parsed as the JSON result.
        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        # NOTE(review): if the token never appears, `res` is still the empty
        # namespace here; whether this attribute access yields None or raises
        # depends on the ns implementation -- confirm.
        if res.error_message is not None:
            raise NoResultError(res.error_message)

        # These entries hold paths to .npy files; replace them with the
        # loaded arrays. Must run while tmpdir still exists.
        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(
                res[name],
                allow_pickle=True) if res[name] is not None else None

        log.debug("Result from subprocess:\n%s", res)
        if callable(process_results):
            res = process_results(res)

        save_predictions_to_file(
            dataset=dataset,
            output_file=res.output_file,
            predictions=res.predictions.reshape(-1)
            if res.predictions is not None else None,
            truth=res.truth.reshape(-1) if res.truth is not None else None,
            probabilities=res.probabilities,
            target_is_encoded=res.target_is_encoded)

        # Missing values fall back to one model and to the wall time of the
        # whole subprocess.
        return dict(models_count=res.models_count
                    if res.models_count is not None else 1,
                    training_duration=res.training_duration if
                    res.training_duration is not None else proc_timer.duration)
コード例 #14
0
ファイル: exec.py プロジェクト: shashank0117/automlbenchmark
def run(dataset: Dataset, config: TaskConfig):
    """Run H2O AutoML on the task and save predictions and the leaderboard.

    The benchmark metric is mapped to an H2O ``sort_metric`` (``None`` when
    it is not supported). Framework parameters whose name starts with ``_``
    are not forwarded to ``H2OAutoML``; ``_nthreads`` overrides the core
    count from ``config``. The leaderboard is written next to the
    predictions file with a ``.leaderboard.csv`` extension.

    Returns:
        dict with ``models_count`` (leaderboard length) and
        ``training_duration``.

    Raises:
        NoResultError: if AutoML finished without a leader model.
    """
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping.get(config.metric)
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.",
                    config.metric)

    try:
        training_params = {
            k: v
            for k, v in config.framework_params.items()
            if not k.startswith('_')
        }
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads,
                 config.max_mem_size_mb)
        h2o.init(nthreads=nthreads,
                 min_mem_size=str(config.max_mem_size_mb) + "M",
                 max_mem_size=str(config.max_mem_size_mb) + "M",
                 log_dir=os.path.join(config.output_dir, 'logs', config.name,
                                      str(config.fold)))

        # Both train and test are imported as H2O frames
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name,
                 config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError(
                "H2O could not produce any model in the requested time.")

        lb = aml.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        lbf = split_path(config.output_predictions_file)
        lbf.extension = '.leaderboard.csv'
        lbf = path_from_split(lbf)
        write_csv(lb, lbf)

        # First row of the raw prediction table holds the column names;
        # column 0 is the prediction, the remaining columns the probabilities.
        h2o_preds = aml.predict(test).as_data_frame(use_pandas=False)
        preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
        y_pred = preds.iloc[:, 0]

        h2o_truth = test[:,
                         dataset.target.index].as_data_frame(use_pandas=False,
                                                             header=False)
        y_truth = to_data_frame(h2o_truth)

        predictions = y_pred.values
        probabilities = preds.iloc[:, 1:].values
        truth = y_truth.values

        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=truth)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)

    finally:
        # Look the connection up once and only touch it when it exists:
        # dereferencing `.local_server` on a missing connection (e.g. when
        # h2o.init failed) would raise here and mask the original error.
        connection = h2o.connection()
        if connection:
            h2o.remove_all()
            connection.close()
            if connection.local_server:
                connection.local_server.shutdown()
0
ファイル: run_random_forest.py プロジェクト: xiaowu0162/nni
def run_random_forest(dataset, config, tuner, log):
    """
    Using the given tuner, tune a random forest within the given constraint.

    This function uses the mean cross validation score as the feedback score
    to the tuner. The search space is the module-level ``SEARCH_SPACE``.
    Tuning stops after ``config.max_runtime_seconds`` when ``limit_type`` is
    ``'time'``, after ``trial_limit`` trials when it is ``'ntrials'``, or as
    soon as a trial raises an exception.

    Args:
        dataset: provides ``train.y``/``test.y`` and is passed to
            ``preprocess_random_forest`` to obtain the feature matrices.
        config: provides ``type``, ``seed``, ``max_runtime_seconds`` and the
            ``limit_type`` / ``trial_limit`` framework parameters.
        tuner: object exposing ``update_search_space``,
            ``generate_parameters``, ``receive_trial_result`` and
            ``handle_terminate``.
        log: logger used for progress messages.

    Returns:
        Tuple ``(probabilities, predictions, training, y_test,
        intermediate_scores, intermediate_best_scores)``; ``probabilities``
        is ``None`` for non-classification tasks and ``training`` is the
        ``Timer`` wrapped around the final fit.

    Raises:
        RuntimeError: if no trial completed, so there is no model to refit.
    """

    limit_type, trial_limit = config.framework_params['limit_type'], None
    if limit_type == 'ntrials':
        trial_limit = int(config.framework_params['trial_limit'])

    X_train, X_test = preprocess_random_forest(dataset, log)
    y_train, y_test = dataset.train.y, dataset.test.y

    is_classification = config.type == 'classification'
    estimator = RandomForestClassifier if is_classification else RandomForestRegressor

    best_score, best_params, best_model = None, None, None
    score_higher_better = True

    tuner.update_search_space(SEARCH_SPACE)

    start_time = time.time()
    trial_count = 0
    intermediate_scores = []
    intermediate_best_scores = []           # should be monotonically increasing

    # NOTE(review): with any other limit_type the loop only ends when a
    # trial raises -- confirm whether that is intended.
    while True:
        try:
            trial_count += 1
            param_idx, cur_params = tuner.generate_parameters()

            # Drop entries the estimator must not receive: the tuner's budget
            # marker, and zero values for max_leaf_nodes / max_depth.
            train_params = cur_params.copy()
            train_params.pop('TRIAL_BUDGET', None)
            for optional_key in ('max_leaf_nodes', 'max_depth'):
                # A missing key no longer aborts the tuning with a KeyError.
                if cur_params.get(optional_key) == 0:
                    train_params.pop(optional_key)
            log.info("Trial {}: \n{}\n".format(param_idx, cur_params))

            cur_model = estimator(random_state=config.seed, **train_params)

            # Here score is the output of score() from the estimator
            fold_scores = cross_val_score(cur_model, X_train, y_train)
            cur_score = sum(fold_scores) / float(len(fold_scores))
            if np.isnan(cur_score):
                cur_score = 0

            log.info("Score: {}\n".format(cur_score))
            if score_higher_better:
                improved = best_score is None or cur_score > best_score
            else:
                improved = best_score is None or cur_score < best_score
            if improved:
                best_score, best_params, best_model = cur_score, cur_params, cur_model

            intermediate_scores.append(cur_score)
            intermediate_best_scores.append(best_score)
            tuner.receive_trial_result(param_idx, cur_params, cur_score)

            if limit_type == 'time':
                if time.time() - start_time >= config.max_runtime_seconds:
                    break
            elif limit_type == 'ntrials':
                if trial_count >= trial_limit:
                    break
        except Exception:
            # A failing trial still ends the tuning (best effort), but the
            # reason is logged, and KeyboardInterrupt / SystemExit are no
            # longer swallowed.
            log.warning("Tuning loop stopped by an exception.", exc_info=True)
            break

    # This line is required to fully terminate some advisors
    tuner.handle_terminate()

    log.info("Tuning done, the best parameters are:\n{}\n".format(best_params))

    if best_model is None:
        raise RuntimeError(
            "No trial completed successfully: there is no model to retrain.")

    # retrain on the whole dataset
    with Timer() as training:
        best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)
    probabilities = best_model.predict_proba(X_test) if is_classification else None

    return probabilities, predictions, training, y_test, intermediate_scores, intermediate_best_scores
コード例 #16
0
def run(dataset: Dataset, config: TaskConfig):
    """Run H2O AutoML on one benchmark task and save its predictions.

    A local H2O cluster is started with the thread count, port and JVM memory
    derived from ``config``; the train and test files are imported as H2O
    frames; ``H2OAutoML`` is then trained with ``config.max_runtime_seconds``
    as its time budget. Framework parameters whose key starts with ``_`` are
    not forwarded to ``H2OAutoML``; the ones read here are ``_nthreads``,
    ``_port``, ``_strict_version_check`` and ``_monitor_backend``.

    Args:
        dataset: benchmark dataset exposing ``train.path``, ``test.path`` and
            ``target.index``.
        config: task configuration (metric, seed, resources, framework params).

    Returns:
        dict with ``models_count`` (length of the AutoML leaderboard) and
        ``training_duration``.

    Raises:
        NoResultError: if AutoML finished without producing a leader model.
    """
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           r2='r2',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping.get(config.metric)
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.",
                    config.metric)

    try:
        # Keys starting with '_' configure this integration, not H2OAutoML itself
        training_params = {
            k: v
            for k, v in config.framework_params.items()
            if not k.startswith('_')
        }
        nthreads = config.framework_params.get('_nthreads', config.cores)
        # leaving 1/3rd of available memory for XGBoost
        jvm_memory = str(round(config.max_mem_size_mb * 2 / 3)) + "M"

        log.info("Starting H2O cluster with %s cores, %s memory.", nthreads,
                 jvm_memory)
        # Derive a default port from the process id so that concurrent runs
        # are unlikely to pick the same one
        max_port_range = 49151
        min_port_range = 1024
        rnd_port = (os.getpid() % (max_port_range - min_port_range)
                    + min_port_range)
        port = config.framework_params.get('_port', rnd_port)

        h2o.init(
            nthreads=nthreads,
            port=port,
            min_mem_size=jvm_memory,
            max_mem_size=jvm_memory,
            strict_version_check=config.framework_params.get(
                '_strict_version_check', True)
            # log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold))
        )

        # Load both train and test as H2O frames
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path,
                                destination_frame=frame_name('train', config))
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path,
                               destination_frame=frame_name('test', config))
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name,
                 config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(
            max_runtime_secs=config.max_runtime_seconds,
            # to prevent timeout on ensembles
            max_runtime_secs_per_model=round(config.max_runtime_seconds / 2),
            sort_metric=sort_metric,
            seed=config.seed,
            **training_params)

        if config.framework_params.get('_monitor_backend', False):
            monitor = BackendMemoryMonitoring(
                frequency_seconds=rconfig().monitoring.frequency_seconds,
                check_on_exit=True,
                verbosity=rconfig().monitoring.verbosity)
        else:
            # No-op context manager (contextlib.nullcontext is Py 3.7+ only)
            monitor = contextlib.contextmanager(iter)([0])
        with Timer() as training:
            with monitor:
                aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError(
                "H2O could not produce any model in the requested time.")

        save_predictions(aml, test, dataset=dataset, config=config)
        save_artifacts(aml, dataset=dataset, config=config)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)

    finally:
        # h2o.connection() returns None when h2o.init() never succeeded; guard
        # every access so that cleanup cannot mask the original exception.
        connection = h2o.connection()
        if connection:
            # h2o.remove_all()
            connection.close()
        if connection is not None and connection.local_server:
            connection.local_server.shutdown()
コード例 #17
0
def run(dataset: Dataset, config: TaskConfig):
    """Tune a scikit-learn random forest with NNI's EvolutionTuner, then fit and predict.

    Trials are generated by the tuner and scored with ``cross_val_score``
    (mean over folds) until ``config.max_runtime_seconds`` has elapsed or a
    trial fails. The best-scoring configuration is then refitted on the whole
    training set and used to predict the test set; predictions are written to
    ``config.output_predictions_file``.

    Args:
        dataset: benchmark dataset exposing ``train``/``test`` with ``X`` and ``y``.
        config: task configuration (type, seed, time budget, framework params).

    Returns:
        dict with ``models_count`` (always 1) and ``training_duration`` (the
        duration of the final refit only).

    Raises:
        RuntimeError: if no trial completed successfully, so that there is no
            model to train; the failure of the first trial is chained as cause.
    """
    log.info(
        "\n**** Random Forest (sklearn) Tuned with NNI EvolutionTuner ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X, dataset.test.X)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    best_score, best_params, best_model = None, None, None
    score_higher_better = True
    last_error = None

    log.info(
        "Tuning hyperparameters with NNI EvolutionTuner with a maximum time of {}s\n"
        .format(config.max_runtime_seconds))
    tuner = EvolutionTuner()
    tuner.update_search_space(SEARCH_SPACE)
    start_time = time.time()
    param_idx = 0
    while True:
        try:
            cur_params = tuner.generate_parameters(param_idx)
            cur_model = estimator(random_state=config.seed,
                                  **cur_params,
                                  **config.framework_params)
            # Here score is the output of score() from the estimator
            fold_scores = cross_val_score(cur_model, X_train, y_train)
            cur_score = sum(fold_scores) / float(len(fold_scores))
            if score_higher_better:
                improved = best_score is None or cur_score > best_score
            else:
                improved = best_score is None or cur_score < best_score
            if improved:
                best_score, best_params, best_model = cur_score, cur_params, cur_model

            log.info("Trial {}: \n{}\nScore: {}\n".format(
                param_idx, cur_params, cur_score))
            tuner.receive_trial_result(param_idx, cur_params, cur_score)
            param_idx += 1
            if time.time() - start_time > config.max_runtime_seconds:
                break
        except Exception as e:
            # A failing trial still ends the tuning loop, but the reason is now
            # reported instead of being silently discarded. KeyboardInterrupt
            # and SystemExit are deliberately no longer swallowed.
            log.warning("Trial %s failed, stopping the tuning loop: %s",
                        param_idx, e)
            log.debug("Exception:", exc_info=True)
            last_error = e
            break

    if best_model is None:
        # The loop can only end without a model when the very first trial failed
        raise RuntimeError(
            "NNI EvolutionTuner could not complete any trial: no model to train."
        ) from last_error

    log.info("Tuning done, the best parameters are:\n{}\n".format(best_params))

    # retrain on the whole dataset
    with Timer() as training:
        best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)
    probabilities = best_model.predict_proba(
        X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(models_count=1, training_duration=training.duration)
コード例 #18
0
def run(dataset: Dataset, config: TaskConfig):
    """Search for a scikit-learn model with hyperopt-sklearn, then predict.

    The benchmark metric is translated into a hyperopt-sklearn loss function
    where one is listed below; ``acc`` and ``r2`` are passed as ``None`` so
    that the estimator falls back on its own loss, and unsupported metrics do
    the same after a warning. Fitting is wrapped in two interrupt timeouts.
    For classification, the reported probabilities are a one-hot encoding of
    the predicted labels.

    Returns:
        dict with ``models_count`` (number of trials) and ``training_duration``.
    """
    log.info("\n**** Hyperopt-sklearn ****\n")

    is_classification = config.type == 'classification'

    # Sentinel marking metrics that are supported but need no explicit loss_fn
    builtin_loss = object()
    loss_by_metric = {
        'acc': (builtin_loss, False),  # lambda y, pred: 1.0 - accuracy_score(y, pred)
        'auc': (lambda y, pred: 1.0 - roc_auc_score(y, pred), False),
        'f1': (lambda y, pred: 1.0 - f1_score(y, pred), False),
        # 'logloss': (log_loss, True),
        'mae': (mean_absolute_error, False),
        'mse': (mean_squared_error, False),
        'msle': (mean_squared_log_error, False),
        'r2': (builtin_loss, False),  # lambda y, pred: 1.0 - r2_score(y, pred)
    }
    loss_fn, continuous_loss_fn = loss_by_metric.get(config.metric,
                                                     (None, False))
    if loss_fn is None:
        fallback_name = 'accuracy' if is_classification else 'r2'
        log.warning("Performance metric %s not supported: defaulting to %s.",
                    config.metric, fallback_name)
    elif loss_fn is builtin_loss:
        loss_fn = None

    # Keys starting with '_' are not forwarded to the estimator
    training_params = {}
    for key, value in config.framework_params.items():
        if not key.startswith('_'):
            training_params[key] = value

    log.warning("Ignoring cores constraint of %s cores.", config.cores)
    log.info(
        "Running hyperopt-sklearn with a maximum time of %ss on %s cores, optimizing %s.",
        config.max_runtime_seconds, 'all', config.metric)

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train = dataset.train.y_enc
    y_test = dataset.test.y_enc

    # Exactly one of classifier/regressor is set, depending on the task type
    classifier = any_classifier('clf') if is_classification else None
    regressor = None if is_classification else any_regressor('rgr')

    estimator = HyperoptEstimator(
        classifier=classifier,
        regressor=regressor,
        algo=tpe.suggest,
        loss_fn=loss_fn,
        continuous_loss_fn=continuous_loss_fn,
        trial_timeout=config.max_runtime_seconds,
        seed=config.seed,
        **training_params)

    # Outer guard: SIGQUIT after 4/3 of the budget; inner guard: at the budget
    # itself, kill the child process tree before interrupting.
    kill_children = ft.partial(kill_proc_tree, timeout=5, include_parent=False)
    with InterruptTimeout(config.max_runtime_seconds * 4 / 3,
                          sig=signal.SIGQUIT), \
            InterruptTimeout(config.max_runtime_seconds,
                             before_interrupt=kill_children), \
            Timer() as training:
        estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)

    probabilities = None
    if is_classification:
        # One-hot encode the predicted labels to stand in for probabilities
        target_values_enc = dataset.target.label_encoder.transform(
            dataset.target.values)
        one_hot = Encoder('one-hot', target=False, encoded_type=float)
        probabilities = one_hot.fit(target_values_enc).transform(predictions)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(estimator.trials),
                training_duration=training.duration)
コード例 #19
0
def run(dataset: Dataset, config: TaskConfig):
    """Optimize a pipeline with TPOT on the encoded data, then predict and save.

    The benchmark metric is mapped to a TPOT scoring name, and the time budget
    is converted to minutes. When the optimized pipeline cannot produce
    probabilities (TPOT raises ``RuntimeError``), a one-hot encoding of the
    predicted labels is used instead.

    Returns:
        dict with ``models_count`` (number of evaluated individuals) and
        ``training_duration``.

    Raises:
        ValueError: if ``config.metric`` has no TPOT equivalent.
    """
    log.info("\n**** TPOT ****\n")

    is_classification = config.type == 'classification'
    # Mapping of benchmark metrics to TPOT metrics
    tpot_scorers = {
        'acc': 'accuracy',
        'auc': 'roc_auc',
        'f1': 'f1',
        'logloss': 'neg_log_loss',
        'mae': 'neg_mean_absolute_error',
        'mse': 'neg_mean_squared_error',
        'msle': 'neg_mean_squared_log_error',
        'r2': 'r2',
    }
    if config.metric not in tpot_scorers:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))
    scoring_metric = tpot_scorers[config.metric]

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train = dataset.train.y_enc
    y_test = dataset.test.y_enc

    # Keys starting with '_' are not forwarded to TPOT
    training_params = dict(
        (name, value) for name, value in config.framework_params.items()
        if not name.startswith('_'))
    # useful to disable multicore, regardless of the dataset config
    n_jobs = config.framework_params.get('_n_jobs', config.cores)

    log.info(
        'Running TPOT with a maximum time of %ss on %s cores, optimizing %s.',
        config.max_runtime_seconds, n_jobs, scoring_metric)
    runtime_min = config.max_runtime_seconds / 60

    if is_classification:
        estimator = TPOTClassifier
    else:
        estimator = TPOTRegressor
    tpot = estimator(n_jobs=n_jobs,
                     max_time_mins=runtime_min,
                     scoring=scoring_metric,
                     random_state=config.seed,
                     **training_params)

    with Timer() as training:
        tpot.fit(X_train, y_train)

    log.info('Predicting on the test set.')
    predictions = tpot.predict(X_test)
    probabilities = None
    if is_classification:
        try:
            probabilities = tpot.predict_proba(X_test)
        except RuntimeError:
            # TPOT throws a RuntimeError if the optimized pipeline does not support `predict_proba`.
            target_values_enc = dataset.target.label_encoder.transform(
                dataset.target.values)
            one_hot = Encoder('one-hot', target=False, encoded_type=float)
            probabilities = one_hot.fit(target_values_enc).transform(
                predictions)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=is_classification)

    save_artifacts(tpot, config)

    return dict(models_count=len(tpot.evaluated_individuals_),
                training_duration=training.duration)
コード例 #20
0
ファイル: exec.py プロジェクト: drpdr/automlbenchmark
def run(dataset: Dataset, config: TaskConfig):
    """Run H2O AutoML on one benchmark task and save its predictions.

    A local H2O cluster is started with ``config.max_mem_size_mb`` of memory,
    the train and test files are imported as H2O frames, and ``H2OAutoML`` is
    trained with ``config.max_runtime_seconds`` as its time budget. Framework
    parameters whose key starts with ``_`` are not forwarded to ``H2OAutoML``;
    ``_nthreads`` overrides the number of threads.

    Args:
        dataset: benchmark dataset exposing ``train.path``, ``test.path`` and
            ``target.index``.
        config: task configuration (metric, seed, resources, framework params).

    Returns:
        dict with ``models_count`` (length of the AutoML leaderboard) and
        ``training_duration``.

    Raises:
        NoResultError: if AutoML finished without producing a leader model.
    """
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping.get(config.metric)
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.",
                    config.metric)

    try:
        # Keys starting with '_' configure this integration, not H2OAutoML itself
        training_params = {
            k: v
            for k, v in config.framework_params.items()
            if not k.startswith('_')
        }
        nthreads = config.framework_params.get('_nthreads', config.cores)
        jvm_memory = str(config.max_mem_size_mb) + "M"

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads,
                 config.max_mem_size_mb)
        h2o.init(
            nthreads=nthreads,
            min_mem_size=jvm_memory,
            max_mem_size=jvm_memory,
            # log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold))
        )

        # Load both train and test as H2O frames
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path,
                                destination_frame=frame_name('train', config))
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path,
                               destination_frame=frame_name('test', config))
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name,
                 config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError(
                "H2O could not produce any model in the requested time.")

        save_predictions(aml, test, dataset=dataset, config=config)
        save_artifacts(aml, dataset=dataset, config=config)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)

    finally:
        # h2o.connection() returns None when h2o.init() never succeeded; guard
        # every access so that cleanup cannot mask the original exception.
        connection = h2o.connection()
        if connection:
            h2o.remove_all()
            connection.close()
        if connection is not None and connection.local_server:
            connection.local_server.shutdown()
コード例 #21
0
ファイル: exec.py プロジェクト: drpdr/automlbenchmark
def run(dataset: Dataset, config: TaskConfig):
    """Fit auto-sklearn on the encoded training data, then predict and save.

    ``FutureWarning`` and ``DeprecationWarning`` are silenced, the benchmark
    metric is mapped to an auto-sklearn metric (``None`` after a warning when
    unsupported), and the per-job and ensemble memory limits are either taken
    from the ``_ml_memory_limit`` / ``_ensemble_memory_limit`` framework
    parameters or derived from the system and configured memory when they are
    ``'auto'``.

    Returns:
        dict with ``models_count`` (number of weighted models in the final
        ensemble) and ``training_duration``.
    """
    log.info("\n**** AutoSklearn ****\n")
    for ignored_category in (FutureWarning, DeprecationWarning):
        warnings.simplefilter(action='ignore', category=ignored_category)

    is_classification = config.type == 'classification'

    # Mapping of benchmark metrics to autosklearn metrics
    autosklearn_metrics = {
        'acc': metrics.accuracy,
        'auc': metrics.roc_auc,
        'f1': metrics.f1,
        'logloss': metrics.log_loss,
        'mae': metrics.mean_absolute_error,
        'mse': metrics.mean_squared_error,
        'r2': metrics.r2,
    }
    perf_metric = autosklearn_metrics.get(config.metric)
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    # Set resources based on datasize
    log.info(
        "Running auto-sklearn with a maximum time of %ss on %s cores with %sMB, optimizing %s.",
        config.max_runtime_seconds, config.cores, config.max_mem_size_mb,
        perf_metric)
    log.info("Environment: %s", os.environ)

    X_train = dataset.train.X_enc
    y_train = dataset.train.y_enc
    # log.info("finite=%s", np.isfinite(X_train))
    predictors_type = []
    for predictor in dataset.predictors:
        predictors_type.append(
            'Categorical' if predictor.is_categorical() else 'Numerical')

    # Keys starting with '_' are not forwarded to auto-sklearn
    training_params = dict(
        (name, value) for name, value in config.framework_params.items()
        if not name.startswith('_'))

    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    ml_memory_limit = config.framework_params.get('_ml_memory_limit', 'auto')
    ensemble_memory_limit = config.framework_params.get(
        '_ensemble_memory_limit', 'auto')

    # when memory is large enough, we should have:
    # (cores - 1) * ml_memory_limit_mb + ensemble_memory_limit_mb = config.max_mem_size_mb
    total_memory_mb = system_memory_mb().total
    if ml_memory_limit == 'auto':
        per_job_share_mb = math.ceil(total_memory_mb / n_jobs)
        capped_share_mb = min(config.max_mem_size_mb, per_job_share_mb)
        # 3072 is autosklearn defaults
        ml_memory_limit = max(capped_share_mb, 3072)
    if ensemble_memory_limit == 'auto':
        unavailable_mb = total_memory_mb - config.max_mem_size_mb
        ensemble_memory_limit = max(
            math.ceil(ml_memory_limit - unavailable_mb),
            math.ceil(ml_memory_limit / 3),  # default proportions
            1024)  # 1024 is autosklearn defaults
    log.info(
        "Using %sMB memory per ML job and %sMB for ensemble job on a total of %s jobs.",
        ml_memory_limit, ensemble_memory_limit, n_jobs)

    log.warning(
        "Using meta-learned initialization, which might be bad (leakage).")
    # TODO: do we need to set per_run_time_limit too?
    if is_classification:
        estimator = AutoSklearnClassifier
    else:
        estimator = AutoSklearnRegressor
    auto_sklearn = estimator(
        time_left_for_this_task=config.max_runtime_seconds,
        n_jobs=n_jobs,
        ml_memory_limit=ml_memory_limit,
        ensemble_memory_limit=ensemble_memory_limit,
        seed=config.seed,
        **training_params)
    with Timer() as training:
        auto_sklearn.fit(X_train,
                         y_train,
                         metric=perf_metric,
                         feat_type=predictors_type)

    # Test data is only read once training is over
    log.info("Predicting on the test set.")
    X_test = dataset.test.X_enc
    y_test = dataset.test.y_enc
    predictions = auto_sklearn.predict(X_test)
    probabilities = None
    if is_classification:
        probabilities = auto_sklearn.predict_proba(X_test)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    save_artifacts(auto_sklearn, config)

    return dict(models_count=len(auto_sklearn.get_models_with_weights()),
                training_duration=training.duration)
コード例 #22
0
ファイル: exec.py プロジェクト: qingyun-wu/automlbenchmark
def run(dataset: Dataset, config: TaskConfig):
    """Train and predict with the ML.NET CLI, then save the predictions.

    The ``mlnet`` executable is looked up in the ``lib`` directory next to
    this file and configured through environment variables. Training and
    prediction each run as one external command; the prediction output is
    read back as CSV. Models and logs are kept (zipped) only when listed in
    the ``_save_artifacts`` framework parameter; otherwise they go to a
    temporary directory that is always removed on exit.

    Returns:
        dict with ``models_count`` (number of trials in the run history),
        ``training_duration`` and ``predict_duration``.

    Raises:
        ValueError: if ``config.type`` is neither classification nor regression.
        NoResultError: if training did not produce the ``.mbconfig`` result file.
    """
    log.info(f"\n**** MLNet [v{config.framework_version}] ****\n")

    supported_tasks = ('classification', 'regression')
    if config.type not in supported_tasks:
        raise ValueError(f'{config.type} is not supported.')

    here = os.path.dirname(os.path.realpath(__file__))
    dotnet_install_dir = os.path.join(here, 'lib')
    os.environ['DOTNET_ROOT'] = dotnet_install_dir
    os.environ['MLNetCLIEnablePredict'] = 'True'
    os.environ['MLNET_MAX_THREAD'] = str(config.cores)
    mlnet = os.path.join(dotnet_install_dir, 'mlnet')
    train_time_in_seconds = config.max_runtime_seconds
    sub_command = config.type

    # set up MODELBUILDER_AUTOML
    os.environ['MODELBUILDER_AUTOML'] = config.framework_params.get(
        'automl_type', 'NNI')

    artifacts = config.framework_params.get('_save_artifacts', [])
    keep_models = 'models' in artifacts
    keep_logs = 'logs' in artifacts
    tmpdir = tempfile.mkdtemp()
    tmp_output_folder = os.path.join(tmpdir, str(config.fold))
    if keep_models:
        output_dir = output_subdir('models', config=config)
    else:
        output_dir = tmp_output_folder
    if keep_logs:
        log_dir = output_subdir('logs', config=config)
    else:
        log_dir = tmp_output_folder
    log_path = os.path.join(log_dir, 'log.txt')

    try:
        label = dataset.target.index
        train_dataset_path = dataset.train.data_path('csv')
        test_dataset_path = dataset.test.data_path('csv')

        log.info(f'train dataset: {train_dataset_path}')
        log.info(f'test dataset: {test_dataset_path}')

        cmd = (
            f"{mlnet} {sub_command}"
            f" --dataset {train_dataset_path} --test-dataset {test_dataset_path} --train-time {train_time_in_seconds}"
            f" --label-col {label} --output {os.path.dirname(output_dir)} --name {config.fold}"
            f" --verbosity q --log-file-path {log_path}")

        with Timer() as training:
            run_cmd(cmd)

        train_result_json = os.path.join(output_dir,
                                         f"{config.fold}.mbconfig")
        if not os.path.exists(train_result_json):
            raise NoResultError("MLNet failed producing any prediction.")

        with open(train_result_json, 'r') as f:
            mb_config = json.load(f)

        model_path = os.path.join(output_dir, f"{config.fold}.zip")
        # keeping this in log dir as it contains useful error when prediction fails
        output_prediction_path = os.path.join(log_dir, "prediction.txt")
        models_count = len(mb_config['RunHistory']['Trials'])

        # predict
        predict_cmd = (
            f"{mlnet} predict --task-type {config.type}"
            f" --model {model_path} --dataset {test_dataset_path} --label-col {dataset.target.name} > {output_prediction_path}"
        )
        with Timer() as prediction:
            run_cmd(predict_cmd)

        if config.type == 'classification':
            # The last column holds the predicted label; the preceding
            # columns are used as per-class probabilities.
            prediction_df = pd.read_csv(output_prediction_path,
                                        dtype={'PredictedLabel': 'object'})
            class_columns = list(prediction_df.columns.values[:-1])
            save_predictions(
                dataset=dataset,
                output_file=config.output_predictions_file,
                predictions=prediction_df['PredictedLabel'].values,
                truth=dataset.test.y,
                probabilities=prediction_df.values[:, :-1],
                probabilities_labels=class_columns,
            )
        elif config.type == 'regression':
            prediction_df = pd.read_csv(output_prediction_path)
            save_predictions(
                dataset=dataset,
                output_file=config.output_predictions_file,
                predictions=prediction_df['Score'].values,
                truth=dataset.test.y,
            )

        return dict(
            models_count=models_count,
            training_duration=training.duration,
            predict_duration=prediction.duration,
        )
    finally:
        # Zip each requested artifact directory, then drop everything but the zip
        if keep_logs:
            logs_zip = os.path.join(log_dir, "logs.zip")
            zip_path(log_dir, logs_zip)
            clean_dir(log_dir, filter_=lambda p: p != logs_zip)
        if keep_models:
            models_zip = os.path.join(output_dir, "models.zip")
            zip_path(output_dir, models_zip)
            clean_dir(output_dir, filter_=lambda p: p != models_zip)

        shutil.rmtree(tmpdir, ignore_errors=True)
コード例 #23
0
def run(dataset: Dataset, config: TaskConfig):
    """Tune ``max_features`` of a scikit-learn random forest, then fit and predict.

    Candidate ``max_features`` values are scored with 5-fold cross-validation
    on the encoded training data, inside a timeout set to 85% of
    ``config.max_runtime_seconds``. The best-scoring value (or ``'auto'`` when
    no candidate could be scored) is used for the final model, which is fitted
    on the imputed training data and used to predict the test set.

    Framework parameters whose key starts with ``_`` are not passed to the
    estimator; ``_tuning`` replaces the estimator parameters used during
    tuning and ``_n_jobs`` overrides the number of cores.

    Args:
        dataset: benchmark dataset exposing ``train``/``test`` with ``X_enc``
            and ``y_enc``.
        config: task configuration (type, metric, seed, time budget, params).

    Returns:
        dict with ``models_count`` (``len`` of the fitted forest) and
        ``training_duration`` (final fit plus all tuning durations).

    Raises:
        ValueError: if ``config.metric`` has no scikit-learn scorer listed below.
    """
    log.info("\n**** Tuned Random Forest (sklearn) ****\n")

    is_classification = config.type == 'classification'

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    tuning_params = config.framework_params.get('_tuning', training_params)
    # useful to disable multicore, regardless of the dataset config
    n_jobs = config.framework_params.get('_n_jobs', config.cores)

    # Impute any missing data (can test using -t 146606)
    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    log.info(
        "Running RandomForest with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, n_jobs))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    # Mapping of benchmark metrics to scikit-learn scorer names. Regression
    # metrics are included because the regressor is selected above for
    # regression tasks; every listed scorer is "greater is better", which the
    # max() selection below relies on.
    metrics_mapping = dict(acc='accuracy',
                           auc='roc_auc',
                           f1='f1',
                           logloss='neg_log_loss',
                           mae='neg_mean_absolute_error',
                           mse='neg_mean_squared_error',
                           msle='neg_mean_squared_log_error',
                           r2='r2')
    metric = metrics_mapping.get(config.metric)
    if metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    n_features = X_train.shape[1]
    default_value = max(1, int(math.sqrt(n_features)))
    below_default = pick_values_uniform(start=1,
                                        end=default_value,
                                        length=5 + 1)[:-1]  # 5 below
    above_default = pick_values_uniform(start=default_value,
                                        end=n_features,
                                        length=10 + 1 -
                                        len(below_default))[1:]  # 5 above
    # Mix up the order of `max_features` to try, so that a fair range is tried even if we have too little time
    # to try all possible values. Order: [sqrt(p), 1, p, random order for remaining values]
    # max_features_to_try = below_default[1:] + above_default[:-1]
    # max_features_values = ([default_value, 1, n_features]
    #                        + random.sample(max_features_to_try, k=len(max_features_to_try)))
    max_features_values = [default_value] + below_default + above_default
    # Define up to how much of total time we spend 'optimizing' `max_features`.
    # (the remainder is used for fitting the final model).
    safety_factor = 0.85
    max_feature_scores = []
    tuning_durations = []
    with stopit.ThreadingTimeout(seconds=int(config.max_runtime_seconds *
                                             safety_factor)):
        log.info("Evaluating multiple values for `max_features`: %s.",
                 max_features_values)
        for i, max_features_value in enumerate(max_features_values):
            log.info("[{:2d}/{:2d}] Evaluating max_features={}".format(
                i + 1, len(max_features_values), max_features_value))
            imputation = Imputer()
            random_forest = estimator(n_jobs=n_jobs,
                                      random_state=config.seed,
                                      max_features=max_features_value,
                                      **tuning_params)
            pipeline = Pipeline(steps=[('preprocessing', imputation),
                                       ('learning', random_forest)])
            with Timer() as cv_scoring:
                try:
                    scores = cross_val_score(estimator=pipeline,
                                             X=dataset.train.X_enc,
                                             y=dataset.train.y_enc,
                                             scoring=metric,
                                             cv=5)
                    max_feature_scores.append(
                        (statistics.mean(scores), max_features_value))
                except stopit.utils.TimeoutException:
                    log.error(
                        "Failed CV scoring for max_features=%s : Timeout",
                        max_features_value)
                    tuning_durations.append(
                        (max_features_value, cv_scoring.duration))
                    # Re-raise to leave the tuning loop; presumably absorbed by
                    # the enclosing ThreadingTimeout context - TODO confirm
                    raise
                except Exception as e:
                    # Any other failure only skips this candidate value
                    log.error("Failed CV scoring for max_features=%s :\n%s",
                              max_features_value, e)
                    log.debug("Exception:", exc_info=True)
            tuning_durations.append((max_features_value, cv_scoring.duration))

    log.info("Tuning scores:\n%s", sorted(max_feature_scores))
    log.info("Tuning durations:\n%s", sorted(tuning_durations))
    # Fall back on 'auto' when no candidate could be scored
    _, best_max_features_value = (max(max_feature_scores)
                                  if max_feature_scores else (math.nan,
                                                              'auto'))
    log.info("Training final model with `max_features={}`.".format(
        best_max_features_value))
    rf = estimator(n_jobs=n_jobs,
                   random_state=config.seed,
                   max_features=best_max_features_value,
                   **training_params)
    with Timer() as training:
        rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)
    probabilities = rf.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    tuning_duration = sum(duration for _, duration in tuning_durations)
    return dict(models_count=len(rf),
                training_duration=training.duration + tuning_duration)