Example #1
import logging

import sklearn
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# `impute` and `ns` are helpers from the benchmark's own utility modules.
log = logging.getLogger(__name__)


def run(dataset, config):
    log.info("\n**** Random Forest (sklearn %s) ****\n", sklearn.__version__)

    is_classification = config.type == 'classification'

    # Impute any missing data (can test using -t 146606)
    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y, dataset.test.y

    log.info(
        "Running RandomForest with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, config.cores))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}."
        .format(config.metric))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    rfc = estimator(n_jobs=config.cores, **config.framework_params)

    rfc.fit(X_train, y_train)

    predictions = rfc.predict(X_test)
    probabilities = rfc.predict_proba(X_test) if is_classification else None

    return ns(output_file=config.output_predictions_file,
              probabilities=probabilities,
              predictions=predictions,
              truth=y_test,
              target_is_encoded=False)
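
For orientation, here is a self-contained sketch of the same pattern outside the benchmark harness, with SimpleNamespace standing in for the framework's ns/config objects and the iris data standing in for the benchmark dataset (all of these stand-ins are illustrative, not part of the benchmark's API):

from types import SimpleNamespace

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    *load_iris(return_X_y=True), random_state=0)

# Toy stand-in for the benchmark's TaskConfig.
config = SimpleNamespace(type='classification', cores=1, framework_params={})
is_classification = config.type == 'classification'

estimator = RandomForestClassifier if is_classification else RandomForestRegressor
rfc = estimator(n_jobs=config.cores, **config.framework_params)
rfc.fit(X_train, y_train)

# Same result contract as the `ns(...)` returned above.
result = SimpleNamespace(
    predictions=rfc.predict(X_test),
    probabilities=rfc.predict_proba(X_test) if is_classification else None,
    truth=y_test,
    target_is_encoded=False)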
Example #2
import io
import logging
import os
import uuid

# `Dataset`, `TaskConfig`, `ns`, `TmpDir`, `write_csv`, `read_csv`,
# `json_dumps`, `json_loads`, `run_cmd`, `dir_of`, `PYTHON` and
# `save_predictions_to_file` come from the benchmark's own modules.
log = logging.getLogger(__name__)


def run(dataset: Dataset, config: TaskConfig):
    with TmpDir() as tmpdir:
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)
        dataset.release()
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir
        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        output, err = run_cmd('{python} {here}/exec_proc.py'.format(python=PYTHON, here=dir_of(__file__)), _input_str_=params)
        out = io.StringIO(output)
        res = ns()
        # Scan the subprocess output for the unique result token; the JSON
        # result payload sits on the line immediately after it.
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        def load_data(path):
            return read_csv(path, as_data_frame=False, header=False)

        log.debug("Result from subprocess:\n%s", res)
        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 probabilities=load_data(res.probabilities) if res.probabilities is not None else None,
                                 predictions=load_data(res.predictions).squeeze(),
                                 truth=load_data(res.truth).squeeze(),
                                 target_is_encoded=res.target_is_encoded)
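
The interesting part above is the token handshake over stdout: the child may print arbitrary log noise, then the agreed token on its own line, then the JSON result. A minimal, self-contained sketch of that protocol, assuming nothing from the benchmark's modules (the child code, token handling and payload here are illustrative):

import io
import json
import subprocess
import sys
import uuid

token = str(uuid.uuid1())
child_code = (
    "import json\n"
    "print('some log noise from the child')\n"
    f"print({token!r})\n"
    "print(json.dumps({'status': 'ok'}))\n"
)
output = subprocess.run([sys.executable, '-c', child_code],
                        capture_output=True, text=True, check=True).stdout

out = io.StringIO(output)
result = None
for line in out:
    if line.rstrip() == token:
        # The JSON payload is on the line right after the token.
        result = json.loads(out.readline())
        break
print(result)  # -> {'status': 'ok'}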
Example #3
config_user = config_load(
    os.path.join(args.userdir if args.userdir is not None else config.user_dir,
                 "config.yaml"))
# config holding the properties set from the command line
config_args = ns.parse(
    {'results.save': args.keep_scores},
    input_dir=args.indir,
    output_dir=args.outdir,
    user_dir=args.userdir,
    run_mode=args.mode,
    script=os.path.basename(__file__),
    sid=sid,
) + ns.parse(extras)
if args.mode != 'local':
    # assign the merged result back so the override actually takes effect
    config_args += ns.parse({'monitoring.frequency_seconds': 0})
config_args = ns({k: v for k, v in config_args if v is not None})
log.debug("Config args: %s.", config_args)
# merging all configuration files
automl.resources.from_configs(config, config_user, config_args)

try:
    if args.mode == 'local':
        bench = automl.Benchmark(args.framework,
                                 args.benchmark,
                                 parallel_jobs=args.parallel)
    elif args.mode == 'docker':
        bench = automl.DockerBenchmark(args.framework,
                                       args.benchmark,
                                       parallel_jobs=args.parallel)
    elif args.mode == 'aws':
        bench = automl.AWSBenchmark(args.framework,
                                    args.benchmark,
                                    parallel_jobs=args.parallel)
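
For context, the merge at the end mirrors a simple layered-override scheme: built-in defaults, then the user's config.yaml, then command-line settings, with None-valued CLI entries filtered out first so they cannot shadow configured values. A dict-based sketch of that precedence (plain dicts stand in for the framework's namespace type; nothing here is the benchmark's actual API):

defaults = {'run_mode': 'local', 'results.save': True, 'user_dir': '~/automl'}
user_config = {'results.save': False}
cli_args = {'run_mode': 'docker', 'user_dir': None}

# Drop None-valued CLI entries first (mirrors the filter on config_args
# above), then let each later layer override the earlier ones.
cli_args = {k: v for k, v in cli_args.items() if v is not None}
merged = {**defaults, **user_config, **cli_args}
# merged == {'run_mode': 'docker', 'results.save': False,
#            'user_dir': '~/automl'}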
Example #4
    # Tail of run(dataset, config), same result contract as in Example #1;
    # the rest of the function body is omitted in this excerpt.
    return ns(output_file=config.output_predictions_file,
              probabilities=probabilities,
              predictions=predictions,
              truth=y_test,
              target_is_encoded=False)


# Stdlib imports used below; `json_loads`, `read_csv`, `write_csv` and `ns`
# come from the benchmark's utility modules.
import copy
import os
import sys

if __name__ == '__main__':
    params = json_loads(sys.stdin.read(), as_namespace=True)

    def load_data(path):
        return read_csv(path, as_data_frame=False, header=False)

    ds = ns(train=ns(X_enc=load_data(params.dataset.train.X_enc),
                     y=load_data(params.dataset.train.y).squeeze()),
            test=ns(
                X_enc=load_data(params.dataset.test.X_enc),
                y=load_data(params.dataset.test.y).squeeze(),
            ))
    config = params.config
    config.framework_params = ns.dict(config.framework_params)
    result = run(ds, config)

    res = copy.copy(result)
    res.predictions = os.path.join(config.result_dir, 'predictions')
    res.truth = os.path.join(config.result_dir, 'truth')
    write_csv(result.predictions.reshape(-1, 1), res.predictions)
    write_csv(result.truth.reshape(-1, 1), res.truth)
    if result.probabilities is not None:
        res.probabilities = os.path.join(config.result_dir, 'probabilities')
        write_csv(result.probabilities, res.probabilities)
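
The final block illustrates the hand-off convention between the two processes: the child writes its arrays to files under config.result_dir and returns a result whose fields hold paths instead of arrays, which the parent in Example #2 then reloads via load_data. A self-contained sketch of that convention using NumPy (the file names and the SimpleNamespace stand-in are illustrative):

import copy
import os
import tempfile
from types import SimpleNamespace

import numpy as np

result = SimpleNamespace(predictions=np.array([0, 1, 1]),
                         truth=np.array([0, 1, 0]),
                         probabilities=None)

result_dir = tempfile.mkdtemp()
res = copy.copy(result)
# Replace in-memory arrays with file paths, then persist the arrays.
res.predictions = os.path.join(result_dir, 'predictions')
res.truth = os.path.join(result_dir, 'truth')
np.savetxt(res.predictions, result.predictions.reshape(-1, 1), delimiter=',')
np.savetxt(res.truth, result.truth.reshape(-1, 1), delimiter=',')
# The parent can now rebuild the arrays from the paths, e.g.:
parent_view = np.loadtxt(res.predictions, delimiter=',')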