Beispiel #1
0
def run(dataset: Dataset, config: TaskConfig):
    """Run the R random-forest script through ``Rscript`` for a classification task.

    The ``exec.R`` file located in ``dir_of(__file__)`` is sourced and its
    ``run`` function is invoked with the train path, test path, output
    predictions path and the number of cores taken from ``config``.

    Args:
        dataset: provides ``train.path`` and ``test.path``.
        config: provides ``type``, ``cores`` and ``output_predictions_file``.

    Raises:
        ValueError: if ``config.type`` is not ``'classification'``.
    """
    # TODO: use rpy2 instead? Not necessary here though, as the call is very simple.
    log.info("\n**** Random Forest (R) ****\n")

    if config.type != 'classification':
        raise ValueError('Regression is not supported.')

    r_script = os.path.join(dir_of(__file__), 'exec.R')
    cmd_template = r"""Rscript --vanilla -e "source('{script}'); run('{train}', '{test}', '{output}', {cores})" """
    command = cmd_template.format(
        script=r_script,
        train=dataset.train.path,
        test=dataset.test.path,
        output=config.output_predictions_file,
        cores=config.cores,
    )
    run_cmd(command)

    log.info("Predictions saved to %s", config.output_predictions_file)
Beispiel #2
0
def run(dataset: Dataset, config: TaskConfig):
    """Run ``exec_proc.py`` in a separate Python process and save its predictions.

    The encoded train/test features and targets are written as CSV files into
    a temporary directory. The file paths and ``config`` are serialized to
    JSON and handed to ``run_cmd`` as its input string. The captured output is
    then scanned for a line equal to ``config.result_token``; the line right
    after it is parsed as the JSON result, which points to the prediction
    files that are finally passed to ``save_predictions_to_file``.

    Side effects: ``dataset.release()`` is called once the CSV files are
    written, and ``config.result_token`` / ``config.result_dir`` are set.

    Args:
        dataset: provides ``train``/``test`` splits with ``X_enc`` and ``y``.
        config: task configuration forwarded to the subprocess.
    """
    with TmpDir() as tmpdir:
        # Paths (not data) that the subprocess will read from.
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)
        dataset.release()

        # Unique marker used to locate the result line in the subprocess output.
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir
        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        output, _ = run_cmd('{python} {here}/exec_proc.py'.format(python=PYTHON, here=dir_of(__file__)), _input_str_=params)

        out = io.StringIO(output)
        # NOTE(review): if the token never shows up, `res` stays an empty
        # namespace and the attribute accesses below depend on how `ns`
        # handles missing attributes — confirm the intended failure mode.
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                # The JSON result is on the line immediately after the token.
                res = json_loads(out.readline(), as_namespace=True)
                break

        def load_data(path):
            # Result files are read back as header-less, non-DataFrame data.
            return read_csv(path, as_data_frame=False, header=False)

        log.debug("Result from subprocess:\n%s", res)
        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 probabilities=load_data(res.probabilities) if res.probabilities is not None else None,
                                 predictions=load_data(res.predictions).squeeze(),
                                 truth=load_data(res.truth).squeeze(),
                                 target_is_encoded=res.target_is_encoded)
Beispiel #3
0
def _parse_weka_predictions(lines):
    """Parse the data rows of the CSV prediction file written by Weka.

    Each row is split on ``,`` into ``inst, actual, predicted, error`` followed
    by the class distribution. ``actual`` and ``predicted`` are split on their
    first ``:`` and only the part after it is kept (presumably the row format
    is ``index:label`` — confirm against Weka's CSV prediction output).

    Args:
        lines: iterable of raw text rows (header and trailer already removed).

    Returns:
        A ``(probabilities, predictions, truth)`` tuple of lists; all values
        are kept as strings, with ``*`` markers and newlines stripped from the
        probabilities.

    Raises:
        ValueError: if a row has fewer than four fields or a label field
            contains no ``:``.
    """
    probabilities = []
    predictions = []
    truth = []
    for line in lines:
        _inst, actual, predicted, _error, *distribution = line.split(',')
        # Split on the first colon only, so labels containing ':' stay intact.
        _, pred = predicted.split(':', 1)
        _, tru = actual.split(':', 1)
        probabilities.append([
            pred_probability.replace('*', '').replace('\n', '')
            for pred_probability in distribution
        ])
        predictions.append(pred)
        truth.append(tru)
    return probabilities, predictions, truth


def run(dataset: Dataset, config: TaskConfig):
    """Train AutoWEKA on a classification task and save its predictions.

    Builds a ``java ... AutoWEKAClassifier`` command line from ``config``,
    runs it, then parses the CSV prediction file it produced and forwards the
    result to ``save_predictions_to_file``.

    Framework parameters whose name starts with ``_`` are not passed to
    AutoWEKA; of those, ``_parallelRuns`` and ``_memLimit`` are read here.

    Args:
        dataset: provides train/test paths, the target and the predictors.
        config: task configuration (type, metric, resources, seed, params).

    Returns:
        dict with ``training_duration``, the time spent in the AutoWEKA call.

    Raises:
        ValueError: for non-classification tasks or unsupported metrics.
        NoResultError: if AutoWEKA did not produce a prediction file.
    """
    log.info("\n**** AutoWEKA ****\n")

    if config.type != 'classification':
        raise ValueError('Regression is not supported.')

    # Mapping of benchmark metrics to Weka metrics
    metrics_mapping = dict(acc='errorRate',
                           auc='areaUnderROC',
                           logloss='kBInformation')
    metric = metrics_mapping.get(config.metric)
    if metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    train_file = dataset.train.path
    test_file = dataset.test.path
    # Weka requires the target as the last attribute
    if dataset.target.index != len(dataset.predictors):
        train_file = reorder_dataset(dataset.train.path,
                                     target_src=dataset.target.index)
        test_file = reorder_dataset(dataset.test.path,
                                    target_src=dataset.target.index)

    # Params prefixed with '_' are reserved for this wrapper, not for AutoWEKA.
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    parallelRuns = config.framework_params.get('_parallelRuns', config.cores)

    memLimit = config.framework_params.get('_memLimit', 'auto')
    if memLimit == 'auto':
        # Share the available memory between the parallel runs, but never go
        # below 1024MB per run.
        memLimit = max(
            min(config.max_mem_size_mb,
                math.ceil(config.max_mem_size_mb / parallelRuns)),
            1024)  # AutoWEKA default memLimit
    log.info("Using %sMB memory per run on %s parallel runs.", memLimit,
             parallelRuns)

    # Weka writes its raw predictions next to the final predictions file.
    f = split_path(config.output_predictions_file)
    f.extension = '.weka_pred.csv'
    weka_file = path_from_split(f)
    cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format(
        here=dir_of(__file__))
    cmd_params = dict(
        t='"{}"'.format(train_file),
        T='"{}"'.format(test_file),
        memLimit=memLimit,
        classifications=
        '"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""'
        .format(weka_file),
        timeLimit=int(config.max_runtime_seconds / 60),
        parallelRuns=parallelRuns,
        metric=metric,
        seed=config.seed % (1 << 16),  # weka accepts only int16 as seeds
        **training_params)
    cmd = cmd_root + ' '.join(
        ["-{} {}".format(k, v) for k, v in cmd_params.items()])
    with Timer() as training:
        run_cmd(cmd)

    # if target values are not sorted alphabetically in the ARFF file, then class probabilities are returned in the original order
    # interestingly, other frameworks seem to always sort the target values first
    # that's why we need to specify the probabilities labels here: sorting+formatting is done in saving function
    probabilities_labels = dataset.target.values
    if not os.path.exists(weka_file):
        raise NoResultError("AutoWEKA failed producing any prediction.")
    with open(weka_file, 'r') as pred_file:
        # Skip the first line (header) and the last line of the file.
        rows = pred_file.readlines()[1:-1]
    probabilities, predictions, truth = _parse_weka_predictions(rows)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=truth,
                             probabilities_labels=probabilities_labels)

    return dict(training_duration=training.duration)