Example #1
def run(dataset: Dataset, config: TaskConfig):
    #TODO: use rpy2 instead? not necessary here though as the call is very simple
    log.info("\n**** Random Forest (R) ****\n")
    save_metadata(config)

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    here = dir_of(__file__)
    meta_results_file = os.path.join(config.output_dir, "meta_results.csv")
    run_cmd(r"""Rscript --vanilla -e "
            source('{script}'); 
            run('{train}', '{test}', '{output}', 
                cores={cores}, meta_results_file='{meta_results}')
            " """.format(script=os.path.join(here, 'exec.R'),
                         train=dataset.train.path,
                         test=dataset.test.path,
                         output=config.output_predictions_file,
                         meta_results=meta_results_file,
                         cores=config.cores),
            _live_output_=True)

    log.info("Predictions saved to %s", config.output_predictions_file)

    meta_results = read_csv(meta_results_file)
    return dict(training_duration=meta_result(meta_results,
                                              'training_duration'),
                predict_duration=meta_result(meta_results, 'predict_duration'))
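The `meta_result` helper used in the return statement is not defined on this page; a plausible sketch, assuming the R script writes a one-row meta_results CSV with one column per recorded duration (an assumption, not the benchmark's verbatim implementation):

import math

def meta_result(results, key):
    # Hypothetical helper: look up `key` in the one-row DataFrame produced by
    # read_csv(meta_results_file); fall back to NaN if the column is missing.
    return results.loc[0, key] if key in results.columns else math.nan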
Example #2
def run(dataset: Dataset, config: TaskConfig):
    #TODO: use rpy2 instead? not necessary here though as the call is very simple
    log.info("\n**** Autoxgboost (R) ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    here = dir_of(__file__)

    with Timer() as training:
        run_cmd(
            r"""Rscript --vanilla -e "source('{script}'); run('{train}', '{test}', target.index = {target_index}, '{output}', {cores}, time.budget = {time_budget})" """
            .format(script=os.path.join(here, 'exec.R'),
                    train=dataset.train.path,
                    test=dataset.test.path,
                    target_index=dataset.target.index + 1,
                    output=config.output_predictions_file,
                    cores=config.cores,
                    time_budget=config.max_runtime_seconds),
            _live_output_=True)

    log.info("Predictions saved to %s", config.output_predictions_file)

    return dict(training_duration=training.duration)
Example #3
def run(dataset: Dataset, config: TaskConfig):
    #TODO: use rpy2 instead? not necessary here though as the call is very simple
    log.info("\n**** Random Forest (R) ****\n")

    here = dir_of(__file__)
    meta_results_file = os.path.join(config.output_dir, "meta_results.csv")
    run_cmd((
        "Rscript --vanilla -e \""
        "source('{script}'); "
        "run('{train}', '{test}', '{output}', cores={cores}, meta_results_file='{meta_results}', task_type='{task_type}')"
        "\"").format(script=os.path.join(here, 'exec.R'),
                     train=dataset.train.path,
                     test=dataset.test.path,
                     output=config.output_predictions_file,
                     meta_results=meta_results_file,
                     task_type=config.type,
                     cores=config.cores),
            _live_output_=True)

    log.info("Predictions saved to %s", config.output_predictions_file)

    meta_results = read_csv(meta_results_file)
    return dict(training_duration=meta_result(meta_results,
                                              'training_duration'),
                predict_duration=meta_result(meta_results, 'predict_duration'))
Example #4
def run(dataset: Dataset, config: TaskConfig):
    #TODO: use rpy2 instead? not necessary here though as the call is very simple
    log.info(f"\n**** Autoxgboost (R) [{config.framework_version}] ****\n")

    is_classification = config.type == 'classification'

    here = dir_of(__file__)

    meta_results_file = os.path.join(config.output_dir, "meta_results.csv")
    run_cmd((
        "Rscript --vanilla -e \""
        "source('{script}'); "
        "run('{train}', '{test}', target.index = {target_index}, '{type}', '{output}', {cores},"
        " time.budget = {time_budget}, meta_results_file='{meta_results}')"
        "\"").format(script=os.path.join(here, 'exec.R'),
                     train=dataset.train.path,
                     test=dataset.test.path,
                     target_index=dataset.target.index + 1,
                     type=config.type,
                     output=config.output_predictions_file,
                     cores=config.cores,
                     time_budget=config.max_runtime_seconds,
                     meta_results=meta_results_file),
            _live_output_=True)

    log.info("Predictions saved to %s", config.output_predictions_file)

    meta_results = read_csv(meta_results_file)
    return dict(training_duration=meta_result(meta_results,
                                              'training_duration'),
                predict_duration=meta_result(meta_results, 'predict_duration'))
Example #5
def docker_commands(*args, setup_cmd=None):
    return """
RUN {here}/setup.sh {args}
{cmd}
EXPOSE 54321
EXPOSE 54322
""".format(
        here=dir_of(__file__, True),
        args=' '.join(as_cmd_args(*args)),
        cmd="RUN {}".format(setup_cmd) if setup_cmd is not None else ""
    )
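An illustrative call and the Dockerfile fragment it produces; the argument value and the extra setup command below are assumptions for demonstration only:

# Hypothetical invocation; 'stable' and the apt-get command are made-up inputs.
print(docker_commands('stable', setup_cmd='apt-get install -y default-jre'))
# Expected shape of the output (the absolute path comes from dir_of(__file__, True)):
#
# RUN <framework dir>/setup.sh stable
# RUN apt-get install -y default-jre
# EXPOSE 54321
# EXPOSE 54322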
Example #6
def run_cmd_in_venv(caller_file, cmd, *args, **kwargs):
    params = ns(python_exec='python')
    for k, v in params:
        kk = '_' + k + '_'
        if kk in kwargs:
            params[k] = kwargs[kk]
            del kwargs[kk]

    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if os.path.isdir(venv_bin_path):
        py = os.path.join(venv_bin_path, 'python -W ignore')
        pip = os.path.join(venv_bin_path, 'python -m pip')
    else:
        py = f"{params.python_exec} -W ignore"
        pip = f"{params.python_exec} -m pip"

    cmd = cmd.format(py=py, pip=pip)
    return run_cmd(cmd, *args, **kwargs)
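A minimal usage sketch: the command template may only reference the `{py}` and `{pip}` placeholders, and the fallback interpreter can be overridden through the `_python_exec_` keyword. The script name and the installed package below are hypothetical:

# Install a hypothetical dependency, then run a hypothetical exec.py; both use the
# local ./venv interpreter when venv/bin exists next to the caller, else `python3`.
run_cmd_in_venv(__file__, "{pip} install --no-cache-dir xgboost",
                _python_exec_='python3', _live_output_=True)
output, err = run_cmd_in_venv(__file__,
                              "{py} " + os.path.join(dir_of(__file__), 'exec.py'),
                              _python_exec_='python3', _live_output_=True)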
Example #7
def run(dataset: Dataset, config: TaskConfig):
    #TODO: use rpy2 instead? not necessary here though as the call is very simple
    log.info("\n**** Random Forest (R) ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    here = dir_of(__file__)
    run_cmd(
        r"""Rscript --vanilla -e "source('{script}'); run('{train}', '{test}', '{output}', {cores})" """
        .format(script=os.path.join(here, 'exec.R'),
                train=dataset.train.path,
                test=dataset.test.path,
                output=config.output_predictions_file,
                cores=config.cores),
        _live_output_=True)

    log.info("Predictions saved to %s", config.output_predictions_file)
Example #8
def docker_commands(*args, **kwargs):
    return """
RUN {here}/setup.sh
""".format(here=dir_of(__file__, True))
Example #9
def run_in_venv(caller_file,
                script_file: str,
                *args,
                input_data: Union[dict, ns],
                dataset: Dataset,
                config: TaskConfig,
                process_results=None,
                python_exec=None):

    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TemporaryDirectory() as tmpdir:

        def make_path(k, v, parents=None):
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_dir = tmpdir
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(
                cmd,
                *args,
                _input_str_=params,
                _live_output_=True,
                _error_level_=logging.DEBUG,
                _env_=dict(PATH=os.pathsep.join(
                    [venv_bin_path, os.environ['PATH']]),
                           PYTHONPATH=os.pathsep.join([
                               rconfig().root_dir,
                           ]),
                           AMLB_PATH=os.path.join(rconfig().root_dir, "amlb")),
            )

        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", res)

        if not res:
            raise NoResultError(f"Process crashed:\n{err}")

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(
                res[name],
                allow_pickle=True) if res[name] is not None else None

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions(
                dataset=dataset,
                output_file=res.output_file,
                predictions=res.predictions.reshape(-1)
                if res.predictions is not None else None,
                truth=res.truth.reshape(-1) if res.truth is not None else None,
                probabilities=res.probabilities,
                probabilities_labels=res.probabilities_labels,
                target_is_encoded=res.target_is_encoded)

        return dict(models_count=res.models_count
                    if res.models_count is not None else 1,
                    training_duration=res.training_duration if
                    res.training_duration is not None else proc_timer.duration,
                    predict_duration=res.predict_duration,
                    **res.others.__dict__)
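A hedged caller sketch, assuming the framework's integration module sits next to a local `venv/` and an `exec.py` that reads the serialized dataset and config from stdin; the data keys and the post-processing hook are illustrative, not prescribed by the helper:

def run(dataset: Dataset, config: TaskConfig):
    # Numpy arrays are written to .npy files by run_in_venv's make_path walker;
    # exec.py is expected to load them from the paths it receives.
    data = dict(
        train=dict(X=dataset.train.X_enc, y=dataset.train.y_enc),
        test=dict(X=dataset.test.X_enc, y=dataset.test.y_enc),
    )

    def process_results(results):
        # Optional hook: e.g. decode predictions back to the original labels.
        return results

    return run_in_venv(__file__, "exec.py",
                       input_data=data, dataset=dataset, config=config,
                       process_results=process_results)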
Example #10
import io
import logging
import os
import uuid

from amlb.benchmark import TaskConfig
from amlb.data import Dataset
from amlb.datautils import write_csv, read_csv
from amlb.results import save_predictions_to_file
from amlb.utils import Namespace as ns, TmpDir, dir_of, run_cmd, json_dumps, json_loads

log = logging.getLogger(__name__)


PYTHON = os.path.join(dir_of(__file__), 'venv/bin/python3 -W ignore')
# PYTHON = 'python3 -W ignore'


def run(dataset: Dataset, config: TaskConfig):
    with TmpDir() as tmpdir:
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        write_csv(dataset.train.X_enc, ds.train.X_enc)
Example #11
import logging
import sys

from amlb.benchmark import TaskConfig
from amlb.data import Dataset
from amlb.datautils import Encoder, impute
from amlb.results import NoResultError, save_predictions_to_file
from amlb.utils import Timer, dir_of

sys.path.append("{}/lib/oboe/automl".format(dir_of(__file__)))
from auto_learner import AutoLearner

log = logging.getLogger(__name__)


def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Oboe ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        # regression currently fails (as of 26.02.2019: still under development by the oboe team)
        raise ValueError('Regression is not yet supported (under development).')

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_cores = config.framework_params.get('_n_cores', config.cores)

    log.info('Running oboe with a maximum time of {}s on {} cores.'.format(config.max_runtime_seconds, n_cores))
    log.warning('We completely ignore the advice to optimize towards metric: {}.'.format(config.metric))
Example #12
def run(dataset: Dataset, config: TaskConfig):
    log.info(f"\n**** AutoWEKA [v{config.framework_version}]****\n")
    save_metadata(config)

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    # Mapping of benchmark metrics to Weka metrics
    metrics_mapping = dict(acc='errorRate',
                           auc='areaUnderROC',
                           logloss='kBInformation')
    metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    train_file = dataset.train.path
    test_file = dataset.test.path
    # Weka requires the target to be the last attribute
    if dataset.target.index != len(dataset.predictors):
        train_file = reorder_dataset(dataset.train.path,
                                     target_src=dataset.target.index)
        test_file = reorder_dataset(dataset.test.path,
                                    target_src=dataset.target.index)

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    parallelRuns = config.framework_params.get('_parallelRuns', config.cores)

    memLimit = config.framework_params.get('_memLimit', 'auto')
    if memLimit == 'auto':
        memLimit = max(
            min(config.max_mem_size_mb,
                math.ceil(config.max_mem_size_mb / parallelRuns)),
            1024)  # AutoWEKA default memLimit
    log.info("Using %sMB memory per run on %s parallel runs.", memLimit,
             parallelRuns)

    f = split_path(config.output_predictions_file)
    f.extension = '.weka_pred.csv'
    weka_file = path_from_split(f)
    cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format(
        here=dir_of(__file__))
    cmd_params = dict(
        t='"{}"'.format(train_file),
        T='"{}"'.format(test_file),
        memLimit=memLimit,
        classifications=
        '"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""'
        .format(weka_file),
        timeLimit=int(config.max_runtime_seconds / 60),
        parallelRuns=parallelRuns,
        metric=metric,
        seed=config.seed % (1 << 16),  # weka accepts only int16 as seeds
        **training_params)
    cmd = cmd_root + ' '.join(
        ["-{} {}".format(k, v) for k, v in cmd_params.items()])
    with Timer() as training:
        run_cmd(cmd, _live_output_=True)

    # If the target values are not sorted alphabetically in the ARFF file, the class probabilities are returned in the original order.
    # Interestingly, other frameworks seem to always sort the target values first.
    # That's why the probabilities labels are passed explicitly here: sorting and formatting are handled by the saving function.
    probabilities_labels = dataset.target.values
    if not os.path.exists(weka_file):
        raise NoResultError("AutoWEKA failed producing any prediction.")
    with open(weka_file, 'r') as weka_file:
        probabilities = []
        predictions = []
        truth = []
        for line in weka_file.readlines()[1:-1]:
            inst, actual, predicted, error, *distribution = line.split(',')
            pred_probabilities = [
                pred_probability.replace('*', '').replace('\n', '')
                for pred_probability in distribution
            ]
            _, pred = predicted.split(':')
            _, tru = actual.split(':')
            probabilities.append(pred_probabilities)
            predictions.append(pred)
            truth.append(tru)

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,
                     probabilities=probabilities,
                     predictions=predictions,
                     truth=truth,
                     probabilities_labels=probabilities_labels)

    return dict(training_duration=training.duration)
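For orientation, a sketch of how `cmd_params` flatten into the final command line; the concrete values are illustrative placeholders, not the benchmark's defaults:

# Hypothetical parameter values, only to show the "-flag value" layout produced above.
cmd_params = dict(t='"train.arff"', T='"test.arff"', memLimit=2048,
                  timeLimit=60, parallelRuns=2, metric='areaUnderROC', seed=1)
cmd = ("java -cp lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier "
       + ' '.join("-{} {}".format(k, v) for k, v in cmd_params.items()))
# -> ... -t "train.arff" -T "test.arff" -memLimit 2048 -timeLimit 60 -parallelRuns 2 -metric areaUnderROC -seed 1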
Example #13
def run_in_venv(caller_file,
                script_file: str,
                *args,
                input_data: Union[dict, ns],
                dataset: Dataset,
                config: TaskConfig,
                process_results=None,
                python_exec=None):

    here = dir_of(caller_file)
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(here, 'venv/bin/python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TmpDir() as tmpdir:

        def make_path(k, v, parents=None):
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(cmd,
                                  *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _env_=dict(PYTHONPATH=os.pathsep.join([
                                      rconfig().root_dir,
                                      os.path.join(rconfig().root_dir, "amlb"),
                                  ])))

        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(
                res[name],
                allow_pickle=True) if res[name] is not None else None

        log.debug("Result from subprocess:\n%s", res)
        if callable(process_results):
            res = process_results(res)

        save_predictions_to_file(
            dataset=dataset,
            output_file=res.output_file,
            predictions=res.predictions.reshape(-1)
            if res.predictions is not None else None,
            truth=res.truth.reshape(-1) if res.truth is not None else None,
            probabilities=res.probabilities,
            target_is_encoded=res.target_is_encoded)

        return dict(models_count=res.models_count
                    if res.models_count is not None else 1,
                    training_duration=res.training_duration if
                    res.training_duration is not None else proc_timer.duration)
Example #14
import logging
import os
import signal
import sys
import tempfile as tmp

from amlb.benchmark import TaskConfig
from amlb.data import Dataset
from amlb.datautils import Encoder, impute
from amlb.results import save_predictions_to_file
from amlb.utils import InterruptTimeout, Timer, dir_of, kill_proc_tree

os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
sys.path.append("{}/lib/hyperopt-sklearn".format(dir_of(__file__)))
from hpsklearn import HyperoptEstimator, any_classifier, any_regressor
from hyperopt import tpe
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, log_loss, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

log = logging.getLogger(__name__)


def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Hyperopt-sklearn ****\n")

    is_classification = config.type == 'classification'

    default = lambda: 0
    metrics_to_loss_mapping = dict(
        acc=(default, False),  # lambda y, pred: 1.0 - accuracy_score(y, pred)
Example #15
def run(dataset: Dataset, config: TaskConfig):
    with TmpDir() as tmpdir:
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)
        dataset.release()
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir
        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        output, err = run_cmd('{python} {here}/exec_proc.py'.format(python=PYTHON, here=dir_of(__file__)), _input_str_=params)
        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        def load_data(path):
            return read_csv(path, as_data_frame=False, header=False)

        log.debug("Result from subprocess:\n%s", res)
        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 probabilities=load_data(res.probabilities) if res.probabilities is not None else None,
                                 predictions=load_data(res.predictions).squeeze(),
                                 truth=load_data(res.truth).squeeze(),
                                 target_is_encoded=res.target_is_encoded)
Example #16
def docker_commands(*args, **kwargs):
    return """
RUN {here}/setup.sh {amlb_dir}
""".format(here=dir_of(__file__, True), amlb_dir=rconfig().root_dir)
Example #17
def setup(*args, **kwargs):
    from sklearn import __version__
    with open(os.path.join(dir_of(__file__), __installed_file__), 'w') as f:
        f.write('\n'.join([__version__, ""]))
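A companion sketch for reading the marker back, e.g. to decide whether setup needs to rerun; `__installed_file__` is not defined on this page, so the '.installed' value below is an assumption:

import os
from amlb.utils import dir_of

__installed_file__ = '.installed'  # assumed name; the real constant lives in the framework module

def installed_version():
    # Return the sklearn version recorded by setup(), or None if setup never ran.
    marker = os.path.join(dir_of(__file__), __installed_file__)
    if not os.path.exists(marker):
        return None
    with open(marker) as f:
        return f.readline().strip() or None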