Exemple #1
0
def call_run(run_fn):
    """Run ``run_fn`` on the inputs received on stdin and dump the outcome to ``config.result_file``.

    The whole of stdin is read, decoded with ``utils.json_loads`` and wrapped in a
    ``utils.Namespace``; the payload must expose ``dataset`` and ``config``.
    Every string value in ``dataset`` whose key matches ``data_keys`` is replaced
    by the array loaded from that path with ``np.load``.

    ``run_fn(ds, config)`` is then invoked. For each of ``predictions``, ``truth``
    and ``probabilities`` that is not ``None`` in its result, the array is saved as
    ``<config.result_dir>/<name>.npy`` and the dumped result holds that path
    instead of the array. Any exception (``BaseException`` included) is logged and
    turned into a result made of ``error_message`` and ``models_count=0``.

    :param run_fn: callable taking ``(ds, config)``; its return value must be
        accepted by ``dict(...)`` and be indexable by the three names above.
    """
    import numpy as np

    params = utils.Namespace.from_dict(utils.json_loads(sys.stdin.read()))

    def load_data(name, path, **ignored):
        # Only string values stored under a key recognised by ``data_keys`` are array files.
        if isinstance(path, str) and data_keys.match(name):
            # NOTE(review): allow_pickle=True can execute code embedded in the file;
            # acceptable only as long as these paths are produced by a trusted parent.
            return name, np.load(path, allow_pickle=True)
        return name, path

    # Report through the logger rather than a bare print so that stdout is not
    # polluted by debug output.
    log.info("Dataset passed to subprocess:\n%s", params.dataset)
    ds = utils.Namespace.walk(params.dataset, load_data)

    config = params.config
    config.framework_params = utils.Namespace.dict(config.framework_params)

    try:
        result = run_fn(ds, config)
        res = dict(result)
        for name in ['predictions', 'truth', 'probabilities']:
            arr = result[name]
            if arr is not None:
                # Arrays are handed back as files: the result only carries their path.
                res[name] = os.path.join(config.result_dir, '.'.join([name, 'npy']))
                np.save(res[name], arr, allow_pickle=True)
    except BaseException as e:
        # Deliberately broad: whatever happens, a result file must still be written.
        log.exception(e)
        res = dict(
            error_message=str(e),
            models_count=0
        )

    utils.json_dump(res, config.result_file, style='compact')
Exemple #2
0
def call_run(run_fn):
    """Execute ``run_fn`` under a timeout guard and dump its outcome to ``config.result_file``.

    Input comes from stdin: it is decoded with ``utils.json_loads``, wrapped in a
    ``utils.Namespace`` and must provide ``dataset`` and ``config``. String values
    of ``dataset`` stored under a key matching ``data_keys`` are swapped for the
    array loaded from that path.

    ``run_fn(ds, config)`` runs inside ``utils.InterruptTimeout`` configured with
    ``config.job_timeout_seconds``. Non-``None`` ``predictions``, ``truth`` and
    ``probabilities`` arrays of the result are saved as ``<name>.npy`` files in
    ``config.result_dir`` and replaced by their path in the dumped result. Any
    exception is logged and replaced by an ``error_message``/``models_count=0``
    result. ``utils.kill_proc_tree`` is always called before the result is dumped.

    :param run_fn: callable taking ``(ds, config)``; its return value must be
        accepted by ``dict(...)`` and be indexable by the three names above.
    """
    import numpy as np

    params = utils.Namespace.from_dict(utils.json_loads(sys.stdin.read()))

    def _materialize(key, value, **_unused):
        # Leave everything untouched except string paths stored under a data key.
        points_to_array = isinstance(value, str) and data_keys.match(key)
        if not points_to_array:
            return key, value
        return key, np.load(value, allow_pickle=True)

    log.info("Params passed to subprocess:\n%s", params)
    ds = utils.Namespace.walk(params.dataset, _materialize)

    config = params.config
    config.framework_params = utils.Namespace.dict(config.framework_params)

    try:
        timeout_secs = config.job_timeout_seconds
        # Presumably tried in order by InterruptTimeout, one every
        # ``wait_retry_secs`` seconds -- confirm against its implementation.
        interruption_steps = [
            {'sig': TimeoutError},
            {'sig': signal.SIGTERM},
            {'sig': signal.SIGQUIT},
            {'sig': signal.SIGKILL},
            {'interrupt': 'process', 'sig': signal.SIGKILL},
        ]
        with utils.InterruptTimeout(timeout_secs,
                                    interruptions=interruption_steps,
                                    wait_retry_secs=10):
            result = run_fn(ds, config)
            res = dict(result)
            for name in ('predictions', 'truth', 'probabilities'):
                arr = result[name]
                if arr is None:
                    continue
                # The dumped result references the saved file, not the array itself.
                target = os.path.join(config.result_dir, '.'.join((name, 'npy')))
                res[name] = target
                np.save(target, arr, allow_pickle=True)
    except BaseException as e:
        log.exception(e)
        res = {'error_message': str(e), 'models_count': 0}
    finally:
        # ensure there's no subprocess left
        utils.kill_proc_tree(include_parent=False, timeout=5)

    utils.json_dump(res, config.result_file, style='compact')
Exemple #3
0
def run(dataset: Dataset, config: TaskConfig):
    """Run ``exec_proc.py`` in a subprocess on ``dataset`` and save the predictions it returns.

    The encoded train/test features and targets are written as CSV files into a
    temporary directory. The paths of those files, together with ``config``, are
    serialized with ``json_dumps`` and passed to the subprocess as its input
    string. The subprocess output is scanned for a line equal to
    ``config.result_token``; the line right after it is parsed as the result
    and forwarded to ``save_predictions_to_file``.

    :param dataset: provides ``train``/``test`` with ``X_enc`` and ``y``;
        ``dataset.release()`` is called once the CSV files are written.
    :param config: task configuration; ``result_token`` and ``result_dir`` are
        set on it before it is serialized.
    :raises RuntimeError: if the result token never shows up in the subprocess output.
    """
    with TmpDir() as tmpdir:
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        # Targets are reshaped to a single column before being written.
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)
        dataset.release()
        # Unique marker used to locate the result line in the subprocess output.
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir
        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        output, err = run_cmd('{python} {here}/exec_proc.py'.format(python=PYTHON, here=dir_of(__file__)), _input_str_=params)
        out = io.StringIO(output)
        for line in out:
            if line.rstrip() == config.result_token:
                # The serialized result is the line immediately following the token.
                res = json_loads(out.readline(), as_namespace=True)
                break
        else:
            # Without this guard the function would go on with an empty result
            # and fail further down with an unrelated, hard-to-read error.
            raise RuntimeError(
                "No result found in subprocess output (result token missing); "
                "subprocess error output:\n{}".format(err)
            )

        def load_data(path):
            return read_csv(path, as_data_frame=False, header=False)

        log.debug("Result from subprocess:\n%s", res)
        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 probabilities=load_data(res.probabilities) if res.probabilities is not None else None,
                                 predictions=load_data(res.predictions).squeeze(),
                                 truth=load_data(res.truth).squeeze(),
                                 target_is_encoded=res.target_is_encoded)
Exemple #4
0
def run_in_venv(caller_file,
                script_file: str,
                *args,
                input_data: Union[dict, ns],
                dataset: Dataset,
                config: TaskConfig,
                process_results=None,
                python_exec=None):
    """Run ``script_file`` in a subprocess and turn its output into a result dict.

    Every ``np.ndarray`` found in ``input_data`` is saved as a ``.npy`` file in a
    temporary directory and replaced by the path of that file; the resulting
    structure is serialized together with ``config`` and given to the subprocess
    as its input string. The subprocess output is scanned for a line equal to
    ``config.result_token``; the line right after it is parsed as the result.

    :param caller_file: file whose directory contains ``script_file`` and the
        ``venv`` folder.
    :param script_file: script path, relative to the directory of ``caller_file``.
    :param args: extra positional arguments forwarded to ``run_cmd``.
    :param input_data: data handed over to the subprocess.
    :param dataset: released once the arrays are written; also forwarded to
        ``save_predictions_to_file``.
    :param config: task configuration; ``result_token`` and ``result_dir`` are
        set on it before it is serialized.
    :param process_results: optional callable applied to the loaded result,
        whose return value replaces it.
    :param python_exec: interpreter command; defaults to
        ``<dir of caller_file>/venv/bin/python -W ignore``.
    :return: dict with ``models_count`` (1 when not reported),
        ``training_duration`` (duration of the subprocess call when not
        reported) and all the entries of ``res.others``.
    :raises NoResultError: if the result carries an ``error_message``.
    """
    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TmpDir() as tmpdir:

        def make_path(k, v, parents=None):
            if isinstance(v, np.ndarray):
                # ``parents`` may be left at its None default: treat it as an
                # empty prefix instead of failing on ``None + [...]``.
                name_parts = [*(parents or ()), k, 'npy']
                path = os.path.join(tmpdir, '.'.join(name_parts))
                if vector_keys.match(k):
                    # Values under vector keys are stored as a single column.
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        # Unique marker used to locate the result line in the subprocess output.
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(
                cmd,
                *args,
                _input_str_=params,
                _live_output_=True,
                _error_level_=logging.DEBUG,
                _env_=dict(PATH=os.pathsep.join(
                    [venv_bin_path, os.environ['PATH']]),
                           PYTHONPATH=os.pathsep.join([
                               rconfig().root_dir,
                           ]),
                           AMLB_PATH=os.path.join(rconfig().root_dir, "amlb")),
            )

        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                # The serialized result is the line immediately following the token.
                res = json_loads(out.readline(), as_namespace=True)
                break

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        # The subprocess returns arrays as file paths: load them back.
        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(
                res[name],
                allow_pickle=True) if res[name] is not None else None

        log.debug("Result from subprocess:\n%s", res)
        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions_to_file(
                dataset=dataset,
                output_file=res.output_file,
                predictions=res.predictions.reshape(-1)
                if res.predictions is not None else None,
                truth=res.truth.reshape(-1) if res.truth is not None else None,
                probabilities=res.probabilities,
                probabilities_labels=res.probabilities_labels,
                target_is_encoded=res.target_is_encoded)

        return dict(models_count=res.models_count
                    if res.models_count is not None else 1,
                    training_duration=res.training_duration if
                    res.training_duration is not None else proc_timer.duration,
                    **res.others.__dict__)
    rfc = estimator(n_jobs=config.cores, **config.framework_params)

    rfc.fit(X_train, y_train)

    predictions = rfc.predict(X_test)
    probabilities = rfc.predict_proba(X_test) if is_classification else None

    return ns(output_file=config.output_predictions_file,
              probabilities=probabilities,
              predictions=predictions,
              truth=y_test,
              target_is_encoded=False)


if __name__ == '__main__':
    params = json_loads(sys.stdin.read(), as_namespace=True)

    def load_data(path):
        return read_csv(path, as_data_frame=False, header=False)

    ds = ns(train=ns(X_enc=load_data(params.dataset.train.X_enc),
                     y=load_data(params.dataset.train.y).squeeze()),
            test=ns(
                X_enc=load_data(params.dataset.test.X_enc),
                y=load_data(params.dataset.test.y).squeeze(),
            ))
    config = params.config
    config.framework_params = ns.dict(config.framework_params)
    result = run(ds, config)

    res = copy.copy(result)