Ejemplo n.º 1
0
def run_in_venv(caller_file,
                script_file: str,
                *args,
                input_data: Union[dict, ns],
                dataset: Dataset,
                config: TaskConfig,
                process_results=None,
                python_exec=None):
    """Run ``script_file`` through ``run_cmd`` and collect the results it writes.

    The script path and the default interpreter are both resolved relative to
    the directory of ``caller_file``. Numpy arrays found in ``input_data`` are
    saved as ``.npy`` files in a temporary directory and replaced by their
    file path; the resulting structure and ``config`` are JSON-serialized and
    handed to ``run_cmd`` as ``_input_str_``. The script is expected to write
    a JSON result file at ``config.result_file``.

    Args:
        caller_file: file whose directory (as returned by ``dir_of``) contains
            both the ``venv`` folder and ``script_file``.
        script_file: script path, relative to the caller's directory.
        *args: extra positional arguments forwarded to ``run_cmd`` after the
            command string.
        input_data: dict or namespace describing the data; converted with
            ``ns.from_dict`` before being walked.
        dataset: released (``dataset.release()``) once the input has been
            serialized, and later passed to ``save_predictions``.
        config: task config. NOTE: it is mutated — ``result_dir`` and
            ``result_file`` are pointed at locations inside the temporary
            directory before it is serialized.
        process_results: optional callable applied to the loaded result
            namespace before predictions are saved; its return value replaces
            the result.
        python_exec: interpreter command to use. Defaults to
            ``<caller dir>/venv/bin/python -W ignore``.

    Returns:
        dict with ``models_count`` (1 when the result has none),
        ``training_duration`` (``proc_timer.duration`` when the result has
        none), ``predict_duration`` and every entry of ``res.others``.

    Raises:
        NoResultError: if no (or an empty) result was loaded, or if the result
            carries a non-None ``error_message``.
    """
    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TemporaryDirectory() as tmpdir:

        def make_path(k, v, parents=None):
            """Dump ndarray values to ``tmpdir`` and substitute their path."""
            if isinstance(v, np.ndarray):
                # `parents` defaults to None: normalise it to a list so that
                # the concatenation below cannot raise a TypeError.
                prefix = list(parents) if parents else []
                path = os.path.join(tmpdir, '.'.join(prefix + [k, 'npy']))
                if vector_keys.match(k):
                    # keys matching `vector_keys` are stored as a single column
                    v = v.reshape(-1, 1)
                # NOTE(review): allow_pickle=True is used on both the save and
                # the load side; acceptable only as long as the files are
                # exchanged with a trusted subprocess.
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        # the arrays are now on disk: release the dataset before running
        # the subprocess.
        dataset.release()

        config.result_dir = tmpdir
        # mktemp only reserves a name; the file itself is expected to be
        # created by the subprocess, inside our private temporary directory.
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            _, err = run_cmd(
                cmd,
                *args,
                _input_str_=params,
                _live_output_=True,
                _error_level_=logging.DEBUG,
                _env_=dict(PATH=os.pathsep.join(
                    [venv_bin_path, os.environ['PATH']]),
                           PYTHONPATH=os.pathsep.join([
                               rconfig().root_dir,
                           ]),
                           AMLB_PATH=os.path.join(rconfig().root_dir, "amlb")),
            )

        # empty placeholder, used when the subprocess did not write any result
        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", res)

        if not res:
            raise NoResultError(f"Process crashed:\n{err}")

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        # these entries hold file paths (or None): load them back as arrays
        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(
                res[name],
                allow_pickle=True) if res[name] is not None else None

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions(
                dataset=dataset,
                output_file=res.output_file,
                predictions=res.predictions.reshape(-1)
                if res.predictions is not None else None,
                truth=res.truth.reshape(-1) if res.truth is not None else None,
                probabilities=res.probabilities,
                probabilities_labels=res.probabilities_labels,
                target_is_encoded=res.target_is_encoded)

        return dict(models_count=res.models_count
                    if res.models_count is not None else 1,
                    training_duration=res.training_duration if
                    res.training_duration is not None else proc_timer.duration,
                    predict_duration=res.predict_duration,
                    **res.others.__dict__)
Ejemplo n.º 2
0
def run_in_venv(caller_file,
                script_file: str,
                *args,
                input_data: Union[dict, ns],
                dataset: Dataset,
                config: TaskConfig,
                options: Union[None, dict, ns] = None,
                process_results=None,
                python_exec=None):
    """Execute ``script_file`` via ``run_cmd`` and return its result summary.

    The script is looked up next to ``caller_file``. Input data is prepared by
    ``_make_input_dataset`` inside a temporary directory, then serialized to
    JSON together with ``config`` and ``options`` and handed to ``run_cmd`` as
    ``_input_str_``. The result is read back from ``config.result_file``.

    Args:
        caller_file: file whose directory (``dir_of``) holds ``script_file``
            and is used to locate the virtual env helpers.
        script_file: script path relative to that directory.
        *args: additional positional arguments forwarded to ``run_cmd``.
        input_data: data description passed to ``_make_input_dataset``.
        dataset: passed to ``_make_input_dataset`` and ``save_predictions``;
            also provides the fallback truth vector (``dataset.test.y_enc`` or
            ``dataset.test.y``) when the result has no ``truth``.
        config: task config; its ``result_dir`` and ``result_file`` attributes
            are overwritten with locations in the temporary directory.
        options: optional dict/namespace. Its ``serialization`` entry is
            forwarded to the (de)serialization helpers, and the entries of its
            ``env`` entry are stringified and added to the subprocess
            environment.
        process_results: optional callable applied to the result namespace
            before predictions are saved.
        python_exec: interpreter command; defaults to
            ``venv_python_exec(<caller dir>)``.

    Returns:
        dict with ``models_count`` (1 if missing), ``training_duration``
        (the timer's duration if missing), ``predict_duration`` and all
        entries of ``res.others``.

    Raises:
        NoResultError: when no (or an empty) result is available, or when the
            result reports a non-None ``error_message``.
    """
    base_dir = dir_of(caller_file)
    if python_exec is None:
        # fall back to the interpreter of the local virtual env
        python_exec = venv_python_exec(base_dir)
    script_path = os.path.join(base_dir, script_file)
    command = f"{python_exec} {script_path}"

    if options:
        options = ns.from_dict(options)
    else:
        options = ns()
    ser_config = options['serialization']
    extra_env = options['env'] or ns()

    with TemporaryDirectory() as workdir:

        ds = _make_input_dataset(input_data,
                                 dataset,
                                 workdir,
                                 serialization=ser_config)

        config.result_dir = workdir
        config.result_file = mktemp(dir=workdir)

        payload = {'dataset': ds, 'config': config, 'options': options}
        params = json_dumps(payload, style='compact')
        log.debug("Params passed to subprocess:\n%s", params)

        # monitoring settings are forwarded only if sub-process memory
        # statistics are requested
        mon_config = rconfig().monitoring
        monitor = None
        if 'sub_proc_memory' in mon_config.statistics:
            monitor = dict(interval_seconds=mon_config.interval_seconds,
                           verbosity=mon_config.verbosity)

        # environment pieces, computed in the same order as they are used
        path_value = os.pathsep.join([venv_bin(base_dir), os.environ['PATH']])
        pythonpath_value = os.pathsep.join([rconfig().root_dir])
        amlb_path = os.path.join(rconfig().root_dir, "amlb")
        log_trace = str(getattr(logging, 'TRACE', ''))
        user_vars = {}
        for key, value in extra_env:
            user_vars[key] = str(value)
        # kept as a keyword-expansion call on purpose: a user variable that
        # clashes with one of the fixed names fails exactly as it used to.
        env = dict(PATH=path_value,
                   PYTHONPATH=pythonpath_value,
                   AMLB_PATH=amlb_path,
                   AMLB_LOG_TRACE=log_trace,
                   **user_vars)

        with Timer() as subprocess_timer:
            _, err = run_cmd(command,
                             *args,
                             _input_str_=params,
                             _live_output_=True,
                             _error_level_=logging.DEBUG,
                             _env_=env,
                             _monitor_=monitor)

        # start from an empty placeholder, replaced if a result file exists
        result = ns(lambda: None)
        if os.path.exists(config.result_file):
            result = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", result)

        if not result:
            raise NoResultError(f"Process crashed:\n{err}")

        if result.error_message is not None:
            raise NoResultError(result.error_message)

        for field in ('predictions', 'truth', 'probabilities'):
            if result[field] is None:
                # explicit assignment so that the key is always set
                result[field] = None
            else:
                result[field] = deserialize_data(result[field],
                                                 config=ser_config)

        if callable(process_results):
            result = process_results(result)

        if result.output_file:
            predictions_vec = as_vec(result.predictions)
            if result.truth is not None:
                truth_vec = as_vec(result.truth)
            elif result.target_is_encoded:
                truth_vec = dataset.test.y_enc
            else:
                truth_vec = dataset.test.y
            save_predictions(
                dataset=dataset,
                output_file=result.output_file,
                predictions=predictions_vec,
                truth=truth_vec,
                probabilities=result.probabilities,
                probabilities_labels=result.probabilities_labels,
                target_is_encoded=result.target_is_encoded)

        models_count = result.models_count
        if models_count is None:
            models_count = 1

        training_duration = result.training_duration
        if training_duration is None:
            # no duration reported: use the duration measured by the timer
            training_duration = subprocess_timer.duration

        return dict(models_count=models_count,
                    training_duration=training_duration,
                    predict_duration=result.predict_duration,
                    **result.others.__dict__)