def run(dataset: Dataset, config: TaskConfig):
    """Run ``exec_proc.py`` in a child process and save the predictions it reports.

    The encoded train/test features and targets are written as CSV files into
    a temporary directory. Their paths, together with ``config``, are sent as
    compact JSON on the stdin of ``exec_proc.py`` (located next to this file).
    The child's output is then scanned for a line equal to
    ``config.result_token``; the line right after it is parsed as the JSON
    result.

    :param dataset: the task dataset; released once its data is on disk.
    :param config: the task configuration; ``result_token`` and ``result_dir``
        are set on it before it is serialized.
    """
    with TmpDir() as tmpdir:

        def in_tmp(file_name):
            return os.path.join(tmpdir, file_name)

        def read_array(path):
            return read_csv(path, as_data_frame=False, header=False)

        ds = ns(
            train=ns(X_enc=in_tmp('train.X_enc'), y=in_tmp('train.y')),
            test=ns(X_enc=in_tmp('test.X_enc'), y=in_tmp('test.y')),
        )

        # Targets are written as a single column.
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)
        dataset.release()

        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        cmd_template = '{python} {here}/exec_proc.py'
        cmd = cmd_template.format(python=PYTHON, here=dir_of(__file__))
        output, err = run_cmd(cmd, _input_str_=params)

        # The result is the line following the token line in the output.
        stdout_lines = io.StringIO(output)
        res = ns()
        for raw_line in stdout_lines:
            if raw_line.rstrip() != config.result_token:
                continue
            res = json_loads(stdout_lines.readline(), as_namespace=True)
            break

        log.debug("Result from subprocess:\n%s", res)

        output_file = res.output_file
        if res.probabilities is not None:
            probabilities = read_array(res.probabilities)
        else:
            probabilities = None
        predictions = read_array(res.predictions).squeeze()
        truth = read_array(res.truth).squeeze()

        save_predictions_to_file(dataset=dataset,
                                 output_file=output_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=truth,
                                 target_is_encoded=res.target_is_encoded)
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                process_results=None, python_exec=None):
    """Run ``script_file`` in a subprocess and collect the result it writes to a file.

    Every numpy array found in ``input_data`` is saved as a ``.npy`` file in a
    temporary directory and replaced by that file's path. The resulting
    structure and ``config`` are sent as compact JSON on the subprocess stdin.
    After the subprocess ends, the result is read as JSON from
    ``config.result_file`` if that file exists.

    :param caller_file: file whose directory contains ``script_file`` and the
        ``venv`` folder.
    :param script_file: script to execute, relative to the caller's directory.
    :param args: extra positional arguments forwarded to ``run_cmd``.
    :param input_data: data handed over to the subprocess.
    :param dataset: the task dataset; released once the input data is on disk.
    :param config: the task configuration; ``result_dir`` and ``result_file``
        are set on it before it is serialized.
    :param process_results: optional callable applied to the loaded result;
        its return value replaces the result.
    :param python_exec: interpreter command; defaults to ``python -W ignore``
        from the ``venv/bin`` folder next to ``caller_file``.
    :return: dict with ``models_count``, ``training_duration``,
        ``predict_duration`` and every entry of ``res.others``.
    :raises NoResultError: if no (or an empty) result is available after the
        run, or if the result carries an ``error_message``.
    """
    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:
        # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TemporaryDirectory() as tmpdir:

        def make_path(k, v, parents=None):
            """Save array values to ``tmpdir`` and return their path instead."""
            if isinstance(v, np.ndarray):
                # `parents` defaults to None, which can't be concatenated with a list.
                file_name = '.'.join((parents or []) + [k, 'npy'])
                path = os.path.join(tmpdir, file_name)
                if vector_keys.match(k):
                    # keys matching `vector_keys` are stored as a single column
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_dir = tmpdir
        # Only a file name is generated here; the file itself may never be
        # created, hence the existence check after the run.
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(
                cmd, *args,
                _input_str_=params,
                _live_output_=True,
                _error_level_=logging.DEBUG,
                _env_=dict(
                    PATH=os.pathsep.join([venv_bin_path, os.environ['PATH']]),
                    PYTHONPATH=os.pathsep.join([rconfig().root_dir]),
                    AMLB_PATH=os.path.join(rconfig().root_dir, "amlb"),
                ),
            )

        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", res)

        if not res:
            raise NoResultError(f"Process crashed:\n{err}")

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        # NOTE(review): allow_pickle=True can unpickle arbitrary objects; this
        # presumes the files were produced by the subprocess launched above.
        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = (np.load(res[name], allow_pickle=True)
                         if res[name] is not None else None)

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions(
                dataset=dataset,
                output_file=res.output_file,
                predictions=(res.predictions.reshape(-1)
                             if res.predictions is not None else None),
                truth=res.truth.reshape(-1) if res.truth is not None else None,
                probabilities=res.probabilities,
                probabilities_labels=res.probabilities_labels,
                target_is_encoded=res.target_is_encoded,
            )

        return dict(
            models_count=res.models_count if res.models_count is not None else 1,
            # fall back to the measured subprocess time when none is reported
            training_duration=(res.training_duration
                               if res.training_duration is not None
                               else proc_timer.duration),
            predict_duration=res.predict_duration,
            **res.others.__dict__
        )
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                process_results=None, python_exec=None):
    """Run ``script_file`` in a subprocess and read its result from the output.

    Every numpy array found in ``input_data`` is saved as a ``.npy`` file in a
    temporary directory and replaced by that file's path. The resulting
    structure and ``config`` are sent as compact JSON on the subprocess stdin.
    The subprocess output is then scanned for a line equal to
    ``config.result_token``; the line right after it is parsed as the JSON
    result.

    :param caller_file: file whose directory contains ``script_file`` and the
        ``venv`` folder.
    :param script_file: script to execute, relative to the caller's directory.
    :param args: extra positional arguments forwarded to ``run_cmd``.
    :param input_data: data handed over to the subprocess.
    :param dataset: the task dataset; released once the input data is on disk.
    :param config: the task configuration; ``result_token`` and ``result_dir``
        are set on it before it is serialized.
    :param process_results: optional callable applied to the loaded result;
        its return value replaces the result.
    :param python_exec: interpreter command; defaults to
        ``venv/bin/python -W ignore`` next to ``caller_file``.
    :return: dict with ``models_count`` and ``training_duration``.
    :raises NoResultError: if the result token is never found in the output,
        or if the result carries an ``error_message``.
    """
    here = dir_of(caller_file)
    if python_exec is None:
        # use local virtual env by default
        python_exec = os.path.join(here, 'venv/bin/python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TmpDir() as tmpdir:

        def make_path(k, v, parents=None):
            """Save array values to ``tmpdir`` and return their path instead."""
            if isinstance(v, np.ndarray):
                # `parents` defaults to None, which can't be concatenated with a list.
                file_name = '.'.join((parents or []) + [k, 'npy'])
                path = os.path.join(tmpdir, file_name)
                if vector_keys.match(k):
                    # keys matching `vector_keys` are stored as a single column
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(
                cmd, *args,
                _input_str_=params,
                _live_output_=True,
                _env_=dict(
                    PYTHONPATH=os.pathsep.join([
                        rconfig().root_dir,
                        os.path.join(rconfig().root_dir, "amlb"),
                    ])
                ),
            )

        # The result is the line following the token line in the output.
        out = io.StringIO(output)
        res = None
        for line in out:
            if line.rstrip() == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        if res is None:
            # The subprocess never announced a result: report it as a crash
            # instead of failing later on a missing attribute.
            raise NoResultError(f"Process crashed:\n{err}")

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        # NOTE(review): allow_pickle=True can unpickle arbitrary objects; this
        # presumes the files were produced by the subprocess launched above.
        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = (np.load(res[name], allow_pickle=True)
                         if res[name] is not None else None)

        log.debug("Result from subprocess:\n%s", res)
        if callable(process_results):
            res = process_results(res)

        save_predictions_to_file(
            dataset=dataset,
            output_file=res.output_file,
            predictions=(res.predictions.reshape(-1)
                         if res.predictions is not None else None),
            truth=res.truth.reshape(-1) if res.truth is not None else None,
            probabilities=res.probabilities,
            target_is_encoded=res.target_is_encoded,
        )

        return dict(
            models_count=res.models_count if res.models_count is not None else 1,
            # fall back to the measured subprocess time when none is reported
            training_duration=(res.training_duration
                               if res.training_duration is not None
                               else proc_timer.duration),
        )
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                options: Union[None, dict, ns] = None,
                process_results=None, python_exec=None):
    """Run ``script_file`` in a subprocess and collect the result it writes to a file.

    The input handed to the subprocess is prepared by ``_make_input_dataset``
    inside a temporary directory. It is sent, together with ``config`` and
    ``options``, as compact JSON on the subprocess stdin. After the subprocess
    ends, the result is read as JSON from ``config.result_file`` if that file
    exists.

    :param caller_file: file whose directory contains ``script_file``.
    :param script_file: script to execute, relative to the caller's directory.
    :param args: extra positional arguments forwarded to ``run_cmd``.
    :param input_data: data handed over to the subprocess.
    :param dataset: the task dataset.
    :param config: the task configuration; ``result_dir`` and ``result_file``
        are set on it before it is serialized.
    :param options: optional settings; its ``serialization`` entry is used to
        (de)serialize data and its ``env`` entry is added, stringified, to the
        subprocess environment.
    :param process_results: optional callable applied to the loaded result;
        its return value replaces the result.
    :param python_exec: interpreter command; defaults to
        ``venv_python_exec`` for the caller's directory.
    :return: dict with ``models_count``, ``training_duration``,
        ``predict_duration`` and every entry of ``res.others``.
    :raises NoResultError: if no (or an empty) result is available after the
        run, or if the result carries an ``error_message``.
    """
    here = dir_of(caller_file)
    if python_exec is None:
        # No interpreter given: use the local virtual env by default.
        python_exec = venv_python_exec(here)
    cmd = f"{python_exec} {os.path.join(here, script_file)}"

    if options:
        options = ns.from_dict(options)
    else:
        options = ns()
    ser_config = options['serialization']
    extra_env = options['env'] or ns()

    with TemporaryDirectory() as tmpdir:
        ds = _make_input_dataset(input_data, dataset, tmpdir,
                                 serialization=ser_config)

        config.result_dir = tmpdir
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config, options=options),
                            style='compact')
        log.debug("Params passed to subprocess:\n%s", params)

        # Monitoring is only enabled when 'sub_proc_memory' statistics are requested.
        monitoring = rconfig().monitoring
        monitor = None
        if 'sub_proc_memory' in monitoring.statistics:
            monitor = dict(interval_seconds=monitoring.interval_seconds,
                           verbosity=monitoring.verbosity)

        base_env = {
            'PATH': os.pathsep.join([venv_bin(here), os.environ['PATH']]),
            'PYTHONPATH': os.pathsep.join([rconfig().root_dir]),
            'AMLB_PATH': os.path.join(rconfig().root_dir, "amlb"),
            'AMLB_LOG_TRACE': str(logging.TRACE if hasattr(logging, 'TRACE') else ''),
        }
        user_env = {key: str(value) for key, value in extra_env}
        # Double unpacking keeps the TypeError raised on clashing variable names.
        proc_env = dict(**base_env, **user_env)

        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _error_level_=logging.DEBUG,
                                  _env_=proc_env,
                                  _monitor_=monitor)

        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)
        else:
            res = ns(lambda: None)

        log.debug("Result from subprocess:\n%s", res)

        if not res:
            raise NoResultError(f"Process crashed:\n{err}")
        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for field in ('predictions', 'truth', 'probabilities'):
            stored = res[field]
            if stored is None:
                res[field] = None
            else:
                res[field] = deserialize_data(stored, config=ser_config)

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            predictions = as_vec(res.predictions)
            # Without a truth vector in the result, use the dataset's test target.
            if res.truth is not None:
                truth = as_vec(res.truth)
            elif res.target_is_encoded:
                truth = dataset.test.y_enc
            else:
                truth = dataset.test.y
            save_predictions(dataset=dataset,
                             output_file=res.output_file,
                             predictions=predictions,
                             truth=truth,
                             probabilities=res.probabilities,
                             probabilities_labels=res.probabilities_labels,
                             target_is_encoded=res.target_is_encoded)

        models_count = 1 if res.models_count is None else res.models_count
        if res.training_duration is None:
            # Nothing reported: use the measured subprocess time instead.
            training_duration = proc_timer.duration
        else:
            training_duration = res.training_duration
        return dict(models_count=models_count,
                    training_duration=training_duration,
                    predict_duration=res.predict_duration,
                    **res.others.__dict__)