def run(dataset: Dataset, config: TaskConfig): #TODO: use rpy2 instead? not necessary here though as the call is very simple log.info(f"\n**** Autoxgboost (R) [{config.framework_version}] ****\n") is_classification = config.type == 'classification' here = dir_of(__file__) meta_results_file = os.path.join(config.output_dir, "meta_results.csv") run_cmd(( "Rscript --vanilla -e \"" "source('{script}'); " "run('{train}', '{test}', target.index = {target_index}, '{type}', '{output}', {cores}," " time.budget = {time_budget}, meta_results_file='{meta_results}')" "\"").format(script=os.path.join(here, 'exec.R'), train=dataset.train.path, test=dataset.test.path, target_index=dataset.target.index + 1, type=config.type, output=config.output_predictions_file, cores=config.cores, time_budget=config.max_runtime_seconds, meta_results=meta_results_file), _live_output_=True) log.info("Predictions saved to %s", config.output_predictions_file) meta_results = read_csv(meta_results_file) return dict(training_duration=meta_result(meta_results, 'training_duration'), predict_duration=meta_result(meta_results, 'predict_duration'))
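# For reference, with illustrative values (all paths hypothetical) the template above
# renders to a single shell command of this shape:
#   Rscript --vanilla -e "source('/bench/frameworks/autoxgboost/exec.R'); run('/tmp/train.csv', '/tmp/test.csv', target.index = 21, 'classification', '/out/predictions.csv', 4, time.budget = 3600, meta_results_file='/out/meta_results.csv')"
# Note that target.index is 1-based on the R side, hence the `dataset.target.index + 1`.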
def run(dataset: Dataset, config: TaskConfig): #TODO: use rpy2 instead? not necessary here though as the call is very simple log.info("\n**** Random Forest (R) ****\n") save_metadata(config) is_classification = config.type == 'classification' if not is_classification: raise ValueError('Regression is not supported.') here = dir_of(__file__) meta_results_file = os.path.join(config.output_dir, "meta_results.csv") run_cmd(r"""Rscript --vanilla -e " source('{script}'); run('{train}', '{test}', '{output}', cores={cores}, meta_results_file='{meta_results}') " """.format(script=os.path.join(here, 'exec.R'), train=dataset.train.path, test=dataset.test.path, output=config.output_predictions_file, meta_results=meta_results_file, cores=config.cores), _live_output_=True) log.info("Predictions saved to %s", config.output_predictions_file) meta_results = read_csv(meta_results_file) return dict(training_duration=meta_result(meta_results, 'training_duration'), predict_duration=meta_result(meta_results, 'predict_duration'))
def run(dataset: Dataset, config: TaskConfig): #TODO: use rpy2 instead? not necessary here though as the call is very simple log.info("\n**** Random Forest (R) ****\n") here = dir_of(__file__) meta_results_file = os.path.join(config.output_dir, "meta_results.csv") run_cmd(( "Rscript --vanilla -e \"" "source('{script}'); " "run('{train}', '{test}', '{output}', cores={cores}, meta_results_file='{meta_results}', task_type='{task_type}')" "\"").format(script=os.path.join(here, 'exec.R'), train=dataset.train.path, test=dataset.test.path, output=config.output_predictions_file, meta_results=meta_results_file, task_type=config.type, cores=config.cores), _live_output_=True) log.info("Predictions saved to %s", config.output_predictions_file) meta_results = read_csv(meta_results_file) return dict(training_duration=meta_result(meta_results, 'training_duration'), predict_duration=meta_result(meta_results, 'predict_duration'))
def run(dataset: Dataset, config: TaskConfig): #TODO: use rpy2 instead? not necessary here though as the call is very simple log.info("\n**** Autoxgboost (R) ****\n") is_classification = config.type == 'classification' if not is_classification: raise ValueError('Regression is not supported.') here = dir_of(__file__) with Timer() as training: run_cmd( r"""Rscript --vanilla -e "source('{script}'); run('{train}', '{test}', target.index = {target_index}, '{output}', {cores}, time.budget = {time_budget})" """ .format(script=os.path.join(here, 'exec.R'), train=dataset.train.path, test=dataset.test.path, target_index=dataset.target.index + 1, output=config.output_predictions_file, cores=config.cores, time_budget=config.max_runtime_seconds), _live_output_=True) log.info("Predictions saved to %s", config.output_predictions_file) return dict(training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): #TODO: use rpy2 instead? not necessary here though as the call is very simple log.info("\n**** Random Forest (R) ****\n") is_classification = config.type == 'classification' if not is_classification: raise ValueError('Regression is not supported.') here = dir_of(__file__) run_cmd( r"""Rscript --vanilla -e "source('{script}'); run('{train}', '{test}', '{output}', {cores})" """ .format(script=os.path.join(here, 'exec.R'), train=dataset.train.path, test=dataset.test.path, output=config.output_predictions_file, cores=config.cores), _live_output_=True) log.info("Predictions saved to %s", config.output_predictions_file)
def run_cmd_in_venv(caller_file, cmd, *args, **kwargs):
    # Optional overrides arrive as underscore-wrapped kwargs, e.g. _python_exec_='python3'.
    params = ns(python_exec='python')
    for k, v in params:
        kk = '_' + k + '_'
        if kk in kwargs:
            params[k] = kwargs[kk]
            del kwargs[kk]

    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if os.path.isdir(venv_bin_path):
        py = os.path.join(venv_bin_path, 'python -W ignore')
        pip = os.path.join(venv_bin_path, 'python -m pip')
    else:
        py = f"{params.python_exec} -W ignore"
        pip = f"{params.python_exec} -m pip"

    cmd = cmd.format(py=py, pip=pip)
    return run_cmd(cmd, *args, **kwargs)
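# A minimal usage sketch (not from the source): the command template carries '{py}' and
# '{pip}' placeholders that run_cmd_in_venv substitutes with the interpreter and pip
# invocations resolved above, and '_python_exec_' is the underscore-wrapped override
# picked up by the params loop. The requirements file and script name are hypothetical.
def _setup_example(caller_file):
    run_cmd_in_venv(caller_file, "{pip} install --no-cache-dir -r requirements.txt")
    run_cmd_in_venv(caller_file, "{py} check_install.py", _python_exec_='python3')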
def run(dataset: Dataset, config: TaskConfig):
    with TmpDir() as tmpdir:
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)
        dataset.release()

        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        output, err = run_cmd('{python} {here}/exec_proc.py'.format(python=PYTHON, here=dir_of(__file__)),
                              _input_str_=params)

        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                # The result is serialized as JSON on the line following the token.
                res = json_loads(out.readline(), as_namespace=True)
                break

        def load_data(path):
            return read_csv(path, as_data_frame=False, header=False)

        log.debug("Result from subprocess:\n%s", res)
        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 probabilities=load_data(res.probabilities) if res.probabilities is not None else None,
                                 predictions=load_data(res.predictions).squeeze(),
                                 truth=load_data(res.truth).squeeze(),
                                 target_is_encoded=res.target_is_encoded)
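# A hedged sketch of the child side of the token protocol above (exec_proc.py exists in
# the source, but this body is illustrative, not the actual implementation): the parent
# sends the serialized params on stdin and scans stdout for config.result_token, so the
# child must print the token on its own line, immediately followed by the JSON-encoded
# result that the parent reads via out.readline().
import sys

def _emit_result(result_token: str, result: dict):
    print(result_token)                          # marker line the parent searches for
    print(json_dumps(result, style='compact'))   # payload consumed by json_loads above
    sys.stdout.flush()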
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                process_results=None, python_exec=None):
    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TemporaryDirectory() as tmpdir:

        def make_path(k, v, parents=None):
            # Serialize numpy arrays to .npy files and pass their paths to the subprocess.
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_dir = tmpdir
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _error_level_=logging.DEBUG,
                                  _env_=dict(
                                      PATH=os.pathsep.join([venv_bin_path, os.environ['PATH']]),
                                      PYTHONPATH=os.pathsep.join([
                                          rconfig().root_dir,
                                      ]),
                                      AMLB_PATH=os.path.join(rconfig().root_dir, "amlb")
                                  ))

        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", res)

        if not res:
            raise NoResultError(f"Process crashed:\n{err}")

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(res[name], allow_pickle=True) if res[name] is not None else None

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions(dataset=dataset,
                             output_file=res.output_file,
                             predictions=res.predictions.reshape(-1) if res.predictions is not None else None,
                             truth=res.truth.reshape(-1) if res.truth is not None else None,
                             probabilities=res.probabilities,
                             probabilities_labels=res.probabilities_labels,
                             target_is_encoded=res.target_is_encoded)

        return dict(
            models_count=res.models_count if res.models_count is not None else 1,
            training_duration=res.training_duration if res.training_duration is not None else proc_timer.duration,
            predict_duration=res.predict_duration,
            **res.others.__dict__
        )
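# Illustrative counterpart for the file-based protocol above (a sketch, not the actual
# exec script; it assumes a json_dump helper symmetric to the json_load used above):
# the subprocess receives the params JSON on stdin and reports back by writing its
# result to config.result_file. Array-valued fields ('predictions', 'truth',
# 'probabilities') are paths to .npy files, which the parent reloads with np.load.
def _report_result(config, result: dict):
    # Keys the parent reads back: output_file, predictions, truth, probabilities,
    # probabilities_labels, error_message, models_count, training_duration,
    # predict_duration, target_is_encoded, others.
    json_dump(result, config.result_file, style='compact')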
def run(dataset: Dataset, config: TaskConfig): log.info(f"\n**** AutoWEKA [v{config.framework_version}]****\n") save_metadata(config) is_classification = config.type == 'classification' if not is_classification: raise ValueError('Regression is not supported.') # Mapping of benchmark metrics to Weka metrics metrics_mapping = dict(acc='errorRate', auc='areaUnderROC', logloss='kBInformation') metric = metrics_mapping[ config.metric] if config.metric in metrics_mapping else None if metric is None: raise ValueError("Performance metric {} not supported.".format( config.metric)) train_file = dataset.train.path test_file = dataset.test.path # Weka to requires target as the last attribute if dataset.target.index != len(dataset.predictors): train_file = reorder_dataset(dataset.train.path, target_src=dataset.target.index) test_file = reorder_dataset(dataset.test.path, target_src=dataset.target.index) training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } parallelRuns = config.framework_params.get('_parallelRuns', config.cores) memLimit = config.framework_params.get('_memLimit', 'auto') if memLimit == 'auto': memLimit = max( min(config.max_mem_size_mb, math.ceil(config.max_mem_size_mb / parallelRuns)), 1024) # AutoWEKA default memLimit log.info("Using %sMB memory per run on %s parallel runs.", memLimit, parallelRuns) f = split_path(config.output_predictions_file) f.extension = '.weka_pred.csv' weka_file = path_from_split(f) cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format( here=dir_of(__file__)) cmd_params = dict( t='"{}"'.format(train_file), T='"{}"'.format(test_file), memLimit=memLimit, classifications= '"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""' .format(weka_file), timeLimit=int(config.max_runtime_seconds / 60), parallelRuns=parallelRuns, metric=metric, seed=config.seed % (1 << 16), # weka accepts only int16 as seeds **training_params) cmd = cmd_root + ' '.join( ["-{} {}".format(k, v) for k, v in cmd_params.items()]) with Timer() as training: run_cmd(cmd, _live_output_=True) # if target values are not sorted alphabetically in the ARFF file, then class probabilities are returned in the original order # interestingly, other frameworks seem to always sort the target values first # that's why we need to specify the probabilities labels here: sorting+formatting is done in saving function probabilities_labels = dataset.target.values if not os.path.exists(weka_file): raise NoResultError("AutoWEKA failed producing any prediction.") with open(weka_file, 'r') as weka_file: probabilities = [] predictions = [] truth = [] for line in weka_file.readlines()[1:-1]: inst, actual, predicted, error, *distribution = line.split(',') pred_probabilities = [ pred_probability.replace('*', '').replace('\n', '') for pred_probability in distribution ] _, pred = predicted.split(':') _, tru = actual.split(':') probabilities.append(pred_probabilities) predictions.append(pred) truth.append(tru) save_predictions(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=truth, probabilities_labels=probabilities_labels) return dict(training_duration=training.duration)
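# A worked example (synthetic values) of the CSV parsing above. Weka's prediction output
# has roughly the shape 'inst#,actual,predicted,error,distribution...', with the
# probability of the predicted class starred; e.g. for the line
#   1,2:yes,1:no,+,*0.73,0.27
# the loop strips the '2:'/'1:' value-index prefixes and the '*' marker, appending
# 'no' to predictions, 'yes' to truth, and ['0.73', '0.27'] to probabilities.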
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                process_results=None, python_exec=None):
    here = dir_of(caller_file)
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(here, 'venv/bin/python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TmpDir() as tmpdir:

        def make_path(k, v, parents=None):
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _env_=dict(PYTHONPATH=os.pathsep.join([
                                      rconfig().root_dir,
                                      os.path.join(rconfig().root_dir, "amlb"),
                                  ])))

        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(res[name], allow_pickle=True) if res[name] is not None else None

        log.debug("Result from subprocess:\n%s", res)

        if callable(process_results):
            res = process_results(res)

        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 predictions=res.predictions.reshape(-1) if res.predictions is not None else None,
                                 truth=res.truth.reshape(-1) if res.truth is not None else None,
                                 probabilities=res.probabilities,
                                 target_is_encoded=res.target_is_encoded)

        return dict(
            models_count=res.models_count if res.models_count is not None else 1,
            training_duration=res.training_duration if res.training_duration is not None else proc_timer.duration
        )
def run(dataset: Dataset, config: TaskConfig): log.info(f"\n**** MLNet [v{config.framework_version}] ****\n") avaible_task_list = ['classification', 'regression'] if config.type not in avaible_task_list: raise ValueError(f'{config.type} is not supported.') dir_path = os.path.dirname(os.path.realpath(__file__)) DOTNET_INSTALL_DIR = os.path.join(dir_path, 'lib') os.environ['DOTNET_ROOT'] = DOTNET_INSTALL_DIR os.environ['MLNetCLIEnablePredict'] = 'True' os.environ['MLNET_MAX_THREAD'] = str(config.cores) mlnet = os.path.join(DOTNET_INSTALL_DIR, 'mlnet') train_time_in_seconds = config.max_runtime_seconds sub_command = config.type # set up MODELBUILDER_AUTOML MODELBUILDER_AUTOML = config.framework_params.get('automl_type', 'NNI') os.environ['MODELBUILDER_AUTOML'] = MODELBUILDER_AUTOML artifacts = config.framework_params.get('_save_artifacts', []) tmpdir = tempfile.mkdtemp() tmp_output_folder = os.path.join(tmpdir, str(config.fold)) output_dir = output_subdir( 'models', config=config) if 'models' in artifacts else tmp_output_folder log_dir = output_subdir( 'logs', config=config) if 'logs' in artifacts else tmp_output_folder log_path = os.path.join(log_dir, 'log.txt') try: label = dataset.target.index train_dataset_path = dataset.train.data_path('csv') test_dataset_path = dataset.test.data_path('csv') log.info(f'train dataset: {train_dataset_path}') log.info(f'test dataset: {test_dataset_path}') cmd = ( f"{mlnet} {sub_command}" f" --dataset {train_dataset_path} --test-dataset {test_dataset_path} --train-time {train_time_in_seconds}" f" --label-col {label} --output {os.path.dirname(output_dir)} --name {config.fold}" f" --verbosity q --log-file-path {log_path}") with Timer() as training: run_cmd(cmd) train_result_json = os.path.join(output_dir, '{}.mbconfig'.format(config.fold)) if not os.path.exists(train_result_json): raise NoResultError("MLNet failed producing any prediction.") with open(train_result_json, 'r') as f: json_str = f.read() mb_config = json.loads(json_str) model_path = os.path.join(output_dir, f"{config.fold}.zip") output_prediction_path = os.path.join( log_dir, "prediction.txt" ) # keeping this in log dir as it contains useful error when prediction fails models_count = len(mb_config['RunHistory']['Trials']) # predict predict_cmd = ( f"{mlnet} predict --task-type {config.type}" f" --model {model_path} --dataset {test_dataset_path} --label-col {dataset.target.name} > {output_prediction_path}" ) with Timer() as prediction: run_cmd(predict_cmd) if config.type == 'classification': prediction_df = pd.read_csv(output_prediction_path, dtype={'PredictedLabel': 'object'}) save_predictions( dataset=dataset, output_file=config.output_predictions_file, predictions=prediction_df['PredictedLabel'].values, truth=dataset.test.y, probabilities=prediction_df.values[:, :-1], probabilities_labels=list( prediction_df.columns.values[:-1]), ) if config.type == 'regression': prediction_df = pd.read_csv(output_prediction_path) save_predictions( dataset=dataset, output_file=config.output_predictions_file, predictions=prediction_df['Score'].values, truth=dataset.test.y, ) return dict( models_count=models_count, training_duration=training.duration, predict_duration=prediction.duration, ) finally: if 'logs' in artifacts: logs_zip = os.path.join(log_dir, "logs.zip") zip_path(log_dir, logs_zip) clean_dir(log_dir, filter_=lambda p: p != logs_zip) if 'models' in artifacts: models_zip = os.path.join(output_dir, "models.zip") zip_path(output_dir, models_zip) clean_dir(output_dir, filter_=lambda p: p != models_zip) 
shutil.rmtree(tmpdir, ignore_errors=True)
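# For orientation (shape inferred only from the two keys accessed above, not a full
# schema): the generated .mbconfig file is JSON, and the trial count is read from a
# structure along the lines of {"RunHistory": {"Trials": [{...}, {...}]}}.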