def run(dataset: Dataset, config: TaskConfig):
    """Benchmark entry point for the Oboe AutoML framework.

    Trains an Oboe ``AutoLearner`` ensemble on the encoded training split,
    predicts on the test split and saves predictions in the benchmark's
    standard format.

    :param dataset: benchmark dataset with encoded train/test splits.
    :param config: task configuration (time budget, cores, framework params).
    :return: dict with the trained models count and the training duration.
    :raises ValueError: for regression tasks (not supported by Oboe here).
    :raises NoResultError: if Oboe could not produce any model in time.
    """
    log.info("\n**** Oboe ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        # regression currently fails (as of 26.02.2019: still under development state by oboe team)
        raise ValueError('Regression is not yet supported (under development).')

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    # Params not starting with '_' are forwarded to AutoLearner as-is;
    # '_'-prefixed ones (e.g. '_n_cores') are benchmark-level knobs.
    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_cores = config.framework_params.get('_n_cores', config.cores)

    log.info('Running oboe with a maximum time of {}s on {} cores.'.format(config.max_runtime_seconds, n_cores))
    log.warning('We completely ignore the advice to optimize towards metric: {}.'.format(config.metric))

    aml = AutoLearner(p_type='classification' if is_classification else 'regression',
                      n_cores=n_cores,
                      runtime_limit=config.max_runtime_seconds,
                      **training_params)

    # PEP 8: a named callable should be a def, not a lambda bound to a name.
    def aml_models():
        # The ensemble counts as a model only once it holds base learners.
        return [aml.ensemble, *aml.ensemble.base_learners] if aml.ensemble.base_learners else []

    with Timer() as training:
        try:
            aml.fit(X_train, y_train)
        except IndexError as e:
            # incorrect handling of some IndexError in oboe if ensemble is empty
            if not aml_models():
                raise NoResultError("Oboe could not produce any model in the requested time.") from e
            raise  # bare raise preserves the original traceback (was `raise e`)

    predictions = aml.predict(X_test).reshape(len(X_test))

    if is_classification:
        # One-hot encode the predicted labels to emit class probabilities
        # (Oboe itself returns only hard labels).
        target_values_enc = dataset.target.label_encoder.transform(dataset.target.values)
        probabilities = Encoder('one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions)
    else:
        probabilities = None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(
        models_count=len(aml_models()),
        training_duration=training.duration
    )
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                process_results=None, python_exec=None):
    """Run a framework integration script inside its local virtual environment.

    Serializes the input data and task config, spawns the venv's Python
    interpreter on `script_file` (parameters passed on stdin), then loads the
    result file written by the subprocess and saves the predictions.

    :param caller_file: file of the calling module; the venv is looked up next to it.
    :param script_file: script to execute, relative to the caller's directory.
    :param args: extra command-line arguments forwarded to the subprocess.
    :param input_data: payload (may contain numpy arrays) passed to the subprocess.
    :param dataset: the benchmark dataset; released before spawning to free memory.
    :param config: task configuration; mutated here with result_dir/result_file.
    :param process_results: optional callable post-processing the subprocess result.
    :param python_exec: override for the interpreter; defaults to the local venv.
    :return: dict of results (models count, durations, extras) for the benchmark.
    :raises NoResultError: if the subprocess crashed or reported an error.
    """
    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"
    input_data = ns.from_dict(input_data)
    with TemporaryDirectory() as tmpdir:

        def make_path(k, v, parents=None):
            # Dump numpy arrays to .npy files under tmpdir and substitute their
            # file path, so the payload stays JSON-serializable.
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    # vectors are persisted as (n, 1) column arrays
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()  # free memory before forking the subprocess
        config.result_dir = tmpdir
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _error_level_=logging.DEBUG,
                                  _env_=dict(PATH=os.pathsep.join([venv_bin_path, os.environ['PATH']]),
                                             PYTHONPATH=os.pathsep.join([rconfig().root_dir,]),
                                             AMLB_PATH=os.path.join(rconfig().root_dir, "amlb")),
                                  )

        # NOTE(review): ns(lambda: None) presumably builds a namespace whose
        # missing attributes default to None — verify against amlb.utils.
        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)
        log.debug("Result from subprocess:\n%s", res)
        if not res:
            # no result file at all: the subprocess died before writing it
            raise NoResultError(f"Process crashed:\n{err}")
        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            # these come back as .npy file paths: load them eagerly
            res[name] = np.load(res[name], allow_pickle=True) if res[name] is not None else None

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions(dataset=dataset,
                             output_file=res.output_file,
                             predictions=res.predictions.reshape(-1) if res.predictions is not None else None,
                             truth=res.truth.reshape(-1) if res.truth is not None else None,
                             probabilities=res.probabilities,
                             probabilities_labels=res.probabilities_labels,
                             target_is_encoded=res.target_is_encoded)

        return dict(models_count=res.models_count if res.models_count is not None else 1,
                    training_duration=res.training_duration if res.training_duration is not None else proc_timer.duration,
                    predict_duration=res.predict_duration,
                    **res.others.__dict__)
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                process_results=None, python_exec=None):
    """Run a framework integration script in its local venv (stdout-token protocol).

    Older variant: instead of a result file, the subprocess prints a unique
    token (``config.result_token``) on stdout, immediately followed by one
    line of JSON carrying the result.

    :param caller_file: file of the calling module; the venv lives next to it.
    :param script_file: script to execute, relative to the caller's directory.
    :param args: extra command-line arguments forwarded to the subprocess.
    :param input_data: payload (may contain numpy arrays) passed to the subprocess.
    :param dataset: the benchmark dataset; released before spawning to free memory.
    :param config: task configuration; mutated here with result_token/result_dir.
    :param process_results: optional callable post-processing the subprocess result.
    :param python_exec: override for the interpreter; defaults to the local venv.
    :return: dict with models count and training duration.
    :raises NoResultError: if the subprocess reported an error message.
    """
    here = dir_of(caller_file)
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(here, 'venv/bin/python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"
    input_data = ns.from_dict(input_data)
    with TmpDir() as tmpdir:

        def make_path(k, v, parents=None):
            # Dump numpy arrays to .npy files under tmpdir and substitute their
            # file path, so the payload stays JSON-serializable.
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    # vectors are persisted as (n, 1) column arrays
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()  # free memory before forking the subprocess
        # unique token marking the result line in the subprocess stdout
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _env_=dict(PYTHONPATH=os.pathsep.join([
                                      rconfig().root_dir,
                                      os.path.join(rconfig().root_dir, "amlb"),
                                  ])))

        # Scan captured stdout for the token; the result JSON is on the next line.
        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            # these come back as .npy file paths: load them eagerly
            res[name] = np.load(res[name], allow_pickle=True) if res[name] is not None else None

        log.debug("Result from subprocess:\n%s", res)
        if callable(process_results):
            res = process_results(res)

        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 predictions=res.predictions.reshape(-1) if res.predictions is not None else None,
                                 truth=res.truth.reshape(-1) if res.truth is not None else None,
                                 probabilities=res.probabilities,
                                 target_is_encoded=res.target_is_encoded)

        return dict(models_count=res.models_count if res.models_count is not None else 1,
                    training_duration=res.training_duration if res.training_duration is not None else proc_timer.duration)
def run(dataset: Dataset, config: TaskConfig):
    """Benchmark entry point for AutoWEKA.

    Builds the ``java`` command line for ``AutoWEKAClassifier``, runs it with
    the task's time budget, then parses the Weka CSV prediction output and
    saves predictions in the benchmark's standard format.

    :param dataset: benchmark dataset with ARFF train/test file paths.
    :param config: task configuration (metric, budget, cores, framework params).
    :return: dict with the training duration.
    :raises ValueError: for regression tasks or unsupported metrics.
    :raises NoResultError: if AutoWEKA produced no prediction file.
    """
    log.info(f"\n**** AutoWEKA [v{config.framework_version}]****\n")
    save_metadata(config)

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    # Mapping of benchmark metrics to Weka metrics
    metrics_mapping = dict(acc='errorRate',
                           auc='areaUnderROC',
                           logloss='kBInformation')
    metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if metric is None:
        raise ValueError("Performance metric {} not supported.".format(config.metric))

    train_file = dataset.train.path
    test_file = dataset.test.path
    # Weka to requires target as the last attribute
    if dataset.target.index != len(dataset.predictors):
        train_file = reorder_dataset(dataset.train.path, target_src=dataset.target.index)
        test_file = reorder_dataset(dataset.test.path, target_src=dataset.target.index)

    # '_'-prefixed params are benchmark-level knobs, the rest go to AutoWEKA.
    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    parallelRuns = config.framework_params.get('_parallelRuns', config.cores)

    memLimit = config.framework_params.get('_memLimit', 'auto')
    if memLimit == 'auto':
        memLimit = max(min(config.max_mem_size_mb,
                           math.ceil(config.max_mem_size_mb / parallelRuns)),
                       1024)  # AutoWEKA default memLimit
    log.info("Using %sMB memory per run on %s parallel runs.", memLimit, parallelRuns)

    f = split_path(config.output_predictions_file)
    f.extension = '.weka_pred.csv'
    weka_file = path_from_split(f)

    # NOTE(review): timeLimit is truncated to whole minutes, so budgets under
    # 60s become 0 — confirm how AutoWEKA interprets a 0 time limit.
    cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format(here=dir_of(__file__))
    cmd_params = dict(
        t='"{}"'.format(train_file),
        T='"{}"'.format(test_file),
        memLimit=memLimit,
        classifications='"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""'.format(weka_file),
        timeLimit=int(config.max_runtime_seconds / 60),
        parallelRuns=parallelRuns,
        metric=metric,
        seed=config.seed % (1 << 16),  # weka accepts only int16 as seeds
        **training_params)
    cmd = cmd_root + ' '.join(["-{} {}".format(k, v) for k, v in cmd_params.items()])

    with Timer() as training:
        run_cmd(cmd, _live_output_=True)

    # if target values are not sorted alphabetically in the ARFF file, then class probabilities are returned in the original order
    # interestingly, other frameworks seem to always sort the target values first
    # that's why we need to specify the probabilities labels here: sorting+formatting is done in saving function
    probabilities_labels = dataset.target.values

    if not os.path.exists(weka_file):
        raise NoResultError("AutoWEKA failed producing any prediction.")

    # Fix: use a distinct name for the file handle instead of shadowing the
    # `weka_file` path variable with an (eventually closed) file object.
    with open(weka_file, 'r') as weka_preds:
        probabilities = []
        predictions = []
        truth = []
        # [1:-1] skips the CSV header line and the trailing line of the output
        for line in weka_preds.readlines()[1:-1]:
            inst, actual, predicted, error, *distribution = line.split(',')
            # the predicted class's probability is marked with '*' in the distribution
            pred_probabilities = [pred_probability.replace('*', '').replace('\n', '')
                                  for pred_probability in distribution]
            _, pred = predicted.split(':')  # Weka formats labels as "index:value"
            _, tru = actual.split(':')
            probabilities.append(pred_probabilities)
            predictions.append(pred)
            truth.append(tru)

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,
                     probabilities=probabilities,
                     predictions=predictions,
                     truth=truth,
                     probabilities_labels=probabilities_labels)

    return dict(training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig):
    """Benchmark entry point for ML.NET (mlnet CLI).

    Drives the bundled ``mlnet`` CLI: trains via the task sub-command, then
    predicts with ``mlnet predict``, parses the prediction CSV and saves
    results. Logs/models are optionally archived as benchmark artifacts.

    :param dataset: benchmark dataset exposing CSV train/test paths.
    :param config: task configuration (type, budget, cores, framework params).
    :return: dict with models count, training and prediction durations.
    :raises ValueError: for unsupported task types.
    :raises NoResultError: if training produced no .mbconfig result file.
    """
    log.info(f"\n**** MLNet [v{config.framework_version}] ****\n")

    avaible_task_list = ['classification', 'regression']
    if config.type not in avaible_task_list:
        raise ValueError(f'{config.type} is not supported.')

    dir_path = os.path.dirname(os.path.realpath(__file__))
    DOTNET_INSTALL_DIR = os.path.join(dir_path, 'lib')
    # point the dotnet runtime at the locally installed SDK and cap its threads
    os.environ['DOTNET_ROOT'] = DOTNET_INSTALL_DIR
    os.environ['MLNetCLIEnablePredict'] = 'True'
    os.environ['MLNET_MAX_THREAD'] = str(config.cores)
    mlnet = os.path.join(DOTNET_INSTALL_DIR, 'mlnet')
    train_time_in_seconds = config.max_runtime_seconds
    sub_command = config.type

    # set up MODELBUILDER_AUTOML
    MODELBUILDER_AUTOML = config.framework_params.get('automl_type', 'NNI')
    os.environ['MODELBUILDER_AUTOML'] = MODELBUILDER_AUTOML

    artifacts = config.framework_params.get('_save_artifacts', [])
    tmpdir = tempfile.mkdtemp()
    tmp_output_folder = os.path.join(tmpdir, str(config.fold))
    # models/logs go to persistent artifact dirs only when requested
    output_dir = output_subdir('models', config=config) if 'models' in artifacts else tmp_output_folder
    log_dir = output_subdir('logs', config=config) if 'logs' in artifacts else tmp_output_folder
    log_path = os.path.join(log_dir, 'log.txt')

    try:
        label = dataset.target.index
        train_dataset_path = dataset.train.data_path('csv')
        test_dataset_path = dataset.test.data_path('csv')
        log.info(f'train dataset: {train_dataset_path}')
        log.info(f'test dataset: {test_dataset_path}')

        cmd = (f"{mlnet} {sub_command}"
               f" --dataset {train_dataset_path} --test-dataset {test_dataset_path} --train-time {train_time_in_seconds}"
               f" --label-col {label} --output {os.path.dirname(output_dir)} --name {config.fold}"
               f" --verbosity q --log-file-path {log_path}")

        with Timer() as training:
            run_cmd(cmd)

        train_result_json = os.path.join(output_dir, '{}.mbconfig'.format(config.fold))
        if not os.path.exists(train_result_json):
            raise NoResultError("MLNet failed producing any prediction.")

        with open(train_result_json, 'r') as f:
            json_str = f.read()
            mb_config = json.loads(json_str)
            model_path = os.path.join(output_dir, f"{config.fold}.zip")
            output_prediction_path = os.path.join(log_dir, "prediction.txt")  # keeping this in log dir as it contains useful error when prediction fails
            models_count = len(mb_config['RunHistory']['Trials'])

            # predict
            predict_cmd = (f"{mlnet} predict --task-type {config.type}"
                           f" --model {model_path} --dataset {test_dataset_path} --label-col {dataset.target.name} > {output_prediction_path}")
            with Timer() as prediction:
                run_cmd(predict_cmd)

            if config.type == 'classification':
                # keep labels as strings so they compare cleanly with the truth column
                prediction_df = pd.read_csv(output_prediction_path, dtype={'PredictedLabel': 'object'})
                save_predictions(
                    dataset=dataset,
                    output_file=config.output_predictions_file,
                    predictions=prediction_df['PredictedLabel'].values,
                    truth=dataset.test.y,
                    # all columns but the last hold the per-class probabilities
                    probabilities=prediction_df.values[:, :-1],
                    probabilities_labels=list(prediction_df.columns.values[:-1]),
                )

            if config.type == 'regression':
                prediction_df = pd.read_csv(output_prediction_path)
                save_predictions(
                    dataset=dataset,
                    output_file=config.output_predictions_file,
                    predictions=prediction_df['Score'].values,
                    truth=dataset.test.y,
                )

            return dict(
                models_count=models_count,
                training_duration=training.duration,
                predict_duration=prediction.duration,
            )
    finally:
        # archive requested artifacts, then always drop the scratch directory
        if 'logs' in artifacts:
            logs_zip = os.path.join(log_dir, "logs.zip")
            zip_path(log_dir, logs_zip)
            clean_dir(log_dir, filter_=lambda p: p != logs_zip)
        if 'models' in artifacts:
            models_zip = os.path.join(output_dir, "models.zip")
            zip_path(output_dir, models_zip)
            clean_dir(output_dir, filter_=lambda p: p != models_zip)
        shutil.rmtree(tmpdir, ignore_errors=True)
def run(dataset: Dataset, config: TaskConfig):
    """Benchmark entry point for H2O AutoML (monitored variant).

    Starts a local H2O cluster sized from the task config (2/3 of the memory
    budget, deterministic-ish port), trains AutoML under an optional backend
    memory monitor, then saves predictions and artifacts. The cluster is
    always shut down in the ``finally`` block.

    :param dataset: benchmark dataset with train/test file paths.
    :param config: task configuration (metric, budget, cores, framework params).
    :return: dict with leaderboard size and training duration.
    :raises NoResultError: if AutoML produced no leader model in time.
    """
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           r2='r2',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        # '_'-prefixed params are benchmark-level knobs, the rest go to H2OAutoML.
        training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
        nthreads = config.framework_params.get('_nthreads', config.cores)
        jvm_memory = str(round(config.max_mem_size_mb * 2 / 3)) + "M"  # leaving 1/3rd of available memory for XGBoost

        log.info("Starting H2O cluster with %s cores, %s memory.", nthreads, jvm_memory)
        max_port_range = 49151
        min_port_range = 1024
        # derive a port from the pid so parallel benchmark processes don't collide
        rnd_port = os.getpid() % (max_port_range - min_port_range) + min_port_range
        port = config.framework_params.get('_port', rnd_port)

        h2o.init(nthreads=nthreads,
                 port=port,
                 min_mem_size=jvm_memory,
                 max_mem_size=jvm_memory,
                 strict_version_check=config.framework_params.get('_strict_version_check', True)
                 # log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold))
                 )

        # Load train as an H2O Frame, but test as a Pandas DataFrame
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path, destination_frame=frame_name('train', config))
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path, destination_frame=frame_name('test', config))
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        max_runtime_secs_per_model=round(config.max_runtime_seconds / 2),  # to prevent timeout on ensembles
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        # Optional backend memory monitoring; the fallback builds a no-op
        # context manager from a single-item iterator (pre-3.7 nullcontext).
        monitor = (BackendMemoryMonitoring(frequency_seconds=rconfig().monitoring.frequency_seconds,
                                           check_on_exit=True,
                                           verbosity=rconfig().monitoring.verbosity)
                   if config.framework_params.get('_monitor_backend', False)
                   # else contextlib.nullcontext  # Py 3.7+ only
                   else contextlib.contextmanager(iter)([0]))
        with Timer() as training:
            with monitor:
                aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        save_predictions(aml, test, dataset=dataset, config=config)
        save_artifacts(aml, dataset=dataset, config=config)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)
    finally:
        # always tear the cluster down, even if training failed
        if h2o.connection():
            # h2o.remove_all()
            h2o.connection().close()
        if h2o.connection().local_server:
            h2o.connection().local_server.shutdown()
def run(dataset: Dataset, config: TaskConfig):
    """Train H2O AutoML on the given task and persist its predictions.

    Brings up a local H2O cluster sized from the task config, runs AutoML
    for the configured time budget, then stores predictions and artifacts.
    The cluster is torn down in all cases via the ``finally`` block.

    :param dataset: benchmark dataset with train/test file paths.
    :param config: task configuration (metric, budget, cores, framework params).
    :return: dict with leaderboard size and training duration.
    :raises NoResultError: if AutoML produced no leader model in time.
    """
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = {
        'acc': 'mean_per_class_error',
        'auc': 'AUC',
        'logloss': 'logloss',
        'mae': 'mae',
        'mse': 'mse',
        'rmse': 'rmse',
        'rmsle': 'rmsle',
    }
    sort_metric = metrics_mapping.get(config.metric)
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        # forward everything except '_'-prefixed benchmark-level knobs
        training_params = {}
        for key, value in config.framework_params.items():
            if not key.startswith('_'):
                training_params[key] = value
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads, config.max_mem_size_mb)
        mem_size = str(config.max_mem_size_mb) + "M"
        h2o.init(nthreads=nthreads,
                 min_mem_size=mem_size,
                 max_mem_size=mem_size)

        # Load train as an H2O Frame, but test as a Pandas DataFrame
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path, destination_frame=frame_name('train', config))
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path, destination_frame=frame_name('test', config))

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)
        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        save_predictions(aml, test, dataset=dataset, config=config)
        save_artifacts(aml, dataset=dataset, config=config)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)
    finally:
        # always tear the cluster down, even when training failed
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
        if h2o.connection().local_server:
            h2o.connection().local_server.shutdown()
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                options: Union[None, dict, ns] = None,
                process_results=None,
                python_exec=None):
    """Run a framework integration script in its local venv (serialization-aware).

    Newest variant: input arrays are serialized via the configurable
    serialization backend, the subprocess environment can be extended through
    ``options['env']``, and its memory may be monitored.

    :param caller_file: file of the calling module; the venv lives next to it.
    :param script_file: script to execute, relative to the caller's directory.
    :param args: extra command-line arguments forwarded to the subprocess.
    :param input_data: payload passed to the subprocess.
    :param dataset: the benchmark dataset.
    :param config: task configuration; mutated here with result_dir/result_file.
    :param options: optional dict/ns with 'serialization' and 'env' entries.
    :param process_results: optional callable post-processing the subprocess result.
    :param python_exec: override for the interpreter; defaults to the local venv.
    :return: dict of results (models count, durations, extras) for the benchmark.
    :raises NoResultError: if the subprocess crashed or reported an error.
    """
    here = dir_of(caller_file)
    if python_exec is None:  # use local virtual env by default
        python_exec = venv_python_exec(here)
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    options = ns.from_dict(options) if options else ns()
    ser_config = options['serialization']
    env = options['env'] or ns()

    with TemporaryDirectory() as tmpdir:
        ds = _make_input_dataset(input_data, dataset, tmpdir, serialization=ser_config)
        config.result_dir = tmpdir
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config, options=options), style='compact')
        log.debug("Params passed to subprocess:\n%s", params)

        # monitor subprocess memory only when that statistic was requested
        cmon = rconfig().monitoring
        monitor = (dict(interval_seconds=cmon.interval_seconds, verbosity=cmon.verbosity)
                   if 'sub_proc_memory' in cmon.statistics
                   else None)

        # base environment for the subprocess, extended/overridden by options['env']
        env = dict(PATH=os.pathsep.join([venv_bin(here), os.environ['PATH']]),
                   PYTHONPATH=os.pathsep.join([rconfig().root_dir,]),
                   AMLB_PATH=os.path.join(rconfig().root_dir, "amlb"),
                   AMLB_LOG_TRACE=str(logging.TRACE if hasattr(logging, 'TRACE') else ''),
                   **{k: str(v) for k, v in env})

        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _error_level_=logging.DEBUG,
                                  _env_=env,
                                  _monitor_=monitor)

        # NOTE(review): ns(lambda: None) presumably builds a namespace whose
        # missing attributes default to None — verify against amlb.utils.
        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)
        log.debug("Result from subprocess:\n%s", res)
        if not res:
            # no result file at all: the subprocess died before writing it
            raise NoResultError(f"Process crashed:\n{err}")
        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            # arrays came back as serialized files: load them eagerly
            res[name] = deserialize_data(res[name], config=ser_config) if res[name] is not None else None

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions(dataset=dataset,
                             output_file=res.output_file,
                             predictions=as_vec(res.predictions),
                             # fall back to the dataset's own truth column when missing
                             truth=(as_vec(res.truth) if res.truth is not None
                                    else dataset.test.y_enc if res.target_is_encoded
                                    else dataset.test.y),
                             probabilities=res.probabilities,
                             probabilities_labels=res.probabilities_labels,
                             target_is_encoded=res.target_is_encoded)

        return dict(models_count=res.models_count if res.models_count is not None else 1,
                    training_duration=res.training_duration if res.training_duration is not None else proc_timer.duration,
                    predict_duration=res.predict_duration,
                    **res.others.__dict__)
def run(dataset: Dataset, config: TaskConfig):
    """Benchmark entry point for H2O AutoML (leaderboard-export variant).

    Starts a local H2O cluster using the full memory budget, trains AutoML,
    writes the leaderboard to a CSV alongside the predictions file, then
    converts H2O prediction/truth frames to data frames and saves them.
    The cluster is always shut down in the ``finally`` block.

    :param dataset: benchmark dataset with train/test file paths.
    :param config: task configuration (metric, budget, cores, framework params).
    :return: dict with leaderboard size and training duration.
    :raises NoResultError: if AutoML produced no leader model in time.
    """
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        # '_'-prefixed params are benchmark-level knobs, the rest go to H2OAutoML.
        training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads, config.max_mem_size_mb)
        h2o.init(nthreads=nthreads,
                 min_mem_size=str(config.max_mem_size_mb) + "M",
                 max_mem_size=str(config.max_mem_size_mb) + "M",
                 log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold)))

        # Load train as an H2O Frame, but test as a Pandas DataFrame
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        # export the leaderboard next to the predictions file
        lb = aml.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        lbf = split_path(config.output_predictions_file)
        lbf.extension = '.leaderboard.csv'
        lbf = path_from_split(lbf)
        write_csv(lb, lbf)

        # first column is the predicted label, the rest are class probabilities
        h2o_preds = aml.predict(test).as_data_frame(use_pandas=False)
        preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
        y_pred = preds.iloc[:, 0]

        h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False)
        y_truth = to_data_frame(h2o_truth)

        predictions = y_pred.values
        probabilities = preds.iloc[:, 1:].values
        truth = y_truth.values

        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=truth)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)
    finally:
        # always tear the cluster down, even when training failed
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
        if h2o.connection().local_server:
            h2o.connection().local_server.shutdown()