import copy
import io
import logging
import os
import shutil
import sys
import uuid

import h2o
from h2o.automl import H2OAutoML

# Helpers such as ns, write_csv/read_csv, json_dumps/json_loads, run_cmd, dir_of,
# TmpDir, Timer, zip_path, walk_apply, output_subdir, frame_name, save_model,
# save_predictions, save_predictions_to_file, split_path, path_from_split,
# to_data_frame, Dataset, TaskConfig, NoResultError and PYTHON are assumed to be
# provided by the benchmark's own utility and results modules.

log = logging.getLogger(__name__)


def save_artifacts(automl, dataset, config):
    artifacts = config.framework_params.get('_save_artifacts', ['leaderboard'])
    try:
        lb = automl.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())

        if 'leaderboard' in artifacts:
            models_dir = output_subdir("models", config)
            write_csv(lb, os.path.join(models_dir, "leaderboard.csv"))

        if 'models' in artifacts:
            models_dir = output_subdir("models", config)
            # If the "all models" stacked ensemble is present, saving it as a MOJO
            # embeds every base model; otherwise save each model individually.
            all_models_se = next((mid for mid in lb['model_id']
                                  if mid.startswith("StackedEnsemble_AllModels")),
                                 None)
            mformat = 'mojo' if 'mojos' in artifacts else 'json'
            if all_models_se and mformat == 'mojo':
                save_model(all_models_se, dest_dir=models_dir, mformat=mformat)
            else:
                for mid in lb['model_id']:
                    save_model(mid, dest_dir=models_dir, mformat=mformat)

            # Archive the saved models, then remove the now-redundant loose files.
            models_archive = os.path.join(models_dir, "models.zip")
            zip_path(models_dir, models_archive)

            def delete(path, isdir):
                if path != models_archive and os.path.splitext(path)[1] in ['.json', '.zip']:
                    os.remove(path)
            walk_apply(models_dir, delete, max_depth=0)

        if 'models_predictions' in artifacts:
            predictions_dir = output_subdir("predictions", config)
            test = h2o.get_frame(frame_name('test', config))
            for mid in lb['model_id']:
                model = h2o.get_model(mid)
                save_predictions(model, test,
                                 dataset=dataset,
                                 config=config,
                                 predictions_file=os.path.join(predictions_dir, mid, 'predictions.csv'),
                                 preview=False)

            # Archive the per-model prediction files, then remove the originals.
            zip_path(predictions_dir, os.path.join(predictions_dir, "models_predictions.zip"))

            def delete(path, isdir):
                if isdir:
                    shutil.rmtree(path, ignore_errors=True)
            walk_apply(predictions_dir, delete, max_depth=0)

        if 'logs' in artifacts:
            logs_dir = output_subdir("logs", config)
            h2o.download_all_logs(dirname=logs_dir)
    except Exception:
        log.debug("Error when saving artifacts.", exc_info=True)
# Simpler variant of save_artifacts: same artifact selection, but it writes models
# and predictions directly, without the archiving and cleanup steps above.
def save_artifacts(automl, dataset, config):
    artifacts = config.framework_params.get('_save_artifacts', ['leaderboard'])
    try:
        lb = automl.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())

        if 'leaderboard' in artifacts:
            models_dir = make_subdir("models", config)
            write_csv(lb, os.path.join(models_dir, "leaderboard.csv"))

        if 'models' in artifacts:
            models_dir = make_subdir("models", config)
            all_models_se = next((mid for mid in lb['model_id']
                                  if mid.startswith("StackedEnsemble_AllModels")),
                                 None)
            mformat = 'mojo' if 'mojos' in artifacts else 'json'
            if all_models_se:
                save_model(all_models_se, dest_dir=models_dir, mformat=mformat)
            else:
                for mid in lb['model_id']:
                    save_model(mid, dest_dir=models_dir, mformat=mformat)

        if 'models_predictions' in artifacts:
            predictions_dir = make_subdir("predictions", config)
            test = h2o.get_frame(frame_name('test', config))
            for mid in lb['model_id']:
                model = h2o.get_model(mid)
                save_predictions(model, test,
                                 dataset=dataset,
                                 config=config,
                                 predictions_file=os.path.join(predictions_dir, mid, 'predictions.csv'))

        if 'logs' in artifacts:
            logs_dir = make_subdir("logs", config)
            h2o.download_all_logs(dirname=logs_dir)
    except Exception:
        log.debug("Error when saving artifacts.", exc_info=True)
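# Usage sketch (hypothetical values): framework_params keys starting with '_' are
# consumed by the integration itself ('_save_artifacts', '_nthreads', the 'mojos'
# flag) rather than being forwarded to H2OAutoML. With an existing `automl`,
# `dataset` and `config`, requesting every artifact type would look roughly like:
#
#   config.framework_params['_save_artifacts'] = [
#       'leaderboard', 'models', 'mojos', 'models_predictions', 'logs']
#   save_artifacts(automl, dataset, config)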
def run(dataset: Dataset, config: TaskConfig):
    with TmpDir() as tmpdir:
        # Serialize the encoded dataset to temporary CSV files so the framework can run
        # in a separate process that only shares data through the file system.
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)
        dataset.release()

        # A unique token marks the line on stdout after which the subprocess writes its
        # JSON result; everything before it is treated as ordinary log output.
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        output, err = run_cmd('{python} {here}/exec_proc.py'.format(python=PYTHON, here=dir_of(__file__)),
                              _input_str_=params)

        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        def load_data(path):
            return read_csv(path, as_data_frame=False, header=False)

        log.debug("Result from subprocess:\n%s", res)
        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 probabilities=load_data(res.probabilities) if res.probabilities is not None else None,
                                 predictions=load_data(res.predictions).squeeze(),
                                 truth=load_data(res.truth).squeeze(),
                                 target_is_encoded=res.target_is_encoded)
                                 target_is_encoded=False)


if __name__ == '__main__':
    # Child side of the subprocess protocol: read the serialized dataset/config from
    # stdin, train, write results to CSV files in config.result_dir, then echo the
    # result token and a compact JSON payload on stdout for the parent to pick up.
    params = json_loads(sys.stdin.read(), as_namespace=True)

    def load_data(path):
        return read_csv(path, as_data_frame=False, header=False)

    ds = ns(
        train=ns(
            X_enc=load_data(params.dataset.train.X_enc),
            y=load_data(params.dataset.train.y).squeeze()
        ),
        test=ns(
            X_enc=load_data(params.dataset.test.X_enc),
            y=load_data(params.dataset.test.y).squeeze()
        )
    )
    config = params.config
    config.framework_params = ns.dict(config.framework_params)

    result = run(ds, config)

    res = copy.copy(result)
    res.predictions = os.path.join(config.result_dir, 'predictions')
    res.truth = os.path.join(config.result_dir, 'truth')
    write_csv(result.predictions.reshape(-1, 1), res.predictions)
    write_csv(result.truth.reshape(-1, 1), res.truth)
    if result.probabilities is not None:
        res.probabilities = os.path.join(config.result_dir, 'probabilities')
        write_csv(result.probabilities, res.probabilities)

    print(config.result_token)
    print(json_dumps(res, style='compact'))
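# Protocol sketch (illustrative): the parent run() above generates a unique
# result_token, passes it to the child through the serialized config, and then scans
# the child's stdout for a line equal to that token; the very next line is parsed as
# the compact JSON result. The child's final output therefore looks roughly like this
# (values are made up):
#
#   ... framework log lines ...
#   0b1c2d3e-7e5a-11e9-b8f3-0242ac110002
#   {"predictions": "/tmp/xyz/predictions", "truth": "/tmp/xyz/truth", "probabilities": null, ...}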
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** H2O AutoML ****\n")

    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(
        acc='mean_per_class_error',
        auc='AUC',
        logloss='logloss',
        mae='mae',
        mse='mse',
        rmse='rmse',
        rmsle='rmsle'
    )
    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        # Parameters without a leading underscore are forwarded to H2OAutoML as-is;
        # underscore-prefixed ones configure the integration itself.
        training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads, config.max_mem_size_mb)
        h2o.init(nthreads=nthreads,
                 min_mem_size=str(config.max_mem_size_mb) + "M",
                 max_mem_size=str(config.max_mem_size_mb) + "M",
                 log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold)))

        # Load both train and test splits as H2O frames.
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        lb = aml.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        # Save the leaderboard next to the predictions file.
        lbf = split_path(config.output_predictions_file)
        lbf.extension = '.leaderboard.csv'
        lbf = path_from_split(lbf)
        write_csv(lb, lbf)

        h2o_preds = aml.predict(test).as_data_frame(use_pandas=False)
        preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
        y_pred = preds.iloc[:, 0]

        h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False)
        y_truth = to_data_frame(h2o_truth)

        predictions = y_pred.values
        probabilities = preds.iloc[:, 1:].values
        truth = y_truth.values

        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=truth)

        return dict(
            models_count=len(aml.leaderboard),
            training_duration=training.duration
        )
    finally:
        con = h2o.connection()
        if con:
            h2o.remove_all()
            con.close()
            if con.local_server:
                con.local_server.shutdown()
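# Illustrative note (assumption): H2OFrame.as_data_frame(use_pandas=False) returns the
# frame as a list of rows whose first element is the header, which is why the code above
# slices h2o_preds[1:] and reuses h2o_preds[0] as column names. The to_data_frame()
# helper is assumed to behave roughly like a plain pandas constructor; the column names
# and values below are made up for a binary classification task.
import pandas as pd

h2o_preds = [['predict', 'p0', 'p1'],
             ['1', '0.23', '0.77'],
             ['0', '0.81', '0.19']]
preds = pd.DataFrame(h2o_preds[1:], columns=h2o_preds[0])
y_pred = preds.iloc[:, 0]            # predicted labels
probabilities = preds.iloc[:, 1:]    # per-class probabilities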