def save_artifacts(automl, dataset, config):
    artifacts = config.framework_params.get('_save_artifacts', ['leaderboard'])
    try:
        lb = automl.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        if 'leaderboard' in artifacts:
            models_dir = output_subdir("models", config)
            write_csv(lb, os.path.join(models_dir, "leaderboard.csv"))

        if 'models' in artifacts:
            models_dir = output_subdir("models", config)
            all_models_se = next((mid for mid in lb['model_id']
                                  if mid.startswith("StackedEnsemble_AllModels")),
                                 None)
            mformat = 'mojo' if 'mojos' in artifacts else 'json'
            if all_models_se and mformat == 'mojo':
                # prefer a single MOJO of the all-models ensemble when one is available
                save_model(all_models_se, dest_dir=models_dir, mformat=mformat)
            else:
                for mid in lb['model_id']:
                    save_model(mid, dest_dir=models_dir, mformat=mformat)

            models_archive = os.path.join(models_dir, "models.zip")
            zip_path(models_dir, models_archive)

            def delete(path, isdir):
                # keep only the archive: remove the loose .json/.zip files that were just zipped
                if path != models_archive and os.path.splitext(path)[1] in ['.json', '.zip']:
                    os.remove(path)

            walk_apply(models_dir, delete, max_depth=0)

        if 'models_predictions' in artifacts:
            predictions_dir = output_subdir("predictions", config)
            test = h2o.get_frame(frame_name('test', config))
            for mid in lb['model_id']:
                model = h2o.get_model(mid)
                save_predictions(model, test,
                                 dataset=dataset,
                                 config=config,
                                 predictions_file=os.path.join(predictions_dir, mid, 'predictions.csv'),
                                 preview=False)
            zip_path(predictions_dir, os.path.join(predictions_dir, "models_predictions.zip"))

            def delete(path, isdir):
                # keep only the archive: remove the per-model prediction directories
                if isdir:
                    shutil.rmtree(path, ignore_errors=True)

            walk_apply(predictions_dir, delete, max_depth=0)

        if 'logs' in artifacts:
            logs_dir = output_subdir("logs", config)
            h2o.download_all_logs(dirname=logs_dir)
    except Exception:
        log.debug("Error when saving artifacts.", exc_info=True)
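The 'models' and 'models_predictions' branches above follow the same zip-then-prune pattern: archive a directory in place, then delete the loose files or folders that were just archived so only the zip remains. Below is a minimal standalone sketch of that pattern using only the standard library; `zip_and_prune` is a hypothetical helper written for illustration, not the benchmark's `zip_path`/`walk_apply` API.

# Hypothetical standalone sketch of the zip-then-prune pattern used above.
# This is not the benchmark's zip_path/walk_apply helpers, only an illustration.
import os
import zipfile

def zip_and_prune(directory, archive_name="models.zip", prune_exts=(".json", ".zip")):
    archive = os.path.join(directory, archive_name)
    # archive every regular file in the directory, except the archive itself
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as zf:
        for entry in os.listdir(directory):
            path = os.path.join(directory, entry)
            if os.path.isfile(path) and path != archive:
                zf.write(path, arcname=entry)
    # prune the loose files that were just archived, keeping the archive
    for entry in os.listdir(directory):
        path = os.path.join(directory, entry)
        if path != archive and os.path.splitext(path)[1] in prune_exts:
            os.remove(path)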
def run(dataset: Dataset, config: TaskConfig):
    log.info(f"\n**** MLNet [v{config.framework_version}] ****\n")

    available_task_list = ['classification', 'regression']
    if config.type not in available_task_list:
        raise ValueError(f'{config.type} is not supported.')

    dir_path = os.path.dirname(os.path.realpath(__file__))
    DOTNET_INSTALL_DIR = os.path.join(dir_path, 'lib')
    os.environ['DOTNET_ROOT'] = DOTNET_INSTALL_DIR
    os.environ['MLNetCLIEnablePredict'] = 'True'
    os.environ['MLNET_MAX_THREAD'] = str(config.cores)
    mlnet = os.path.join(DOTNET_INSTALL_DIR, 'mlnet')
    train_time_in_seconds = config.max_runtime_seconds
    sub_command = config.type

    # set up MODELBUILDER_AUTOML
    MODELBUILDER_AUTOML = config.framework_params.get('automl_type', 'NNI')
    os.environ['MODELBUILDER_AUTOML'] = MODELBUILDER_AUTOML

    artifacts = config.framework_params.get('_save_artifacts', [])
    tmpdir = tempfile.mkdtemp()
    tmp_output_folder = os.path.join(tmpdir, str(config.fold))
    output_dir = output_subdir('models', config=config) if 'models' in artifacts else tmp_output_folder
    log_dir = output_subdir('logs', config=config) if 'logs' in artifacts else tmp_output_folder
    log_path = os.path.join(log_dir, 'log.txt')

    try:
        label = dataset.target.index
        train_dataset_path = dataset.train.data_path('csv')
        test_dataset_path = dataset.test.data_path('csv')
        log.info(f'train dataset: {train_dataset_path}')
        log.info(f'test dataset: {test_dataset_path}')

        # train with the ML.NET CLI
        cmd = (f"{mlnet} {sub_command}"
               f" --dataset {train_dataset_path} --test-dataset {test_dataset_path} --train-time {train_time_in_seconds}"
               f" --label-col {label} --output {os.path.dirname(output_dir)} --name {config.fold}"
               f" --verbosity q --log-file-path {log_path}")
        with Timer() as training:
            run_cmd(cmd)

        train_result_json = os.path.join(output_dir, f'{config.fold}.mbconfig')
        if not os.path.exists(train_result_json):
            raise NoResultError("MLNet failed to produce any prediction.")

        with open(train_result_json, 'r') as f:
            json_str = f.read()
            mb_config = json.loads(json_str)
            model_path = os.path.join(output_dir, f"{config.fold}.zip")
            # kept in the log dir because it contains useful errors when prediction fails
            output_prediction_path = os.path.join(log_dir, "prediction.txt")
            models_count = len(mb_config['RunHistory']['Trials'])

            # predict
            predict_cmd = (f"{mlnet} predict --task-type {config.type}"
                           f" --model {model_path} --dataset {test_dataset_path}"
                           f" --label-col {dataset.target.name} > {output_prediction_path}")
            with Timer() as prediction:
                run_cmd(predict_cmd)

            if config.type == 'classification':
                prediction_df = pd.read_csv(output_prediction_path, dtype={'PredictedLabel': 'object'})
                save_predictions(
                    dataset=dataset,
                    output_file=config.output_predictions_file,
                    predictions=prediction_df['PredictedLabel'].values,
                    truth=dataset.test.y,
                    probabilities=prediction_df.values[:, :-1],
                    probabilities_labels=list(prediction_df.columns.values[:-1]),
                )
            if config.type == 'regression':
                prediction_df = pd.read_csv(output_prediction_path)
                save_predictions(
                    dataset=dataset,
                    output_file=config.output_predictions_file,
                    predictions=prediction_df['Score'].values,
                    truth=dataset.test.y,
                )

            return dict(
                models_count=models_count,
                training_duration=training.duration,
                predict_duration=prediction.duration,
            )
    finally:
        if 'logs' in artifacts:
            logs_zip = os.path.join(log_dir, "logs.zip")
            zip_path(log_dir, logs_zip)
            clean_dir(log_dir, filter_=lambda p: p != logs_zip)
        if 'models' in artifacts:
            models_zip = os.path.join(output_dir, "models.zip")
            zip_path(output_dir, models_zip)
            clean_dir(output_dir, filter_=lambda p: p != models_zip)
        shutil.rmtree(tmpdir, ignore_errors=True)
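For reference, a self-contained sketch of the classification parsing step above. It assumes, as the parsing code does, that the ML.NET prediction file ends with a 'PredictedLabel' column preceded by one probability column per class; the sample rows below are invented for illustration, only the column handling mirrors the code.

# Illustration of the prediction-parsing step for classification; the sample
# rows are made up, and the column layout (per-class probabilities followed by
# 'PredictedLabel' as the last column) is an assumption mirrored from the code.
import pandas as pd
from io import StringIO

sample_output = StringIO("no,yes,PredictedLabel\n0.8,0.2,no\n0.1,0.9,yes\n")
df = pd.read_csv(sample_output, dtype={'PredictedLabel': 'object'})
predictions = df['PredictedLabel'].values            # predicted classes
probabilities = df.values[:, :-1]                    # every column except the label
probabilities_labels = list(df.columns.values[:-1])  # class names, in column order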
"Setting up %s environment only for %s, no benchmark will be run.", args.mode, args.framework) if not args.keep_scores and args.mode != 'local': log.warning( "`keep_scores` parameter is currently ignored in %s mode, scores are always saved in this mode.", args.mode) bench.setup(amlb.SetupMode[args.setup]) if args.setup != 'only': res = bench.run(args.task, args.fold) except (ValueError, AutoMLError) as e: log.error('\nERROR:\n%s', e) if extras.get('verbose') is True: log.exception(e) code = 1 except Exception as e: log.exception(e) code = 2 finally: archives = amlb.resources.config().archive if archives and bench: out_dirs = bench.output_dirs for d in archives: if d in out_dirs: zip_path(out_dirs[d], os.path.join(out_dirs.session, f"{d}.zip")) shutil.rmtree(out_dirs[d], ignore_errors=True) sys.exit(code)
args.mode, args.framework) if not args.keep_scores and args.mode != 'local': log.warning( "`keep_scores` parameter is currently ignored in %s mode, scores are always saved in this mode.", args.mode) bench.setup(amlb.SetupMode[args.setup]) if args.setup != 'only': res = bench.run(args.task, args.fold) except (ValueError, AutoMLError) as e: log.error('\nERROR:\n%s', e) if extras.get('verbose') is True: log.exception(e) code = 1 except Exception as e: log.exception(e) code = 2 finally: archives = amlb.resources.config().archive if archives and bench: out_dirs = bench.output_dirs for d in archives: if d in out_dirs: zip_path(out_dirs[d], os.path.join(out_dirs.session, f"{d}.zip"), arcpathformat='long') shutil.rmtree(out_dirs[d], ignore_errors=True) sys.exit(code)