Ejemplo n.º 1
0
def save_artifacts(automl, dataset, config):
    """Save the optional artifacts of an H2O AutoML run (best-effort).

    The artifacts to save are read from the ``_save_artifacts`` framework
    parameter (defaults to ``['leaderboard']``). Recognized entries:

    - ``'leaderboard'``: write the leaderboard to ``leaderboard.csv`` in the
      "models" output subdirectory.
    - ``'models'``: save the models listed in the leaderboard. The format is
      ``'mojo'`` when ``'mojos'`` is also listed, ``'json'`` otherwise.
    - ``'models_predictions'``: save the test-set predictions of every
      leaderboard model and bundle them into ``models_predictions.zip``.
    - ``'logs'``: download all H2O logs into the "logs" output subdirectory.

    Any exception raised while saving is caught and logged: a failure here
    never propagates to the caller.

    :param automl: object exposing ``leaderboard.as_data_frame()``.
    :param dataset: dataset forwarded to ``save_predictions``.
    :param config: task config providing ``framework_params`` and used to
        resolve the output subdirectories.
    """
    artifacts = config.framework_params.get('_save_artifacts', ['leaderboard'])
    try:
        lb = automl.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())

        if 'leaderboard' in artifacts:
            models_dir = output_subdir("models", config)
            write_csv(lb, os.path.join(models_dir, "leaderboard.csv"))

        if 'models' in artifacts:
            models_dir = output_subdir("models", config)
            # First leaderboard entry that is the "all models" stacked
            # ensemble, or None when there is no such entry.
            all_models_se = next(
                (mid for mid in lb['model_id']
                 if mid.startswith("StackedEnsemble_AllModels")), None)
            mformat = 'mojo' if 'mojos' in artifacts else 'json'
            if all_models_se and mformat == 'mojo':
                # In mojo format, only the "all models" ensemble is saved.
                save_model(all_models_se, dest_dir=models_dir, mformat=mformat)
            else:
                for mid in lb['model_id']:
                    save_model(mid, dest_dir=models_dir, mformat=mformat)
                models_archive = os.path.join(models_dir, "models.zip")
                zip_path(models_dir, models_archive)

                def remove_archived_model(path, isdir):
                    # Drop the individual model files once they are archived,
                    # but never the archive itself.
                    if path != models_archive and os.path.splitext(
                            path)[1] in ['.json', '.zip']:
                        os.remove(path)

                # NOTE(review): max_depth=0 presumably restricts the walk to
                # the direct children of models_dir -- confirm in walk_apply.
                walk_apply(models_dir, remove_archived_model, max_depth=0)

        if 'models_predictions' in artifacts:
            predictions_dir = output_subdir("predictions", config)
            test = h2o.get_frame(frame_name('test', config))
            for mid in lb['model_id']:
                model = h2o.get_model(mid)
                save_predictions(model,
                                 test,
                                 dataset=dataset,
                                 config=config,
                                 predictions_file=os.path.join(
                                     predictions_dir, mid, 'predictions.csv'),
                                 preview=False)
            zip_path(predictions_dir,
                     os.path.join(predictions_dir, "models_predictions.zip"))

            def remove_prediction_dir(path, isdir):
                # Per-model prediction directories are redundant once zipped.
                if isdir:
                    shutil.rmtree(path, ignore_errors=True)

            walk_apply(predictions_dir, remove_prediction_dir, max_depth=0)

        if 'logs' in artifacts:
            logs_dir = output_subdir("logs", config)
            h2o.download_all_logs(dirname=logs_dir)
    except Exception:
        # Saving artifacts is best-effort and must not fail the run, but the
        # failure has to be visible at the default log level: DEBUG would
        # silently hide missing artifacts from the user.
        log.warning("Error when saving artifacts.", exc_info=True)
Ejemplo n.º 2
0
def run(dataset: Dataset, config: TaskConfig):
    """Train a model with the ``mlnet`` CLI, predict on the test set and save the predictions.

    The function configures the environment variables read by the CLI, runs
    the training sub-command matching ``config.type``, then runs
    ``mlnet predict`` on the test dataset and stores the predictions through
    ``save_predictions``.

    :param dataset: benchmark dataset; its train/test csv paths, target
        index/name and test labels are used.
    :param config: task configuration (type, fold, cores, max runtime,
        framework params, predictions output file).
    :return: a dict with ``models_count``, ``training_duration`` and
        ``predict_duration``.
    :raises ValueError: if ``config.type`` is neither ``'classification'``
        nor ``'regression'``.
    :raises NoResultError: if training did not produce the expected
        ``<fold>.mbconfig`` file.
    """
    log.info(f"\n**** MLNet [v{config.framework_version}] ****\n")

    supported_task_types = ('classification', 'regression')
    if config.type not in supported_task_types:
        raise ValueError(f'{config.type} is not supported.')

    # The dotnet installation and the mlnet executable are expected in the
    # `lib` folder next to this file.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    DOTNET_INSTALL_DIR = os.path.join(dir_path, 'lib')
    os.environ['DOTNET_ROOT'] = DOTNET_INSTALL_DIR
    os.environ['MLNetCLIEnablePredict'] = 'True'
    os.environ['MLNET_MAX_THREAD'] = str(config.cores)
    mlnet = os.path.join(DOTNET_INSTALL_DIR, 'mlnet')
    train_time_in_seconds = config.max_runtime_seconds
    sub_command = config.type

    # set up MODELBUILDER_AUTOML
    MODELBUILDER_AUTOML = config.framework_params.get('automl_type', 'NNI')
    os.environ['MODELBUILDER_AUTOML'] = MODELBUILDER_AUTOML

    # Models and logs are only kept when requested as artifacts; otherwise
    # they go to a temporary folder that is removed in the `finally` clause.
    artifacts = config.framework_params.get('_save_artifacts', [])
    tmpdir = tempfile.mkdtemp()
    tmp_output_folder = os.path.join(tmpdir, str(config.fold))
    output_dir = output_subdir(
        'models',
        config=config) if 'models' in artifacts else tmp_output_folder
    log_dir = output_subdir(
        'logs', config=config) if 'logs' in artifacts else tmp_output_folder
    log_path = os.path.join(log_dir, 'log.txt')

    try:
        label = dataset.target.index
        train_dataset_path = dataset.train.data_path('csv')
        test_dataset_path = dataset.test.data_path('csv')

        log.info(f'train dataset: {train_dataset_path}')
        log.info(f'test dataset: {test_dataset_path}')

        # The CLI receives the parent of output_dir as `--output` and the fold
        # as `--name`.
        # NOTE(review): presumably the CLI then writes into
        # <output>/<name>, i.e. output_dir -- confirm against the mlnet CLI.
        # NOTE(review): paths are interpolated unquoted into a command string;
        # a path containing spaces would likely break it -- confirm how
        # run_cmd executes the command.
        cmd = (
            f"{mlnet} {sub_command}"
            f" --dataset {train_dataset_path} --test-dataset {test_dataset_path} --train-time {train_time_in_seconds}"
            f" --label-col {label} --output {os.path.dirname(output_dir)} --name {config.fold}"
            f" --verbosity q --log-file-path {log_path}")

        with Timer() as training:
            run_cmd(cmd)

        train_result_json = os.path.join(output_dir,
                                         f"{config.fold}.mbconfig")
        if not os.path.exists(train_result_json):
            raise NoResultError("MLNet failed producing any prediction.")

        # Only keep the file open for the time needed to parse it, not for
        # the whole prediction step.
        with open(train_result_json, 'r') as f:
            mb_config = json.load(f)

        model_path = os.path.join(output_dir, f"{config.fold}.zip")
        # keeping this in log dir as it contains useful error when prediction fails
        output_prediction_path = os.path.join(log_dir, "prediction.txt")
        models_count = len(mb_config['RunHistory']['Trials'])

        # predict: the CLI output is redirected to output_prediction_path,
        # which is then parsed as csv below.
        predict_cmd = (
            f"{mlnet} predict --task-type {config.type}"
            f" --model {model_path} --dataset {test_dataset_path} --label-col {dataset.target.name} > {output_prediction_path}"
        )
        with Timer() as prediction:
            run_cmd(predict_cmd)

        if config.type == 'classification':
            prediction_df = pd.read_csv(output_prediction_path,
                                        dtype={'PredictedLabel': 'object'})
            # Every column but the last one is passed as a class
            # probability, with the column names as probability labels.
            # NOTE(review): presumably `PredictedLabel` is the last column
            # of the CLI output -- confirm.
            save_predictions(
                dataset=dataset,
                output_file=config.output_predictions_file,
                predictions=prediction_df['PredictedLabel'].values,
                truth=dataset.test.y,
                probabilities=prediction_df.values[:, :-1],
                probabilities_labels=list(
                    prediction_df.columns.values[:-1]),
            )
        elif config.type == 'regression':
            prediction_df = pd.read_csv(output_prediction_path)
            save_predictions(
                dataset=dataset,
                output_file=config.output_predictions_file,
                predictions=prediction_df['Score'].values,
                truth=dataset.test.y,
            )

        return dict(
            models_count=models_count,
            training_duration=training.duration,
            predict_duration=prediction.duration,
        )
    finally:
        # Requested artifacts are zipped, then clean_dir is called with a
        # filter that singles out the archive.
        # NOTE(review): presumably everything except the archive is removed
        # -- confirm the meaning of `filter_` in clean_dir.
        if 'logs' in artifacts:
            logs_zip = os.path.join(log_dir, "logs.zip")
            zip_path(log_dir, logs_zip)
            clean_dir(log_dir, filter_=lambda p: p != logs_zip)
        if 'models' in artifacts:
            models_zip = os.path.join(output_dir, "models.zip")
            zip_path(output_dir, models_zip)
            clean_dir(output_dir, filter_=lambda p: p != models_zip)

        shutil.rmtree(tmpdir, ignore_errors=True)
Ejemplo n.º 3
0
            "Setting up %s environment only for %s, no benchmark will be run.",
            args.mode, args.framework)

    if not args.keep_scores and args.mode != 'local':
        log.warning(
            "`keep_scores` parameter is currently ignored in %s mode, scores are always saved in this mode.",
            args.mode)

    bench.setup(amlb.SetupMode[args.setup])
    if args.setup != 'only':
        res = bench.run(args.task, args.fold)
except (ValueError, AutoMLError) as e:
    log.error('\nERROR:\n%s', e)
    if extras.get('verbose') is True:
        log.exception(e)
    code = 1
except Exception as e:
    log.exception(e)
    code = 2
finally:
    archives = amlb.resources.config().archive
    if archives and bench:
        out_dirs = bench.output_dirs
        for d in archives:
            if d in out_dirs:
                zip_path(out_dirs[d], os.path.join(out_dirs.session,
                                                   f"{d}.zip"))
                shutil.rmtree(out_dirs[d], ignore_errors=True)

    sys.exit(code)
Ejemplo n.º 4
0
            args.mode, args.framework)

    if not args.keep_scores and args.mode != 'local':
        log.warning(
            "`keep_scores` parameter is currently ignored in %s mode, scores are always saved in this mode.",
            args.mode)

    bench.setup(amlb.SetupMode[args.setup])
    if args.setup != 'only':
        res = bench.run(args.task, args.fold)
except (ValueError, AutoMLError) as e:
    log.error('\nERROR:\n%s', e)
    if extras.get('verbose') is True:
        log.exception(e)
    code = 1
except Exception as e:
    log.exception(e)
    code = 2
finally:
    archives = amlb.resources.config().archive
    if archives and bench:
        out_dirs = bench.output_dirs
        for d in archives:
            if d in out_dirs:
                zip_path(out_dirs[d],
                         os.path.join(out_dirs.session, f"{d}.zip"),
                         arcpathformat='long')
                shutil.rmtree(out_dirs[d], ignore_errors=True)

    sys.exit(code)