import logging
import os
import re

import h2o
from h2o.automl import H2OAutoML

# Dataset, TaskConfig, NoResultError, Timer, to_data_frame, save_predictions_to_file,
# split_path, path_from_split and write_csv are helpers provided by the benchmark's
# shared utility modules (their import paths are omitted here).

log = logging.getLogger(__name__)


def save_predictions(model, test, dataset, config, predictions_file=None, preview=True):
    h2o_preds = model.predict(test).as_data_frame(use_pandas=False)
    preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
    y_pred = preds.iloc[:, 0]

    h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False)
    y_truth = to_data_frame(h2o_truth)

    predictions = y_pred.values
    probabilities = preds.iloc[:, 1:].values
    prob_labels = h2o_preds[0][1:]
    if all(re.fullmatch(r"p\d+", p) for p in prob_labels):
        # For categories represented as numerical values, H2O prefixes the probability
        # columns with 'p'; in that case we let the app set the labels to avoid a mismatch.
        prob_labels = None
    truth = y_truth.values

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file if predictions_file is None else predictions_file,
                             probabilities=probabilities,
                             probabilities_labels=prob_labels,
                             predictions=predictions,
                             truth=truth,
                             preview=preview)
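
# Minimal standalone sketch (illustrative only, not part of the benchmark API) of the
# column-name check used above. For a numerically encoded binary target, the header of
# the raw prediction frame typically looks like ['predict', 'p0', 'p1'], so the
# auto-generated probability labels are dropped and the app supplies the real class labels.
def _is_h2o_generated_prob_labels(labels):
    """Return True if every label looks like an auto-generated 'p<digits>' column name."""
    return all(re.fullmatch(r"p\d+", label) for label in labels)

# _is_h2o_generated_prob_labels(['p0', 'p1'])            -> True
# _is_h2o_generated_prob_labels(['setosa', 'virginica']) -> False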

def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(
        acc='mean_per_class_error',
        auc='AUC',
        logloss='logloss',
        mae='mae',
        mse='mse',
        rmse='rmse',
        rmsle='rmsle'
    )
    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        # Keys starting with '_' configure the integration itself; everything else is
        # forwarded verbatim to the H2OAutoML constructor.
        training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads, config.max_mem_size_mb)
        h2o.init(nthreads=nthreads,
                 min_mem_size=str(config.max_mem_size_mb) + "M",
                 max_mem_size=str(config.max_mem_size_mb) + "M",
                 log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold)))

        # Load train and test sets as H2O Frames
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        # Save the leaderboard next to the predictions file, with a '.leaderboard.csv' extension.
        lb = aml.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        lbf = split_path(config.output_predictions_file)
        lbf.extension = '.leaderboard.csv'
        lbf = path_from_split(lbf)
        write_csv(lb, lbf)

        h2o_preds = aml.predict(test).as_data_frame(use_pandas=False)
        preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
        y_pred = preds.iloc[:, 0]

        h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False)
        y_truth = to_data_frame(h2o_truth)

        predictions = y_pred.values
        probabilities = preds.iloc[:, 1:].values
        truth = y_truth.values

        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=truth)

        return dict(
            models_count=len(aml.leaderboard),
            training_duration=training.duration
        )
    finally:
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
            if h2o.connection().local_server:
                h2o.connection().local_server.shutdown()
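
# Standalone sketch (illustrative only, not part of the benchmark API) of the
# framework_params convention used in run(): underscore-prefixed keys configure the
# integration (e.g. '_nthreads'), all other keys are passed through to H2OAutoML.
def _split_framework_params(framework_params, default_cores):
    training_params = {k: v for k, v in framework_params.items() if not k.startswith('_')}
    nthreads = framework_params.get('_nthreads', default_cores)
    return training_params, nthreads

# _split_framework_params({'_nthreads': 4, 'nfolds': 0}, default_cores=8)
#   -> ({'nfolds': 0}, 4)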