@classmethod
def get_default_pipeline(cls):
    """Construct the default pipeline (no ensemble support).

    Returns:
        Pipeline -- The constructed default pipeline
    """
    from autoPyTorch.pipeline.base.pipeline import Pipeline
    from autoPyTorch.pipeline.nodes import AutoNetSettings, OptimizationAlgorithm, \
        CrossValidation, Imputation, NormalizationStrategySelector, OneHotEncoding, PreprocessorSelector, ResamplingStrategySelector, \
        EmbeddingSelector, NetworkSelector, OptimizerSelector, LearningrateSchedulerSelector, LogFunctionsSelector, MetricSelector, \
        LossModuleSelector, TrainNode, CreateDataLoader, CreateDatasetInfo, InitializationSelector

    # build the pipeline: the preprocessing, model selection and training
    # nodes are nested inside the cross validation node, which in turn is
    # nested inside the optimization algorithm node
    pipeline = Pipeline([
        AutoNetSettings(),
        CreateDatasetInfo(),
        OptimizationAlgorithm([
            CrossValidation([
                Imputation(),
                NormalizationStrategySelector(),
                OneHotEncoding(),
                PreprocessorSelector(),
                ResamplingStrategySelector(),
                EmbeddingSelector(),
                NetworkSelector(),
                InitializationSelector(),
                OptimizerSelector(),
                LearningrateSchedulerSelector(),
                LogFunctionsSelector(),
                MetricSelector(),
                LossModuleSelector(),
                CreateDataLoader(),
                TrainNode()
            ])
        ]),
    ])

    cls._apply_default_pipeline_settings(pipeline)
    return pipeline
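
# --- Illustrative usage sketch, not part of the source ---
# A pipeline built this way can be indexed by node name, the same lookup
# pattern used below (e.g. self.pipeline[MetricSelector.get_name()] in
# BuildEnsemble.fit). Importing AutoNetClassification from the package root
# is an assumption based on the usual autoPyTorch layout.
from autoPyTorch import AutoNetClassification
from autoPyTorch.pipeline.nodes import MetricSelector, NetworkSelector

pipeline = AutoNetClassification.get_default_pipeline()
metric_selector = pipeline[MetricSelector.get_name()]    # name-based node lookup
network_selector = pipeline[NetworkSelector.get_name()]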
def fit(self, pipeline_config, final_metric_score, optimized_hyperparameter_config, budget, refit=None):
    # relies on module-level imports: os, logged_results_to_HBS_result (from
    # hpbandster), the build_ensemble / read_ensemble_prediction_file helpers,
    # and the MetricSelector and OneHotEncoding pipeline nodes

    # skip ensemble building when refitting, when no ensemble is requested,
    # or when running as a worker process (task_id not in [-1, 1])
    if refit or pipeline_config["ensemble_size"] == 0 or pipeline_config["task_id"] not in [-1, 1]:
        return {"final_metric_score": final_metric_score,
                "optimized_hyperparameter_config": optimized_hyperparameter_config,
                "budget": budget}

    filename = os.path.join(pipeline_config["result_logger_dir"], 'predictions_for_ensemble.npy')
    train_metric = self.pipeline[MetricSelector.get_name()].metrics[pipeline_config["train_metric"]]
    # attribute name spelling ("tranformation") matches the OneHotEncoding node
    y_transform = self.pipeline[OneHotEncoding.get_name()].complete_y_tranformation
    result = logged_results_to_HBS_result(pipeline_config["result_logger_dir"])

    all_predictions, labels, model_identifiers, _ = read_ensemble_prediction_file(filename=filename, y_transform=y_transform)
    ensemble_selection, ensemble_configs = build_ensemble(
        result=result,
        train_metric=train_metric,
        minimize=pipeline_config["minimize"],
        ensemble_size=pipeline_config["ensemble_size"],
        all_predictions=all_predictions,
        labels=labels,
        model_identifiers=model_identifiers,
        only_consider_n_best=pipeline_config["ensemble_only_consider_n_best"],
        sorted_initialization_n_best=pipeline_config["ensemble_sorted_initialization_n_best"])

    return {"final_metric_score": final_metric_score,
            "optimized_hyperparameter_config": optimized_hyperparameter_config,
            "budget": budget,
            "ensemble": ensemble_selection,
            "ensemble_final_metric_score": ensemble_selection.get_validation_performance(),
            "ensemble_configs": ensemble_configs}
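
# Hypothetical consumer (not in the source) of the dict returned by
# BuildEnsemble.fit above; all key names match the return statements in fit.
def summarize_ensemble_result(fit_result):
    # the ensemble keys are absent when building was skipped (refit run,
    # ensemble_size == 0, or a worker task_id), so check before reading them
    if "ensemble" not in fit_result:
        return "no ensemble built, final score: %s" % fit_result["final_metric_score"]
    return "ensemble validation score %s (budget %s)" % (
        fit_result["ensemble_final_metric_score"], fit_result["budget"])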
@classmethod
def get_default_ensemble_pipeline(cls):
    """Construct a default pipeline, including the nodes needed for ensemble building.

    Returns:
        Pipeline -- The constructed default pipeline
    """
    from autoPyTorch.pipeline.base.pipeline import Pipeline
    from autoPyTorch.pipeline.nodes import AutoNetSettings, OptimizationAlgorithm, \
        CrossValidation, Imputation, NormalizationStrategySelector, OneHotEncoding, PreprocessorSelector, ResamplingStrategySelector, \
        EmbeddingSelector, NetworkSelector, OptimizerSelector, LearningrateSchedulerSelector, LogFunctionsSelector, MetricSelector, \
        LossModuleSelector, TrainNode, CreateDataLoader, CreateDatasetInfo, EnableComputePredictionsForEnsemble, SavePredictionsForEnsemble, \
        BuildEnsemble, EnsembleServer, InitializationSelector, BaselineTrainer

    # build the pipeline: compared to the default pipeline, this adds the
    # EnsembleServer, BaselineTrainer, EnableComputePredictionsForEnsemble,
    # SavePredictionsForEnsemble and BuildEnsemble nodes
    pipeline = Pipeline([
        AutoNetSettings(),
        CreateDatasetInfo(),
        EnsembleServer(),
        OptimizationAlgorithm([
            CrossValidation([
                Imputation(),
                BaselineTrainer(),
                NormalizationStrategySelector(),
                OneHotEncoding(),
                PreprocessorSelector(),
                ResamplingStrategySelector(),
                EmbeddingSelector(),
                NetworkSelector(),
                InitializationSelector(),
                OptimizerSelector(),
                LearningrateSchedulerSelector(),
                LogFunctionsSelector(),
                MetricSelector(),
                EnableComputePredictionsForEnsemble(),
                LossModuleSelector(),
                CreateDataLoader(),
                TrainNode(),
                SavePredictionsForEnsemble()
            ])
        ]),
        BuildEnsemble()
    ])

    cls._apply_default_pipeline_settings(pipeline)
    return pipeline
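
# Illustrative helper (not in the source): pick the ensemble-enabled pipeline
# only when an ensemble is actually requested, mirroring the
# ensemble_size == 0 early-return guard in BuildEnsemble.fit above.
# `autonet_cls` is any class exposing the two constructors defined here.
def pick_default_pipeline(autonet_cls, ensemble_size):
    if ensemble_size > 0:
        return autonet_cls.get_default_ensemble_pipeline()
    return autonet_cls.get_default_pipeline()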
def save_ensemble_logs(pipeline_config, autonet, result_dir, ensemble_size=None, log_filename=None):
    # relies on module-level imports: os, math, time, json, logging,
    # numpy as np, logged_results_to_HBS_result (from hpbandster), the
    # read_ensemble_prediction_file / filter_nan_predictions / build_ensemble
    # helpers, and the MetricSelector and OneHotEncoding pipeline nodes

    # prepare some variables
    autonet_config = autonet.get_current_autonet_config()
    metrics = autonet.pipeline[MetricSelector.get_name()].metrics
    optimize_metric = metrics[autonet_config["optimize_metric"]]
    # attribute name spelling ("tranformation") matches the OneHotEncoding node
    y_transform = autonet.pipeline[OneHotEncoding.get_name()].complete_y_tranformation
    result = logged_results_to_HBS_result(result_dir)
    filename = os.path.join(result_dir, "predictions_for_ensemble.npy")
    test_filename = os.path.join(result_dir, "test_predictions_for_ensemble.npy")
    ensemble_log_filename = os.path.join(result_dir, log_filename or "ensemble_log.json")
    # truncate (or create) the log file before appending to it below
    with open(ensemble_log_filename, "w") as f:
        pass

    # read the predictions
    predictions, labels, model_identifiers, timestamps = read_ensemble_prediction_file(filename=filename, y_transform=y_transform)
    assert list(map(lambda x: x["finished"], timestamps)) == sorted(list(map(lambda x: x["finished"], timestamps)))
    test_data_available = False
    try:
        test_predictions, test_labels, test_model_identifiers, test_timestamps = read_ensemble_prediction_file(
            filename=test_filename, y_transform=y_transform)
        test_predictions = [np.mean(p, axis=0) for p in test_predictions]
        assert test_model_identifiers == model_identifiers and test_timestamps == timestamps, \
            "Different model identifiers or timestamps in test file"
        predictions, model_identifiers, timestamps, test_predictions = \
            filter_nan_predictions(predictions, model_identifiers, timestamps, test_predictions)
        test_data_available = True
    except IOError:
        logging.getLogger("benchmark").info("No test data available when building ensemble logs.")
        predictions, model_identifiers, timestamps = \
            filter_nan_predictions(predictions, model_identifiers, timestamps)

    # compute the prediction subsets used to compute performance over time
    start_time = min(map(lambda t: t["submitted"], timestamps))
    end_time = max(map(lambda t: t["finished"], timestamps))
    step = math.log(end_time - start_time) / (pipeline_config["num_ensemble_evaluations"] - 1)
    steps = start_time + np.exp(np.arange(step, step * (pipeline_config["num_ensemble_evaluations"] + 1), step))
    subset_indices = [np.array([i for i, t in enumerate(timestamps) if t["finished"] < s]) for s in steps]

    # iterate over the subsets to compute performance over time
    last_finished = 0
    for subset in subset_indices:
        if len(subset) == 0:
            continue

        finished = max(timestamps[s]["finished"] for s in subset)
        if finished == last_finished:
            continue  # skip subsets that contain no new models
        last_finished = finished
        subset_predictions = [np.copy(predictions[s]) for s in subset]
        subset_model_identifiers = [model_identifiers[s] for s in subset]

        # build an ensemble with current subset and size
        ensemble_start_time = time.time()
        ensemble, _ = build_ensemble(
            result=result,
            optimize_metric=optimize_metric,
            ensemble_size=ensemble_size or autonet_config["ensemble_size"],
            all_predictions=subset_predictions,
            labels=labels,
            model_identifiers=subset_model_identifiers,
            only_consider_n_best=autonet_config["ensemble_only_consider_n_best"],
            sorted_initialization_n_best=autonet_config["ensemble_sorted_initialization_n_best"])

        # get the ensemble predictions
        ensemble_prediction = ensemble.predict(subset_predictions)
        if test_data_available:
            subset_test_predictions = [np.copy(test_predictions[s]) for s in subset]
            test_ensemble_prediction = ensemble.predict(subset_test_predictions)

        # evaluate the metrics
        metric_performances = dict()
        for metric_name, metric in metrics.items():
            if metric_name != autonet_config["optimize_metric"] and metric_name not in autonet_config["additional_metrics"]:
                continue
            metric_performances[metric_name] = metric(ensemble_prediction, labels)
            if test_data_available:
                metric_performances["test_%s" % metric_name] = metric(test_ensemble_prediction, test_labels)

        ensemble_time = time.time() - ensemble_start_time

        # write to log: one JSON array per evaluated subset
        with open(ensemble_log_filename, "a") as f:
            print(json.dumps([
                finished + ensemble_time,
                metric_performances,
                sorted([(identifier, weight) for identifier, weight in zip(ensemble.identifiers_, ensemble.weights_)
                        if weight > 0], key=lambda x: -x[1]),
                [ensemble.identifiers_[i] for i in ensemble.indices_],
                {
                    "ensemble_size": ensemble.ensemble_size,
                    "metric": autonet_config["optimize_metric"],
                    "sorted_initialization_n_best": ensemble.sorted_initialization_n_best,
                    "only_consider_n_best": ensemble.only_consider_n_best,
                    "bagging": ensemble.bagging,
                    "mode": ensemble.mode,
                    "num_input_models": ensemble.num_input_models_,
                    "trajectory": ensemble.trajectory_,
                    "train_score": ensemble.train_score_
                }
            ]), file=f)