def _apply_default_pipeline_settings(pipeline):
    """Specialize the generic feature-data pipeline defaults for multi-label classification."""
    import torch.nn as nn
    from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
    from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
    from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
    from autoPyTorch.pipeline.nodes.train_node import TrainNode
    from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation
    from autoPyTorch.components.metrics.standard_metrics import multilabel_accuracy
    from autoPyTorch.components.preprocessing.loss_weight_strategies import LossWeightStrategyWeightedBinary

    # Start from the generic feature-data defaults, then specialize below.
    AutoNetFeatureData._apply_default_pipeline_settings(pipeline)

    # Labels are independent, so each output gets its own sigmoid.
    pipeline[NetworkSelector.get_name()].add_final_activation('sigmoid', nn.Sigmoid())

    # Binary cross-entropy with logits, plain and with per-label weighting.
    loss_node = pipeline[LossModuleSelector.get_name()]
    loss_node.add_loss_module('bce_with_logits', nn.BCEWithLogitsLoss, None, False)
    loss_node.add_loss_module('bce_with_logits_weighted', nn.BCEWithLogitsLoss,
                              LossWeightStrategyWeightedBinary(), False)

    pipeline[MetricSelector.get_name()].add_metric('multilabel_accuracy', multilabel_accuracy)

    # Accuracy is a score to maximize, not a loss to minimize.
    pipeline[TrainNode.get_name()].default_minimize_value = False

    # Stratified CV splits are not well-defined for multi-label targets.
    pipeline[CrossValidation.get_name()].use_stratified_cv_split_default = False
def _apply_default_pipeline_settings(pipeline):
    """Specialize the generic feature-data pipeline defaults for regression."""
    import torch.nn as nn
    from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
    from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
    from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
    from autoPyTorch.pipeline.nodes.train_node import TrainNode
    from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation
    from autoPyTorch.components.metrics.standard_metrics import mean_distance

    # Inherit the generic feature-data defaults first.
    AutoNetFeatureData._apply_default_pipeline_settings(pipeline)

    # Regression outputs are unbounded: identity (empty Sequential) activation.
    pipeline[NetworkSelector.get_name()].add_final_activation('none', nn.Sequential())

    pipeline[LossModuleSelector.get_name()].add_loss_module('l1_loss', nn.L1Loss)

    pipeline[MetricSelector.get_name()].add_metric('mean_distance', mean_distance)

    # Distance-style metric: lower is better, so minimize.
    pipeline[TrainNode.get_name()].default_minimize_value = True

    # Stratification is undefined for continuous targets.
    pipeline[CrossValidation.get_name()].use_stratified_cv_split_default = False
def score(self, X_test, Y_test, return_loss_value=False):
    """Compute the score on test data using the configured optimize_metric.

    Arguments:
        X_test {array} -- The test data matrix.
        Y_test {array} -- The test targets.

    Keyword Arguments:
        return_loss_value {bool} -- Return the metric's loss value instead
            of its score (default: {False}).

    Returns:
        score -- The score (or loss value) for the test data.
    """
    X_test, Y_test = self.check_data_array_types(X_test, Y_test)
    # Fall back to the base config when no fitted config is available.
    config = self.autonet_config or self.base_config

    # Run the predict pipeline and grab the predictions from its output node.
    self.pipeline.predict_pipeline(pipeline_config=config, X=X_test)
    Y_pred = self.pipeline[OptimizationAlgorithm.get_name()].predict_output['Y']

    # Bring the targets into the same (one-hot) encoding as the predictions.
    encoder_node = self.pipeline[OneHotEncoding.get_name()]
    Y_test = encoder_node.transform_y(Y_test, encoder_node.fit_output['y_one_hot_encoder'])

    metric = self.pipeline[MetricSelector.get_name()].fit_output['optimize_metric']
    if return_loss_value:
        return metric.get_loss_value(Y_pred, Y_test)
    return metric(Y_pred, Y_test)
def test_selector(self):
    """The MetricSelector must expose the chosen optimize and additional metrics."""
    pipeline = Pipeline([MetricSelector()])
    node = pipeline[MetricSelector.get_name()]

    # Register three candidate metrics under short names.
    for metric_name, metric_fn in (("auc", auc_metric),
                                   ("accuracy", accuracy),
                                   ("mean", mean_distance)):
        node.add_metric(metric_name, metric_fn)

    config = pipeline.get_pipeline_config(optimize_metric="accuracy",
                                          additional_metrics=['auc', 'mean'])
    pipeline.fit_pipeline(pipeline_config=config)

    # The fit output must wrap exactly the functions registered above.
    self.assertEqual(node.fit_output['optimize_metric'].metric, accuracy)
    self.assertSetEqual({m.metric for m in node.fit_output['additional_metrics']},
                        {auc_metric, mean_distance})
def predict(self, X, return_probabilities=False, return_metric=False):
    """Predict on X by weighted-summing the predictions of all ensemble members.

    Members with id >= 0 are saved torch models run through the autonet
    pipeline; negative ids are baseline (non-torch) models loaded from pickle.

    Arguments:
        X {array} -- The input data matrix.

    Keyword Arguments:
        return_probabilities {bool} -- also return the raw weighted probabilities.
        return_metric {bool} -- also return the fitted optimize_metric.

    Returns:
        The decoded prediction, or a tuple including probabilities/metric.
    """
    # run predict pipeline
    X, = self.check_data_array_types(X)
    prediction = None
    autonet_config = self.get_current_autonet_config()
    # Ensemble members are addressed by identifier (includes budget) plus weight.
    identifiers_with_budget, weights = self.fit_result["ensemble"].identifiers_, self.fit_result["ensemble"].weights_
    baseline_id2model = BaselineTrainer.identifiers_ens
    # NOTE(review): model_dirs is computed here but never used below.
    model_dirs = [os.path.join(self.autonet_config["result_logger_dir"], "models", str(ident) + ".torch") for ident in identifiers_with_budget]

    # get data preprocessing pipeline
    for ident, weight in zip(identifiers_with_budget, weights):
        # Zero-weight members contribute nothing; skip loading them entirely.
        if weight==0:
            continue
        if ident[0]>=0:
            # Torch model trained by autonet: load and run through the pipeline.
            model_dir = os.path.join(self.autonet_config["result_logger_dir"], "models", str(ident) + ".torch")
            logging.info("==> Inferring model model " + model_dir + ", adding preds with weight " + str(weight))
            model = torch.load(model_dir)
            autonet_config["model"] = model
            current_prediction = self.trained_autonet.pipeline.predict_pipeline(pipeline_config=autonet_config, X=X)['Y']
            # NOTE(review): the first accumulated prediction is NOT scaled by its
            # weight, later ones are — confirm this is intended.
            prediction = current_prediction if prediction is None else prediction + weight * current_prediction
            OHE = self.trained_autonet.pipeline[OneHotEncoding.get_name()]
            metric = self.trained_autonet.pipeline[MetricSelector.get_name()].fit_output['optimize_metric']
        else:
            # Baseline model: negative id indexes into the registered baseline classes.
            model_dir = os.path.join(self.autonet_config["result_logger_dir"], "models", str(ident) + ".pkl")
            info_dir = os.path.join(self.autonet_config["result_logger_dir"], "models", str(ident) + "_info.pkl")
            logging.info("==> Inferring model model " + model_dir + ", adding preds with weight " + str(weight))
            baseline_model = baseline_id2model[ident[0]]()
            baseline_model.load(model_dir, info_dir)
            current_prediction = baseline_model.predict(X_test=X, predict_proba=True)
            prediction = current_prediction if prediction is None else prediction + weight * current_prediction

    # reverse one hot encoding
    # NOTE(review): OHE and metric are only bound inside the torch branch; an
    # ensemble consisting solely of baseline models raises NameError here — confirm.
    result = OHE.reverse_transform_y(prediction, OHE.fit_output['y_one_hot_encoder'])
    if not return_probabilities and not return_metric:
        return result
    result = [result]
    if return_probabilities:
        result.append(prediction)
    if return_metric:
        result.append(metric)
    return tuple(result)
# NOTE(review): stray triple-quote below opens a string / commented-out region
# that continues past this chunk; kept byte-identical.
"""
def _apply_default_pipeline_settings(pipeline):
    """Specialize the generic feature-data pipeline defaults for (single-label) classification."""
    import torch.nn as nn
    from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
    from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
    from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
    from autoPyTorch.pipeline.nodes.train_node import TrainNode
    from autoPyTorch.pipeline.nodes.resampling_strategy_selector import ResamplingStrategySelector
    from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation
    from autoPyTorch.pipeline.nodes.one_hot_encoding import OneHotEncoding
    from autoPyTorch.components.preprocessing.resampling import RandomOverSamplingWithReplacement, RandomUnderSamplingWithReplacement, SMOTE, \
        TargetSizeStrategyAverageSample, TargetSizeStrategyDownsample, TargetSizeStrategyMedianSample, TargetSizeStrategyUpsample
    from autoPyTorch.components.metrics.standard_metrics import accuracy
    from autoPyTorch.components.preprocessing.loss_weight_strategies import LossWeightStrategyWeighted

    # Inherit the generic feature-data defaults first.
    AutoNetFeatureData._apply_default_pipeline_settings(pipeline)

    # Mutually exclusive classes -> softmax over the class dimension.
    pipeline[NetworkSelector.get_name()].add_final_activation('softmax', nn.Softmax(1))

    # Cross-entropy loss, plain and with class-frequency weighting.
    loss_node = pipeline[LossModuleSelector.get_name()]
    loss_node.add_loss_module('cross_entropy', nn.CrossEntropyLoss, None, True)
    loss_node.add_loss_module('cross_entropy_weighted', nn.CrossEntropyLoss,
                              LossWeightStrategyWeighted(), True)

    pipeline[MetricSelector.get_name()].add_metric('accuracy', accuracy)

    # Register the resampling strategies for imbalanced data.
    resampling_node = pipeline[ResamplingStrategySelector.get_name()]
    resampling_node.add_over_sampling_method('random', RandomOverSamplingWithReplacement)
    resampling_node.add_over_sampling_method('smote', SMOTE)
    resampling_node.add_under_sampling_method('random', RandomUnderSamplingWithReplacement)
    for strategy_name, strategy in (('upsample', TargetSizeStrategyUpsample),
                                    ('downsample', TargetSizeStrategyDownsample),
                                    ('average', TargetSizeStrategyAverageSample),
                                    ('median', TargetSizeStrategyMedianSample)):
        resampling_node.add_target_size_strategy(strategy_name, strategy)

    # Accuracy is maximized; class-stratified CV splits are meaningful here.
    pipeline[TrainNode.get_name()].default_minimize_value = False
    pipeline[CrossValidation.get_name()].use_stratified_cv_split_default = True

    # Targets must be one-hot encoded for the softmax/cross-entropy setup.
    pipeline[OneHotEncoding.get_name()].encode_Y = True
    return pipeline
def fit(self, pipeline_config, final_metric_score, optimized_hyperparameter_config, budget, refit=None):
    """Build the final ensemble from the logged per-model predictions.

    Skips ensemble construction on refit, when the ensemble is disabled
    (size 0), or when running as a non-master task.

    Returns:
        dict -- final score, config and budget; extended with the ensemble,
            its validation score and member configs when one is built.
    """
    base_result = {
        "final_metric_score": final_metric_score,
        "optimized_hyperparameter_config": optimized_hyperparameter_config,
        "budget": budget,
    }
    # Only the master task (-1 or 1) builds the ensemble, and never on refit.
    if refit or pipeline_config["ensemble_size"] == 0 or pipeline_config["task_id"] not in [-1, 1]:
        return base_result

    predictions_file = os.path.join(pipeline_config["result_logger_dir"],
                                    'predictions_for_ensemble.npy')
    train_metric = self.pipeline[MetricSelector.get_name()].metrics[pipeline_config["train_metric"]]
    # (attribute name 'complete_y_tranformation' is spelled this way upstream)
    y_transform = self.pipeline[OneHotEncoding.get_name()].complete_y_tranformation
    run_result = logged_results_to_HBS_result(pipeline_config["result_logger_dir"])

    all_predictions, labels, model_identifiers, _ = read_ensemble_prediction_file(
        filename=predictions_file, y_transform=y_transform)

    ensemble_selection, ensemble_configs = build_ensemble(
        result=run_result,
        train_metric=train_metric,
        minimize=pipeline_config["minimize"],
        ensemble_size=pipeline_config["ensemble_size"],
        all_predictions=all_predictions,
        labels=labels,
        model_identifiers=model_identifiers,
        only_consider_n_best=pipeline_config["ensemble_only_consider_n_best"],
        sorted_initialization_n_best=pipeline_config["ensemble_sorted_initialization_n_best"])

    return dict(base_result,
                ensemble=ensemble_selection,
                ensemble_final_metric_score=ensemble_selection.get_validation_performance(),
                ensemble_configs=ensemble_configs)
def _apply_default_pipeline_settings(pipeline):
    """Specialize the generic image-data pipeline defaults for image classification."""
    import torch.nn as nn
    from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
    from autoPyTorch.pipeline.nodes.image.simple_train_node import SimpleTrainNode
    from autoPyTorch.pipeline.nodes.image.cross_validation_indices import CrossValidationIndices
    from autoPyTorch.pipeline.nodes.image.loss_module_selector_indices import LossModuleSelectorIndices
    from autoPyTorch.pipeline.nodes.image.network_selector_datasetinfo import NetworkSelectorDatasetInfo
    from autoPyTorch.components.metrics import accuracy, auc_metric, pac_metric, balanced_accuracy, cross_entropy
    from autoPyTorch.components.preprocessing.loss_weight_strategies import LossWeightStrategyWeighted

    # Inherit the generic image-data defaults first.
    AutoNetImageData._apply_default_pipeline_settings(pipeline)

    # Mutually exclusive classes -> softmax over the class dimension.
    pipeline[NetworkSelectorDatasetInfo.get_name()].add_final_activation('softmax', nn.Softmax(1))

    # Cross-entropy loss, plain and with class-frequency weighting.
    loss_node = pipeline[LossModuleSelectorIndices.get_name()]
    loss_node.add_loss_module('cross_entropy', nn.CrossEntropyLoss, None, True)
    loss_node.add_loss_module('cross_entropy_weighted', nn.CrossEntropyLoss,
                              LossWeightStrategyWeighted(), True)

    # All metrics are loss-transformed; only balanced accuracy needs class labels.
    metric_node = pipeline[MetricSelector.get_name()]
    for metric_name, metric_fn, needs_class_labels in (
            ('accuracy', accuracy, False),
            ('auc_metric', auc_metric, False),
            ('pac_metric', pac_metric, False),
            ('balanced_accuracy', balanced_accuracy, True),
            ('cross_entropy', cross_entropy, False)):
        metric_node.add_metric(metric_name, metric_fn, loss_transform=True,
                               requires_target_class_labels=needs_class_labels)

    # Accuracy-style metrics are maximized; stratified CV is meaningful here.
    pipeline[SimpleTrainNode.get_name()].default_minimize_value = False
    pipeline[CrossValidationIndices.get_name()].use_stratified_cv_split_default = True
def get_default_pipeline(cls):
    """Assemble and return the default image-data pipeline."""
    from autoPyTorch.pipeline.base.pipeline import Pipeline
    from autoPyTorch.pipeline.nodes.image.optimization_algorithm_no_timelimit import OptimizationAlgorithmNoTimeLimit
    from autoPyTorch.pipeline.nodes.one_hot_encoding import OneHotEncoding
    from autoPyTorch.pipeline.nodes.optimizer_selector import OptimizerSelector
    from autoPyTorch.pipeline.nodes.log_functions_selector import LogFunctionsSelector
    from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
    from autoPyTorch.pipeline.nodes.image.simple_scheduler_selector import SimpleLearningrateSchedulerSelector
    from autoPyTorch.pipeline.nodes.image.cross_validation_indices import CrossValidationIndices
    from autoPyTorch.pipeline.nodes.image.autonet_settings_no_shuffle import AutoNetSettingsNoShuffle
    from autoPyTorch.pipeline.nodes.image.network_selector_datasetinfo import NetworkSelectorDatasetInfo
    from autoPyTorch.pipeline.nodes.image.loss_module_selector_indices import LossModuleSelectorIndices
    from autoPyTorch.pipeline.nodes.image.image_augmentation import ImageAugmentation
    from autoPyTorch.pipeline.nodes.image.create_image_dataloader import CreateImageDataLoader
    from autoPyTorch.pipeline.nodes.image.create_dataset_info import CreateDatasetInfo
    from autoPyTorch.pipeline.nodes.image.simple_train_node import SimpleTrainNode
    from autoPyTorch.pipeline.nodes.image.image_dataset_reader import ImageDatasetReader
    from autoPyTorch.pipeline.nodes.image.single_dataset import SingleDataset

    # Nodes executed per CV split, in order.
    cv_nodes = [
        NetworkSelectorDatasetInfo(),
        OptimizerSelector(),
        SimpleLearningrateSchedulerSelector(),
        LogFunctionsSelector(),
        MetricSelector(),
        LossModuleSelectorIndices(),
        ImageAugmentation(),
        CreateImageDataLoader(),
        SimpleTrainNode(),
    ]
    # Per-dataset sub-pipeline: read data, gather info, then cross-validate.
    dataset_nodes = [
        ImageDatasetReader(),
        CreateDatasetInfo(),
        CrossValidationIndices(cv_nodes),
    ]
    pipeline = Pipeline([
        AutoNetSettingsNoShuffle(),
        OptimizationAlgorithmNoTimeLimit([SingleDataset(dataset_nodes)]),
    ])

    cls._apply_default_pipeline_settings(pipeline)
    return pipeline
def get_default_pipeline(cls):
    """Assemble and return the default feature-data pipeline."""
    from autoPyTorch.pipeline.base.pipeline import Pipeline
    from autoPyTorch.pipeline.nodes.autonet_settings import AutoNetSettings
    from autoPyTorch.pipeline.nodes.optimization_algorithm import OptimizationAlgorithm
    from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation
    from autoPyTorch.pipeline.nodes.imputation import Imputation
    from autoPyTorch.pipeline.nodes.normalization_strategy_selector import NormalizationStrategySelector
    from autoPyTorch.pipeline.nodes.one_hot_encoding import OneHotEncoding
    from autoPyTorch.pipeline.nodes.preprocessor_selector import PreprocessorSelector
    from autoPyTorch.pipeline.nodes.resampling_strategy_selector import ResamplingStrategySelector
    from autoPyTorch.pipeline.nodes.embedding_selector import EmbeddingSelector
    from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
    from autoPyTorch.pipeline.nodes.optimizer_selector import OptimizerSelector
    from autoPyTorch.pipeline.nodes.lr_scheduler_selector import LearningrateSchedulerSelector
    from autoPyTorch.pipeline.nodes.log_functions_selector import LogFunctionsSelector
    from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
    from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
    from autoPyTorch.pipeline.nodes.train_node import TrainNode

    # Nodes executed per CV split: preprocessing, model selection, training.
    cv_nodes = [
        Imputation(),
        NormalizationStrategySelector(),
        OneHotEncoding(),
        PreprocessorSelector(),
        ResamplingStrategySelector(),
        EmbeddingSelector(),
        NetworkSelector(),
        OptimizerSelector(),
        LearningrateSchedulerSelector(),
        LogFunctionsSelector(),
        MetricSelector(),
        LossModuleSelector(),
        TrainNode(),
    ]
    pipeline = Pipeline([
        AutoNetSettings(),
        OptimizationAlgorithm([CrossValidation(cv_nodes)]),
    ])

    cls._apply_default_pipeline_settings(pipeline)
    return pipeline
def score(self, X_test, Y_test, return_loss_value=False):
    """Compute the score on test data using the configured optimize_metric.

    Arguments:
        X_test {array} -- The test data matrix.
        Y_test {array} -- The test targets.

    Keyword Arguments:
        return_loss_value {bool} -- Return the metric's loss value instead
            of its score (default: {False}).

    Returns:
        score -- The score (or loss value) for the test data.
    """
    X_test, Y_test = self.check_data_array_types(X_test, Y_test)
    autonet_config = self.get_current_autonet_config()

    res = self.pipeline.predict_pipeline(pipeline_config=autonet_config, X=X_test)
    if 'score' in res:
        # Default datasets (e.g. CIFAR10): the pipeline already scored the
        # corresponding pytorch test set.
        return res['score']
    Y_pred = res['Y']

    # One-hot encode the targets to match the prediction format.
    # FIX: was a bare `except:` which also swallowed KeyboardInterrupt /
    # SystemExit; narrowed to Exception so the best-effort behavior stays
    # but fatal signals propagate. Also fixed the message typo ("encodig").
    try:
        OHE = self.pipeline[OneHotEncoding.get_name()]
        Y_test = OHE.transform_y(Y_test, OHE.fit_output['y_one_hot_encoder'])
    except Exception:
        print("No one-hot encoding possible. Continuing without.")

    metric = self.pipeline[MetricSelector.get_name()].fit_output['optimize_metric']
    if return_loss_value:
        return metric.get_loss_value(Y_pred, Y_test)
    return metric(torch.from_numpy(Y_pred.astype(np.float32)),
                  torch.from_numpy(Y_test.astype(np.float32)))
def score(self, X_test, Y_test):
    """Compute the score on test data using the configured train_metric.

    Arguments:
        X_test {array} -- The test data matrix.
        Y_test {array} -- The test targets.

    Returns:
        score -- The score for the test data.
    """
    # Run the predict pipeline and grab the predictions from its output node.
    self.pipeline.predict_pipeline(pipeline_config=self.autonet_config, X=X_test)
    Y_pred = self.pipeline[OptimizationAlgorithm.get_name()].predict_output['Y']

    # Bring the targets into the same (one-hot) encoding as the predictions.
    encoder_node = self.pipeline[OneHotEncoding.get_name()]
    Y_test = encoder_node.transform_y(Y_test, encoder_node.fit_output['y_one_hot_encoder'])

    metric = self.pipeline[MetricSelector.get_name()].fit_output['train_metric']
    # NOTE(review): argument order here is (Y_test, Y_pred), while sibling
    # score() variants call metric(Y_pred, Y_test) — confirm which is intended.
    return metric(torch.from_numpy(Y_test), torch.from_numpy(Y_pred))
def predict(self, X, return_probabilities=False, return_metric=False):
    """Predict on X by weighted-summing the predictions of all ensemble members.

    Arguments:
        X {array} -- The input data matrix.

    Keyword Arguments:
        return_probabilities {bool} -- also return the raw weighted probabilities.
        return_metric {bool} -- also return the fitted optimize_metric.

    Returns:
        The decoded prediction, or a tuple including probabilities/metric.
    """
    X, = self.check_data_array_types(X)
    # Fall back to the base config when no fitted config is available.
    config = self.autonet_config or self.base_config

    # Accumulate the weighted member predictions.
    accumulated = None
    for weight, member in self.fit_result["ensemble"].get_models_with_weights(self.trained_autonets):
        member_pred = member.pipeline.predict_pipeline(pipeline_config=config, X=X)["Y"]
        # NOTE: as in the original, the first member's prediction enters unweighted.
        accumulated = member_pred if accumulated is None else accumulated + weight * member_pred
        encoder_node = member.pipeline[OneHotEncoding.get_name()]
        metric = member.pipeline[MetricSelector.get_name()].fit_output['optimize_metric']

    # Undo the one-hot encoding to get class labels back.
    decoded = encoder_node.reverse_transform_y(accumulated,
                                               encoder_node.fit_output['y_one_hot_encoder'])
    if not return_probabilities and not return_metric:
        return decoded

    output = [decoded]
    if return_probabilities:
        output.append(accumulated)
    if return_metric:
        output.append(metric)
    return tuple(output)
def save_ensemble_logs(pipeline_config, autonet, result_dir, ensemble_size=None, log_filename=None):
    """Replay ensemble building over time and write one JSON line per step.

    For a logarithmically-spaced set of timestamps, builds an ensemble from
    all model predictions finished before that time, evaluates the configured
    metrics on it, and appends the results to an ensemble log file.

    Arguments:
        pipeline_config {dict} -- must contain "num_ensemble_evaluations".
        autonet -- fitted autonet whose pipeline holds metrics and the encoder.
        result_dir {str} -- directory with logged results and prediction files.

    Keyword Arguments:
        ensemble_size {int} -- override the configured ensemble size (default: None).
        log_filename {str} -- output file name inside result_dir (default: "ensemble_log.json").
    """
    # prepare some variables
    autonet_config = autonet.get_current_autonet_config()
    metrics = autonet.pipeline[MetricSelector.get_name()].metrics
    optimize_metric = metrics[autonet_config["optimize_metric"]]
    # (attribute name 'complete_y_tranformation' is spelled this way upstream)
    y_transform = autonet.pipeline[OneHotEncoding.get_name()].complete_y_tranformation
    result = logged_results_to_HBS_result(result_dir)
    filename = os.path.join(result_dir, "predictions_for_ensemble.npy")
    test_filename = os.path.join(result_dir, "test_predictions_for_ensemble.npy")
    ensemble_log_filename = os.path.join(result_dir, log_filename or "ensemble_log.json")
    # Truncate any previous log; results are appended line by line below.
    with open(ensemble_log_filename, "w") as f:
        pass

    # read the predictions
    predictions, labels, model_identifiers, timestamps = read_ensemble_prediction_file(filename=filename, y_transform=y_transform)
    # The replay below relies on the predictions being in finish-time order.
    assert(list(map(lambda x: x["finished"], timestamps)) == sorted(list(map(lambda x: x["finished"], timestamps))))
    test_data_available = False
    try:
        test_predictions, test_labels, test_model_identifiers, test_timestamps = read_ensemble_prediction_file(filename=test_filename, y_transform=y_transform)
        # Average over the CV-split predictions per model.
        test_predictions = [np.mean(p, axis=0) for p in test_predictions]
        assert test_model_identifiers == model_identifiers and test_timestamps == timestamps, "Different model identifiers or timestamps in test file"
        predictions, model_identifiers, timestamps, test_predictions = \
            filter_nan_predictions(predictions, model_identifiers, timestamps, test_predictions)
        test_data_available = True
    except IOError:
        # No test prediction file: proceed with train/validation data only.
        logging.getLogger("benchmark").info("No test data available when building ensemble logs.")
        predictions, model_identifiers, timestamps = \
            filter_nan_predictions(predictions, model_identifiers, timestamps)

    # compute the prediction subset used to compute performance over time
    # Evaluation times are log-spaced between the first submission and the
    # last finish, giving num_ensemble_evaluations checkpoints.
    start_time = min(map(lambda t: t["submitted"], timestamps))
    end_time = max(map(lambda t: t["finished"], timestamps))
    step = math.log(end_time - start_time) / (pipeline_config["num_ensemble_evaluations"] - 1)
    steps = start_time + np.exp(np.arange(step, step * (pipeline_config["num_ensemble_evaluations"] + 1), step))
    subset_indices = [np.array([i for i, t in enumerate(timestamps) if t["finished"] < s]) for s in steps]

    # iterate over the subset to compute performance over time
    last_finished = 0
    for subset in subset_indices:
        if len(subset) == 0:
            continue
        finished = max(timestamps[s]["finished"] for s in subset)
        # Skip checkpoints where no new model has finished since the last one.
        if finished == last_finished:
            continue
        last_finished = finished
        # Copy so ensemble building cannot mutate the master prediction arrays.
        subset_predictions = [np.copy(predictions[s]) for s in subset]
        subset_model_identifiers = [model_identifiers[s] for s in subset]

        # build an ensemble with current subset and size
        ensemble_start_time = time.time()
        ensemble, _ = build_ensemble(result=result,
            optimize_metric=optimize_metric, ensemble_size=ensemble_size or autonet_config["ensemble_size"],
            all_predictions=subset_predictions, labels=labels, model_identifiers=subset_model_identifiers,
            only_consider_n_best=autonet_config["ensemble_only_consider_n_best"],
            sorted_initialization_n_best=autonet_config["ensemble_sorted_initialization_n_best"])

        # get the ensemble predictions
        ensemble_prediction = ensemble.predict(subset_predictions)
        if test_data_available:
            subset_test_predictions = [np.copy(test_predictions[s]) for s in subset]
            test_ensemble_prediction = ensemble.predict(subset_test_predictions)

        # evaluate the metrics
        # Only the optimize metric and the configured additional metrics are logged.
        metric_performances = dict()
        for metric_name, metric in metrics.items():
            if metric_name != autonet_config["optimize_metric"] and metric_name not in autonet_config["additional_metrics"]:
                continue
            metric_performances[metric_name] = metric(ensemble_prediction, labels)
            if test_data_available:
                metric_performances["test_%s" % metric_name] = metric(test_ensemble_prediction, test_labels)

        ensemble_time = time.time() - ensemble_start_time

        # write to log
        # One JSON array per line: [timestamp, metric dict, (identifier, weight)
        # pairs sorted by descending weight, member identifiers, ensemble info].
        with open(ensemble_log_filename, "a") as f:
            print(json.dumps([
                finished + ensemble_time,
                metric_performances,
                sorted([(identifier, weight) for identifier, weight in zip(ensemble.identifiers_, ensemble.weights_) if weight > 0], key=lambda x: -x[1]),
                [ensemble.identifiers_[i] for i in ensemble.indices_],
                {
                    "ensemble_size": ensemble.ensemble_size,
                    "metric": autonet_config["optimize_metric"],
                    "sorted_initialization_n_best": ensemble.sorted_initialization_n_best,
                    "only_consider_n_best": ensemble.only_consider_n_best,
                    "bagging": ensemble.bagging,
                    "mode": ensemble.mode,
                    "num_input_models": ensemble.num_input_models_,
                    "trajectory": ensemble.trajectory_,
                    "train_score": ensemble.train_score_
                }
            ]), file=f)