def _apply_default_pipeline_settings(pipeline):
    """Specialize the shared feature-data pipeline defaults for regression.

    Registers a pass-through final activation, L1 loss and the mean-distance
    metric, and switches CV/training defaults to a minimized, unstratified setup.
    """
    import torch.nn as nn

    from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
    from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
    from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
    from autoPyTorch.pipeline.nodes.train_node import TrainNode
    from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation
    from autoPyTorch.components.metrics.standard_metrics import mean_distance

    # Start from the common feature-data defaults, then override per task.
    AutoNetFeatureData._apply_default_pipeline_settings(pipeline)

    # Regression emits the raw network output: an empty Sequential is a no-op activation.
    pipeline[NetworkSelector.get_name()].add_final_activation('none', nn.Sequential())
    pipeline[LossModuleSelector.get_name()].add_loss_module('l1_loss', nn.L1Loss)
    pipeline[MetricSelector.get_name()].add_metric('mean_distance', mean_distance)

    # Mean distance is an error measure, so lower is better.
    pipeline[TrainNode.get_name()].default_minimize_value = True

    # Stratified splitting is undefined for continuous targets.
    pipeline[CrossValidation.get_name()].use_stratified_cv_split_default = False
def _apply_default_pipeline_settings(pipeline):
    """Specialize the shared feature-data pipeline defaults for multi-label classification.

    Registers a sigmoid output, plain and class-weighted BCE-with-logits losses,
    and the multilabel accuracy metric (maximized, unstratified CV).
    """
    import torch.nn as nn

    from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
    from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
    from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
    from autoPyTorch.pipeline.nodes.train_node import TrainNode
    from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation
    from autoPyTorch.components.metrics.standard_metrics import multilabel_accuracy
    from autoPyTorch.components.preprocessing.loss_weight_strategies import LossWeightStrategyWeightedBinary

    # Start from the common feature-data defaults, then override per task.
    AutoNetFeatureData._apply_default_pipeline_settings(pipeline)

    pipeline[NetworkSelector.get_name()].add_final_activation('sigmoid', nn.Sigmoid())

    # BCEWithLogitsLoss applies the sigmoid internally; the trailing False flag
    # mirrors the cross-entropy registrations elsewhere in this file
    # (presumably "requires class labels" — confirm against LossModuleSelector).
    losses = pipeline[LossModuleSelector.get_name()]
    losses.add_loss_module('bce_with_logits', nn.BCEWithLogitsLoss, None, False)
    losses.add_loss_module('bce_with_logits_weighted', nn.BCEWithLogitsLoss,
                           LossWeightStrategyWeightedBinary(), False)

    pipeline[MetricSelector.get_name()].add_metric('multilabel_accuracy', multilabel_accuracy)

    # Accuracy is maximized.
    pipeline[TrainNode.get_name()].default_minimize_value = False

    # Stratification over multi-label targets is not supported here.
    pipeline[CrossValidation.get_name()].use_stratified_cv_split_default = False
def fit(self, pipeline_config, hyperparameter_config, X, Y, train_indices, valid_indices):
    """Resample the training split with the configured over-/under-sampling methods.

    Returns an empty dict when no target-size strategy is selected; otherwise a
    dict with the resampled X/Y (targets re-one-hot-encoded) and fresh
    train (and valid) indices.
    """
    hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)
    logger = logging.getLogger('autonet')

    # 'none' disables resampling entirely: leave the dataset untouched.
    if hyperparameter_config['target_size_strategy'] == 'none':
        return dict()

    over_sampler = self.over_sampling_methods[hyperparameter_config['over_sampling_method']](
        ConfigWrapper(hyperparameter_config['over_sampling_method'], hyperparameter_config))
    under_sampler = self.under_sampling_methods[hyperparameter_config['under_sampling_method']](
        ConfigWrapper(hyperparameter_config['under_sampling_method'], hyperparameter_config))
    size_strategy = self.target_size_strategies[hyperparameter_config['target_size_strategy']]()

    # The samplers operate on integer class labels, so collapse the one-hot targets;
    # the encoder fitted here restores the one-hot form afterwards.
    labels = np.argmax(Y[train_indices], axis=1).astype(int)
    ohe = OneHotEncoder(categories="auto", sparse=False)
    ohe.fit(labels.reshape((-1, 1)))

    over_target = size_strategy.over_sample_strategy(labels)
    under_target = size_strategy.under_sample_strategy(labels)

    logger.debug("Distribution before resample: " + str(np.unique(labels, return_counts=True)[1]))
    X_res, y_res = over_sampler.resample(X[train_indices], labels, over_target,
                                         pipeline_config["random_seed"])
    X_res, y_res = under_sampler.resample(X_res, y_res, under_target,
                                          pipeline_config["random_seed"])
    logger.debug("Distribution after resample: " + str(np.unique(y_res, return_counts=True)[1]))

    # Without a validation split, every resampled row belongs to the training set.
    if valid_indices is None:
        return {"X": X_res,
                "Y": ohe.transform(y_res.reshape((-1, 1))),
                "train_indices": np.array(list(range(X_res.shape[0])))}

    # Re-attach the untouched validation rows behind the resampled training data.
    X, Y, split_indices = CrossValidation.get_validation_set_split_indices(
        pipeline_config,
        X_train=X_res, X_valid=X[valid_indices],
        Y_train=ohe.transform(y_res.reshape((-1, 1))), Y_valid=Y[valid_indices],
        allow_shuffle=False)
    return {"X": X, "Y": Y,
            "train_indices": split_indices[0], "valid_indices": split_indices[1]}
def _apply_default_pipeline_settings(pipeline):
    """Specialize the shared feature-data pipeline defaults for classification.

    Registers softmax output, plain and class-weighted cross-entropy losses,
    the accuracy metric, the full set of resampling strategies, stratified CV
    and one-hot target encoding.

    Returns the (mutated) pipeline for convenient chaining.
    """
    import torch.nn as nn

    # Fix: ResamplingStrategySelector was imported twice in the original.
    from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
    from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
    from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
    from autoPyTorch.pipeline.nodes.train_node import TrainNode
    from autoPyTorch.pipeline.nodes.resampling_strategy_selector import ResamplingStrategySelector
    from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation
    from autoPyTorch.pipeline.nodes.one_hot_encoding import OneHotEncoding
    from autoPyTorch.components.preprocessing.resampling import (
        RandomOverSamplingWithReplacement, RandomUnderSamplingWithReplacement, SMOTE,
        TargetSizeStrategyAverageSample, TargetSizeStrategyDownsample,
        TargetSizeStrategyMedianSample, TargetSizeStrategyUpsample)
    from autoPyTorch.components.metrics.standard_metrics import accuracy
    from autoPyTorch.components.preprocessing.loss_weight_strategies import LossWeightStrategyWeighted

    # Start from the common feature-data defaults, then override per task.
    AutoNetFeatureData._apply_default_pipeline_settings(pipeline)

    # Softmax over the class dimension (dim=1).
    pipeline[NetworkSelector.get_name()].add_final_activation('softmax', nn.Softmax(1))

    loss_selector = pipeline[LossModuleSelector.get_name()]
    loss_selector.add_loss_module('cross_entropy', nn.CrossEntropyLoss, None, True)
    loss_selector.add_loss_module('cross_entropy_weighted', nn.CrossEntropyLoss,
                                  LossWeightStrategyWeighted(), True)

    pipeline[MetricSelector.get_name()].add_metric('accuracy', accuracy)

    # Class-imbalance handling: over-/under-sampling methods plus target-size strategies.
    resample_selector = pipeline[ResamplingStrategySelector.get_name()]
    resample_selector.add_over_sampling_method('random', RandomOverSamplingWithReplacement)
    resample_selector.add_over_sampling_method('smote', SMOTE)
    resample_selector.add_under_sampling_method('random', RandomUnderSamplingWithReplacement)
    resample_selector.add_target_size_strategy('upsample', TargetSizeStrategyUpsample)
    resample_selector.add_target_size_strategy('downsample', TargetSizeStrategyDownsample)
    resample_selector.add_target_size_strategy('average', TargetSizeStrategyAverageSample)
    resample_selector.add_target_size_strategy('median', TargetSizeStrategyMedianSample)

    # Accuracy is maximized; class labels allow stratified CV splits.
    pipeline[TrainNode.get_name()].default_minimize_value = False
    pipeline[CrossValidation.get_name()].use_stratified_cv_split_default = True

    # Targets must be one-hot encoded for the softmax/cross-entropy setup.
    pipeline[OneHotEncoding.get_name()].encode_Y = True
    return pipeline
def get_default_pipeline(cls):
    """Assemble and return the default feature-data pipeline.

    The per-fold nodes live inside CrossValidation, which itself sits inside
    the optimization algorithm; task-specific defaults are applied last via
    cls._apply_default_pipeline_settings.
    """
    from autoPyTorch.pipeline.base.pipeline import Pipeline
    from autoPyTorch.pipeline.nodes.autonet_settings import AutoNetSettings
    from autoPyTorch.pipeline.nodes.optimization_algorithm import OptimizationAlgorithm
    from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation
    from autoPyTorch.pipeline.nodes.imputation import Imputation
    from autoPyTorch.pipeline.nodes.normalization_strategy_selector import NormalizationStrategySelector
    from autoPyTorch.pipeline.nodes.one_hot_encoding import OneHotEncoding
    from autoPyTorch.pipeline.nodes.preprocessor_selector import PreprocessorSelector
    from autoPyTorch.pipeline.nodes.resampling_strategy_selector import ResamplingStrategySelector
    from autoPyTorch.pipeline.nodes.embedding_selector import EmbeddingSelector
    from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
    from autoPyTorch.pipeline.nodes.optimizer_selector import OptimizerSelector
    from autoPyTorch.pipeline.nodes.lr_scheduler_selector import LearningrateSchedulerSelector
    from autoPyTorch.pipeline.nodes.log_functions_selector import LogFunctionsSelector
    from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
    from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
    from autoPyTorch.pipeline.nodes.train_node import TrainNode

    # Per-fold node classes, in execution order.
    fold_node_classes = [
        Imputation,
        NormalizationStrategySelector,
        OneHotEncoding,
        PreprocessorSelector,
        ResamplingStrategySelector,
        EmbeddingSelector,
        NetworkSelector,
        OptimizerSelector,
        LearningrateSchedulerSelector,
        LogFunctionsSelector,
        MetricSelector,
        LossModuleSelector,
        TrainNode,
    ]

    pipeline = Pipeline([
        AutoNetSettings(),
        OptimizationAlgorithm([
            CrossValidation([node_cls() for node_cls in fold_node_classes])
        ])
    ])

    cls._apply_default_pipeline_settings(pipeline)
    return pipeline
def test_cross_validation(self):
    """CrossValidation must aggregate fold results correctly for k-fold,
    validation-split, stratified k-fold, and explicit validation data.

    Improvement over the original: the four near-identical config/dataset-info/
    fit_pipeline stanzas are folded into one nested helper.
    """

    class ResultNode(PipelineNode):
        # Minimal leaf node: report row sums of the train/valid partitions so
        # the aggregated loss/info reveal which indices each fold received.
        def fit(self, X, Y, train_indices, valid_indices):
            return {
                'loss': np.sum(X[valid_indices]),
                'info': {'a': np.sum(X[train_indices]), 'b': np.sum(X[valid_indices])}
            }

    pipeline = Pipeline([CrossValidation([ResultNode()])])
    pipeline["CrossValidation"].add_cross_validator("k_fold", KFold, lambda x: x.reshape((-1, )))
    pipeline["CrossValidation"].add_cross_validator("stratified_k_fold", StratifiedKFold, lambda x: x.reshape((-1, )))

    def run(x_train, y_train, x_valid=None, y_valid=None, **config_args):
        # One fit_pipeline invocation with a freshly built config and dataset info.
        pipeline_config = pipeline.get_pipeline_config(**config_args)
        pipeline_config_space = pipeline.get_hyperparameter_search_space(**pipeline_config)
        pipeline_config["random_seed"] = 42
        dataset_info = DataSetInfo()
        dataset_info.categorical_features = [None] * 3
        dataset_info.x_shape = x_train.shape
        dataset_info.y_shape = y_train.shape
        return pipeline.fit_pipeline(
            hyperparameter_config=pipeline_config_space, pipeline_config=pipeline_config,
            X_train=x_train, Y_train=y_train, X_valid=x_valid, Y_valid=y_valid,
            budget=5, budget_type=BudgetTypeEpochs, one_hot_encoder=None,
            optimize_start_time=time.time(), refit=False,
            dataset_info=dataset_info, rescore=False)

    x_train = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    y_train = np.array([[1], [0], [1]])

    # test cv_splits
    cv_result = run(x_train, y_train,
                    cross_validator="k_fold", cross_validator_args={"n_splits": 3})
    self.assertEqual(cv_result['loss'], 15)
    self.assertDictEqual(cv_result['info'], {'a': 30, 'b': 15})

    # test validation split
    cv_result = run(x_train, y_train, validation_split=0.3)
    self.assertEqual(cv_result['loss'], 24)
    self.assertDictEqual(cv_result['info'], {'a': 21, 'b': 24})

    # test stratified cv split
    x_valid = x_train
    y_valid = y_train
    x_train = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15], [16, 17, 18]])
    y_train = np.array([[1], [1], [0], [0], [1], [0]])
    cv_result = run(x_train, y_train,
                    cross_validator="stratified_k_fold", cross_validator_args={"n_splits": 3})
    self.assertEqual(cv_result['loss'], 57)
    self.assertDictEqual(cv_result['info'], {'a': 114, 'b': 57})

    # test explicit validation data with the default config
    cv_result = run(x_train, y_train, x_valid, y_valid)
    self.assertEqual(cv_result['loss'], 45)
    self.assertDictEqual(cv_result['info'], {'a': 171, 'b': 45})
def test_cross_validation(self):
    """Exercise CrossValidation with plain cv splits, a validation split,
    and stratified cv splits (legacy config-key API)."""

    class ResultNode(PipelineNode):
        # Leaf node returning row sums so each split can be identified by value.
        def fit(self, X_train, X_valid):
            return {
                'loss': np.sum(X_valid),
                'info': {'a': np.sum(X_train), 'b': np.sum(X_valid)}
            }

    pipeline = Pipeline([CrossValidation([ResultNode()])])

    features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    targets = np.array([[1], [0], [1]])

    # test cv_splits
    config = pipeline.get_pipeline_config(cv_splits=3)
    config_space = pipeline.get_hyperparameter_search_space(**config)
    config['categorical_features'] = None
    result = pipeline.fit_pipeline(
        hyperparameter_config=config_space, pipeline_config=config,
        X_train=features, Y_train=targets, X_valid=None, Y_valid=None,
        budget=5, budget_type=BudgetTypeEpochs, one_hot_encoder=None,
        optimize_start_time=time.time())
    self.assertEqual(result['loss'], 15)
    self.assertDictEqual(result['info'], {'a': 30, 'b': 15})

    # test validation split
    config = pipeline.get_pipeline_config(validation_split=0.3)
    config_space = pipeline.get_hyperparameter_search_space(**config)
    config['categorical_features'] = None
    result = pipeline.fit_pipeline(
        hyperparameter_config=config_space, pipeline_config=config,
        X_train=features, Y_train=targets, X_valid=None, Y_valid=None,
        budget=5, budget_type=BudgetTypeEpochs, one_hot_encoder=None,
        optimize_start_time=time.time())
    self.assertEqual(result['loss'], 24)
    self.assertDictEqual(result['info'], {'a': 21, 'b': 24})

    # test stratified cv split
    features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9],
                         [10, 11, 12], [13, 14, 15], [16, 17, 18]])
    targets = np.array([[1], [1], [0], [0], [1], [0]])
    config = pipeline.get_pipeline_config(cv_splits=3, use_stratified_cv_split=True)
    config_space = pipeline.get_hyperparameter_search_space(**config)
    config['categorical_features'] = None
    result = pipeline.fit_pipeline(
        hyperparameter_config=config_space, pipeline_config=config,
        X_train=features, Y_train=targets, X_valid=None, Y_valid=None,
        budget=5, budget_type=BudgetTypeEpochs, one_hot_encoder=None,
        optimize_start_time=time.time())
    self.assertEqual(result['loss'], 57)
    self.assertDictEqual(result['info'], {'a': 114, 'b': 57})