def get_default_pipeline(cls):
        from autoPyTorch.pipeline.base.pipeline import Pipeline
        from autoPyTorch.pipeline.nodes.autonet_settings import AutoNetSettings
        from autoPyTorch.pipeline.nodes.optimization_algorithm import OptimizationAlgorithm
        from autoPyTorch.pipeline.nodes.cross_validation import CrossValidation
        from autoPyTorch.pipeline.nodes.imputation import Imputation
        from autoPyTorch.pipeline.nodes.normalization_strategy_selector import NormalizationStrategySelector
        from autoPyTorch.pipeline.nodes.one_hot_encoding import OneHotEncoding
        from autoPyTorch.pipeline.nodes.preprocessor_selector import PreprocessorSelector
        from autoPyTorch.pipeline.nodes.resampling_strategy_selector import ResamplingStrategySelector
        from autoPyTorch.pipeline.nodes.embedding_selector import EmbeddingSelector
        from autoPyTorch.pipeline.nodes.network_selector import NetworkSelector
        from autoPyTorch.pipeline.nodes.optimizer_selector import OptimizerSelector
        from autoPyTorch.pipeline.nodes.lr_scheduler_selector import LearningrateSchedulerSelector
        from autoPyTorch.pipeline.nodes.log_functions_selector import LogFunctionsSelector
        from autoPyTorch.pipeline.nodes.metric_selector import MetricSelector
        from autoPyTorch.pipeline.nodes.loss_module_selector import LossModuleSelector
        from autoPyTorch.pipeline.nodes.train_node import TrainNode

        # build the pipeline
        pipeline = Pipeline([
            AutoNetSettings(),
            OptimizationAlgorithm([
                CrossValidation([
                    Imputation(),
                    NormalizationStrategySelector(),
                    OneHotEncoding(),
                    PreprocessorSelector(),
                    ResamplingStrategySelector(),
                    EmbeddingSelector(),
                    NetworkSelector(),
                    OptimizerSelector(),
                    LearningrateSchedulerSelector(),
                    LogFunctionsSelector(),
                    MetricSelector(),
                    LossModuleSelector(),
                    TrainNode()
                ])
            ])
        ])

        cls._apply_default_pipeline_settings(pipeline)
        return pipeline
    def test_cross_validation(self):

        class ResultNode(PipelineNode):
            def fit(self, X, Y, train_indices, valid_indices):
                return { 'loss': np.sum(X[valid_indices]), 'info': {'a': np.sum(X[train_indices]), 'b': np.sum(X[valid_indices])} }

        pipeline = Pipeline([
            CrossValidation([
                ResultNode()
            ])
        ])

        pipeline["CrossValidation"].add_cross_validator("k_fold", KFold, lambda x: x.reshape((-1 ,)))
        pipeline["CrossValidation"].add_cross_validator("stratified_k_fold", StratifiedKFold, lambda x: x.reshape((-1 ,)))

        x_train = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        y_train = np.array([[1], [0], [1]])

        # test cv_splits
        pipeline_config = pipeline.get_pipeline_config(cross_validator="k_fold", cross_validator_args={"n_splits": 3})
        pipeline_config_space = pipeline.get_hyperparameter_search_space(**pipeline_config)
        dataset_info = DataSetInfo()
        dataset_info.categorical_features = [None] * 3
        dataset_info.x_shape = x_train.shape
        dataset_info.y_shape = y_train.shape
        pipeline_config["random_seed"] = 42

        cv_result = pipeline.fit_pipeline(hyperparameter_config=pipeline_config_space, pipeline_config=pipeline_config, 
                                          X_train=x_train, Y_train=y_train, X_valid=None, Y_valid=None, 
                                          budget=5, budget_type=BudgetTypeEpochs, one_hot_encoder=None,
                                          optimize_start_time=time.time(), refit=False, dataset_info=dataset_info, rescore=False)

        self.assertEqual(cv_result['loss'], 15)
        self.assertDictEqual(cv_result['info'], {'a': 30, 'b': 15})

        
        # test validation split
        pipeline_config = pipeline.get_pipeline_config(validation_split=0.3)
        pipeline_config_space = pipeline.get_hyperparameter_search_space(**pipeline_config)
        pipeline_config['random_seed'] = 42
        dataset_info = DataSetInfo()
        dataset_info.categorical_features = [None] * 3
        dataset_info.x_shape = x_train.shape
        dataset_info.y_shape = y_train.shape

        cv_result = pipeline.fit_pipeline(hyperparameter_config=pipeline_config_space, pipeline_config=pipeline_config, 
                                          X_train=x_train, Y_train=y_train, X_valid=None, Y_valid=None, 
                                          budget=5, budget_type=BudgetTypeEpochs, one_hot_encoder=None,
                                          optimize_start_time=time.time(), refit=False, dataset_info=dataset_info, rescore=False)

        self.assertEqual(cv_result['loss'], 24)
        self.assertDictEqual(cv_result['info'], {'a': 21, 'b': 24})


        # test stratified cv split
        x_valid = x_train
        y_valid = y_train
        x_train = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15], [16, 17, 18]])
        y_train = np.array([[1], [1], [0], [0], [1], [0]])

        pipeline_config = pipeline.get_pipeline_config(cross_validator="stratified_k_fold", cross_validator_args={"n_splits": 3})
        pipeline_config_space = pipeline.get_hyperparameter_search_space(**pipeline_config)
        pipeline_config['random_seed'] = 42
        dataset_info = DataSetInfo()
        dataset_info.categorical_features = [None] * 3
        dataset_info.x_shape = x_train.shape
        dataset_info.y_shape = y_train.shape

        cv_result = pipeline.fit_pipeline(hyperparameter_config=pipeline_config_space, pipeline_config=pipeline_config, 
                                          X_train=x_train, Y_train=y_train, X_valid=None, Y_valid=None, 
                                          budget=5, budget_type=BudgetTypeEpochs, one_hot_encoder=None,
                                          optimize_start_time=time.time(), refit=False, dataset_info=dataset_info, rescore=False)

        self.assertEqual(cv_result['loss'], 57)
        self.assertDictEqual(cv_result['info'], {'a': 114, 'b': 57})

        pipeline_config = pipeline.get_pipeline_config()
        pipeline_config_space = pipeline.get_hyperparameter_search_space(**pipeline_config)
        pipeline_config['random_seed'] = 42
        dataset_info = DataSetInfo()
        dataset_info.categorical_features = [None] * 3
        dataset_info.x_shape = x_train.shape
        dataset_info.y_shape = y_train.shape

        cv_result = pipeline.fit_pipeline(hyperparameter_config=pipeline_config_space, pipeline_config=pipeline_config, 
                                          X_train=x_train, Y_train=y_train, X_valid=x_valid, Y_valid=y_valid, 
                                          budget=5, budget_type=BudgetTypeEpochs, one_hot_encoder=None,
                                          optimize_start_time=time.time(), refit=False, dataset_info=dataset_info, rescore=False)

        self.assertEqual(cv_result['loss'], 45)
        self.assertDictEqual(cv_result['info'], {'a': 171, 'b': 45})
Esempio n. 3
0
    def test_cross_validation(self):
        class ResultNode(PipelineNode):
            def fit(self, X_train, X_valid):
                return {
                    'loss': np.sum(X_valid),
                    'info': {
                        'a': np.sum(X_train),
                        'b': np.sum(X_valid)
                    }
                }

        pipeline = Pipeline([CrossValidation([ResultNode()])])

        x_train = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        y_train = np.array([[1], [0], [1]])

        # test cv_splits
        pipeline_config = pipeline.get_pipeline_config(cv_splits=3)
        pipeline_config_space = pipeline.get_hyperparameter_search_space(
            **pipeline_config)
        pipeline_config['categorical_features'] = None

        cv_result = pipeline.fit_pipeline(
            hyperparameter_config=pipeline_config_space,
            pipeline_config=pipeline_config,
            X_train=x_train,
            Y_train=y_train,
            X_valid=None,
            Y_valid=None,
            budget=5,
            budget_type=BudgetTypeEpochs,
            one_hot_encoder=None,
            optimize_start_time=time.time())

        self.assertEqual(cv_result['loss'], 15)
        self.assertDictEqual(cv_result['info'], {'a': 30, 'b': 15})

        # test validation split
        pipeline_config = pipeline.get_pipeline_config(validation_split=0.3)
        pipeline_config_space = pipeline.get_hyperparameter_search_space(
            **pipeline_config)
        pipeline_config['categorical_features'] = None

        cv_result = pipeline.fit_pipeline(
            hyperparameter_config=pipeline_config_space,
            pipeline_config=pipeline_config,
            X_train=x_train,
            Y_train=y_train,
            X_valid=None,
            Y_valid=None,
            budget=5,
            budget_type=BudgetTypeEpochs,
            one_hot_encoder=None,
            optimize_start_time=time.time())

        self.assertEqual(cv_result['loss'], 24)
        self.assertDictEqual(cv_result['info'], {'a': 21, 'b': 24})

        # test stratified cv split
        x_train = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12],
                            [13, 14, 15], [16, 17, 18]])
        y_train = np.array([[1], [1], [0], [0], [1], [0]])

        pipeline_config = pipeline.get_pipeline_config(
            cv_splits=3, use_stratified_cv_split=True)
        pipeline_config_space = pipeline.get_hyperparameter_search_space(
            **pipeline_config)
        pipeline_config['categorical_features'] = None

        cv_result = pipeline.fit_pipeline(
            hyperparameter_config=pipeline_config_space,
            pipeline_config=pipeline_config,
            X_train=x_train,
            Y_train=y_train,
            X_valid=None,
            Y_valid=None,
            budget=5,
            budget_type=BudgetTypeEpochs,
            one_hot_encoder=None,
            optimize_start_time=time.time())

        self.assertEqual(cv_result['loss'], 57)
        self.assertDictEqual(cv_result['info'], {'a': 114, 'b': 57})