Example #1
    def test_fit_roar(self):
        def get_roar_object_callback(scenario_dict, seed, ta, **kwargs):
            """Random online adaptive racing.

            http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
            scenario = Scenario(scenario_dict)
            return ROAR(
                scenario=scenario,
                rng=seed,
                tae_runner=ta,
            )

        output = os.path.join(self.test_dir, '..', '.tmp_test_fit_roar')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
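        # The backend manages all run artefacts; here one directory serves as both temporary and output space.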
        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(
            backend=backend_api,
            time_left_for_this_task=20,
            per_run_time_limit=5,
            initial_configurations_via_metalearning=0,
            get_smac_object_callback=get_roar_object_callback,
        )
        automl.fit(X_train, Y_train, metric=accuracy)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #2
    def test_binary_score(self):
        """
        Test fix for binary classification prediction
        taking the index 1 of second dimension in prediction matrix
        """

        output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score')
        self._setUp(output)

        data = sklearn.datasets.make_classification(
            n_samples=1000, n_features=20, n_redundant=5, n_informative=5,
            n_repeated=2, n_clusters_per_class=2, random_state=1)
        X_train = data[0][:700]
        Y_train = data[1][:700]
        X_test = data[0][700:]
        Y_test = data[1][700:]

        backend_api = backend.create(output, output)
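        # AutoML positional arguments: backend, time_left_for_this_task (s), per_run_time_limit (s).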
        automl = autosklearn.automl.AutoML(backend_api, 15, 5)
        automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION)
        self.assertEqual(automl._task, BINARY_CLASSIFICATION)

        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.5)

        del automl
        self._tearDown(output)
Example #3
    def test_file_output(self):
        self.output_dir = os.path.join(os.getcwd(), '.test_file_output')

        D = get_regression_datamanager()
        D.name = 'test'

        configuration_space = get_configuration_space(D.info)

        configuration = configuration_space.sample_configuration()
        backend_api = backend.create(self.output_dir, self.output_dir)
        evaluator = HoldoutEvaluator(D,
                                     backend_api,
                                     configuration,
                                     with_predictions=True,
                                     all_scoring_functions=True,
                                     output_y_test=True)

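        # Fit on the holdout split and collect predictions for the optimization, validation and test sets.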
        loss, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
            evaluator.fit_predict_and_loss()
        evaluator.file_output(loss, Y_optimization_pred, Y_valid_pred,
                              Y_test_pred)

        self.assertTrue(
            os.path.exists(
                os.path.join(self.output_dir, '.auto-sklearn',
                             'true_targets_ensemble.npy')))
Example #4
    def test_do_dummy_prediction(self):
        for name in ['401_bac', '31_bac', 'adult', 'cadata']:
            output = os.path.join(self.test_dir, '..',
                                  '.tmp_test_do_dummy_prediction')
            self._setUp(output)

            dataset = os.path.join(self.test_dir, '..', '.data', name)

            backend_api = backend.create(output, output)
            auto = autosklearn.automl.AutoML(
                backend_api, 20, 5,
                initial_configurations_via_metalearning=25)
            setup_logger()
            auto._logger = get_logger('test_do_dummy_predictions')
            auto._backend._make_internals_directory()
            D = load_data(dataset, backend_api)
            auto._backend.save_datamanager(D)
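            # Fit the dummy baseline as run number 1; its predictions land in the backend's internal directory.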
            auto._do_dummy_prediction(D, 1)

            # Ensure that the dummy predictions are not in the current working
            # directory, but in the output directory (under output)
            self.assertFalse(os.path.exists(os.path.join(os.getcwd(),
                                                         '.auto-sklearn')))
            self.assertTrue(os.path.exists(os.path.join(
                output, '.auto-sklearn', 'predictions_ensemble',
                'predictions_ensemble_1_00001.npy')))

            del auto
            self._tearDown(output)
Example #5
    def test_do_dummy_prediction(self):
        for name in ['401_bac', '31_bac', 'adult', 'cadata']:
            output = os.path.join(self.test_dir, '..',
                                  '.tmp_test_do_dummy_prediction')
            self._setUp(output)

            dataset = os.path.join(self.test_dir, '..', '.data', name)

            backend_api = backend.create(output, output)
            auto = autosklearn.automl.AutoML(
                backend_api, 15, 5,
                initial_configurations_via_metalearning=25)
            setup_logger()
            auto._logger = get_logger('test_do_dummy_predictions')
            auto._backend._make_internals_directory()
            D = load_data(dataset, backend_api)
            auto._backend.save_datamanager(D)
            auto._do_dummy_prediction(D, 1)

            # Ensure that the dummy predictions are not in the current working
            # directory, but in the output directory (under output)
            self.assertFalse(os.path.exists(os.path.join(os.getcwd(),
                                                         '.auto-sklearn')))
            self.assertTrue(os.path.exists(os.path.join(
                output, '.auto-sklearn', 'predictions_ensemble',
                'predictions_ensemble_1_00001.npy')))

            del auto
            self._tearDown(output)
Example #6
    def test_binary_score_and_include(self):
        """
        Test fix for binary classification prediction
        taking the index 1 of second dimension in prediction matrix
        """

        output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score')
        self._setUp(output)

        data = sklearn.datasets.make_classification(
            n_samples=400, n_features=10, n_redundant=1, n_informative=3,
            n_repeated=1, n_clusters_per_class=2, random_state=1)
        X_train = data[0][:200]
        Y_train = data[1][:200]
        X_test = data[0][200:]
        Y_test = data[1][200:]

        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 20, 5,
                                           include_estimators=['sgd'],
                                           include_preprocessors=['no_preprocessing'])
        automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION,
                   metric=accuracy)
        self.assertEqual(automl._task, BINARY_CLASSIFICATION)

        # TODO, the assumption from above is not really tested here
        # Also, the score method should be removed, it only makes little sense
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.4)

        del automl
        self._tearDown(output)
Example #7
 def test_eval_cv_on_subset(self):
     backend_api = backend.create(self.tmp_dir, self.tmp_dir)
     eval_cv(self.queue, self.configuration, self.data, backend_api, 1, 1,
             5, 45, True, False, True)
     info = self.queue.get()
     self.assertAlmostEqual(info[1], 0.063004032258064502)
     self.assertEqual(info[2], 1)
Example #8
    def build_automl(
        self,
        seed: int,
        shared_mode: bool,
        ensemble_size: int,
        initial_configurations_via_metalearning: int,
        tmp_folder: str,
        output_folder: str,
        smac_scenario_args: Optional[Dict] = None,
    ):

        if shared_mode:
            self.delete_output_folder_after_terminate = False
            self.delete_tmp_folder_after_terminate = False
            if tmp_folder is None:
                raise ValueError("If shared_mode == True tmp_folder must not "
                                 "be None.")
            if output_folder is None:
                raise ValueError("If shared_mode == True output_folder must "
                                 "not be None.")

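        # The backend bundles the temporary and output directories plus the cleanup policy for the whole run.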
        backend = create(
            temporary_directory=tmp_folder,
            output_directory=output_folder,
            delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
            delete_output_folder_after_terminate=self.delete_output_folder_after_terminate,
            shared_mode=shared_mode,
        )

        if smac_scenario_args is None:
            smac_scenario_args = self.smac_scenario_args

        automl = self._get_automl_class()(
            backend=backend,
            time_left_for_this_task=self.time_left_for_this_task,
            per_run_time_limit=self.per_run_time_limit,
            initial_configurations_via_metalearning=initial_configurations_via_metalearning,
            ensemble_size=ensemble_size,
            ensemble_nbest=self.ensemble_nbest,
            ensemble_memory_limit=self.ensemble_memory_limit,
            seed=seed,
            ml_memory_limit=self.ml_memory_limit,
            include_estimators=self.include_estimators,
            exclude_estimators=self.exclude_estimators,
            include_preprocessors=self.include_preprocessors,
            exclude_preprocessors=self.exclude_preprocessors,
            resampling_strategy=self.resampling_strategy,
            resampling_strategy_arguments=self.resampling_strategy_arguments,
            shared_mode=shared_mode,
            get_smac_object_callback=self.get_smac_object_callback,
            disable_evaluator_output=self.disable_evaluator_output,
            smac_scenario_args=smac_scenario_args,
            logging_config=self.logging_config,
            metadata_directory=self.metadata_directory,
            write_history=self.write_history,
            read_history=self.read_history,
        )

        return automl
Example #9
 def test_eval_holdout_on_subset(self):
     backend_api = backend.create(self.tmp_dir, self.tmp_dir)
     eval_holdout(self.queue, self.configuration, self.data, backend_api, 1,
                  1, 43, True, False, True)
     info = self.queue.get()
     self.assertAlmostEqual(info[1], 0.1)
     self.assertEqual(info[2], 1)
Example #10
 def test_eval_holdout_iterative_fit_no_timeout(self):
     backend_api = backend.create(self.tmp_dir, self.tmp_dir)
     eval_iterative_holdout(self.queue, self.configuration, self.data,
                            backend_api, 1, 1, None, True, False, True)
     info = self.queue.get()
     self.assertAlmostEqual(info[1], 0.05)
     self.assertEqual(info[2], 1)
Example #11
File: base.py Project: zwcdp/auto-sklearn
 def _create_backend(self, test_name):
     tmp = os.path.join(self.test_dir, '..', '.tmp._%s' % test_name)
     output = os.path.join(self.test_dir, '..', '.output._%s' % test_name)
     # Make sure the folders we want to create do not already exist.
     self._setUp(tmp)
     self._setUp(output)
     backend = create(tmp, output)
     return backend
Example #13
 def test_eval_holdout(self):
     backend_api = backend.create(self.tmp_dir, self.tmp_dir)
     eval_holdout(self.queue, self.configuration, self.data, backend_api, 1,
                  1, None, True, False, True)
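     # info[1] is the reported loss, info[2] the status flag (1 presumably meaning success).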
     info = self.queue.get()
     self.assertAlmostEqual(info[1], 0.05)
     self.assertEqual(info[2], 1)
     self.assertNotIn('bac_metric', info[3])
Example #14
    def test_automl_outputs(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_automl_outputs')
        self._setUp(output)
        name = '31_bac'
        dataset = os.path.join(self.test_dir, '..', '.data', name)
        data_manager_file = os.path.join(output, '.auto-sklearn',
                                         'datamanager.pkl')

        backend_api = backend.create(output, output)
        auto = autosklearn.automl.AutoML(
            backend_api,
            20,
            5,
            initial_configurations_via_metalearning=0,
            seed=100,
        )
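        # seed=100 is embedded in the artefact names asserted below (start_time_100, 100.1.model, ...).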
        auto.fit_automl_dataset(dataset, accuracy)

        # pickled data manager (without one hot encoding!)
        with open(data_manager_file, 'rb') as fh:
            D = pickle.load(fh)
            self.assertTrue(
                np.allclose(D.data['X_train'][0, :3], [1., 12., 2.]))

        # Check that all directories are there
        fixture = [
            'predictions_valid', 'true_targets_ensemble.npy', 'start_time_100',
            'datamanager.pkl', 'predictions_ensemble', 'ensembles',
            'predictions_test', 'models'
        ]
        self.assertEqual(
            sorted(os.listdir(os.path.join(output, '.auto-sklearn'))),
            sorted(fixture))

        # At least one ensemble prediction, one validation prediction,
        # one test prediction, one model and one ensemble
        fixture = os.listdir(
            os.path.join(output, '.auto-sklearn', 'predictions_ensemble'))
        self.assertIn('predictions_ensemble_100_00001.npy', fixture)

        fixture = os.listdir(os.path.join(output, '.auto-sklearn', 'models'))
        self.assertIn('100.1.model', fixture)

        fixture = os.listdir(os.path.join(output, '.auto-sklearn',
                                          'ensembles'))
        self.assertIn('100.0000000000.ensemble', fixture)

        # Start time
        start_time_file_path = os.path.join(output, '.auto-sklearn',
                                            "start_time_100")
        with open(start_time_file_path, 'r') as fh:
            start_time = float(fh.read())
        self.assertGreaterEqual(time.time() - start_time, 10)

        del auto
        self._tearDown(output)
Example #15
 def test_eval_cv_all_loss_functions(self):
     backend_api = backend.create(self.tmp_dir, self.tmp_dir)
     eval_cv(self.queue, self.configuration, self.data, backend_api, 1, 1,
             5, None, True, True, True)
     info = self.queue.get()
     self.assertIn(
         'f1_metric: 0.0794451450189;pac_metric: 0.344745492187;'
         'acc_metric: 0.075;auc_metric: 0.0285222960152;'
         'bac_metric: 0.0796370967742;duration: ', info[3])
     self.assertAlmostEqual(info[1], 0.079637096774193727)
     self.assertEqual(info[2], 1)
Example #16
 def test_eval_holdout_all_loss_functions(self):
     backend_api = backend.create(self.tmp_dir, self.tmp_dir)
     eval_holdout(self.queue, self.configuration, self.data, backend_api, 1,
                  1, None, True, True, True)
     info = self.queue.get()
     self.assertIn(
         'f1_metric: 0.0480549199085;pac_metric: 0.135572680594;'
         'acc_metric: 0.0454545454545;auc_metric: 0.0;'
         'bac_metric: 0.05;duration: ', info[3])
     self.assertAlmostEqual(info[1], 0.05)
     self.assertEqual(info[2], 1)
Example #17
def evaluate(input_directory, validation_files, test_files, ensemble_size=50):

    backend = create(input_directory,
                     input_directory + "_output",
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)

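    # shared_mode attaches to an existing run's directories so stored targets and predictions can be loaded.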
    valid_labels = backend.load_targets_ensemble()
    D = backend.load_datamanager()
    test_labels = D.data["Y_test"]

    score = balanced_accuracy

    # Read the modification time of the predictions file and
    # compute the interval to the first prediction file.
    # This interval will be added to the time needed to build the ensemble.
    time_function_evaluation = validation_files[-1][-1]

    # Build the ensemble
    start = time.time()
    ensemble_selection = EnsembleSelection(
        ensemble_size=ensemble_size,
        task_type=D.info['task'],
        metric=score,
        random_state=np.random.RandomState())

    validation_predictions = np.array([v[0] for v in validation_files])
    test_predictions = np.array([t[0] for t in test_files])

    ensemble_selection.fit(validation_predictions,
                           valid_labels,
                           identifiers=None)
    y_hat_ensemble = ensemble_selection.predict(
        np.array(validation_predictions))
    y_hat_test = ensemble_selection.predict(np.array(test_predictions))

    # Compute validation error
    ensemble_error = 1 - score(valid_labels, y_hat_ensemble)

    # Compute test error
    ensemble_test_error = 1 - score(test_labels, y_hat_test)

    ensemble_time = time.time() - start

    rval = {
        'ensemble_time': ensemble_time,
        'time_function_evaluation': time_function_evaluation,
        'ensemble_error': ensemble_error,
        'ensemble_test_error': ensemble_test_error
    }

    return rval
Example #18
    def test_automl_outputs(self):
        output = os.path.join(self.test_dir, '..',
                              '.tmp_test_automl_outputs')
        self._setUp(output)
        name = '31_bac'
        dataset = os.path.join(self.test_dir, '..', '.data', name)
        data_manager_file = os.path.join(output, '.auto-sklearn',
                                         'datamanager.pkl')

        backend_api = backend.create(output, output)
        auto = autosklearn.automl.AutoML(
            backend_api, 15, 5,
            initial_configurations_via_metalearning=25,
            seed=100)
        auto.fit_automl_dataset(dataset)

        # pickled data manager (without one hot encoding!)
        with open(data_manager_file, 'rb') as fh:
            D = six.moves.cPickle.load(fh)
            self.assertTrue(np.allclose(D.data['X_train'][0, :3],
                                        [1., 12., 2.]))

        # Check that all directories are there
        fixture = ['predictions_valid', 'true_targets_ensemble.npy',
                   'start_time_100', 'datamanager.pkl', 'predictions_ensemble',
                   'ensembles', 'predictions_test', 'models']
        self.assertEqual(sorted(os.listdir(os.path.join(output,
                                                        '.auto-sklearn'))),
                         sorted(fixture))

        # At least one ensemble prediction, one validation prediction,
        # one test prediction, one model and one ensemble
        fixture = os.listdir(os.path.join(output, '.auto-sklearn',
                                          'predictions_ensemble'))
        self.assertIn('predictions_ensemble_100_00001.npy', fixture)

        fixture = os.listdir(os.path.join(output, '.auto-sklearn',
                                          'models'))
        self.assertIn('100.1.model', fixture)

        fixture = os.listdir(os.path.join(output, '.auto-sklearn',
                                          'ensembles'))
        self.assertIn('100.0000000000.ensemble', fixture)

        # Start time
        start_time_file_path = os.path.join(output, '.auto-sklearn',
                                            "start_time_100")
        with open(start_time_file_path, 'r') as fh:
            start_time = float(fh.read())
        self.assertGreaterEqual(time.time() - start_time, 10)

        del auto
        self._tearDown(output)
Example #19
 def test_eval_partial_cv(self):
     results = [
         0.071428571428571508, 0.071428571428571508, 0.08333333333333337,
         0.16666666666666674, 0.0
     ]
     for fold in range(5):
         backend_api = backend.create(self.tmp_dir, self.tmp_dir)
         eval_partial_cv(self.queue, self.configuration, self.data,
                         backend_api, 1, 1, fold, 5, None, True, False,
                         True)
         info = self.queue.get()
         self.assertAlmostEqual(info[1], results[fold])
         self.assertEqual(info[2], 1)
Example #20
    def test_fit(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 20, 5)
        automl.fit(X_train, Y_train, metric=accuracy)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #21
    def build_automl(
        self,
        seed: int,
        ensemble_size: int,
        initial_configurations_via_metalearning: int,
        tmp_folder: str,
        output_folder: str,
        smac_scenario_args: Optional[Dict] = None,
    ):

        backend = create(
            temporary_directory=tmp_folder,
            output_directory=output_folder,
            delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
            delete_output_folder_after_terminate=self.delete_output_folder_after_terminate,
        )

        if smac_scenario_args is None:
            smac_scenario_args = self.smac_scenario_args

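        # n_jobs and dask_client are forwarded so that model fitting can run in parallel.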
        automl = self._get_automl_class()(
            backend=backend,
            time_left_for_this_task=self.time_left_for_this_task,
            per_run_time_limit=self.per_run_time_limit,
            initial_configurations_via_metalearning=initial_configurations_via_metalearning,
            ensemble_size=ensemble_size,
            ensemble_nbest=self.ensemble_nbest,
            max_models_on_disc=self.max_models_on_disc,
            ensemble_memory_limit=self.ensemble_memory_limit,
            seed=seed,
            ml_memory_limit=self.ml_memory_limit,
            include_estimators=self.include_estimators,
            exclude_estimators=self.exclude_estimators,
            include_preprocessors=self.include_preprocessors,
            exclude_preprocessors=self.exclude_preprocessors,
            resampling_strategy=self.resampling_strategy,
            resampling_strategy_arguments=self.resampling_strategy_arguments,
            n_jobs=self._n_jobs,
            dask_client=self.dask_client,
            get_smac_object_callback=self.get_smac_object_callback,
            disable_evaluator_output=self.disable_evaluator_output,
            smac_scenario_args=smac_scenario_args,
            logging_config=self.logging_config,
            metadata_directory=self.metadata_directory,
            metric=self._metric)

        return automl
Example #22
    def test_fit(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 15, 5)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #23
    def test_iterative_holdout_not_iterative(self, pipeline_mock):
        # Regular fitting
        D = get_binary_classification_datamanager()
        D.name = 'test'

        Xt_fixture = 'Xt_fixture'
        pipeline_mock.estimator_supports_iterative_fit.return_value = False
        pipeline_mock.fit_transformer.return_value = Xt_fixture, {}
        pipeline_mock.predict_proba.side_effect = lambda X, batch_size: np.tile(
            [0.6, 0.4], (len(X), 1))
        pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
        pipeline_mock.get_additional_run_info.return_value = None
        output_dir = os.path.join(os.getcwd(),
                                  '.test_iterative_holdout_not_iterative')

        configuration = unittest.mock.Mock(spec=Configuration)
        backend_api = backend.create(output_dir, output_dir)
        backend_api.load_datamanager = lambda: D
        queue_ = multiprocessing.Queue()

        evaluator = TrainEvaluator(backend_api,
                                   queue_,
                                   configuration=configuration,
                                   resampling_strategy='holdout-iterative-fit',
                                   all_scoring_functions=False,
                                   output_y_hat_optimization=True,
                                   metric=accuracy)
        evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
        evaluator.file_output.return_value = (None, None)

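        # The mocked pipeline reports no iterative-fit support, so iterative=True must fall back to a single regular fit.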
        evaluator.fit_predict_and_loss(iterative=True)
        self.assertEqual(evaluator.file_output.call_count, 1)

        rval = evaluator.queue.get(timeout=1)
        self.assertAlmostEqual(rval['loss'], 0.47826086956521741)
        self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)

        self.assertEqual(pipeline_mock.iterative_fit.call_count, 0)
        # three calls: one predict_proba call each for the holdout, the
        # validation and the test set
        self.assertEqual(evaluator.model.predict_proba.call_count, 3)
        self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23)
        self.assertEqual(evaluator.file_output.call_args[0][1].shape[0],
                         D.data['Y_valid'].shape[0])
        self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
                         D.data['Y_test'].shape[0])
        self.assertEqual(evaluator.file_output.call_count, 1)
        self.assertEqual(evaluator.model.fit.call_count, 1)
Example #24
    def test_finish_up_model_predicts_NaN(self):
        '''Tests by handing in predictions which contain NaNs'''
        rs = np.random.RandomState(1)
        D = get_multiclass_classification_datamanager()
        output_dir = os.path.join(
            os.getcwd(), '.test_finish_up_model_predicts_NaN')

        try:
            shutil.rmtree(output_dir)
        except OSError:
            pass

        backend_api = backend.create(output_dir, output_dir)
        ae = AbstractEvaluator(Datamanager=D, backend=backend_api,
                               output_y_test=False)
        ae.Y_optimization = rs.rand(33, 3)
        predictions_ensemble = rs.rand(33, 3)
        predictions_test = rs.rand(25, 3)
        predictions_valid = rs.rand(25, 3)

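        # Each NaN case below must be rejected with the fixed penalty loss of 2.0 and write no output files.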
        # NaNs in prediction ensemble
        predictions_ensemble[5, 2] = np.NaN
        _, loss, _, additional_run_info = ae.finish_up(
            0.1, predictions_ensemble, predictions_valid, predictions_test)
        self.assertEqual(loss, 2.0)
        self.assertEqual(additional_run_info, 'Model predictions for '
                                              'optimization set contains NaNs.')

        # NaNs in prediction validation
        predictions_ensemble[5, 2] = 0.5
        predictions_valid[5, 2] = np.NaN
        _, loss, _, additional_run_info = ae.finish_up(
            0.1, predictions_ensemble, predictions_valid, predictions_test)
        self.assertEqual(loss, 2.0)
        self.assertEqual(additional_run_info, 'Model predictions for '
                                              'validation set contains NaNs.')

        # NaNs in prediction test
        predictions_valid[5, 2] = 0.5
        predictions_test[5, 2] = np.NaN
        _, loss, _, additional_run_info = ae.finish_up(
            0.1, predictions_ensemble, predictions_valid, predictions_test)
        self.assertEqual(loss, 2.0)
        self.assertEqual(additional_run_info, 'Model predictions for '
                                              'test set contains NaNs.')

        self.assertEqual(len(os.listdir(os.path.join(output_dir,
                                                   '.auto-sklearn'))), 0)
Example #25
    def test_fit(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 30, 5)
        automl.fit(X_train, Y_train)
        #print(automl.show_models(), flush=True)
        #print(automl.cv_results_, flush=True)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #26
    def test_holdout(self, pipeline_mock):
        D = get_binary_classification_datamanager()
        D.name = 'test'

        pipeline_mock.predict_proba.side_effect = lambda X, batch_size: np.tile(
            [0.6, 0.4], (len(X), 1))
        pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
        pipeline_mock.get_additional_run_info.return_value = None
        output_dir = os.path.join(os.getcwd(), '.test_holdout')

        configuration = unittest.mock.Mock(spec=Configuration)
        backend_api = backend.create(output_dir, output_dir)
        backend_api.load_datamanager = lambda: D
        queue_ = multiprocessing.Queue()

        evaluator = TrainEvaluator(
            backend_api,
            queue_,
            configuration=configuration,
            resampling_strategy='holdout',
            resampling_strategy_args={'train_size': 0.66},
            all_scoring_functions=False,
            output_y_hat_optimization=True,
            metric=accuracy,
            subsample=50)
        evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
        evaluator.file_output.return_value = (None, None)

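        # With train_size=0.66, the remaining third of the training data forms the optimization (holdout) set asserted on below.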
        evaluator.fit_predict_and_loss()

        rval = get_last_result(evaluator.queue)
        result = rval['loss']
        self.assertEqual(len(rval), 3)
        self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)

        self.assertEqual(evaluator.file_output.call_count, 1)
        self.assertEqual(result, 0.45833333333333337)
        self.assertEqual(pipeline_mock.fit.call_count, 1)
        # three calls because of the holdout, the validation and the test set
        self.assertEqual(pipeline_mock.predict_proba.call_count, 3)
        self.assertEqual(evaluator.file_output.call_count, 1)
        self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 24)
        self.assertEqual(evaluator.file_output.call_args[0][1].shape[0],
                         D.data['Y_valid'].shape[0])
        self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
                         D.data['Y_test'].shape[0])
        self.assertEqual(evaluator.model.fit.call_count, 1)
Example #27
    def test_cv(self, pipeline_mock):
        D = get_binary_classification_datamanager()

        pipeline_mock.predict_proba.side_effect = lambda X, batch_size: np.tile(
            [0.6, 0.4], (len(X), 1))
        pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
        pipeline_mock.get_additional_run_info.return_value = None
        output_dir = os.path.join(os.getcwd(), '.test_cv')

        configuration = unittest.mock.Mock(spec=Configuration)
        backend_api = backend.create(output_dir, output_dir)
        backend_api.load_datamanager = lambda: D
        queue_ = multiprocessing.Queue()

        evaluator = TrainEvaluator(backend_api,
                                   queue_,
                                   configuration=configuration,
                                   resampling_strategy='cv',
                                   resampling_strategy_args={'folds': 5},
                                   all_scoring_functions=False,
                                   output_y_hat_optimization=True,
                                   metric=accuracy)
        evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
        evaluator.file_output.return_value = (None, None)

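        # 5-fold CV fits one pipeline per fold, hence the five fit calls asserted below.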
        evaluator.fit_predict_and_loss()

        rval = get_last_result(evaluator.queue)
        result = rval['loss']
        self.assertEqual(len(rval), 3)
        self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)

        self.assertEqual(evaluator.file_output.call_count, 1)
        self.assertEqual(result, 0.46376811594202894)
        self.assertEqual(pipeline_mock.fit.call_count, 5)
        # Fifteen calls: five folds, each predicting on the holdout, the
        # validation and the test set
        self.assertEqual(pipeline_mock.predict_proba.call_count, 15)
        self.assertEqual(evaluator.file_output.call_args[0][0].shape[0],
                         D.data['Y_train'].shape[0])
        self.assertEqual(evaluator.file_output.call_args[0][1].shape[0],
                         D.data['Y_valid'].shape[0])
        self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
                         D.data['Y_test'].shape[0])
        # The model prior to fitting is saved, this cannot be directly tested
        # because of the way the mock module is used. Instead, we test whether
        # the if block in which model assignment is done is accessed
        self.assertTrue(evaluator._added_empty_model)
Example #28
 def _create_backend(self,
                     test_name,
                     delete_tmp_folder_after_terminate=True,
                     delete_output_folder_after_terminate=True):
     tmp = os.path.join(self.test_dir, '..', '.tmp._%s' % test_name)
     output = os.path.join(self.test_dir, '..', '.output._%s' % test_name)
     # Make sure the folders we want to create do not already exist.
     self._setUp(tmp)
     self._setUp(output)
     backend = create(
         tmp,
         output,
         delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
         delete_output_folder_after_terminate=delete_output_folder_after_terminate,
     )
     return backend
Example #29
def get_meta_learning_configs(X,
                              y,
                              task_type,
                              dataset_name='default',
                              metric='accuracy',
                              num_cfgs=5):
    if X is None or y is None:
        X, y, _ = load_data(dataset_name)
    backend = create(temporary_directory=None,
                     output_directory=None,
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)
    dm = XYDataManager(X, y, None, None, task_type, None, dataset_name)

    configuration_space = pipeline.get_configuration_space(
        dm.info,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None)

    watcher = StopWatch()
    name = os.path.basename(dm.name)
    watcher.start_task(name)

    def reset_data_manager(max_mem=None):
        pass

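    # AutoMLSMBO is instantiated only to query metalearning suggestions; the generous limits are effectively unused here.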
    automlsmbo = AutoMLSMBO(
        config_space=configuration_space,
        dataset_name=dataset_name,
        backend=backend,
        total_walltime_limit=1e5,
        func_eval_time_limit=1e5,
        memory_limit=1e5,
        metric=metric,
        watcher=watcher,
        metadata_directory='components/meta_learning/meta_resource',
        num_metalearning_cfgs=num_cfgs)
    automlsmbo.reset_data_manager = reset_data_manager
    automlsmbo.task = task_type
    automlsmbo.datamanager = dm
    configs = automlsmbo.get_metalearning_suggestions()
    return configs
Example #30
    def test_fit(self):
        if self.travis:
            self.skipTest('This test does currently not run on travis-ci. '
                          'Make sure it runs locally on your machine!')

        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 15, 15)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #31
    def test_fit_roar(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_fit_roar')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        backend_api = backend.create(output, output)
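        # configuration_mode='ROAR' is the older way to select ROAR; Example #1 shows the newer callback-based interface.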
        automl = autosklearn.automl.AutoML(backend_api, 20, 5,
                                           initial_configurations_via_metalearning=0,
                                           configuration_mode='ROAR')
        automl.fit(X_train, Y_train, metric=accuracy)
        # print(automl.show_models(), flush=True)
        # print(automl.cv_results_, flush=True)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #32
    def build_automl(self):
        if self.shared_mode:
            self.delete_output_folder_after_terminate = False
            self.delete_tmp_folder_after_terminate = False
            if self.tmp_folder is None:
                raise ValueError("If shared_mode == True tmp_folder must not "
                                 "be None.")
            if self.output_folder is None:
                raise ValueError("If shared_mode == True output_folder must "
                                 "not be None.")

        backend = create(temporary_directory=self.tmp_folder,
                         output_directory=self.output_folder,
                         delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
                         delete_output_folder_after_terminate=self.delete_output_folder_after_terminate,
                         shared_mode=self.shared_mode)
        automl = self._get_automl_class()(
            backend=backend,
            time_left_for_this_task=self.time_left_for_this_task,
            per_run_time_limit=self.per_run_time_limit,
            initial_configurations_via_metalearning=self.initial_configurations_via_metalearning,
            ensemble_size=self.ensemble_size,
            ensemble_nbest=self.ensemble_nbest,
            seed=self.seed,
            ml_memory_limit=self.ml_memory_limit,
            include_estimators=self.include_estimators,
            exclude_estimators=self.exclude_estimators,
            include_preprocessors=self.include_preprocessors,
            exclude_preprocessors=self.exclude_preprocessors,
            resampling_strategy=self.resampling_strategy,
            resampling_strategy_arguments=self.resampling_strategy_arguments,
            shared_mode=self.shared_mode,
            get_smac_object_callback=self.get_smac_object_callback,
            disable_evaluator_output=self.disable_evaluator_output,
            smac_scenario_args=self.smac_scenario_args,
            # Enable the positive-direction code path
            direction_args=self.direction_args,
        )

        return automl
Example #33
    def build_automl(self):
        if self.shared_mode:
            self.delete_output_folder_after_terminate = False
            self.delete_tmp_folder_after_terminate = False
            if self.tmp_folder is None:
                raise ValueError("If shared_mode == True tmp_folder must not "
                                 "be None.")
            if self.output_folder is None:
                raise ValueError("If shared_mode == True output_folder must "
                                 "not be None.")

        backend = create(temporary_directory=self.tmp_folder,
                         output_directory=self.output_folder,
                         delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
                         delete_output_folder_after_terminate=self.delete_output_folder_after_terminate)
        automl = autosklearn.automl.AutoML(
            backend=backend,
            time_left_for_this_task=self.time_left_for_this_task,
            per_run_time_limit=self.per_run_time_limit,
            log_dir=backend.temporary_directory,
            initial_configurations_via_metalearning=self.initial_configurations_via_metalearning,
            ensemble_size=self.ensemble_size,
            ensemble_nbest=self.ensemble_nbest,
            seed=self.seed,
            ml_memory_limit=self.ml_memory_limit,
            include_estimators=self.include_estimators,
            exclude_estimators=self.exclude_estimators,
            include_preprocessors=self.include_preprocessors,
            exclude_preprocessors=self.exclude_preprocessors,
            resampling_strategy=self.resampling_strategy,
            resampling_strategy_arguments=self.resampling_strategy_arguments,
            delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
            delete_output_folder_after_terminate=self.delete_output_folder_after_terminate,
            shared_mode=self.shared_mode,
            configuration_mode=self.configuration_mode,
            disable_evaluator_output=self.disable_evaluator_output)

        return automl
Example #34
def backend(request):

    test_dir = os.path.dirname(__file__)
    tmp = os.path.join(
        test_dir,
        '.tmp__%s__%s' % (request.module.__name__, request.node.name))
    output = os.path.join(
        test_dir,
        '.output__%s__%s' % (request.module.__name__, request.node.name))

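    # Deletion can fail transiently (e.g. files still held open), so retry for up to ten seconds.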
    for dir in (tmp, output):
        for i in range(10):
            if os.path.exists(dir):
                try:
                    shutil.rmtree(dir)
                    break
                except OSError:
                    time.sleep(1)

    # Make sure the folders we want to create do not already exist.
    backend = create(
        tmp,
        output,
        delete_tmp_folder_after_terminate=True,
        delete_output_folder_after_terminate=True,
    )

    def get_finalizer(tmp_dir, output_dir):
        def session_run_at_end():
            for dir in (tmp_dir, output_dir):
                for i in range(10):
                    if os.path.exists(dir):
                        try:
                            shutil.rmtree(dir)
                            break
                        except OSError:
                            time.sleep(1)

        return session_run_at_end

    request.addfinalizer(get_finalizer(tmp, output))

    return backend
Example #35
    def test_partial_cv(self, pipeline_mock):
        D = get_binary_classification_datamanager()
        kfold = StratifiedKFold(random_state=1, n_splits=5, shuffle=True)

        pipeline_mock.predict_proba.side_effect = lambda X, batch_size: np.tile(
            [0.6, 0.4], (len(X), 1))
        pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
        output_dir = os.path.join(os.getcwd(), '.test_partial_cv')
        D.name = 'test'

        configuration = unittest.mock.Mock(spec=Configuration)
        backend_api = backend.create(output_dir, output_dir)
        queue_ = multiprocessing.Queue()

        evaluator = TrainEvaluator(D,
                                   backend_api,
                                   queue_,
                                   configuration=configuration,
                                   cv=kfold,
                                   all_scoring_functions=False,
                                   output_y_hat_optimization=True,
                                   metric=accuracy)

        evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
        evaluator.file_output.return_value = (None, None)

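        # Evaluate only fold 1 of the 5-fold split; partial-cv runs write no output files.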
        evaluator.partial_fit_predict_and_loss(1)

        rval = evaluator.queue.get(timeout=1)
        self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)

        self.assertEqual(evaluator.file_output.call_count, 0)
        self.assertEqual(rval['loss'], 0.46666666666666667)
        self.assertEqual(pipeline_mock.fit.call_count, 1)
        self.assertEqual(pipeline_mock.predict_proba.call_count, 3)
        # The model prior to fitting is saved, this cannot be directly tested
        # because of the way the mock module is used. Instead, we test whether
        # the if block in which model assignment is done is accessed
        self.assertTrue(evaluator._added_empty_model)
Example #36
    def build_automl(self):
        if self.shared_mode:
            self.delete_output_folder_after_terminate = False
            self.delete_tmp_folder_after_terminate = False
            if self.tmp_folder is None:
                raise ValueError("If shared_mode == True tmp_folder must not "
                                 "be None.")
            if self.output_folder is None:
                raise ValueError("If shared_mode == True output_folder must "
                                 "not be None.")

        backend = create(temporary_directory=self.tmp_folder,
                         output_directory=self.output_folder,
                         delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
                         delete_output_folder_after_terminate=self.delete_output_folder_after_terminate,
                         shared_mode=self.shared_mode)
        automl = self._get_automl_class()(
            backend=backend,
            time_left_for_this_task=self.time_left_for_this_task,
            per_run_time_limit=self.per_run_time_limit,
            initial_configurations_via_metalearning=self.initial_configurations_via_metalearning,
            ensemble_size=self.ensemble_size,
            ensemble_nbest=self.ensemble_nbest,
            seed=self.seed,
            ml_memory_limit=self.ml_memory_limit,
            include_estimators=self.include_estimators,
            exclude_estimators=self.exclude_estimators,
            include_preprocessors=self.include_preprocessors,
            exclude_preprocessors=self.exclude_preprocessors,
            resampling_strategy=self.resampling_strategy,
            resampling_strategy_arguments=self.resampling_strategy_arguments,
            shared_mode=self.shared_mode,
            get_smac_object_callback=self.get_smac_object_callback,
            disable_evaluator_output=self.disable_evaluator_output,
            smac_scenario_args=self.smac_scenario_args
        )

        return automl