Example no. 1
    def fit(self,
            X,
            y,
            task=MULTICLASS_CLASSIFICATION,
            metric=None,
            feat_type=None,
            dataset_name=None):
        if not self._shared_mode:
            self._backend.context.delete_directories()
        else:
            # If this fails, it's likely that this is the first call to get
            # the data manager
            try:
                D = self._backend.load_datamanager()
                dataset_name = D.name
            except IOError:
                pass

        self._backend.context.create_directories()

        if dataset_name is None:
            dataset_name = hash_array_or_matrix(X)

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if metric is None:
            raise ValueError('No metric given.')
        if not isinstance(metric, Scorer):
            raise ValueError('Metric must be instance of '
                             'autosklearn.metrics.Scorer.')

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' %
                                     ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X,
                                            y,
                                            task=task,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name)

        return self._fit(loaded_data_manager, metric)
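As a point of reference for the validation above, here is a minimal, self-contained sketch with hypothetical data showing arguments that satisfy the fit() checks: the metric must be an autosklearn.metrics.Scorer instance, and feat_type, if given, must be a list of 'Categorical'/'Numerical' strings with one entry per column of X.

import numpy as np
from autosklearn.metrics import Scorer, accuracy

X = np.random.rand(10, 3)

# One feature-type string per column of X.
feat_type = ['Numerical', 'Categorical', 'Numerical']

# The same conditions fit() enforces before constructing the XYDataManager:
assert isinstance(accuracy, Scorer)
assert len(feat_type) == X.shape[1]
assert all(isinstance(f, str) for f in feat_type)
assert all(f.lower() in ['categorical', 'numerical'] for f in feat_type)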
Example no. 2
    def fit(self,
            X,
            y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if not self._shared_mode:
            self._backend.context.delete_directories()
        else:
            # If this fails, it's likely that this is the first call to get
            # the data manager
            try:
                D = self._backend.load_datamanager()
                dataset_name = D.name
            except IOError:
                pass

        self._backend.context.create_directories()

        if dataset_name is None:
            dataset_name = hash_numpy_array(X)

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' %
                                     ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X,
                                            y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)
Example no. 3
def test_smbo_metalearning_configurations(backend, context, dask_client):

    # Get the inputs to the optimizer
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    config_space = AutoML(backend=backend,
                          metric=autosklearn.metrics.accuracy,
                          time_left_for_this_task=20,
                          per_run_time_limit=5).fit(
                              X_train,
                              Y_train,
                              task=BINARY_CLASSIFICATION,
                              only_return_configuration_space=True)
    watcher = StopWatch()

    # Create an optimizer
    smbo = AutoMLSMBO(
        config_space=config_space,
        dataset_name='iris',
        backend=backend,
        total_walltime_limit=10,
        func_eval_time_limit=5,
        memory_limit=4096,
        metric=autosklearn.metrics.accuracy,
        watcher=watcher,
        n_jobs=1,
        dask_client=dask_client,
        port=logging.handlers.DEFAULT_TCP_LOGGING_PORT,
        start_num_run=1,
        data_memory_limit=None,
        num_metalearning_cfgs=25,
        pynisher_context=context,
    )
    assert smbo.pynisher_context == context

    # Create the inputs to metalearning
    datamanager = XYDataManager(
        X_train,
        Y_train,
        X_test,
        Y_test,
        task=BINARY_CLASSIFICATION,
        dataset_name='iris',
        feat_type={i: 'numerical'
                   for i in range(X_train.shape[1])},
    )
    backend.save_datamanager(datamanager)
    smbo.task = BINARY_CLASSIFICATION
    smbo.reset_data_manager()
    metalearning_configurations = smbo.get_metalearning_suggestions()

    # We should have 25 metalearning configurations
    assert len(metalearning_configurations) == 25
    assert all(
        isinstance(config, Configuration)
        for config in metalearning_configurations
    )
Example no. 4
def test_do_dummy_prediction(backend, dask_client, datasets):

    name, task = datasets

    X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=task,
        dataset_name=name,
        feat_type=None,
    )

    auto = autosklearn.automl.AutoML(
        backend, 20, 5,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
    )

    # Make a dummy logger
    auto._logger_port = 9020
    auto._logger = unittest.mock.Mock()
    auto._logger.info.return_value = None

    auto._backend.save_datamanager(datamanager)
    D = backend.load_datamanager()

    # Check if data manager is correctly loaded
    assert D.info['task'] == datamanager.info['task']
    auto._do_dummy_prediction(D, 1)

    # Ensure that the dummy predictions are not in the current working
    # directory, but in the temporary directory.
    assert not os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn'))
    assert os.path.exists(os.path.join(
        backend.temporary_directory, '.auto-sklearn', 'runs', '1_1_0.0',
        'predictions_ensemble_1_1_0.0.npy')
    )

    model_path = os.path.join(backend.temporary_directory, '.auto-sklearn',
                              'runs', '1_1_0.0',
                              '1.1.0.0.model')

    # Make sure the dummy model complies with scikit-learn
    # get/set params
    assert os.path.exists(model_path)
    with open(model_path, 'rb') as model_handler:
        clone(pickle.load(model_handler))

    auto._clean_logger()

    del auto
Example no. 5
    def test_do_dummy_prediction(self):
        datasets = {
            'breast_cancer': BINARY_CLASSIFICATION,
            'wine': MULTICLASS_CLASSIFICATION,
            'diabetes': REGRESSION,
        }

        for name, task in datasets.items():
            backend_api = self._create_backend('test_do_dummy_prediction')

            X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
            datamanager = XYDataManager(
                X_train,
                Y_train,
                X_test,
                Y_test,
                task=task,
                dataset_name=name,
                feat_type=None,
            )

            auto = autosklearn.automl.AutoML(
                backend_api,
                20,
                5,
                initial_configurations_via_metalearning=25,
                metric=accuracy,
            )
            setup_logger()
            auto._logger = get_logger('test_do_dummy_predictions')
            auto._backend.save_datamanager(datamanager)
            D = backend_api.load_datamanager()

            # Check if data manager is correctly loaded
            self.assertEqual(D.info['task'], datamanager.info['task'])

            auto._do_dummy_prediction(D, 1)

            # Ensure that the dummy predictions are not in the current working
            # directory, but in the temporary directory.
            self.assertFalse(
                os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn')))
            self.assertTrue(
                os.path.exists(
                    os.path.join(backend_api.temporary_directory,
                                 '.auto-sklearn', 'predictions_ensemble',
                                 'predictions_ensemble_1_1_0.0.npy')))

            del auto
            self._tearDown(backend_api.temporary_directory)
            self._tearDown(backend_api.output_directory)
Example no. 6
def test_do_dummy_prediction(dask_client, datasets):

    name, task = datasets

    X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
    datamanager = XYDataManager(
        X_train,
        Y_train,
        X_test,
        Y_test,
        task=task,
        dataset_name=name,
        feat_type={i: 'numerical'
                   for i in range(X_train.shape[1])},
    )

    auto = autosklearn.automl.AutoML(
        20,
        5,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
        delete_tmp_folder_after_terminate=False,
    )
    auto._backend = auto._create_backend()

    # Make a dummy logger
    auto._logger_port = 9020
    auto._logger = unittest.mock.Mock()
    auto._logger.info.return_value = None

    auto._backend.save_datamanager(datamanager)
    D = auto._backend.load_datamanager()

    # Check if data manager is correctly loaded
    assert D.info['task'] == datamanager.info['task']
    auto._do_dummy_prediction(D, 1)

    # Ensure that the dummy predictions are not in the current working
    # directory, but in the temporary directory.
    assert not os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn'))
    assert os.path.exists(
        os.path.join(auto._backend.temporary_directory, '.auto-sklearn',
                     'runs', '1_1_0.0', 'predictions_ensemble_1_1_0.0.npy'))

    auto._clean_logger()

    del auto
Example no. 7
def get_abalone_datamanager():
    # https://www.openml.org/d/183
    dataset_name = 'abalone'
    data = sklearn.datasets.fetch_openml(data_id=183, as_frame=True)
    feat_type = [
        'Categorical' if x.name == 'category' else 'Numerical'
        for x in data['data'].dtypes
    ]
    X, y = sklearn.datasets.fetch_openml(data_id=183, return_X_y=True)
    y = preprocessing.LabelEncoder().fit_transform(y)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=1)

    D = XYDataManager(X_train, y_train, X_test, y_test,
                      MULTICLASS_CLASSIFICATION, feat_type, dataset_name)
    return D
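A small, self-contained illustration (hypothetical DataFrame) of the dtype check used above: pandas columns with the 'category' dtype map to 'Categorical', all other dtypes to 'Numerical'.

import pandas as pd

df = pd.DataFrame({
    'sex': pd.Series(['M', 'F', 'I'], dtype='category'),  # categorical column
    'length': [0.455, 0.350, 0.530],                       # numerical column
})

feat_type = [
    'Categorical' if dtype.name == 'category' else 'Numerical'
    for dtype in df.dtypes
]
assert feat_type == ['Categorical', 'Numerical']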
Example no. 8
    def fit(self,
            X,
            y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' %
                                     ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X,
                                            y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)
Example no. 9
def get_meta_learning_configs(X,
                              y,
                              task_type,
                              dataset_name='default',
                              metric='accuracy',
                              num_cfgs=5):
    if X is None or y is None:
        X, y, _ = load_data(dataset_name)
    backend = create(temporary_directory=None,
                     output_directory=None,
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)
    dm = XYDataManager(X, y, None, None, task_type, None, dataset_name)

    configuration_space = pipeline.get_configuration_space(
        dm.info,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None)

    watcher = StopWatch()
    name = os.path.basename(dm.name)
    watcher.start_task(name)

    def reset_data_manager(max_mem=None):
        pass

    automlsmbo = AutoMLSMBO(
        config_space=configuration_space,
        dataset_name=dataset_name,
        backend=backend,
        total_walltime_limit=1e5,
        func_eval_time_limit=1e5,
        memory_limit=1e5,
        metric=metric,
        watcher=watcher,
        metadata_directory='components/meta_learning/meta_resource',
        num_metalearning_cfgs=num_cfgs)
    automlsmbo.reset_data_manager = reset_data_manager
    automlsmbo.task = task_type
    automlsmbo.datamanager = dm
    configs = automlsmbo.get_metalearning_suggestions()
    return configs
Example no. 10
    def fit(self,
            X,
            y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all(
            [isinstance(f, bool) for f in feat_type]):
            raise ValueError('Array feat_type must only contain bools.')

        loaded_data_manager = XYDataManager(X,
                                            y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)
Example no. 11
def test_do_dummy_prediction(backend, dask_client, datasets):

    name, task = datasets

    X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
    datamanager = XYDataManager(
        X_train,
        Y_train,
        X_test,
        Y_test,
        task=task,
        dataset_name=name,
        feat_type=None,
    )

    auto = autosklearn.automl.AutoML(
        backend,
        20,
        5,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
    )
    setup_logger(backend.temporary_directory)
    auto._logger = get_logger('test_do_dummy_predictions')

    auto._backend.save_datamanager(datamanager)
    D = backend.load_datamanager()

    # Check if data manager is correctly loaded
    assert D.info['task'] == datamanager.info['task']
    auto._do_dummy_prediction(D, 1)

    # Ensure that the dummy predictions are not in the current working
    # directory, but in the temporary directory.
    assert not os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn'))
    assert os.path.exists(
        os.path.join(backend.temporary_directory, '.auto-sklearn', 'runs',
                     '1_1_0.0', 'predictions_ensemble_1_1_0.0.npy'))

    del auto
Example no. 12
    def set_dataset(self, X, y, X_test=None, y_test=None, feat_type=None):
        """
        Stores the obtained dataset parameters in the XYDataManager of auto-sklearn
        and calculates the metafeatures of the dataset
        """

        utils = MetaFeatures()
        X, y = utils.perform_input_checks(X, y)
        if X_test is not None:
            X_test, y_test = utils.perform_input_checks(X_test, y_test)
            if len(y.shape) != len(y_test.shape):
                raise ValueError('Target value shapes do not match: %s vs %s' %
                                 (y.shape, y_test.shape))

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' %
                                     ft)

        self.target_type = type_of_target(y)
        task = self._task_mapping.get(self.target_type)
        if task is None:
            task = ask_const.REGRESSION

        self.dataset_name = FMLHash().hashValAndReturnString(str(X))
        self.data_manager = XYDataManager(X, y, X_test, y_test, task,
                                          feat_type, self.dataset_name)

        self.meta_features = utils.calculate_metafeatures(
            self.data_manager, self.dataset_name)
Example no. 13
    def test_fail_if_dummy_prediction_fails(self, ta_run_mock):
        backend_api = self._create_backend('test_fail_if_dummy_prediction_fails')

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        datamanager = XYDataManager(
            X_train, Y_train,
            X_test, Y_test,
            task=2,
            feat_type=['Numerical' for i in range(X_train.shape[1])],
            dataset_name='iris',
        )

        time_for_this_task = 30
        per_run_time = 10
        auto = autosklearn.automl.AutoML(backend_api,
                                         time_for_this_task,
                                         per_run_time,
                                         initial_configurations_via_metalearning=25,
                                         metric=accuracy,
                                         )
        setup_logger()
        auto._logger = get_logger('test_fail_if_dummy_prediction_fails')
        auto._backend._make_internals_directory()
        auto._backend.save_datamanager(datamanager)

        # First of all, check that ta.run() is actually called.
        ta_run_mock.return_value = StatusType.SUCCESS, None, None, "test"
        auto._do_dummy_prediction(datamanager, 1)
        ta_run_mock.assert_called_once_with(1, cutoff=time_for_this_task)

        # Case 1. Check that function raises no error when statustype == success.
        # ta.run() returns status, cost, runtime, and additional info.
        ta_run_mock.return_value = StatusType.SUCCESS, None, None, "test"
        raised = False
        try:
            auto._do_dummy_prediction(datamanager, 1)
        except ValueError:
            raised = True
        self.assertFalse(raised, 'Exception raised')

        # Case 2. Check that if statustype returned by ta.run() != success,
        # the function raises error.
        ta_run_mock.return_value = StatusType.CRASHED, None, None, "test"
        self.assertRaisesRegex(ValueError,
                               'Dummy prediction failed with run state StatusType.CRASHED '
                               'and additional output: test.',
                               auto._do_dummy_prediction,
                               datamanager, 1,
                               )
        ta_run_mock.return_value = StatusType.ABORT, None, None, "test"
        self.assertRaisesRegex(ValueError,
                               'Dummy prediction failed with run state StatusType.ABORT '
                               'and additional output: test.',
                               auto._do_dummy_prediction,
                               datamanager, 1,
                               )
        ta_run_mock.return_value = StatusType.TIMEOUT, None, None, "test"
        self.assertRaisesRegex(ValueError,
                               'Dummy prediction failed with run state StatusType.TIMEOUT '
                               'and additional output: test.',
                               auto._do_dummy_prediction,
                               datamanager, 1,
                               )
        ta_run_mock.return_value = StatusType.MEMOUT, None, None, "test"
        self.assertRaisesRegex(ValueError,
                               'Dummy prediction failed with run state StatusType.MEMOUT '
                               'and additional output: test.',
                               auto._do_dummy_prediction,
                               datamanager, 1,
                               )
        ta_run_mock.return_value = StatusType.CAPPED, None, None, "test"
        self.assertRaisesRegex(ValueError,
                               'Dummy prediction failed with run state StatusType.CAPPED '
                               'and additional output: test.',
                               auto._do_dummy_prediction,
                               datamanager, 1,
                               )

        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example no. 14
    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        task: int,
        metric: Scorer,
        X_test: Optional[np.ndarray] = None,
        y_test: Optional[np.ndarray] = None,
        feat_type: Optional[List[bool]] = None,
        dataset_name: Optional[str] = None,
        only_return_configuration_space: Optional[bool] = False,
        load_models: bool = True,
        incremental_learning: bool = False,
    ):
        if self._shared_mode:
            # If this fails, it's likely that this is the first call to get
            # the data manager
            try:
                D = self._backend.load_datamanager()
                dataset_name = D.name
            except IOError:
                pass

        if dataset_name is None:
            dataset_name = hash_array_or_matrix(X)

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if metric is None:
            raise ValueError('No metric given.')
        if not isinstance(metric, Scorer):
            raise ValueError('Metric must be instance of '
                             'autosklearn.metrics.Scorer.')

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' %
                                     ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(
            X,
            y,
            X_test=X_test,
            y_test=y_test,
            task=task,
            feat_type=feat_type,
            dataset_name=dataset_name,
        )

        return self._fit(
            datamanager=loaded_data_manager,
            metric=metric,
            load_models=load_models,
            only_return_configuration_space=only_return_configuration_space,
            incremental_learning=incremental_learning)
Example no. 15
def test_fail_if_dummy_prediction_fails(ta_run_mock, backend, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=2,
        feat_type=['Numerical' for i in range(X_train.shape[1])],
        dataset_name='iris',
    )

    time_for_this_task = 30
    per_run_time = 10
    auto = autosklearn.automl.AutoML(backend,
                                     time_for_this_task,
                                     per_run_time,
                                     initial_configurations_via_metalearning=25,
                                     metric=accuracy,
                                     dask_client=dask_client,
                                     )
    auto._backend._make_internals_directory()
    auto._backend.save_datamanager(datamanager)

    # Make a dummy logger
    auto._logger_port = 9020
    auto._logger = unittest.mock.Mock()
    auto._logger.info.return_value = None

    # First of all, check that ta.run() is actually called.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, {}
    auto._do_dummy_prediction(datamanager, 1)
    ta_run_mock.assert_called_once_with(1, cutoff=time_for_this_task)

    # Case 1. Check that function raises no error when statustype == success.
    # ta.run() returns status, cost, runtime, and additional info.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, {}
    raised = False
    try:
        auto._do_dummy_prediction(datamanager, 1)
    except ValueError:
        raised = True
    assert not raised, 'Exception raised'

    # Case 2. Check that if statustype returned by ta.run() != success,
    # the function raises error.
    ta_run_mock.return_value = StatusType.CRASHED, None, None, {}
    with pytest.raises(
        ValueError,
        match='Dummy prediction failed with run state StatusType.CRASHED and additional output: {}.'  # noqa
    ):
        auto._do_dummy_prediction(datamanager, 1)

    ta_run_mock.return_value = StatusType.ABORT, None, None, {}
    with pytest.raises(
        ValueError,
        match='Dummy prediction failed with run state StatusType.ABORT '
              'and additional output: {}.',
    ):
        auto._do_dummy_prediction(datamanager, 1)
    ta_run_mock.return_value = StatusType.TIMEOUT, None, None, {}
    with pytest.raises(
        ValueError,
        match='Dummy prediction failed with run state StatusType.TIMEOUT '
              'and additional output: {}.'
    ):
        auto._do_dummy_prediction(datamanager, 1)
    ta_run_mock.return_value = StatusType.MEMOUT, None, None, {}
    with pytest.raises(
        ValueError,
        match='Dummy prediction failed with run state StatusType.MEMOUT '
              'and additional output: {}.',
    ):
        auto._do_dummy_prediction(datamanager, 1)
    ta_run_mock.return_value = StatusType.CAPPED, None, None, {}
    with pytest.raises(
        ValueError,
        match='Dummy prediction failed with run state StatusType.CAPPED '
              'and additional output: {}.'
    ):
        auto._do_dummy_prediction(datamanager, 1)

    ta_run_mock.return_value = StatusType.CRASHED, None, None, {'exitcode': -6}
    with pytest.raises(
        ValueError,
        match='The error suggests that the provided memory limits were too tight.',
    ):
        auto._do_dummy_prediction(datamanager, 1)
Example no. 16
    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        task: int,
        X_test: Optional[np.ndarray] = None,
        y_test: Optional[np.ndarray] = None,
        feat_type: Optional[List[str]] = None,
        dataset_name: Optional[str] = None,
        only_return_configuration_space: Optional[bool] = False,
        load_models: bool = True,
    ):
        # Reset learnt stuff
        self.models_ = None
        self.cv_models_ = None
        self.ensemble_ = None

        # The metric must exist as of this point
        # It can be provided in the constructor, or automatically
        # defined in the estimator fit call
        if self._metric is None:
            raise ValueError('No metric given.')
        if not isinstance(self._metric, Scorer):
            raise ValueError('Metric must be instance of '
                             'autosklearn.metrics.Scorer.')
        if self._shared_mode:
            # If this fails, it's likely that this is the first call to get
            # the data manager
            try:
                D = self._backend.load_datamanager()
                dataset_name = D.name
            except IOError:
                pass

        if dataset_name is None:
            dataset_name = hash_array_or_matrix(X)

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' %
                                     ft)

        datamanager = XYDataManager(
            X,
            y,
            X_test=X_test,
            y_test=y_test,
            task=task,
            feat_type=feat_type,
            dataset_name=dataset_name,
        )

        self._backend._make_internals_directory()
        try:
            os.makedirs(self._backend.get_model_dir())
        except (OSError, FileExistsError):
            if not self._shared_mode:
                raise
        try:
            os.makedirs(self._backend.get_cv_model_dir())
        except (OSError, FileExistsError):
            if not self._shared_mode:
                raise

        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        self._backend.save_datamanager(datamanager)

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(self._dataset_name, self._time_for_task,
                                  time_for_load_data, self._logger)

        # == Perform dummy predictions
        num_run = 1
        # if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # == Create a search space
        # Do this before One Hot Encoding to make sure that it creates a
        # search space for a dense classifier even if one hot encoding would
        # make it sparse (tradeoff; if one hot encoding would make it sparse,
        #  densifier and truncatedSVD would probably lead to a MemoryError,
        # like this we can't use some of the preprocessing methods in case
        # the data became sparse)
        self.configuration_space, configspace_path = self._create_search_space(
            self._backend.temporary_directory,
            self._backend,
            datamanager,
            include_estimators=self._include_estimators,
            exclude_estimators=self._exclude_estimators,
            include_preprocessors=self._include_preprocessors,
            exclude_preprocessors=self._exclude_preprocessors)
        if only_return_configuration_space:
            return self.configuration_space

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        elapsed_time = self._stopwatch.wall_elapsed(self._dataset_name)
        time_left_for_ensembles = max(0, self._time_for_task - elapsed_time)
        if time_left_for_ensembles <= 0:
            self._proc_ensemble = None
            # Fit only raises error when ensemble_size is not zero but
            # time_left_for_ensembles is zero.
            if self._ensemble_size > 0:
                raise ValueError("Not starting ensemble builder because there "
                                 "is no time left. Try increasing the value "
                                 "of time_left_for_this_task.")
        elif self._ensemble_size <= 0:
            self._proc_ensemble = None
            self._logger.info('Not starting ensemble builder because '
                              'ensemble size is <= 0.')
        else:
            self._logger.info('Start Ensemble with %5.2fsec time left' %
                              time_left_for_ensembles)
            self._proc_ensemble = self._get_ensemble_process(
                time_left_for_ensembles)
            self._proc_ensemble.start()

        self._stopwatch.stop_task(ensemble_task_name)

        # Kill the datamanager as it will be re-loaded anyway from subprocesses
        try:
            del self._datamanager
        except Exception:
            pass

        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        elapsed_time = self._stopwatch.wall_elapsed(self._dataset_name)
        time_left_for_smac = max(0, self._time_for_task - elapsed_time)

        if self._logger:
            self._logger.info('Start SMAC with %5.2fsec time left' %
                              time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            _proc_smac = None
            self._budget_type = None
        else:
            if self._per_run_time_limit is None or \
                    self._per_run_time_limit > time_left_for_smac:
                self._logger.warning(
                    'Time limit for a single run is higher than total time '
                    'limit. Capping the limit for a single run to the total '
                    'time given to SMAC (%f)' % time_left_for_smac)
                per_run_time_limit = time_left_for_smac
            else:
                per_run_time_limit = self._per_run_time_limit

            # Make sure that at least 2 models are created for the ensemble process
            num_models = time_left_for_smac // per_run_time_limit
            if num_models < 2:
                per_run_time_limit = time_left_for_smac // 2
                self._logger.warning(
                    "Capping the per_run_time_limit to {} to have "
                    "time for a least 2 models in each process.".format(
                        per_run_time_limit))

            _proc_smac = AutoMLSMBO(
                config_space=self.configuration_space,
                dataset_name=self._dataset_name,
                backend=self._backend,
                total_walltime_limit=time_left_for_smac,
                func_eval_time_limit=per_run_time_limit,
                memory_limit=self._ml_memory_limit,
                data_memory_limit=self._data_memory_limit,
                watcher=self._stopwatch,
                start_num_run=num_run,
                num_metalearning_cfgs=self._initial_configurations_via_metalearning,
                config_file=configspace_path,
                seed=self._seed,
                metadata_directory=self._metadata_directory,
                metric=self._metric,
                resampling_strategy=self._resampling_strategy,
                resampling_strategy_args=self._resampling_strategy_arguments,
                shared_mode=self._shared_mode,
                include_estimators=self._include_estimators,
                exclude_estimators=self._exclude_estimators,
                include_preprocessors=self._include_preprocessors,
                exclude_preprocessors=self._exclude_preprocessors,
                disable_file_output=self._disable_evaluator_output,
                get_smac_object_callback=self._get_smac_object_callback,
                smac_scenario_args=self._smac_scenario_args,
            )

            try:
                self.runhistory_, self.trajectory_, self._budget_type = \
                    _proc_smac.run_smbo()
                trajectory_filename = os.path.join(
                    self._backend.get_smac_output_directory_for_run(
                        self._seed), 'trajectory.json')
                saveable_trajectory = \
                    [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:])
                     for entry in self.trajectory_]
                with open(trajectory_filename, 'w') as fh:
                    json.dump(saveable_trajectory, fh)
            except Exception as e:
                self._logger.exception(e)
                raise

        # Wait until the ensemble process is finished to avoid shutting down
        # while the ensemble builder tries to access the data
        if self._proc_ensemble is not None and self._ensemble_size > 0:
            self._proc_ensemble.join()

        self._proc_ensemble = None
        if load_models:
            self._load_models()

        return self
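A standalone illustration, with hypothetical numbers, of the per_run_time_limit capping performed before starting SMAC above: if fewer than two runs would fit into the remaining SMAC budget, the per-run limit is reduced to half the budget so at least two models can be trained.

time_left_for_smac = 30.0
per_run_time_limit = 20.0

num_models = time_left_for_smac // per_run_time_limit   # 1.0 -> fewer than 2 runs fit
if num_models < 2:
    per_run_time_limit = time_left_for_smac // 2         # cap so two runs fit the budget

assert per_run_time_limit == 15.0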