Example #1
def make_mode_holdout_iterative_fit(data, seed, configuration, num_run):
    global evaluator
    evaluator = HoldoutEvaluator(data, configuration, seed=seed, num_run=num_run, **_get_base_dict())
    evaluator.iterative_fit()
    signal.signal(15, empty_signal_handler)
    evaluator.finish_up()

    backend = Backend(None, os.getcwd())
    if os.path.exists(backend.get_model_dir()):
        backend.save_model(evaluator.model, num_run, seed)
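The example above builds a Backend rooted at the current working directory and persists the fitted model only if the backend's model directory already exists. A minimal sketch of that guard pulled out into a helper, assuming nothing beyond the get_model_dir() and save_model(model, num_run, seed) calls shown in the examples (the helper name is made up for illustration):

import os

def save_model_if_possible(backend, model, num_run, seed):
    # Persist the model only when the backend's model directory was created,
    # mirroring the guard in make_mode_holdout_iterative_fit above.
    if os.path.exists(backend.get_model_dir()):
        backend.save_model(model, num_run, seed)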
Example #2
    def __init__(
        self,
        Datamanager,
        output_dir,
        configuration=None,
        with_predictions=False,
        all_scoring_functions=False,
        seed=1,
        output_y_test=False,
        num_run=None,
        subsample=None,
    ):

        self.starttime = time.time()

        self.output_dir = output_dir
        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyClassifier
            else:
                self.model_class = \
                    autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        if num_run is None:
            num_run = 0
        self.num_run = num_run

        self.subsample = subsample

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)
Example #3
def make_mode_holdout_iterative_fit(data, seed, configuration, num_run):
    global evaluator
    evaluator = HoldoutEvaluator(data,
                                 configuration,
                                 seed=seed,
                                 num_run=num_run,
                                 **_get_base_dict())
    evaluator.iterative_fit()
    signal.signal(15, empty_signal_handler)
    evaluator.finish_up()

    backend = Backend(None, os.getcwd())
    if os.path.exists(backend.get_model_dir()):
        backend.save_model(evaluator.model, num_run, seed)
Example #4
def store_and_or_load_data(dataset_info, outputdir):
    backend = Backend(None, outputdir)

    try:
        D = backend.load_datamanager()
    except IOError:
        D = None

    # Datamanager probably doesn't exist
    if D is None:
        D = CompetitionDataManager(dataset_info, encode_labels=True)
        backend.save_datamanager(D)

    return D
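The try/except IOError above is a simple load-or-create cache around the data manager. A minimal sketch of the same control flow with the factory made explicit, using only the load_datamanager() and save_datamanager() calls shown in the example (the helper name is hypothetical):

def load_or_create_datamanager(backend, factory):
    # Return the cached DataManager if the backend can read one back,
    # otherwise build it with `factory` and cache it for the next run.
    try:
        return backend.load_datamanager()
    except IOError:
        pass
    data_manager = factory()
    backend.save_datamanager(data_manager)
    return data_manager

# e.g. load_or_create_datamanager(Backend(None, outputdir),
#          lambda: CompetitionDataManager(dataset_info, encode_labels=True))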
Example #5
    def __init__(self,
                 Datamanager,
                 configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_dir=None,
                 output_y_test=False,
                 num_run=None):

        self.starttime = time.time()

        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        if output_dir is None:
            self.output_dir = os.getcwd()
        else:
            self.output_dir = output_dir

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = SimpleRegressionPipeline
            self.predict_function = self.predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = SimpleClassificationPipeline
            self.predict_function = self.predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)
Example #6
def store_and_or_load_data(dataset_info, outputdir):
    backend = Backend(None, outputdir)

    try:
        D = backend.load_datamanager()
    except IOError:
        D = None

    # Datamanager probably doesn't exist
    if D is None:
        D = CompetitionDataManager(dataset_info, encode_labels=True)
        backend.save_datamanager(D)

    return D
Example #7
File: smbo.py Project: Ayaro/auto-sklearn
def load_data(dataset_info, outputdir, tmp_dir=None, max_mem=None):
    if tmp_dir is None:
        tmp_dir = outputdir
    backend = Backend(outputdir, tmp_dir)
    try:
        D = backend.load_datamanager()
    except IOError:
        D = None

    # Datamanager probably doesn't exist
    if D is None:
        if max_mem is None:
            D = CompetitionDataManager(dataset_info, encode_labels=True)
        else:
            D = CompetitionDataManager(dataset_info, encode_labels=True, max_memory_in_mb=max_mem)
    return D
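A hedged usage sketch for load_data above; the dataset path, output directory, and memory cap are placeholders, and the call assumes a dataset layout that CompetitionDataManager can read:

# tmp_dir defaults to outputdir; max_mem is forwarded as max_memory_in_mb.
D = load_data('/path/to/dataset', outputdir='/tmp/askl_out', max_mem=2048)
print(D.info['task'], D.info['metric'])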
Example #8
    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self.models_ = None
        self.ensemble_indices_ = None

        self._debug_mode = debug_mode
        self._backend = Backend(self._output_dir, self._tmp_dir)
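Note that AutoML constructs its backend as Backend(self._output_dir, self._tmp_dir), i.e. a real output directory first and the temporary directory second, whereas the evaluator examples pass None for the first argument. A small construction sketch with placeholder paths; the save_start_time call is borrowed from Example #12 below and assumes those directories are writable:

backend = Backend('/tmp/askl_out', '/tmp/askl_tmp')  # output directory, then temporary directory
backend.save_start_time(1)  # same call AutoML.run()/fit() issue with their seed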
Example #9
def load_data(dataset_info, outputdir, tmp_dir=None, max_mem=None):
    if tmp_dir is None:
        tmp_dir = outputdir
    backend = Backend(outputdir, tmp_dir)
    try:
        D = backend.load_datamanager()
    except IOError:
        D = None

    # Datamanager probably doesn't exist
    if D is None:
        if max_mem is None:
            D = CompetitionDataManager(dataset_info, encode_labels=True)
        else:
            D = CompetitionDataManager(dataset_info,
                                       encode_labels=True,
                                       max_memory_in_mb=max_mem)
    return D
Example #10
    def __init__(self, Datamanager, output_dir, configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_y_test=False,
                 num_run=None,
                 subsample=None,):

        self.starttime = time.time()

        self.output_dir = output_dir
        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyClassifier
            else:
                self.model_class = \
                    autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        if num_run is None:
            num_run = 0
        self.num_run = num_run

        self.subsample = subsample

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)
Example #11
    def __init__(self, Datamanager, configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_dir=None,
                 output_y_test=False,
                 num_run=None):

        self.starttime = time.time()

        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        if output_dir is None:
            self.output_dir = os.getcwd()
        else:
            self.output_dir = output_dir

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = ParamSklearnRegressor
            self.predict_function = self.predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = ParamSklearnClassifier
            self.predict_function = self.predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)
Example #12
class AutoML(BaseEstimator, multiprocessing.Process):
    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI'):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self._max_iter_smac = max_iter_smac
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._parser = parser
        self.start()

    def start(self):
        if self._parser is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._parser is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=self._parser)
        self._stopwatch.start_task(datamanager.name)

        self._logger = self._get_logger(datamanager.name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self._fit(self._datamanager)

    def fit(self,
            X,
            y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' %
                                     ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X,
                                            y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset,
            encode_labels=False,
            max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    def _get_logger(self, name):
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        return get_logger(logger_name)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task, time_for_load_data,
                         logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager, num_run):

        self._logger.info("Starting to create dummy predictions.")
        time_limit = int(self._time_for_task / 6.)
        memory_limit = int(self._ml_memory_limit)

        _info = eval_with_limits(datamanager, self._tmp_dir, 1, self._seed,
                                 num_run, self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 1/2.")
        else:
            self._logger.error('Error creating dummy prediction 1/2: %s',
                               _info[3])

        num_run += 1

        _info = eval_with_limits(datamanager, self._tmp_dir, 2, self._seed,
                                 num_run, self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 2/2.")
        else:
            self._logger.error('Error creating dummy prediction 2/2: %s',
                               _info[3])

        num_run += 1
        return num_run

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in [
                'holdout', 'holdout-iterative-fit', 'cv', 'nested-cv',
                'partial-cv'
        ]:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        acquisition_functions = ['EI', 'EIPS']
        if self.acquisition_function not in acquisition_functions:
            raise ValueError(
                'Illegal acquisition %s: Must be one of %s.' %
                (self.acquisition_function, acquisition_functions))

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(datamanager.data['X_train'],
                                 datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(self._dataset_name, self._time_for_task,
                                  time_for_load_data, self._logger)

        # == Perform dummy predictions
        num_run = 1
        #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # = Create a searchspace
        # Do this before One Hot Encoding to make sure that it creates a
        # search space for a dense classifier even if one hot encoding would
        # make it sparse (tradeoff; if one hot encoding would make it sparse,
        #  densifier and truncatedSVD would probably lead to a MemoryError,
        # like this we can't use some of the preprocessing methods in case
        # the data became sparse)
        self.configuration_space, configspace_path = self._create_search_space(
            self._tmp_dir, self._backend, datamanager,
            self._include_estimators, self._include_preprocessors)

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        time_left_for_ensembles = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))
        if self._logger:
            self._logger.info('Start Ensemble with %5.2fsec time left' %
                              time_left_for_ensembles)
        if time_left_for_ensembles <= 0:
            self._logger.warning("Not starting ensemble builder because there "
                                 "is no time left!")
            self._proc_ensemble = None
        else:
            self._proc_ensemble = self._get_ensemble_process(
                time_left_for_ensembles)
            self._proc_ensemble.start()
        self._stopwatch.stop_task(ensemble_task_name)

        # == RUN SMBO
        # kill the datamanager as it will be re-loaded anyways from sub processes
        try:
            del self._datamanager
        except Exception:
            pass

        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))

        if self._logger:
            self._logger.info('Start SMAC with %5.2fsec time left' %
                              time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            self._proc_smac = None
        else:
            self._proc_smac = AutoMLSMBO(
                config_space=self.configuration_space,
                dataset_name=self._dataset_name,
                tmp_dir=self._tmp_dir,
                output_dir=self._output_dir,
                total_walltime_limit=time_left_for_smac,
                func_eval_time_limit=self._per_run_time_limit,
                memory_limit=self._ml_memory_limit,
                data_memory_limit=self._data_memory_limit,
                watcher=self._stopwatch,
                start_num_run=num_run,
                num_metalearning_cfgs=self._initial_configurations_via_metalearning,
                config_file=configspace_path,
                smac_iters=self._max_iter_smac,
                seed=self._seed,
                metadata_directory=self._metadata_directory,
                resampling_strategy=self._resampling_strategy,
                resampling_strategy_args=self._resampling_strategy_arguments,
                acquisition_function=self.acquisition_function,
                shared_mode=self._shared_mode)
            self._proc_smac.start()

        psutil_procs = []
        procs = []
        if self._proc_smac is not None:
            proc_smac = psutil.Process(self._proc_smac.pid)
            psutil_procs.append(proc_smac)
            procs.append(self._proc_smac)
        if self._proc_ensemble is not None:
            proc_ensemble = psutil.Process(self._proc_ensemble.pid)
            psutil_procs.append(proc_ensemble)
            procs.append(self._proc_ensemble)

        if self._queue is not None:
            self._queue.put(
                [time_for_load_data, data_manager_path, psutil_procs])
        else:
            for proc in procs:
                proc.join()

        if self._queue is None:
            self._load_models()

        return self

    def refit(self, X, y):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used in
                # predict method
                model.fit(X.copy(), y.copy())

        self._can_predict = True

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in  \
                        ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        all_predictions = []
        for identifier in self.ensemble_.get_model_identifiers():
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning(
                    "Prediction shape for model %s is %s "
                    "while X_.shape is %s" %
                    (model, str(prediction.shape), str(X_.shape)))
            all_predictions.append(prediction)

        if len(all_predictions) == 0:
            raise ValueError(
                'Something went wrong generating the predictions. '
                'The ensemble should consist of the following '
                'models: %s, the following models were loaded: '
                '%s' % (str(list(self.ensemble_indices_.keys())),
                        str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions

    def fit_ensemble(self,
                     task=None,
                     metric=None,
                     precision='32',
                     dataset_name=None,
                     ensemble_nbest=None,
                     ensemble_size=None):
        if self._logger is None:
            self._logger = self._get_logger(dataset_name)

        self._proc_ensemble = self._get_ensemble_process(
            1,
            task,
            metric,
            precision,
            dataset_name,
            max_iterations=1,
            ensemble_nbest=ensemble_nbest,
            ensemble_size=ensemble_size)
        self._proc_ensemble.main()

    def _get_ensemble_process(self,
                              time_left_for_ensembles,
                              task=None,
                              metric=None,
                              precision=None,
                              dataset_name=None,
                              max_iterations=-1,
                              ensemble_nbest=None,
                              ensemble_size=None):

        if task is None:
            task = self._task
        if metric is None:
            metric = self._metric
        if precision is None:
            precision = self.precision
        if dataset_name is None:
            dataset_name = self._dataset_name
        if ensemble_nbest is None:
            ensemble_nbest = self._ensemble_nbest
        if ensemble_size is None:
            ensemble_size = self._ensemble_size

        return EnsembleBuilder(autosklearn_tmp_dir=self._tmp_dir,
                               dataset_name=dataset_name,
                               task_type=task,
                               metric=metric,
                               limit=time_left_for_ensembles,
                               output_dir=self._output_dir,
                               ensemble_size=ensemble_size,
                               ensemble_nbest=ensemble_nbest,
                               seed=self._seed,
                               shared_mode=self._shared_mode,
                               precision=precision,
                               max_iterations=max_iterations)

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(
                identifiers)
        else:
            self.models_ = self._backend.load_all_models(seed)

        if len(self.models_) == 0:
            raise ValueError('No models fitted!')

    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict_proba(X)
        return calculate_score(y,
                               prediction,
                               self._task,
                               self._metric,
                               self._label_num,
                               logger=self._logger)

    def show_models(self):

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.

        :param X:
        :param y:
        :return:

        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def _create_search_space(self,
                             tmp_dir,
                             backend,
                             datamanager,
                             include_estimators=None,
                             include_preprocessors=None):
        task_name = 'CreateConfigSpace'

        self._stopwatch.start_task(task_name)
        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            include_preprocessors=include_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')
        self._stopwatch.stop_task(task_name)

        return configuration_space, configspace_path

    def configuration_space_created_hook(self, datamanager,
                                         configuration_space):
        return configuration_space

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" % self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                         self._tmp_dir)
                    pass
                else:
                    print("Could not delete tmp dir: %s" % self._tmp_dir)
Example #13
    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI'):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self._max_iter_smac = max_iter_smac
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        self._backend = Backend(self._output_dir, self._tmp_dir)
Example #14
class AbstractEvaluator(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self,
                 Datamanager,
                 configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_dir=None,
                 output_y_test=False,
                 num_run=None):

        self.starttime = time.time()

        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        if output_dir is None:
            self.output_dir = os.getcwd()
        else:
            self.output_dir = output_dir

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = SimpleRegressionPipeline
            self.predict_function = self.predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = SimpleClassificationPipeline
            self.predict_function = self.predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)

    @abc.abstractmethod
    def fit(self):
        pass

    @abc.abstractmethod
    def predict(self):
        pass

    # This function does everything necessary after the fitting is done:
    #        predicting
    #        saving the files for the ensembles_statistics
    #        generate output for SMAC
    # We use it as the signal handler so we can recycle the code for the
    # normal usecase and when the runsolver kills us here :)
    def finish_up(self):
        try:
            self.duration = time.time() - self.starttime
            result, additional_run_info = self.file_output()
            if self.configuration is not None:
                print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
                      ('SAT', abs(self.duration), result, self.seed,
                       additional_run_info))
        except Exception as e:
            self.duration = time.time() - self.starttime

            print(traceback.format_exc())
            print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
                  ('TIMEOUT', abs(self.duration), 1.0, self.seed,
                   'No results were produced! Error is %s' % str(e)))

    def file_output(self):
        seed = os.environ.get('AUTOSKLEARN_SEED')

        errs, Y_optimization_pred, Y_valid_pred, Y_test_pred = self.predict()

        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return 2, "Targets %s and prediction %s don't have the same " \
            "length. Probably training didn't finish" % (
                self.Y_optimization.shape, Y_optimization_pred.shape)

        num_run = str(self.num_run).zfill(5)

        if os.path.exists(self.backend.get_model_dir()):
            self.backend.save_model(self.model, self.num_run, seed)

        if self.output_y_test:
            try:
                os.makedirs(self.output_dir)
            except OSError:
                pass
            self.backend.save_targets_ensemble(self.Y_optimization)

        self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble',
                                             seed, num_run)

        if Y_valid_pred is not None:
            self.backend.save_predictions_as_npy(Y_valid_pred, 'valid', seed,
                                                 num_run)

        if Y_test_pred is not None:
            self.backend.save_predictions_as_npy(Y_test_pred, 'test', seed,
                                                 num_run)

        self.duration = time.time() - self.starttime
        err = errs[self.D.info['metric']]
        additional_run_info = ';'.join([
            '%s: %s' % (METRIC_TO_STRING[metric]
                        if metric in METRIC_TO_STRING else metric, value)
            for metric, value in errs.items()
        ])
        additional_run_info += ';' + 'duration: ' + str(self.duration)
        additional_run_info += ';' + 'num_run:' + num_run
        return err, additional_run_info

    def predict_proba(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict_proba(X, batch_size=1000)

        if task_type == MULTILABEL_CLASSIFICATION:
            Y_pred = np.hstack([
                Y_pred[i][:, -1].reshape((-1, 1)) for i in range(len(Y_pred))
            ])

        elif task_type == BINARY_CLASSIFICATION:
            if len(Y_pred.shape) != 1:
                Y_pred = Y_pred[:, 1].reshape(-1, 1)

        elif task_type == MULTICLASS_CLASSIFICATION:
            pass

        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred

    def predict_regression(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred

    def _ensure_prediction_array_sizes(self, prediction, Y_train):
        num_classes = self.D.info['label_num']

        if self.task_type == MULTICLASS_CLASSIFICATION and \
                prediction.shape[1] < num_classes:
            classes = list(np.unique(self.D.data['Y_train']))
            if num_classes == prediction.shape[1]:
                return prediction

            if Y_train is not None:
                classes = list(np.unique(Y_train))

            mapping = dict()
            for class_number in range(num_classes):
                if class_number in classes:
                    index = classes.index(class_number)
                    mapping[index] = class_number
            new_predictions = np.zeros((prediction.shape[0], num_classes))
            for index in mapping:
                class_index = mapping[index]
                new_predictions[:, class_index] = prediction[:, index]

            return new_predictions

        return prediction
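The _ensure_prediction_array_sizes method above pads probability columns for classes that were missing from the training fold. A standalone NumPy reproduction of that mapping (not the method itself), assuming a three-class task where the fold only contained classes 0 and 2:

import numpy as np

prediction = np.array([[0.7, 0.3],
                       [0.2, 0.8]])  # columns correspond to classes [0, 2]
classes = [0, 2]                     # classes observed in Y_train
num_classes = 3                      # D.info['label_num']

new_predictions = np.zeros((prediction.shape[0], num_classes))
for index, class_number in enumerate(classes):
    # Copy each predicted column to the position of its true class label;
    # the unseen class 1 keeps a zero-probability column.
    new_predictions[:, class_number] = prediction[:, index]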
Example #15
    def main(self):

        watch = StopWatch()
        watch.start_task('ensemble_builder')

        used_time = 0
        time_iter = 0
        index_run = 0
        num_iteration = 0
        current_num_models = 0
        last_hash = None
        current_hash = None

        backend = Backend(self.output_dir, self.autosklearn_tmp_dir)
        dir_ensemble = os.path.join(self.autosklearn_tmp_dir,
                                    '.auto-sklearn',
                                    'predictions_ensemble')
        dir_valid = os.path.join(self.autosklearn_tmp_dir,
                                 '.auto-sklearn',
                                 'predictions_valid')
        dir_test = os.path.join(self.autosklearn_tmp_dir,
                                '.auto-sklearn',
                                'predictions_test')
        paths_ = [dir_ensemble, dir_valid, dir_test]

        dir_ensemble_list_mtimes = []

        self.logger.debug('Starting main loop with %f seconds and %d iterations '
                          'left.' % (self.limit - used_time, num_iteration))
        while used_time < self.limit or (self.max_iterations > 0 and
                                         self.max_iterations >= num_iteration):
            num_iteration += 1
            self.logger.debug('Time left: %f', self.limit - used_time)
            self.logger.debug('Time last ensemble building: %f', time_iter)

            # Reload the ensemble targets every iteration, important, because cv may
            # update the ensemble targets in the course of running auto-sklearn
            # TODO update cv in order to not need this any more!
            targets_ensemble = backend.load_targets_ensemble()

            # Load the predictions from the models
            exists = [os.path.isdir(dir_) for dir_ in paths_]
            if not exists[0]:  # all(exists):
                self.logger.debug('Prediction directory %s does not exist!' %
                              dir_ensemble)
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            if self.shared_mode is False:
                dir_ensemble_list = sorted(glob.glob(os.path.join(
                    dir_ensemble, 'predictions_ensemble_%s_*.npy' % self.seed)))
                if exists[1]:
                    dir_valid_list = sorted(glob.glob(os.path.join(
                        dir_valid, 'predictions_valid_%s_*.npy' % self.seed)))
                else:
                    dir_valid_list = []
                if exists[2]:
                    dir_test_list = sorted(glob.glob(os.path.join(
                        dir_test, 'predictions_test_%s_*.npy' % self.seed)))
                else:
                    dir_test_list = []
            else:
                dir_ensemble_list = sorted(os.listdir(dir_ensemble))
                dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else []
                dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else []

            # Check the modification times because predictions can be updated
            # over time!
            old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
            dir_ensemble_list_mtimes = []

            for dir_ensemble_file in dir_ensemble_list:
                if dir_ensemble_file.endswith("/"):
                    dir_ensemble_file = dir_ensemble_file[:-1]
                basename = os.path.basename(dir_ensemble_file)
                dir_ensemble_file = os.path.join(dir_ensemble, basename)
                mtime = os.path.getmtime(dir_ensemble_file)
                dir_ensemble_list_mtimes.append(mtime)

            if len(dir_ensemble_list) == 0:
                self.logger.debug('Directories are empty')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            if len(dir_ensemble_list) <= current_num_models and \
                    old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
                self.logger.debug('Nothing has changed since the last time')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            watch.start_task('index_run' + str(index_run))
            watch.start_task('ensemble_iter_' + str(num_iteration))

            # List of num_runs (which are in the filename) which will be included
            #  later
            include_num_runs = []
            backup_num_runs = []
            model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
            if self.ensemble_nbest is not None:
                # Keeps track of the single scores of each model in our ensemble
                scores_nbest = []
                # The indices of the model that are currently in our ensemble
                indices_nbest = []
                # The names of the models
                model_names = []

            model_names_to_scores = dict()

            model_idx = 0
            for model_name in dir_ensemble_list:
                if model_name.endswith("/"):
                    model_name = model_name[:-1]
                basename = os.path.basename(model_name)

                try:
                    if self.precision is "16":
                        predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float16)
                    elif self.precision is "32":
                        predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float32)
                    elif self.precision is "64":
                        predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float64)
                    else:
                        predictions = np.load(os.path.join(dir_ensemble, basename))

                    score = calculate_score(targets_ensemble, predictions,
                                            self.task_type, self.metric,
                                            predictions.shape[1])

                except Exception as e:
                    self.logger.warning('Error loading %s: %s', basename, e)
                    score = -1

                model_names_to_scores[model_name] = score
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))

                if self.ensemble_nbest is not None:
                    if score <= 0.001:
                        self.logger.error('Model only predicts at random: ' +
                                      model_name + ' has score: ' + str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    # If we have less models in our ensemble than ensemble_nbest add
                    # the current model if it is better than random
                    elif len(scores_nbest) < self.ensemble_nbest:
                        scores_nbest.append(score)
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        model_names.append(model_name)
                    else:
                        # Take the worst performing model in our ensemble so far
                        idx = np.argmin(np.array([scores_nbest]))

                        # If the current model is better than the worst model in
                        # our ensemble replace it by the current model
                        if scores_nbest[idx] < score:
                            self.logger.debug('Worst model in our ensemble: %s with '
                                          'score %f will be replaced by model %s '
                                          'with score %f', model_names[idx],
                                          scores_nbest[idx], model_name, score)
                            # Exclude the old model
                            del scores_nbest[idx]
                            scores_nbest.append(score)
                            del include_num_runs[idx]
                            del indices_nbest[idx]
                            indices_nbest.append(model_idx)
                            include_num_runs.append((automl_seed, num_run))
                            del model_names[idx]
                            model_names.append(model_name)

                        # Otherwise exclude the current model from the ensemble
                        else:
                            # include_num_runs.append(True)
                            pass

                else:
                    # Load all predictions that are better than random
                    if score <= 0.001:
                        # include_num_runs.append(True)
                        self.logger.error('Model only predicts at random: ' +
                                      model_name + ' has score: ' + str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    else:
                        include_num_runs.append((automl_seed, num_run))

                model_idx += 1

            # If there is no model better than random guessing, we have to use
            # all models which do random guessing
            if len(include_num_runs) == 0:
                include_num_runs = backup_num_runs

            indices_to_model_names = dict()
            indices_to_run_num = dict()
            for i, model_name in enumerate(dir_ensemble_list):
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))
                if (automl_seed, num_run) in include_num_runs:
                    num_indices = len(indices_to_model_names)
                    indices_to_model_names[num_indices] = model_name
                    indices_to_run_num[num_indices] = (automl_seed, num_run)

            try:
                all_predictions_train, all_predictions_valid, all_predictions_test =\
                    self.get_all_predictions(dir_ensemble, dir_ensemble_list,
                                             dir_valid, dir_valid_list,
                                             dir_test, dir_test_list,
                                             include_num_runs,
                                             model_and_automl_re,
                                             self.precision)
            except IOError:
                self.logger.error('Could not load the predictions.')
                continue

            if len(include_num_runs) == 0:
                self.logger.error('All models do just random guessing')
                time.sleep(2)
                continue

            else:
                ensemble = EnsembleSelection(ensemble_size=self.ensemble_size,
                                             task_type=self.task_type,
                                             metric=self.metric)

                try:
                    ensemble.fit(all_predictions_train, targets_ensemble,
                                 include_num_runs)
                    self.logger.info(ensemble)

                except ValueError as e:
                    self.logger.error('Caught ValueError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                except IndexError as e:
                    self.logger.error('Caught IndexError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                except Exception as e:
                    self.logger.error('Caught error! %s', str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue

                # Output the score
                self.logger.info('Training performance: %f' % ensemble.train_score_)

                self.logger.info('Building the ensemble took %f seconds' %
                            watch.wall_elapsed('ensemble_iter_' + str(num_iteration)))

            # Set this variable here to avoid re-running the ensemble builder
            # every two seconds in case the ensemble did not change
            current_num_models = len(dir_ensemble_list)

            ensemble_predictions = ensemble.predict(all_predictions_train)
            if sys.version_info[0] == 2:
                ensemble_predictions.flags.writeable = False
                current_hash = hash(ensemble_predictions.data)
            else:
                current_hash = hash(ensemble_predictions.data.tobytes())

            # Only output a new ensemble and new predictions if the output of the
            # ensemble would actually change!
            # TODO this is not safe (hash collisions are possible, and we only
            # test the ensemble predictions, not the ensemble itself); implement
            # a hash function for each possible ensemble builder.
            if last_hash is not None:
                if current_hash == last_hash:
                    self.logger.info('Ensemble output did not change.')
                    time.sleep(2)
                    continue
                else:
                    last_hash = current_hash
            else:
                last_hash = current_hash

            # Save the ensemble for later use in the main auto-sklearn module!
            backend.save_ensemble(ensemble, index_run, self.seed)

            # Save predictions for valid and test data set
            if len(dir_valid_list) == len(dir_ensemble_list):
                all_predictions_valid = np.array(all_predictions_valid)
                ensemble_predictions_valid = ensemble.predict(all_predictions_valid)
                if self.task_type == BINARY_CLASSIFICATION:
                    ensemble_predictions_valid = ensemble_predictions_valid[:, 1]
                if self.low_precision:
                    if self.task_type in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION]:
                        ensemble_predictions_valid[ensemble_predictions_valid < 1e-4] = 0.
                    if self.metric in [BAC_METRIC, F1_METRIC]:
                        bin_array = np.zeros(ensemble_predictions_valid.shape, dtype=np.int32)
                        if (self.task_type != MULTICLASS_CLASSIFICATION) or (
                            ensemble_predictions_valid.shape[1] == 1):
                            bin_array[ensemble_predictions_valid >= 0.5] = 1
                        else:
                            sample_num = ensemble_predictions_valid.shape[0]
                            for i in range(sample_num):
                                j = np.argmax(ensemble_predictions_valid[i, :])
                                bin_array[i, j] = 1
                        ensemble_predictions_valid = bin_array
                    if self.task_type in CLASSIFICATION_TASKS:
                        if ensemble_predictions_valid.size < (20000 * 20):
                            precision = 3
                        else:
                            precision = 2
                    else:
                        if ensemble_predictions_valid.size > 1000000:
                            precision = 4
                        else:
                            # File size maximally 2.1MB
                            precision = 6

                backend.save_predictions_as_txt(ensemble_predictions_valid,
                                                'valid', index_run, prefix=self.dataset_name,
                                                precision=precision)
            else:
                self.logger.info('Could not find as many validation set predictions (%d) '
                                 'as ensemble predictions (%d)!',
                                 len(dir_valid_list), len(dir_ensemble_list))

            del all_predictions_valid

            if len(dir_test_list) == len(dir_ensemble_list):
                all_predictions_test = np.array(all_predictions_test)
                ensemble_predictions_test = ensemble.predict(all_predictions_test)
                if self.task_type == BINARY_CLASSIFICATION:
                    ensemble_predictions_test = ensemble_predictions_test[:, 1]
                if self.low_precision:
                    if self.task_type in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION]:
                        ensemble_predictions_test[ensemble_predictions_test < 1e-4] = 0.
                    if self.metric in [BAC_METRIC, F1_METRIC]:
                        bin_array = np.zeros(ensemble_predictions_test.shape,
                                             dtype=np.int32)
                        if (self.task_type != MULTICLASS_CLASSIFICATION) or (
                                    ensemble_predictions_test.shape[1] == 1):
                            bin_array[ensemble_predictions_test >= 0.5] = 1
                        else:
                            sample_num = ensemble_predictions_test.shape[0]
                            for i in range(sample_num):
                                j = np.argmax(ensemble_predictions_test[i, :])
                                bin_array[i, j] = 1
                        ensemble_predictions_test = bin_array
                    if self.task_type in CLASSIFICATION_TASKS:
                        if ensemble_predictions_test.size < (20000 * 20):
                            precision = 3
                        else:
                            precision = 2
                    else:
                        if ensemble_predictions_test.size > 1000000:
                            precision = 4
                        else:
                            precision = 6

                backend.save_predictions_as_txt(ensemble_predictions_test,
                                                'test', index_run, prefix=self.dataset_name,
                                                precision=precision)
            else:
                self.logger.info('Could not find as many test set predictions (%d) as '
                             'ensemble predictions (%d)!',
                            len(dir_test_list), len(dir_ensemble_list))

            del all_predictions_test

            current_num_models = len(dir_ensemble_list)
            watch.stop_task('index_run' + str(index_run))
            time_iter = watch.get_wall_dur('index_run' + str(index_run))
            used_time = watch.wall_elapsed('ensemble_builder')
            index_run += 1
        return
def main(autosklearn_tmp_dir,
         basename,
         task_type,
         metric,
         limit,
         output_dir,
         ensemble_size=None,
         ensemble_nbest=None,
         seed=1,
         shared_mode=False,
         max_iterations=-1,
         precision="32"):

    watch = StopWatch()
    watch.start_task('ensemble_builder')

    used_time = 0
    time_iter = 0
    index_run = 0
    num_iteration = 0
    current_num_models = 0

    backend = Backend(output_dir, autosklearn_tmp_dir)
    dir_ensemble = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                                'predictions_ensemble')
    dir_valid = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                             'predictions_valid')
    dir_test = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                            'predictions_test')
    paths_ = [dir_ensemble, dir_valid, dir_test]

    targets_ensemble = backend.load_targets_ensemble()

    dir_ensemble_list_mtimes = []

    while used_time < limit or (max_iterations > 0 and max_iterations >= num_iteration):
        num_iteration += 1
        logger.debug('Time left: %f', limit - used_time)
        logger.debug('Time last iteration: %f', time_iter)

        # Load the predictions from the models
        exists = [os.path.isdir(dir_) for dir_ in paths_]
        if not exists[0]:  # all(exists):
            logger.debug('Prediction directory %s does not exist!' %
                          dir_ensemble)
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if shared_mode is False:
            dir_ensemble_list = sorted(glob.glob(os.path.join(
                dir_ensemble, 'predictions_ensemble_%s_*.npy' % seed)))
            if exists[1]:
                dir_valid_list = sorted(glob.glob(os.path.join(
                    dir_valid, 'predictions_valid_%s_*.npy' % seed)))
            else:
                dir_valid_list = []
            if exists[2]:
                dir_test_list = sorted(glob.glob(os.path.join(
                    dir_test, 'predictions_test_%s_*.npy' % seed)))
            else:
                dir_test_list = []
        else:
            dir_ensemble_list = sorted(os.listdir(dir_ensemble))
            dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else []
            dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else []

        # Check the modification times because predictions can be updated
        # over time!
        old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
        dir_ensemble_list_mtimes = []

        for dir_ensemble_file in dir_ensemble_list:
            dir_ensemble_file = os.path.join(dir_ensemble, dir_ensemble_file)
            mtime = os.path.getmtime(dir_ensemble_file)
            dir_ensemble_list_mtimes.append(mtime)

        if len(dir_ensemble_list) == 0:
            logger.debug('Directories are empty')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if len(dir_ensemble_list) <= current_num_models and \
                old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
            logger.debug('Nothing has changed since the last time')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        watch.start_task('ensemble_iter_' + str(index_run))

        # List of num_runs (which are in the filename) which will be included
        #  later
        include_num_runs = []
        backup_num_runs = []
        model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
        if ensemble_nbest is not None:
            # Keeps track of the single scores of each model in our ensemble
            scores_nbest = []
            # The indices of the model that are currently in our ensemble
            indices_nbest = []
            # The names of the models
            model_names = []

        model_names_to_scores = dict()

        model_idx = 0
        for model_name in dir_ensemble_list:
            if precision == "16":
                predictions = np.load(os.path.join(dir_ensemble, model_name)).astype(dtype=np.float16)
            elif precision == "32":
                predictions = np.load(os.path.join(dir_ensemble, model_name)).astype(dtype=np.float32)
            elif precision == "64":
                predictions = np.load(os.path.join(dir_ensemble, model_name)).astype(dtype=np.float64)
            else:
                predictions = np.load(os.path.join(dir_ensemble, model_name))
            score = calculate_score(targets_ensemble, predictions,
                                    task_type, metric,
                                    predictions.shape[1])
            model_names_to_scores[model_name] = score
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))

            if ensemble_nbest is not None:
                if score <= 0.001:
                    # include_num_runs.append(True)
                    logger.error('Model only predicts at random: ' +
                                  model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                # If we have less models in our ensemble than ensemble_nbest add
                # the current model if it is better than random
                elif len(scores_nbest) < ensemble_nbest:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    include_num_runs.append((automl_seed, num_run))
                    model_names.append(model_name)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(np.array([scores_nbest]))

                    # If the current model is better than the worst model in
                    # our ensemble replace it by the current model
                    if scores_nbest[idx] < score:
                        logger.debug('Worst model in our ensemble: %s with '
                                      'score %f will be replaced by model %s '
                                      'with score %f', model_names[idx],
                                      scores_nbest[idx], model_name, score)
                        # Exclude the old model
                        del scores_nbest[idx]
                        scores_nbest.append(score)
                        del include_num_runs[idx]
                        del indices_nbest[idx]
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        del model_names[idx]
                        model_names.append(model_name)

                    # Otherwise exclude the current model from the ensemble
                    else:
                        # include_num_runs.append(True)
                        pass

            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    # include_num_runs.append(True)
                    logger.error('Model only predicts at random: ' +
                                  model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                else:
                    include_num_runs.append((automl_seed, num_run))

            model_idx += 1

        # If there is no model better than random guessing, we have to use
        # all models which do random guessing
        if len(include_num_runs) == 0:
            include_num_runs = backup_num_runs

        indices_to_model_names = dict()
        indices_to_run_num = dict()
        for i, model_name in enumerate(dir_ensemble_list):
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))
            if (automl_seed, num_run) in include_num_runs:
                num_indices = len(indices_to_model_names)
                indices_to_model_names[num_indices] = model_name
                indices_to_run_num[num_indices] = (automl_seed, num_run)

        # logging.info("Indices to model names:")
        # logging.info(indices_to_model_names)

        # for i, item in enumerate(sorted(model_names_to_scores.items(),
        #                                key=lambda t: t[1])):
        #    logging.info("%d: %s", i, item)

        include_num_runs = set(include_num_runs)

        all_predictions_train = get_predictions(dir_ensemble,
                                                dir_ensemble_list,
                                                include_num_runs,
                                                model_and_automl_re,
                                                precision)

#        if len(all_predictions_train) == len(all_predictions_test) == len(
#                all_predictions_valid) == 0:
        if len(include_num_runs) == 0:
            logger.error('All models do just random guessing')
            time.sleep(2)
            continue

        else:
            try:
                indices, trajectory = ensemble_selection(
                    np.array(all_predictions_train), targets_ensemble,
                    ensemble_size, task_type, metric)

                logger.info('Trajectory and indices!')
                logger.info(trajectory)
                logger.info(indices)

            except ValueError as e:
                logger.error('Caught ValueError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except Exception as e:
                logger.error('Caught error! %s', str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue

            # Output the score
            logger.info('Training performance: %f' % trajectory[-1])

            # Print the ensemble members:
            ensemble_members_run_numbers = dict()
            ensemble_members = Counter(indices).most_common()
            ensemble_members_string = 'Ensemble members:\n'
            logger.info(ensemble_members)
            for ensemble_member in ensemble_members:
                weight = float(ensemble_member[1]) / len(indices)
                ensemble_members_string += \
                    ('    %s; weight: %10f; performance: %10f\n' %
                     (indices_to_model_names[ensemble_member[0]],
                      weight,
                      model_names_to_scores[
                         indices_to_model_names[ensemble_member[0]]]))

                ensemble_members_run_numbers[
                    indices_to_run_num[
                        ensemble_member[0]]] = weight
            logger.info(ensemble_members_string)

        # Save the ensemble indices for later use!
        backend.save_ensemble_indices_weights(ensemble_members_run_numbers,
                                              index_run, seed)

        all_predictions_valid = get_predictions(dir_valid,
                                                dir_valid_list,
                                                include_num_runs,
                                                model_and_automl_re,
                                                precision)

        # Save predictions for valid and test data set
        if len(dir_valid_list) == len(dir_ensemble_list):
            all_predictions_valid = np.array(all_predictions_valid)
            ensemble_predictions_valid = np.mean(
                all_predictions_valid[indices.astype(int)], axis=0)
            backend.save_predictions_as_txt(ensemble_predictions_valid,
                                            'valid', index_run, prefix=basename)
        else:
            logger.info('Could not find as many validation set predictions (%d) '
                        'as ensemble predictions (%d)!',
                        len(dir_valid_list), len(dir_ensemble_list))

        del all_predictions_valid
        all_predictions_test = get_predictions(dir_test,
                                               dir_test_list,
                                               include_num_runs,
                                               model_and_automl_re,
                                               precision)

        if len(dir_test_list) == len(dir_ensemble_list):
            all_predictions_test = np.array(all_predictions_test)
            ensemble_predictions_test = np.mean(
                all_predictions_test[indices.astype(int)], axis=0)
            backend.save_predictions_as_txt(ensemble_predictions_test,
                                            'test', index_run, prefix=basename)
        else:
            logger.info('Could not find as many test set predictions (%d) as '
                         'ensemble predictions (%d)!',
                        len(dir_test_list), len(dir_ensemble_list))

        del all_predictions_test

        current_num_models = len(dir_ensemble_list)
        watch.stop_task('ensemble_iter_' + str(index_run))
        time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run))
        used_time = watch.wall_elapsed('ensemble_builder')
        index_run += 1
    return
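
The main() function above repeatedly scans the prediction directories, scores every model against the ensemble targets, runs ensemble selection, and writes out the resulting validation/test predictions until the time or iteration limit is reached. A minimal invocation sketch follows; the directory paths, dataset name, task/metric constants and budgets are illustrative assumptions rather than values taken from the original source.

if __name__ == '__main__':
    # Hypothetical values -- adjust paths and budgets to your own setup.
    main(autosklearn_tmp_dir='/tmp/autosklearn_tmp',
         basename='my_dataset',
         task_type=BINARY_CLASSIFICATION,
         metric=BAC_METRIC,
         limit=3600,                 # run the builder for at most one hour
         output_dir='/tmp/autosklearn_out',
         ensemble_size=50,
         ensemble_nbest=50,
         seed=1,
         shared_mode=False,
         max_iterations=-1,          # no iteration cap, stop on the time limit
         precision="32")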
Example #17
0
class AutoML(BaseEstimator, multiprocessing.Process):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self.models_ = None
        self.ensemble_indices_ = None

        self._debug_mode = debug_mode
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=parser)
        self._stopwatch.start_task(datamanager.name)

        logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self.start()

    def start(self):
        if self._datamanager is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._datamanager is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, bool)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain bools.')

        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=float(self._ml_memory_limit) / 3)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager):
        autosklearn.cli.base_interface.main(datamanager,
                                            self._resampling_strategy,
                                            None,
                                            None,
                                            mode_args=self._resampling_strategy_arguments)

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_indices_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        set_auto_seed(self._seed)

        # == Pickle the data manager, here, because no more global
        # OneHotEncoding
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(
            datamanager.data['X_train'],
            datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(
                self._dataset_name,
                self._time_for_task,
                time_for_load_data,
                self._logger)

        # == Perform dummy predictions
        self._do_dummy_prediction(datamanager)

        # = Create a searchspace
        # Do this before One Hot Encoding to make sure that it creates a
        # search space for a dense classifier even if one hot encoding would
        # make it sparse (tradeoff; if one hot encoding would make it sparse,
        #  densifier and truncatedSVD would probably lead to a MemoryError,
        # like this we can't use some of the preprocessing methods in case
        # the data became sparse)
        self.configuration_space, configspace_path = _create_search_space(
            self._tmp_dir,
            datamanager.info,
            self._backend,
            self._stopwatch,
            self._logger,
            self._include_estimators,
            self._include_preprocessors)
        self.configuration_space_created_hook(datamanager)

        # == Calculate metafeatures
        meta_features = _calculate_metafeatures(
            data_feat_type=datamanager.feat_type,
            data_info_task=datamanager.info['task'],
            x_train=datamanager.data['X_train'],
            y_train=datamanager.data['Y_train'],
            basename=self._dataset_name,
            watcher=self._stopwatch,
            metalearning_cnt=self._initial_configurations_via_metalearning,
            logger=self._logger)

        self._stopwatch.start_task('OneHot')
        datamanager.perform1HotEncoding()
        self._stopwatch.stop_task('OneHot')

        if meta_features is None:
            initial_configurations = []
        elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION,
                                          BINARY_CLASSIFICATION,
                                          MULTILABEL_CLASSIFICATION]:

            meta_features_encoded = _calculate_metafeatures_encoded(
                self._dataset_name,
                datamanager.data['X_train'],
                datamanager.data['Y_train'],
                self._stopwatch,
                self._logger)

            self._logger.debug(meta_features.__repr__(verbosity=2))
            self._logger.debug(meta_features_encoded.__repr__(verbosity=2))

            initial_configurations = _get_initial_configuration(
                meta_features,
                meta_features_encoded,
                self._dataset_name,
                self._metric,
                self.configuration_space,
                self._task,
                self._metadata_directory,
                self._initial_configurations_via_metalearning,
                datamanager.info[
                    'is_sparse'],
                self._stopwatch,
                self._logger)

            _print_debug_info_of_init_configuration(
                initial_configurations,
                self._dataset_name,
                self._time_for_task,
                self._logger,
                self._stopwatch)

        else:
            initial_configurations = []
            self._logger.warning('Metafeatures encoded not calculated')

        # == Construct a static default configuration per task type
        if (datamanager.info["task"] == BINARY_CLASSIFICATION) or \
            (datamanager.info["task"] == MULTICLASS_CLASSIFICATION):
            config = {'balancing:strategy': 'weighting',
                      'classifier:__choice__': 'sgd',
                      'classifier:sgd:loss': 'hinge',
                      'classifier:sgd:penalty': 'l2',
                      'classifier:sgd:alpha': 0.0001,
                      'classifier:sgd:fit_intercept': 'True',
                      'classifier:sgd:n_iter': 5,
                      'classifier:sgd:learning_rate': 'optimal',
                      'classifier:sgd:eta0': 0.01,
                      'classifier:sgd:average': 'True',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'min/max'}
        elif datamanager.info["task"] == MULTILABEL_CLASSIFICATION:
            config = {'classifier:__choice__': 'adaboost',
                      'classifier:adaboost:algorithm': 'SAMME.R',
                      'classifier:adaboost:learning_rate': 1.0,
                      'classifier:adaboost:max_depth': 1,
                      'classifier:adaboost:n_estimators': 50,
                      'balancing:strategy': 'weighting',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'none'}
        else:
            config = None
            self._logger.info("Tasktype unknown: %s" %
                              TASK_TYPES_TO_STRING[datamanager.info["task"]])

        if config is not None:
            configuration = Configuration(self.configuration_space, config)
            config_string = convert_conf2smac_string(configuration)
            initial_configurations = [config_string] + initial_configurations

        # == RUN SMAC
        proc_smac = run_smac(tmp_dir=self._tmp_dir, basename=self._dataset_name,
                             time_for_task=self._time_for_task,
                             ml_memory_limit=self._ml_memory_limit,
                             data_manager_path=data_manager_path,
                             configspace_path=configspace_path,
                             initial_configurations=initial_configurations,
                             per_run_time_limit=self._per_run_time_limit,
                             watcher=self._stopwatch, backend=self._backend,
                             seed=self._seed,
                             resampling_strategy=self._resampling_strategy,
                             resampling_strategy_arguments=self._resampling_strategy_arguments,
                             shared_mode=self._shared_mode)

        # == RUN ensemble builder
        proc_ensembles = self.run_ensemble_builder()

        procs = []

        if proc_smac is not None:
            procs.append(proc_smac)
        if proc_ensembles is not None:
            procs.append(proc_ensembles)

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path, procs])
        else:
            for proc in procs:
                proc.wait()

        # Delete AutoSklearn environment variable
        del_auto_seed()

        # Free the data manager in case it is still attached
        try:
            del self._datamanager
        except Exception:
            pass

        if self._queue is None:
            self._load_models()

        return self

    def run_ensemble_builder(self,
                             time_left_for_ensembles=None,
                             max_iterations=None,
                             ensemble_size=None):
        if self._ensemble_size > 0 or ensemble_size is not None:
            task_name = 'runEnsemble'
            self._stopwatch.start_task(task_name)

            if time_left_for_ensembles is None:
                time_left_for_ensembles = max(0,
                    self._time_for_task - self._stopwatch.wall_elapsed(
                        self._dataset_name))
            if max_iterations is None:
                max_iterations = -1
            if ensemble_size is None:
                ensemble_size = self._ensemble_size

            # It can happen that run_ensemble_builder is called without
            # calling fit.
            if self._logger:
                self._logger.info(
                    'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
            proc_ensembles = submit_process.run_ensemble_builder(
                tmp_dir=self._tmp_dir,
                dataset_name=self._dataset_name,
                task_type=self._task,
                metric=self._metric,
                limit=time_left_for_ensembles,
                output_dir=self._output_dir,
                ensemble_size=ensemble_size,
                ensemble_nbest=self._ensemble_nbest,
                seed=self._seed,
                shared_mode=self._shared_mode,
                max_iterations=max_iterations,
                precision=self.precision
            )
            self._stopwatch.stop_task(task_name)
            return proc_ensembles
        else:
            self._logger.info('Not starting ensemble script due to ensemble '
                             'size 0.')
            return None

    def predict(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self._resampling_strategy not in ['holdout',
                                             'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or len(
                self.ensemble_indices_) == 0:
            self._load_models()

        predictions = []
        for identifier in self.models_:
            if identifier not in self.ensemble_indices_:
                continue

            weight = self.ensemble_indices_[identifier]
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))
            predictions.append(prediction * weight)

        if len(predictions) == 0:
            raise ValueError('Something went wrong generating the predictions. '
                             'The ensemble should consist of the following '
                             'models: %s, the following models were loaded: '
                             '%s' % (str(list(self.ensemble_indices_.keys())),
                                     str(list(self.models_.keys()))))

        predictions = np.sum(np.array(predictions), axis=0)
        return predictions

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.models_ = self._backend.load_all_models(seed)
        if len(self.models_) == 0:
            raise ValueError('No models fitted!')

        self.ensemble_indices_ = self._backend.load_ensemble_indices_weights(
            seed)

    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict(X)
        if self._task == BINARY_CLASSIFICATION:
            prediction = prediction[:, 1].reshape((-1, 1))
        return calculate_score(y, prediction, self._task,
                               self._metric, self._label_num,
                               logger=self._logger)

    def show_models(self):
        if self.models_ is None or len(self.models_) == 0 or len(
                self.ensemble_indices_) == 0:
            self._load_models()

        output = []
        sio = six.StringIO()
        for identifier in self.models_:
            if identifier not in self.ensemble_indices_:
                continue

            weight = self.ensemble_indices_[identifier]
            model = self.models_[identifier]
            output.append((weight, model))

        output.sort(reverse=True)

        sio.write("[")
        for weight, model in output:
            sio.write("(%f, %s),\n" % (weight, model))
        sio.write("]")

        return sio.getvalue()

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.

        :param X:
        :param y:
        :return:

        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def configuration_space_created_hook(self, datamanager):
        pass

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" %
                          self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                  self._tmp_dir)
                    pass
                else:
                    print("Could not delete tmp dir: %s" %
                          self._tmp_dir)
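
The AutoML class above couples a SMAC-driven model search with the ensemble builder. A minimal usage sketch under assumed paths, budgets and toy data (none of which come from the original source) could look like this:

import numpy as np

# Hypothetical directories and time budgets; X/y are random toy data.
automl = AutoML(tmp_dir='/tmp/autosklearn_tmp',
                output_dir='/tmp/autosklearn_out',
                time_left_for_this_task=3600,   # overall budget in seconds
                per_run_time_limit=360,         # budget per single model run
                ensemble_size=50,
                ensemble_nbest=50,
                seed=1)

X = np.random.rand(200, 10)
y = np.random.randint(0, 2, size=200)

automl.fit(X, y, task=BINARY_CLASSIFICATION, metric='acc_metric',
           dataset_name='toy_example')
print(automl.show_models())
predictions = automl.predict(X)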
Example #18
0
    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode
        self._backend = Backend(self._output_dir, self._tmp_dir)
Example #19
0
class AutoML(BaseEstimator, multiprocessing.Process):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI'):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self._max_iter_smac = max_iter_smac
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._parser = parser
        self.start()

    def start(self):
        if self._parser is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._parser is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=self._parser)
        self._stopwatch.start_task(datamanager.name)

        self._logger = self._get_logger(datamanager.name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, str)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' % ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    def _get_logger(self, name):
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        return get_logger(logger_name)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager, num_run):

        self._logger.info("Starting to create dummy predictions.")
        time_limit = int(self._time_for_task / 6.)
        memory_limit = int(self._ml_memory_limit)

        _info = eval_with_limits(datamanager, self._tmp_dir, 1,
                                 self._seed, num_run,
                                 self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 1/2.")
        else:
            self._logger.error('Error creating dummy prediction 1/2:%s ',
                               _info[3])

        num_run += 1

        _info = eval_with_limits(datamanager, self._tmp_dir, 2,
                                 self._seed, num_run,
                                 self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 2/2.")
        else:
            self._logger.error('Error creating dummy prediction 2/2 %s',
                               _info[3])

        num_run += 1
        return num_run

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        acquisition_functions = ['EI', 'EIPS']
        if self.acquisition_function not in acquisition_functions:
            raise ValueError('Illegal acquisition %s: Must be one of %s.' %
                             (self.acquisition_function, acquisition_functions))

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(
            datamanager.data['X_train'],
            datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(
                self._dataset_name,
                self._time_for_task,
                time_for_load_data,
                self._logger)

        # == Perform dummy predictions
        num_run = 1
        #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # = Create a searchspace
        # Do this before One Hot Encoding to make sure that it creates a
        # search space for a dense classifier even if one hot encoding would
        # make it sparse (tradeoff; if one hot encoding would make it sparse,
        #  densifier and truncatedSVD would probably lead to a MemoryError,
        # like this we can't use some of the preprocessing methods in case
        # the data became sparse)
        self.configuration_space, configspace_path = self._create_search_space(
            self._tmp_dir,
            self._backend,
            datamanager,
            self._include_estimators,
            self._include_preprocessors)

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        time_left_for_ensembles = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))
        if self._logger:
            self._logger.info(
                'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
        if time_left_for_ensembles <= 0:
            self._logger.warning("Not starting ensemble builder because there "
                                 "is no time left!")
            self._proc_ensemble = None
        else:
            self._proc_ensemble = self._get_ensemble_process(time_left_for_ensembles)
            self._proc_ensemble.start()
        self._stopwatch.stop_task(ensemble_task_name)

        # == RUN SMBO
        # kill the datamanager as it will be re-loaded anyway from sub processes
        try:
            del self._datamanager
        except Exception:
            pass
            
        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))

        if self._logger:
            self._logger.info(
                'Start SMAC with %5.2fsec time left' % time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            self._proc_smac = None
        else:
            self._proc_smac = AutoMLSMBO(config_space=self.configuration_space,
                                         dataset_name=self._dataset_name,
                                         tmp_dir=self._tmp_dir,
                                         output_dir=self._output_dir,
                                         total_walltime_limit=time_left_for_smac,
                                         func_eval_time_limit=self._per_run_time_limit,
                                         memory_limit=self._ml_memory_limit,
                                         data_memory_limit=self._data_memory_limit,
                                         watcher=self._stopwatch,
                                         start_num_run=num_run,
                                         num_metalearning_cfgs=self._initial_configurations_via_metalearning,
                                         config_file=configspace_path,
                                         smac_iters=self._max_iter_smac,
                                         seed=self._seed,
                                         metadata_directory=self._metadata_directory,
                                         resampling_strategy=self._resampling_strategy,
                                         resampling_strategy_args=self._resampling_strategy_arguments,
                                         acquisition_function=self.acquisition_function,
                                         shared_mode=self._shared_mode)
            self._proc_smac.start()

        psutil_procs = []
        procs = []
        if self._proc_smac is not None:
            proc_smac = psutil.Process(self._proc_smac.pid)
            psutil_procs.append(proc_smac)
            procs.append(self._proc_smac)
        if self._proc_ensemble is not None:
            proc_ensemble = psutil.Process(self._proc_ensemble.pid)
            psutil_procs.append(proc_ensemble)
            procs.append(self._proc_ensemble)

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path, psutil_procs])
        else:
            for proc in procs:
                proc.join()

        if self._queue is None:
            self._load_models()

        return self

    def refit(self, X, y):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used in
                # predict method
                model.fit(X.copy(), y.copy())

        self._can_predict = True

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in  \
                        ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        all_predictions = []
        for identifier in self.ensemble_.get_model_identifiers():
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))
            all_predictions.append(prediction)

        if len(all_predictions) == 0:
            raise ValueError('Something went wrong generating the predictions. '
                             'The ensemble should consist of the following '
                             'models: %s, the following models were loaded: '
                             '%s' % (str(list(self.ensemble_indices_.keys())),
                                     str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions
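
    # Illustrative note (added, not part of the original source): the call to
    # self.ensemble_.predict(all_predictions) above typically combines the
    # per-model probability arrays into a single weighted prediction, roughly:
    #
    #   stacked = np.stack(all_predictions)                # (n_models, n_samples, n_classes)
    #   predictions = np.average(stacked, axis=0,
    #                            weights=ensemble_weights)  # hypothetical weights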

    def fit_ensemble(self, task=None, metric=None, precision='32',
                     dataset_name=None, ensemble_nbest=None,
                     ensemble_size=None):
        if self._logger is None:
            self._logger = self._get_logger(dataset_name)

        self._proc_ensemble = self._get_ensemble_process(
            1, task, metric, precision, dataset_name, max_iterations=1,
            ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
        self._proc_ensemble.main()

    def _get_ensemble_process(self, time_left_for_ensembles,
                              task=None, metric=None, precision=None,
                              dataset_name=None, max_iterations=-1,
                              ensemble_nbest=None, ensemble_size=None):

        if task is None:
            task = self._task
        if metric is None:
            metric = self._metric
        if precision is None:
            precision = self.precision
        if dataset_name is None:
            dataset_name = self._dataset_name
        if ensemble_nbest is None:
            ensemble_nbest = self._ensemble_nbest
        if ensemble_size is None:
            ensemble_size = self._ensemble_size

        return EnsembleBuilder(autosklearn_tmp_dir=self._tmp_dir,
                               dataset_name=dataset_name,
                               task_type=task,
                               metric=metric,
                               limit=time_left_for_ensembles,
                               output_dir=self._output_dir,
                               ensemble_size=ensemble_size,
                               ensemble_nbest=ensemble_nbest,
                               seed=self._seed,
                               shared_mode=self._shared_mode,
                               precision=precision,
                               max_iterations=max_iterations)

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(identifiers)
        else:
            self.models_ = self._backend.load_all_models(seed)

        if len(self.models_) == 0:
            raise ValueError('No models fitted!')


    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict_proba(X)
        return calculate_score(y, prediction, self._task,
                               self._metric, self._label_num,
                               logger=self._logger)

    def show_models(self):

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.

        :param X:
        :param y:
        :return:

        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def _create_search_space(self, tmp_dir, backend, datamanager,
                             include_estimators=None,
                             include_preprocessors=None):
        task_name = 'CreateConfigSpace'

        self._stopwatch.start_task(task_name)
        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            include_preprocessors=include_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')
        self._stopwatch.stop_task(task_name)

        return configuration_space, configspace_path

    def configuration_space_created_hook(self, datamanager, configuration_space):
        return configuration_space

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" %
                          self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                         self._tmp_dir)
                else:
                    print("Could not delete tmp dir: %s" %
                          self._tmp_dir)
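
A minimal, hypothetical end-to-end sketch for the class above (directories, time
budgets and the dataset path are placeholders, and X_test/y_test stand for
held-out data; the constructor arguments follow the __init__ shown in Example #22
below):

automl = AutoML(tmp_dir='/tmp/askl_tmp', output_dir='/tmp/askl_out',
                time_left_for_this_task=3600, per_run_time_limit=360)
automl.fit_automl_dataset('/data/my_competition_dataset')   # placeholder path
y_hat = automl.predict(X_test)                              # argmax over predict_proba
print(automl.score(X_test, y_test))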
Example #20
0
class AutoML(BaseEstimator, multiprocessing.Process):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=parser)
        self._stopwatch.start_task(datamanager.name)

        logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self.start()

    def start(self):
        if self._datamanager is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._datamanager is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, str)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' % ft)

        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)
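
    # Illustrative call (added, not part of the original source): feat_type
    # must contain one string per column of X, and only 'categorical' or
    # 'numerical' (case-insensitive) are accepted, e.g. for a 3-column X:
    #
    #   automl.fit(X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric',
    #              feat_type=['categorical', 'numerical', 'numerical'])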

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=float(self._ml_memory_limit) / 3)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager):
        self._logger.info("Starting to create dummy predictions.")
        autosklearn.cli.base_interface.main(datamanager,
                                            self._resampling_strategy,
                                            None,
                                            None,
                                            mode_args=self._resampling_strategy_arguments,
                                            output_dir=self._tmp_dir)
        self._logger.info("Finished creating dummy predictions.")

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        set_auto_seed(self._seed)

        # == Pickle the data manager, here, because no more global
        # OneHotEncoding
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(
            datamanager.data['X_train'],
            datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(
                self._dataset_name,
                self._time_for_task,
                time_for_load_data,
                self._logger)

        # == Perform dummy predictions
        if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
            self._do_dummy_prediction(datamanager)

        # = Create a searchspace
        # Do this before One Hot Encoding to make sure that it creates a
        # search space for a dense classifier even if one hot encoding would
        # make it sparse (tradeoff; if one hot encoding would make it sparse,
        #  densifier and truncatedSVD would probably lead to a MemoryError,
        # like this we can't use some of the preprocessing methods in case
        # the data became sparse)
        self.configuration_space, configspace_path = _create_search_space(
            self._tmp_dir,
            datamanager.info,
            self._backend,
            self._stopwatch,
            self._logger,
            self._include_estimators,
            self._include_preprocessors)
        self.configuration_space_created_hook(datamanager)

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        proc_ensembles = self.run_ensemble_builder()

        # == Calculate metafeatures
        meta_features = _calculate_metafeatures(
            data_feat_type=datamanager.feat_type,
            data_info_task=datamanager.info['task'],
            x_train=datamanager.data['X_train'],
            y_train=datamanager.data['Y_train'],
            basename=self._dataset_name,
            watcher=self._stopwatch,
            metalearning_cnt=self._initial_configurations_via_metalearning,
            logger=self._logger)

        self._stopwatch.start_task('OneHot')
        datamanager.perform1HotEncoding()
        self._stopwatch.stop_task('OneHot')

        if meta_features is None:
            initial_configurations = []
        elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION,
                                          BINARY_CLASSIFICATION,
                                          MULTILABEL_CLASSIFICATION]:

            meta_features_encoded = _calculate_metafeatures_encoded(
                self._dataset_name,
                datamanager.data['X_train'],
                datamanager.data['Y_train'],
                self._stopwatch,
                self._logger)

            self._logger.debug(meta_features.__repr__(verbosity=2))
            self._logger.debug(meta_features_encoded.__repr__(verbosity=2))

            initial_configurations = _get_initial_configuration(
                meta_features,
                meta_features_encoded,
                self._dataset_name,
                self._metric,
                self.configuration_space,
                self._task,
                self._metadata_directory,
                self._initial_configurations_via_metalearning,
                datamanager.info[
                    'is_sparse'],
                self._stopwatch,
                self._logger)

            _print_debug_info_of_init_configuration(
                initial_configurations,
                self._dataset_name,
                self._time_for_task,
                self._logger,
                self._stopwatch)

        else:
            initial_configurations = []
            self._logger.warning('Metafeatures encoded not calculated')

        # == RUN SMAC
        if (datamanager.info["task"] == BINARY_CLASSIFICATION) or \
            (datamanager.info["task"] == MULTICLASS_CLASSIFICATION):
            config = {'balancing:strategy': 'weighting',
                      'classifier:__choice__': 'sgd',
                      'classifier:sgd:loss': 'hinge',
                      'classifier:sgd:penalty': 'l2',
                      'classifier:sgd:alpha': 0.0001,
                      'classifier:sgd:fit_intercept': 'True',
                      'classifier:sgd:n_iter': 5,
                      'classifier:sgd:learning_rate': 'optimal',
                      'classifier:sgd:eta0': 0.01,
                      'classifier:sgd:average': 'True',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'min/max'}
        elif datamanager.info["task"] == MULTILABEL_CLASSIFICATION:
            config = {'classifier:__choice__': 'adaboost',
                      'classifier:adaboost:algorithm': 'SAMME.R',
                      'classifier:adaboost:learning_rate': 1.0,
                      'classifier:adaboost:max_depth': 1,
                      'classifier:adaboost:n_estimators': 50,
                      'balancing:strategy': 'weighting',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'none'}
        else:
            config = None
            self._logger.info("Tasktype unknown: %s" %
                              TASK_TYPES_TO_STRING[datamanager.info["task"]])

        if config is not None:
            try:
                configuration = Configuration(self.configuration_space, config)
                config_string = convert_conf2smac_string(configuration)
                initial_configurations = [config_string] + initial_configurations
            except ValueError:
                pass

        # == RUN SMAC
        proc_smac = run_smac(tmp_dir=self._tmp_dir, basename=self._dataset_name,
                             time_for_task=self._time_for_task,
                             ml_memory_limit=self._ml_memory_limit,
                             data_manager_path=data_manager_path,
                             configspace_path=configspace_path,
                             initial_configurations=initial_configurations,
                             per_run_time_limit=self._per_run_time_limit,
                             watcher=self._stopwatch, backend=self._backend,
                             seed=self._seed,
                             resampling_strategy=self._resampling_strategy,
                             resampling_strategy_arguments=self._resampling_strategy_arguments,
                             shared_mode=self._shared_mode)

        procs = []

        if proc_smac is not None:
            procs.append(proc_smac)
        if proc_ensembles is not None:
            procs.append(proc_ensembles)

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path, procs])
        else:
            for proc in procs:
                proc.wait()

        # Delete AutoSklearn environment variable
        del_auto_seed()

        # In case the data manager is still attached, delete it to free memory
        try:
            del self._datamanager
        except Exception:
            pass

        if self._queue is None:
            self._load_models()

        return self

    def run_ensemble_builder(self,
                             time_left_for_ensembles=None,
                             max_iterations=None,
                             ensemble_size=None):
        if self._ensemble_size > 0 or ensemble_size is not None:
            task_name = 'runEnsemble'
            self._stopwatch.start_task(task_name)

            if time_left_for_ensembles is None:
                time_left_for_ensembles = max(0,
                    self._time_for_task - self._stopwatch.wall_elapsed(
                        self._dataset_name))
            if max_iterations is None:
                max_iterations = -1
            if ensemble_size is None:
                ensemble_size = self._ensemble_size

            # It can happen that run_ensemble_builder is called without
            # calling fit.
            if self._logger:
                self._logger.info(
                    'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
            proc_ensembles = submit_process.run_ensemble_builder(
                tmp_dir=self._tmp_dir,
                dataset_name=self._dataset_name,
                task_type=self._task,
                metric=self._metric,
                limit=time_left_for_ensembles,
                output_dir=self._output_dir,
                ensemble_size=ensemble_size,
                ensemble_nbest=self._ensemble_nbest,
                seed=self._seed,
                shared_mode=self._shared_mode,
                max_iterations=max_iterations,
                precision=self.precision
            )
            self._stopwatch.stop_task(task_name)
            return proc_ensembles
        else:
            self._logger.info('Not starting ensemble script due to ensemble '
                             'size 0.')
            return None

    def refit(self, X, y):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used in
                # predict method
                model.fit(X.copy(), y.copy())

        self._can_predict = True

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in  \
                        ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        all_predictions = []
        for identifier in self.ensemble_.get_model_identifiers():
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))
            all_predictions.append(prediction)

        if len(all_predictions) == 0:
            raise ValueError('Something went wrong generating the predictions. '
                             'The ensemble should consist of the following '
                             'models: %s, the following models were loaded: '
                             '%s' % (str(list(self.ensemble_indices_.keys())),
                                     str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(identifiers)
        else:
            self.models_ = self._backend.load_all_models(seed)

        if len(self.models_) == 0:
            raise ValueError('No models fitted!')


    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict_proba(X)
        return calculate_score(y, prediction, self._task,
                               self._metric, self._label_num,
                               logger=self._logger)

    def show_models(self):

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.

        :param X:
        :param y:
        :return:

        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def configuration_space_created_hook(self, datamanager):
        pass

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" %
                          self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                         self._tmp_dir)
                else:
                    print("Could not delete tmp dir: %s" %
                          self._tmp_dir)
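
A minimal, hypothetical driver for the class above: because this AutoML variant
subclasses multiprocessing.Process, start_automl() prepares the data manager in
the parent and then runs _fit() in a child process ('parser' stands for whatever
namespace get_data_manager expects; the directories are placeholders):

automl = AutoML(tmp_dir='/tmp/askl_tmp', output_dir='/tmp/askl_out',
                time_left_for_this_task=3600, per_run_time_limit=360)
automl.start_automl(parser)   # sets up the data manager, then calls start()
automl.join()                 # wait for the background fit to finish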
Example #21
0
class AbstractEvaluator(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self, Datamanager, output_dir, configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_y_test=False,
                 num_run=None):

        self.starttime = time.time()

        self.output_dir = output_dir
        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = \
                    autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)

    def fit_predict_and_loss(self):
        """Fit model(s) according to resampling strategy, predict for the
        validation set and return the loss and predictions on the validation
        set.

        Provides a closed interface in which all steps of the target
        algorithm are performed without any communication with other
        processes. Useful for cross-validation because it allows to train a
        model, predict for the validation set and then forget the model in
        order to save main memory.
        """
        raise NotImplementedError()

    def iterative_fit(self):
        """Fit a model iteratively.

        Fitting can be interrupted in order to use a partially trained model."""
        raise NotImplementedError()

    def predict_and_loss(self):
        """Use current model to predict on the validation set and calculate
        loss.

         Should be used when using iterative fitting."""
        raise NotImplementedError()

    def predict(self):
        """Use the current model to predict on the validation set.

        Should only be used to create dummy predictions."""
        raise NotImplementedError()

    def _loss(self, y_true, y_hat):
        if self.configuration is None:
            if self.all_scoring_functions:
                return {self.metric: 1.0}
            else:
                return 1.0

        score = calculate_score(
            y_true, y_hat, self.task_type,
            self.metric, self.D.info['label_num'],
            all_scoring_functions=self.all_scoring_functions)

        if hasattr(score, '__len__'):
            err = {key: 1 - score[key] for key in score}
        else:
            err = 1 - score

        return err
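
    # Worked example (added for illustration): with a single scoring function
    # and score = 0.87 the method returns the error 1 - 0.87 = 0.13; with
    # all_scoring_functions=True and score = {'acc_metric': 0.87,
    # 'f1_metric': 0.80} it returns {'acc_metric': 0.13, 'f1_metric': 0.20}.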

    def finish_up(self, loss=None, opt_pred=None, valid_pred=None,
                  test_pred=None):
        """This function does everything necessary after the fitting is done:

        * predicting
        * saving the files for the ensembles_statistics
        * generate output for SMAC
        We use it as the signal handler so we can recycle the code for the
        normal usecase and when the runsolver kills us here :)"""

        try:
            self.duration = time.time() - self.starttime
            if loss is None:
                loss, opt_pred, valid_pred, test_pred = self.predict_and_loss()
            self.file_output(loss, opt_pred, valid_pred, test_pred)
            self.duration = time.time() - self.starttime

            num_run = str(self.num_run).zfill(5)
            if isinstance(loss, dict):
                loss_ = loss
                loss = loss_[self.D.info['metric']]
            else:
                loss_ = {}
            additional_run_info = ';'.join(
                ['%s: %s' % (METRIC_TO_STRING.get(metric, metric), value)
                 for metric, value in loss_.items()])
            additional_run_info += ';' + 'duration: ' + str(self.duration)
            additional_run_info += ';' + 'num_run:' + num_run

            if self.configuration is not None:
                self._output_SMAC_string(self.duration, loss, self.seed,
                                         additional_run_info)
        except Exception as e:
            self.duration = time.time() - self.starttime
            print(traceback.format_exc())
            self._output_SMAC_string(self.duration, 2.0, self.seed,
                'No results were produced! Error is %s' % str(e))

    def _output_SMAC_string(self, duration, loss, seed, additional_run_info):
        print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
              ('SAT', abs(self.duration), loss, self.seed,
               additional_run_info))

    def file_output(self, loss, Y_optimization_pred, Y_valid_pred, Y_test_pred):
        seed = os.environ.get('AUTOSKLEARN_SEED')

        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return 2, "Targets %s and prediction %s don't have the same " \
                      "length. Probably training didn't finish" % (
                          self.Y_optimization.shape, Y_optimization_pred.shape)

        num_run = str(self.num_run).zfill(5)
        if os.path.exists(self.backend.get_model_dir()):
            self.backend.save_model(self.model, self.num_run, seed)

        if self.output_y_test:
            try:
                os.makedirs(self.output_dir)
            except OSError:
                pass
            self.backend.save_targets_ensemble(self.Y_optimization)

        self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble',
                                             seed, num_run)

        if Y_valid_pred is not None:
            self.backend.save_predictions_as_npy(Y_valid_pred, 'valid',
                                                 seed, num_run)

        if Y_test_pred is not None:
            self.backend.save_predictions_as_npy(Y_test_pred, 'test',
                                                 seed, num_run)

    def _predict_proba(self, X, model, task_type, Y_train):
        Y_pred = model.predict_proba(X, batch_size=1000)
        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred

    def _predict_regression(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred

    def _ensure_prediction_array_sizes(self, prediction, Y_train):
        num_classes = self.D.info['label_num']

        if self.task_type == MULTICLASS_CLASSIFICATION and \
                prediction.shape[1] < num_classes:
            if Y_train is None:
                raise ValueError('Y_train must not be None!')
            classes = list(np.unique(Y_train))

            mapping = dict()
            for class_number in range(num_classes):
                if class_number in classes:
                    index = classes.index(class_number)
                    mapping[index] = class_number
            new_predictions = np.zeros((prediction.shape[0], num_classes),
                                       dtype=np.float32)

            for index in mapping:
                class_index = mapping[index]
                new_predictions[:, class_index] = prediction[:, index]

            return new_predictions

        return prediction
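
The padding performed by _ensure_prediction_array_sizes above can be illustrated
with a small, self-contained sketch (the labels and probabilities below are made
up): a model whose training subsample only contained classes 0 and 2 outputs two
probability columns, which are mapped back into an array with one column per class.

import numpy as np

num_classes = 3
Y_train = np.array([0, 2, 2, 0])              # the subsample never saw class 1
prediction = np.array([[0.7, 0.3],            # columns follow np.unique(Y_train) -> [0, 2]
                       [0.1, 0.9]])

classes = list(np.unique(Y_train))
new_predictions = np.zeros((prediction.shape[0], num_classes), dtype=np.float32)
for index, class_number in enumerate(classes):
    new_predictions[:, class_number] = prediction[:, index]

print(new_predictions)
# [[0.7 0.  0.3]
#  [0.1 0.  0.9]]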
Example #22
0
    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI'):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self._max_iter_smac = max_iter_smac
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        self._backend = Backend(self._output_dir, self._tmp_dir)
Example #23
0
    def main(self):

        watch = StopWatch()
        watch.start_task('ensemble_builder')

        used_time = 0
        time_iter = 0
        index_run = 0
        num_iteration = 0
        current_num_models = 0
        last_hash = None
        current_hash = None

        backend = Backend(self.output_dir, self.autosklearn_tmp_dir)
        dir_ensemble = os.path.join(self.autosklearn_tmp_dir, '.auto-sklearn',
                                    'predictions_ensemble')
        dir_valid = os.path.join(self.autosklearn_tmp_dir, '.auto-sklearn',
                                 'predictions_valid')
        dir_test = os.path.join(self.autosklearn_tmp_dir, '.auto-sklearn',
                                'predictions_test')
        paths_ = [dir_ensemble, dir_valid, dir_test]

        dir_ensemble_list_mtimes = []

        self.logger.debug(
            'Starting main loop with %f seconds and %d iterations '
            'left.' % (self.limit - used_time, num_iteration))
        while used_time < self.limit or (self.max_iterations > 0 and
                                         self.max_iterations >= num_iteration):
            num_iteration += 1
            self.logger.debug('Time left: %f', self.limit - used_time)
            self.logger.debug('Time last ensemble building: %f', time_iter)

            # Reload the ensemble targets every iteration; this is important
            # because cv may update the ensemble targets in the course of
            # running auto-sklearn
            # TODO update cv in order to not need this any more!
            targets_ensemble = backend.load_targets_ensemble()

            # Load the predictions from the models
            exists = [os.path.isdir(dir_) for dir_ in paths_]
            if not exists[0]:  # all(exists):
                self.logger.debug('Prediction directory %s does not exist!' %
                                  dir_ensemble)
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            if self.shared_mode is False:
                dir_ensemble_list = sorted(
                    glob.glob(
                        os.path.join(
                            dir_ensemble,
                            'predictions_ensemble_%s_*.npy' % self.seed)))
                if exists[1]:
                    dir_valid_list = sorted(
                        glob.glob(
                            os.path.join(
                                dir_valid,
                                'predictions_valid_%s_*.npy' % self.seed)))
                else:
                    dir_valid_list = []
                if exists[2]:
                    dir_test_list = sorted(
                        glob.glob(
                            os.path.join(
                                dir_test,
                                'predictions_test_%s_*.npy' % self.seed)))
                else:
                    dir_test_list = []
            else:
                dir_ensemble_list = sorted(os.listdir(dir_ensemble))
                dir_valid_list = sorted(
                    os.listdir(dir_valid)) if exists[1] else []
                dir_test_list = sorted(
                    os.listdir(dir_test)) if exists[2] else []

            # Check the modification times because predictions can be updated
            # over time!
            old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
            dir_ensemble_list_mtimes = []

            for dir_ensemble_file in dir_ensemble_list:
                if dir_ensemble_file.endswith("/"):
                    dir_ensemble_file = dir_ensemble_file[:-1]
                basename = os.path.basename(dir_ensemble_file)
                dir_ensemble_file = os.path.join(dir_ensemble, basename)
                mtime = os.path.getmtime(dir_ensemble_file)
                dir_ensemble_list_mtimes.append(mtime)

            if len(dir_ensemble_list) == 0:
                self.logger.debug('Directories are empty')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            if len(dir_ensemble_list) <= current_num_models and \
                    old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
                self.logger.debug('Nothing has changed since the last time')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            watch.start_task('index_run' + str(index_run))
            watch.start_task('ensemble_iter_' + str(num_iteration))

            # List of num_runs (which are in the filename) which will be included
            #  later
            include_num_runs = []
            backup_num_runs = []
            model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
            if self.ensemble_nbest is not None:
                # Keeps track of the single scores of each model in our ensemble
                scores_nbest = []
                # The indices of the model that are currently in our ensemble
                indices_nbest = []
                # The names of the models
                model_names = []

            model_names_to_scores = dict()

            model_idx = 0
            for model_name in dir_ensemble_list:
                if model_name.endswith("/"):
                    model_name = model_name[:-1]
                basename = os.path.basename(model_name)

                try:
                    if self.precision is "16":
                        predictions = np.load(
                            os.path.join(dir_ensemble,
                                         basename)).astype(dtype=np.float16)
                    elif self.precision is "32":
                        predictions = np.load(
                            os.path.join(dir_ensemble,
                                         basename)).astype(dtype=np.float32)
                    elif self.precision is "64":
                        predictions = np.load(
                            os.path.join(dir_ensemble,
                                         basename)).astype(dtype=np.float64)
                    else:
                        predictions = np.load(
                            os.path.join(dir_ensemble, basename))

                    score = calculate_score(targets_ensemble, predictions,
                                            self.task_type, self.metric,
                                            predictions.shape[1])

                except Exception as e:
                    self.logger.warning('Error loading %s: %s', basename, e)
                    score = -1

                model_names_to_scores[model_name] = score
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))

                if self.ensemble_nbest is not None:
                    if score <= 0.001:
                        self.logger.error('Model only predicts at random: ' +
                                          model_name + ' has score: ' +
                                          str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    # If we have less models in our ensemble than ensemble_nbest add
                    # the current model if it is better than random
                    elif len(scores_nbest) < self.ensemble_nbest:
                        scores_nbest.append(score)
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        model_names.append(model_name)
                    else:
                        # Take the worst performing model in our ensemble so far
                        idx = np.argmin(scores_nbest)

                        # If the current model is better than the worst model in
                        # our ensemble replace it by the current model
                        if scores_nbest[idx] < score:
                            self.logger.debug(
                                'Worst model in our ensemble: %s with '
                                'score %f will be replaced by model %s '
                                'with score %f', model_names[idx],
                                scores_nbest[idx], model_name, score)
                            # Exclude the old model
                            del scores_nbest[idx]
                            scores_nbest.append(score)
                            del include_num_runs[idx]
                            del indices_nbest[idx]
                            indices_nbest.append(model_idx)
                            include_num_runs.append((automl_seed, num_run))
                            del model_names[idx]
                            model_names.append(model_name)

                        # Otherwise exclude the current model from the ensemble
                        else:
                            # include_num_runs.append(True)
                            pass

                else:
                    # Load all predictions that are better than random
                    if score <= 0.001:
                        # include_num_runs.append(True)
                        self.logger.error('Model only predicts at random: ' +
                                          model_name + ' has score: ' +
                                          str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    else:
                        include_num_runs.append((automl_seed, num_run))

                model_idx += 1

            # If there is no model better than random guessing, we have to use
            # all models which do random guessing
            if len(include_num_runs) == 0:
                include_num_runs = backup_num_runs

            indices_to_model_names = dict()
            indices_to_run_num = dict()
            for i, model_name in enumerate(dir_ensemble_list):
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))
                if (automl_seed, num_run) in include_num_runs:
                    num_indices = len(indices_to_model_names)
                    indices_to_model_names[num_indices] = model_name
                    indices_to_run_num[num_indices] = (automl_seed, num_run)

            try:
                all_predictions_train, all_predictions_valid, all_predictions_test =\
                    self.get_all_predictions(dir_ensemble, dir_ensemble_list,
                                             dir_valid, dir_valid_list,
                                             dir_test, dir_test_list,
                                             include_num_runs,
                                             model_and_automl_re,
                                             self.precision)
            except IOError:
                self.logger.error('Could not load the predictions.')
                continue

            if len(include_num_runs) == 0:
                self.logger.error('All models do just random guessing')
                time.sleep(2)
                continue

            else:
                ensemble = EnsembleSelection(ensemble_size=self.ensemble_size,
                                             task_type=self.task_type,
                                             metric=self.metric)

                try:
                    ensemble.fit(all_predictions_train, targets_ensemble,
                                 include_num_runs)
                    self.logger.info(ensemble)

                except ValueError as e:
                    self.logger.error('Caught ValueError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                except IndexError as e:
                    self.logger.error('Caught IndexError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                except Exception as e:
                    self.logger.error('Caught error! %s', str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue

                # Output the score
                self.logger.info('Training performance: %f' %
                                 ensemble.train_score_)

                self.logger.info(
                    'Building the ensemble took %f seconds' %
                    watch.wall_elapsed('ensemble_iter_' + str(num_iteration)))

            # Set this variable here to avoid re-running the ensemble builder
            # every two seconds in case the ensemble did not change
            current_num_models = len(dir_ensemble_list)

            ensemble_predictions = ensemble.predict(all_predictions_train)
            if sys.version_info[0] == 2:
                ensemble_predictions.flags.writeable = False
                current_hash = hash(ensemble_predictions.data)
            else:
                current_hash = hash(ensemble_predictions.data.tobytes())

            # Only output a new ensemble and new predictions if the output of the
            # ensemble would actually change!
            # TODO this is neither safe (collisions, tests only with the ensemble
            #  prediction, but not the ensemble), implement a hash function for
            # each possible ensemble builder.
            if last_hash is not None:
                if current_hash == last_hash:
                    self.logger.info('Ensemble output did not change.')
                    time.sleep(2)
                    continue
                else:
                    last_hash = current_hash
            else:
                last_hash = current_hash

            # Save the ensemble for later use in the main auto-sklearn module!
            backend.save_ensemble(ensemble, index_run, self.seed)

            # Save predictions for valid and test data set
            if len(dir_valid_list) == len(dir_ensemble_list):
                all_predictions_valid = np.array(all_predictions_valid)
                ensemble_predictions_valid = ensemble.predict(
                    all_predictions_valid)
                if self.task_type == BINARY_CLASSIFICATION:
                    ensemble_predictions_valid = ensemble_predictions_valid[:,
                                                                            1]
                if self.low_precision:
                    if self.task_type in [
                            BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION,
                            MULTILABEL_CLASSIFICATION
                    ]:
                        ensemble_predictions_valid[
                            ensemble_predictions_valid < 1e-4] = 0.
                    if self.metric in [BAC_METRIC, F1_METRIC]:
                        bin_array = np.zeros(ensemble_predictions_valid.shape,
                                             dtype=np.int32)
                        if (self.task_type != MULTICLASS_CLASSIFICATION) or (
                                ensemble_predictions_valid.shape[1] == 1):
                            bin_array[ensemble_predictions_valid >= 0.5] = 1
                        else:
                            sample_num = ensemble_predictions_valid.shape[0]
                            for i in range(sample_num):
                                j = np.argmax(ensemble_predictions_valid[i, :])
                                bin_array[i, j] = 1
                        ensemble_predictions_valid = bin_array
                    if self.task_type in CLASSIFICATION_TASKS:
                        if ensemble_predictions_valid.size < (20000 * 20):
                            precision = 3
                        else:
                            precision = 2
                    else:
                        if ensemble_predictions_valid.size > 1000000:
                            precision = 4
                        else:
                            # File size maximally 2.1MB
                            precision = 6

                backend.save_predictions_as_txt(ensemble_predictions_valid,
                                                'valid',
                                                index_run,
                                                prefix=self.dataset_name,
                                                precision=precision)
            else:
                self.logger.info(
                    'Could not find as many validation set predictions (%d) '
                    'as ensemble predictions (%d)!', len(dir_valid_list),
                    len(dir_ensemble_list))

            del all_predictions_valid

            if len(dir_test_list) == len(dir_ensemble_list):
                all_predictions_test = np.array(all_predictions_test)
                ensemble_predictions_test = ensemble.predict(
                    all_predictions_test)
                if self.task_type == BINARY_CLASSIFICATION:
                    ensemble_predictions_test = ensemble_predictions_test[:, 1]
                if self.low_precision:
                    if self.task_type in [
                            BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION,
                            MULTILABEL_CLASSIFICATION
                    ]:
                        ensemble_predictions_test[
                            ensemble_predictions_test < 1e-4] = 0.
                    if self.metric in [BAC_METRIC, F1_METRIC]:
                        bin_array = np.zeros(ensemble_predictions_test.shape,
                                             dtype=np.int32)
                        if (self.task_type != MULTICLASS_CLASSIFICATION) or (
                                ensemble_predictions_test.shape[1] == 1):
                            bin_array[ensemble_predictions_test >= 0.5] = 1
                        else:
                            sample_num = ensemble_predictions_test.shape[0]
                            for i in range(sample_num):
                                j = np.argmax(ensemble_predictions_test[i, :])
                                bin_array[i, j] = 1
                        ensemble_predictions_test = bin_array
                    if self.task_type in CLASSIFICATION_TASKS:
                        if ensemble_predictions_test.size < (20000 * 20):
                            precision = 3
                        else:
                            precision = 2
                    else:
                        if ensemble_predictions_test.size > 1000000:
                            precision = 4
                        else:
                            precision = 6

                backend.save_predictions_as_txt(ensemble_predictions_test,
                                                'test',
                                                index_run,
                                                prefix=self.dataset_name,
                                                precision=precision)
            else:
                self.logger.info(
                    'Could not find as many test set predictions (%d) as '
                    'ensemble predictions (%d)!', len(dir_test_list),
                    len(dir_ensemble_list))

            del all_predictions_test

            current_num_models = len(dir_ensemble_list)
            watch.stop_task('index_run' + str(index_run))
            time_iter = watch.get_wall_dur('index_run' + str(index_run))
            used_time = watch.wall_elapsed('ensemble_builder')
            index_run += 1
        return
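
The loop above only writes a new ensemble when the hash of its training predictions differs from the one produced in the previous iteration (see the `last_hash` / `current_hash` comparison and the accompanying TODO about collisions). Below is a minimal sketch of that change-detection idea, assuming NumPy prediction arrays; the helper name `predictions_changed` is illustrative and not part of auto-sklearn:

import numpy as np

def predictions_changed(predictions, last_hash):
    """Return (changed, new_hash) for an array of ensemble predictions.

    Hashing the raw bytes is cheap but not collision-proof, which is the
    caveat the TODO in the builder loop points out.
    """
    current_hash = hash(predictions.data.tobytes())
    if last_hash is not None and current_hash == last_hash:
        return False, last_hash
    return True, current_hash

# Illustrative usage: skip the expensive save when nothing changed.
last_hash = None
preds = np.array([[0.1, 0.9], [0.8, 0.2]])
changed, last_hash = predictions_changed(preds, last_hash)
# changed is True on the first pass; calling again with identical
# predictions returns False, so the ensemble would not be re-saved.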
Example #24
0
class AbstractEvaluator(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self, Datamanager, configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_dir=None,
                 output_y_test=False,
                 num_run=None):

        self.starttime = time.time()

        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        if output_dir is None:
            self.output_dir = os.getcwd()
        else:
            self.output_dir = output_dir

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = ParamSklearnRegressor
            self.predict_function = self.predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = ParamSklearnClassifier
            self.predict_function = self.predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)

    @abc.abstractmethod
    def fit(self):
        pass

    @abc.abstractmethod
    def predict(self):
        pass

    # This function does everything necessary after the fitting is done:
    #        predicting
    #        saving the files for the ensembles_statistics
    #        generate output for SMAC
    # We use it as the signal handler so we can recycle the code for the
    # normal use case and when the runsolver kills us here :)
    def finish_up(self):
        try:
            self.duration = time.time() - self.starttime
            result, additional_run_info = self.file_output()
            if self.configuration is not None:
                print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
                      ('SAT', abs(self.duration), result, self.seed,
                       additional_run_info))
        except Exception as e:
            self.duration = time.time() - self.starttime

            print(traceback.format_exc())
            print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
                  ('TIMEOUT', abs(self.duration), 1.0, self.seed,
                   'No results were produced! Error is %s' % str(e)))

    def file_output(self):
        seed = os.environ.get('AUTOSKLEARN_SEED')

        errs, Y_optimization_pred, Y_valid_pred, Y_test_pred = self.predict()

        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return 2, "Targets %s and prediction %s don't have the same " \
            "length. Probably training didn't finish" % (
                self.Y_optimization.shape, Y_optimization_pred.shape)

        num_run = str(self.num_run).zfill(5)

        if os.path.exists(self.backend.get_model_dir()):
            self.backend.save_model(self.model, self.num_run, seed)

        if self.output_y_test:
            try:
                os.makedirs(self.output_dir)
            except OSError:
                pass
            self.backend.save_targets_ensemble(self.Y_optimization)

        self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble',
                                             seed, num_run)

        if Y_valid_pred is not None:
            self.backend.save_predictions_as_npy(Y_valid_pred, 'valid',
                                                 seed, num_run)

        if Y_test_pred is not None:
            self.backend.save_predictions_as_npy(Y_test_pred, 'test',
                                                 seed, num_run)

        self.duration = time.time() - self.starttime
        err = errs[self.D.info['metric']]
        additional_run_info = ';'.join(
            ['%s: %s' % (METRIC_TO_STRING[metric]
                         if metric in METRIC_TO_STRING else metric, value)
             for metric, value in errs.items()])
        additional_run_info += ';' + 'duration: ' + str(self.duration)
        additional_run_info += ';' + 'num_run:' + num_run
        return err, additional_run_info

    def predict_proba(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict_proba(X, batch_size=1000)

        if task_type == MULTILABEL_CLASSIFICATION:
            Y_pred = np.hstack([Y_pred[i][:, -1].reshape((-1, 1))
                                for i in range(len(Y_pred))])

        elif task_type == BINARY_CLASSIFICATION:
            if len(Y_pred.shape) != 1:
                Y_pred = Y_pred[:, 1].reshape(-1, 1)

        elif task_type == MULTICLASS_CLASSIFICATION:
            pass

        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred

    def predict_regression(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred

    def _ensure_prediction_array_sizes(self, prediction, Y_train):
        num_classes = self.D.info['label_num']

        if self.task_type == MULTICLASS_CLASSIFICATION and \
                prediction.shape[1] < num_classes:
            classes = list(np.unique(self.D.data['Y_train']))
            if num_classes == prediction.shape[1]:
                return prediction

            if Y_train is not None:
                classes = list(np.unique(Y_train))

            mapping = dict()
            for class_number in range(num_classes):
                if class_number in classes:
                    index = classes.index(class_number)
                    mapping[index] = class_number
            new_predictions = np.zeros((prediction.shape[0], num_classes))
            for index in mapping:
                class_index = mapping[index]
                new_predictions[:, class_index] = prediction[:, index]

            return new_predictions

        return prediction
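
The `_ensure_prediction_array_sizes` method above pads probability columns when the training data of a fold did not contain every class. The following standalone sketch shows the same remapping, assuming integer class labels in the range 0..num_classes-1; the name `expand_to_all_classes` is illustrative:

import numpy as np

def expand_to_all_classes(prediction, y_train, num_classes):
    """Map the columns of `prediction` (one per class seen in y_train) onto
    the full set of `num_classes` columns, filling unseen classes with zeros."""
    seen_classes = list(np.unique(y_train))
    expanded = np.zeros((prediction.shape[0], num_classes))
    for col, class_number in enumerate(seen_classes):
        expanded[:, class_number] = prediction[:, col]
    return expanded

# A training fold that never saw class 1: the model returns two probability
# columns, but downstream code expects three.
probs = np.array([[0.7, 0.3], [0.4, 0.6]])
print(expand_to_all_classes(probs, y_train=np.array([0, 2, 2, 0]), num_classes=3))
# [[0.7 0.  0.3]
#  [0.4 0.  0.6]]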
Example #25
0
class AbstractEvaluator(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self,
                 Datamanager,
                 output_dir,
                 configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_y_test=False,
                 num_run=None):

        self.starttime = time.time()

        self.output_dir = output_dir
        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = \
                    autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)

    def fit_predict_and_loss(self):
        """Fit model(s) according to resampling strategy, predict for the
        validation set and return the loss and predictions on the validation
        set.

        Provides a closed interface in which all steps of the target
        algorithm are performed without any communication with other
        processes. Useful for cross-validation because it allows one to train
        a model, predict for the validation set and then forget the model in
        order to save main memory.
        """
        raise NotImplementedError()

    def iterative_fit(self):
        """Fit a model iteratively.

        Fitting can be interrupted in order to use a partially trained model."""
        raise NotImplementedError()

    def predict_and_loss(self):
        """Use current model to predict on the validation set and calculate
        loss.

         Should be used when using iterative fitting."""
        raise NotImplementedError()

    def predict(self):
        """Use the current model to predict on the validation set.

        Should only be used to create dummy predictions."""
        raise NotImplementedError()

    def _loss(self, y_true, y_hat):
        if self.configuration is None:
            if self.all_scoring_functions:
                return {self.metric: 1.0}
            else:
                return 1.0

        score = calculate_score(
            y_true,
            y_hat,
            self.task_type,
            self.metric,
            self.D.info['label_num'],
            all_scoring_functions=self.all_scoring_functions)

        if hasattr(score, '__len__'):
            err = {key: 1 - score[key] for key in score}
        else:
            err = 1 - score

        return err

    def finish_up(self,
                  loss=None,
                  opt_pred=None,
                  valid_pred=None,
                  test_pred=None):
        """This function does everything necessary after the fitting is done:

        * predicting
        * saving the files for the ensembles_statistics
        * generate output for SMAC
        We use it as the signal handler so we can recycle the code for the
        normal use case and when the runsolver kills us here :)"""

        try:
            self.duration = time.time() - self.starttime
            if loss is None:
                loss, opt_pred, valid_pred, test_pred = self.predict_and_loss()
            self.file_output(loss, opt_pred, valid_pred, test_pred)
            self.duration = time.time() - self.starttime

            num_run = str(self.num_run).zfill(5)
            if isinstance(loss, dict):
                loss_ = loss
                loss = loss_[self.D.info['metric']]
            else:
                loss_ = {}
            additional_run_info = ';'.join([
                '%s: %s' % (METRIC_TO_STRING[metric]
                            if metric in METRIC_TO_STRING else metric, value)
                for metric, value in loss_.items()
            ])
            additional_run_info += ';' + 'duration: ' + str(self.duration)
            additional_run_info += ';' + 'num_run:' + num_run

            if self.configuration is not None:
                self._output_SMAC_string(self.duration, loss, self.seed,
                                         additional_run_info)
        except Exception as e:
            self.duration = time.time() - self.starttime
            print(traceback.format_exc())
            self._output_SMAC_string(
                self.duration, 2.0, self.seed,
                'No results were produced! Error is %s' % str(e))

    def _output_SMAC_string(self, duration, loss, seed, additional_run_info):
        print(
            'Result for ParamILS: %s, %f, 1, %f, %d, %s' %
            ('SAT', abs(duration), loss, seed, additional_run_info))

    def file_output(self, loss, Y_optimization_pred, Y_valid_pred,
                    Y_test_pred):
        seed = os.environ.get('AUTOSKLEARN_SEED')

        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return 2, "Targets %s and prediction %s don't have the same " \
            "length. Probably training didn't finish" % (
                self.Y_optimization.shape, Y_optimization_pred.shape)

        num_run = str(self.num_run).zfill(5)
        if os.path.exists(self.backend.get_model_dir()):
            self.backend.save_model(self.model, self.num_run, seed)

        if self.output_y_test:
            try:
                os.makedirs(self.output_dir)
            except OSError:
                pass
            self.backend.save_targets_ensemble(self.Y_optimization)

        self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble',
                                             seed, num_run)

        if Y_valid_pred is not None:
            self.backend.save_predictions_as_npy(Y_valid_pred, 'valid', seed,
                                                 num_run)

        if Y_test_pred is not None:
            self.backend.save_predictions_as_npy(Y_test_pred, 'test', seed,
                                                 num_run)

    def _predict_proba(self, X, model, task_type, Y_train):
        Y_pred = model.predict_proba(X, batch_size=1000)
        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred

    def _predict_regression(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred

    def _ensure_prediction_array_sizes(self, prediction, Y_train):
        num_classes = self.D.info['label_num']

        if self.task_type == MULTICLASS_CLASSIFICATION and \
                prediction.shape[1] < num_classes:
            if Y_train is None:
                raise ValueError('Y_train must not be None!')
            classes = list(np.unique(Y_train))

            mapping = dict()
            for class_number in range(num_classes):
                if class_number in classes:
                    index = classes.index(class_number)
                    mapping[index] = class_number
            new_predictions = np.zeros((prediction.shape[0], num_classes),
                                       dtype=np.float32)

            for index in mapping:
                class_index = mapping[index]
                new_predictions[:, class_index] = prediction[:, index]

            return new_predictions

        return prediction
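
The `_loss` method above turns scores into losses via `1 - score`, both for a single metric and for the dictionary returned when `all_scoring_functions` is enabled. A small sketch of that conversion, with an illustrative helper name (`score_to_loss`) and made-up metric values:

def score_to_loss(score):
    """Convert a score (or a dict of per-metric scores) into the 1 - score
    loss that the evaluator minimises, mirroring `_loss` above."""
    if hasattr(score, '__len__'):
        return {key: 1 - value for key, value in score.items()}
    return 1 - score

# Scalar and per-metric forms behave the same way:
print(score_to_loss(0.75))                        # 0.25
print(score_to_loss({'acc_metric': 0.75,
                     'f1_metric': 0.5}))          # {'acc_metric': 0.25, 'f1_metric': 0.5}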
Example #26
0
def main(autosklearn_tmp_dir,
         dataset_name,
         task_type,
         metric,
         limit,
         output_dir,
         ensemble_size=None,
         ensemble_nbest=None,
         seed=1,
         shared_mode=False,
         max_iterations=-1,
         precision="32"):

    watch = StopWatch()
    watch.start_task('ensemble_builder')

    used_time = 0
    time_iter = 0
    index_run = 0
    num_iteration = 0
    current_num_models = 0

    backend = Backend(output_dir, autosklearn_tmp_dir)
    dir_ensemble = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                                'predictions_ensemble')
    dir_valid = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                             'predictions_valid')
    dir_test = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                            'predictions_test')
    paths_ = [dir_ensemble, dir_valid, dir_test]

    dir_ensemble_list_mtimes = []

    while used_time < limit or (max_iterations > 0 and max_iterations >= num_iteration):
        num_iteration += 1
        logger.debug('Time left: %f', limit - used_time)
        logger.debug('Time last iteration: %f', time_iter)

        # Reload the ensemble targets every iteration; this is important
        # because cv may update the ensemble targets in the course of running
        # auto-sklearn.
        # TODO update cv in order to not need this any more!
        targets_ensemble = backend.load_targets_ensemble()

        # Load the predictions from the models
        exists = [os.path.isdir(dir_) for dir_ in paths_]
        if not exists[0]:  # all(exists):
            logger.debug('Prediction directory %s does not exist!' %
                          dir_ensemble)
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if shared_mode is False:
            dir_ensemble_list = sorted(glob.glob(os.path.join(
                dir_ensemble, 'predictions_ensemble_%s_*.npy' % seed)))
            if exists[1]:
                dir_valid_list = sorted(glob.glob(os.path.join(
                    dir_valid, 'predictions_valid_%s_*.npy' % seed)))
            else:
                dir_valid_list = []
            if exists[2]:
                dir_test_list = sorted(glob.glob(os.path.join(
                    dir_test, 'predictions_test_%s_*.npy' % seed)))
            else:
                dir_test_list = []
        else:
            dir_ensemble_list = sorted(os.listdir(dir_ensemble))
            dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else []
            dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else []

        # Check the modification times because predictions can be updated
        # over time!
        old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
        dir_ensemble_list_mtimes = []

        for dir_ensemble_file in dir_ensemble_list:
            if dir_ensemble_file.endswith("/"):
                dir_ensemble_file = dir_ensemble_file[:-1]
            basename = os.path.basename(dir_ensemble_file)
            dir_ensemble_file = os.path.join(dir_ensemble, basename)
            mtime = os.path.getmtime(dir_ensemble_file)
            dir_ensemble_list_mtimes.append(mtime)

        if len(dir_ensemble_list) == 0:
            logger.debug('Directories are empty')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if len(dir_ensemble_list) <= current_num_models and \
                old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
            logger.debug('Nothing has changed since the last time')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        watch.start_task('ensemble_iter_' + str(index_run))

        # List of num_runs (which are in the filename) which will be included
        #  later
        include_num_runs = []
        backup_num_runs = []
        model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
        if ensemble_nbest is not None:
            # Keeps track of the single scores of each model in our ensemble
            scores_nbest = []
            # The indices of the model that are currently in our ensemble
            indices_nbest = []
            # The names of the models
            model_names = []

        model_names_to_scores = dict()

        model_idx = 0
        for model_name in dir_ensemble_list:
            if model_name.endswith("/"):
                model_name = model_name[:-1]
            basename = os.path.basename(model_name)

            if precision == "16":
                predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float16)
            elif precision == "32":
                predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float32)
            elif precision == "64":
                predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float64)
            else:
                predictions = np.load(os.path.join(dir_ensemble, basename))

            try:
                score = calculate_score(targets_ensemble, predictions,
                                        task_type, metric,
                                        predictions.shape[1])
            except Exception:
                score = -1

            model_names_to_scores[model_name] = score
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))

            if ensemble_nbest is not None:
                if score <= 0.001:
                    logger.error('Model only predicts at random: ' +
                                  model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                # If we have less models in our ensemble than ensemble_nbest add
                # the current model if it is better than random
                elif len(scores_nbest) < ensemble_nbest:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    include_num_runs.append((automl_seed, num_run))
                    model_names.append(model_name)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(scores_nbest)

                    # If the current model is better than the worst model in
                    # our ensemble replace it by the current model
                    if scores_nbest[idx] < score:
                        logger.debug('Worst model in our ensemble: %s with '
                                      'score %f will be replaced by model %s '
                                      'with score %f', model_names[idx],
                                      scores_nbest[idx], model_name, score)
                        # Exclude the old model
                        del scores_nbest[idx]
                        scores_nbest.append(score)
                        del include_num_runs[idx]
                        del indices_nbest[idx]
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        del model_names[idx]
                        model_names.append(model_name)

                    # Otherwise exclude the current model from the ensemble
                    else:
                        # include_num_runs.append(True)
                        pass

            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    # include_num_runs.append(True)
                    logger.error('Model only predicts at random: ' +
                                  model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                else:
                    include_num_runs.append((automl_seed, num_run))

            model_idx += 1

        # If there is no model better than random guessing, we have to use
        # all models which do random guessing
        if len(include_num_runs) == 0:
            include_num_runs = backup_num_runs

        indices_to_model_names = dict()
        indices_to_run_num = dict()
        for i, model_name in enumerate(dir_ensemble_list):
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))
            if (automl_seed, num_run) in include_num_runs:
                num_indices = len(indices_to_model_names)
                indices_to_model_names[num_indices] = model_name
                indices_to_run_num[num_indices] = (automl_seed, num_run)

        try:
            all_predictions_train, all_predictions_valid, all_predictions_test =\
                get_all_predictions(dir_ensemble, dir_ensemble_list,
                                    dir_valid, dir_valid_list,
                                    dir_test, dir_test_list,
                                    include_num_runs,
                                    model_and_automl_re,
                                    precision)
        except IOError:
            logger.error('Could not load the predictions.')
            continue

        if len(include_num_runs) == 0:
            logger.error('All models do just random guessing')
            time.sleep(2)
            continue

        else:
            ensemble = EnsembleSelection(ensemble_size=ensemble_size,
                                         task_type=task_type,
                                         metric=metric)

            try:
                ensemble.fit(all_predictions_train, targets_ensemble,
                             include_num_runs)
                logger.info(ensemble)

            except ValueError as e:
                logger.error('Caught ValueError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except IndexError as e:
                logger.error('Caught IndexError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except Exception as e:
                logger.error('Caught error! %s', str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue

            # Output the score
            logger.info('Training performance: %f' % ensemble.train_score_)

        # Save the ensemble for later use in the main auto-sklearn module!
        backend.save_ensemble(ensemble, index_run, seed)

        # Save predictions for valid and test data set
        if len(dir_valid_list) == len(dir_ensemble_list):
            all_predictions_valid = np.array(all_predictions_valid)
            ensemble_predictions_valid = ensemble.predict(all_predictions_valid)
            backend.save_predictions_as_txt(ensemble_predictions_valid,
                                            'valid', index_run, prefix=dataset_name)
        else:
            logger.info('Could not find as many validation set predictions (%d) '
                        'as ensemble predictions (%d)!',
                        len(dir_valid_list), len(dir_ensemble_list))

        del all_predictions_valid

        if len(dir_test_list) == len(dir_ensemble_list):
            all_predictions_test = np.array(all_predictions_test)
            ensemble_predictions_test = ensemble.predict(all_predictions_test)
            backend.save_predictions_as_txt(ensemble_predictions_test,
                                            'test', index_run, prefix=dataset_name)
        else:
            logger.info('Could not find as many test set predictions (%d) as '
                         'ensemble predictions (%d)!',
                        len(dir_test_list), len(dir_ensemble_list))

        del all_predictions_test

        current_num_models = len(dir_ensemble_list)
        watch.stop_task('ensemble_iter_' + str(index_run))
        time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run))
        used_time = watch.wall_elapsed('ensemble_builder')
        index_run += 1
    return
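
The `ensemble_nbest` branch above keeps only the best-scoring models, replacing the current worst one whenever a better model appears. Below is a compact sketch of that selection rule, with illustrative names (`update_nbest`, `m1`..`m4`) and made-up scores:

import numpy as np

def update_nbest(scores_nbest, model_names, score, model_name, nbest):
    """Keep at most `nbest` (score, name) pairs, replacing the current worst
    entry when a better-scoring model arrives; mirrors the selection loop
    in the builder above."""
    if len(scores_nbest) < nbest:
        scores_nbest.append(score)
        model_names.append(model_name)
    else:
        worst = int(np.argmin(scores_nbest))
        if scores_nbest[worst] < score:
            del scores_nbest[worst]
            del model_names[worst]
            scores_nbest.append(score)
            model_names.append(model_name)
    return scores_nbest, model_names

scores, names = [], []
for name, s in [('m1', 0.4), ('m2', 0.6), ('m3', 0.9), ('m4', 0.5)]:
    update_nbest(scores, names, s, name, nbest=2)
print(names)  # ['m2', 'm3'] -- m1 was replaced, m4 never beat the worst kept score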