Python Backend.get_model_dir 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: autosklearn.util

클래스/타입: Backend

메소드/함수: get_model_dir

hotexamples.com에서의 예제들: 10

Python Backend.get_model_dir - 10개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 autosklearn.util.Backend.get_model_dir에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

Backend(9)

get_model_dir(5)

save_targets_ensemble(4)

save_datamanager(3)

save_model(3)

load_all_models(2)

load_datamanager(2)

load_ensemble(2)

load_models_by_identifiers(2)

_make_internals_directory(2)

save_ensemble(2)

save_predictions_as_npy(2)

save_predictions_as_txt(2)

save_start_time(2)

load_targets_ensemble(2)

load_ensemble_indices_weights(1)

save_ensemble_indices_weights(1)

예제 #1

파일 보기

파일: base_interface.py 프로젝트: stokasto/auto-sklearn

def make_mode_holdout_iterative_fit(data, seed, configuration, num_run):
    global evaluator
    evaluator = HoldoutEvaluator(data, configuration, seed=seed, num_run=num_run, **_get_base_dict())
    evaluator.iterative_fit()
    signal.signal(15, empty_signal_handler)
    evaluator.finish_up()

    backend = Backend(None, os.getcwd())
    if os.path.exists(backend.get_model_dir()):
        backend.save_model(evaluator.model, num_run, seed)

예제 #2

파일 보기

def make_mode_holdout_iterative_fit(data, seed, configuration, num_run):
    global evaluator
    evaluator = HoldoutEvaluator(data,
                                 configuration,
                                 seed=seed,
                                 num_run=num_run,
                                 **_get_base_dict())
    evaluator.iterative_fit()
    signal.signal(15, empty_signal_handler)
    evaluator.finish_up()

    backend = Backend(None, os.getcwd())
    if os.path.exists(backend.get_model_dir()):
        backend.save_model(evaluator.model, num_run, seed)

예제 #3

파일 보기

class AbstractEvaluator(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self,
                 Datamanager,
                 output_dir,
                 configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_y_test=False,
                 num_run=None):

        self.starttime = time.time()

        self.output_dir = output_dir
        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = \
                    autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)

    def fit_predict_and_loss(self):
        """Fit model(s) according to resampling strategy, predict for the
        validation set and return the loss and predictions on the validation
        set.

        Provides a closed interface in which all steps of the target
        algorithm are performed without any communication with other
        processes. Useful for cross-validation because it allows to train a
        model, predict for the validation set and then forget the model in
        order to save main memory.
        """
        raise NotImplementedError()

    def iterative_fit(self):
        """Fit a model iteratively.

        Fitting can be interrupted in order to use a partially trained model."""
        raise NotImplementedError()

    def predict_and_loss(self):
        """Use current model to predict on the validation set and calculate
        loss.

         Should be used when using iterative fitting."""
        raise NotImplementedError()

    def predict(self):
        """Use the current model to predict on the validation set.

        Should only be used to create dummy predictions."""
        raise NotImplementedError()

    def _loss(self, y_true, y_hat):
        if self.configuration is None:
            if self.all_scoring_functions:
                return {self.metric: 1.0}
            else:
                return 1.0

        score = calculate_score(
            y_true,
            y_hat,
            self.task_type,
            self.metric,
            self.D.info['label_num'],
            all_scoring_functions=self.all_scoring_functions)

        if hasattr(score, '__len__'):
            err = {key: 1 - score[key] for key in score}
        else:
            err = 1 - score

        return err

    def finish_up(self,
                  loss=None,
                  opt_pred=None,
                  valid_pred=None,
                  test_pred=None):
        """This function does everything necessary after the fitting is done:

        * predicting
        * saving the files for the ensembles_statistics
        * generate output for SMAC
        We use it as the signal handler so we can recycle the code for the
        normal usecase and when the runsolver kills us here :)"""

        try:
            self.duration = time.time() - self.starttime
            if loss is None:
                loss, opt_pred, valid_pred, test_pred = self.predict_and_loss()
            self.file_output(loss, opt_pred, valid_pred, test_pred)
            self.duration = time.time() - self.starttime

            num_run = str(self.num_run).zfill(5)
            if isinstance(loss, dict):
                loss_ = loss
                loss = loss_[self.D.info['metric']]
            else:
                loss_ = {}
            additional_run_info = ';'.join([
                '%s: %s' % (METRIC_TO_STRING[metric]
                            if metric in METRIC_TO_STRING else metric, value)
                for metric, value in loss_.items()
            ])
            additional_run_info += ';' + 'duration: ' + str(self.duration)
            additional_run_info += ';' + 'num_run:' + num_run

            if self.configuration is not None:
                self._output_SMAC_string(self.duration, loss, self.seed,
                                         additional_run_info)
        except Exception as e:
            self.duration = time.time() - self.starttime
            print(traceback.format_exc())
            self._output_SMAC_string(
                self.duration, 2.0, self.seed,
                'No results were produced! Error is %s' % str(e))

    def _output_SMAC_string(self, duration, loss, seed, additional_run_info):
        print(
            'Result for ParamILS: %s, %f, 1, %f, %d, %s' %
            ('SAT', abs(self.duration), loss, self.seed, additional_run_info))

    def file_output(self, loss, Y_optimization_pred, Y_valid_pred,
                    Y_test_pred):
        seed = os.environ.get('AUTOSKLEARN_SEED')

        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return 2, "Targets %s and prediction %s don't have the same " \
            "length. Probably training didn't finish" % (
                self.Y_optimization.shape, Y_optimization_pred.shape)

        num_run = str(self.num_run).zfill(5)
        if os.path.exists(self.backend.get_model_dir()):
            self.backend.save_model(self.model, self.num_run, seed)

        if self.output_y_test:
            try:
                os.makedirs(self.output_dir)
            except OSError:
                pass
            self.backend.save_targets_ensemble(self.Y_optimization)

        self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble',
                                             seed, num_run)

        if Y_valid_pred is not None:
            self.backend.save_predictions_as_npy(Y_valid_pred, 'valid', seed,
                                                 num_run)

        if Y_test_pred is not None:
            self.backend.save_predictions_as_npy(Y_test_pred, 'test', seed,
                                                 num_run)

    def _predict_proba(self, X, model, task_type, Y_train):
        Y_pred = model.predict_proba(X, batch_size=1000)
        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred

    def _predict_regression(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred

    def _ensure_prediction_array_sizes(self, prediction, Y_train):
        num_classes = self.D.info['label_num']

        if self.task_type == MULTICLASS_CLASSIFICATION and \
                prediction.shape[1] < num_classes:
            if Y_train is None:
                raise ValueError('Y_train must not be None!')
            classes = list(np.unique(Y_train))

            mapping = dict()
            for class_number in range(num_classes):
                if class_number in classes:
                    index = classes.index(class_number)
                    mapping[index] = class_number
            new_predictions = np.zeros((prediction.shape[0], num_classes),
                                       dtype=np.float32)

            for index in mapping:
                class_index = mapping[index]
                new_predictions[:, class_index] = prediction[:, index]

            return new_predictions

        return prediction

예제 #4

파일 보기

class AutoML(BaseEstimator, multiprocessing.Process):
    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI'):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self._max_iter_smac = max_iter_smac
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))

        # After assignging and checking variables...
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._parser = parser
        self.start()

    def start(self):
        if self._parser is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._parser is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=self._parser)
        self._stopwatch.start_task(datamanager.name)

        self._logger = self._get_logger(datamanager.name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self._fit(self._datamanager)

    def fit(self,
            X,
            y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' %
                                     ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X,
                                            y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset,
            encode_labels=False,
            max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    def _get_logger(self, name):
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        return get_logger(logger_name)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task, time_for_load_data,
                         logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager, num_run):

        self._logger.info("Starting to create dummy predictions.")
        time_limit = int(self._time_for_task / 6.)
        memory_limit = int(self._ml_memory_limit)

        _info = eval_with_limits(datamanager, self._tmp_dir, 1, self._seed,
                                 num_run, self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 1/2.")
        else:
            self._logger.error('Error creating dummy prediction 1/2:%s ',
                               _info[3])

        num_run += 1

        _info = eval_with_limits(datamanager, self._tmp_dir, 2, self._seed,
                                 num_run, self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 2/2.")
        else:
            self._logger.error('Error creating dummy prediction 2/2 %s',
                               _info[3])

        num_run += 1
        return num_run

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in [
                'holdout', 'holdout-iterative-fit', 'cv', 'nested-cv',
                'partial-cv'
        ]:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        acquisition_functions = ['EI', 'EIPS']
        if self.acquisition_function not in acquisition_functions:
            raise ValueError(
                'Illegal acquisition %s: Must be one of %s.' %
                (self.acquisition_function, acquisition_functions))

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(datamanager.data['X_train'],
                                 datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(self._dataset_name, self._time_for_task,
                                  time_for_load_data, self._logger)

        # == Perform dummy predictions
        num_run = 1
        #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # = Create a searchspace
        # Do this before One Hot Encoding to make sure that it creates a
        # search space for a dense classifier even if one hot encoding would
        # make it sparse (tradeoff; if one hot encoding would make it sparse,
        #  densifier and truncatedSVD would probably lead to a MemoryError,
        # like this we can't use some of the preprocessing methods in case
        # the data became sparse)
        self.configuration_space, configspace_path = self._create_search_space(
            self._tmp_dir, self._backend, datamanager,
            self._include_estimators, self._include_preprocessors)

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        time_left_for_ensembles = max(0,self._time_for_task \
                                      - self._stopwatch.wall_elapsed(self._dataset_name))
        if self._logger:
            self._logger.info('Start Ensemble with %5.2fsec time left' %
                              time_left_for_ensembles)
        if time_left_for_ensembles <= 0:
            self._logger.warning("Not starting ensemble builder because there "
                                 "is no time left!")
            self._proc_ensemble = None
        else:
            self._proc_ensemble = self._get_ensemble_process(
                time_left_for_ensembles)
            self._proc_ensemble.start()
        self._stopwatch.stop_task(ensemble_task_name)

        # == RUN SMBO
        # kill the datamanager as it will be re-loaded anyways from sub processes
        try:
            del self._datamanager
        except Exception:
            pass

        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))

        if self._logger:
            self._logger.info('Start SMAC with %5.2fsec time left' %
                              time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            self._procsmac = None
        else:
            self._proc_smac = AutoMLSMBO(
                config_space=self.configuration_space,
                dataset_name=self._dataset_name,
                tmp_dir=self._tmp_dir,
                output_dir=self._output_dir,
                total_walltime_limit=time_left_for_smac,
                func_eval_time_limit=self._per_run_time_limit,
                memory_limit=self._ml_memory_limit,
                data_memory_limit=self._data_memory_limit,
                watcher=self._stopwatch,
                start_num_run=num_run,
                num_metalearning_cfgs=self.
                _initial_configurations_via_metalearning,
                config_file=configspace_path,
                smac_iters=self._max_iter_smac,
                seed=self._seed,
                metadata_directory=self._metadata_directory,
                resampling_strategy=self._resampling_strategy,
                resampling_strategy_args=self._resampling_strategy_arguments,
                acquisition_function=self.acquisition_function,
                shared_mode=self._shared_mode)
            self._proc_smac.start()

        psutil_procs = []
        procs = []
        if self._proc_smac is not None:
            proc_smac = psutil.Process(self._proc_smac.pid)
            psutil_procs.append(proc_smac)
            procs.append(self._proc_smac)
        if self._proc_ensemble is not None:
            proc_ensemble = psutil.Process(self._proc_ensemble.pid)
            psutil_procs.append(proc_ensemble)
            procs.append(self._proc_ensemble)

        if self._queue is not None:
            self._queue.put(
                [time_for_load_data, data_manager_path, psutil_procs])
        else:
            for proc in procs:
                proc.join()

        if self._queue is None:
            self._load_models()

        return self

    def refit(self, X, y):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used in
                # predict method
                model.fit(X.copy(), y.copy())

        self._can_predict = True

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in  \
                        ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        all_predictions = []
        for identifier in self.ensemble_.get_model_identifiers():
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning(
                    "Prediction shape for model %s is %s "
                    "while X_.shape is %s" %
                    (model, str(prediction.shape), str(X_.shape)))
            all_predictions.append(prediction)

        if len(all_predictions) == 0:
            raise ValueError(
                'Something went wrong generating the predictions. '
                'The ensemble should consist of the following '
                'models: %s, the following models were loaded: '
                '%s' % (str(list(self.ensemble_indices_.keys())),
                        str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions

    def fit_ensemble(self,
                     task=None,
                     metric=None,
                     precision='32',
                     dataset_name=None,
                     ensemble_nbest=None,
                     ensemble_size=None):
        if self._logger is None:
            self._logger = self._get_logger(dataset_name)

        self._proc_ensemble = self._get_ensemble_process(
            1,
            task,
            metric,
            precision,
            dataset_name,
            max_iterations=1,
            ensemble_nbest=ensemble_nbest,
            ensemble_size=ensemble_size)
        self._proc_ensemble.main()

    def _get_ensemble_process(self,
                              time_left_for_ensembles,
                              task=None,
                              metric=None,
                              precision=None,
                              dataset_name=None,
                              max_iterations=-1,
                              ensemble_nbest=None,
                              ensemble_size=None):

        if task is None:
            task = self._task
        if metric is None:
            metric = self._metric
        if precision is None:
            precision = self.precision
        if dataset_name is None:
            dataset_name = self._dataset_name
        if ensemble_nbest is None:
            ensemble_nbest = self._ensemble_nbest
        if ensemble_size is None:
            ensemble_size = self._ensemble_size

        return EnsembleBuilder(autosklearn_tmp_dir=self._tmp_dir,
                               dataset_name=dataset_name,
                               task_type=task,
                               metric=metric,
                               limit=time_left_for_ensembles,
                               output_dir=self._output_dir,
                               ensemble_size=ensemble_size,
                               ensemble_nbest=ensemble_nbest,
                               seed=self._seed,
                               shared_mode=self._shared_mode,
                               precision=precision,
                               max_iterations=max_iterations)

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(
                identifiers)
        else:
            self.models_ = self._backend.load_all_models(seed)

        if len(self.models_) == 0:
            raise ValueError('No models fitted!')

    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict_proba(X)
        return calculate_score(y,
                               prediction,
                               self._task,
                               self._metric,
                               self._label_num,
                               logger=self._logger)

    def show_models(self):

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.

        :param X:
        :param y:
        :return:

        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def _create_search_space(self,
                             tmp_dir,
                             backend,
                             datamanager,
                             include_estimators=None,
                             include_preprocessors=None):
        task_name = 'CreateConfigSpace'

        self._stopwatch.start_task(task_name)
        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            include_preprocessors=include_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')
        self._stopwatch.stop_task(task_name)

        return configuration_space, configspace_path

    def configuration_space_created_hook(self, datamanager,
                                         configuration_space):
        return configuration_space

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" % self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                         self._tmp_dir)
                    pass
                else:
                    print("Could not delete tmp dir: %s" % self._tmp_dir)

예제 #5

파일 보기

파일: automl.py 프로젝트: stokasto/auto-sklearn

class AutoML(BaseEstimator, multiprocessing.Process):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self.models_ = None
        self.ensemble_indices_ = None

        self._debug_mode = debug_mode
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=parser)
        self._stopwatch.start_task(datamanager.name)

        logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self.start()

    def start(self):
        if self._datamanager is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._datamanager is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, bool)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain bools.')

        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=float(self._ml_memory_limit) / 3)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager):
        autosklearn.cli.base_interface.main(datamanager,
                                            self._resampling_strategy,
                                            None,
                                            None,
                                            mode_args=self._resampling_strategy_arguments)

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_indices_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        set_auto_seed(self._seed)

        # == Pickle the data manager, here, because no more global
        # OneHotEncoding
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(
            datamanager.data['X_train'],
            datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(
                self._dataset_name,
                self._time_for_task,
                time_for_load_data,
                self._logger)

        # == Perform dummy predictions
        self._do_dummy_prediction(datamanager)

        # = Create a searchspace
        # Do this before One Hot Encoding to make sure that it creates a
        # search space for a dense classifier even if one hot encoding would
        # make it sparse (tradeoff; if one hot encoding would make it sparse,
        #  densifier and truncatedSVD would probably lead to a MemoryError,
        # like this we can't use some of the preprocessing methods in case
        # the data became sparse)
        self.configuration_space, configspace_path = _create_search_space(
            self._tmp_dir,
            datamanager.info,
            self._backend,
            self._stopwatch,
            self._logger,
            self._include_estimators,
            self._include_preprocessors)
        self.configuration_space_created_hook(datamanager)

        # == Calculate metafeatures
        meta_features = _calculate_metafeatures(
            data_feat_type=datamanager.feat_type,
            data_info_task=datamanager.info['task'],
            x_train=datamanager.data['X_train'],
            y_train=datamanager.data['Y_train'],
            basename=self._dataset_name,
            watcher=self._stopwatch,
            metalearning_cnt=self._initial_configurations_via_metalearning,
            logger=self._logger)

        self._stopwatch.start_task('OneHot')
        datamanager.perform1HotEncoding()
        self._stopwatch.stop_task('OneHot')

        if meta_features is None:
            initial_configurations = []
        elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION,
                                          BINARY_CLASSIFICATION,
                                          MULTILABEL_CLASSIFICATION]:

            meta_features_encoded = _calculate_metafeatures_encoded(
                self._dataset_name,
                datamanager.data['X_train'],
                datamanager.data['Y_train'],
                self._stopwatch,
                self._logger)

            self._logger.debug(meta_features.__repr__(verbosity=2))
            self._logger.debug(meta_features_encoded.__repr__(verbosity=2))

            initial_configurations = _get_initial_configuration(
                meta_features,
                meta_features_encoded,
                self._dataset_name,
                self._metric,
                self.configuration_space,
                self._task,
                self._metadata_directory,
                self._initial_configurations_via_metalearning,
                datamanager.info[
                    'is_sparse'],
                self._stopwatch,
                self._logger)

            _print_debug_info_of_init_configuration(
                initial_configurations,
                self._dataset_name,
                self._time_for_task,
                self._logger,
                self._stopwatch)

        else:
            initial_configurations = []
            self._logger.warning('Metafeatures encoded not calculated')

        # == RUN SMAC
        if (datamanager.info["task"] == BINARY_CLASSIFICATION) or \
            (datamanager.info["task"] == MULTICLASS_CLASSIFICATION):
            config = {'balancing:strategy': 'weighting',
                      'classifier:__choice__': 'sgd',
                      'classifier:sgd:loss': 'hinge',
                      'classifier:sgd:penalty': 'l2',
                      'classifier:sgd:alpha': 0.0001,
                      'classifier:sgd:fit_intercept': 'True',
                      'classifier:sgd:n_iter': 5,
                      'classifier:sgd:learning_rate': 'optimal',
                      'classifier:sgd:eta0': 0.01,
                      'classifier:sgd:average': 'True',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'min/max'}
        elif datamanager.info["task"] == MULTILABEL_CLASSIFICATION:
            config = {'classifier:__choice__': 'adaboost',
                      'classifier:adaboost:algorithm': 'SAMME.R',
                      'classifier:adaboost:learning_rate': 1.0,
                      'classifier:adaboost:max_depth': 1,
                      'classifier:adaboost:n_estimators': 50,
                      'balancing:strategy': 'weighting',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'none'}
        else:
            config = None
            self._logger.info("Tasktype unknown: %s" %
                              TASK_TYPES_TO_STRING[datamanager.info["task"]])

        if config is not None:
            configuration = Configuration(self.configuration_space, config)
            config_string = convert_conf2smac_string(configuration)
            initial_configurations = [config_string] + initial_configurations

        # == RUN SMAC
        proc_smac = run_smac(tmp_dir=self._tmp_dir, basename=self._dataset_name,
                             time_for_task=self._time_for_task,
                             ml_memory_limit=self._ml_memory_limit,
                             data_manager_path=data_manager_path,
                             configspace_path=configspace_path,
                             initial_configurations=initial_configurations,
                             per_run_time_limit=self._per_run_time_limit,
                             watcher=self._stopwatch, backend=self._backend,
                             seed=self._seed,
                             resampling_strategy=self._resampling_strategy,
                             resampling_strategy_arguments=self._resampling_strategy_arguments,
                             shared_mode=self._shared_mode)

        # == RUN ensemble builder
        proc_ensembles = self.run_ensemble_builder()

        procs = []

        if proc_smac is not None:
            procs.append(proc_smac)
        if proc_ensembles is not None:
            procs.append(proc_ensembles)

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path, procs])
        else:
            for proc in procs:
                proc.wait()

        # Delete AutoSklearn environment variable
        del_auto_seed()

        # In case
        try:
            del self._datamanager
        except Exception:
            pass

        if self._queue is None:
            self._load_models()

        return self

    def run_ensemble_builder(self,
                             time_left_for_ensembles=None,
                             max_iterations=None,
                             ensemble_size=None):
        if self._ensemble_size > 0 or ensemble_size is not None:
            task_name = 'runEnsemble'
            self._stopwatch.start_task(task_name)

            if time_left_for_ensembles is None:
                time_left_for_ensembles = max(0,
                    self._time_for_task - self._stopwatch.wall_elapsed(
                        self._dataset_name))
            if max_iterations is None:
                max_iterations = -1
            if ensemble_size is None:
                ensemble_size = self._ensemble_size

            # It can happen that run_ensemble_builder is called without
            # calling fit.
            if self._logger:
                self._logger.info(
                    'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
            proc_ensembles = submit_process.run_ensemble_builder(
                tmp_dir=self._tmp_dir,
                dataset_name=self._dataset_name,
                task_type=self._task,
                metric=self._metric,
                limit=time_left_for_ensembles,
                output_dir=self._output_dir,
                ensemble_size=ensemble_size,
                ensemble_nbest=self._ensemble_nbest,
                seed=self._seed,
                shared_mode=self._shared_mode,
                max_iterations=max_iterations,
                precision=self.precision
            )
            self._stopwatch.stop_task(task_name)
            return proc_ensembles
        else:
            self._logger.info('Not starting ensemble script due to ensemble '
                             'size 0.')
            return None

    def predict(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self._resampling_strategy not in  ['holdout',
                                              'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or len(
                self.ensemble_indices_) == 0:
            self._load_models()

        predictions = []
        for identifier in self.models_:
            if identifier not in self.ensemble_indices_:
                continue

            weight = self.ensemble_indices_[identifier]
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))
            predictions.append(prediction * weight)

        if len(predictions) == 0:
            raise ValueError('Something went wrong generating the predictions. '
                             'The ensemble should consist of the following '
                             'models: %s, the following models were loaded: '
                             '%s' % (str(list(self.ensemble_indices_.keys())),
                                     str(list(self.models_.keys()))))

        predictions = np.sum(np.array(predictions), axis=0)
        return predictions

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.models_ = self._backend.load_all_models(seed)
        if len(self.models_) == 0:
            raise ValueError('No models fitted!')

        self.ensemble_indices_ = self._backend.load_ensemble_indices_weights(
            seed)

    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict(X)
        if self._task == BINARY_CLASSIFICATION:
            prediction = prediction[:, 1].reshape((-1, 1))
        return calculate_score(y, prediction, self._task,
                               self._metric, self._label_num,
                               logger=self._logger)

    def show_models(self):
        if self.models_ is None or len(self.models_) == 0 or len(
                self.ensemble_indices_) == 0:
            self._load_models()

        output = []
        sio = six.StringIO()
        for identifier in self.models_:
            if identifier not in self.ensemble_indices_:
                continue

            weight = self.ensemble_indices_[identifier]
            model = self.models_[identifier]
            output.append((weight, model))

        output.sort(reverse=True)

        sio.write("[")
        for weight, model in output:
            sio.write("(%f, %s),\n" % (weight, model))
        sio.write("]")

        return sio.getvalue()

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.

        :param X:
        :param y:
        :return:

        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def configuration_space_created_hook(self, datamanager):
        pass

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" %
                          self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                  self._tmp_dir)
                    pass
                else:
                    print("Could not delete tmp dir: %s" %
                          self._tmp_dir)

예제 #6

파일 보기

파일: automl.py 프로젝트: wanjinchang/auto-sklearn

class AutoML(BaseEstimator, multiprocessing.Process):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=parser)
        self._stopwatch.start_task(datamanager.name)

        logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self.start()

    def start(self):
        if self._datamanager is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._datamanager is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, str)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' % ft)

        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=float(self._ml_memory_limit) / 3)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager):
        self._logger.info("Starting to create dummy predictions.")
        autosklearn.cli.base_interface.main(datamanager,
                                            self._resampling_strategy,
                                            None,
                                            None,
                                            mode_args=self._resampling_strategy_arguments,
                                            output_dir=self._tmp_dir)
        self._logger.info("Finished creating dummy predictions.")

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        set_auto_seed(self._seed)

        # == Pickle the data manager, here, because no more global
        # OneHotEncoding
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(
            datamanager.data['X_train'],
            datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(
                self._dataset_name,
                self._time_for_task,
                time_for_load_data,
                self._logger)

        # == Perform dummy predictions
        if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
            self._do_dummy_prediction(datamanager)

        # = Create a searchspace
        # Do this before One Hot Encoding to make sure that it creates a
        # search space for a dense classifier even if one hot encoding would
        # make it sparse (tradeoff; if one hot encoding would make it sparse,
        #  densifier and truncatedSVD would probably lead to a MemoryError,
        # like this we can't use some of the preprocessing methods in case
        # the data became sparse)
        self.configuration_space, configspace_path = _create_search_space(
            self._tmp_dir,
            datamanager.info,
            self._backend,
            self._stopwatch,
            self._logger,
            self._include_estimators,
            self._include_preprocessors)
        self.configuration_space_created_hook(datamanager)

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        proc_ensembles = self.run_ensemble_builder()

        # == Calculate metafeatures
        meta_features = _calculate_metafeatures(
            data_feat_type=datamanager.feat_type,
            data_info_task=datamanager.info['task'],
            x_train=datamanager.data['X_train'],
            y_train=datamanager.data['Y_train'],
            basename=self._dataset_name,
            watcher=self._stopwatch,
            metalearning_cnt=self._initial_configurations_via_metalearning,
            logger=self._logger)

        self._stopwatch.start_task('OneHot')
        datamanager.perform1HotEncoding()
        self._stopwatch.stop_task('OneHot')

        if meta_features is None:
            initial_configurations = []
        elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION,
                                          BINARY_CLASSIFICATION,
                                          MULTILABEL_CLASSIFICATION]:

            meta_features_encoded = _calculate_metafeatures_encoded(
                self._dataset_name,
                datamanager.data['X_train'],
                datamanager.data['Y_train'],
                self._stopwatch,
                self._logger)

            self._logger.debug(meta_features.__repr__(verbosity=2))
            self._logger.debug(meta_features_encoded.__repr__(verbosity=2))

            initial_configurations = _get_initial_configuration(
                meta_features,
                meta_features_encoded,
                self._dataset_name,
                self._metric,
                self.configuration_space,
                self._task,
                self._metadata_directory,
                self._initial_configurations_via_metalearning,
                datamanager.info[
                    'is_sparse'],
                self._stopwatch,
                self._logger)

            _print_debug_info_of_init_configuration(
                initial_configurations,
                self._dataset_name,
                self._time_for_task,
                self._logger,
                self._stopwatch)

        else:
            initial_configurations = []
            self._logger.warning('Metafeatures encoded not calculated')

        # == RUN SMAC
        if (datamanager.info["task"] == BINARY_CLASSIFICATION) or \
            (datamanager.info["task"] == MULTICLASS_CLASSIFICATION):
            config = {'balancing:strategy': 'weighting',
                      'classifier:__choice__': 'sgd',
                      'classifier:sgd:loss': 'hinge',
                      'classifier:sgd:penalty': 'l2',
                      'classifier:sgd:alpha': 0.0001,
                      'classifier:sgd:fit_intercept': 'True',
                      'classifier:sgd:n_iter': 5,
                      'classifier:sgd:learning_rate': 'optimal',
                      'classifier:sgd:eta0': 0.01,
                      'classifier:sgd:average': 'True',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'min/max'}
        elif datamanager.info["task"] == MULTILABEL_CLASSIFICATION:
            config = {'classifier:__choice__': 'adaboost',
                      'classifier:adaboost:algorithm': 'SAMME.R',
                      'classifier:adaboost:learning_rate': 1.0,
                      'classifier:adaboost:max_depth': 1,
                      'classifier:adaboost:n_estimators': 50,
                      'balancing:strategy': 'weighting',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'none'}
        else:
            config = None
            self._logger.info("Tasktype unknown: %s" %
                              TASK_TYPES_TO_STRING[datamanager.info["task"]])

        if config is not None:
            try:
                configuration = Configuration(self.configuration_space, config)
                config_string = convert_conf2smac_string(configuration)
                initial_configurations = [config_string] + initial_configurations
            except ValueError:
                pass

        # == RUN SMAC
        proc_smac = run_smac(tmp_dir=self._tmp_dir, basename=self._dataset_name,
                             time_for_task=self._time_for_task,
                             ml_memory_limit=self._ml_memory_limit,
                             data_manager_path=data_manager_path,
                             configspace_path=configspace_path,
                             initial_configurations=initial_configurations,
                             per_run_time_limit=self._per_run_time_limit,
                             watcher=self._stopwatch, backend=self._backend,
                             seed=self._seed,
                             resampling_strategy=self._resampling_strategy,
                             resampling_strategy_arguments=self._resampling_strategy_arguments,
                             shared_mode=self._shared_mode)

        procs = []

        if proc_smac is not None:
            procs.append(proc_smac)
        if proc_ensembles is not None:
            procs.append(proc_ensembles)

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path, procs])
        else:
            for proc in procs:
                proc.wait()

        # Delete AutoSklearn environment variable
        del_auto_seed()

        # In case
        try:
            del self._datamanager
        except Exception:
            pass

        if self._queue is None:
            self._load_models()

        return self

    def run_ensemble_builder(self,
                             time_left_for_ensembles=None,
                             max_iterations=None,
                             ensemble_size=None):
        if self._ensemble_size > 0 or ensemble_size is not None:
            task_name = 'runEnsemble'
            self._stopwatch.start_task(task_name)

            if time_left_for_ensembles is None:
                time_left_for_ensembles = max(0,
                    self._time_for_task - self._stopwatch.wall_elapsed(
                        self._dataset_name))
            if max_iterations is None:
                max_iterations = -1
            if ensemble_size is None:
                ensemble_size = self._ensemble_size

            # It can happen that run_ensemble_builder is called without
            # calling fit.
            if self._logger:
                self._logger.info(
                    'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
            proc_ensembles = submit_process.run_ensemble_builder(
                tmp_dir=self._tmp_dir,
                dataset_name=self._dataset_name,
                task_type=self._task,
                metric=self._metric,
                limit=time_left_for_ensembles,
                output_dir=self._output_dir,
                ensemble_size=ensemble_size,
                ensemble_nbest=self._ensemble_nbest,
                seed=self._seed,
                shared_mode=self._shared_mode,
                max_iterations=max_iterations,
                precision=self.precision
            )
            self._stopwatch.stop_task(task_name)
            return proc_ensembles
        else:
            self._logger.info('Not starting ensemble script due to ensemble '
                             'size 0.')
            return None

    def refit(self, X, y):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used in
                # predict method
                model.fit(X.copy(), y.copy())

        self._can_predict = True

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in  \
                        ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        all_predictions = []
        for identifier in self.ensemble_.get_model_identifiers():
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))
            all_predictions.append(prediction)

        if len(all_predictions) == 0:
            raise ValueError('Something went wrong generating the predictions. '
                             'The ensemble should consist of the following '
                             'models: %s, the following models were loaded: '
                             '%s' % (str(list(self.ensemble_indices_.keys())),
                                     str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(identifiers)
        else:
            self.models_ = self._backend.load_all_models(seed)

        if len(self.models_) == 0:
            raise ValueError('No models fitted!')


    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict_proba(X)
        return calculate_score(y, prediction, self._task,
                               self._metric, self._label_num,
                               logger=self._logger)

    def show_models(self):

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.

        :param X:
        :param y:
        :return:

        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def configuration_space_created_hook(self, datamanager):
        pass

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" %
                          self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                  self._tmp_dir)
                    pass
                else:
                    print("Could not delete tmp dir: %s" %
                          self._tmp_dir)

예제 #7

파일 보기

파일: abstract_evaluator.py 프로젝트: Allen1203/auto-sklearn

class AbstractEvaluator(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self, Datamanager, output_dir, configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_y_test=False,
                 num_run=None):

        self.starttime = time.time()

        self.output_dir = output_dir
        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = \
                    autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)

    def fit_predict_and_loss(self):
        """Fit model(s) according to resampling strategy, predict for the
        validation set and return the loss and predictions on the validation
        set.

        Provides a closed interface in which all steps of the target
        algorithm are performed without any communication with other
        processes. Useful for cross-validation because it allows to train a
        model, predict for the validation set and then forget the model in
        order to save main memory.
        """
        raise NotImplementedError()

    def iterative_fit(self):
        """Fit a model iteratively.

        Fitting can be interrupted in order to use a partially trained model."""
        raise NotImplementedError()

    def predict_and_loss(self):
        """Use current model to predict on the validation set and calculate
        loss.

         Should be used when using iterative fitting."""
        raise NotImplementedError()

    def predict(self):
        """Use the current model to predict on the validation set.

        Should only be used to create dummy predictions."""
        raise NotImplementedError()

    def _loss(self, y_true, y_hat):
        if self.configuration is None:
            if self.all_scoring_functions:
                return {self.metric: 1.0}
            else:
                return 1.0

        score = calculate_score(
            y_true, y_hat, self.task_type,
            self.metric, self.D.info['label_num'],
            all_scoring_functions=self.all_scoring_functions)

        if hasattr(score, '__len__'):
            err = {key: 1 - score[key] for key in score}
        else:
            err = 1 - score

        return err

    def finish_up(self, loss=None, opt_pred=None, valid_pred=None,
                  test_pred=None):
        """This function does everything necessary after the fitting is done:

        * predicting
        * saving the files for the ensembles_statistics
        * generate output for SMAC
        We use it as the signal handler so we can recycle the code for the
        normal usecase and when the runsolver kills us here :)"""

        try:
            self.duration = time.time() - self.starttime
            if loss is None:
                loss, opt_pred, valid_pred, test_pred = self.predict_and_loss()
            self.file_output(loss, opt_pred, valid_pred, test_pred)
            self.duration = time.time() - self.starttime

            num_run = str(self.num_run).zfill(5)
            if isinstance(loss, dict):
                loss_ = loss
                loss = loss_[self.D.info['metric']]
            else:
                loss_ = {}
            additional_run_info = ';'.join(['%s: %s' %
                                    (METRIC_TO_STRING[
                                         metric] if metric in METRIC_TO_STRING else metric,
                                     value)
                                    for metric, value in loss_.items()])
            additional_run_info += ';' + 'duration: ' + str(self.duration)
            additional_run_info += ';' + 'num_run:' + num_run

            if self.configuration is not None:
                self._output_SMAC_string(self.duration, loss, self.seed,
                                         additional_run_info)
        except Exception as e:
            self.duration = time.time() - self.starttime
            print(traceback.format_exc())
            self._output_SMAC_string(self.duration, 2.0, self.seed,
                'No results were produced! Error is %s' % str(e))

    def _output_SMAC_string(self, duration, loss, seed, additional_run_info):
        print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
              ('SAT', abs(self.duration), loss, self.seed,
               additional_run_info))

    def file_output(self, loss, Y_optimization_pred, Y_valid_pred, Y_test_pred):
        seed = os.environ.get('AUTOSKLEARN_SEED')

        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return 2, "Targets %s and prediction %s don't have the same " \
            "length. Probably training didn't finish" % (
                self.Y_optimization.shape, Y_optimization_pred.shape)

        num_run = str(self.num_run).zfill(5)
        if os.path.exists(self.backend.get_model_dir()):
            self.backend.save_model(self.model, self.num_run, seed)

        if self.output_y_test:
            try:
                os.makedirs(self.output_dir)
            except OSError:
                pass
            self.backend.save_targets_ensemble(self.Y_optimization)

        self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble',
                                             seed, num_run)

        if Y_valid_pred is not None:
            self.backend.save_predictions_as_npy(Y_valid_pred, 'valid',
                                                 seed, num_run)

        if Y_test_pred is not None:
            self.backend.save_predictions_as_npy(Y_test_pred, 'test',
                                                 seed, num_run)

    def _predict_proba(self, X, model, task_type, Y_train):
        Y_pred = model.predict_proba(X, batch_size=1000)
        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred

    def _predict_regression(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred

    def _ensure_prediction_array_sizes(self, prediction, Y_train):
        num_classes = self.D.info['label_num']

        if self.task_type == MULTICLASS_CLASSIFICATION and \
                prediction.shape[1] < num_classes:
            if Y_train is None:
                raise ValueError('Y_train must not be None!')
            classes = list(np.unique(Y_train))

            mapping = dict()
            for class_number in range(num_classes):
                if class_number in classes:
                    index = classes.index(class_number)
                    mapping[index] = class_number
            new_predictions = np.zeros((prediction.shape[0], num_classes),
                                       dtype=np.float32)

            for index in mapping:
                class_index = mapping[index]
                new_predictions[:, class_index] = prediction[:, index]

            return new_predictions

        return prediction

예제 #8

파일 보기

파일: abstract_evaluator.py 프로젝트: ixtel/auto-sklearn

class AbstractEvaluator(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self, Datamanager, configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_dir=None,
                 output_y_test=False,
                 num_run=None):

        self.starttime = time.time()

        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        if output_dir is None:
            self.output_dir = os.getcwd()
        else:
            self.output_dir = output_dir

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = ParamSklearnRegressor
            self.predict_function = self.predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = ParamSklearnClassifier
            self.predict_function = self.predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)

    @abc.abstractmethod
    def fit(self):
        pass

    @abc.abstractmethod
    def predict(self):
        pass

    # This function does everything necessary after the fitting is done:
    #        predicting
    #        saving the files for the ensembles_statistics
    #        generate output for SMAC
    # We use it as the signal handler so we can recycle the code for the
    # normal usecase and when the runsolver kills us here :)
    def finish_up(self):
        try:
            self.duration = time.time() - self.starttime
            result, additional_run_info = self.file_output()
            if self.configuration is not None:
                print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
                      ('SAT', abs(self.duration), result, self.seed,
                       additional_run_info))
        except Exception as e:
            self.duration = time.time() - self.starttime

            print(traceback.format_exc())
            print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
                  ('TIMEOUT', abs(self.duration), 1.0, self.seed,
                   'No results were produced! Error is %s' % str(e)))

    def file_output(self):
        seed = os.environ.get('AUTOSKLEARN_SEED')

        errs, Y_optimization_pred, Y_valid_pred, Y_test_pred = self.predict()

        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return 2, "Targets %s and prediction %s don't have the same " \
            "length. Probably training didn't finish" % (
                self.Y_optimization.shape, Y_optimization_pred.shape)

        num_run = str(self.num_run).zfill(5)

        if os.path.exists(self.backend.get_model_dir()):
            self.backend.save_model(self.model, self.num_run, seed)

        if self.output_y_test:
            try:
                os.makedirs(self.output_dir)
            except OSError:
                pass
            self.backend.save_targets_ensemble(self.Y_optimization)

        self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble',
                                             seed, num_run)

        if Y_valid_pred is not None:
            self.backend.save_predictions_as_npy(Y_valid_pred, 'valid',
                                                 seed, num_run)

        if Y_test_pred is not None:
            self.backend.save_predictions_as_npy(Y_test_pred, 'test',
                                                 seed, num_run)

        self.duration = time.time() - self.starttime
        err = errs[self.D.info['metric']]
        additional_run_info = ';'.join(['%s: %s' %
            (METRIC_TO_STRING[metric] if metric in METRIC_TO_STRING else metric,
                                                                     value)
                                        for metric, value in errs.items()])
        additional_run_info += ';' + 'duration: ' + str(self.duration)
        additional_run_info += ';' + 'num_run:' + num_run
        return err, additional_run_info

    def predict_proba(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict_proba(X, batch_size=1000)

        if task_type == MULTILABEL_CLASSIFICATION:
            Y_pred = np.hstack([Y_pred[i][:, -1].reshape((-1, 1))
                                for i in range(len(Y_pred))])

        elif task_type == BINARY_CLASSIFICATION:
            if len(Y_pred.shape) != 1:
                Y_pred = Y_pred[:, 1].reshape(-1, 1)

        elif task_type == MULTICLASS_CLASSIFICATION:
            pass

        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred

    def predict_regression(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred

    def _ensure_prediction_array_sizes(self, prediction, Y_train):
        num_classes = self.D.info['label_num']

        if self.task_type == MULTICLASS_CLASSIFICATION and \
                prediction.shape[1] < num_classes:
            classes = list(np.unique(self.D.data['Y_train']))
            if num_classes == prediction.shape[1]:
                return prediction

            if Y_train is not None:
                classes = list(np.unique(Y_train))

            mapping = dict()
            for class_number in range(num_classes):
                if class_number in classes:
                    index = classes.index(class_number)
                    mapping[index] = class_number
            new_predictions = np.zeros((prediction.shape[0], num_classes))
            for index in mapping:
                class_index = mapping[index]
                new_predictions[:, class_index] = prediction[:, index]

            return new_predictions

        return prediction

예제 #9

파일 보기

파일: automl.py 프로젝트: Ayaro/auto-sklearn

class AutoML(BaseEstimator, multiprocessing.Process):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI'):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self._max_iter_smac = max_iter_smac
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))

        # After assignging and checking variables...
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._parser = parser
        self.start()

    def start(self):
        if self._parser is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._parser is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=self._parser)
        self._stopwatch.start_task(datamanager.name)

        self._logger = self._get_logger(datamanager.name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, str)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' % ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    def _get_logger(self, name):
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        return get_logger(logger_name)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager, num_run):

        self._logger.info("Starting to create dummy predictions.")
        time_limit = int(self._time_for_task / 6.)
        memory_limit = int(self._ml_memory_limit)

        _info = eval_with_limits(datamanager, self._tmp_dir, 1,
                                 self._seed, num_run,
                                 self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 1/2.")
        else:
            self._logger.error('Error creating dummy prediction 1/2:%s ',
                               _info[3])

        num_run += 1

        _info = eval_with_limits(datamanager, self._tmp_dir, 2,
                                 self._seed, num_run,
                                 self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 2/2.")
        else:
            self._logger.error('Error creating dummy prediction 2/2 %s',
                               _info[3])

        num_run += 1
        return num_run

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        acquisition_functions = ['EI', 'EIPS']
        if self.acquisition_function not in acquisition_functions:
            raise ValueError('Illegal acquisition %s: Must be one of %s.' %
                             (self.acquisition_function, acquisition_functions))

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(
            datamanager.data['X_train'],
            datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(
                self._dataset_name,
                self._time_for_task,
                time_for_load_data,
                self._logger)

        # == Perform dummy predictions
        num_run = 1
        #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # = Create a searchspace
        # Do this before One Hot Encoding to make sure that it creates a
        # search space for a dense classifier even if one hot encoding would
        # make it sparse (tradeoff; if one hot encoding would make it sparse,
        #  densifier and truncatedSVD would probably lead to a MemoryError,
        # like this we can't use some of the preprocessing methods in case
        # the data became sparse)
        self.configuration_space, configspace_path = self._create_search_space(
            self._tmp_dir,
            self._backend,
            datamanager,
            self._include_estimators,
            self._include_preprocessors)

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        time_left_for_ensembles = max(0,self._time_for_task \
                                      - self._stopwatch.wall_elapsed(self._dataset_name))
        if self._logger:
            self._logger.info(
                'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
        if time_left_for_ensembles <= 0:
            self._logger.warning("Not starting ensemble builder because there "
                                 "is no time left!")
            self._proc_ensemble = None
        else:
            self._proc_ensemble = self._get_ensemble_process(time_left_for_ensembles)
            self._proc_ensemble.start()
        self._stopwatch.stop_task(ensemble_task_name)

        # == RUN SMBO
        # kill the datamanager as it will be re-loaded anyways from sub processes
        try:
            del self._datamanager
        except Exception:
            pass
            
        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(0,
            self._time_for_task - self._stopwatch.wall_elapsed(
            self._dataset_name))

        if self._logger:
            self._logger.info(
                'Start SMAC with %5.2fsec time left' % time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            self._procsmac = None
        else:
            self._proc_smac = AutoMLSMBO(config_space=self.configuration_space,
                                         dataset_name=self._dataset_name,
                                         tmp_dir=self._tmp_dir,
                                         output_dir=self._output_dir,
                                         total_walltime_limit=time_left_for_smac,
                                         func_eval_time_limit=self._per_run_time_limit,
                                         memory_limit=self._ml_memory_limit,
                                         data_memory_limit=self._data_memory_limit,
                                         watcher=self._stopwatch,
                                         start_num_run=num_run,
                                         num_metalearning_cfgs=self._initial_configurations_via_metalearning,
                                         config_file=configspace_path,
                                         smac_iters=self._max_iter_smac,
                                         seed=self._seed,
                                         metadata_directory=self._metadata_directory,
                                         resampling_strategy=self._resampling_strategy,
                                         resampling_strategy_args=self._resampling_strategy_arguments,
                                         acquisition_function=self.acquisition_function,
                                         shared_mode=self._shared_mode)
            self._proc_smac.start()

        psutil_procs = []
        procs = []
        if self._proc_smac is not None:
            proc_smac = psutil.Process(self._proc_smac.pid)
            psutil_procs.append(proc_smac)
            procs.append(self._proc_smac)
        if self._proc_ensemble is not None:
            proc_ensemble = psutil.Process(self._proc_ensemble.pid)
            psutil_procs.append(proc_ensemble)
            procs.append(self._proc_ensemble)

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path, psutil_procs])
        else:
            for proc in procs:
                proc.join()

        if self._queue is None:
            self._load_models()

        return self

    def refit(self, X, y):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used in
                # predict method
                model.fit(X.copy(), y.copy())

        self._can_predict = True

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in  \
                        ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        all_predictions = []
        for identifier in self.ensemble_.get_model_identifiers():
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))
            all_predictions.append(prediction)

        if len(all_predictions) == 0:
            raise ValueError('Something went wrong generating the predictions. '
                             'The ensemble should consist of the following '
                             'models: %s, the following models were loaded: '
                             '%s' % (str(list(self.ensemble_indices_.keys())),
                                     str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions

    def fit_ensemble(self, task=None, metric=None, precision='32',
                     dataset_name=None, ensemble_nbest=None,
                     ensemble_size=None):
        if self._logger is None:
            self._logger = self._get_logger(dataset_name)

        self._proc_ensemble = self._get_ensemble_process(
            1, task, metric, precision, dataset_name, max_iterations=1,
            ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
        self._proc_ensemble.main()

    def _get_ensemble_process(self, time_left_for_ensembles,
                              task=None, metric=None, precision=None,
                              dataset_name=None, max_iterations=-1,
                              ensemble_nbest=None, ensemble_size=None):

        if task is None:
            task = self._task
        if metric is None:
            metric = self._metric
        if precision is None:
            precision = self.precision
        if dataset_name is None:
            dataset_name = self._dataset_name
        if ensemble_nbest is None:
            ensemble_nbest = self._ensemble_nbest
        if ensemble_size is None:
            ensemble_size = self._ensemble_size

        return EnsembleBuilder(autosklearn_tmp_dir=self._tmp_dir,
                               dataset_name=dataset_name,
                               task_type=task,
                               metric=metric,
                               limit=time_left_for_ensembles,
                               output_dir=self._output_dir,
                               ensemble_size=ensemble_size,
                               ensemble_nbest=ensemble_nbest,
                               seed=self._seed,
                               shared_mode=self._shared_mode,
                               precision=precision,
                               max_iterations=max_iterations)

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(identifiers)
        else:
            self.models_ = self._backend.load_all_models(seed)

        if len(self.models_) == 0:
            raise ValueError('No models fitted!')


    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict_proba(X)
        return calculate_score(y, prediction, self._task,
                               self._metric, self._label_num,
                               logger=self._logger)

    def show_models(self):

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.

        :param X:
        :param y:
        :return:

        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def _create_search_space(self, tmp_dir, backend, datamanager,
                             include_estimators=None,
                             include_preprocessors=None):
        task_name = 'CreateConfigSpace'

        self._stopwatch.start_task(task_name)
        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            include_preprocessors=include_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')
        self._stopwatch.stop_task(task_name)

        return configuration_space, configspace_path

    def configuration_space_created_hook(self, datamanager, configuration_space):
        return configuration_space

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" %
                          self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                  self._tmp_dir)
                    pass
                else:
                    print("Could not delete tmp dir: %s" %
                          self._tmp_dir)

예제 #10

파일 보기

class AbstractEvaluator(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self,
                 Datamanager,
                 configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_dir=None,
                 output_y_test=False,
                 num_run=None):

        self.starttime = time.time()

        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        if output_dir is None:
            self.output_dir = os.getcwd()
        else:
            self.output_dir = output_dir

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = SimpleRegressionPipeline
            self.predict_function = self.predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = SimpleClassificationPipeline
            self.predict_function = self.predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)

    @abc.abstractmethod
    def fit(self):
        pass

    @abc.abstractmethod
    def predict(self):
        pass

    # This function does everything necessary after the fitting is done:
    #        predicting
    #        saving the files for the ensembles_statistics
    #        generate output for SMAC
    # We use it as the signal handler so we can recycle the code for the
    # normal usecase and when the runsolver kills us here :)
    def finish_up(self):
        try:
            self.duration = time.time() - self.starttime
            result, additional_run_info = self.file_output()
            if self.configuration is not None:
                print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
                      ('SAT', abs(self.duration), result, self.seed,
                       additional_run_info))
        except Exception as e:
            self.duration = time.time() - self.starttime

            print(traceback.format_exc())
            print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
                  ('TIMEOUT', abs(self.duration), 1.0, self.seed,
                   'No results were produced! Error is %s' % str(e)))

    def file_output(self):
        seed = os.environ.get('AUTOSKLEARN_SEED')

        errs, Y_optimization_pred, Y_valid_pred, Y_test_pred = self.predict()

        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return 2, "Targets %s and prediction %s don't have the same " \
            "length. Probably training didn't finish" % (
                self.Y_optimization.shape, Y_optimization_pred.shape)

        num_run = str(self.num_run).zfill(5)

        if os.path.exists(self.backend.get_model_dir()):
            self.backend.save_model(self.model, self.num_run, seed)

        if self.output_y_test:
            try:
                os.makedirs(self.output_dir)
            except OSError:
                pass
            self.backend.save_targets_ensemble(self.Y_optimization)

        self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble',
                                             seed, num_run)

        if Y_valid_pred is not None:
            self.backend.save_predictions_as_npy(Y_valid_pred, 'valid', seed,
                                                 num_run)

        if Y_test_pred is not None:
            self.backend.save_predictions_as_npy(Y_test_pred, 'test', seed,
                                                 num_run)

        self.duration = time.time() - self.starttime
        err = errs[self.D.info['metric']]
        additional_run_info = ';'.join([
            '%s: %s' % (METRIC_TO_STRING[metric]
                        if metric in METRIC_TO_STRING else metric, value)
            for metric, value in errs.items()
        ])
        additional_run_info += ';' + 'duration: ' + str(self.duration)
        additional_run_info += ';' + 'num_run:' + num_run
        return err, additional_run_info

    def predict_proba(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict_proba(X, batch_size=1000)

        if task_type == MULTILABEL_CLASSIFICATION:
            Y_pred = np.hstack([
                Y_pred[i][:, -1].reshape((-1, 1)) for i in range(len(Y_pred))
            ])

        elif task_type == BINARY_CLASSIFICATION:
            if len(Y_pred.shape) != 1:
                Y_pred = Y_pred[:, 1].reshape(-1, 1)

        elif task_type == MULTICLASS_CLASSIFICATION:
            pass

        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred

    def predict_regression(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred

    def _ensure_prediction_array_sizes(self, prediction, Y_train):
        num_classes = self.D.info['label_num']

        if self.task_type == MULTICLASS_CLASSIFICATION and \
                prediction.shape[1] < num_classes:
            classes = list(np.unique(self.D.data['Y_train']))
            if num_classes == prediction.shape[1]:
                return prediction

            if Y_train is not None:
                classes = list(np.unique(Y_train))

            mapping = dict()
            for class_number in range(num_classes):
                if class_number in classes:
                    index = classes.index(class_number)
                    mapping[index] = class_number
            new_predictions = np.zeros((prediction.shape[0], num_classes))
            for index in mapping:
                class_index = mapping[index]
                new_predictions[:, class_index] = prediction[:, index]

            return new_predictions

        return prediction