def _fit(self, datamanager):
    # Reset learnt stuff
    self.models_ = None
    self.ensemble_ = None

    # Check arguments prior to doing anything!
    if self._resampling_strategy not in ['holdout',
                                         'holdout-iterative-fit',
                                         'cv', 'nested-cv', 'partial-cv']:
        raise ValueError('Illegal resampling strategy: %s' %
                         self._resampling_strategy)
    if self._resampling_strategy == 'partial-cv' and \
            self._ensemble_size != 0:
        raise ValueError("Resampling strategy partial-cv cannot be used "
                         "together with ensembles.")

    self._backend._make_internals_directory()
    if self._keep_models:
        try:
            os.mkdir(self._backend.get_model_dir())
        except OSError:
            self._logger.warning("model directory already exists")
            if not self._shared_mode:
                raise

    self._metric = datamanager.info['metric']
    self._task = datamanager.info['task']
    self._label_num = datamanager.info['label_num']

    set_auto_seed(self._seed)

    # == Pickle the data manager here, because one-hot encoding is no
    # longer applied globally
    data_manager_path = self._backend.save_datamanager(datamanager)

    self._save_ensemble_data(
        datamanager.data['X_train'],
        datamanager.data['Y_train'])

    time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

    if self._debug_mode:
        self._print_load_time(
            self._dataset_name,
            self._time_for_task,
            time_for_load_data,
            self._logger)

    # == Perform dummy predictions
    if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        self._do_dummy_prediction(datamanager)

    # == Create a search space
    # Do this before one-hot encoding to make sure that a search space
    # for a dense classifier is created even if one-hot encoding would
    # make the data sparse (trade-off: if one-hot encoding made the data
    # sparse, the densifier and TruncatedSVD would probably lead to a
    # MemoryError; in exchange, some preprocessing methods cannot be
    # used in case the data becomes sparse)
    self.configuration_space, configspace_path = _create_search_space(
        self._tmp_dir,
        datamanager.info,
        self._backend,
        self._stopwatch,
        self._logger,
        self._include_estimators,
        self._include_preprocessors)
    self.configuration_space_created_hook(datamanager)

    # == RUN ensemble builder
    # Do this before calculating the meta-features to make sure that the
    # dummy predictions are actually included in the ensemble even if
    # calculating the meta-features takes very long
    proc_ensembles = self.run_ensemble_builder()

    # == Calculate metafeatures
    meta_features = _calculate_metafeatures(
        data_feat_type=datamanager.feat_type,
        data_info_task=datamanager.info['task'],
        x_train=datamanager.data['X_train'],
        y_train=datamanager.data['Y_train'],
        basename=self._dataset_name,
        watcher=self._stopwatch,
        metalearning_cnt=self._initial_configurations_via_metalearning,
        logger=self._logger)

    self._stopwatch.start_task('OneHot')
    datamanager.perform1HotEncoding()
    self._stopwatch.stop_task('OneHot')

    if meta_features is None:
        initial_configurations = []
    elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION,
                                      BINARY_CLASSIFICATION,
                                      MULTILABEL_CLASSIFICATION]:
        meta_features_encoded = _calculate_metafeatures_encoded(
            self._dataset_name,
            datamanager.data['X_train'],
            datamanager.data['Y_train'],
            self._stopwatch,
            self._logger)

        self._logger.debug(meta_features.__repr__(verbosity=2))
        self._logger.debug(meta_features_encoded.__repr__(verbosity=2))

        initial_configurations = _get_initial_configuration(
            meta_features,
            meta_features_encoded,
            self._dataset_name,
            self._metric,
            self.configuration_space,
            self._task,
            self._metadata_directory,
            self._initial_configurations_via_metalearning,
            datamanager.info['is_sparse'],
            self._stopwatch,
            self._logger)

        _print_debug_info_of_init_configuration(
            initial_configurations,
            self._dataset_name,
            self._time_for_task,
            self._logger,
            self._stopwatch)
    else:
        initial_configurations = []
        self._logger.warning('Metafeatures encoded not calculated')

    # == Seed the search with a task-specific default configuration
    if (datamanager.info["task"] == BINARY_CLASSIFICATION) or \
            (datamanager.info["task"] == MULTICLASS_CLASSIFICATION):
        config = {'balancing:strategy': 'weighting',
                  'classifier:__choice__': 'sgd',
                  'classifier:sgd:loss': 'hinge',
                  'classifier:sgd:penalty': 'l2',
                  'classifier:sgd:alpha': 0.0001,
                  'classifier:sgd:fit_intercept': 'True',
                  'classifier:sgd:n_iter': 5,
                  'classifier:sgd:learning_rate': 'optimal',
                  'classifier:sgd:eta0': 0.01,
                  'classifier:sgd:average': 'True',
                  'imputation:strategy': 'mean',
                  'one_hot_encoding:use_minimum_fraction': 'True',
                  'one_hot_encoding:minimum_fraction': 0.1,
                  'preprocessor:__choice__': 'no_preprocessing',
                  'rescaling:__choice__': 'min/max'}
    elif datamanager.info["task"] == MULTILABEL_CLASSIFICATION:
        config = {'classifier:__choice__': 'adaboost',
                  'classifier:adaboost:algorithm': 'SAMME.R',
                  'classifier:adaboost:learning_rate': 1.0,
                  'classifier:adaboost:max_depth': 1,
                  'classifier:adaboost:n_estimators': 50,
                  'balancing:strategy': 'weighting',
                  'imputation:strategy': 'mean',
                  'one_hot_encoding:use_minimum_fraction': 'True',
                  'one_hot_encoding:minimum_fraction': 0.1,
                  'preprocessor:__choice__': 'no_preprocessing',
                  'rescaling:__choice__': 'none'}
    else:
        config = None
        self._logger.info("No default configuration for task type: %s" %
                          TASK_TYPES_TO_STRING[datamanager.info["task"]])

    if config is not None:
        try:
            configuration = Configuration(self.configuration_space, config)
            config_string = convert_conf2smac_string(configuration)
            initial_configurations = [config_string] + initial_configurations
        except ValueError:
            pass

    # == RUN SMAC
    proc_smac = run_smac(
        tmp_dir=self._tmp_dir,
        basename=self._dataset_name,
        time_for_task=self._time_for_task,
        ml_memory_limit=self._ml_memory_limit,
        data_manager_path=data_manager_path,
        configspace_path=configspace_path,
        initial_configurations=initial_configurations,
        per_run_time_limit=self._per_run_time_limit,
        watcher=self._stopwatch,
        backend=self._backend,
        seed=self._seed,
        resampling_strategy=self._resampling_strategy,
        resampling_strategy_arguments=self._resampling_strategy_arguments,
        shared_mode=self._shared_mode)

    procs = []
    if proc_smac is not None:
        procs.append(proc_smac)
    if proc_ensembles is not None:
        procs.append(proc_ensembles)

    if self._queue is not None:
        self._queue.put([time_for_load_data, data_manager_path, procs])
    else:
        for proc in procs:
            proc.wait()

    # Delete the AutoSklearn seed environment variable
    del_auto_seed()

    # In case the data manager is still attached, drop the reference
    try:
        del self._datamanager
    except Exception:
        pass

    if self._queue is None:
        self._load_models()

    return self
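# The try/except around Configuration above exists because a hand-written
# dict of hyperparameter values is only valid if it describes a point in the
# current search space, which may have been restricted via
# include_estimators / include_preprocessors. Below is a minimal,
# self-contained sketch of that validation pattern using the ConfigSpace
# package; the space and hyperparameter names are hypothetical stand-ins,
# not auto-sklearn's actual search space.

from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
from ConfigSpace.hyperparameters import (CategoricalHyperparameter,
                                         UniformFloatHyperparameter)

example_space = ConfigurationSpace()
example_space.add_hyperparameter(
    CategoricalHyperparameter('classifier:__choice__', ['sgd', 'adaboost']))
example_space.add_hyperparameter(
    UniformFloatHyperparameter('classifier:sgd:alpha', 1e-6, 1e-1,
                               default_value=1e-4))

default_values = {'classifier:__choice__': 'sgd',
                  'classifier:sgd:alpha': 0.0001}
example_initial_configurations = []
try:
    # Construction validates every value; an out-of-range or unknown
    # value typically raises a ValueError.
    valid_default = Configuration(example_space, values=default_values)
    example_initial_configurations.insert(0, valid_default)
except ValueError:
    # The default does not fit the restricted space, so it is skipped.
    pass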
def _fit(self, datamanager):
    # Reset learnt stuff
    self.models_ = None
    self.ensemble_indices_ = None

    # Check arguments prior to doing anything!
    if self._resampling_strategy not in ['holdout',
                                         'holdout-iterative-fit',
                                         'cv', 'nested-cv', 'partial-cv']:
        raise ValueError('Illegal resampling strategy: %s' %
                         self._resampling_strategy)
    if self._resampling_strategy == 'partial-cv' and \
            self._ensemble_size != 0:
        raise ValueError("Resampling strategy partial-cv cannot be used "
                         "together with ensembles.")

    self._backend._make_internals_directory()
    if self._keep_models:
        try:
            os.mkdir(self._backend.get_model_dir())
        except OSError:
            self._logger.warning("model directory already exists")
            if not self._shared_mode:
                raise

    self._metric = datamanager.info['metric']
    self._task = datamanager.info['task']
    self._label_num = datamanager.info['label_num']

    set_auto_seed(self._seed)

    # == Pickle the data manager here, because one-hot encoding is no
    # longer applied globally
    data_manager_path = self._backend.save_datamanager(datamanager)

    self._save_ensemble_data(
        datamanager.data['X_train'],
        datamanager.data['Y_train'])

    time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

    if self._debug_mode:
        self._print_load_time(
            self._dataset_name,
            self._time_for_task,
            time_for_load_data,
            self._logger)

    # == Perform dummy predictions
    self._do_dummy_prediction(datamanager)

    # == Create a search space
    # Do this before one-hot encoding to make sure that a search space
    # for a dense classifier is created even if one-hot encoding would
    # make the data sparse (trade-off: if one-hot encoding made the data
    # sparse, the densifier and TruncatedSVD would probably lead to a
    # MemoryError; in exchange, some preprocessing methods cannot be
    # used in case the data becomes sparse)
    self.configuration_space, configspace_path = _create_search_space(
        self._tmp_dir,
        datamanager.info,
        self._backend,
        self._stopwatch,
        self._logger,
        self._include_estimators,
        self._include_preprocessors)
    self.configuration_space_created_hook(datamanager)

    # == Calculate metafeatures
    meta_features = _calculate_metafeatures(
        data_feat_type=datamanager.feat_type,
        data_info_task=datamanager.info['task'],
        x_train=datamanager.data['X_train'],
        y_train=datamanager.data['Y_train'],
        basename=self._dataset_name,
        watcher=self._stopwatch,
        metalearning_cnt=self._initial_configurations_via_metalearning,
        logger=self._logger)

    self._stopwatch.start_task('OneHot')
    datamanager.perform1HotEncoding()
    self._stopwatch.stop_task('OneHot')

    if meta_features is None:
        initial_configurations = []
    elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION,
                                      BINARY_CLASSIFICATION,
                                      MULTILABEL_CLASSIFICATION]:
        meta_features_encoded = _calculate_metafeatures_encoded(
            self._dataset_name,
            datamanager.data['X_train'],
            datamanager.data['Y_train'],
            self._stopwatch,
            self._logger)

        self._logger.debug(meta_features.__repr__(verbosity=2))
        self._logger.debug(meta_features_encoded.__repr__(verbosity=2))

        initial_configurations = _get_initial_configuration(
            meta_features,
            meta_features_encoded,
            self._dataset_name,
            self._metric,
            self.configuration_space,
            self._task,
            self._metadata_directory,
            self._initial_configurations_via_metalearning,
            datamanager.info['is_sparse'],
            self._stopwatch,
            self._logger)

        _print_debug_info_of_init_configuration(
            initial_configurations,
            self._dataset_name,
            self._time_for_task,
            self._logger,
            self._stopwatch)
    else:
        initial_configurations = []
        self._logger.warning('Metafeatures encoded not calculated')

    # == Seed the search with a task-specific default configuration
    if (datamanager.info["task"] == BINARY_CLASSIFICATION) or \
            (datamanager.info["task"] == MULTICLASS_CLASSIFICATION):
        config = {'balancing:strategy': 'weighting',
                  'classifier:__choice__': 'sgd',
                  'classifier:sgd:loss': 'hinge',
                  'classifier:sgd:penalty': 'l2',
                  'classifier:sgd:alpha': 0.0001,
                  'classifier:sgd:fit_intercept': 'True',
                  'classifier:sgd:n_iter': 5,
                  'classifier:sgd:learning_rate': 'optimal',
                  'classifier:sgd:eta0': 0.01,
                  'classifier:sgd:average': 'True',
                  'imputation:strategy': 'mean',
                  'one_hot_encoding:use_minimum_fraction': 'True',
                  'one_hot_encoding:minimum_fraction': 0.1,
                  'preprocessor:__choice__': 'no_preprocessing',
                  'rescaling:__choice__': 'min/max'}
    elif datamanager.info["task"] == MULTILABEL_CLASSIFICATION:
        config = {'classifier:__choice__': 'adaboost',
                  'classifier:adaboost:algorithm': 'SAMME.R',
                  'classifier:adaboost:learning_rate': 1.0,
                  'classifier:adaboost:max_depth': 1,
                  'classifier:adaboost:n_estimators': 50,
                  'balancing:strategy': 'weighting',
                  'imputation:strategy': 'mean',
                  'one_hot_encoding:use_minimum_fraction': 'True',
                  'one_hot_encoding:minimum_fraction': 0.1,
                  'preprocessor:__choice__': 'no_preprocessing',
                  'rescaling:__choice__': 'none'}
    else:
        config = None
        self._logger.info("No default configuration for task type: %s" %
                          TASK_TYPES_TO_STRING[datamanager.info["task"]])

    if config is not None:
        configuration = Configuration(self.configuration_space, config)
        config_string = convert_conf2smac_string(configuration)
        initial_configurations = [config_string] + initial_configurations

    # == RUN SMAC
    proc_smac = run_smac(
        tmp_dir=self._tmp_dir,
        basename=self._dataset_name,
        time_for_task=self._time_for_task,
        ml_memory_limit=self._ml_memory_limit,
        data_manager_path=data_manager_path,
        configspace_path=configspace_path,
        initial_configurations=initial_configurations,
        per_run_time_limit=self._per_run_time_limit,
        watcher=self._stopwatch,
        backend=self._backend,
        seed=self._seed,
        resampling_strategy=self._resampling_strategy,
        resampling_strategy_arguments=self._resampling_strategy_arguments,
        shared_mode=self._shared_mode)

    # == RUN ensemble builder
    proc_ensembles = self.run_ensemble_builder()

    procs = []
    if proc_smac is not None:
        procs.append(proc_smac)
    if proc_ensembles is not None:
        procs.append(proc_ensembles)

    if self._queue is not None:
        self._queue.put([time_for_load_data, data_manager_path, procs])
    else:
        for proc in procs:
            proc.wait()

    # Delete the AutoSklearn seed environment variable
    del_auto_seed()

    # In case the data manager is still attached, drop the reference
    try:
        del self._datamanager
    except Exception:
        pass

    if self._queue is None:
        self._load_models()

    return self
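# Both versions of _fit above end with the same hand-off: when a queue is
# supplied, the still-running SMAC and ensemble-builder processes are passed
# to the caller, who becomes responsible for waiting on them; otherwise _fit
# blocks until both finish. A small sketch of that pattern with hypothetical
# names (a same-process queue.Queue is assumed, so the process handles never
# need to be pickled):

import queue
import subprocess


def run_or_hand_off(procs, result_queue=None, payload=None):
    # procs: e.g. subprocess.Popen handles for SMAC and the ensemble builder
    if result_queue is not None:
        # Asynchronous mode: the caller waits on the processes later.
        result_queue.put([payload, procs])
    else:
        # Synchronous mode: block until every child process has finished.
        for proc in procs:
            proc.wait()


# Example usage in synchronous mode:
# run_or_hand_off([subprocess.Popen(['sleep', '1'])])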
def _fit(self, datamanager):
    # Reset learnt stuff
    self.models_ = None
    self.ensemble_indices_ = None

    # Check arguments prior to doing anything!
    if self._resampling_strategy not in ['holdout',
                                         'holdout-iterative-fit',
                                         'cv', 'nested-cv', 'partial-cv']:
        raise ValueError('Illegal resampling strategy: %s' %
                         self._resampling_strategy)
    if self._resampling_strategy == 'partial-cv' and \
            self._ensemble_size != 0:
        raise ValueError("Resampling strategy partial-cv cannot be used "
                         "together with ensembles.")

    self._backend._make_internals_directory()
    if self._keep_models:
        try:
            os.mkdir(self._backend.get_model_dir())
        except OSError:
            self._logger.warning("model directory already exists")
            if not self._shared_mode:
                raise

    self._metric = datamanager.info['metric']
    self._task = datamanager.info['task']
    self._label_num = datamanager.info['label_num']

    set_auto_seed(self._seed)

    # == Pickle the data manager here, because one-hot encoding is no
    # longer applied globally
    data_manager_path = self._backend.save_datamanager(datamanager)

    self._save_ensemble_data(
        datamanager.data['X_train'],
        datamanager.data['Y_train'])

    time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

    if self._debug_mode:
        self._print_load_time(
            self._dataset_name,
            self._time_for_task,
            time_for_load_data,
            self._logger)

    # == Perform dummy predictions
    self._do_dummy_prediction(datamanager)

    # == Create a search space
    # Do this before one-hot encoding to make sure that a search space
    # for a dense classifier is created even if one-hot encoding would
    # make the data sparse (trade-off: if one-hot encoding made the data
    # sparse, the densifier and TruncatedSVD would probably lead to a
    # MemoryError; in exchange, some preprocessing methods cannot be
    # used in case the data becomes sparse)
    self.configuration_space, configspace_path = _create_search_space(
        self._tmp_dir,
        datamanager.info,
        self._backend,
        self._stopwatch,
        self._logger,
        self._include_estimators,
        self._include_preprocessors)
    self.configuration_space_created_hook(datamanager)

    # == Calculate metafeatures
    meta_features = _calculate_metafeatures(
        data_feat_type=datamanager.feat_type,
        data_info_task=datamanager.info['task'],
        x_train=datamanager.data['X_train'],
        y_train=datamanager.data['Y_train'],
        basename=self._dataset_name,
        watcher=self._stopwatch,
        metalearning_cnt=self._initial_configurations_via_metalearning,
        logger=self._logger)

    self._stopwatch.start_task('OneHot')
    datamanager.perform1HotEncoding()
    self._stopwatch.stop_task('OneHot')

    if meta_features is None:
        initial_configurations = []
    elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION,
                                      BINARY_CLASSIFICATION]:
        meta_features_encoded = _calculate_metafeatures_encoded(
            self._dataset_name,
            datamanager.data['X_train'],
            datamanager.data['Y_train'],
            self._stopwatch,
            self._logger)

        self._logger.debug(meta_features.__repr__(verbosity=2))
        self._logger.debug(meta_features_encoded.__repr__(verbosity=2))

        initial_configurations = _get_initial_configuration(
            meta_features,
            meta_features_encoded,
            self._dataset_name,
            self._metric,
            self.configuration_space,
            self._task,
            self._metadata_directory,
            self._initial_configurations_via_metalearning,
            datamanager.info['is_sparse'],
            self._stopwatch,
            self._logger)

        _print_debug_info_of_init_configuration(
            initial_configurations,
            self._dataset_name,
            self._time_for_task,
            self._logger,
            self._stopwatch)
    else:
        initial_configurations = []
        self._logger.warning('Metafeatures encoded not calculated')

    # == RUN SMAC
    proc_smac = run_smac(
        tmp_dir=self._tmp_dir,
        basename=self._dataset_name,
        time_for_task=self._time_for_task,
        ml_memory_limit=self._ml_memory_limit,
        data_manager_path=data_manager_path,
        configspace_path=configspace_path,
        initial_configurations=initial_configurations,
        per_run_time_limit=self._per_run_time_limit,
        watcher=self._stopwatch,
        backend=self._backend,
        seed=self._seed,
        resampling_strategy=self._resampling_strategy,
        resampling_strategy_arguments=self._resampling_strategy_arguments,
        shared_mode=self._shared_mode)

    # == RUN ensemble builder
    proc_ensembles = self.run_ensemble_builder()

    procs = []
    if proc_smac is not None:
        procs.append(proc_smac)
    if proc_ensembles is not None:
        procs.append(proc_ensembles)

    if self._queue is not None:
        self._queue.put([time_for_load_data, data_manager_path, procs])
    else:
        for proc in procs:
            proc.wait()

    # Delete the AutoSklearn seed environment variable
    del_auto_seed()

    # In case the data manager is still attached, drop the reference
    try:
        del self._datamanager
    except Exception:
        pass

    if self._queue is None:
        self._load_models()

    return self
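# Every version of _fit above meters its phases through the injected
# stopwatch (start_task / stop_task / wall_elapsed). The sketch below is a
# hypothetical reimplementation of that interface, shown only to document
# the semantics the code relies on; it is not auto-sklearn's own StopWatch.

import time


class StopWatch:
    def __init__(self):
        self._started = {}   # task name -> start timestamp
        self._elapsed = {}   # task name -> accumulated wall-clock seconds

    def start_task(self, name):
        self._started[name] = time.time()

    def stop_task(self, name):
        self._elapsed[name] = (self._elapsed.get(name, 0.0) +
                               time.time() - self._started.pop(name))

    def wall_elapsed(self, name):
        # Includes the still-running interval if the task was not stopped.
        running = (time.time() - self._started[name]
                   if name in self._started else 0.0)
        return self._elapsed.get(name, 0.0) + running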