def test_same_data_arrays(self):
    first_array = np.array([[1, 2], [3, 4]])
    second_array = np.array([[1, 2], [3, 4]])

    first_hash = hash_array_or_matrix(first_array)
    second_hash = hash_array_or_matrix(second_array)

    self.assertEqual(first_hash, second_hash)
def test_different_data_arrays(self):
    first_array = np.array([[1, 2], [3, 4]])
    second_array = np.array([[1, 3], [2, 4]])

    first_hash = hash_array_or_matrix(first_array)
    second_hash = hash_array_or_matrix(second_array)

    self.assertNotEqual(first_hash, second_hash)
def test_transpose_arrays(self):
    c_array = np.array([[1, 2], [3, 4]])
    f_array = np.array([[1, 3], [2, 4]])
    f_array = np.asfortranarray(f_array)

    c_hash = hash_array_or_matrix(c_array)
    f_hash = hash_array_or_matrix(f_array)

    self.assertEqual(c_hash, f_hash)
def test_f_contiguous_array(self):
    array = np.array([[1, 2], [3, 4]])
    array = np.asfortranarray(array)

    hash = hash_array_or_matrix(array)

    self.assertIsNotNone(hash)
def fit(self, X, y,
        task=MULTICLASS_CLASSIFICATION,
        metric=None,
        feat_type=None,
        dataset_name=None):
    if not self._shared_mode:
        self._backend.context.delete_directories()
    else:
        # If this fails, it's likely that this is the first call to get
        # the data manager
        try:
            D = self._backend.load_datamanager()
            dataset_name = D.name
        except IOError:
            pass

    self._backend.context.create_directories()

    if dataset_name is None:
        dataset_name = hash_array_or_matrix(X)

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    self._logger = self._get_logger(dataset_name)

    if metric is None:
        raise ValueError('No metric given.')
    if not isinstance(metric, Scorer):
        raise ValueError('Metric must be instance of '
                         'autosklearn.metrics.Scorer.')

    if feat_type is not None and len(feat_type) != X.shape[1]:
        raise ValueError('Array feat_type does not have same number of '
                         'variables as X has features. %d vs %d.' %
                         (len(feat_type), X.shape[1]))
    if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
        raise ValueError('Array feat_type must only contain strings.')

    if feat_type is not None:
        for ft in feat_type:
            if ft.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % ft)

    self._data_memory_limit = None

    loaded_data_manager = XYDataManager(X, y,
                                        task=task,
                                        feat_type=feat_type,
                                        dataset_name=dataset_name)

    return self._fit(loaded_data_manager, metric)
def test_scipy_csr(self):
    row = np.array([0, 0, 1, 2, 2, 2])
    col = np.array([0, 2, 2, 0, 1, 2])
    data = np.array([1, 2, 3, 4, 5, 6])
    matrix = scipy.sparse.csr_matrix((data, (row, col)), shape=(3, 3))

    hash = hash_array_or_matrix(matrix)

    self.assertIsNotNone(hash)
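# The tests above pin down the contract of hash_array_or_matrix: identical
# contents hash identically, different contents do not, a C-ordered array and
# its Fortran-ordered transpose collapse to the same digest, and sparse
# matrices are accepted. The sketch below is only an illustration consistent
# with those assertions, not necessarily the exact autosklearn.util.hash
# implementation; the name _sketch_hash_array_or_matrix is hypothetical.
import hashlib

import numpy as np
import scipy.sparse


def _sketch_hash_array_or_matrix(X):
    m = hashlib.md5()
    if scipy.sparse.issparse(X):
        # Hash the raw value buffer of the sparse matrix plus its shape.
        m.update(X.data.tobytes())
        m.update(str(X.shape).encode('utf8'))
    else:
        # Normalize to C order so an F-contiguous array hashes like the
        # C-contiguous array holding the same memory layout.
        if not X.flags['C_CONTIGUOUS']:
            X = np.ascontiguousarray(X.T)
        m.update(X.tobytes())
        m.update(str(X.shape).encode('utf8'))
    return m.hexdigest()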
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray,
    task: int,
    metric: Scorer,
    X_test: Optional[np.ndarray] = None,
    y_test: Optional[np.ndarray] = None,
    feat_type: Optional[List[str]] = None,
    dataset_name: Optional[str] = None,
    only_return_configuration_space: Optional[bool] = False,
    load_models: bool = True,
    incremental_learning: bool = False,
):
    if self._shared_mode:
        # If this fails, it's likely that this is the first call to get
        # the data manager
        try:
            D = self._backend.load_datamanager()
            dataset_name = D.name
        except IOError:
            pass

    if dataset_name is None:
        dataset_name = hash_array_or_matrix(X)

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    self._logger = self._get_logger(dataset_name)

    if metric is None:
        raise ValueError('No metric given.')
    if not isinstance(metric, Scorer):
        raise ValueError('Metric must be instance of '
                         'autosklearn.metrics.Scorer.')

    if feat_type is not None and len(feat_type) != X.shape[1]:
        raise ValueError('Array feat_type does not have same number of '
                         'variables as X has features. %d vs %d.' %
                         (len(feat_type), X.shape[1]))
    if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
        raise ValueError('Array feat_type must only contain strings.')

    if feat_type is not None:
        for ft in feat_type:
            if ft.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % ft)

    self._data_memory_limit = None
    loaded_data_manager = XYDataManager(
        X, y,
        X_test=X_test,
        y_test=y_test,
        task=task,
        feat_type=feat_type,
        dataset_name=dataset_name,
    )

    return self._fit(
        datamanager=loaded_data_manager,
        metric=metric,
        load_models=load_models,
        only_return_configuration_space=only_return_configuration_space,
        incremental_learning=incremental_learning)
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray,
    task: int,
    X_test: Optional[np.ndarray] = None,
    y_test: Optional[np.ndarray] = None,
    feat_type: Optional[List[str]] = None,
    dataset_name: Optional[str] = None,
    only_return_configuration_space: Optional[bool] = False,
    load_models: bool = True,
):
    # Reset learnt stuff
    self.models_ = None
    self.cv_models_ = None
    self.ensemble_ = None

    # The metric must exist as of this point
    # It can be provided in the constructor, or automatically
    # defined in the estimator fit call
    if self._metric is None:
        raise ValueError('No metric given.')
    if not isinstance(self._metric, Scorer):
        raise ValueError('Metric must be instance of '
                         'autosklearn.metrics.Scorer.')

    if self._shared_mode:
        # If this fails, it's likely that this is the first call to get
        # the data manager
        try:
            D = self._backend.load_datamanager()
            dataset_name = D.name
        except IOError:
            pass

    if dataset_name is None:
        dataset_name = hash_array_or_matrix(X)

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    self._logger = self._get_logger(dataset_name)

    if feat_type is not None and len(feat_type) != X.shape[1]:
        raise ValueError('Array feat_type does not have same number of '
                         'variables as X has features. %d vs %d.' %
                         (len(feat_type), X.shape[1]))
    if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
        raise ValueError('Array feat_type must only contain strings.')

    if feat_type is not None:
        for ft in feat_type:
            if ft.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % ft)

    datamanager = XYDataManager(
        X, y,
        X_test=X_test,
        y_test=y_test,
        task=task,
        feat_type=feat_type,
        dataset_name=dataset_name,
    )

    self._backend._make_internals_directory()
    try:
        os.makedirs(self._backend.get_model_dir())
    except (OSError, FileExistsError):
        if not self._shared_mode:
            raise
    try:
        os.makedirs(self._backend.get_cv_model_dir())
    except (OSError, FileExistsError):
        if not self._shared_mode:
            raise

    self._task = datamanager.info['task']
    self._label_num = datamanager.info['label_num']

    # == Pickle the data manager to speed up loading
    self._backend.save_datamanager(datamanager)

    time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

    if self._debug_mode:
        self._print_load_time(
            self._dataset_name,
            self._time_for_task,
            time_for_load_data,
            self._logger)

    # == Perform dummy predictions
    num_run = 1
    # if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
    num_run = self._do_dummy_prediction(datamanager, num_run)

    # = Create a searchspace
    # Do this before One Hot Encoding to make sure that it creates a
    # search space for a dense classifier even if one hot encoding would
    # make it sparse (tradeoff; if one hot encoding would make it sparse,
    # densifier and truncatedSVD would probably lead to a MemoryError,
    # like this we can't use some of the preprocessing methods in case
    # the data became sparse)
    self.configuration_space, configspace_path = self._create_search_space(
        self._backend.temporary_directory,
        self._backend,
        datamanager,
        include_estimators=self._include_estimators,
        exclude_estimators=self._exclude_estimators,
        include_preprocessors=self._include_preprocessors,
        exclude_preprocessors=self._exclude_preprocessors)
    if only_return_configuration_space:
        return self.configuration_space

    # == RUN ensemble builder
    # Do this before calculating the meta-features to make sure that the
    # dummy predictions are actually included in the ensemble even if
    # calculating the meta-features takes very long
    ensemble_task_name = 'runEnsemble'
    self._stopwatch.start_task(ensemble_task_name)
    elapsed_time = self._stopwatch.wall_elapsed(self._dataset_name)
    time_left_for_ensembles = max(0, self._time_for_task - elapsed_time)
    if time_left_for_ensembles <= 0:
        self._proc_ensemble = None
        # Fit only raises error when ensemble_size is not zero but
        # time_left_for_ensembles is zero.
        if self._ensemble_size > 0:
            raise ValueError("Not starting ensemble builder because there "
                             "is no time left. Try increasing the value "
                             "of time_left_for_this_task.")
    elif self._ensemble_size <= 0:
        self._proc_ensemble = None
        self._logger.info('Not starting ensemble builder because '
                          'ensemble size is <= 0.')
    else:
        self._logger.info('Start Ensemble with %5.2fsec time left'
                          % time_left_for_ensembles)
        self._proc_ensemble = self._get_ensemble_process(
            time_left_for_ensembles)
        self._proc_ensemble.start()

    self._stopwatch.stop_task(ensemble_task_name)

    # kill the datamanager as it will be re-loaded anyways from sub processes
    try:
        del self._datamanager
    except Exception:
        pass

    # => RUN SMAC
    smac_task_name = 'runSMAC'
    self._stopwatch.start_task(smac_task_name)
    elapsed_time = self._stopwatch.wall_elapsed(self._dataset_name)
    time_left_for_smac = max(0, self._time_for_task - elapsed_time)

    if self._logger:
        self._logger.info('Start SMAC with %5.2fsec time left'
                          % time_left_for_smac)
    if time_left_for_smac <= 0:
        self._logger.warning("Not starting SMAC because there is no time "
                             "left.")
        _proc_smac = None
        self._budget_type = None
    else:
        if self._per_run_time_limit is None or \
                self._per_run_time_limit > time_left_for_smac:
            self._logger.warning(
                'Time limit for a single run is higher than total time '
                'limit. Capping the limit for a single run to the total '
                'time given to SMAC (%f)' % time_left_for_smac)
            per_run_time_limit = time_left_for_smac
        else:
            per_run_time_limit = self._per_run_time_limit

        # Make sure that at least 2 models are created for the ensemble process
        num_models = time_left_for_smac // per_run_time_limit
        if num_models < 2:
            per_run_time_limit = time_left_for_smac // 2
            self._logger.warning(
                "Capping the per_run_time_limit to {} to have "
                "time for at least 2 models in each process.".format(
                    per_run_time_limit))

        _proc_smac = AutoMLSMBO(
            config_space=self.configuration_space,
            dataset_name=self._dataset_name,
            backend=self._backend,
            total_walltime_limit=time_left_for_smac,
            func_eval_time_limit=per_run_time_limit,
            memory_limit=self._ml_memory_limit,
            data_memory_limit=self._data_memory_limit,
            watcher=self._stopwatch,
            start_num_run=num_run,
            num_metalearning_cfgs=self._initial_configurations_via_metalearning,
            config_file=configspace_path,
            seed=self._seed,
            metadata_directory=self._metadata_directory,
            metric=self._metric,
            resampling_strategy=self._resampling_strategy,
            resampling_strategy_args=self._resampling_strategy_arguments,
            shared_mode=self._shared_mode,
            include_estimators=self._include_estimators,
            exclude_estimators=self._exclude_estimators,
            include_preprocessors=self._include_preprocessors,
            exclude_preprocessors=self._exclude_preprocessors,
            disable_file_output=self._disable_evaluator_output,
            get_smac_object_callback=self._get_smac_object_callback,
            smac_scenario_args=self._smac_scenario_args,
        )

        try:
            self.runhistory_, self.trajectory_, self._budget_type = \
                _proc_smac.run_smbo()
            trajectory_filename = os.path.join(
                self._backend.get_smac_output_directory_for_run(self._seed),
                'trajectory.json')
            saveable_trajectory = \
                [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:])
                 for entry in self.trajectory_]
            with open(trajectory_filename, 'w') as fh:
                json.dump(saveable_trajectory, fh)
        except Exception as e:
            self._logger.exception(e)
            raise

    # Wait until the ensemble process is finished to avoid shutting down
    # while the ensemble builder tries to access the data
    if self._proc_ensemble is not None and self._ensemble_size > 0:
        self._proc_ensemble.join()
        self._proc_ensemble = None

    if load_models:
        self._load_models()

    return self
def fit(
    self,
    X,
    y,
    task,
    metric,
    X_test=None,
    y_test=None,
    feat_type=None,
    dataset_name=None,
    only_return_configuration_space=False,
):
    if self._shared_mode:
        # If this fails, it's likely that this is the first call to get
        # the data manager
        try:
            D = self._backend.load_datamanager()
            dataset_name = D.name
        except IOError:
            pass

    if dataset_name is None:
        dataset_name = hash_array_or_matrix(X)

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    self._logger = self._get_logger(dataset_name)

    if metric is None:
        raise ValueError('No metric given.')
    if not isinstance(metric, Scorer):
        raise ValueError('Metric must be instance of '
                         'autosklearn.metrics.Scorer.')

    if feat_type is not None and len(feat_type) != X.shape[1]:
        raise ValueError('Array feat_type does not have same number of '
                         'variables as X has features. %d vs %d.' %
                         (len(feat_type), X.shape[1]))
    if feat_type is not None and not all([isinstance(f, str)
                                          for f in feat_type]):
        raise ValueError('Array feat_type must only contain strings.')

    if feat_type is not None:
        for ft in feat_type:
            if ft.lower() not in ['categorical', 'numerical']:
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % ft)

    self._data_memory_limit = None
    loaded_data_manager = XYDataManager(
        X, y,
        X_test=X_test,
        y_test=y_test,
        task=task,
        feat_type=feat_type,
        dataset_name=dataset_name,
    )

    return self._fit(
        loaded_data_manager,
        metric,
        only_return_configuration_space,
    )
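# A hedged usage sketch for the fit() variants above: the caller supplies a
# task constant, a metric that is an autosklearn.metrics.Scorer (passed here
# or set in the constructor, depending on the variant), and optionally a
# feat_type list with one 'Categorical'/'Numerical' entry per column of X;
# otherwise dataset_name is derived via hash_array_or_matrix(X). The `automl`
# instance and the toy data below are assumptions for illustration only.
import numpy as np

from autosklearn.constants import MULTICLASS_CLASSIFICATION
from autosklearn.metrics import accuracy

X = np.array([[1.0, 0.0], [2.0, 1.0], [3.0, 0.0], [4.0, 1.0]])
y = np.array([0, 1, 2, 1])

# `automl` is assumed to be an already-constructed AutoML instance.
automl.fit(
    X, y,
    task=MULTICLASS_CLASSIFICATION,
    metric=accuracy,                       # must be a Scorer, else ValueError
    feat_type=['Numerical', 'Numerical'],  # length must match X.shape[1]
    dataset_name='toy-example',
)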