def _prepare_data(self, X, y=None, **kwargs):
    logger.info(
        "Preparing data for outer fold "
        + str(self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr)
        + "..."
    )
    # prepare validation (outer-fold train indices) and test set data
    train_indices = self.cross_validaton_info.outer_folds[
        self.outer_fold_id
    ].train_indices
    test_indices = self.cross_validaton_info.outer_folds[
        self.outer_fold_id
    ].test_indices
    self._validation_X, self._validation_y, self._validation_kwargs = PhotonDataHelper.split_data(
        X, y, kwargs, indices=train_indices
    )
    self._test_X, self._test_y, self._test_kwargs = PhotonDataHelper.split_data(
        X, y, kwargs, indices=test_indices
    )

    # write numbers to database info object
    self.result_object.number_samples_validation = self._validation_y.shape[0]
    self.result_object.number_samples_test = self._test_y.shape[0]
    if self._pipe._estimator_type == "classifier":
        self.result_object.class_distribution_validation = FoldInfo.data_overview(
            self._validation_y
        )
        self.result_object.class_distribution_test = FoldInfo.data_overview(
            self._test_y
        )
def apply_transform_parallelized(self, X):
    """
    Applies this branch's pipeline to X in parallel batches using dask.

    :param X: the data to which the delegate should be applied in parallel
    """
    if self.nr_of_processes > 1:
        jobs_to_do = list()

        # distribute the data equally across all available cores
        number_of_items_to_process = PhotonDataHelper.find_n(X)
        number_of_items_for_each_core = int(
            np.ceil(number_of_items_to_process / self.nr_of_processes))
        logger.info("NeuroBranch " + self.name + ": Using " + str(self.nr_of_processes)
                    + " cores, calculating " + str(number_of_items_for_each_core)
                    + " items each")

        for start, stop in PhotonDataHelper.chunker(number_of_items_to_process,
                                                    number_of_items_for_each_core):
            X_batched, _, _ = PhotonDataHelper.split_data(X, None, {}, start, stop)

            # copy my pipeline
            new_pipe_mr = self.copy_me()
            new_pipe_copy = new_pipe_mr.base_element
            new_pipe_copy.cache_folder = self.base_element.cache_folder
            new_pipe_copy.skip_loading = True
            new_pipe_copy._parallel_use = True

            del_job = dask.delayed(NeuroBranch.parallel_application)(new_pipe_copy, X_batched)
            jobs_to_do.append(del_job)

        dask.compute(*jobs_to_do)
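# --- Hedged usage sketch (illustration only, not library code) ---
# Shows how the per-core batch size above carves n items into (start, stop)
# chunks. `example_chunker` is an assumption about what PhotonDataHelper.chunker
# yields; the real helper may differ.
import math

def example_chunker(n_items, chunk_size):
    # yield inclusive (start, stop) index pairs covering range(n_items)
    for start in range(0, n_items, chunk_size):
        yield start, min(start + chunk_size, n_items) - 1

# e.g. 10 items on 4 cores -> ceil(10 / 4) = 3 items per chunk
# list(example_chunker(10, 3)) == [(0, 2), (3, 5), (6, 8), (9, 9)]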
def objective_function_simple(self, cfg):
    cfg = {k: cfg[k] for k in cfg if cfg[k]}
    values = []

    train_indices = list(self.pipe.cross_validation.outer_folds.values())[0].train_indices
    self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
        self.X, self.y, kwargs=None, indices=train_indices)

    for inner_fold in list(
            list(self.pipe.cross_validation.inner_folds.values())[0].values()):
        sc = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {}, random_state=42)
        svc = PipelineElement("SVC", {}, random_state=42, gamma='auto')
        my_pipe = PhotonPipeline([('StandardScaler', sc), ('PCA', pca), ('SVC', svc)])
        my_pipe.set_params(**cfg)
        my_pipe.fit(self._validation_X[inner_fold.train_indices, :],
                    self._validation_y[inner_fold.train_indices])
        values.append(
            accuracy_score(
                self._validation_y[inner_fold.test_indices],
                my_pipe.predict(self._validation_X[inner_fold.test_indices, :])))
    return 1 - np.mean(values)
def compute_learning_curves(self, new_pipe, train_X, train_y, train, kwargs_cv_train,
                            test_X, test_y, test, kwargs_cv_test):
    self.cross_validation_infos.learning_curves_cut.transform()
    cut_range = [
        round(cut * train_X.shape[0])
        for cut in self.cross_validation_infos.learning_curves_cut.values
    ]
    learning_curves = []
    for i, cut in enumerate(cut_range[1:]):
        cut_indices = np.arange(cut)
        train_cut_X, train_cut_y, train_cut_kwargs = PhotonDataHelper.split_data(
            train_X, train_y, kwargs_cv_train, indices=cut_indices)
        train_cut = train[:cut]
        job_data = self.InnerCVJob(
            pipe=new_pipe,
            config=dict(self.params),
            metrics=self.optimization_infos.metrics,
            callbacks=self.optimization_constraints,
            train_data=self.JobData(train_cut_X, train_cut_y, train_cut, train_cut_kwargs),
            test_data=self.JobData(test_X, test_y, test, kwargs_cv_test))
        curr_test_cut, curr_train_cut = InnerFoldManager.fit_and_score(job_data)
        learning_curves.append([
            self.cross_validation_infos.learning_curves_cut.values[i],
            curr_test_cut.metrics, curr_train_cut.metrics
        ])
    return learning_curves
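# --- Hedged illustration (not library code) ---
# Shows how fractional learning-curve cuts translate into training-set sizes,
# mirroring the `cut_range` computation above. The cut values and sample count
# below are made-up example numbers.
import numpy as np

example_cuts = [0.0, 0.25, 0.5, 0.75, 1.0]
n_train_samples = 200
example_cut_range = [round(cut * n_train_samples) for cut in example_cuts]
# example_cut_range == [0, 50, 100, 150, 200]; the first entry (0 samples) is
# skipped, and np.arange(cut) then selects the first `cut` training samples per step.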
def test_split_join_resorting(self):
    X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    y = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2])
    kwargs = {"test": np.array([-1, -2, -3, -4, -5, -6, -7, -8, -9, -10])}

    X_new, y_new, kwargs_new = list(), list(), dict()

    # first randomly split the data and append them to X_new, y_new, kwargs_new
    idx_list_one, idx_list_two = list(), list()
    for idx in range(len(X)):
        if bool(random.getrandbits(1)):
            idx_list_one.append(idx)
        else:
            idx_list_two.append(idx)

    for ilist in [idx_list_two, idx_list_one]:
        for idx in ilist:
            X_batched, y_batched, kwargs_batched = PhotonDataHelper.split_data(
                X, y, kwargs, idx, idx)

            # test if batching works
            self.assertEqual(X_batched, X[idx])
            self.assertEqual(y_batched, y[idx])
            self.assertDictEqual(kwargs_batched, {"test": [kwargs["test"][idx]]})

            # then join again
            X_new, y_new, kwargs_new = PhotonDataHelper.join_data(
                X_new, X_batched, y_new, y_batched, kwargs_new, kwargs_batched)

    # test if joining works
    joined_idx = PhotonDataHelper.stack_data_vertically(idx_list_two, idx_list_one)
    self.assertTrue(np.array_equal(X_new, X[joined_idx]))
    self.assertTrue(np.array_equal(y_new, y[joined_idx]))
    self.assertTrue(np.array_equal(kwargs_new["test"], kwargs["test"][joined_idx]))

    # now resort and see if that works too
    X_resorted, y_resorted, kwargs_resorted = PhotonDataHelper.resort_splitted_data(
        X_new, y_new, kwargs_new, joined_idx)
    self.assertTrue(np.array_equal(X_resorted, X))
    self.assertTrue(np.array_equal(y_resorted, y))
    self.assertListEqual(list(kwargs_resorted.keys()), list(kwargs.keys()))
    self.assertTrue(np.array_equal(kwargs_resorted["test"], kwargs["test"]))
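# --- Hedged illustration (plain numpy, not PhotonDataHelper) ---
# Spells out the split -> join -> resort round trip exercised by the test above,
# assuming resort_splitted_data restores the original sample order from the
# joined index list. Names here are ad hoc for the example.
import numpy as np

X_demo = np.arange(1, 11)
joined_order = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4])    # second index list first
X_joined_demo = X_demo[joined_order]                        # what successive joins accumulate
X_resorted_demo = X_joined_demo[np.argsort(joined_order)]   # undo the reordering
assert np.array_equal(X_resorted_demo, X_demo)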
def test_data_split_indices(self):
    vals = np.array([-1, -2, -3, -4, -5, -6, -7, -8, -9, -10])
    vals_str = np.array([ascii(i) for i in vals])
    random_features = np.random.randn(10, 20)
    kwargs = {"test": vals, "subtest": vals_str, "random": random_features}
    pick_list = [1, 3, 5]

    splitted_X, splitted_y, splitted_example = PhotonDataHelper.split_data(
        random_features, vals, kwargs, indices=pick_list)

    self.assertTrue(np.array_equal(splitted_X, random_features[pick_list]))
    self.assertTrue(np.array_equal(splitted_y, vals[pick_list]))
    self.assertTrue(np.array_equal(splitted_example["test"], vals[pick_list]))
    self.assertTrue(np.array_equal(splitted_example["subtest"], vals_str[pick_list]))
    self.assertTrue(np.array_equal(splitted_example["random"], random_features[pick_list]))
def objective_function_switch(self, cfg):
    cfg = {k: cfg[k] for k in cfg if cfg[k]}
    values = []

    train_indices = list(self.pipe.cross_validation.outer_folds.values())[0].train_indices
    self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
        self.X, self.y, kwargs=None, indices=train_indices)

    switch = cfg["Estimator_switch"]
    del cfg["Estimator_switch"]
    for inner_fold in list(
            list(self.pipe.cross_validation.inner_folds.values())[0].values()):
        sc = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {}, random_state=42)
        if switch == 'svc':
            est = PipelineElement("SVC", {}, random_state=42, gamma='auto')
            name = 'SVC'
        else:
            est = PipelineElement("RandomForestClassifier", {}, random_state=42)
            name = "RandomForestClassifier"
        my_pipe = PhotonPipeline([('StandardScaler', sc), ('PCA', pca), (name, est)])
        my_pipe.set_params(**cfg)
        my_pipe.fit(self._validation_X[inner_fold.train_indices, :],
                    self._validation_y[inner_fold.train_indices])
        values.append(
            accuracy_score(
                self._validation_y[inner_fold.test_indices],
                my_pipe.predict(self._validation_X[inner_fold.test_indices, :])))
    return 1 - np.mean(values)
def load_or_save_cached_data(self, name, X, y, kwargs, transformer, fit=False,
                             needed_for_further_computation=False, initial_X=None):
    if not self.single_subject_caching:
        # if we cache group-wise, a single lookup is enough
        if self.skip_loading and not needed_for_further_computation:
            # check if data has already been calculated
            if self.cache_man.check_cache(name):
                # if so, do nothing
                return X, y, kwargs
            else:
                # otherwise, do the calculation and save it
                cached_result = None
        else:
            start_time_for_loading = datetime.datetime.now()
            cached_result = self.cache_man.load_cached_data(name)

        if cached_result is None:
            X, y, kwargs = self._do_timed_fit_transform(name, transformer, fit, X, y, **kwargs)

            start_time_saving = datetime.datetime.now()
            self.cache_man.save_data_to_cache(name, (X, y, kwargs))
            saving_duration = (datetime.datetime.now() - start_time_saving).total_seconds()
            self.time_monitor['transform_cached'].append((name, saving_duration, 1))
        else:
            X, y, kwargs = cached_result[0], cached_result[1], cached_result[2]
            loading_duration = (datetime.datetime.now() - start_time_for_loading).total_seconds()
            n = PhotonDataHelper.find_n(X)
            self.time_monitor['transform_cached'].append((name, loading_duration, n))
        return X, y, kwargs
    else:
        # if we cache subject-wise we need to iterate and collect the results
        processed_X, processed_y, processed_kwargs = list(), list(), dict()
        X_uncached, y_uncached, kwargs_uncached, initial_X_uncached = list(), list(), dict(), list()
        list_of_idx_cached, list_of_idx_non_cached = list(), list()

        nr = PhotonDataHelper.find_n(X)
        for start, stop in PhotonDataHelper.chunker(nr, 1):
            # split data into single entities, find key from first element = PATH to file
            X_key, _, _ = PhotonDataHelper.split_data(initial_X, None, {}, start, stop)
            X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                X, y, kwargs, start, stop)
            self.cache_man.update_single_subject_state_info(X_key)

            # check if item has been processed
            if self.cache_man.check_cache(name):
                list_of_idx_cached.append(start)
            else:
                list_of_idx_non_cached.append(start)
                X_uncached = PhotonDataHelper.stack_data_vertically(X_uncached, X_batched)
                y_uncached = PhotonDataHelper.stack_data_vertically(y_uncached, y_batched)
                initial_X_uncached = PhotonDataHelper.stack_data_vertically(initial_X_uncached, X_key)
                kwargs_uncached = PhotonDataHelper.join_dictionaries(kwargs_uncached, kwargs_dict_batched)

        # now we know which part can be loaded and which part should be transformed
        # first apply the transformation to the group, then save it single-subject-wise
        if len(list_of_idx_non_cached) > 0:
            # apply transformation group-wise
            new_group_X, new_group_y, new_group_kwargs = self._do_timed_fit_transform(
                name, transformer, fit, X_uncached, y_uncached, **kwargs_uncached)

            # then save it subject by subject
            nr = PhotonDataHelper.find_n(new_group_X)
            for start in range(nr):
                # split data into single entities
                X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                    new_group_X, new_group_y, new_group_kwargs, start, start)
                X_key, _, _ = PhotonDataHelper.split_data(initial_X_uncached, None, {}, start, start)
                # we save the data in relation to the input path (X_key = hash(input X))
                self.cache_man.update_single_subject_state_info(X_key)

                start_time_saving = datetime.datetime.now()
                self.cache_man.save_data_to_cache(name, (X_batched, y_batched, kwargs_dict_batched))
                saving_duration = (datetime.datetime.now() - start_time_saving).total_seconds()
                self.time_monitor['transform_cached'].append((name, saving_duration, 1))

            # we need to collect the data only when we want to load them;
            # we can skip that step if we only want them to get into the cache (case: parallelisation)
            if not self.skip_loading or needed_for_further_computation:
                # stack results
                processed_X, processed_y, processed_kwargs = new_group_X, new_group_y, new_group_kwargs

        # afterwards load everything that has been cached
        if len(list_of_idx_cached) > 0:
            if not self.skip_loading or needed_for_further_computation:
                for cache_idx in list_of_idx_cached:
                    # we identify the data according to the input path (X before any transformation)
                    self.cache_man.update_single_subject_state_info([initial_X[cache_idx]])

                    # time the loading of the cached item
                    start_time_for_loading = datetime.datetime.now()
                    transformed_X, transformed_y, transformed_kwargs = self.cache_man.load_cached_data(name)
                    loading_duration = (datetime.datetime.now() - start_time_for_loading).total_seconds()
                    self.time_monitor['transform_cached'].append(
                        (name, loading_duration, PhotonDataHelper.find_n(X)))

                    processed_X, processed_y, processed_kwargs = PhotonDataHelper.join_data(
                        processed_X, transformed_X, processed_y, transformed_y,
                        processed_kwargs, transformed_kwargs)

        logger.debug(name + " loaded " + str(len(list_of_idx_cached))
                     + " items from cache and computed " + str(len(list_of_idx_non_cached)))

        if not self.skip_loading or needed_for_further_computation:
            # now sort the data in the correct order again
            processed_X, processed_y, processed_kwargs = PhotonDataHelper.resort_splitted_data(
                processed_X, processed_y, processed_kwargs,
                PhotonDataHelper.stack_data_vertically(list_of_idx_cached, list_of_idx_non_cached))

        return processed_X, processed_y, processed_kwargs
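# --- Hedged sketch of the check-cache / compute / save pattern used above ---
# Reduced to a minimal standalone helper; the real CacheManager API, hashing and
# file layout differ. Everything below is illustrative, not part of the library.
import hashlib
import os
import pickle

def cached_transform(cache_dir, name, X, transform_fn):
    os.makedirs(cache_dir, exist_ok=True)
    key = hashlib.sha1((name + repr(X)).encode()).hexdigest()
    path = os.path.join(cache_dir, key + ".pkl")
    if os.path.exists(path):
        # cache hit: load the stored result instead of recomputing
        with open(path, "rb") as f:
            return pickle.load(f)
    # cache miss: compute, persist, then return
    result = transform_fn(X)
    with open(path, "wb") as f:
        pickle.dump(result, f)
    return result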
def fit(self, X, y, **kwargs):
    """
    Iterates over cross-validation folds, trains the pipeline, then uses it for predictions.
    Calculates metrics per fold and averages them over all folds.
    :param X: Training and test data
    :param y: Training and test targets
    :returns: configuration class for result tree that monitors training and test performance
    """
    # needed for testing Timeboxed Random Grid Search
    # time.sleep(35)

    config_item = MDBConfig()
    config_item.config_dict = self.params
    config_item.inner_folds = []
    config_item.metrics_test = []
    config_item.metrics_train = []
    config_item.computation_start_time = datetime.datetime.now()

    try:
        # do inner cv
        for idx, (inner_fold_id, inner_fold) in enumerate(
                self.cross_validation_infos.inner_folds[self.outer_fold_id].items()):

            train, test = inner_fold.train_indices, inner_fold.test_indices

            # split kwargs according to cross validation
            train_X, train_y, kwargs_cv_train = PhotonDataHelper.split_data(
                X, y, kwargs, indices=train)
            test_X, test_y, kwargs_cv_test = PhotonDataHelper.split_data(
                X, y, kwargs, indices=test)

            new_pipe = self.pipe()
            if self.cache_folder is not None and self.cache_updater is not None:
                self.cache_updater(new_pipe, self.cache_folder, inner_fold_id)

            if not config_item.human_readable_config:
                config_item.human_readable_config = PhotonPrintHelper.config_to_human_readable_dict(
                    new_pipe, self.params)
                logger.clean_info(
                    json.dumps(config_item.human_readable_config, indent=4, sort_keys=True))

            job_data = InnerFoldManager.InnerCVJob(
                pipe=new_pipe,
                config=dict(self.params),
                metrics=self.optimization_infos.metrics,
                callbacks=self.optimization_constraints,
                train_data=InnerFoldManager.JobData(train_X, train_y, train, kwargs_cv_train),
                test_data=InnerFoldManager.JobData(test_X, test_y, test, kwargs_cv_test),
            )

            # only for non-parallel processing:
            # inform children in which inner fold we are
            # self.pipe.distribute_cv_info_to_hyperpipe_children(inner_fold_counter=fold_cnt)
            # self.mother_inner_fold_handle(fold_cnt)
            # --> write that output in InnerFoldManager!

            # logger.debug(config_item.human_readable_config)
            fold_nr = idx + 1
            logger.debug("calculating inner fold " + str(fold_nr) + "...")

            curr_test_fold, curr_train_fold = InnerFoldManager.fit_and_score(job_data)
            logger.debug("Performance inner fold " + str(fold_nr))
            print_double_metrics(
                curr_train_fold.metrics,
                curr_test_fold.metrics,
                photon_system_log=False,
            )

            durations = job_data.pipe.time_monitor

            self.update_config_item_with_inner_fold(
                config_item=config_item,
                fold_cnt=fold_nr,
                curr_train_fold=curr_train_fold,
                curr_test_fold=curr_test_fold,
                time_monitor=durations,
                feature_importances=new_pipe.feature_importances_,
            )

            if isinstance(self.optimization_constraints, list):
                break_cv = 0
                for cf in self.optimization_constraints:
                    if not cf.shall_continue(config_item):
                        logger.info(
                            "Skipped further cross validation after fold "
                            + str(fold_nr)
                            + " due to performance constraints in "
                            + cf.metric)
                        break_cv += 1
                        break
                if break_cv > 0:
                    break
            elif self.optimization_constraints is not None:
                if not self.optimization_constraints.shall_continue(config_item):
                    logger.info(
                        "Skipped further cross validation after fold "
                        + str(fold_nr)
                        + " due to performance constraints in "
                        + self.optimization_constraints.metric)
                    break

        InnerFoldManager.process_fit_results(
            config_item,
            self.cross_validation_infos.calculate_metrics_across_folds,
            self.cross_validation_infos.calculate_metrics_per_fold,
            self.optimization_infos.metrics,
        )

    except Exception as e:
        if self.raise_error:
            raise e
        logger.error(e)
        logger.error(traceback.format_exc())
        traceback.print_exc()
        if not isinstance(e, Warning):
            config_item.config_failed = True
        config_item.config_error = str(e)
        warnings.warn("One test iteration of pipeline failed with error")

    logger.debug("...done with")
    logger.debug(json.dumps(config_item.human_readable_config, indent=4, sort_keys=True))
    config_item.computation_end_time = datetime.datetime.now()
    return config_item
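# --- Hedged sketch (not library code) of the constraint-based early stopping above ---
# Once a performance constraint decides the configuration is not worth finishing,
# the remaining inner folds are skipped. `min_accuracy` and the score list are
# illustrative assumptions.
def run_inner_cv_example(fold_scores, min_accuracy=0.6):
    completed = []
    for fold_nr, score in enumerate(fold_scores, start=1):
        completed.append(score)
        if score < min_accuracy:
            # constraint violated: skip the remaining folds, keep what we have
            print("Skipped further cross validation after fold " + str(fold_nr))
            break
    return sum(completed) / len(completed)

# run_inner_cv_example([0.8, 0.75, 0.4, 0.9]) stops after fold 3 and averages three folds.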