def _add_mask_to_library(self, mask_name: str = '', target_affine=None, target_shape=None, mask_threshold=0.5):
    """
    Binarize a mask, optionally resample it and register it in AtlasLibrary.LIBRARY.

    The mask is looked up in ``self.photon_masks``; unknown names are handed to
    ``self._check_custom_mask``. The library key is the tuple
    ``(name, str(target_affine), str(target_shape), str(mask_threshold))``.

    :param mask_name: name of a known mask or identifier of a custom mask
    :param target_affine: affine to resample to (only used together with target_shape)
    :param target_shape: shape to resample to (only used together with target_affine)
    :param mask_threshold: voxels with a value above this threshold belong to the mask
    :raises ValueError: if the resulting mask contains no non-zero voxel
    """
    # Todo: find solution for multiprocessing spaming
    if mask_name not in self.photon_masks.keys():
        logger.debug("Checking custom mask")
        source_mask = self._check_custom_mask(mask_name)
    else:
        source_mask = self.photon_masks[mask_name]

    mask_object = MaskObject(name=mask_name, mask_file=source_mask.mask_file)

    # binarize: everything above the threshold becomes part of the mask
    threshold_formula = 'img > {}'.format(mask_threshold)
    mask_object.mask = image.math_img(threshold_formula, img=mask_object.mask_file)

    resampling_requested = target_affine is not None and target_shape is not None
    if resampling_requested:
        mask_object.mask = self._resample(mask_object.mask,
                                          target_affine=target_affine,
                                          target_shape=target_shape)

    # check if roi is empty
    n_mask_voxels = np.sum(mask_object.mask.dataobj != 0)
    if n_mask_voxels == 0:
        mask_object.is_empty = True
        msg = 'No voxels in mask after resampling (' + mask_object.name + ').'
        logger.error(msg)
        raise ValueError(msg)

    library_key = (mask_object.name, str(target_affine), str(target_shape), str(mask_threshold))
    AtlasLibrary.LIBRARY[library_key] = mask_object
    logger.debug("BrainMask: Done adding mask to library!")
def fit_and_score(job: InnerCVJob):
    """
    Fit the job's pipeline on the training data and score it on test and training data.

    :param job: job description providing ``pipe``, ``config``, ``metrics`` as well as
                ``train_data`` and ``test_data`` (each with ``X``, ``y``, ``indices``
                and ``cv_kwargs``)
    :return: tuple ``(curr_test_fold, curr_train_fold)`` as returned by
             ``InnerFoldManager.score``
    """
    pipe = job.pipe
    train_data, test_data = job.train_data, job.test_data

    # set params to current config
    pipe.set_params(**job.config)

    # start fitting
    pipe.fit(train_data.X, train_data.y, **train_data.cv_kwargs)

    # score test data (the log messages used to be swapped with the one below)
    logger.debug("Scoring Test Data")
    curr_test_fold = InnerFoldManager.score(pipe,
                                            test_data.X,
                                            test_data.y,
                                            job.metrics,
                                            indices=test_data.indices,
                                            **test_data.cv_kwargs)

    # score train data; training=True is forwarded to InnerFoldManager.score
    logger.debug("Scoring Training Data")
    curr_train_fold = InnerFoldManager.score(pipe,
                                             train_data.X,
                                             train_data.y,
                                             job.metrics,
                                             indices=train_data.indices,
                                             training=True,
                                             **train_data.cv_kwargs)

    return curr_test_fold, curr_train_fold
def transform(self, X, y=None, **kwargs):
    """
    Run the data through all transforming steps of the pipeline.

    All steps handled by ``_caching_fit_transform`` are applied first. The final
    step is applied as well when it exists and the pipeline has no estimator type
    (``self._estimator_type is None``), i.e. when it acts as a transformer.

    :param X: data to transform
    :param y: targets (optional)
    :param kwargs: additional per-sample arguments handed through the steps
    :return: transformed X, y and kwargs
    """
    # keep a copy of the untouched input; it is handed to the cache lookup of the final step
    initial_X = np.array(X) if self.single_subject_caching else None

    X, y, kwargs = self._caching_fit_transform(X, y, kwargs)

    final_step = self._final_estimator
    if final_step is None or self._estimator_type is not None:
        # nothing left to apply: either no final step or it is a pure estimator
        return X, y, kwargs

    if self.caching:
        X, y, kwargs = self.load_or_save_cached_data(final_step.name, X, y, kwargs,
                                                     final_step,
                                                     initial_X=initial_X)
    else:
        logger.debug('PhotonPipeline: Transforming data with ' + final_step.name)
        X, y, kwargs = final_step.transform(X, y, **kwargs)

    return X, y, kwargs
def predict(self, X, training=False, **kwargs):
    """
    Transform the data (unless ``training`` is set) and predict with the final step.

    :param X: data to predict on
    :param training: if True, X is used as given and ``self.transform`` is skipped
    :param kwargs: additional arguments forwarded to transform and predict
    :return: the predictions of the final estimator; the (transformed) data if the
             final step is no estimator; None if there is no final step
    """
    # first transform
    if not training:
        X, _, kwargs = self.transform(X, y=None, **kwargs)

    final_step = self._final_estimator
    if final_step is None:
        return None
    if not final_step.is_estimator:
        return X

    # then call predict on final estimator and record how long it took
    logger.debug('PhotonPipeline: Predicting with ' + final_step.name + ' ...')
    started_at = datetime.datetime.now()
    y_pred = final_step.predict(X, **kwargs)
    predict_duration = (datetime.datetime.now() - started_at).total_seconds()

    n = PhotonDataHelper.find_n(X)
    timing_entry = (self.elements[-1][0], predict_duration, n)
    self.time_monitor['predict'].append(timing_entry)
    return y_pred
def transform(self, X, y=None, **kwargs):
    """
    Transform X with the wrapped base element, optionally pre-filling the cache in parallel.

    If the base element has a cache folder, single-subject caching is switched on.
    With more than one process and a cache folder, ``apply_transform_parallelized``
    is run first so that the subsequent ``base_element.transform`` call can reload
    the results. Unless ``self.output_img`` is set, a non-empty flat collection of
    Nifti1Image objects is converted to an array of their ``dataobj``.

    :param X: input data
    :param y: targets, returned unchanged
    :param kwargs: additional arguments, returned unchanged
    :return: transformed X, y and kwargs
    """
    base_element = self.base_element
    if base_element.cache_folder is not None:
        # make sure we cache individually
        base_element.single_subject_caching = True
        base_element.caching = True

    if self.nr_of_processes > 1:
        if base_element.cache_folder is not None:
            # at first apply the transformation on several cores, everything gets written to the cache,
            # so the next step only has to reload the data ...
            self.apply_transform_parallelized(X)
        else:
            # the two sentences were concatenated without a separating space before
            logger.error(
                "Cannot use parallelization without a cache folder specified in the hyperpipe. "
                "Using single core instead")

        logger.debug("NeuroBranch " + self.name + " is collecting data from the different cores...")

    X_new, _, _ = base_element.transform(X)

    # check if we have a list of niftis, should avoid this, except when output_image = True
    if not self.output_img:
        is_flat_collection = isinstance(X_new, list) or (
            isinstance(X_new, np.ndarray) and len(X_new.shape) == 1)
        # the emptiness check also covers 1-D arrays now; indexing an empty array
        # with [0] raised an IndexError before
        if is_flat_collection and len(X_new) > 0 and isinstance(X_new[0], Nifti1Image):
            X_new = np.asarray([i.dataobj for i in X_new])

    return X_new, y, kwargs
def print_double_metrics(metric_dict_train, metric_dict_test, photon_system_log=True):
    """
    Log train and test metrics side by side as a table.

    :param metric_dict_train: metric name -> training performance
    :param metric_dict_test: metric name -> test performance; must contain every
                             key of ``metric_dict_train``
    :param photon_system_log: if True the table goes to ``logger.photon_system_log``,
                              otherwise to ``logger.debug``
    """
    table = PrettyTable(["METRIC", "PERFORMANCE TRAIN", "PERFORMANCE TEST"])
    for metric_name in metric_dict_train:
        train_cell = "%.4f" % metric_dict_train[metric_name]
        test_cell = "%.4f" % metric_dict_test[metric_name]
        table.add_row([metric_name, train_cell, test_cell])

    emit = logger.photon_system_log if photon_system_log else logger.debug
    emit(table)
def _generate_log(self, conf, subset, result, cost, tracking_vars):
    """
    Write a JSON log entry for the current iteration.

    Nothing happens if ``self._log`` is None. If ``self._log["incumbents"]`` is
    set and the final iteration has not been reached, the current incumbent is
    determined as well and stored in the entry together with the time needed.

    :param conf: used configuration
    :type conf: dict
    :param subset: used subset-fragmentation
    :type subset: int
    :param result: the result of the evaluation
    :type result: float
    :param cost: the cost of the evaluation
    :type cost: float
    :param tracking_vars: Fabolas' tracking vars
    :type tracking_vars: dict
    """
    if self._log is None:
        return

    logger.debug("Fabolas: generating log")

    in_init_phase = self._it < self._n_init
    if self._it == self._num_iterations:
        operation = "final"
    elif in_init_phase:
        operation = "init"
    else:
        operation = "opt"

    entry = {
        "config": conf,
        "subset_frac": subset,
        "config_result": result,
        "config_cost": cost,
        "iteration": self._it,
        "operation": operation,
    }

    if self._log["incumbents"] and self._it < self._num_iterations:
        start = time()
        if in_init_phase:
            # during initialization the incumbent is simply the best observed point
            best_i = np.argmin(self._Y)
            incumbent, _, track = self._create_param_dict((self._X[best_i][:-1], 1), {})
            entry["incumbents"] = incumbent
            entry["incumbents_estimated_performance"] = -1
            entry["incumbents_log"] = track["config_log"]
        else:
            inc, inc_val = self._projected_incumbent_estimation(
                self._model_objective, self._X[:, :-1])
            incumbent, _, track = self._create_param_dict((inc[:-1], 1), {})
            entry["incumbents"] = incumbent
            entry["incumbents_estimated_performance"] = inc_val
        entry["incumbent_time"] = time() - start

    entry.update(tracking_vars)

    log_file = os.path.join(self._log["path"],
                            self._log["bn"] + "_it{it}.json".format(it=self._it))
    with open(log_file, "w") as f:
        json.dump(entry, f)
def train(self, X, y, do_optimize=True):
    """
    Set up the george GP on (optionally normalized) data and determine its hyperparameters.

    The prior mean of the GP is set to the empirical mean of the (normalized)
    targets. The hyperparameters are either optimized via ``self.optimize()`` or
    taken from the kernel and the current noise level.

    :param X: Input data points. The dimensionality of X is (N, D),
              with N as the number of points and D is the number of features.
    :type X: np.ndarray (N, D)
    :param y: The corresponding target values.
    :type y: np.ndarray (N,)
    :param do_optimize: If set to true the hyperparameters are optimized otherwise
                        the default hyperparameters of the kernel are used.
    :type do_optimize: boolean
    :raises ValueError: if the output is normalized and all targets are identical
    """
    if not self.normalize_input:
        self.X = X
    else:
        # Normalize input to be in [0, 1]
        self.X, self.lower, self.upper = normalization.zero_one_normalization(
            X, self.lower, self.upper)

    if not self.normalize_output:
        self.y = y
    else:
        # Normalize output to have zero mean and unit standard deviation
        normalized = normalization.zero_mean_unit_var_normalization(y)
        self.y, self.y_mean, self.y_std = normalized
        if self.y_std == 0:
            raise ValueError(
                "Cannot normalize output. All targets have the same value")

    # Use the empirical mean of the data as mean for the GP
    self.mean = np.mean(self.y, axis=0)
    self.gp = george.GP(self.kernel, mean=self.mean)

    if do_optimize:
        self.hypers = self.optimize()
        self.gp.kernel[:] = self.hypers[:-1]
        # the last hyperparameter is the log of the noise variance (sigma^2)
        self.noise = np.exp(self.hypers[-1])
    else:
        self.hypers = self.gp.kernel[:]
        self.hypers = np.append(self.hypers, np.log(self.noise))

    if self.verbose:
        logger.debug("Fabolas.GaussianProcess: GP Hyperparameters: " + str(self.hypers))

    self.gp.compute(self.X, yerr=np.sqrt(self.noise))
    self.is_trained = True
def _generate_log(self, conf, subset, result, cost, tracking_vars):
    '''
    Store a JSON log entry for the current iteration in the log folder.

    Returns immediately if ``self._log`` is None. The incumbent is calculated
    and added to the entry if ``self._log['incumbents']`` is set and the last
    iteration has not been reached yet.

    :param conf: used configuration
    :type conf: dict
    :param subset: used subset-fragmentation
    :type subset: int
    :param result: the result of the evaluation
    :type result: float
    :param cost: the cost of the evaluation
    :type cost: float
    :param tracking_vars: Fabolas' tracking vars
    :type tracking_vars: dict
    '''
    if self._log is None:
        return

    logger.debug("Fabolas: generating log")

    record = {
        'config': conf,
        'subset_frac': subset,
        'config_result': result,
        'config_cost': cost,
        'iteration': self._it,
        'operation': 'init' if self._it < self._n_init else 'opt',
    }
    if self._it == self._num_iterations:
        record['operation'] = 'final'

    wants_incumbent = self._log['incumbents'] and self._it < self._num_iterations
    if wants_incumbent:
        start = time()
        incumbent_info = {}
        if self._it < self._n_init:
            # initialization phase: take the best point observed so far
            best_i = np.argmin(self._Y)
            best_point = (self._X[best_i][:-1], 1)
            incumbent_info['incumbents'], _, track = self._create_param_dict(best_point, {})
            incumbent_info['incumbents_estimated_performance'] = -1
            incumbent_info['incumbents_log'] = track['config_log']
        else:
            inc, inc_val = self._projected_incumbent_estimation(
                self._model_objective, self._X[:, :-1])
            incumbent_info['incumbents'], _, track = self._create_param_dict(
                (inc[:-1], 1), {})
            incumbent_info['incumbents_estimated_performance'] = inc_val
        incumbent_info['incumbent_time'] = time() - start
        record.update(incumbent_info)

    record.update(tracking_vars)

    file_name = self._log['bn'] + '_it{it}.json'.format(it=self._it)
    with open(os.path.join(self._log['path'], file_name), 'w') as f:
        json.dump(record, f)
def save_data_to_cache(self, pipe_element_name, data):
    """
    Dump ``data`` with joblib into the cache folder and register the file in the cache index.

    :param pipe_element_name: name of the pipeline element the data belongs to;
                              used to generate the cache key
    :param data: the object to store
    """
    cache_query = self.generate_cache_key(pipe_element_name)
    filename = os.path.join(self.cache_folder, str(cache_query) + ".p")
    self.cache_index[cache_query] = filename

    if not self.single_subject_caching:
        state = self.state
        message_parts = [
            "Saving data to cache for ", pipe_element_name, ": ",
            str(state.nr_items), " items ", state.first_data_str,
            " - ", str(state.config),
        ]
        logger.debug("".join(message_parts))

    # write cached data to filesystem
    with open(filename, 'wb') as cache_file:
        joblib.dump(data, cache_file)
def ask_generator(self):
    """
    Generator yielding configuration dictionaries to evaluate next.

    Every round transforms the optimizer's run history, asks the optimizer for
    challengers and passes them to ``self.intensify``. Depending on ``self.flag``
    the result is treated as a single configuration or as a list of which at
    most the first 25 entries are yielded. Every yielded dictionary is passed
    through ``self.check`` first. The generator ends (``return None``) as soon as
    ``self.optimizer.stats.is_budget_exhausted()`` is true.

    :return: generator of the values returned by ``self.check``
    """
    while True:
        # reset for this round
        # NOTE(review): presumably set to True elsewhere (e.g. inside self.intensify)
        # when a single configuration instead of a list is returned - confirm
        self.flag = False
        start_time = time.time()
        X, Y = self.optimizer.rh2EPM.transform(self.optimizer.runhistory)

        self.optimizer.logger.debug("Search for next configuration.")
        # get all configurations sorted according to acquisition function
        challengers = self.optimizer.choose_next(X, Y)
        self.test += 1
        # NOTE(review): looks like a leftover debugging print - confirm whether it is still wanted
        print("TEST # of trains", self.test)
        # the time spent choosing challengers determines the time bound for intensification
        time_spent = time.time() - start_time
        time_left = self.optimizer._get_timebound_for_intensification(time_spent)

        self.to_run = self.intensify(
            challengers=challengers,
            incumbent=self.optimizer.incumbent,
            run_history=self.optimizer.runhistory,
            aggregate_func=self.optimizer.aggregate_func,
            time_bound=max(self.optimizer.intensifier._min_time, time_left),
        )

        if self.flag:
            # self.to_run is used as a single configuration here
            if self.optimizer.stats.is_budget_exhausted():
                # yield self.optimizer.incumbent.get_dictionary()
                return None
            else:
                yield self.check(self.to_run.get_dictionary())
        else:
            # self.to_run is used as a sequence here; at most 25 entries are handed out per round
            print("Size of challenger list: ", len(self.to_run))
            for challenger in self.to_run[: min(len(self.to_run), 25)]:
                if self.optimizer.stats.is_budget_exhausted():
                    # yield self.optimizer.incumbent.get_dictionary()
                    return None
                else:
                    yield self.check(challenger.get_dictionary())

        logger.debug(
            "Remaining budget: %f (wallclock), %f (ta costs), %f (target runs)"
            % (
                self.optimizer.stats.get_remaing_time_budget(),
                self.optimizer.stats.get_remaining_ta_budget(),
                self.optimizer.stats.get_remaining_ta_runs(),
            )
        )
        self.optimizer.stats.print_stats(debug_out=True)
def fit(self, X, y=None, **kwargs):
    """
    Fit all steps of the pipeline.

    The elements are validated, the data is passed through
    ``_caching_fit_transform`` with ``fit=True`` and finally the last step (if
    any) is fitted on the transformed data. The duration of that last fit is
    appended to ``self.time_monitor["fit"]``.

    :param X: training data
    :param y: training targets (optional)
    :param kwargs: additional arguments handed through the steps
    :return: self
    """
    self._validate_elements()
    X, y, kwargs = self._caching_fit_transform(X, y, kwargs, fit=True)

    final_step = self._final_estimator
    if final_step is None:
        return self

    logger.debug("PhotonPipeline: Fitting " + final_step.name)
    fit_start_time = datetime.datetime.now()

    # NOTE(review): this is a truthiness check, so a random_state of 0 is not forwarded
    if self.random_state:
        final_step.random_state = self.random_state
    final_step.fit(X, y, **kwargs)

    # todo: actions of the final estimator after fitting
    n = PhotonDataHelper.find_n(X)
    elapsed = datetime.datetime.now() - fit_start_time
    fit_duration = elapsed.total_seconds()
    self.time_monitor["fit"].append((self.elements[-1][0], fit_duration, n))
    return self
def _standardize(self, covariates, is_fit):
    """
    Standardize every covariate column with its own StandardScaler.

    :param covariates: array of covariates; each row of ``covariates.T`` is treated
                       as one covariate (assumes shape (n_samples, n_covariates) —
                       TODO confirm against the caller)
    :param is_fit: if True, new scalers are fitted and stored in ``self.scalers``;
                   if False, the stored scalers are applied
    :return: array with the standardized covariates in the layout of the input
    """
    logger.debug('Standardizing confounder prior to removal.')
    covariate_columns = covariates.T

    if is_fit:
        # Start from a clean list on every fit. Appending to the existing list made
        # a refitted object keep the scalers of the first fit at the front, which
        # are exactly the ones the transform branch looks up by index.
        self.scalers = [StandardScaler() for _ in covariate_columns]
        scaled_covs = [
            # ravel() instead of squeeze(): identical for more than one sample, but
            # keeps a single sample as a length-1 vector instead of a 0-d array
            scaler.fit_transform(cov.reshape(-1, 1)).ravel()
            for scaler, cov in zip(self.scalers, covariate_columns)
        ]
    else:
        scaled_covs = [
            self.scalers[i].transform(cov.reshape(-1, 1)).ravel()
            for i, cov in enumerate(covariate_columns)
        ]

    return np.asarray(scaled_covs).T
def transform(self, X, y=None, **kwargs):
    """
    Generate new samples class by class via ``self._return_samples``.

    For every unique label in y the samples of that class are handed to
    ``_return_samples`` together with the matching slice of kwargs. The number of
    samples requested per class is ``self.draw_limit``, reduced by the class size
    if ``self.balance_classes`` is set. Generator, distance metric and random seed
    are taken from the instance (``self.generator``, ``self.distance_metric``,
    ``self.random_state``).

    :param X: data
    :param y: targets; although optional in the signature they are indexed here
    :param kwargs: additional per-sample arguments
    :return: X_extended, y_extended (both lists) and kwargs_extended (dict)
    """
    logger.debug("Pairing " + str(self.draw_limit) + " samples...")

    unique_classes = np.unique(y)

    # ensure class balance in the training set if balance_classes is True
    if self.balance_classes:
        n_pairs = [self.draw_limit - np.sum(y == label) for label in unique_classes]
    else:
        n_pairs = [self.draw_limit for _ in unique_classes]

    # run get_samples for each class independently
    X_extended, y_extended, kwargs_extended = [], [], {}
    for label, limit in zip(unique_classes, n_pairs):
        class_mask = y == label
        X_new_class, y_new_class, kwargs_new_class = self._return_samples(
            X[class_mask],
            y[class_mask],
            PhotonDataHelper.index_dict(kwargs, class_mask),
            generator=self.generator,
            distance_metric=self.distance_metric,
            draw_limit=limit,
            rand_seed=self.random_state,
        )

        X_extended.extend(X_new_class)
        y_extended.extend(y_new_class)

        # get the corresponding kwargs
        if kwargs:
            kwargs_extended = PhotonDataHelper.join_dictionaries(
                kwargs_extended, kwargs_new_class)

    return X_extended, y_extended, kwargs_extended
def load_cached_data(self, pipe_element_name):
    """
    Load the cached (X, y, kwargs) triple for a pipeline element.

    :param pipe_element_name: name of the pipeline element; used to generate the cache key
    :return: tuple (X, y, kwargs) loaded with joblib, or None if the cache index
             has no entry for the generated key
    """
    cache_query = self.generate_cache_key(pipe_element_name)
    if cache_query not in self.cache_index:
        return None

    if not self.single_subject_caching:
        state = self.state
        message_parts = [
            "Loading data from cache for ", pipe_element_name, ": ",
            str(state.nr_items), " items ", state.first_data_str,
            " - ", str(state.config),
        ]
        logger.debug("".join(message_parts))

    filename = self.cache_index[cache_query]
    with open(filename, 'rb') as cache_file:
        X, y, kwargs = joblib.load(cache_file)
    return X, y, kwargs
def _add_mask_to_library(
    self,
    mask_name: str = "",
    target_affine=None,
    target_shape=None,
    mask_threshold=0.5,
):
    """
    Binarize a mask, optionally resample it and store it in ``self.library``.

    The mask is looked up in ``self.photon_masks``; unknown names are handed to
    ``self._check_custom_mask``. An empty result is logged as an error and flagged
    via ``is_empty`` but is still added to the library. The library key is the
    tuple ``(name, str(target_affine), str(target_shape), str(mask_threshold))``.

    :param mask_name: name of a known mask or identifier of a custom mask
    :param target_affine: affine to resample to (only used together with target_shape)
    :param target_shape: shape to resample to (only used together with target_affine)
    :param mask_threshold: voxels with a value above this threshold belong to the mask
    """
    # Todo: find solution for multiprocessing spaming
    if mask_name not in self.photon_masks.keys():
        logger.debug("Checking custom mask")
        source_mask = self._check_custom_mask(mask_name)
    else:
        source_mask = self.photon_masks[mask_name]

    mask_object = MaskObject(name=mask_name, mask_file=source_mask.mask_file)

    # binarize: everything above the threshold becomes part of the mask
    threshold_formula = "img > {}".format(mask_threshold)
    mask_object.mask = math_img(threshold_formula, img=mask_object.mask_file)

    resampling_requested = target_affine is not None and target_shape is not None
    if resampling_requested:
        mask_object.mask = self._resample(mask_object.mask,
                                          target_affine=target_affine,
                                          target_shape=target_shape)

    # check if roi is empty
    n_mask_voxels = np.sum(mask_object.mask.dataobj != 0)
    if n_mask_voxels == 0:
        logger.error("No voxels in mask after resampling (" + mask_object.name + ").")
        mask_object.is_empty = True

    library_key = (
        mask_object.name,
        str(target_affine),
        str(target_shape),
        str(mask_threshold),
    )
    self.library[library_key] = mask_object
    logger.debug("BrainMask: Done adding mask to library!")
def save(self):
    """
    Persist the results to MongoDB and/or the project folder.

    If a MongoDB connection URL is configured, a connection with the alias
    "photon_core" is opened and ``self.results`` is saved; a DocumentTooLarge
    error is logged and otherwise ignored. If ``save_output`` is set, the result
    tree is written to file.
    """
    settings = self.output_settings

    mongodb_url = settings.mongodb_connect_url
    if mongodb_url:
        connect(mongodb_url, alias="photon_core")
        logger.debug("Write results to mongodb...")
        try:
            self.results.save()
        except DocumentTooLarge:
            # an idea for later: reduce the amount of configs saved and retry
            logger.error(
                "Could not save document into MongoDB: Document too large")

    if settings.save_output:
        logger.info("Writing results to project folder...")
        self.write_result_tree_to_file()
def calc_config(self):
    '''
    Generator over the configurations and subset-fragmentations to evaluate.

    It runs through three phases: ``self._n_init`` initialization steps, the
    optimization steps up to ``self._num_iterations`` and one final step that
    retrains the objective model and yields the incumbent. ``self._it`` holds the
    index of the current step. Every yielded value is the result of
    ``self._create_param_dict``; the tracking vars are for internal use and need
    to be passed to process_result.

    :return: next configuration to test, subset-frag to use, tracking-vars
    :rtype: dict, int, dict
    '''
    logger.info('**Fabolas: Starting initialization')
    for step in range(0, self._n_init):
        self._it = step
        logger.debug('Fabolas: step ' + str(self._it) + ' (init)')
        started = time()
        result = self._init_models()
        overhead = time() - started
        tracking = {'overhead_time': overhead}
        logger.debug('Fabolas: needed ' + str(overhead) + 's')
        yield self._create_param_dict(result, tracking)

    # the observations are converted to arrays once the initialization is over
    self._X = np.array(self._X)
    self._Y = np.array(self._Y)
    self._cost = np.array(self._cost)

    logger.info('**Fabolas: Starting optimization')
    for step in range(self._n_init, self._num_iterations):
        self._it = step
        logger.debug('Fabolas: step ' + str(self._it) + ' (opt)')
        started = time()
        result = self._optimize_config()
        overhead = time() - started
        tracking = {'overhead_time': overhead}
        logger.debug('Fabolas: needed ' + str(overhead) + 's')
        yield self._create_param_dict(result, tracking)

    logger.info('Fabolas: Final config')
    started = time()
    self._model_objective.train(self._X, self._Y, do_optimize=True)
    result = self.get_incumbent()
    overhead = time() - started
    tracking = {'overhead_time': overhead}
    logger.debug('Fabolas: needed ' + str(overhead) + 's')
    yield self._create_param_dict(result, tracking)
def sample_representer_points(self):
    """
    Sample representer points with emcee and store them in ``self.zb`` / ``self.lmb``.

    Up to five attempts are made, each starting from points drawn uniformly
    between ``self.lower`` and ``self.upper``; the loop stops at the first attempt
    whose ``lmb`` values contain no infinity. The results of the last attempt are
    kept in any case. One-dimensional results are turned into column vectors.
    """
    self.sampling_acquisition.update(self.model)

    n_points, n_dims = self.Nb, self.D
    for attempt in range(5):
        restarts = np.zeros((n_points, n_dims))
        restarts[:] = self.lower + (self.upper - self.lower) \
            * self.rng.uniform(size=(n_points, n_dims))

        sampler = emcee.EnsembleSampler(n_points, n_dims,
                                        self.sampling_acquisition_wrapper)

        # zb are the representer points and lmb are their log EI values
        self.zb, self.lmb, _ = sampler.run_mcmc(restarts, 50)

        if np.any(np.isinf(self.lmb)):
            if self.verbose:
                logger.debug("Fabolas.InformationGain: Infinity")
            continue
        break

    if self.zb.ndim == 1:
        self.zb = self.zb[:, None]
    if self.lmb.ndim == 1:
        self.lmb = self.lmb[:, None]
def transform(self, X, y=None, **kwargs):
    """
    Remove the confounder contribution from every feature column of X.

    The confounders are taken from kwargs via ``_check_for_confounders``,
    optionally standardized with the already fitted scalers and extended by a
    constant column. For every feature the prediction from
    ``self.olsModel_params`` is subtracted, so that the residuals remain.

    :param X: data, indexed as (samples, features)
    :param y: targets, not used
    :param kwargs: additional arguments containing the confounders
    :return: X_new (the residuals) and kwargs - note that y is not part of the result
    """
    logger.debug('Regress out confounder.')
    confounders = self._check_for_confounders(kwargs)
    self._validate_dimension(X, confounders)

    # standardize covariates
    if self.standardize_covariates:
        confounders = self._standardize(confounders, is_fit=False)
    design_matrix = sm.add_constant(confounders)

    n_features = X.shape[1]
    X_new = np.empty(X.shape)
    for feature_idx in range(n_features):
        feature_params = np.squeeze(self.olsModel_params[feature_idx])
        predicted = np.matmul(design_matrix, feature_params)
        residuals = np.squeeze(X[:, feature_idx]) - predicted
        # writing back the residuum of the feature vector
        X_new[:, feature_idx] = np.asarray(residuals)

    return X_new, kwargs
def _do_timed_fit_transform(self, name, transformer, fit, X, y, **kwargs):
    """
    Optionally fit a transformer, then transform the data, recording both durations.

    Durations are appended to ``self.time_monitor`` under "fit" and
    "transform_computed" as tuples ``(name, seconds, n)`` where n is determined
    from the input X.

    :param name: name under which the durations are recorded
    :param transformer: element offering fit and transform
    :param fit: if True the transformer is fitted before transforming
    :param X: data
    :param y: targets
    :param kwargs: additional arguments forwarded to fit and transform
    :return: transformed X, y and kwargs
    """
    n = PhotonDataHelper.find_n(X)

    # NOTE(review): this is a truthiness check, so a random_state of 0 is not forwarded
    if self.random_state:
        transformer.random_state = self.random_state

    current_time = datetime.datetime.now

    if fit:
        logger.debug("PhotonPipeline: Fitting " + transformer.name)
        fit_started = current_time()
        transformer.fit(X, y, **kwargs)
        fit_duration = (current_time() - fit_started).total_seconds()
        self.time_monitor["fit"].append((name, fit_duration, n))

    logger.debug("PhotonPipeline: Transforming data with " + transformer.name)
    transform_started = current_time()
    X, y, kwargs = transformer.transform(X, y, **kwargs)
    transform_duration = (current_time() - transform_started).total_seconds()
    self.time_monitor["transform_computed"].append((name, transform_duration, n))

    return X, y, kwargs
def sample_representer_points(self):
    """
    Sample representer points in the configuration subspace and append the environmental part.

    Only the dimensions with ``self.is_env == 0`` are sampled with emcee (up to
    five attempts, stopping at the first one without infinite ``lmb`` values).
    Afterwards one constant column per environmental dimension is appended to
    ``self.zb``.

    :raises ValueError: if the last attempt still contains infinite values
    """
    # Sample representer points only in the configuration space;
    # the environmental variables are filled in with a constant afterwards
    config_dims = np.where(self.is_env == 0)
    D = config_dims[0].shape[0]
    lower = self.lower[config_dims]
    upper = self.upper[config_dims]

    self.sampling_acquisition.update(self.model)

    for attempt in range(5):
        restarts = np.random.uniform(low=lower, high=upper, size=(self.Nb, D))
        sampler = emcee.EnsembleSampler(self.Nb, D,
                                        self.sampling_acquisition_wrapper)
        self.zb, self.lmb, _ = sampler.run_mcmc(restarts, 50)

        if np.any(np.isinf(self.lmb)):
            if self.verbose:
                logger.debug(
                    "Fabolas.InformationGainPerUnitCost: Infinity")
            continue
        break

    if np.any(np.isinf(self.lmb)):
        raise ValueError(
            "Could not sample valid representer points! LogEI is -infinity"
        )

    if self.zb.ndim == 1:
        self.zb = self.zb[:, None]
    if self.lmb.ndim == 1:
        self.lmb = self.lmb[:, None]

    # Project representer points to subspace
    # NOTE(review): the constant is the *number* of environmental dimensions, which
    # is 1 only if there is exactly one of them - confirm that this is intended
    n_env_dims = self.upper[self.is_env == 1].shape[0]
    proj = np.ones([self.zb.shape[0], n_env_dims])
    proj *= n_env_dims
    self.zb = np.concatenate((self.zb, proj), axis=1)
def _optimize_config(self):
    """
    One step of the optimization phase: retrain both models and maximize the acquisition function.

    The last entry of the maximizer's result is passed through
    ``self._retransform`` and divides ``self._s_max`` to obtain the
    subset-fragmentation; the remaining entries form the configuration.

    :return: configuration, subset-fragmentation
    :rtype: list, int
    """
    # Train models
    logger.debug("Fabolas: Train model_objective")
    self._model_objective.train(self._X, self._Y, do_optimize=True)

    logger.debug("Fabolas: Train model_cost")
    self._model_cost.train(self._X, self._cost, do_optimize=True)

    # Maximize acquisition function
    logger.debug("Fabolas: Update acquisition func")
    self._acquisition_func.update(self._model_objective, self._model_cost)

    logger.debug("Fabolas: Generate new config by maximizing")
    new_x = self._maximizer.maximize()

    config_part, subset_part = new_x[:-1], new_x[-1]
    subset_frac = self._s_max / self._retransform(subset_part)

    logger.debug("Fabolas: config generation done for this step")
    return config_part, int(subset_frac)
def load_or_save_cached_data(self, name, X, y, kwargs, transformer, fit=False,
                             needed_for_further_computation=False, initial_X=None):
    """
    Return the transformed data for one pipeline step, using the cache where possible.

    Two modes exist. Without single-subject caching the whole data set is treated
    as one cache item. With single-subject caching every sample is looked up
    individually, the non-cached samples are transformed as one group and then
    stored one by one, and finally the cached samples are loaded and everything
    is brought back into order.

    If ``self.skip_loading`` is set and the result is not needed for further
    computation, cached data is not loaded: in group mode the inputs are returned
    untouched when a cache entry exists, in single-subject mode empty containers
    are returned.

    Durations of saving/loading are appended to ``self.time_monitor['transform_cached']``.

    :param name: name of the step; used for the cache lookup and the time monitor
    :param X: data
    :param y: targets
    :param kwargs: dictionary of additional per-sample arguments
    :param transformer: element that is fitted/applied if data is not cached
    :param fit: if True, the transformer is fitted before transforming
    :param needed_for_further_computation: forces loading even if ``self.skip_loading`` is set
    :param initial_X: the untransformed input; used to identify single samples in
                      single-subject mode
    :return: X, y and kwargs after the step
    """
    if not self.single_subject_caching:
        # if we do it group-wise then its easy
        if self.skip_loading and not needed_for_further_computation:
            # check if data is already calculated
            if self.cache_man.check_cache(name):
                # if so, do nothing
                return X, y, kwargs
            else:
                # otherwise, do the calculation and save it
                cached_result = None
        else:
            # only this branch defines start_time_for_loading; it is also the only
            # branch that can produce a cached_result that is not None
            start_time_for_loading = datetime.datetime.now()
            cached_result = self.cache_man.load_cached_data(name)

        if cached_result is None:
            X, y, kwargs = self._do_timed_fit_transform(
                name, transformer, fit, X, y, **kwargs)

            start_time_saving = datetime.datetime.now()
            self.cache_man.save_data_to_cache(name, (X, y, kwargs))
            saving_duration = (datetime.datetime.now() -
                               start_time_saving).total_seconds()
            self.time_monitor['transform_cached'].append(
                (name, saving_duration, 1))
        else:
            X, y, kwargs = cached_result[0], cached_result[
                1], cached_result[2]
            loading_duration = (datetime.datetime.now() -
                                start_time_for_loading).total_seconds()
            n = PhotonDataHelper.find_n(X)
            self.time_monitor['transform_cached'].append(
                (name, loading_duration, n))
        return X, y, kwargs
    else:
        # if we do it subject-wise we need to iterate and collect the results
        processed_X, processed_y, processed_kwargs = list(), list(), dict()
        X_uncached, y_uncached, kwargs_uncached, initial_X_uncached = list(
        ), list(), dict(), list()
        list_of_idx_cached, list_of_idx_non_cached = list(), list()

        nr = PhotonDataHelper.find_n(X)
        for start, stop in PhotonDataHelper.chunker(nr, 1):
            # split data in single entities, find key from first element = PATH to file
            X_key, _, _ = PhotonDataHelper.split_data(
                initial_X, None, {}, start, stop)
            X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                X, y, kwargs, start, stop)
            # the cache manager state decides which cache entry check_cache looks at
            self.cache_man.update_single_subject_state_info(X_key)

            # check if item has been processed
            if self.cache_man.check_cache(name):
                list_of_idx_cached.append(start)
            else:
                list_of_idx_non_cached.append(start)
                X_uncached = PhotonDataHelper.stack_data_vertically(
                    X_uncached, X_batched)
                y_uncached = PhotonDataHelper.stack_data_vertically(
                    y_uncached, y_batched)
                initial_X_uncached = PhotonDataHelper.stack_data_vertically(
                    initial_X_uncached, X_key)
                kwargs_uncached = PhotonDataHelper.join_dictionaries(
                    kwargs_uncached, kwargs_dict_batched)

        # now we know which part can be loaded and which part should be transformed
        # first apply the transformation to the group, then save it single-subject-wise
        if len(list_of_idx_non_cached) > 0:

            # apply transformation groupwise
            new_group_X, new_group_y, new_group_kwargs = self._do_timed_fit_transform(
                name, transformer, fit, X_uncached, y_uncached,
                **kwargs_uncached)

            # then save it single
            nr = PhotonDataHelper.find_n(new_group_X)
            for start in range(nr):
                # split data in single entities
                X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                    new_group_X, new_group_y, new_group_kwargs, start,
                    start)
                X_key, _, _ = PhotonDataHelper.split_data(
                    initial_X_uncached, None, {}, start, start)
                # we save the data in relation to the input path (X_key = hash(input X))
                self.cache_man.update_single_subject_state_info(X_key)
                start_time_saving = datetime.datetime.now()
                self.cache_man.save_data_to_cache(
                    name, (X_batched, y_batched, kwargs_dict_batched))
                saving_duration = (datetime.datetime.now() -
                                   start_time_saving).total_seconds()
                self.time_monitor['transform_cached'].append(
                    (name, saving_duration, 1))

            # we need to collect the data only when we want to load them
            # we can skip that process if we only want them to get into the cache (case: parallelisation)
            if not self.skip_loading or needed_for_further_computation:
                # stack results
                processed_X, processed_y, processed_kwargs = new_group_X, new_group_y, new_group_kwargs

        # afterwards load everything that has been cached
        if len(list_of_idx_cached) > 0:
            if not self.skip_loading or needed_for_further_computation:
                for cache_idx in list_of_idx_cached:
                    # we identify the data according to the input path (X before any transformation)
                    self.cache_man.update_single_subject_state_info(
                        [initial_X[cache_idx]])

                    # time the loading of the cached item
                    start_time_for_loading = datetime.datetime.now()
                    transformed_X, transformed_y, transformed_kwargs = self.cache_man.load_cached_data(
                        name)
                    loading_duration = (
                        datetime.datetime.now() -
                        start_time_for_loading).total_seconds()
                    # NOTE(review): a single item is loaded here but the size of the
                    # complete X is recorded - confirm whether 1 was intended
                    self.time_monitor['transform_cached'].append(
                        (name, loading_duration,
                         PhotonDataHelper.find_n(X)))

                    processed_X, processed_y, processed_kwargs = PhotonDataHelper.join_data(
                        processed_X, transformed_X, processed_y,
                        transformed_y, processed_kwargs,
                        transformed_kwargs)

        logger.debug(name + " loaded " + str(len(list_of_idx_cached)) +
                     " items from cache and computed " +
                     str(len(list_of_idx_non_cached)))
        if not self.skip_loading or needed_for_further_computation:
            # now sort the data in the correct order again
            # NOTE(review): the processed data is collected as computed items first
            # and loaded items second, while the index list handed over here has the
            # cached indices first - verify against resort_splitted_data / join_data
            # that both orders agree
            processed_X, processed_y, processed_kwargs = PhotonDataHelper.resort_splitted_data(
                processed_X, processed_y, processed_kwargs,
                PhotonDataHelper.stack_data_vertically(
                    list_of_idx_cached, list_of_idx_non_cached))

        return processed_X, processed_y, processed_kwargs
def fit(self, X, y, **kwargs):
    """
    Iterates over cross-validation folds and trains the pipeline, then uses it for predictions.
    Calculates metrics per fold and averages them over fold.

    After every fold the performance constraints (a single constraint object or a
    list of them) are asked whether cross validation shall continue; the first
    constraint that declines stops the loop.

    Unless ``self.raise_error`` is set, exceptions are logged, stored in the
    returned config item and reported as a warning instead of being raised.

    :param X: Training and test data
    :param y: Training and test targets
    :param kwargs: additional per-sample arguments; split along with X and y
    :returns: configuration class for result tree that monitors training and test performance
    """
    # needed for testing Timeboxed Random Grid Search
    # time.sleep(35)

    config_item = MDBConfig()
    config_item.config_dict = self.params
    config_item.inner_folds = []
    config_item.metrics_test = []
    config_item.metrics_train = []
    config_item.computation_start_time = datetime.datetime.now()

    try:
        # do inner cv
        inner_folds = self.cross_validation_infos.inner_folds[self.outer_fold_id]
        for idx, (inner_fold_id, inner_fold) in enumerate(inner_folds.items()):

            train, test = inner_fold.train_indices, inner_fold.test_indices

            # split kwargs according to cross validation
            train_X, train_y, kwargs_cv_train = PhotonDataHelper.split_data(
                X, y, kwargs, indices=train)
            test_X, test_y, kwargs_cv_test = PhotonDataHelper.split_data(
                X, y, kwargs, indices=test)

            new_pipe = self.pipe()
            if self.cache_folder is not None and self.cache_updater is not None:
                self.cache_updater(new_pipe, self.cache_folder, inner_fold_id)

            if not config_item.human_readable_config:
                config_item.human_readable_config = PhotonPrintHelper.config_to_human_readable_dict(
                    new_pipe, self.params)
                logger.clean_info(
                    json.dumps(config_item.human_readable_config,
                               indent=4,
                               sort_keys=True))

            job_data = InnerFoldManager.InnerCVJob(
                pipe=new_pipe,
                config=dict(self.params),
                metrics=self.optimization_infos.metrics,
                callbacks=self.optimization_constraints,
                train_data=InnerFoldManager.JobData(train_X, train_y, train,
                                                    kwargs_cv_train),
                test_data=InnerFoldManager.JobData(test_X, test_y, test,
                                                   kwargs_cv_test),
            )

            fold_nr = idx + 1
            logger.debug("calculating inner fold " + str(fold_nr) + "...")

            curr_test_fold, curr_train_fold = InnerFoldManager.fit_and_score(
                job_data)
            logger.debug("Performance inner fold " + str(fold_nr))
            print_double_metrics(
                curr_train_fold.metrics,
                curr_test_fold.metrics,
                photon_system_log=False,
            )

            durations = job_data.pipe.time_monitor

            self.update_config_item_with_inner_fold(
                config_item=config_item,
                fold_cnt=fold_nr,
                curr_train_fold=curr_train_fold,
                curr_test_fold=curr_test_fold,
                time_monitor=durations,
                feature_importances=new_pipe.feature_importances_,
            )

            # Treat a single constraint like a one-element list so both cases share
            # one code path. The former single-constraint branch referenced the loop
            # variable of the list branch (`cf`), which is undefined there and
            # raised a NameError instead of logging and stopping.
            constraints = self.optimization_constraints
            if constraints is None:
                constraints = []
            elif not isinstance(constraints, list):
                constraints = [constraints]

            # first constraint that does not want to continue (if any)
            violated = next(
                (cf for cf in constraints if not cf.shall_continue(config_item)),
                None)
            if violated is not None:
                logger.info("Skipped further cross validation after fold " +
                            str(fold_nr) +
                            " due to performance constraints in " +
                            violated.metric)
                break

        InnerFoldManager.process_fit_results(
            config_item,
            self.cross_validation_infos.calculate_metrics_across_folds,
            self.cross_validation_infos.calculate_metrics_per_fold,
            self.optimization_infos.metrics,
        )

    except Exception as e:
        if self.raise_error:
            # bare raise keeps the original traceback untouched
            raise
        logger.error(e)
        logger.error(traceback.format_exc())
        traceback.print_exc()
        # warnings that were turned into exceptions do not mark the config as failed
        if not isinstance(e, Warning):
            config_item.config_failed = True
        config_item.config_error = str(e)
        warnings.warn("One test iteration of pipeline failed with error")

    logger.debug("...done with")
    logger.debug(
        json.dumps(config_item.human_readable_config,
                   indent=4,
                   sort_keys=True))

    config_item.computation_end_time = datetime.datetime.now()
    return config_item
def transform(self, X, y=None, **kwargs):
    """
    Extracts the data of every selected ROI of the configured atlas from the given images.

    Also stores ``self.affine``, ``self.shape``, ``self.roi_allocation``
    (ROI label -> position in the ROI list) and ``self.mask_indices``.

    :param X: input data
    :param y: targets
    :param kwargs:
    :return: roi_data: np.ndarray, ROIs data for given brain atlas in concat or list form.
    :raises ValueError: if ``self.collection_mode`` is neither 'list' nor 'concat'
    """
    X, n_subjects = NiftiConverter.transform(X)

    if self.collection_mode not in ('list', 'concat'):
        # Format the complete message: ``.format`` used to be bound only to a
        # trailing string literal, so the placeholder was never substituted.
        # No fallback is announced any more because this branch raises.
        msg = "Collection mode {} not supported. Use 'list' or 'concat' instead.".format(
            self.collection_mode)
        logger.error(msg)
        raise ValueError(msg)
    collection_mode = self.collection_mode

    # 1. validate if all X are in the same space and have the same voxelsize and have the same orientation

    # get ROI mask
    self.affine, self.shape = BrainMask.get_format_info_from_first_image(X)
    atlas_obj = AtlasLibrary().get_atlas(self.atlas_name, self.affine, self.shape, self.mask_threshold)
    roi_objects = self._get_rois(atlas_obj, which_rois=self.rois, background_id=self.background_id)

    roi_data = [[] for _ in range(n_subjects)]
    roi_data_concat = []
    t1 = time.time()

    # convert to series and C ordering since this will speed up the masking process
    series = _utils.as_ndarray(_utils.niimg._safe_get_data(X), dtype='float32', order="C", copy=True)
    mask_indices = []

    # calculate roi_data for every ROI object by looping
    for i, roi in enumerate(roi_objects):
        self.roi_allocation[roi.label] = i
        logger.debug("Extracting ROI {}".format(roi.label))
        # simply call apply_mask to extract one roi
        extraction = self.apply_mask(series, roi.mask)
        if collection_mode == 'list':
            # one entry per subject and ROI, one mask index per ROI
            for sub_i in range(extraction.shape[0]):
                roi_data[sub_i].append(extraction[sub_i])
            mask_indices.append(i)
        else:
            # one mask index per extracted value so that every column can be
            # traced back to its ROI after concatenation
            roi_data_concat.append(extraction)
            mask_indices.append(np.ones(extraction[0].size) * i)

    if collection_mode == 'concat':
        if n_subjects > 1:
            roi_data = np.concatenate(roi_data_concat, axis=1)
            self.mask_indices = np.concatenate(mask_indices)
        else:
            # NOTE(review): the single subject case is not concatenated along
            # axis 1 and keeps mask_indices as a list -- confirm this is intended.
            roi_data = np.array(roi_data_concat)
            self.mask_indices = mask_indices
    else:
        self.mask_indices = mask_indices

    elapsed_time = time.time() - t1
    logger.debug("Time for extracting {} ROIs in {} subjects: {} seconds".format(len(roi_objects),
                                                                                 n_subjects, elapsed_time))
    return roi_data
def transform(self, X, y=None, **kwargs):
    """
    Extracts the data of every selected ROI of the configured atlas from the given images.

    An unsupported ``self.collection_mode`` is logged as an error and treated
    as 'concat'. Also stores ``self.affine``, ``self.shape`` and
    ``self.mask_indices``.

    :param X: input data; a list, a str, an np.ndarray or an already loaded image
    :param y: targets (unused)
    :param kwargs: unused
    :return: per-subject lists of ROI data in 'list' mode, otherwise the ROI
        data concatenated along axis 1
    :raises Exception: if X is empty
    """
    if len(X) < 1:
        raise Exception("Brain Atlas: Did not get any data in parameter X")

    if self.collection_mode == "list" or self.collection_mode == "concat":
        collection_mode = self.collection_mode
    else:
        collection_mode = "concat"
        logger.error(
            "Collection mode {} not supported. Use 'list' or 'concat' instead."
            "Falling back to concat mode.".format(self.collection_mode))

    # 1. validate if all X are in the same space and have the same voxelsize and have the same orientation

    # 2. load sample data to get target affine and target shape to adapt the brain atlas
    self.affine, self.shape = BrainMask.get_format_info_from_first_image(X)

    # load all niftis to memory
    if isinstance(X, list):
        n_subjects = len(X)
        X = image.load_img(X)
    elif isinstance(X, str):
        n_subjects = 1
        X = image.load_img(X)
    elif isinstance(X, np.ndarray):
        n_subjects = X.shape[0]
        X = image.load_img(X)
    else:
        # presumably an already loaded image with subjects on the last axis -- TODO confirm
        n_subjects = X.shape[-1]

    # get ROI mask
    atlas_obj = AtlasLibrary().get_atlas(self.atlas_name, self.affine,
                                         self.shape, self.mask_threshold)
    roi_objects = self._get_rois(atlas_obj,
                                 which_rois=self.rois,
                                 background_id=self.background_id)

    roi_data = [[] for _ in range(n_subjects)]
    roi_data_concat = []
    t1 = time.time()

    # convert to series and C ordering since this will speed up the masking process
    series = _utils.as_ndarray(_safe_get_data(X),
                               dtype="float32",
                               order="C",
                               copy=True)
    mask_indices = []

    for i, roi in enumerate(roi_objects):
        logger.debug("Extracting ROI {}".format(roi.label))
        # simply call apply_mask to extract one roi
        extraction = self.apply_mask(series, roi.mask)
        if collection_mode == "list":
            for sub_i in range(extraction.shape[0]):
                roi_data[sub_i].append(extraction[sub_i])
            mask_indices.append(i)
        else:
            roi_data_concat.append(extraction)
            mask_indices.append(np.ones(extraction[0].size) * i)

    # Decide on the validated local mode, not on self.collection_mode:
    # otherwise the announced fallback to concat mode would collect the data
    # in roi_data_concat but return the untouched (empty) per-subject lists.
    if collection_mode == "concat":
        roi_data = np.concatenate(roi_data_concat, axis=1)
        self.mask_indices = np.concatenate(mask_indices)
    else:
        self.mask_indices = mask_indices

    elapsed_time = time.time() - t1
    logger.debug(
        "Time for extracting {} ROIs in {} subjects: {} seconds".format(
            len(roi_objects), n_subjects, elapsed_time))
    return roi_data
def fit(self, X, y=None, **kwargs):
    """
    Runs the hyperparameter optimization of one outer fold and evaluates the best configuration.

    Every configuration offered by the optimizer is passed to
    ``self.objective_function``. If at least one configuration was tested,
    the optimum configuration is fetched from ``self.optimization_info``, a
    fresh pipeline is fitted with it on the validation data, and its
    performance is stored in ``self.result_object.best_config``. Depending on
    ``eval_final_performance`` the performance is either scored on the test
    set or copied from the inner fold metrics.

    :param X: data, handed to ``self._prepare_data``
    :param y: targets, handed to ``self._prepare_data``
    :param kwargs: additional arguments, handed to ``self._prepare_data``
    :raises Exception: if configurations were tested but no optimum config was found
    """
    logger.photon_system_log('')
    logger.photon_system_log(
        '***************************************************************************************************************'
    )
    logger.photon_system_log('Outer Cross validation Fold {}'.format(
        self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr))
    logger.photon_system_log(
        '***************************************************************************************************************'
    )

    self._prepare_data(X, y, **kwargs)
    self._fit_dummy()
    self._generate_inner_folds()
    self._prepare_optimization()

    outer_fold_fit_start_time = datetime.datetime.now()
    self.best_metric_yet = None
    self.tested_config_counter = 0

    # distribute number of folds to encapsulated child hyperpipes
    # self.__distribute_cv_info_to_hyperpipe_children(num_of_folds=num_folds,
    #                                                 outer_fold_counter=outer_fold_counter)

    if self.cross_validaton_info.calculate_metrics_per_fold:
        self.fold_operation = FoldOperations.MEAN
    else:
        self.fold_operation = FoldOperations.RAW

    # only used for progress output; stays empty if the optimizer cannot tell
    self.max_nr_of_configs = ''
    if hasattr(self.optimizer, 'n_configurations'):
        self.max_nr_of_configs = str(self.optimizer.n_configurations)

    if isinstance(self.optimizer, PhotonMasterOptimizer):
        self.optimizer.optimize()
    else:
        # do the optimizing
        for current_config in self.optimizer.ask:
            self.objective_function(current_config)

    logger.clean_info(
        '---------------------------------------------------------------------------------------------------------------'
    )
    logger.info(
        'Hyperparameter Optimization finished. Now finding best configuration .... '
    )
    # report through the logger like the rest of this method instead of stdout
    logger.debug('Number of tested configurations: {}'.format(
        self.tested_config_counter))

    # now go on with the best config found
    if self.tested_config_counter > 0:
        best_config_outer_fold = self.optimization_info.get_optimum_config(
            self.result_object.tested_config_list, self.fold_operation)

        if not best_config_outer_fold:
            raise Exception("No best config was found!")

        # ... and create optimal pipeline
        optimum_pipe = self.copy_pipe_fnc()
        if self.cache_updater is not None:
            self.cache_updater(optimum_pipe, self.cache_folder,
                               "fixed_fold_id")
        optimum_pipe.caching = False
        # set self to best config
        optimum_pipe.set_params(**best_config_outer_fold.config_dict)

        # Todo: set all children to best config and inform to NOT optimize again, ONLY fit
        # for child_name, child_config in best_config_outer_fold_mdb.children_config_dict.items():
        #     if child_config:
        #         # in case we have a pipeline stacking we need to identify the particular subhyperpipe
        #         splitted_name = child_name.split('__')
        #         if len(splitted_name) > 1:
        #             stacking_element = self.optimum_pipe.named_steps[splitted_name[0]]
        #             pipe_element = stacking_element.elements[splitted_name[1]]
        #         else:
        #             pipe_element = self.optimum_pipe.named_steps[child_name]
        #         pipe_element.set_params(**child_config)
        #         pipe_element.is_final_fit = True
        # self.__distribute_cv_info_to_hyperpipe_children(reset=True)

        logger.debug(
            'Fitting model with best configuration of outer fold...')
        optimum_pipe.fit(self._validation_X, self._validation_y,
                         **self._validation_kwargs)

        self.result_object.best_config = best_config_outer_fold

        # save test performance
        best_config_performance_mdb = MDBInnerFold()
        # -99 is the fold number used here for the best config item
        best_config_performance_mdb.fold_nr = -99
        best_config_performance_mdb.number_samples_training = self._validation_y.shape[0]
        best_config_performance_mdb.number_samples_validation = self._test_y.shape[0]
        best_config_performance_mdb.feature_importances = optimum_pipe.feature_importances_

        if self.cross_validaton_info.eval_final_performance:
            # Todo: generate mean and std over outer folds as well. move this items to the top
            logger.info(
                'Calculating best model performance on test set...')

            logger.debug('...scoring test data')
            test_score_mdb = InnerFoldManager.score(
                optimum_pipe,
                self._test_X,
                self._test_y,
                indices=self.cross_validaton_info.outer_folds[
                    self.outer_fold_id].test_indices,
                metrics=self.optimization_info.metrics,
                **self._test_kwargs)

            logger.debug('... scoring training data')
            train_score_mdb = InnerFoldManager.score(
                optimum_pipe,
                self._validation_X,
                self._validation_y,
                indices=self.cross_validaton_info.outer_folds[
                    self.outer_fold_id].train_indices,
                metrics=self.optimization_info.metrics,
                training=True,
                **self._validation_kwargs)

            best_config_performance_mdb.training = train_score_mdb
            best_config_performance_mdb.validation = test_score_mdb

            print_double_metrics(train_score_mdb.metrics,
                                 test_score_mdb.metrics)
        else:

            def _copy_inner_fold_means(metric_dict):
                # We copy all mean values from validation to the best config
                # training
                train_item_metrics = {
                    m.metric_name: m.value
                    for m in metric_dict
                    if m.operation == str(self.fold_operation)
                }
                train_item = MDBScoreInformation()
                train_item.metrics_copied_from_inner = True
                train_item.metrics = train_item_metrics
                return train_item

            # training
            best_config_performance_mdb.training = _copy_inner_fold_means(
                best_config_outer_fold.metrics_train)
            # validation
            best_config_performance_mdb.validation = _copy_inner_fold_means(
                best_config_outer_fold.metrics_test)

        # write best config performance to best config item
        self.result_object.best_config.best_config_score = best_config_performance_mdb

    logger.info('Computations in outer fold {} took {} minutes.'.format(
        self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr,
        (datetime.datetime.now() -
         outer_fold_fit_start_time).total_seconds() / 60))
def _add_atlas_to_library(self,
                          atlas_name,
                          target_affine=None,
                          target_shape=None,
                          mask_threshold=None):
    """
    Loads an atlas, resamples it and registers it in ``self.library``.

    The atlas is looked up in ``self.photon_atlases`` or, if unknown there,
    via ``self._check_custom_atlas``. The resampled map is optionally
    thresholded, its unique values become the ROI indices, and one RoiObject
    with a mask image is created per non-empty ROI. ROI labels come from the
    labels file if it exists and matches the map indices; otherwise the
    stringified indices are used.

    :param atlas_name: name of the atlas to add
    :param target_affine: affine handed to ``self._resample``
    :param target_shape: shape handed to ``self._resample``
    :param mask_threshold: map values below this threshold are set to 0;
        no thresholding if None
    """
    # Todo: find solution for multiprocessing spaming
    # print('Adding atlas to library: {} - Shape {} - Affine {} - Threshold {}'.format(atlas_name,
    #                                                                                  target_shape,
    #                                                                                  target_affine,
    #                                                                                  mask_threshold))

    # load atlas object from photon_atlasses
    if atlas_name in self.photon_atlases.keys():
        original_atlas_object = self.photon_atlases[atlas_name]
    else:
        logger.debug("Checking custom atlas")
        original_atlas_object = self._check_custom_atlas(atlas_name)

    # now create new atlas object with different affine, shape and mask_threshold
    atlas_object = AtlasObject(
        name=original_atlas_object.name,
        path=original_atlas_object.path,
        labels_file=original_atlas_object.labels_file,
        mask_threshold=mask_threshold,
        affine=target_affine,
        shape=target_shape,
    )

    # load atlas
    img = image.load_img(atlas_object.path)
    resampled_img = self._resample(img,
                                   target_affine=target_affine,
                                   target_shape=target_shape)
    atlas_object.atlas = resampled_img
    atlas_object.map = np.asarray(atlas_object.atlas.get_data())

    # apply mask threshold
    if mask_threshold is not None:
        atlas_object.map[atlas_object.map < mask_threshold] = 0
        atlas_object.map = atlas_object.map.astype(int)

    # now get indices
    atlas_object.indices = list(np.unique(atlas_object.map))

    def _index_labelled_rois():
        # fallback labelling: every ROI is simply named after its index
        return [
            RoiObject(index=i,
                      label=str(i),
                      size=np.sum(i == atlas_object.map))
            for i in atlas_object.indices
        ]

    # check labels
    if Path(atlas_object.labels_file).is_file():
        # if we have a file with indices and labels
        labels = pd.read_table(atlas_object.labels_file, header=None)
        labels_dict = pd.Series(labels.iloc[:, 1].values,
                                index=labels.iloc[:, 0]).to_dict()

        # check if background has been defined in labels.txt
        if 0 not in labels_dict and 0 in atlas_object.indices:
            # add 0 as background
            labels_dict[0] = "Background"

        # check if map indices correspond with indices in the labels file
        if sorted(atlas_object.indices) != sorted(labels_dict):
            # The indices live on the atlas object, not on the library; the
            # message must read them from there or the fallback below is
            # never reached.
            logger.error("""
            The indices in map image ARE NOT the same as those in your *_labels.txt! Ignoring *_labels.txt.
            MapImage:
            {}
            File:
            {}
            """.format(str(sorted(atlas_object.indices)),
                       str(sorted(labels_dict))))

            atlas_object.roi_list = _index_labelled_rois()
        else:
            for roi_index in atlas_object.indices:
                new_roi = RoiObject(
                    index=roi_index,
                    label=labels_dict[roi_index].replace("\n", ""),
                    size=np.sum(roi_index == atlas_object.map),
                )
                atlas_object.roi_list.append(new_roi)
    else:
        # if we don't have a labels file, we just use str(indices) as labels
        atlas_object.roi_list = _index_labelled_rois()

    # check for empty ROIs and create roi mask
    for roi in atlas_object.roi_list:
        if roi.size == 0:
            continue
        roi.mask = image.new_img_like(atlas_object.path,
                                      atlas_object.map == roi.index)

        # check if roi is empty
        if np.sum(roi.mask.dataobj != 0) == 0:
            roi.is_empty = True

    # finally add atlas to atlas library
    self.library[(atlas_name, str(target_affine), str(target_shape),
                  str(mask_threshold))] = atlas_object
    logger.debug("BrainAtlas: Done adding atlas to library!")
def objective_function(self, current_config):
    """
    Evaluates one hyperparameter configuration in the inner cross validation.

    The configuration is fitted by an InnerFoldManager, appended to
    ``self.result_object.tested_config_list``, compared against the best
    performance seen so far, and reported to the optimizer if that is a
    PhotonSlaveOptimizer.

    :param current_config: configuration to test; None is ignored
    :return: None if ``current_config`` is None; otherwise
        ``1 - validation metric`` when the metric is maximized, else the
        validation metric itself. Failed configs use a metric of -1.
    :raises Exception: if the config did not fail but yields no metrics
    """
    if current_config is None:
        return

    logger.clean_info(
        '---------------------------------------------------------------------------------------------------------------'
    )
    self.tested_config_counter += 1

    pipe_ctor = (self.optimizer.ask_for_pipe()
                 if hasattr(self.optimizer, 'ask_for_pipe')
                 else self.copy_pipe_fnc)

    # self.__distribute_cv_info_to_hyperpipe_children(reset=True, config_counter=tested_config_counter)

    inner_manager = InnerFoldManager(pipe_ctor,
                                     current_config,
                                     self.optimization_info,
                                     self.cross_validaton_info,
                                     self.outer_fold_id,
                                     self.constraint_objects,
                                     cache_folder=self.cache_folder,
                                     cache_updater=self.cache_updater)

    # Test the configuration cross validated by inner_cv object
    current_config_mdb = inner_manager.fit(self._validation_X,
                                           self._validation_y,
                                           **self._validation_kwargs)
    current_config_mdb.config_nr = self.tested_config_counter

    if current_config_mdb.config_failed:
        # NOTE(review): -1 is also returned as score when the metric is
        # minimized -- confirm optimizers treat this as a failure marker.
        config_performance = (-1, -1)
        # Print Result for config
        logger.debug('...failed:')
        logger.error(current_config_mdb.config_error)
    else:
        metric_train = MDBHelper.get_metric(
            current_config_mdb, self.fold_operation,
            self.optimization_info.best_config_metric)
        metric_test = MDBHelper.get_metric(
            current_config_mdb,
            self.fold_operation,
            self.optimization_info.best_config_metric,
            train=False)

        if metric_train is None or metric_test is None:
            raise Exception(
                "Config did not fail, but did not get any metrics either....!!?"
            )
        config_performance = (metric_train, metric_test)

        if self.best_metric_yet is None:
            # first successful config is the best one by definition
            self.best_metric_yet = config_performance
            self.current_best_config = current_config_mdb
        else:
            # check if we have the next superstar around that exceeds any old performance
            best_test_so_far = self.best_metric_yet[1]
            if self.optimization_info.maximize_metric:
                is_new_best = metric_test > best_test_so_far
            else:
                is_new_best = metric_test < best_test_so_far

            if is_new_best:
                # the dethroned config no longer needs its full payload
                self.best_metric_yet = config_performance
                self.current_best_config.save_memory()
                self.current_best_config = current_config_mdb
            else:
                current_config_mdb.save_memory()

        # Print Result for config
        computation_duration = (current_config_mdb.computation_end_time -
                                current_config_mdb.computation_start_time)
        progress = str(self.tested_config_counter) + "/" + self.max_nr_of_configs
        logger.info('Computed configuration ' + progress + " in " +
                    str(computation_duration))

        train_score = "%.4f" % config_performance[0]
        validation_score = "%.4f" % config_performance[1]
        logger.info("Performance: " +
                    self.optimization_info.best_config_metric +
                    " - Train: " + train_score + ", Validation: " +
                    validation_score)

        best_train_score = "%.4f" % self.best_metric_yet[0]
        best_validation_score = "%.4f" % self.best_metric_yet[1]
        logger.info("Best Performance So Far: " +
                    self.optimization_info.best_config_metric +
                    " - Train: " + best_train_score + ", Validation: " +
                    best_validation_score)

    # add config to result tree
    self.result_object.tested_config_list.append(current_config_mdb)

    # 3. inform optimizer about performance
    logger.debug(
        "Telling hyperparameter optimizer about recent performance.")
    if isinstance(self.optimizer, PhotonSlaveOptimizer):
        self.optimizer.tell(current_config, config_performance)
    logger.debug("Asking hyperparameter optimizer for new config.")

    validation_metric = config_performance[1]
    return (1 - validation_metric
            if self.optimization_info.maximize_metric
            else validation_metric)