def apply_transform_parallelized(self, X):
    """
    Apply copies of this pipeline to chunks of X, one dask job per chunk.

    Nothing is done unless ``self.nr_of_processes`` is greater than 1.
    Otherwise X is cut into chunks of ``ceil(n / nr_of_processes)`` items,
    a fresh copy of the pipeline is prepared for every chunk and all jobs
    are handed to ``dask.compute``. The computed results are not collected
    here; the method always returns None.

    :param X: the data to which the delegate should be applied in parallel
    """
    if not self.nr_of_processes > 1:
        return

    # distribute the data equally to all available cores
    total_items = PhotonDataHelper.find_n(X)
    items_per_core = int(np.ceil(total_items / self.nr_of_processes))

    message = ("NeuroBranch " + self.name + ": Using " + str(self.nr_of_processes)
               + " cores calculating " + str(items_per_core) + " items each")
    logger.info(message)

    delayed_jobs = []
    for first, last in PhotonDataHelper.chunker(total_items, items_per_core):
        chunk_X, _, _ = PhotonDataHelper.split_data(X, None, {}, first, last)

        # every job gets its own copy of the pipeline, pointed at the same
        # cache folder as the original
        branch_copy = self.copy_me()
        pipe_copy = branch_copy.base_element
        pipe_copy.cache_folder = self.base_element.cache_folder
        pipe_copy.skip_loading = True
        pipe_copy._parallel_use = True

        delayed_jobs.append(
            dask.delayed(NeuroBranch.parallel_application)(pipe_copy, chunk_X))

    dask.compute(*delayed_jobs)
def load_or_save_cached_data(self, name, X, y, kwargs, transformer, fit=False,
                             needed_for_further_computation=False, initial_X=None):
    """
    Run a transformation through the cache: load what is cached, compute and store the rest.

    Group-wise mode (``self.single_subject_caching`` is falsy): the whole
    data set is one cache entry stored under ``name``. If ``self.skip_loading``
    is set and the result is not needed further, an existing entry causes the
    inputs to be returned unchanged; otherwise the entry is loaded, or - when
    there is none - computed via ``_do_timed_fit_transform`` and saved.

    Subject-wise mode: every single item is looked up separately, using the
    matching item of ``initial_X`` as state info for the cache manager. All
    items without a cache entry are transformed together and then saved one
    by one. Cached and freshly computed items are only collected and
    re-sorted if ``self.skip_loading`` is falsy or
    ``needed_for_further_computation`` is true; otherwise empty containers
    (``[]``, ``[]``, ``{}``) are returned.

    Every save and load appends a ``(name, seconds, n)`` tuple to
    ``self.time_monitor['transform_cached']``.

    NOTE(review): in subject-wise mode the results are accumulated as
    computed-then-cached, while the index list given to
    ``resort_splitted_data`` is stacked cached-then-computed. Confirm against
    the PhotonDataHelper argument order that these two sequences line up.

    :param name: identifier passed to the cache manager and the time monitor
    :param X: data to transform
    :param y: targets belonging to X
    :param kwargs: dict forwarded as keyword arguments to the transformation
    :param transformer: passed on to ``_do_timed_fit_transform``
    :param fit: passed on to ``_do_timed_fit_transform``
    :param needed_for_further_computation: collect/load results even if
        ``self.skip_loading`` is set
    :param initial_X: only read in subject-wise mode, where its items
        identify the single cache entries
    :return: tuple ``(X, y, kwargs)``
    """
    if not self.single_subject_caching:
        # group-wise caching: one cache entry for the complete data
        if self.skip_loading and not needed_for_further_computation:
            if self.cache_man.check_cache(name):
                # already calculated and nobody needs the values: do nothing
                return X, y, kwargs
            # not calculated yet: fall through to compute and save
            cached = None
        else:
            load_started = datetime.datetime.now()
            cached = self.cache_man.load_cached_data(name)

        if cached is not None:
            X, y, kwargs = cached[0], cached[1], cached[2]
            load_seconds = (datetime.datetime.now() - load_started).total_seconds()
            n_loaded = PhotonDataHelper.find_n(X)
            self.time_monitor['transform_cached'].append((name, load_seconds, n_loaded))
            return X, y, kwargs

        X, y, kwargs = self._do_timed_fit_transform(name, transformer, fit, X, y, **kwargs)

        save_started = datetime.datetime.now()
        self.cache_man.save_data_to_cache(name, (X, y, kwargs))
        save_seconds = (datetime.datetime.now() - save_started).total_seconds()
        self.time_monitor['transform_cached'].append((name, save_seconds, 1))
        return X, y, kwargs

    # subject-wise caching: iterate over the single items and collect the results
    result_X, result_y, result_kwargs = [], [], {}
    pending_X, pending_y, pending_kwargs, pending_keys = [], [], {}, []
    cached_positions, uncached_positions = [], []

    # results are only gathered when somebody wants them; when the goal is
    # merely to fill the cache (case: parallelisation) collecting is skipped
    collect_results = not self.skip_loading or needed_for_further_computation

    n_items = PhotonDataHelper.find_n(X)
    for first, last in PhotonDataHelper.chunker(n_items, 1):
        # split data in single entities, the key comes from the untransformed input
        item_key, _, _ = PhotonDataHelper.split_data(initial_X, None, {}, first, last)
        item_X, item_y, item_kwargs = PhotonDataHelper.split_data(X, y, kwargs, first, last)
        self.cache_man.update_single_subject_state_info(item_key)

        # has this item been processed before?
        if self.cache_man.check_cache(name):
            cached_positions.append(first)
            continue

        uncached_positions.append(first)
        pending_X = PhotonDataHelper.stack_data_vertically(pending_X, item_X)
        pending_y = PhotonDataHelper.stack_data_vertically(pending_y, item_y)
        pending_keys = PhotonDataHelper.stack_data_vertically(pending_keys, item_key)
        pending_kwargs = PhotonDataHelper.join_dictionaries(pending_kwargs, item_kwargs)

    # transform everything that is missing as one group, then save it item by item
    if uncached_positions:
        fresh_X, fresh_y, fresh_kwargs = self._do_timed_fit_transform(
            name, transformer, fit, pending_X, pending_y, **pending_kwargs)

        n_fresh = PhotonDataHelper.find_n(fresh_X)
        for position in range(n_fresh):
            item_X, item_y, item_kwargs = PhotonDataHelper.split_data(
                fresh_X, fresh_y, fresh_kwargs, position, position)
            item_key, _, _ = PhotonDataHelper.split_data(
                pending_keys, None, {}, position, position)

            # the entry is stored in relation to the input item
            self.cache_man.update_single_subject_state_info(item_key)
            save_started = datetime.datetime.now()
            self.cache_man.save_data_to_cache(name, (item_X, item_y, item_kwargs))
            save_seconds = (datetime.datetime.now() - save_started).total_seconds()
            self.time_monitor['transform_cached'].append((name, save_seconds, 1))

        if collect_results:
            result_X, result_y, result_kwargs = fresh_X, fresh_y, fresh_kwargs

    # afterwards load everything that has been cached
    if cached_positions and collect_results:
        for position in cached_positions:
            # cached items are identified by the input before any transformation
            self.cache_man.update_single_subject_state_info([initial_X[position]])

            load_started = datetime.datetime.now()
            loaded_X, loaded_y, loaded_kwargs = self.cache_man.load_cached_data(name)
            load_seconds = (datetime.datetime.now() - load_started).total_seconds()
            self.time_monitor['transform_cached'].append(
                (name, load_seconds, PhotonDataHelper.find_n(X)))

            result_X, result_y, result_kwargs = PhotonDataHelper.join_data(
                result_X, loaded_X, result_y, loaded_y, result_kwargs, loaded_kwargs)

    logger.debug(name + " loaded " + str(len(cached_positions))
                 + " items from cache and computed " + str(len(uncached_positions)))

    if collect_results:
        # bring the collected data back into the original order
        position_order = PhotonDataHelper.stack_data_vertically(
            cached_positions, uncached_positions)
        result_X, result_y, result_kwargs = PhotonDataHelper.resort_splitted_data(
            result_X, result_y, result_kwargs, position_order)

    return result_X, result_y, result_kwargs