def apply_transform_parallelized(self, X):
    """
    Applies the delegate pipeline to X in parallel across self.nr_of_processes dask jobs.

    :param X: the data to which the delegate should be applied in parallel
    """
    if self.nr_of_processes > 1:
        jobs_to_do = list()

        # distribute the data equally to all available cores
        number_of_items_to_process = PhotonDataHelper.find_n(X)
        number_of_items_for_each_core = int(np.ceil(number_of_items_to_process / self.nr_of_processes))
        logger.info("NeuroBranch " + self.name + ": Using " + str(self.nr_of_processes)
                    + " cores calculating " + str(number_of_items_for_each_core) + " items each")

        for start, stop in PhotonDataHelper.chunker(number_of_items_to_process, number_of_items_for_each_core):
            X_batched, _, _ = PhotonDataHelper.split_data(X, None, {}, start, stop)

            # copy my pipeline
            new_pipe_mr = self.copy_me()
            new_pipe_copy = new_pipe_mr.base_element
            new_pipe_copy.cache_folder = self.base_element.cache_folder
            new_pipe_copy.skip_loading = True
            new_pipe_copy._parallel_use = True

            del_job = dask.delayed(NeuroBranch.parallel_application)(new_pipe_copy, X_batched)
            jobs_to_do.append(del_job)

        dask.compute(*jobs_to_do)
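# A minimal usage sketch for the parallel path above (illustrative names; assumes a
# photonai NeuroBranch with a registered neuro transformer and a working dask setup):
#
#   branch = NeuroBranch('preproc', nr_of_processes=4)
#   branch += PipelineElement('SmoothImages', fwhm=6)   # element name is an assumption
#   branch.base_element.cache_folder = './neuro_cache'
#   branch.apply_transform_parallelized(nifti_paths)    # warms the cache in parallel
#
# Because each delayed job runs with skip_loading=True, the workers only write their
# results to the shared cache; a subsequent serial transform() loads them back.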
def predict(self, X, training=False, **kwargs):
    """
    Transforms the data for every step that offers a transform function
    and then calls predict on the final estimator with the transformed data.
    It returns the predictions made.

    In case the last step is not an estimator, it returns the transformed data.
    """

    # first transform
    if not training:
        X, _, kwargs = self.transform(X, y=None, **kwargs)

    # then call predict on the final estimator
    if self._final_estimator is not None:
        if self._final_estimator.is_estimator:
            logger.debug('PhotonPipeline: Predicting with ' + self._final_estimator.name + ' ...')
            predict_start_time = datetime.datetime.now()
            y_pred = self._final_estimator.predict(X, **kwargs)
            predict_duration = (datetime.datetime.now() - predict_start_time).total_seconds()
            n = PhotonDataHelper.find_n(X)
            self.time_monitor['predict'].append((self.elements[-1][0], predict_duration, n))
            return y_pred
        else:
            return X
    else:
        return None
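# Sketch of how the bookkeeping above can be inspected after prediction (assumes a
# fitted PhotonPipeline named `pipe`; the element name 'svc' is illustrative):
#
#   y_hat = pipe.predict(X_test)
#   # each entry is (final element name, duration in seconds, n items), e.g. ('svc', 0.004, 120)
#   total_predict_seconds = sum(duration for _, duration, _ in pipe.time_monitor['predict'])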
def fit(self, X, y=None, **kwargs):

    self._validate_elements()
    X, y, kwargs = self._caching_fit_transform(X, y, kwargs, fit=True)

    if self._final_estimator is not None:
        logger.debug("PhotonPipeline: Fitting " + self._final_estimator.name)
        fit_start_time = datetime.datetime.now()
        if self.random_state:
            self._final_estimator.random_state = self.random_state

        self._final_estimator.fit(X, y, **kwargs)

        # todo: perform post-fit actions of the final estimator here
        n = PhotonDataHelper.find_n(X)
        fit_duration = (datetime.datetime.now() - fit_start_time).total_seconds()
        self.time_monitor["fit"].append((self.elements[-1][0], fit_duration, n))

    return self
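# Sketch: reading out the fit timings collected above (assumes `pipe` is a fitted
# PhotonPipeline; the printed names depend on the configured elements):
#
#   pipe.fit(X_train, y_train)
#   for element_name, duration, n_items in pipe.time_monitor['fit']:
#       print(f"{element_name}: fitted on {n_items} items in {duration:.3f}s")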
def _do_timed_fit_transform(self, name, transformer, fit, X, y, **kwargs):

    n = PhotonDataHelper.find_n(X)
    if self.random_state:
        transformer.random_state = self.random_state

    if fit:
        logger.debug("PhotonPipeline: Fitting " + transformer.name)
        fit_start_time = datetime.datetime.now()
        transformer.fit(X, y, **kwargs)
        fit_duration = (datetime.datetime.now() - fit_start_time).total_seconds()
        self.time_monitor["fit"].append((name, fit_duration, n))

    logger.debug("PhotonPipeline: Transforming data with " + transformer.name)
    transform_start_time = datetime.datetime.now()
    X, y, kwargs = transformer.transform(X, y, **kwargs)
    transform_duration = (datetime.datetime.now() - transform_start_time).total_seconds()
    self.time_monitor["transform_computed"].append((name, transform_duration, n))
    return X, y, kwargs
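# The timing pattern used above, reduced to a self-contained sketch (the helper
# below is illustrative and not part of the photonai API):
#
#   import datetime
#
#   def timed_call(func, *args, **kwargs):
#       """Run func and return its result together with the elapsed wall time in seconds."""
#       start = datetime.datetime.now()
#       result = func(*args, **kwargs)
#       return result, (datetime.datetime.now() - start).total_seconds()
#
#   (X_t, y_t, kwargs_t), seconds = timed_call(transformer.transform, X, y)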
def load_or_save_cached_data(self, name, X, y, kwargs, transformer, fit=False,
                             needed_for_further_computation=False, initial_X=None):
    if not self.single_subject_caching:
        # if we do it group-wise then it's easy: one cache lookup serves the whole group
        if self.skip_loading and not needed_for_further_computation:
            # check if the data has already been calculated
            if self.cache_man.check_cache(name):
                # if so, do nothing
                return X, y, kwargs
            else:
                # otherwise, do the calculation and save it
                cached_result = None
        else:
            start_time_for_loading = datetime.datetime.now()
            cached_result = self.cache_man.load_cached_data(name)

        if cached_result is None:
            X, y, kwargs = self._do_timed_fit_transform(name, transformer, fit, X, y, **kwargs)

            start_time_saving = datetime.datetime.now()
            self.cache_man.save_data_to_cache(name, (X, y, kwargs))
            saving_duration = (datetime.datetime.now() - start_time_saving).total_seconds()
            self.time_monitor['transform_cached'].append((name, saving_duration, 1))
        else:
            X, y, kwargs = cached_result[0], cached_result[1], cached_result[2]
            loading_duration = (datetime.datetime.now() - start_time_for_loading).total_seconds()
            n = PhotonDataHelper.find_n(X)
            self.time_monitor['transform_cached'].append((name, loading_duration, n))
        return X, y, kwargs
    else:
        # if we do it subject-wise, we need to iterate over single items and collect the results
        processed_X, processed_y, processed_kwargs = list(), list(), dict()
        X_uncached, y_uncached, kwargs_uncached, initial_X_uncached = list(), list(), dict(), list()
        list_of_idx_cached, list_of_idx_non_cached = list(), list()

        nr = PhotonDataHelper.find_n(X)
        for start, stop in PhotonDataHelper.chunker(nr, 1):
            # split the data into single entities; the cache key comes from the first element (= path to the file)
            X_key, _, _ = PhotonDataHelper.split_data(initial_X, None, {}, start, stop)
            X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(X, y, kwargs, start, stop)

            self.cache_man.update_single_subject_state_info(X_key)

            # check if this item has already been processed
            if self.cache_man.check_cache(name):
                list_of_idx_cached.append(start)
            else:
                list_of_idx_non_cached.append(start)
                X_uncached = PhotonDataHelper.stack_data_vertically(X_uncached, X_batched)
                y_uncached = PhotonDataHelper.stack_data_vertically(y_uncached, y_batched)
                initial_X_uncached = PhotonDataHelper.stack_data_vertically(initial_X_uncached, X_key)
                kwargs_uncached = PhotonDataHelper.join_dictionaries(kwargs_uncached, kwargs_dict_batched)

        # now we know which part can be loaded and which part has to be transformed:
        # first apply the transformation to the uncached group, then save the results single-subject-wise
        if len(list_of_idx_non_cached) > 0:

            # apply the transformation group-wise
            new_group_X, new_group_y, new_group_kwargs = self._do_timed_fit_transform(
                name, transformer, fit, X_uncached, y_uncached, **kwargs_uncached)

            # then save the results one subject at a time
            nr = PhotonDataHelper.find_n(new_group_X)
            for start in range(nr):
                # split the data into single entities
                X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                    new_group_X, new_group_y, new_group_kwargs, start, start)
                X_key, _, _ = PhotonDataHelper.split_data(initial_X_uncached, None, {}, start, start)

                # we save the data in relation to the input path (X_key = hash(input X))
                self.cache_man.update_single_subject_state_info(X_key)

                start_time_saving = datetime.datetime.now()
                self.cache_man.save_data_to_cache(name, (X_batched, y_batched, kwargs_dict_batched))
                saving_duration = (datetime.datetime.now() - start_time_saving).total_seconds()
                self.time_monitor['transform_cached'].append((name, saving_duration, 1))

            # we need to collect the data only when we want to load it;
            # we can skip that step if the results only need to reach the cache (case: parallelisation)
            if not self.skip_loading or needed_for_further_computation:
                # stack results
                processed_X, processed_y, processed_kwargs = new_group_X, new_group_y, new_group_kwargs

        # afterwards, load everything that has been cached
        if len(list_of_idx_cached) > 0:
            if not self.skip_loading or needed_for_further_computation:
                for cache_idx in list_of_idx_cached:
                    # we identify the data according to the input path (X before any transformation)
                    self.cache_man.update_single_subject_state_info([initial_X[cache_idx]])

                    # time the loading of the cached item
                    start_time_for_loading = datetime.datetime.now()
                    transformed_X, transformed_y, transformed_kwargs = self.cache_man.load_cached_data(name)
                    loading_duration = (datetime.datetime.now() - start_time_for_loading).total_seconds()
                    self.time_monitor['transform_cached'].append(
                        (name, loading_duration, PhotonDataHelper.find_n(X)))

                    processed_X, processed_y, processed_kwargs = PhotonDataHelper.join_data(
                        processed_X, transformed_X,
                        processed_y, transformed_y,
                        processed_kwargs, transformed_kwargs)

        logger.debug(name + " loaded " + str(len(list_of_idx_cached)) + " items from cache and computed "
                     + str(len(list_of_idx_non_cached)))

        if not self.skip_loading or needed_for_further_computation:
            # now sort the data into its original order again; the index list mirrors
            # the order in which the items were joined: computed first, then cached
            processed_X, processed_y, processed_kwargs = PhotonDataHelper.resort_splitted_data(
                processed_X, processed_y, processed_kwargs,
                PhotonDataHelper.stack_data_vertically(list_of_idx_non_cached, list_of_idx_cached))

        return processed_X, processed_y, processed_kwargs
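# Worked example of the final re-sort (assumes resort_splitted_data argsorts the
# supplied index vector; indices and values are illustrative): with subjects 0 and 2
# cached and subject 1 freshly computed, processed_X is joined as [x1, x0, x2] and
# the index vector is stack([1], [0, 2]) = [1, 0, 2]; argsort([1, 0, 2]) = [1, 0, 2],
# which restores the original subject order [x0, x1, x2].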