def test_split_join_resorting(self):
    X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    y = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2])
    kwargs = {"test": np.array([-1, -2, -3, -4, -5, -6, -7, -8, -9, -10])}

    X_new, y_new, kwargs_new = list(), list(), dict()

    # first randomly split the data and append them to X_new, y_new, kwargs_new
    idx_list_one, idx_list_two = list(), list()
    for idx in range(len(X)):
        if bool(random.getrandbits(1)):
            idx_list_one.append(idx)
        else:
            idx_list_two.append(idx)

    for ilist in [idx_list_two, idx_list_one]:
        for idx in ilist:
            X_batched, y_batched, kwargs_batched = PhotonDataHelper.split_data(
                X, y, kwargs, idx, idx)

            # test if batching works
            self.assertEqual(X_batched, X[idx])
            self.assertEqual(y_batched, y[idx])
            self.assertDictEqual(kwargs_batched, {"test": [kwargs["test"][idx]]})

            # then join again
            X_new, y_new, kwargs_new = PhotonDataHelper.join_data(
                X_new, X_batched, y_new, y_batched, kwargs_new, kwargs_batched)

    # test if joining works
    joined_idx = PhotonDataHelper.stack_data_vertically(idx_list_two, idx_list_one)
    self.assertTrue(np.array_equal(X_new, X[joined_idx]))
    self.assertTrue(np.array_equal(y_new, y[joined_idx]))
    self.assertTrue(np.array_equal(kwargs_new["test"], kwargs["test"][joined_idx]))

    # now resort and see if that works too
    X_resorted, y_resorted, kwargs_resorted = PhotonDataHelper.resort_splitted_data(
        X_new, y_new, kwargs_new, joined_idx)
    self.assertTrue(np.array_equal(X_resorted, X))
    self.assertTrue(np.array_equal(y_resorted, y))
    self.assertListEqual(list(kwargs_resorted.keys()), list(kwargs.keys()))
    self.assertTrue(np.array_equal(kwargs_resorted["test"], kwargs["test"]))
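# For reference, the resorting step exercised above can be pictured as an
# inverse-permutation lookup. A minimal, self-contained sketch of that idea for
# plain numpy arrays -- it mirrors, but is not, PhotonDataHelper.resort_splitted_data,
# and resort_by_index is an illustrative name of our own:

import numpy as np

def resort_by_index(data, joined_idx):
    # joined_idx[i] holds the original position of row i in `data`,
    # so indexing with the inverse permutation restores the original order
    return np.asarray(data)[np.argsort(joined_idx)]

# usage: rows joined in shuffled batch order are put back where they belong
_X = np.array([10, 20, 30, 40])
_joined_idx = [2, 0, 3, 1]  # batch order in which the rows were joined
assert np.array_equal(resort_by_index(_X[_joined_idx], _joined_idx), _X)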
def load_or_save_cached_data(self, name, X, y, kwargs, transformer, fit=False,
                             needed_for_further_computation=False, initial_X=None):
    if not self.single_subject_caching:
        # if we do it group-wise, then it's easy
        if self.skip_loading and not needed_for_further_computation:
            # check if the data has already been calculated
            if self.cache_man.check_cache(name):
                # if so, do nothing
                return X, y, kwargs
            else:
                # otherwise, do the calculation and save it
                cached_result = None
        else:
            start_time_for_loading = datetime.datetime.now()
            cached_result = self.cache_man.load_cached_data(name)

        if cached_result is None:
            X, y, kwargs = self._do_timed_fit_transform(name, transformer, fit, X, y, **kwargs)

            start_time_saving = datetime.datetime.now()
            self.cache_man.save_data_to_cache(name, (X, y, kwargs))
            saving_duration = (datetime.datetime.now() - start_time_saving).total_seconds()
            self.time_monitor['transform_cached'].append((name, saving_duration, 1))
        else:
            X, y, kwargs = cached_result[0], cached_result[1], cached_result[2]
            loading_duration = (datetime.datetime.now() - start_time_for_loading).total_seconds()
            n = PhotonDataHelper.find_n(X)
            self.time_monitor['transform_cached'].append((name, loading_duration, n))
        return X, y, kwargs
    else:
        # if we do it subject-wise, we need to iterate and collect the results
        processed_X, processed_y, processed_kwargs = list(), list(), dict()

        X_uncached, y_uncached, kwargs_uncached, initial_X_uncached = list(), list(), dict(), list()
        list_of_idx_cached, list_of_idx_non_cached = list(), list()

        nr = PhotonDataHelper.find_n(X)
        for start, stop in PhotonDataHelper.chunker(nr, 1):
            # split the data into single entities; the key comes from the first element = PATH to the file
            X_key, _, _ = PhotonDataHelper.split_data(initial_X, None, {}, start, stop)
            X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                X, y, kwargs, start, stop)

            self.cache_man.update_single_subject_state_info(X_key)

            # check if this item has already been processed
            if self.cache_man.check_cache(name):
                list_of_idx_cached.append(start)
            else:
                list_of_idx_non_cached.append(start)
                X_uncached = PhotonDataHelper.stack_data_vertically(X_uncached, X_batched)
                y_uncached = PhotonDataHelper.stack_data_vertically(y_uncached, y_batched)
                initial_X_uncached = PhotonDataHelper.stack_data_vertically(initial_X_uncached, X_key)
                kwargs_uncached = PhotonDataHelper.join_dictionaries(kwargs_uncached, kwargs_dict_batched)

        # now we know which part can be loaded and which part has to be transformed:
        # first apply the transformation to the whole uncached group, then save it single-subject-wise
        if len(list_of_idx_non_cached) > 0:

            # apply the transformation group-wise
            new_group_X, new_group_y, new_group_kwargs = self._do_timed_fit_transform(
                name, transformer, fit, X_uncached, y_uncached, **kwargs_uncached)

            # then save each item individually
            nr = PhotonDataHelper.find_n(new_group_X)
            for start in range(nr):
                # split the data into single entities
                X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                    new_group_X, new_group_y, new_group_kwargs, start, start)
                X_key, _, _ = PhotonDataHelper.split_data(initial_X_uncached, None, {}, start, start)

                # we save the data in relation to the input path (X_key = hash(input X))
                self.cache_man.update_single_subject_state_info(X_key)

                start_time_saving = datetime.datetime.now()
                self.cache_man.save_data_to_cache(name, (X_batched, y_batched, kwargs_dict_batched))
                saving_duration = (datetime.datetime.now() - start_time_saving).total_seconds()
                self.time_monitor['transform_cached'].append((name, saving_duration, 1))

            # we only need to collect the data when we want to load it; we can skip that
            # step if we only want it to end up in the cache (case: parallelisation)
            if not self.skip_loading or needed_for_further_computation:
                # stack the results
                processed_X, processed_y, processed_kwargs = new_group_X, new_group_y, new_group_kwargs

        # afterwards, load everything that has been cached
        if len(list_of_idx_cached) > 0:
            if not self.skip_loading or needed_for_further_computation:
                for cache_idx in list_of_idx_cached:
                    # we identify the data by its input path (X before any transformation)
                    self.cache_man.update_single_subject_state_info([initial_X[cache_idx]])

                    # time the loading of the cached item
                    start_time_for_loading = datetime.datetime.now()
                    transformed_X, transformed_y, transformed_kwargs = self.cache_man.load_cached_data(name)
                    loading_duration = (datetime.datetime.now() - start_time_for_loading).total_seconds()
                    self.time_monitor['transform_cached'].append(
                        (name, loading_duration, PhotonDataHelper.find_n(X)))

                    processed_X, processed_y, processed_kwargs = PhotonDataHelper.join_data(
                        processed_X, transformed_X, processed_y, transformed_y,
                        processed_kwargs, transformed_kwargs)

        logger.debug(name + " loaded " + str(len(list_of_idx_cached)) + " items from cache and computed "
                     + str(len(list_of_idx_non_cached)))

        if not self.skip_loading or needed_for_further_computation:
            # now sort the data back into its original order; the uncached results were
            # collected first, so their indices come first in the joined index list
            processed_X, processed_y, processed_kwargs = PhotonDataHelper.resort_splitted_data(
                processed_X, processed_y, processed_kwargs,
                PhotonDataHelper.stack_data_vertically(list_of_idx_non_cached, list_of_idx_cached))

        return processed_X, processed_y, processed_kwargs
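# The subject-wise branch above boils down to a common pattern: partition items
# into cached and uncached, transform the uncached ones in a single batch, cache
# them individually, then merge and restore the input order. A condensed sketch
# of that pattern using a plain dict as the cache and a hypothetical batch
# transform -- the names below are illustrative stand-ins, not PHOTON's API:

def transform_with_item_cache(items, transform_batch, cache):
    cached_idx, uncached_idx = [], []
    for i, item in enumerate(items):
        (cached_idx if item in cache else uncached_idx).append(i)

    # transform all uncached items in one batch call, then cache each result
    new_results = transform_batch([items[i] for i in uncached_idx]) if uncached_idx else []
    for i, result in zip(uncached_idx, new_results):
        cache[items[i]] = result

    # collect results in [uncached..., cached...] order, then resort to input order
    collected = list(new_results) + [cache[items[i]] for i in cached_idx]
    resorted = [None] * len(items)
    for pos, result in zip(uncached_idx + cached_idx, collected):
        resorted[pos] = result
    return resorted

# usage: only "b" and "d" are recomputed, "a" and "c" come from the cache
_cache = {"a": "A", "c": "C"}
assert transform_with_item_cache(["a", "b", "c", "d"],
                                 lambda xs: [x.upper() for x in xs],
                                 _cache) == ["A", "B", "C", "D"]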