def test_concatenate_dict(self):
    dict_a = {
        "variable_one": np.random.randn(10),
        "variable_two": np.random.randn(15),
    }
    dict_b = {
        "variable_one": np.random.randn(20),
        "variable_two": np.random.randn(20),
    }
    dict_c = {
        "variable_one": np.random.randn(10, 10),
        "variable_two": np.random.randn(15, 15),
    }
    dict_d = {
        "variable_one": np.random.randn(20, 10),
        "variable_two": np.random.randn(20, 15),
    }
    dict_e = {}

    dict_a_b = PhotonDataHelper.join_dictionaries(dict_a, dict_b)
    dict_c_d = PhotonDataHelper.join_dictionaries(dict_c, dict_d)
    dict_e_a = PhotonDataHelper.join_dictionaries(dict_e, dict_a)

    self.assertEqual(len(dict_a_b["variable_one"]), 30)
    self.assertEqual(len(dict_a_b["variable_two"]), 35)
    self.assertEqual(dict_c_d["variable_one"].shape, (30, 10))
    self.assertEqual(dict_c_d["variable_two"].shape, (35, 15))
    self.assertEqual(len(dict_e_a["variable_one"]), 10)
    self.assertEqual(len(dict_e_a["variable_two"]), 15)
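
# The behaviour the assertions above rely on can be summarised in a minimal sketch.
# NOTE: this is not the actual PhotonDataHelper.join_dictionaries implementation, only an
# illustration of the contract the test encodes: per-key concatenation along axis 0, with
# an empty dictionary acting as a neutral element. The name join_dictionaries_sketch and
# the re-import of numpy exist only to keep the example self-contained.
import numpy as np


def join_dictionaries_sketch(dict_a: dict, dict_b: dict) -> dict:
    # an empty dictionary on either side leaves the other one unchanged
    if not dict_a:
        return dict(dict_b)
    if not dict_b:
        return dict(dict_a)
    # otherwise concatenate the arrays per key along the sample axis (axis 0)
    return {key: np.concatenate([dict_a[key], dict_b[key]], axis=0) for key in dict_a}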
def transform(self, X, y=None, **kwargs):
    """
    Generates "new samples" by computing the mean between all (or up to draw_limit) pairs of
    existing samples and appends them to X. The target for each new sample is computed as the
    mean of the constituent targets.

    The pairing is controlled by the instance attributes ``draw_limit`` (in case the full
    number of combinations is > 10k, how many pairs to draw) and ``random_state`` (seed for
    the random sampling of combinations, for reproducibility only).

    :param X: data
    :param y: targets (optional)
    :return: X_new: X and X_augmented; (y_new: the corresponding targets)
    """
    logger.debug("Pairing " + str(self.draw_limit) + " samples...")

    # ensure class balance in the training set if balance_classes is True
    unique_classes = np.unique(y)
    n_pairs = list()
    for label in unique_classes:
        if self.balance_classes:
            n_pairs.append(self.draw_limit - np.sum(y == label))
        else:
            n_pairs.append(self.draw_limit)

    # run get_samples for each class independently
    X_extended = list()
    y_extended = list()
    kwargs_extended = dict()

    for label, limit in zip(unique_classes, n_pairs):
        X_new_class, y_new_class, kwargs_new_class = self._return_samples(
            X[y == label],
            y[y == label],
            PhotonDataHelper.index_dict(kwargs, y == label),
            generator=self.generator,
            distance_metric=self.distance_metric,
            draw_limit=limit,
            rand_seed=self.random_state,
        )
        X_extended.extend(X_new_class)
        y_extended.extend(y_new_class)

        # get the corresponding kwargs
        if kwargs:
            kwargs_extended = PhotonDataHelper.join_dictionaries(
                kwargs_extended, kwargs_new_class)

    return X_extended, y_extended, kwargs_extended
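
# As a rough illustration of what the pairing in the docstring produces, independent of the
# actual generator and distance-metric logic inside _return_samples, a simplified and
# purely hypothetical per-class augmentation could look like the sketch below. It assumes
# X is a 2-D numpy array of samples of one class and y the matching 1-D target vector;
# the function name and signature are illustrative, not part of the library.
import numpy as np
from itertools import combinations


def pairwise_mean_augment(X, y, draw_limit, rand_seed=None):
    # enumerate all sample pairs and, if there are more than draw_limit,
    # randomly draw draw_limit of them (seeded for reproducibility)
    rng = np.random.default_rng(rand_seed)
    pairs = list(combinations(range(len(X)), 2))
    if len(pairs) > draw_limit:
        idx = rng.choice(len(pairs), size=draw_limit, replace=False)
        pairs = [pairs[i] for i in idx]
    # each new sample is the element-wise mean of the two originals;
    # the new target is the mean of the constituent targets
    X_new = np.array([(X[i] + X[j]) / 2.0 for i, j in pairs])
    y_new = np.array([(y[i] + y[j]) / 2.0 for i, j in pairs])
    return X_new, y_new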
def load_or_save_cached_data(self, name, X, y, kwargs, transformer, fit=False,
                             needed_for_further_computation=False, initial_X=None):
    if not self.single_subject_caching:
        # if we cache group-wise, a single cache lookup is enough
        if self.skip_loading and not needed_for_further_computation:
            # check if the data has already been calculated
            if self.cache_man.check_cache(name):
                # if so, do nothing
                return X, y, kwargs
            else:
                # otherwise, do the calculation and save it
                cached_result = None
        else:
            start_time_for_loading = datetime.datetime.now()
            cached_result = self.cache_man.load_cached_data(name)

        if cached_result is None:
            X, y, kwargs = self._do_timed_fit_transform(name, transformer, fit, X, y, **kwargs)

            start_time_saving = datetime.datetime.now()
            self.cache_man.save_data_to_cache(name, (X, y, kwargs))
            saving_duration = (datetime.datetime.now() - start_time_saving).total_seconds()
            self.time_monitor['transform_cached'].append((name, saving_duration, 1))
        else:
            X, y, kwargs = cached_result[0], cached_result[1], cached_result[2]
            loading_duration = (datetime.datetime.now() - start_time_for_loading).total_seconds()
            n = PhotonDataHelper.find_n(X)
            self.time_monitor['transform_cached'].append((name, loading_duration, n))

        return X, y, kwargs
    else:
        # if we cache subject-wise, we need to iterate and collect the results
        processed_X, processed_y, processed_kwargs = list(), list(), dict()
        X_uncached, y_uncached, kwargs_uncached, initial_X_uncached = list(), list(), dict(), list()
        list_of_idx_cached, list_of_idx_non_cached = list(), list()

        nr = PhotonDataHelper.find_n(X)
        for start, stop in PhotonDataHelper.chunker(nr, 1):
            # split the data into single entities; the key is derived from the first element (= PATH to file)
            X_key, _, _ = PhotonDataHelper.split_data(initial_X, None, {}, start, stop)
            X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(X, y, kwargs, start, stop)

            self.cache_man.update_single_subject_state_info(X_key)

            # check if this item has already been processed
            if self.cache_man.check_cache(name):
                list_of_idx_cached.append(start)
            else:
                list_of_idx_non_cached.append(start)
                X_uncached = PhotonDataHelper.stack_data_vertically(X_uncached, X_batched)
                y_uncached = PhotonDataHelper.stack_data_vertically(y_uncached, y_batched)
                initial_X_uncached = PhotonDataHelper.stack_data_vertically(initial_X_uncached, X_key)
                kwargs_uncached = PhotonDataHelper.join_dictionaries(kwargs_uncached, kwargs_dict_batched)

        # now we know which part can be loaded from the cache and which part has to be transformed:
        # first apply the transformation to the uncached group, then save it single-subject-wise
        if len(list_of_idx_non_cached) > 0:

            # apply the transformation group-wise
            new_group_X, new_group_y, new_group_kwargs = self._do_timed_fit_transform(
                name, transformer, fit, X_uncached, y_uncached, **kwargs_uncached)

            # then save each item individually
            nr = PhotonDataHelper.find_n(new_group_X)
            for start in range(nr):
                # split the data into single entities
                X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                    new_group_X, new_group_y, new_group_kwargs, start, start)
                X_key, _, _ = PhotonDataHelper.split_data(initial_X_uncached, None, {}, start, start)

                # we save the data in relation to the input path (X_key = hash(input X))
                self.cache_man.update_single_subject_state_info(X_key)

                start_time_saving = datetime.datetime.now()
                self.cache_man.save_data_to_cache(name, (X_batched, y_batched, kwargs_dict_batched))
                saving_duration = (datetime.datetime.now() - start_time_saving).total_seconds()
                self.time_monitor['transform_cached'].append((name, saving_duration, 1))

            # we only need to collect the data when we actually want to load it;
            # we can skip that step if we only want it to end up in the cache (case: parallelisation)
            if not self.skip_loading or needed_for_further_computation:
                # stack results
                processed_X, processed_y, processed_kwargs = new_group_X, new_group_y, new_group_kwargs

        # afterwards load everything that has been cached
        if len(list_of_idx_cached) > 0:
            if not self.skip_loading or needed_for_further_computation:
                for cache_idx in list_of_idx_cached:
                    # we identify the data according to the input path (X before any transformation)
                    self.cache_man.update_single_subject_state_info([initial_X[cache_idx]])

                    # time the loading of the cached item
                    start_time_for_loading = datetime.datetime.now()
                    transformed_X, transformed_y, transformed_kwargs = self.cache_man.load_cached_data(name)
                    loading_duration = (datetime.datetime.now() - start_time_for_loading).total_seconds()
                    self.time_monitor['transform_cached'].append(
                        (name, loading_duration, PhotonDataHelper.find_n(X)))

                    processed_X, processed_y, processed_kwargs = PhotonDataHelper.join_data(
                        processed_X, transformed_X,
                        processed_y, transformed_y,
                        processed_kwargs, transformed_kwargs)

        logger.debug(name + " loaded " + str(len(list_of_idx_cached)) + " items from cache and computed "
                     + str(len(list_of_idx_non_cached)))

        if not self.skip_loading or needed_for_further_computation:
            # now sort the data back into its original order
            processed_X, processed_y, processed_kwargs = PhotonDataHelper.resort_splitted_data(
                processed_X, processed_y, processed_kwargs,
                PhotonDataHelper.stack_data_vertically(list_of_idx_cached, list_of_idx_non_cached))

        return processed_X, processed_y, processed_kwargs
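
# The method above only relies on four calls of the cache manager: check_cache,
# load_cached_data, save_data_to_cache and update_single_subject_state_info. The class
# below is a hypothetical, in-memory stand-in that illustrates this interface; it is NOT
# the real PHOTONAI cache manager, which persists results to disk and hashes the pipeline
# configuration. It only shows how the (name, subject-state) combination acts as the key.
class InMemoryCacheManagerSketch:
    def __init__(self):
        self._store = {}
        self._state = None

    def update_single_subject_state_info(self, X_key):
        # remember which single subject (input path / hash of the input X) the
        # following check/load/save calls refer to
        self._state = str(X_key)

    def check_cache(self, name):
        # True if a result for this transformer name and current subject state exists
        return (name, self._state) in self._store

    def save_data_to_cache(self, name, data):
        self._store[(name, self._state)] = data

    def load_cached_data(self, name):
        # returns None when nothing is cached, mirroring the "compute and save" branch above
        return self._store.get((name, self._state))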