def stack(*datasets):
    """
    Stacks data, targets and other info, assuming that all the datasets
    share the same structure.

    :param datasets: instances of class Dataset to be stacked
    :return: the stacked dataset
    """
    return Dataset(
        data=vstack([d.data for d in datasets]),
        target=stack_or_concat([d.target for d in datasets]),
        sample_info=np.concatenate([d.sample_info for d in datasets]),
        info={
            k: [d.info.get(k, None) for d in datasets]
            for k in merge_dicts(*[d.info for d in datasets])
        },
    )
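# Illustrative sketch (not part of the original module): stacking two
# hypothetical Dataset instances with identical structure. `train_part` and
# `val_part` are assumed names, not defined here.
#
#     combined = stack(train_part, val_part)
#     # combined.data holds the rows of both parts; each key of combined.info
#     # maps to the list of the per-dataset values for that key.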
def redivide_data(
    datasets,
    partition_proportions=None,
    shuffle=False,
    filters=None,
    maps=None,
    balance_classes=False,
    rand=None,
):
    """
    Redivides a list of datasets into new partitions. Can also be used to
    shuffle, filter or map the examples.

    :param datasets: original datasets, instances of class Dataset (works with
        get_data and get_targets for compatibility with mnist datasets)
    :param partition_proportions: (optional, default None) list of fractions
        that sum up to 1 or less; in the latter case one additional partition
        is created with proportion 1 - sum(partition_proportions). If None,
        the same proportions of samples found in datasets are retained
    :param shuffle: (optional, default False) if True shuffles the examples
    :param filters: (optional, default None) filter or list of filters:
        functions with signature (data, target, sample_info, index) -> bool
        (accept or reject the sample)
    :param maps: (optional, default None) map or list of maps: functions with
        signature (data, target, sample_info, index) ->
        (new_data, new_target, new_sample_info) (maps the old sample to a new
        one, for data augmentation)
    :param balance_classes: (optional, default False) if True, builds each
        partition with a balanced number of examples per class  # TODO RICCARDO
    :param rand: (optional, default None) random state or seed used for
        shuffling
    :return: a list of datasets of length equal to the (possibly augmented)
        partition_proportions
    """
    rnd = get_rand_state(rand)

    all_data = vstack([get_data(d) for d in datasets])
    all_labels = stack_or_concat([get_targets(d) for d in datasets])
    all_infos = np.concatenate([d.sample_info for d in datasets])

    N = all_data.shape[0]

    if partition_proportions:  # argument check
        partition_proportions = list(
            [partition_proportions]
            if isinstance(partition_proportions, float)
            else partition_proportions
        )
        sum_proportions = sum(partition_proportions)
        assert sum_proportions <= 1, (
            "partition proportions must sum up to at most one: %s" % sum_proportions
        )
        if sum_proportions < 1.0:
            partition_proportions += [1.0 - sum_proportions]
    else:
        partition_proportions = [1.0 * get_data(d).shape[0] / N for d in datasets]

    if shuffle:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        # TODO sk_shuffle does not work here: find a way to shuffle these
        # matrices while keeping compatibility with tensorflow!
        # all_data, all_labels, all_infos = sk_shuffle(all_data, all_labels, all_infos)
        permutation = np.arange(all_data.shape[0])
        rnd.shuffle(permutation)

        all_data = all_data[permutation]
        all_labels = np.array(all_labels[permutation])
        all_infos = np.array(all_infos[permutation])

    if filters:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        filters = as_list(filters)
        data_triple = [(x, y, d) for x, y, d in zip(all_data, all_labels, all_infos)]
        for _filter in filters:
            data_triple = [
                xy for i, xy in enumerate(data_triple)
                if _filter(xy[0], xy[1], xy[2], i)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    if maps:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        maps = as_list(maps)
        data_triple = [(x, y, d) for x, y, d in zip(all_data, all_labels, all_infos)]
        for _map in maps:
            data_triple = [
                _map(xy[0], xy[1], xy[2], i) for i, xy in enumerate(data_triple)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    N = all_data.shape[0]
    assert N == all_labels.shape[0]

    # cumulative partition boundaries: [0, n1, n1 + n2, ...]; the last one is
    # forced to N so that no example is lost to rounding
    calculated_partitions = reduce(
        lambda v1, v2: v1 + [v1[-1] + v2],
        [int(N * prp) for prp in partition_proportions],
        [0],
    )
    calculated_partitions[-1] = N

    print(
        "datasets.redivide_data: computed partition boundaries -",
        calculated_partitions,
        "len all",
        N,
        end=" ",
    )

    new_general_info_dict = {}
    for data in datasets:
        new_general_info_dict = {**new_general_info_dict, **data.info}

    if balance_classes:
        new_datasets = []
        forbidden_indices = np.empty(0, dtype=np.int64)
        for d1, d2 in zip(calculated_partitions[:-1], calculated_partitions[1:-1]):
            indices = np.array(
                get_indices_balanced_classes(d2 - d1, all_labels, forbidden_indices)
            )
            dataset = dl.Dataset(
                data=all_data[indices],
                target=all_labels[indices],
                sample_info=all_infos[indices],
                info=new_general_info_dict,
            )
            new_datasets.append(dataset)
            forbidden_indices = np.append(forbidden_indices, indices)
            test_if_balanced(dataset)
        # the last partition takes whatever indices are left
        remaining_indices = np.array(list(set(range(N)) - set(forbidden_indices)))
        new_datasets.append(
            dl.Dataset(
                data=all_data[remaining_indices],
                target=all_labels[remaining_indices],
                sample_info=all_infos[remaining_indices],
                info=new_general_info_dict,
            )
        )
    else:
        new_datasets = [
            dl.Dataset(
                data=all_data[d1:d2],
                target=all_labels[d1:d2],
                sample_info=all_infos[d1:d2],
                info=new_general_info_dict,
            )
            for d1, d2 in zip(calculated_partitions, calculated_partitions[1:])
        ]

    print("DONE")
    return new_datasets
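# Illustrative sketch (not part of the original module): re-splitting a
# hypothetical `my_dataset` into a shuffled 70/20/10 train/validation/test
# partition. The proportions sum to 0.9, so a third partition with the
# remaining 0.1 is created automatically.
#
#     train, valid, test = redivide_data(
#         [my_dataset],
#         partition_proportions=[0.7, 0.2],
#         shuffle=True,
#         rand=0,
#     )
#
# A filter receives (data, target, sample_info, index) and returns a bool;
# for instance, keeping only the even-indexed examples:
#
#     halved, = redivide_data(
#         [my_dataset], filters=lambda x, y, info, i: i % 2 == 0
#     )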