Example #1
def stack(*datasets):
    """
    Assuming that the datasets have the same structure, stacks data, targets and other info.

    :param datasets: datasets to be stacked
    :return: stacked dataset
    """
    return Dataset(
        data=vstack([d.data for d in datasets]),
        target=stack_or_concat([d.target for d in datasets]),
        sample_info=np.concatenate([d.sample_info for d in datasets]),
        info={
            k: [d.info.get(k, None) for d in datasets]
            for k in merge_dicts(*[d.info for d in datasets])
        },
    )
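A minimal usage sketch for the snippet above, not part of the original source: it assumes Dataset stores the four constructor fields as passed, that stack_or_concat concatenates 1-D targets, and that merge_dicts merges the info dictionaries by key. The toy arrays and the "name" key are purely illustrative.

import numpy as np

# Two tiny datasets with the same structure (4 features each).
d1 = Dataset(data=np.zeros((3, 4)), target=np.array([0, 1, 0]),
             sample_info=np.array([{"src": "a"}] * 3), info={"name": "a"})
d2 = Dataset(data=np.ones((2, 4)), target=np.array([1, 1]),
             sample_info=np.array([{"src": "b"}] * 2), info={"name": "b"})

merged = stack(d1, d2)
# merged.data has shape (5, 4); merged.info should be {"name": ["a", "b"]}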
Example #2
# NOTE: the helpers used below (vstack, stack_or_concat, get_data, get_targets,
# get_rand_state, as_list, get_indices_balanced_classes, test_if_balanced and
# the Dataset class, here dl.Dataset) are assumed to come from the surrounding
# package.
from functools import reduce

import numpy as np

try:  # scipy is optional here; without it the sparse-matrix checks are skipped
    import scipy as sp
    import scipy.sparse
except ImportError:
    sp = None


def redivide_data(
    datasets,
    partition_proportions=None,
    shuffle=False,
    filters=None,
    maps=None,
    balance_classes=False,
    rand=None,
):
    """
    Function that redivides datasets. Can be use also to shuffle or filter or map examples.

    :param rand:
    :param balance_classes: # TODO RICCARDO
    :param datasets: original datasets, instances of class Dataset (works with get_data and get_targets for
                        compatibility with mnist datasets
    :param partition_proportions: (optional, default None)  list of fractions that can either sum up to 1 or less
                                    then one, in which case one additional partition is created with
                                    proportion 1 - sum(partition proportions).
                                    If None it will retain the same proportion of samples found in datasets
    :param shuffle: (optional, default False) if True shuffles the examples
    :param filters: (optional, default None) filter or list of filters: functions with signature
                        (data, target, index) -> boolean (accept or reject the sample)
    :param maps: (optional, default None) map or list of maps: functions with signature
                        (data, target, index) ->  (new_data, new_target) (maps the old sample to a new one,
                        possibly also to more
                        than one sample, for data augmentation)
    :return: a list of datasets of length equal to the (possibly augmented) partition_proportion
    """

    rnd = get_rand_state(rand)

    all_data = vstack([get_data(d) for d in datasets])
    all_labels = stack_or_concat([get_targets(d) for d in datasets])

    all_infos = np.concatenate([d.sample_info for d in datasets])

    N = all_data.shape[0]

    if partition_proportions:  # argument check
        partition_proportions = list([partition_proportions] if isinstance(
            partition_proportions, float) else partition_proportions)
        sum_proportions = sum(partition_proportions)
        assert sum_proportions <= 1, (
            "partition proportions must sum up to at most one: %s" %
            sum_proportions)
        if sum_proportions < 1.0:
            partition_proportions += [1.0 - sum_proportions]
    else:
        partition_proportions = [
            1.0 * get_data(d).shape[0] / N for d in datasets
        ]

    if shuffle:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        # if sk_shuffle:  # TODO this does not work!!! find a way to shuffle these matrices while
        # keeping compatibility with tensorflow!
        #     all_data, all_labels, all_infos = sk_shuffle(all_data, all_labels, all_infos)
        # else:
        permutation = np.arange(all_data.shape[0])
        rnd.shuffle(permutation)

        all_data = all_data[permutation]
        all_labels = np.array(all_labels[permutation])
        all_infos = np.array(all_infos[permutation])

    if filters:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        filters = as_list(filters)
        data_triple = [(x, y, d)
                       for x, y, d in zip(all_data, all_labels, all_infos)]
        for fltr in filters:
            data_triple = [
                xy for i, xy in enumerate(data_triple)
                if fltr(xy[0], xy[1], xy[2], i)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    if maps:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        maps = as_list(maps)
        data_triple = [(x, y, d)
                       for x, y, d in zip(all_data, all_labels, all_infos)]
        for _map in maps:
            data_triple = [
                _map(xy[0], xy[1], xy[2], i)
                for i, xy in enumerate(data_triple)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    N = all_data.shape[0]
    assert N == all_labels.shape[0]

    # cumulative partition boundaries: [0, n1, n1 + n2, ..., N]
    calculated_partitions = reduce(
        lambda v1, v2: v1 + [v1[-1] + v2],
        [int(N * prp) for prp in partition_proportions],
        [0],
    )
    calculated_partitions[-1] = N

    print(
        "datasets.redivide_data: computed partition numbers -",
        calculated_partitions,
        "len all",
        N,
        end=" ",
    )

    # merge the info dictionaries of all original datasets into one
    new_general_info_dict = {}
    for data in datasets:
        new_general_info_dict = {**new_general_info_dict, **data.info}

    if balance_classes:
        new_datasets = []
        forbidden_indices = np.empty(0, dtype=np.int64)
        for d1, d2 in zip(calculated_partitions[:-1],
                          calculated_partitions[1:-1]):
            indices = np.array(
                get_indices_balanced_classes(d2 - d1, all_labels,
                                             forbidden_indices))
            dataset = dl.Dataset(
                data=all_data[indices],
                target=all_labels[indices],
                sample_info=all_infos[indices],
                info=new_general_info_dict,
            )
            new_datasets.append(dataset)
            forbidden_indices = np.append(forbidden_indices, indices)
            test_if_balanced(dataset)
        remaining_indices = np.array(
            list(set(range(N)) - set(forbidden_indices)))
        new_datasets.append(
            dl.Dataset(
                data=all_data[remaining_indices],
                target=all_labels[remaining_indices],
                sample_info=all_infos[remaining_indices],
                info=new_general_info_dict,
            ))
    else:
        new_datasets = [
            dl.Dataset(
                data=all_data[d1:d2],
                target=all_labels[d1:d2],
                sample_info=all_infos[d1:d2],
                info=new_general_info_dict,
            ) for d1, d2 in zip(calculated_partitions,
                                calculated_partitions[1:])
        ]

    print("DONE")

    return new_datasets
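A hedged usage example for redivide_data, assuming full_dataset is a Dataset instance from the same package with dense numpy data and integer class labels, and that get_rand_state accepts an integer seed; the proportions and the lambda-based filter and map below are illustrative only.

# Re-split a single dataset into 70% / 20% / 10% partitions; since the
# proportions sum to less than one, a third partition takes the remainder.
train, valid, test = redivide_data(
    [full_dataset],
    partition_proportions=[0.7, 0.2],
    shuffle=True,
    rand=42,
)

# Filters and maps receive (data, target, sample_info, index); a map must
# return a (new_data, new_target, new_sample_info) triple.
keep_01 = lambda data, target, info, i: target in (0, 1)
binarize = lambda data, target, info, i: (data > 0.5, target, info)
part_a, part_b = redivide_data(
    [full_dataset],
    partition_proportions=[0.8],
    filters=keep_01,
    maps=binarize,
)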