Example #1
def omni_light(folder=join(DATA_FOLDER, "omniglot-light"), add_bias=False):
    """
    Extract from omniglot dataset with rotated images, 100 classes,
    3 examples per class in training set
    3 examples per class in validation set
    15 examples per class in test set
    """
    # Open the HDF5 archive and copy each split into memory; np.array() makes
    # in-memory copies, so the context manager can safely close the file.
    with h5py.File(os.path.join(folder, "omni-light.h5"), "r") as file:
        return dl.Datasets.from_list([
            dl.Dataset(
                np.array(file["X_ft_tr"]),
                np.array(file["Y_tr"]),
                info={"original images": np.array(file["X_orig_tr"])},
                add_bias=add_bias,
            ),
            dl.Dataset(
                np.array(file["X_ft_val"]),
                np.array(file["Y_val"]),
                info={"original images": np.array(file["X_orig_val"])},
                add_bias=add_bias,
            ),
            dl.Dataset(
                np.array(file["X_ft_test"]),
                np.array(file["Y_test"]),
                info={"original images": np.array(file["X_orig_test"])},
                add_bias=add_bias,
            ),
        ])
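A minimal usage sketch, assuming DATA_FOLDER points at a directory containing omniglot-light/omni-light.h5 and that dl.Datasets exposes the three partitions as train, validation, and test with data and target attributes (as the attribute access in Examples #3 and #4 suggests):

datasets = omni_light()
# 100 classes x 3 training examples per class = 300 rows expected
print(datasets.train.data.shape, datasets.train.target.shape)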
Example #2
def mnist(folder=None,
          one_hot=True,
          partitions=None,
          filters=None,
          maps=None,
          shuffle=False):
    if not folder:
        folder = MNIST_DIR
    datasets = read_data_sets(folder, one_hot=one_hot)
    train = dl.Dataset(datasets.train.images,
                       datasets.train.labels,
                       name="MNIST")
    validation = dl.Dataset(datasets.validation.images,
                            datasets.validation.labels,
                            name="MNIST")
    test = dl.Dataset(datasets.test.images, datasets.test.labels, name="MNIST")
    res = [train, validation, test]
    if partitions:
        # Merge the three standard splits and re-divide them according to the
        # requested proportions, optionally filtering, mapping, and shuffling
        # the examples.
        res = redivide_data(
            res,
            partition_proportions=partitions,
            filters=filters,
            maps=maps,
            shuffle=shuffle,
        )
        # Pad with None so there are always train/validation/test slots.
        res += [None] * (3 - len(res))
    return dl.Datasets.from_list(res)
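A usage sketch, assuming MNIST_DIR is configured and read_data_sets can fetch the data. Proportions summing to one yield exactly two partitions, so after the padding above the third slot is presumably None:

mnist_sets = mnist(partitions=(0.7, 0.3), shuffle=True)
print(mnist_sets.train.data.shape)       # about 70% of all examples
print(mnist_sets.validation.data.shape)  # about 30%
print(mnist_sets.test)                   # None (padded third slot)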
Example #3
def random_regression_datasets(n_samples,
                               features=100,
                               outs=1,
                               informative=0.1,
                               partition_proportions=(0.5, 0.3),
                               rnd=None,
                               **mk_rgr_kwargs):
    rnd_state = dl.get_rand_state(rnd)
    X, Y, w = make_regression(n_samples,
                              features,
                              n_informative=int(features * informative),
                              n_targets=outs,
                              random_state=rnd_state,
                              coef=True,
                              **mk_rgr_kwargs)
    if outs == 1:
        # make_regression returns a 1-D target for a single output; reshape it
        # to a column vector for consistency with the multi-output case.
        Y = np.reshape(Y, (n_samples, 1))

    print("range of Y", np.min(Y), np.max(Y))
    info = merge_dicts({
        "informative": informative,
        "random_seed": rnd,
        "w": w
    }, mk_rgr_kwargs)
    name = dl.em_utils.name_from_dict(info, "w")
    dt = dl.Dataset(X, Y, name=name, info=info)
    # The default proportions (0.5, 0.3) sum to 0.8, so redivide_data appends
    # a third (test) partition with the remaining samples.
    datasets = dl.Datasets.from_list(redivide_data([dt],
                                                   partition_proportions))
    print(
        "conditioning of X^T X",
        np.linalg.cond(datasets.train.data.T @ datasets.train.data),
    )
    return datasets
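A quick sketch of the regression generator, assuming Datasets.from_list assigns the splits in order to train, validation, and test:

reg_sets = random_regression_datasets(1000, features=20, outs=1, rnd=0)
for part in (reg_sets.train, reg_sets.validation, reg_sets.test):
    print(part.data.shape, part.target.shape)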
Example #4
def random_classification_datasets(n_samples,
                                   features=100,
                                   classes=2,
                                   informative=0.1,
                                   partition_proportions=(0.5, 0.3),
                                   rnd=None,
                                   one_hot=True,
                                   **mk_cls_kwargs):
    rnd_state = dl.get_rand_state(rnd)
    X, Y = make_classification(n_samples,
                               features,
                               # translate the informative fraction into
                               # sklearn's n_informative feature count
                               n_informative=int(features * informative),
                               n_classes=classes,
                               random_state=rnd_state,
                               **mk_cls_kwargs)
    if one_hot:
        # Convert integer class labels to one-hot rows.
        Y = to_one_hot_enc(Y)

    print("range of Y", np.min(Y), np.max(Y))
    info = merge_dicts({
        "informative": informative,
        "random_seed": rnd
    }, mk_cls_kwargs)
    name = dl.em_utils.name_from_dict(info, "w")
    dt = dl.Dataset(X, Y, name=name, info=info)
    datasets = dl.Datasets.from_list(redivide_data([dt],
                                                   partition_proportions))
    print(
        "conditioning of X^T X",
        np.linalg.cond(datasets.train.data.T @ datasets.train.data),
    )
    return datasets
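The classification variant follows the same pattern; with one_hot=True each target row has one column per class:

cls_sets = random_classification_datasets(1000, features=20, classes=3, rnd=0)
print(cls_sets.train.target.shape)  # (n_train, 3): one-hot rows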
Example #5
    def all_data(self, partition_proportions=None, seed=None):
        if not self._loaded_images:
            self.load_all_images()
            # Block until all 600 images per class have been loaded.
            import time

            while not self.check_loaded_images(600):
                time.sleep(5)
        data, targets = [], []
        # Each class contributes 600 images; label classes by their position
        # in the sorted class list.
        for k, c in enumerate(sorted(self._loaded_images)):
            data += list(self._loaded_images[c].values())
            targets += [k] * 600
        if self.info["one_hot_enc"]:
            targets = dl.to_one_hot_enc(targets,
                                        dimension=len(self._loaded_images))
        _dts = [
            dl.Dataset(data=np.stack(data),
                       target=np.array(targets),
                       name="MiniImagenet_full")
        ]
        if seed is not None:
            np.random.seed(seed)
        if partition_proportions:
            _dts = redivide_data(_dts,
                                 partition_proportions=partition_proportions,
                                 shuffle=True)
        return dl.Datasets.from_list(_dts)
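A hypothetical call, assuming mi is an instance of the mini-ImageNet loader this method belongs to; proportions summing to 0.9 make redivide_data create a third 0.1 partition:

full = mi.all_data(partition_proportions=(0.8, 0.1), seed=0)
print(full.train.data.shape, full.validation.data.shape, full.test.data.shape)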
Example #6
    def generate_datasets(self,
                          rand=None,
                          num_classes=None,
                          num_examples=None):
        rand = dl.get_rand_state(rand)

        if not num_examples:
            num_examples = self.kwargs["num_examples"]
        if not num_classes:
            num_classes = self.kwargs["num_classes"]

        clss = self._loaded_images if self._loaded_images else self.info[
            "classes"]

        # Sample `num_classes` distinct classes and map each one to an
        # episode-local label in 0..num_classes-1.
        random_classes = rand.choice(list(clss.keys()),
                                     size=(num_classes, ),
                                     replace=False)
        rand_class_dict = {rnd: k for k, rnd in enumerate(random_classes)}

        _dts = []
        for ns in as_tuple_or_list(num_examples):
            classes = balanced_choice_wr(random_classes, ns, rand)

            all_images = {cls: list(clss[cls]) for cls in classes}
            data, targets, sample_info = [], [], []
            for c in classes:
                # Draw one image per class occurrence, without replacement
                # within the episode.
                rand.shuffle(all_images[c])
                img_name = all_images[c][0]
                all_images[c].remove(img_name)
                sample_info.append({"name": img_name, "label": c})
                data.append(clss[c][img_name])
                targets.append(rand_class_dict[c])

            if self.info["one_hot_enc"]:
                targets = dl.to_one_hot_enc(targets, dimension=num_classes)

            _dts.append(
                dl.Dataset(
                    data=np.stack(data),
                    target=targets,
                    sample_info=sample_info,
                    info={"all_classes": random_classes},
                ))
        return dl.Datasets.from_list(_dts)
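A hypothetical episode draw, assuming meta is an instance of the enclosing meta-dataset class. num_examples accepts a scalar or a tuple (one entry per split), so a 5-way episode with 5 training and 15 evaluation examples in total would be:

episode = meta.generate_datasets(rand=0, num_classes=5, num_examples=(5, 15))
print(episode.train.data.shape)  # 5 images, labeled with episode labels 0..4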
Example #7
def redivide_data(
    datasets,
    partition_proportions=None,
    shuffle=False,
    filters=None,
    maps=None,
    balance_classes=False,
    rand=None,
):
    """
    Function that redivides datasets. Can be use also to shuffle or filter or map examples.

    :param rand:
    :param balance_classes: # TODO RICCARDO
    :param datasets: original datasets, instances of class Dataset (works with get_data and get_targets for
                        compatibility with mnist datasets
    :param partition_proportions: (optional, default None)  list of fractions that can either sum up to 1 or less
                                    then one, in which case one additional partition is created with
                                    proportion 1 - sum(partition proportions).
                                    If None it will retain the same proportion of samples found in datasets
    :param shuffle: (optional, default False) if True shuffles the examples
    :param filters: (optional, default None) filter or list of filters: functions with signature
                        (data, target, index) -> boolean (accept or reject the sample)
    :param maps: (optional, default None) map or list of maps: functions with signature
                        (data, target, index) ->  (new_data, new_target) (maps the old sample to a new one,
                        possibly also to more
                        than one sample, for data augmentation)
    :return: a list of datasets of length equal to the (possibly augmented) partition_proportion
    """

    rnd = get_rand_state(rand)

    all_data = vstack([get_data(d) for d in datasets])
    all_labels = stack_or_concat([get_targets(d) for d in datasets])

    all_infos = np.concatenate([d.sample_info for d in datasets])

    N = all_data.shape[0]

    if partition_proportions:  # argument check
        partition_proportions = list([partition_proportions] if isinstance(
            partition_proportions, float) else partition_proportions)
        sum_proportions = sum(partition_proportions)
        assert sum_proportions <= 1, (
            "partition proportions must sum up to at most one: %s" %
            sum_proportions)
        if sum_proportions < 1.0:
            partition_proportions += [1.0 - sum_proportions]
    else:
        partition_proportions = [
            1.0 * get_data(d).shape[0] / N for d in datasets
        ]

    if shuffle:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        # if sk_shuffle:  # TODO this does not work!!! find a way to shuffle these matrices while
        # keeping compatibility with tensorflow!
        #     all_data, all_labels, all_infos = sk_shuffle(all_data, all_labels, all_infos)
        # else:
        permutation = np.arange(all_data.shape[0])
        rnd.shuffle(permutation)

        all_data = all_data[permutation]
        all_labels = np.array(all_labels[permutation])
        all_infos = np.array(all_infos[permutation])

    if filters:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        filters = as_list(filters)
        data_triple = [(x, y, d)
                       for x, y, d in zip(all_data, all_labels, all_infos)]
        for _filter in filters:
            data_triple = [
                xy for i, xy in enumerate(data_triple)
                if _filter(xy[0], xy[1], xy[2], i)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    if maps:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        maps = as_list(maps)
        data_triple = [(x, y, d)
                       for x, y, d in zip(all_data, all_labels, all_infos)]
        for _map in maps:
            data_triple = [
                _map(xy[0], xy[1], xy[2], i)
                for i, xy in enumerate(data_triple)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    N = all_data.shape[0]
    assert N == all_labels.shape[0]

    # Cumulative index boundaries for each partition; the last boundary is
    # clamped to N so that rounding never drops examples.
    calculated_partitions = reduce(
        lambda v1, v2: v1 + [sum(v1) + v2],
        [int(N * prp) for prp in partition_proportions],
        [0],
    )
    calculated_partitions[-1] = N

    print(
        "datasets.redivide_data:, computed partitions numbers -",
        calculated_partitions,
        "len all",
        N,
        end=" ",
    )

    new_general_info_dict = {}
    for data in datasets:
        new_general_info_dict = {**new_general_info_dict, **data.info}

    if balance_classes:
        new_datasets = []
        forbidden_indices = np.empty(0, dtype=np.int64)
        for d1, d2 in zip(calculated_partitions[:-1],
                          calculated_partitions[1:-1]):
            indices = np.array(
                get_indices_balanced_classes(d2 - d1, all_labels,
                                             forbidden_indices))
            dataset = dl.Dataset(
                data=all_data[indices],
                target=all_labels[indices],
                sample_info=all_infos[indices],
                info=new_general_info_dict,
            )
            new_datasets.append(dataset)
            forbidden_indices = np.append(forbidden_indices, indices)
            test_if_balanced(dataset)
        # The last partition collects all indices not used by the balanced ones.
        remaining_indices = np.array(
            list(set(range(N)) - set(forbidden_indices)))
        new_datasets.append(
            dl.Dataset(
                data=all_data[remaining_indices],
                target=all_labels[remaining_indices],
                sample_info=all_infos[remaining_indices],
                info=new_general_info_dict,
            ))
    else:
        new_datasets = [
            dl.Dataset(
                data=all_data[d1:d2],
                target=all_labels[d1:d2],
                sample_info=all_infos[d1:d2],
                info=new_general_info_dict,
            ) for d1, d2 in zip(calculated_partitions,
                                calculated_partitions[1:])
        ]

    print("DONE")

    return new_datasets
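A sketch of direct use, reusing the MNIST train and validation datasets from Example #2 and assuming Dataset populates sample_info by default. The filter receives the four arguments the loop above passes, (data, target, sample_info, index); drop_zeros is a hypothetical filter name:

# keep only samples whose one-hot label is not class 0
drop_zeros = lambda x, y, d, i: np.argmax(y) != 0
parts = redivide_data([train, validation],
                      partition_proportions=(0.8,),
                      filters=drop_zeros,
                      shuffle=True)
print(len(parts))  # 2: the 0.8 split plus the appended 0.2 remainder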
Example #8
    def generate_datasets(self,
                          rand=None,
                          num_classes=None,
                          num_examples=None,
                          wait_for_n_min=None):

        rand = dl.get_rand_state(rand)

        if wait_for_n_min:
            # Optionally block until enough images have been loaded.
            import time

            while not self.check_loaded_images(wait_for_n_min):
                time.sleep(5)

        if not num_examples:
            num_examples = self.kwargs["num_examples"]
        if not num_classes:
            num_classes = self.kwargs["num_classes"]

        clss = self._loaded_images if self._loaded_images else self.info[
            "classes"]

        random_classes = rand.choice(list(clss.keys()),
                                     size=(num_classes, ),
                                     replace=False)
        rand_class_dict = {rnd: k for k, rnd in enumerate(random_classes)}

        _dts = []
        for ns in dl.as_tuple_or_list(num_examples):
            classes = balanced_choice_wr(random_classes, ns, rand)

            all_images = {cls: list(clss[cls]) for cls in classes}
            data, targets, sample_info = [], [], []
            for c in classes:
                rand.shuffle(all_images[c])
                img_name = all_images[c][0]
                all_images[c].remove(img_name)
                sample_info.append({"name": img_name, "label": c})

                if self._loaded_images:
                    data.append(clss[c][img_name])
                else:
                    # Fall back to reading the image from disk, resizing it,
                    # and scaling pixel values to [0, 1].
                    from imageio import imread

                    img = imread(join(self.info["base_folder"], c, img_name))
                    img = Image.fromarray(img).resize(
                        size=(self.info["resize"], self.info["resize"]))
                    data.append(np.array(img) / 255.0)
                targets.append(rand_class_dict[c])

            if self.info["one_hot_enc"]:
                targets = to_one_hot_enc(targets, dimension=num_classes)

            _dts.append(
                dl.Dataset(
                    data=np.stack(data),
                    target=targets,
                    sample_info=sample_info,
                    info={"all_classes": random_classes},
                ))
        return dl.Datasets.from_list(_dts)
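Compared with Example #6, this variant can also wait for the background loader; a hypothetical call, assuming meta is an instance of the enclosing class and that check_loaded_images counts images per class as in Example #5:

# block until at least 20 images per class are cached, then sample an episode
episode = meta.generate_datasets(rand=0, num_classes=5,
                                 num_examples=(5, 15), wait_for_n_min=20)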