def omni_light(folder=join(DATA_FOLDER, "omniglot-light"), add_bias=False):
    """
    Extract from the omniglot dataset with rotated images: 100 classes,
    3 examples per class in the training set,
    3 examples per class in the validation set,
    15 examples per class in the test set.
    """
    # use a context manager so the h5 file handle is closed after reading
    with h5py.File(os.path.join(folder, "omni-light.h5"), "r") as file:
        return dl.Datasets.from_list([
            dl.Dataset(
                np.array(file["X_ft_tr"]),
                np.array(file["Y_tr"]),
                info={"original images": np.array(file["X_orig_tr"])},
                add_bias=add_bias,
            ),
            dl.Dataset(
                np.array(file["X_ft_val"]),
                np.array(file["Y_val"]),
                info={"original images": np.array(file["X_orig_val"])},
                add_bias=add_bias,
            ),
            dl.Dataset(
                np.array(file["X_ft_test"]),
                np.array(file["Y_test"]),
                info={"original images": np.array(file["X_orig_test"])},
                add_bias=add_bias,
            ),
        ])
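# Hypothetical usage sketch (not part of the original module): load the
# omniglot-light splits and inspect their shapes. Assumes omni-light.h5 is
# present under DATA_FOLDER/omniglot-light with the keys read above; the
# .data/.target attribute names follow the usage elsewhere in this module.
def _demo_omni_light():
    dts = omni_light()
    for split in (dts.train, dts.validation, dts.test):
        print(split.data.shape, split.target.shape)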
def mnist(folder=None,
          one_hot=True,
          partitions=None,
          filters=None,
          maps=None,
          shuffle=False):
    if not folder:
        folder = MNIST_DIR
    datasets = read_data_sets(folder, one_hot=one_hot)
    train = dl.Dataset(datasets.train.images, datasets.train.labels, name="MNIST")
    validation = dl.Dataset(datasets.validation.images,
                            datasets.validation.labels,
                            name="MNIST")
    test = dl.Dataset(datasets.test.images, datasets.test.labels, name="MNIST")
    res = [train, validation, test]
    if partitions:
        res = redivide_data(
            res,
            partition_proportions=partitions,
            filters=filters,
            maps=maps,
            shuffle=shuffle,
        )
        # pad with None so Datasets.from_list always receives three entries
        res += [None] * (3 - len(res))
    return dl.Datasets.from_list(res)
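# Hypothetical usage sketch: re-partition MNIST into 50/30/20 splits with
# shuffling (assumes the MNIST files can be found or downloaded in MNIST_DIR).
def _demo_mnist_repartition():
    dts = mnist(partitions=(0.5, 0.3), shuffle=True)  # a third split of 0.2 is added
    print(dts.train.data.shape, dts.validation.data.shape, dts.test.data.shape)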
def random_regression_datasets(n_samples,
                               features=100,
                               outs=1,
                               informative=0.1,
                               partition_proportions=(0.5, 0.3),
                               rnd=None,
                               **mk_rgr_kwargs):
    rnd_state = dl.get_rand_state(rnd)
    # n_informative and n_targets are keyword-only in recent scikit-learn
    X, Y, w = make_regression(n_samples,
                              features,
                              n_informative=int(features * informative),
                              n_targets=outs,
                              random_state=rnd_state,
                              coef=True,
                              **mk_rgr_kwargs)
    if outs == 1:
        Y = np.reshape(Y, (n_samples, 1))

    print("range of Y", np.min(Y), np.max(Y))
    info = merge_dicts({
        "informative": informative,
        "random_seed": rnd,
        "w": w
    }, mk_rgr_kwargs)
    name = dl.em_utils.name_from_dict(info, "w")
    dt = dl.Dataset(X, Y, name=name, info=info)
    datasets = dl.Datasets.from_list(redivide_data([dt], partition_proportions))
    print(
        "conditioning of X^T X",
        np.linalg.cond(datasets.train.data.T @ datasets.train.data),
    )
    return datasets
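# Hypothetical usage sketch: a small synthetic regression problem with 10%
# informative features, re-split 50/30/20 by redivide_data.
def _demo_random_regression():
    dts = random_regression_datasets(1000, features=50, rnd=0)
    print(dts.train.data.shape, dts.train.target.shape)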
def random_classification_datasets(n_samples,
                                   features=100,
                                   classes=2,
                                   informative=0.1,
                                   partition_proportions=(0.5, 0.3),
                                   rnd=None,
                                   one_hot=True,
                                   **mk_cls_kwargs):
    rnd_state = dl.get_rand_state(rnd)
    # forward `informative` to make_classification, mirroring the regression
    # helper above (it was previously recorded in `info` but never used)
    X, Y = make_classification(n_samples,
                               features,
                               n_informative=int(features * informative),
                               n_classes=classes,
                               random_state=rnd_state,
                               **mk_cls_kwargs)
    if one_hot:
        Y = to_one_hot_enc(Y)

    print("range of Y", np.min(Y), np.max(Y))
    info = merge_dicts({
        "informative": informative,
        "random_seed": rnd
    }, mk_cls_kwargs)
    name = dl.em_utils.name_from_dict(info, "w")
    dt = dl.Dataset(X, Y, name=name, info=info)
    datasets = dl.Datasets.from_list(redivide_data([dt], partition_proportions))
    print(
        "conditioning of X^T X",
        np.linalg.cond(datasets.train.data.T @ datasets.train.data),
    )
    return datasets
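# Hypothetical usage sketch: a 3-class synthetic problem with one-hot targets;
# extra keyword arguments are forwarded to sklearn's make_classification.
def _demo_random_classification():
    dts = random_classification_datasets(1000, features=50, classes=3, rnd=0,
                                         n_redundant=0)
    print(dts.train.data.shape, dts.train.target.shape)  # targets are (n, 3)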
def all_data(self, partition_proportions=None, seed=None):
    if not self._loaded_images:
        self.load_all_images()
        import time
        # block until all 600 images per class have been loaded
        while not self.check_loaded_images(600):
            time.sleep(5)
    data, targets = [], []
    for k, c in enumerate(sorted(self._loaded_images)):
        data += list(self._loaded_images[c].values())
        targets += [k] * 600
    if self.info["one_hot_enc"]:
        targets = dl.to_one_hot_enc(targets, dimension=len(self._loaded_images))
    _dts = [
        dl.Dataset(data=np.stack(data),
                   target=np.array(targets),
                   name="MiniImagenet_full")
    ]
    if seed:
        # seed the global numpy RNG so the shuffle in redivide_data is reproducible
        np.random.seed(seed)
    if partition_proportions:
        _dts = redivide_data(_dts,
                             partition_proportions=partition_proportions,
                             shuffle=True)
    return dl.Datasets.from_list(_dts)
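# Hypothetical usage sketch: `meta` is assumed to be an instance of the class
# defining all_data above; collects every image and re-splits it 80/10/10.
def _demo_all_data(meta):
    full = meta.all_data(partition_proportions=(0.8, 0.1), seed=0)
    print(full.train.data.shape, full.validation.data.shape, full.test.data.shape)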
def generate_datasets(self, rand=None, num_classes=None, num_examples=None):
    rand = dl.get_rand_state(rand)

    if not num_examples:
        num_examples = self.kwargs["num_examples"]
    if not num_classes:
        num_classes = self.kwargs["num_classes"]

    clss = self._loaded_images if self._loaded_images else self.info["classes"]

    random_classes = rand.choice(list(clss.keys()), size=(num_classes,), replace=False)
    rand_class_dict = {rnd: k for k, rnd in enumerate(random_classes)}

    _dts = []
    for ns in as_tuple_or_list(num_examples):
        classes = balanced_choice_wr(random_classes, ns, rand)
        all_images = {cls: list(clss[cls]) for cls in classes}
        data, targets, sample_info = [], [], []
        for c in classes:
            # draw one image per occurrence of the class, without replacement
            rand.shuffle(all_images[c])
            img_name = all_images[c][0]
            all_images[c].remove(img_name)
            sample_info.append({"name": img_name, "label": c})
            data.append(clss[c][img_name])
            targets.append(rand_class_dict[c])
        if self.info["one_hot_enc"]:
            targets = dl.to_one_hot_enc(targets, dimension=num_classes)
        _dts.append(
            dl.Dataset(
                data=np.stack(data),
                target=targets,
                sample_info=sample_info,
                info={"all_classes": random_classes},
            ))
    return dl.Datasets.from_list(_dts)
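# Hypothetical usage sketch: `meta` is assumed to be an instance of the class
# defining generate_datasets above; samples a 5-way episode with 5 training
# and 15 evaluation examples.
def _demo_episode(meta):
    episode = meta.generate_datasets(rand=0, num_classes=5, num_examples=(5, 15))
    print(episode.train.data.shape)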
def redivide_data(
        datasets,
        partition_proportions=None,
        shuffle=False,
        filters=None,
        maps=None,
        balance_classes=False,
        rand=None,
):
    """
    Redivides a list of datasets into new partitions. Can also be used to
    shuffle, filter, or map examples.

    :param datasets: original datasets, instances of class Dataset (works with
        get_data and get_targets for compatibility with mnist datasets)
    :param partition_proportions: (optional, default None) list of fractions
        that can either sum up to 1 or less than one, in which case one
        additional partition is created with proportion
        1 - sum(partition_proportions). If None it will retain the same
        proportion of samples found in datasets
    :param shuffle: (optional, default False) if True shuffles the examples
    :param filters: (optional, default None) filter or list of filters:
        functions with signature (data, target, sample_info, index) -> boolean
        (accept or reject the sample)
    :param maps: (optional, default None) map or list of maps: functions with
        signature (data, target, sample_info, index) ->
        (new_data, new_target, new_sample_info) (maps the old sample to a new
        one, possibly also to more than one sample, for data augmentation)
    :param balance_classes: (optional, default False) if True, draws each
        partition with balanced classes  # TODO RICCARDO
    :param rand: seed, random state, or None
    :return: a list of datasets of length equal to the (possibly augmented)
        partition_proportions
    """
    rnd = get_rand_state(rand)

    all_data = vstack([get_data(d) for d in datasets])
    all_labels = stack_or_concat([get_targets(d) for d in datasets])
    all_infos = np.concatenate([d.sample_info for d in datasets])

    N = all_data.shape[0]

    if partition_proportions:  # argument check
        partition_proportions = list([partition_proportions] if isinstance(
            partition_proportions, float) else partition_proportions)
        sum_proportions = sum(partition_proportions)
        assert sum_proportions <= 1, (
            "partition proportions must sum up to at most one: %f" % sum_proportions)
        if sum_proportions < 1.0:
            partition_proportions += [1.0 - sum_proportions]
    else:
        partition_proportions = [
            1.0 * get_data(d).shape[0] / N for d in datasets
        ]

    if shuffle:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        # TODO find a way to shuffle sparse matrices while keeping
        # compatibility with tensorflow (sk_shuffle does not work here)
        permutation = np.arange(all_data.shape[0])
        rnd.shuffle(permutation)

        all_data = all_data[permutation]
        all_labels = np.array(all_labels[permutation])
        all_infos = np.array(all_infos[permutation])

    if filters:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        filters = as_list(filters)
        data_triple = [(x, y, d)
                       for x, y, d in zip(all_data, all_labels, all_infos)]
        for _filter in filters:
            data_triple = [
                xy for i, xy in enumerate(data_triple)
                if _filter(xy[0], xy[1], xy[2], i)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    if maps:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        maps = as_list(maps)
        data_triple = [(x, y, d)
                       for x, y, d in zip(all_data, all_labels, all_infos)]
        for _map in maps:
            data_triple = [
                _map(xy[0], xy[1], xy[2], i)
                for i, xy in enumerate(data_triple)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    N = all_data.shape[0]
    assert N == all_labels.shape[0]

    # cumulative sample counts marking the partition boundaries
    calculated_partitions = reduce(
        lambda v1, v2: v1 + [v1[-1] + v2],
        [int(N * prp) for prp in partition_proportions],
        [0],
    )
    calculated_partitions[-1] = N

    print(
        "datasets.redivide_data: computed partition numbers -",
        calculated_partitions,
        "len all",
        N,
        end=" ",
    )

    new_general_info_dict = {}
    for data in datasets:
        new_general_info_dict = {**new_general_info_dict, **data.info}

    if balance_classes:
        new_datasets = []
        forbidden_indices = np.empty(0, dtype=np.int64)
        for d1, d2 in zip(calculated_partitions[:-1],
                          calculated_partitions[1:-1]):
            indices = np.array(
                get_indices_balanced_classes(d2 - d1, all_labels,
                                             forbidden_indices))
            dataset = dl.Dataset(
                data=all_data[indices],
                target=all_labels[indices],
                sample_info=all_infos[indices],
                info=new_general_info_dict,
            )
            new_datasets.append(dataset)
            forbidden_indices = np.append(forbidden_indices, indices)
            test_if_balanced(dataset)
        # the last partition takes whatever indices are left over
        remaining_indices = np.array(
            list(set(range(N)) - set(forbidden_indices)))
        new_datasets.append(
            dl.Dataset(
                data=all_data[remaining_indices],
                target=all_labels[remaining_indices],
                sample_info=all_infos[remaining_indices],
                info=new_general_info_dict,
            ))
    else:
        new_datasets = [
            dl.Dataset(
                data=all_data[d1:d2],
                target=all_labels[d1:d2],
                sample_info=all_infos[d1:d2],
                info=new_general_info_dict,
            ) for d1, d2 in zip(calculated_partitions,
                                calculated_partitions[1:])
        ]

    print("DONE")

    return new_datasets
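# Hypothetical usage sketch: re-split a single Dataset 70/30, shuffling and
# keeping only even-indexed samples; the filter signature matches the one
# documented above, (data, target, sample_info, index) -> bool.
def _demo_redivide(dataset):
    parts = redivide_data([dataset],
                          partition_proportions=(0.7,),
                          shuffle=True,
                          filters=lambda x, y, info, i: i % 2 == 0)
    print([p.data.shape[0] for p in parts])  # two partitions, roughly 70% / 30%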
def generate_datasets(self,
                      rand=None,
                      num_classes=None,
                      num_examples=None,
                      wait_for_n_min=None):
    rand = dl.get_rand_state(rand)

    if wait_for_n_min:
        import time
        # block until at least wait_for_n_min images per class are loaded
        while not self.check_loaded_images(wait_for_n_min):
            time.sleep(5)

    if not num_examples:
        num_examples = self.kwargs["num_examples"]
    if not num_classes:
        num_classes = self.kwargs["num_classes"]

    clss = self._loaded_images if self._loaded_images else self.info["classes"]

    random_classes = rand.choice(list(clss.keys()), size=(num_classes,), replace=False)
    rand_class_dict = {rnd: k for k, rnd in enumerate(random_classes)}

    _dts = []
    for ns in dl.as_tuple_or_list(num_examples):
        classes = balanced_choice_wr(random_classes, ns, rand)
        all_images = {cls: list(clss[cls]) for cls in classes}
        data, targets, sample_info = [], [], []
        for c in classes:
            rand.shuffle(all_images[c])
            img_name = all_images[c][0]
            all_images[c].remove(img_name)
            sample_info.append({"name": img_name, "label": c})
            if self._loaded_images:
                data.append(clss[c][img_name])
            else:
                # image not cached: read it from disk, resize, and rescale to [0, 1]
                from imageio import imread
                data.append(
                    np.array(
                        Image.fromarray(
                            imread(join(self.info["base_folder"], c, img_name))
                        ).resize(size=(self.info["resize"], self.info["resize"])))
                    / 255.0)
            targets.append(rand_class_dict[c])
        if self.info["one_hot_enc"]:
            targets = to_one_hot_enc(targets, dimension=num_classes)
        _dts.append(
            dl.Dataset(
                data=np.stack(data),
                target=targets,
                sample_info=sample_info,
                info={"all_classes": random_classes},
            ))
    return dl.Datasets.from_list(_dts)
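# Hypothetical usage sketch: as above, but blocks until at least 20 images per
# class have been loaded before sampling the episode; `meta` is assumed to be
# an instance of the class defining this generate_datasets.
def _demo_episode_with_wait(meta):
    episode = meta.generate_datasets(rand=0, num_classes=5,
                                     num_examples=(5, 15), wait_for_n_min=20)
    print(episode.train.data.shape)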