# Shared imports for the library helpers and tests below. The `data` pytest
# fixture and the DATA_PATH constant are defined elsewhere in the test suite.
import os

import numpy as np
import pytest
import torch
from torch.utils.data import DataLoader

from continuum.datasets import CIFAR100, H5Dataset, InMemoryDataset
from continuum.scenarios import ContinualScenario, OnlineFellowship
# Assumed import location for encode_into_dataset, the helper used by
# encode_scenario below; adjust if it lives elsewhere in the library.
from continuum.scenarios import encode_into_dataset
from continuum.tasks import TaskType, split_train_val
def test_h5dataset_add_data(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")
    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    h5dataset.add_data(x_, y_, t_)
    nb_task = len(np.unique(t_))

    scenario = ContinualScenario(h5dataset)
    assert scenario.nb_tasks == nb_task
def test_scenario():
    x = np.ones((100, 4, 4, 3), dtype=np.uint8)
    y = np.arange(100) // 5
    nb_tasks = 10
    t = np.random.randint(nb_tasks, size=100)

    dummy = InMemoryDataset(x, y, t)
    scenario = ContinualScenario(dummy)
    assert scenario.nb_tasks == nb_tasks
def test_bad_task_ids():
    x = np.ones((100, 4, 4, 3), dtype=np.uint8)
    y = np.arange(100) // 5
    nb_tasks = 10

    # A missing task id should raise an error: shifting the task indexes
    # from [0 - 9] to [1 - 10] leaves task 0 without any data.
    t = np.random.randint(nb_tasks, size=100)
    t = t + np.ones(100)

    dummy = InMemoryDataset(x, y, t)
    with pytest.raises(Exception):
        scenario = ContinualScenario(dummy)
def test_scenario_CIFAR100_Scenarios():
    dataset = CIFAR100(DATA_PATH, train=True, labels_type="category", task_labels="category")
    scenario = ContinualScenario(dataset)
    assert scenario.nb_classes == 20
    assert scenario.nb_tasks == 20

    dataset = CIFAR100(DATA_PATH, train=True, labels_type="category", task_labels="class")
    scenario = ContinualScenario(dataset)
    assert scenario.nb_classes == 20
    assert scenario.nb_tasks == 100

    dataset = CIFAR100(DATA_PATH, train=True, labels_type="class", task_labels="class")
    scenario = ContinualScenario(dataset)
    assert scenario.nb_classes == 100
    assert scenario.nb_tasks == 100

    dataset = CIFAR100(DATA_PATH, train=True, labels_type="class", task_labels="category")
    scenario = ContinualScenario(dataset)
    assert scenario.nb_classes == 100
    assert scenario.nb_tasks == 20

    dataset = CIFAR100(DATA_PATH, train=True, labels_type="category", task_labels="lifelong")
    scenario = ContinualScenario(dataset)
    assert scenario.nb_classes == 20
    assert scenario.nb_tasks == 5
def test_h5dataset_ContinualScenario(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")
    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    nb_task = len(np.unique(t_))

    scenario = ContinualScenario(h5dataset)
    assert scenario.nb_tasks == nb_task

    data_indexes = np.where(t_ == 0)[0]
    assert len(data_indexes) == len(scenario[0])
def test_h5dataset_loading(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")
    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    nb_task = len(np.unique(t_))

    scenario = ContinualScenario(h5dataset)
    for task_set in scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert scenario.nb_tasks == nb_task
def test_h5dataset_get_raw(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")
    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    nb_task = len(np.unique(t_))

    scenario = ContinualScenario(h5dataset)
    for task_set in scenario:
        indexes = np.random.randint(len(task_set), size=len(task_set) // 2)
        # np.ndarray.sort() sorts in place and returns None, so it must not
        # be passed directly to get_raw_samples.
        indexes.sort()
        _, _, _ = task_set.get_raw_samples(indexes)

        # test with no indexes
        _, _, _ = task_set.get_raw_samples()

    assert scenario.nb_tasks == nb_task
def test_create_subscenario_h5dataset(data, tmpdir):
    from continuum.scenarios import create_subscenario

    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")
    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    nb_task = len(np.unique(t_))

    scenario = ContinualScenario(h5dataset)
    sub_scenario = create_subscenario(scenario, np.arange(nb_task - 1))

    for task_set in sub_scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert sub_scenario.nb_tasks == nb_task - 1
def encode_scenario(scenario, model, batch_size, filename, inference_fct=None):
    """Encode all the data of a scenario with a model and return the result
    as a new ContinualScenario backed by an h5 dataset.

    :param scenario: scenario to encode.
    :param model: model used to encode the data.
    :param batch_size: batch size used to load the data.
    :param filename: filename for the h5 dataset.
    :param inference_fct: optional function to customize how features are
        extracted from the model.
    """
    if os.path.isfile(filename):
        raise ValueError(f"File name: {filename} already exists")

    print(f"Encoding {filename}.")
    encoded_dataset = encode_into_dataset(model, scenario, batch_size, filename, inference_fct)
    print("Encoding is done.")

    return ContinualScenario(encoded_dataset)
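# A minimal usage sketch for encode_scenario, not part of the test suite: the
# toy feature extractor and output file name are hypothetical, and it assumes
# the default inference function simply runs the model on each batch.
def example_encode_scenario(tmpdir):
    import torch.nn as nn

    # Toy frozen encoder mapping flattened 32x32 RGB images to 64 features.
    feature_extractor = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 64))
    feature_extractor.eval()

    cl_dataset = CIFAR100(data_path=DATA_PATH, download=False, train=True,
                          labels_type="category", task_labels="lifelong")
    scenario = ContinualScenario(cl_dataset)

    # encode_scenario raises a ValueError if the file already exists.
    filename = os.path.join(tmpdir, "encoded_cifar100.hdf5")
    encoded_scenario = encode_scenario(scenario, feature_extractor,
                                       batch_size=64, filename=filename)
    assert encoded_scenario.nb_tasks == scenario.nb_tasks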
def test_h5dataset_split_train_test(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")
    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    nb_task = len(np.unique(t_))

    scenario = ContinualScenario(h5dataset)
    for task_set in scenario:
        task_set_tr, task_set_val = split_train_val(task_set)

        loader_tr = DataLoader(task_set_tr)
        for _ in loader_tr:
            pass

        loader_val = DataLoader(task_set_val)
        for _ in loader_val:
            pass

    assert scenario.nb_tasks == nb_task
def test_on_array_dataset(tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_CIFAR100_h5.hdf5")
    cl_dataset = CIFAR100(data_path=DATA_PATH, download=False, train=True,
                          labels_type="category", task_labels="lifelong")

    # In practice the h5 file would be built incrementally to limit the
    # amount of data loaded at once, but here we load everything in one go.
    x, y, t = cl_dataset.get_data()
    h5dataset = H5Dataset(x, y, t, data_path=filename_h5)

    scenario = ContinualScenario(h5dataset)
    for task_set in scenario:
        loader = DataLoader(task_set, batch_size=64)
        for x, y, t in loader:
            assert x.shape == torch.Size([64, 3, 32, 32])
            break

    assert scenario.nb_tasks == 5  # number of tasks of CIFAR100Lifelong
def test_h5dataset_reloading(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")
    x_, y_, t_ = data

    # create the dataset, then destroy the object
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    del h5dataset

    # reload the dataset from the h5 file
    h5dataset_reloaded = H5Dataset(x=None, y=None, t=None, data_path=filename_h5)
    nb_task = len(np.unique(t_))

    scenario = ContinualScenario(h5dataset_reloaded)
    for task_set in scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert scenario.nb_tasks == nb_task
def test_create_subscenario_shuffle_h5dataset(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")
    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    nb_task = len(np.unique(t_))

    scenario = ContinualScenario(h5dataset)
    task_order = np.arange(nb_task)
    np.random.shuffle(task_order)
    sub_scenario = create_subscenario(scenario, task_order)

    for task_set in sub_scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert sub_scenario.nb_tasks == nb_task
def test_h5dataset_reloading_slow(tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")
    nb_tasks = 5
    cl_dataset = CIFAR100(data_path=DATA_PATH, download=False, train=True,
                          labels_type="category", task_labels="lifelong")
    x, y, t = cl_dataset.get_data()

    # create the dataset, then destroy the object
    h5dataset = H5Dataset(x, y, t, data_path=filename_h5)
    del h5dataset

    # reload the dataset from the h5 file
    h5dataset_reloaded = H5Dataset(x=None, y=None, t=None, data_path=filename_h5)

    scenario = ContinualScenario(h5dataset_reloaded)
    for task_set in scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass
    assert scenario.nb_tasks == nb_tasks

    task_order = np.arange(nb_tasks)
    sub_scenario = create_subscenario(scenario, task_order[:-1])
    assert sub_scenario.nb_tasks == nb_tasks - 1

    np.random.shuffle(task_order)
    sub_scenario = create_subscenario(scenario, task_order)
    assert sub_scenario.nb_tasks == nb_tasks
def create_subscenario(base_scenario, task_indexes):
    """Create a subscenario from a scenario's tasks, by subsampling tasks,
    reordering them, or both.

    :param base_scenario: scenario from which the subscenario will be created.
    :param task_indexes: array with the new order of tasks.
    :return: a new scenario containing the selected tasks.
    """
    if torch.is_tensor(task_indexes):
        task_indexes = task_indexes.numpy()

    if base_scenario.transformations is not None and isinstance(
            base_scenario.transformations[0], list):
        transformations = [base_scenario.transformations[i] for i in task_indexes]
    else:
        transformations = base_scenario.transformations

    sub_scenario = None
    if isinstance(base_scenario, OnlineFellowship):
        # We only want to change the order of base_scenario.cl_datasets.
        new_cl_datasets = [base_scenario.cl_datasets[i] for i in task_indexes]
        sub_scenario = OnlineFellowship(new_cl_datasets,
                                        transformations=transformations,
                                        update_labels=base_scenario.update_labels)
    elif base_scenario.cl_dataset.data_type == TaskType.H5:
        list_taskset = [base_scenario[i] for i in task_indexes]
        sub_scenario = OnlineFellowship(list_taskset,
                                        transformations=transformations,
                                        update_labels=False)
    else:
        new_x, new_y, new_t = None, None, None
        if base_scenario.cl_dataset.bounding_boxes is not None:
            raise ValueError(
                "create_subscenario is not yet compatible with scenarios that use bounding_boxes."
            )

        for i, index in enumerate(task_indexes):
            taskset = base_scenario[index]
            all_task_indexes = np.arange(len(taskset))
            x, y, t = taskset.get_raw_samples(all_task_indexes)
            # Relabel the task ids so they follow the new task order.
            t = np.ones(len(y)) * i
            if new_x is None:
                new_x = x
                new_y = y
                new_t = t
            else:
                new_x = np.concatenate([new_x, x], axis=0)
                new_y = np.concatenate([new_y, y], axis=0)
                new_t = np.concatenate([new_t, t], axis=0)

        dataset = InMemoryDataset(new_x, new_y, new_t,
                                  data_type=base_scenario.cl_dataset.data_type)
        sub_scenario = ContinualScenario(dataset, transformations=transformations)

    return sub_scenario
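# A minimal usage sketch for create_subscenario, not part of the test suite,
# using synthetic in-memory data: the tasks are reversed here; passing a
# shorter index array would subsample them instead.
def example_create_subscenario():
    x = np.ones((100, 4, 4, 3), dtype=np.uint8)
    y = np.arange(100) // 10
    t = y.copy()  # ten tasks, one class each

    scenario = ContinualScenario(InMemoryDataset(x, y, t))
    reversed_scenario = create_subscenario(scenario, np.arange(10)[::-1])
    assert reversed_scenario.nb_tasks == 10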