def test_active_learning_mixin():
    """After labelling 10 items, the Lightning mixin's pool loader yields 2 batches."""
    hparams = None
    base_dataset = DummyDataset()
    active_set = ActiveLearningDataset(base_dataset)
    active_set.label_randomly(10)
    model = DummyPytorchLightning(active_set, hparams)
    assert len(model.pool_loader()) == 2
def main(hparams):
    """Run an active-learning loop: train VGG16 on CIFAR10, labelling new data each step."""
    train_transform = transforms.Compose(
        [transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    test_transform = transforms.Compose([transforms.ToTensor()])
    # Pool items are read with the deterministic test transform, not the train augmentation.
    active_set = ActiveLearningDataset(
        CIFAR10(hparams.data_root, train=True, transform=train_transform, download=True),
        pool_specifics={'transform': test_transform})
    active_set.label_randomly(10)
    heuristic = BALD()
    model = VGG16(active_set, hparams)
    backend = 'dp' if hparams.n_gpus > 1 else None
    trainer = BaalTrainer(
        max_epochs=3,
        default_root_dir=hparams.data_root,
        gpus=hparams.n_gpus,
        distributed_backend=backend,
        # The weights of the model will change as it gets trained; we need to
        # keep a copy (deepcopy) so that we can reset them.
        callbacks=[ResetCallback(copy.deepcopy(model.state_dict()))])
    loop = ActiveLearningLoop(active_set,
                              get_probabilities=trainer.predict_on_dataset_generator,
                              heuristic=heuristic,
                              ndata_to_label=hparams.query_size)
    AL_STEPS = 100
    for al_step in range(AL_STEPS):
        print(f'Step {al_step} Dataset size {len(active_set)}')
        trainer.fit(model)
        # loop.step() labels new data; it returns False when the pool is exhausted.
        if not loop.step():
            break
def get_datasets(initial_pool):
    """Return (active CIFAR10 training set with `initial_pool` items labelled, test set)."""
    train_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(30),
        transforms.ToTensor(),
        transforms.Normalize(3 * [0.5], 3 * [0.5]),
    ])
    test_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(3 * [0.5], 3 * [0.5]),
    ])
    # Note: We use the test set here as an example. You should make your own validation set.
    train_ds = datasets.CIFAR10(".", train=True, transform=train_transform,
                                target_transform=None, download=True)
    test_set = datasets.CIFAR10(".", train=False, transform=test_transform,
                                target_transform=None, download=True)
    # The pool is read with the deterministic test transform.
    active_set = ActiveLearningDataset(train_ds,
                                       pool_specifics={"transform": test_transform})
    # We start labeling randomly.
    active_set.label_randomly(initial_pool)
    return active_set, test_set
def test_len(self):
    """len / n_labelled / n_unlabelled / pool stay consistent while labelling."""
    assert len(self.dataset) == 0
    assert self.dataset.n_unlabelled == 100
    assert len(self.dataset.pool) == 100
    self.dataset.label(0)
    assert len(self.dataset) == self.dataset.n_labelled == 1
    assert self.dataset.n_unlabelled == 99
    assert len(self.dataset.pool) == 99
    self.dataset.label(list(range(99)))
    assert len(self.dataset) == 100
    assert self.dataset.n_unlabelled == 0
    assert len(self.dataset.pool) == 0
    # A boolean mask can seed the labelled state of a fresh dataset.
    mask_ds = ActiveLearningDataset(MyDataset(),
                                    labelled=self.dataset._labelled,
                                    make_unlabelled=lambda x: (x[0], -1))
    assert len(mask_ds) == len(self.dataset)
    assert len(mask_ds.pool) == len(self.dataset.pool)
    # A float tensor mask must work as well.
    float_mask = torch.from_numpy(self.dataset._labelled.astype(np.float32))
    tensor_ds = ActiveLearningDataset(MyDataset(),
                                      labelled=float_mask,
                                      make_unlabelled=lambda x: (x[0], -1))
    assert len(tensor_ds) == len(self.dataset)
    assert len(tensor_ds.pool) == len(self.dataset.pool)
class SSLModuleTest(unittest.TestCase):
    """The SSL module should consume labeled and unlabeled data in equal amounts."""

    def setUp(self):
        n_labeled, n_unlabeled = 100, 1000
        labeled_split = SSLTestDataset(labeled=True, length=n_labeled)
        unlabeled_split = SSLTestDataset(labeled=False, length=n_unlabeled)
        combined = ConcatDataset([labeled_split, unlabeled_split])
        print(len(combined))
        self.al_dataset = ActiveLearningDataset(combined)
        # Label data from d1 (even numbers).
        self.al_dataset.label(list(range(n_labeled)))

    def test_epoch(self):
        hparams = {'p': None, 'num_steps': None, 'batch_size': 10, 'workers': 0}
        module = TestSSLModule(self.al_dataset, Namespace(**hparams))
        trainer = Trainer(max_epochs=1,
                          num_sanity_val_steps=0,
                          progress_bar_refresh_rate=0,
                          logger=False,
                          checkpoint_callback=False)
        trainer.fit(module)
        # Balanced consumption: labeled items are even, unlabeled items are odd.
        assert len(module.labeled_data) == len(module.unlabeled_data)
        assert torch.all(torch.tensor(module.labeled_data) % 2 == 0)
        assert torch.all(torch.tensor(module.unlabeled_data) % 2 != 0)
def test_arrowds():
    """ActiveLearningDataset should wrap a HuggingFace arrow dataset (GLUE sst2 test split)."""
    dataset = HFdata.load_dataset('glue', 'sst2')['test']
    dataset = ActiveLearningDataset(dataset)
    dataset.label(np.arange(10))
    assert len(dataset) == 10
    assert len(dataset.pool) == 1811
    data = dataset.pool[0]
    # Bug fix: the original `assert [k in [...] for k, v in data.items()]` asserted a
    # non-empty list, which is always truthy. Assert every key individually instead.
    assert all(k in ['idx', 'label', 'sentence'] for k in data)
def test_calibration_integration():
    # Integration test: calibration must leave the wrapped model's weights untouched
    # while producing a larger, calibrated model.
    transform_pipeline = Compose([Resize((64, 64)), ToTensor()])
    cifar10_train = DummyDataset(transform_pipeline)
    cifar10_test = DummyDataset(transform_pipeline)
    # we don't create different trainset for calibration since the goal is not
    # to calibrate
    al_dataset = ActiveLearningDataset(cifar10_train,
                                       pool_specifics={'transform': transform_pipeline})
    al_dataset.label_randomly(10)
    use_cuda = False
    model = vgg.vgg16(pretrained=False, num_classes=10)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9,
                          weight_decay=0.0005)
    wrapper = ModelWrapper(model, criterion)
    calibrator = DirichletCalibrator(wrapper=wrapper, num_classes=10,
                                     lr=0.001, reg_factor=0.01)
    for step in range(2):
        wrapper.train_on_dataset(al_dataset, optimizer=optimizer, batch_size=10,
                                 epoch=1, use_cuda=use_cuda, workers=0)
        wrapper.test_on_dataset(cifar10_test, batch_size=10, use_cuda=use_cuda,
                                workers=0)
        # Snapshot the wrapped model's parameters before calibrating.
        before_calib_param = list(map(lambda x: x.clone(),
                                      wrapper.model.parameters()))
        calibrator.calibrate(al_dataset, cifar10_test, batch_size=10, epoch=5,
                             use_cuda=use_cuda, double_fit=False, workers=0)
        after_calib_param = list(map(lambda x: x.clone(), model.parameters()))
        # Calibration must not modify the original model's weights.
        assert all([np.allclose(i.detach(), j.detach())
                    for i, j in zip(before_calib_param, after_calib_param)])
        # The calibrated model wraps the original, so it has strictly more modules.
        assert len(list(wrapper.model.modules())) < len(
            list(calibrator.calibrated_model.modules()))
def test_integration():
    # End-to-end active-learning loop: train, label new data, and verify that the
    # model parameters actually change each step until the pool is exhausted.
    transform_pipeline = Compose([Resize((64, 64)), ToTensor()])
    cifar10_train = DummyDataset(transform_pipeline)
    cifar10_test = DummyDataset(transform_pipeline)
    al_dataset = ActiveLearningDataset(cifar10_train,
                                       pool_specifics={'transform': transform_pipeline})
    al_dataset.label_randomly(10)
    use_cuda = False
    model = vgg.vgg16(pretrained=False, num_classes=10)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9,
                          weight_decay=0.0005)
    # We can now use BaaL to create the active learning loop.
    model = ModelWrapper(model, criterion)
    # We create an ActiveLearningLoop that will automatically label the most uncertain samples.
    # In this case, we use the widely used BALD heuristic.
    active_loop = ActiveLearningLoop(al_dataset,
                                     model.predict_on_dataset,
                                     heuristic=heuristics.BALD(),
                                     ndata_to_label=10,
                                     batch_size=10,
                                     iterations=10,
                                     use_cuda=use_cuda,
                                     workers=4)
    # We're all set!
    num_steps = 10
    for step in range(num_steps):
        # Snapshot parameters so we can verify that training changed them.
        old_param = list(map(lambda x: x.clone(), model.model.parameters()))
        model.train_on_dataset(al_dataset, optimizer=optimizer, batch_size=10,
                               epoch=5, use_cuda=use_cuda, workers=2)
        model.test_on_dataset(cifar10_test, batch_size=10, use_cuda=use_cuda,
                              workers=2)
        # step() returns False when there is nothing left to label.
        if not active_loop.step():
            break
        new_param = list(map(lambda x: x.clone(), model.model.parameters()))
        assert any([not np.allclose(i.detach(), j.detach())
                    for i, j in zip(old_param, new_param)])
    assert step == 4  # 10 + (4 * 10) = 50, so it stops at iterations 4
def setUp(self):
    """Concatenate a labeled (100) and an unlabeled (1000) split; label the first 100."""
    labeled_count, unlabeled_count = 100, 1000
    labeled_split = SSLTestDataset(labeled=True, length=labeled_count)
    unlabeled_split = SSLTestDataset(labeled=False, length=unlabeled_count)
    combined = ConcatDataset([labeled_split, unlabeled_split])
    print(len(combined))
    self.al_dataset = ActiveLearningDataset(combined)
    # Label data from d1 (even numbers).
    self.al_dataset.label(list(range(labeled_count)))
def test_sad(max_sample, expected):
    """`max_sample` caps how many new items one loop step may label."""
    dataset = ActiveLearningDataset(MyDataset(), make_unlabelled=lambda x: -1)
    loop = ActiveLearningLoop(dataset,
                              get_probs_iter,
                              heuristics.Random(),
                              max_sample=max_sample,
                              query_size=10,
                              dummy_param=1)
    dataset.label_randomly(10)
    loop.step()
    assert len(dataset) == 10 + expected
def test_transform(self):
    """Labelled items use the train transform (→1); pool items the eval transform (→0)."""
    train_transform = Lambda(lambda k: 1)
    test_transform = Lambda(lambda k: 0)
    dataset = ActiveLearningDataset(MyDataset(train_transform),
                                    test_transform,
                                    make_unlabelled=lambda x: (x[0], -1))
    dataset.label(np.arange(10))
    pool = dataset.pool
    assert np.equal(list(pool), [(0, -1) for _ in np.arange(10, 100)]).all()
    assert np.equal(list(dataset), [(1, i) for i in np.arange(10)]).all()
def test_transform(self):
    """`pool_specifics` overrides the pool transform; unknown keys raise ValueError."""
    train_transform = Lambda(lambda k: 1)
    test_transform = Lambda(lambda k: 0)
    dataset = ActiveLearningDataset(MyDataset(train_transform),
                                    make_unlabelled=lambda x: (x[0], -1),
                                    pool_specifics={'transform': test_transform})
    dataset.label(np.arange(10))
    pool = dataset.pool
    assert np.equal(list(pool), [(0, -1) for _ in np.arange(10, 100)]).all()
    assert np.equal(list(dataset), [(1, i) for i in np.arange(10)]).all()
    # Accessing the pool with an attribute that does not exist must fail.
    with pytest.raises(ValueError):
        ActiveLearningDataset(MyDataset(train_transform),
                              pool_specifics={'whatever': 123}).pool
def test_on_load_checkpoint():
    """A checkpoint stores the active dataset and restores its labelling state."""
    hparams = None
    dataset = DummyDataset()
    active_set = ActiveLearningDataset(dataset)
    active_set.label_randomly(10)
    model = DummyPytorchLightning(active_set, hparams)
    ckpt = {}
    model.on_save_checkpoint(ckpt)
    assert 'active_dataset' in ckpt
    # Load into a fresh (unlabelled) active set and check the sizes match.
    restored_set = ActiveLearningDataset(dataset)
    restored_model = DummyPytorchLightning(restored_set, hparams)
    restored_model.on_load_checkpoint(ckpt)
    assert len(active_set) == len(restored_set)
def setUp(self):
    """Build a FileDataset where items with a label != -1 start out labelled."""
    self.lbls = None
    self.transform = Compose([Resize(60), RandomRotation(90), ToTensor()])
    eval_transform = Compose([Resize(32), ToTensor()])
    self.dataset = FileDataset(self.paths, self.lbls, transform=self.transform)
    self.lbls = self.generate_labels(len(self.paths), 10)
    self.dataset = FileDataset(self.paths, self.lbls, transform=self.transform)
    # Items already carrying a real label (!= -1) are considered labelled;
    # the pool is read with the deterministic eval transform.
    self.active = ActiveLearningDataset(self.dataset,
                                        labelled=(np.array(self.lbls) != -1),
                                        pool_specifics={'transform': eval_transform})
def active_pascal(
    path="/tmp",
    *args,
    transform=transforms.ToTensor(),
    test_transform=transforms.ToTensor(),
    **kwargs,
):
    """Get active Pascal-VOC 2012 datasets.

    Arguments:
        path : str
            The root folder for the Pascal dataset

    Returns:
        ActiveLearningDataset
            the active learning dataset, training data
        Dataset
            the evaluation dataset
    """
    # NOTE(review): download=False assumes the dataset is already on disk at `path`.
    return (
        ActiveLearningDataset(
            datasets.VOCSegmentation(path, image_set='train', transform=transform,
                                     download=False, *args, **kwargs)),
        datasets.VOCSegmentation(path, image_set='val', transform=test_transform,
                                 download=False, *args, **kwargs),
    )
def test_should_stop_iter(heur):
    """The loop signals a stop on its 10th step (pool exhausted at query_size=10)."""
    dataset = ActiveLearningDataset(MyDataset(), make_unlabelled=lambda x: -1)
    loop = ActiveLearningLoop(dataset, get_probs_iter, heur,
                              query_size=10, dummy_param=1)
    dataset.label_randomly(10)
    steps_taken = 0
    for _ in range(15):
        steps_taken += 1
        if not loop.step():
            break
    assert steps_taken == 10
def test_on_load_checkpoint(a_data_module, a_dataset, hparams):
    """A fresh data module restores the labelling state from a saved checkpoint."""
    ckpt = {}
    a_data_module.on_save_checkpoint(ckpt)
    assert 'active_dataset' in ckpt
    restored_set = ActiveLearningDataset(a_dataset)
    data_module_2 = MyDataModule(restored_set, hparams['batch_size'])
    data_module_2.on_load_checkpoint(ckpt)
    assert len(a_data_module.active_dataset) == len(restored_set)
def setUp(self):
    """Label the first 100 items (d1) and build a semi-supervised iterator."""
    labeled_len, unlabeled_len = 100, 1000
    d1 = SSLTestDataset(labeled=True, length=labeled_len)
    d2 = SSLTestDataset(labeled=False, length=unlabeled_len)
    combined = ConcatDataset([d1, d2])
    print(len(combined))
    self.al_dataset = ActiveLearningDataset(combined)
    # Label data from d1 (even numbers).
    self.al_dataset.label(list(range(labeled_len)))
    self.ss_iterator = SemiSupervisedIterator(self.al_dataset,
                                              p=None,
                                              num_steps=None,
                                              batch_size=10)
def test_labelled_map():
    """labelled_map records at which AL step each item was labelled."""
    ds = ActiveLearningDataset(MyDataset())
    assert ds.current_al_step == 0
    ds.label_randomly(10)
    assert ds.current_al_step == 1
    ds.label_randomly(10)
    # The map holds step numbers; non-zero entries mean "labelled".
    assert ds.labelled_map.max() == 2
    assert np.equal(ds.labelled, ds.labelled_map > 0).all()
    # Restoring from a state dict preserves the step counter.
    state = ds.state_dict()
    restored = ActiveLearningDataset(MyDataset(), labelled=state["labelled"])
    assert restored.current_al_step == ds.current_al_step
def test_pl_step():
    """BaalTrainer.step should label exactly `query_size` new items."""
    hparams = HParams()
    dataset = DummyDataset()
    active_set = ActiveLearningDataset(dataset)
    active_set.label_randomly(10)
    model = DummyPytorchLightning(active_set, hparams)
    ckpt = {}
    save_chkp = model.on_save_checkpoint(ckpt)
    trainer = BaalTrainer(dataset=active_set,
                          max_epochs=3,
                          default_root_dir='/tmp',
                          ndata_to_label=hparams.query_size,
                          callbacks=[ResetCallback(copy.deepcopy(save_chkp))])
    trainer.model = model
    size_before = len(active_set)
    trainer.step()
    assert len(active_set) - size_before == hparams.query_size
def test_file_saving(tmpdir):
    """Each loop step dumps one pickle describing pool uncertainty and labelling state.

    Fix: the original `pickle.load(open(file, 'rb'))` never closed the file
    handle; use a `with` block instead.
    """
    tmpdir = str(tmpdir)
    heur = heuristics.BALD()
    ds = MyDataset()
    dataset = ActiveLearningDataset(ds, make_unlabelled=lambda x: -1)
    active_loop = ActiveLearningLoop(dataset,
                                     get_probs_iter,
                                     heur,
                                     uncertainty_folder=tmpdir,
                                     query_size=10,
                                     dummy_param=1)
    dataset.label_randomly(10)
    _ = active_loop.step()
    assert len(os.listdir(tmpdir)) == 1
    file = pjoin(tmpdir, os.listdir(tmpdir)[0])
    # The file name encodes the pool/labelled sizes at dump time.
    assert "pool=90" in file and "labelled=10" in file
    with open(file, 'rb') as fd:
        data = pickle.load(fd)
    assert len(data['uncertainty']) == 90
    # The diff between the current state and the step before is the newly labelled item.
    assert (data['dataset']['labelled'] != dataset.labelled).sum() == 10
def test_deprecation():
    """`ndata_to_label` is deprecated in favour of `query_size` and must warn."""
    heur = heuristics.BALD()
    dataset = ActiveLearningDataset(MyDataset(), make_unlabelled=lambda x: -1)
    with warnings.catch_warnings(record=True) as caught:
        ActiveLearningLoop(dataset,
                           get_probs_iter,
                           heur,
                           ndata_to_label=10,
                           dummy_param=1)
        assert issubclass(caught[-1].category, DeprecationWarning)
        assert "ndata_to_label" in str(caught[-1].message)
def __init__(self, data_root, batch_size):
    """Data module wrapping an actively-labelled CIFAR10 training set and its test set."""
    train_transform = transforms.Compose(
        [transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    test_transform = transforms.Compose([transforms.ToTensor()])
    # The pool is read with the deterministic test transform.
    active_set = ActiveLearningDataset(
        CIFAR10(data_root, train=True, transform=train_transform, download=True),
        pool_specifics={'transform': test_transform})
    self.test_set = CIFAR10(data_root, train=False,
                            transform=test_transform, download=True)
    super().__init__(active_dataset=active_set,
                     batch_size=batch_size,
                     train_transforms=train_transform,
                     test_transforms=test_transform)
def test_label_randomly_full(self):
    """Randomly labelling every remaining item empties the pool."""
    ds = ActiveLearningDataset(MyDataset())
    ds.label_randomly(99)
    assert ds.n_unlabelled == 1
    assert len(ds.pool) == 1
    ds.label_randomly(1)
    assert ds.n_unlabelled == 0
    assert ds.n_labelled == 100
def test_last_active_step():
    """With last_active_steps=1, iteration covers only the most recent labelling step."""
    ds = ActiveLearningDataset(MyDataset(), last_active_steps=1)
    assert len(ds) == 0
    ds.label_randomly(10)
    assert len(ds) == 10
    ds.label_randomly(10)
    # We only iterate over the items labelled at step 2.
    assert len(ds) == 10
    assert all(ds.labelled_map[x] == 2 for x, _ in ds)
def test_no_pool(self):
    """With everything labelled, the iterator yields labeled batches exclusively."""
    full_ds = SSLTestDataset(labeled=True, length=100)
    al_dataset = ActiveLearningDataset(full_ds)
    al_dataset.label_randomly(100)
    ss_iterator = SemiSupervisedIterator(al_dataset, p=0.1,
                                         num_steps=None, batch_size=10)
    labeled_data, unlabeled_data = [], []
    for batch in ss_iterator:
        # Route every batch into the bucket matching its labelled flag.
        bucket = labeled_data if SemiSupervisedIterator.is_labeled(batch) else unlabeled_data
        bucket.extend(SemiSupervisedIterator.get_batch(batch))
    total = len(labeled_data) + len(unlabeled_data)
    assert len(labeled_data) / total == 1
    assert len(unlabeled_data) / total == 0
def setUp(self):
    """Pre-label items whose generated label is not -1; evaluation uses a fixed transform."""
    self.lbls = None
    self.transform = Compose([Resize(60), RandomRotation(90), ToTensor()])
    eval_tf = Compose([Resize(32), ToTensor()])
    self.dataset = FileDataset(self.paths, self.lbls, transform=self.transform)
    self.lbls = self.generate_labels(len(self.paths), 10)
    self.dataset = FileDataset(self.paths, self.lbls, transform=self.transform)
    # uint8 tensor mask: 1 where a real label (!= -1) exists.
    initially_labelled = torch.from_numpy(
        (np.array(self.lbls) != -1).astype(np.uint8))
    self.active = ActiveLearningDataset(self.dataset,
                                        eval_transform=eval_tf,
                                        labelled=initially_labelled)
def test_warning_raised_on_label():
    """Wrapping a dataset that already exposes a `label` attribute must warn.

    Fix: the inner dataset's `__getitem__` read `self.y`, which is never
    defined (it would raise AttributeError if ever called); it now returns
    `self.label[item]`.
    """
    class DS(Dataset):
        def __init__(self):
            self.x = [1, 2, 3]
            # `label` as an attribute shadows the labelling API and triggers the warning.
            self.label = [1, 1, 1]

        def __len__(self):
            return len(self.x)

        def __getitem__(self, item):
            return self.x[item], self.label[item]

    with warnings.catch_warnings(record=True) as w:
        al = ActiveLearningDataset(DS())
        assert not al.can_label
        assert len(w) == 1
        assert "label" in str(w[-1].message)
def active_huggingface_dataset(dataset,
                               tokenizer=None,
                               target_key: str = "label",
                               input_key: str = "sentence",
                               max_seq_len: int = 128,
                               **kwargs):
    """Wrap a huggingface dataset with baal.active.ActiveLearningDataset.

    Args:
        dataset (torch.utils.data.Dataset): a dataset provided by huggingface.
        tokenizer (transformers.PreTrainedTokenizer): a tokenizer provided by huggingface.
        target_key (str): target key used in the dataset's dictionary.
        input_key (str): input key used in the dataset's dictionary.
        max_seq_len (int): max length of a sequence to be used for padding the shorter sequences.
        kwargs (Dict): Parameters forwarded to 'ActiveLearningDataset'.

    Returns:
        an baal.active.ActiveLearningDataset object.
    """
    wrapped = HuggingFaceDatasets(dataset, tokenizer, target_key,
                                  input_key, max_seq_len)
    return ActiveLearningDataset(wrapped, **kwargs)
def get_dataset(split, dataset_dict):
    """Build the Synbols dataset for `split`; the train split becomes an AL dataset."""
    if dataset_dict["name"] == "active_learning":
        transform = tt.Compose([tt.ToPILImage(), tt.ToTensor()])
        # Fall back to task-specific class counts when n_classes is absent.
        n_classes = dataset_dict.get('n_classes')
        n_classes = n_classes or (52 if dataset_dict['task'] == 'char' else 1002)
        path = get_data_path_or_download(dataset_dict["path"], DATA_ROOT)
        dataset = AleatoricSynbols(path=path,
                                   split=split,
                                   key=dataset_dict["task"],
                                   transform=transform,
                                   p=dataset_dict.get('p', 0.0),
                                   seed=dataset_dict.get('seed', 666),
                                   n_classes=n_classes, )
        if split == 'train':
            # Make an AL dataset and label randomly.
            dataset = ActiveLearningDataset(dataset,
                                            pool_specifics={'transform': transform})
            dataset.label_randomly(dataset_dict['initial_pool'])
        return dataset
    else:
        raise ValueError("Dataset %s not found" % dataset_dict["name"])