def get_datasets(initial_pool):
    """Build the CIFAR10 active-learning train set and the held-out test set.

    Args:
        initial_pool: number of samples to label randomly before training starts.

    Returns:
        Tuple of (ActiveLearningDataset, test Dataset).
    """
    norm_mean, norm_std = 3 * [0.5], 3 * [0.5]
    train_tfm = transforms.Compose(
        [
            transforms.Resize((224, 224)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(30),
            transforms.ToTensor(),
            transforms.Normalize(norm_mean, norm_std),
        ]
    )
    eval_tfm = transforms.Compose(
        [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(norm_mean, norm_std),
        ]
    )
    # Note: We use the test set here as an example. You should make your own validation set.
    train_ds = datasets.CIFAR10(
        ".", train=True, transform=train_tfm, target_transform=None, download=True
    )
    test_set = datasets.CIFAR10(
        ".", train=False, transform=eval_tfm, target_transform=None, download=True
    )
    # The unlabelled pool is served with the deterministic eval transform.
    active_set = ActiveLearningDataset(train_ds, pool_specifics={"transform": eval_tfm})
    # We start labeling randomly.
    active_set.label_randomly(initial_pool)
    return active_set, test_set
def main(hparams):
    """Run an active-learning experiment on CIFAR10 using the BALD heuristic."""
    train_transform = transforms.Compose(
        [transforms.RandomHorizontalFlip(), transforms.ToTensor()]
    )
    test_transform = transforms.Compose([transforms.ToTensor()])
    active_set = ActiveLearningDataset(
        CIFAR10(hparams.data_root, train=True, transform=train_transform, download=True),
        pool_specifics={'transform': test_transform},
    )
    active_set.label_randomly(10)

    heuristic = BALD()
    model = VGG16(active_set, hparams)
    backend = 'dp' if hparams.n_gpus > 1 else None
    # The weights of the model will change as it gets trained; we need to keep
    # a copy (deepcopy) so that we can reset them.
    trainer = BaalTrainer(
        max_epochs=3,
        default_root_dir=hparams.data_root,
        gpus=hparams.n_gpus,
        distributed_backend=backend,
        callbacks=[ResetCallback(copy.deepcopy(model.state_dict()))],
    )
    loop = ActiveLearningLoop(
        active_set,
        get_probabilities=trainer.predict_on_dataset_generator,
        heuristic=heuristic,
        ndata_to_label=hparams.query_size,
    )

    AL_STEPS = 100
    for al_step in range(AL_STEPS):
        print(f'Step {al_step} Dataset size {len(active_set)}')
        trainer.fit(model)
        # loop.step() returns False once the pool is exhausted.
        if not loop.step():
            break
def test_active_learning_mixin():
    """The Lightning mixin exposes a DataLoader over the unlabelled pool."""
    hparams = None
    active_set = ActiveLearningDataset(DummyDataset())
    active_set.label_randomly(10)
    model = DummyPytorchLightning(active_set, hparams)
    assert len(model.pool_loader()) == 2
def test_label_randomly_full(self):
    """Labelling the entire dataset leaves an empty pool."""
    ds = ActiveLearningDataset(MyDataset())
    # Label all but one sample.
    ds.label_randomly(99)
    assert ds.n_unlabelled == 1
    assert len(ds.pool) == 1
    # Labelling the last remaining sample empties the pool.
    ds.label_randomly(1)
    assert ds.n_unlabelled == 0
    assert ds.n_labelled == 100
def test_calibration_integration():
    """Calibration must not modify the wrapped model; it fits an extended copy."""
    pipeline = Compose([Resize((64, 64)), ToTensor()])
    train_set = DummyDataset(pipeline)
    test_set = DummyDataset(pipeline)
    # we don't create different trainset for calibration since the goal is not
    # to calibrate
    al_dataset = ActiveLearningDataset(train_set, pool_specifics={'transform': pipeline})
    al_dataset.label_randomly(10)
    use_cuda = False

    model = vgg.vgg16(pretrained=False, num_classes=10)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0005)
    wrapper = ModelWrapper(model, criterion)
    calibrator = DirichletCalibrator(wrapper=wrapper, num_classes=10, lr=0.001,
                                     reg_factor=0.01)

    for _ in range(2):
        wrapper.train_on_dataset(al_dataset, optimizer=optimizer, batch_size=10,
                                 epoch=1, use_cuda=use_cuda, workers=0)
        wrapper.test_on_dataset(test_set, batch_size=10, use_cuda=use_cuda, workers=0)

    before_calib_param = [p.clone() for p in wrapper.model.parameters()]
    calibrator.calibrate(al_dataset, test_set, batch_size=10, epoch=5,
                         use_cuda=use_cuda, double_fit=False, workers=0)
    after_calib_param = [p.clone() for p in model.parameters()]

    # The original weights are untouched by calibration.
    assert all(
        np.allclose(i.detach(), j.detach())
        for i, j in zip(before_calib_param, after_calib_param)
    )
    # The calibrated copy gained extra modules on top of the wrapped model.
    assert len(list(wrapper.model.modules())) < len(
        list(calibrator.calibrated_model.modules()))
def test_last_active_step():
    """With last_active_steps=1, iteration covers only the latest labelling step."""
    ds = ActiveLearningDataset(MyDataset(), last_active_steps=1)
    assert len(ds) == 0
    ds.label_randomly(10)
    assert len(ds) == 10
    ds.label_randomly(10)
    # We only iterate over the items labelled at step 2.
    assert len(ds) == 10
    assert all(ds.labelled_map[idx] == 2 for idx, _ in ds)
def test_integration():
    """End-to-end: BALD loop trains, labels 10/step, and stops when max is hit."""
    pipeline = Compose([Resize((64, 64)), ToTensor()])
    cifar10_train = DummyDataset(pipeline)
    cifar10_test = DummyDataset(pipeline)
    al_dataset = ActiveLearningDataset(cifar10_train,
                                       pool_specifics={'transform': pipeline})
    al_dataset.label_randomly(10)
    use_cuda = False

    net = vgg.vgg16(pretrained=False, num_classes=10)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0005)

    # We can now use BaaL to create the active learning loop.
    model = ModelWrapper(net, criterion)
    # We create an ActiveLearningLoop that will automatically label the most uncertain samples.
    # In this case, we use the widely used BALD heuristic.
    active_loop = ActiveLearningLoop(
        al_dataset,
        model.predict_on_dataset,
        heuristic=heuristics.BALD(),
        ndata_to_label=10,
        batch_size=10,
        iterations=10,
        use_cuda=use_cuda,
        workers=4,
    )

    # We're all set!
    num_steps = 10
    for step in range(num_steps):
        old_param = [p.clone() for p in model.model.parameters()]
        model.train_on_dataset(al_dataset, optimizer=optimizer, batch_size=10,
                               epoch=5, use_cuda=use_cuda, workers=2)
        model.test_on_dataset(cifar10_test, batch_size=10, use_cuda=use_cuda, workers=2)
        if not active_loop.step():
            break
        new_param = [p.clone() for p in model.model.parameters()]
        # Training must have modified at least one weight.
        assert any(
            not np.allclose(i.detach(), j.detach())
            for i, j in zip(old_param, new_param)
        )
    assert step == 4  # 10 + (4 * 10) = 50, so it stops at iterations 4
def test_sad(max_sample, expected):
    """One loop step labels `expected` items given the `max_sample` cap."""
    dataset = ActiveLearningDataset(MyDataset(), make_unlabelled=lambda x: -1)
    loop = ActiveLearningLoop(
        dataset,
        get_probs_iter,
        heuristics.Random(),
        max_sample=max_sample,
        query_size=10,
        dummy_param=1,
    )
    dataset.label_randomly(10)
    loop.step()
    assert len(dataset) == 10 + expected
def test_labelled_map():
    """labelled_map records the AL step at which each item was labelled."""
    ds = ActiveLearningDataset(MyDataset())
    assert ds.current_al_step == 0
    ds.label_randomly(10)
    assert ds.current_al_step == 1
    ds.label_randomly(10)
    # Non-zero entries in labelled_map are exactly the labelled items.
    assert ds.labelled_map.max() == 2
    assert np.equal(ds.labelled, ds.labelled_map > 0).all()

    # Restoring from a state dict preserves the step counter.
    state = ds.state_dict()
    restored = ActiveLearningDataset(MyDataset(), labelled=state["labelled"])
    assert restored.current_al_step == ds.current_al_step
def test_on_load_checkpoint():
    """Loading a checkpoint restores the active dataset's labelling state."""
    hparams = None
    dataset = DummyDataset()
    active_set = ActiveLearningDataset(dataset)
    active_set.label_randomly(10)
    model = DummyPytorchLightning(active_set, hparams)

    ckpt = {}
    model.on_save_checkpoint(ckpt)
    assert 'active_dataset' in ckpt

    # A fresh model/dataset pair picks up the saved labelling.
    fresh_set = ActiveLearningDataset(dataset)
    fresh_model = DummyPytorchLightning(fresh_set, hparams)
    fresh_model.on_load_checkpoint(ckpt)
    assert len(active_set) == len(fresh_set)
def test_should_stop_iter(heur):
    """The loop signals completion once the pool is exhausted."""
    dataset = ActiveLearningDataset(MyDataset(), make_unlabelled=lambda x: -1)
    loop = ActiveLearningLoop(dataset, get_probs_iter, heur,
                              query_size=10, dummy_param=1)
    dataset.label_randomly(10)

    steps_taken = 0
    for _ in range(15):
        steps_taken += 1
        if not loop.step():
            break
    # 10 labelled up-front, 10 per query over 100 items -> stops after 10 steps.
    assert steps_taken == 10
def test_pl_step():
    """A single BaalTrainer step labels exactly `query_size` new samples."""
    hparams = HParams()
    active_set = ActiveLearningDataset(DummyDataset())
    active_set.label_randomly(10)
    model = DummyPytorchLightning(active_set, hparams)
    ckpt = {}
    save_chkp = model.on_save_checkpoint(ckpt)
    trainer = BaalTrainer(
        dataset=active_set,
        max_epochs=3,
        default_root_dir='/tmp',
        ndata_to_label=hparams.query_size,
        callbacks=[ResetCallback(copy.deepcopy(save_chkp))],
    )
    trainer.model = model

    size_before = len(active_set)
    trainer.step()
    size_after = len(active_set)
    assert size_after - size_before == hparams.query_size
def test_file_saving(tmpdir):
    """Each loop step dumps one pickle with uncertainties and the dataset state."""
    tmpdir = str(tmpdir)
    heur = heuristics.BALD()
    dataset = ActiveLearningDataset(MyDataset(), make_unlabelled=lambda x: -1)
    loop = ActiveLearningLoop(
        dataset,
        get_probs_iter,
        heur,
        uncertainty_folder=tmpdir,
        query_size=10,
        dummy_param=1,
    )
    dataset.label_randomly(10)
    loop.step()

    # Exactly one uncertainty file is written per step, named after the sizes.
    entries = os.listdir(tmpdir)
    assert len(entries) == 1
    path = pjoin(tmpdir, entries[0])
    assert "pool=90" in path and "labelled=10" in path
    with open(path, 'rb') as fd:
        data = pickle.load(fd)
    assert len(data['uncertainty']) == 90
    # The diff between the current state and the step before is the newly labelled item.
    assert (data['dataset']['labelled'] != dataset.labelled).sum() == 10
def test_predict():
    """predict_on_dataset covers the pool; checkpoints roll the dataset back."""
    hparams = HParams()
    active_set = ActiveLearningDataset(DummyDataset())
    active_set.label_randomly(10)
    model = DummyPytorchLightning(active_set, hparams)
    ckpt = {}
    save_chkp = model.on_save_checkpoint(ckpt)
    trainer = BaalTrainer(
        dataset=active_set,
        max_epochs=3,
        default_root_dir='/tmp',
        callbacks=[ResetCallback(copy.deepcopy(save_chkp))],
    )
    trainer.model = model

    predictions = trainer.predict_on_dataset()
    # Predictions are made over the unlabelled pool.
    assert len(predictions) == len(active_set.pool)
    assert 'active_dataset' in save_chkp

    # Labelling more data then loading the old checkpoint restores the old size.
    n_labelled = len(active_set)
    snapshot = copy.deepcopy(save_chkp)
    active_set.label_randomly(5)
    model.on_load_checkpoint(snapshot)
    assert len(active_set) == n_labelled
def test_no_pool(self):
    """With a fully labelled dataset the iterator never yields unlabelled batches."""
    base = SSLTestDataset(labeled=True, length=100)
    al_dataset = ActiveLearningDataset(base)
    al_dataset.label_randomly(100)
    ss_iterator = SemiSupervisedIterator(al_dataset, p=0.1, num_steps=None,
                                         batch_size=10)

    labeled_data, unlabeled_data = [], []
    for batch in ss_iterator:
        if SemiSupervisedIterator.is_labeled(batch):
            labeled_data.extend(SemiSupervisedIterator.get_batch(batch))
        else:
            unlabeled_data.extend(SemiSupervisedIterator.get_batch(batch))

    total = len(labeled_data) + len(unlabeled_data)
    assert len(labeled_data) / total == 1
    assert len(unlabeled_data) / total == 0
def test_load_state_dict(self):
    """load_state_dict restores both the labelling mask and the RNG state."""
    source = ActiveLearningDataset(MyDataset(), random_state=50)
    source.label_randomly(10)
    state = source.state_dict()

    target = ActiveLearningDataset(MyDataset(), random_state=None)
    assert target.n_labelled == 0
    target.load_state_dict(state)
    assert target.n_labelled == 10

    # The restored RNG must make subsequent label_randomly calls identical.
    source.label_randomly(5)
    target.label_randomly(5)
    assert np.allclose(source._labelled, target._labelled)
# Command-line interface for the experiment script.
args = ArgumentParser(add_help=False)
args.add_argument('--data-root', default='/tmp', type=str,
                  help='Where to download the data')
args.add_argument('--gpus', default=torch.cuda.device_count(), type=int)
# Let the Lightning model contribute its own hyper-parameters (e.g. --heuristic).
args = PIActiveLearningModel.add_model_specific_args(args)
params = args.parse_args()

# CIFAR10 wrapped for active learning; the pool is served with the
# deterministic test transform instead of the augmented train transform.
active_set = ActiveLearningDataset(
    CIFAR10(params.data_root, train=True, transform=PIModel.train_transform,
            download=True),
    pool_specifics={'transform': PIModel.test_transform})
active_set.label_randomly(500)
print("Active set length: {}".format(len(active_set)))
print("Pool set length: {}".format(len(active_set.pool)))

heuristic = get_heuristic(params.heuristic)
model = vgg16(pretrained=False, num_classes=10)
# Load pretrained VGG16 weights, dropping the final classifier layer
# ('classifier.6') since this task has 10 classes; strict=False tolerates
# the missing keys.
weights = load_state_dict_from_url(
    'https://download.pytorch.org/models/vgg16-397923af.pth')
weights = {k: v for k, v in weights.items() if 'classifier.6' not in k}
model.load_state_dict(weights, strict=False)
model = PIActiveLearningModel(network=model, active_dataset=active_set,
                              hparams=params)
# Use DataParallel ('dp') only when more than one GPU is requested.
dp = 'dp' if params.gpus > 1 else None
def a_data_module(a_dataset, hparams):
    """Build a MyDataModule around `a_dataset` with 10 randomly labelled items."""
    wrapped = ActiveLearningDataset(a_dataset)
    wrapped.label_randomly(10)
    return MyDataModule(active_dataset=wrapped, batch_size=hparams['batch_size'])
class ActiveDatasetTest(unittest.TestCase):
    """Unit tests for ActiveLearningDataset over a 100-item dummy dataset."""

    def setUp(self):
        # Unlabelled items are served as (input, -1) so targets cannot leak.
        self.dataset = ActiveLearningDataset(
            MyDataset(), make_unlabelled=lambda x: (x[0], -1))

    def test_len(self):
        """len/pool/n_labelled bookkeeping, and seeding from a labelled mask."""
        # Freshly wrapped: nothing labelled, everything in the pool.
        assert len(self.dataset) == 0
        assert self.dataset.n_unlabelled == 100
        assert len(self.dataset.pool) == 100
        self.dataset.label(0)
        assert len(self.dataset) == self.dataset.n_labelled == 1
        assert self.dataset.n_unlabelled == 99
        assert len(self.dataset.pool) == 99
        # label() takes pool indices, so range(99) covers the remaining items.
        self.dataset.label(list(range(99)))
        assert len(self.dataset) == 100
        assert self.dataset.n_unlabelled == 0
        assert len(self.dataset.pool) == 0
        # The labelled mask can seed a new dataset (numpy bool array)...
        dummy_dataset = ActiveLearningDataset(
            MyDataset(), labelled=self.dataset._labelled,
            make_unlabelled=lambda x: (x[0], -1))
        assert len(dummy_dataset) == len(self.dataset)
        assert len(dummy_dataset.pool) == len(self.dataset.pool)
        # ...or a float torch tensor.
        dummy_lbl = torch.from_numpy(self.dataset._labelled.astype(np.float32))
        dummy_dataset = ActiveLearningDataset(
            MyDataset(), labelled=dummy_lbl,
            make_unlabelled=lambda x: (x[0], -1))
        assert len(dummy_dataset) == len(self.dataset)
        assert len(dummy_dataset.pool) == len(self.dataset.pool)

    def test_pool(self):
        """label() only forwards to the wrapped dataset when can_label and a value is given."""
        self.dataset._dataset.label = unittest.mock.MagicMock()
        labels_initial = self.dataset.n_labelled
        # can_label=False: the value is ignored, item is still marked labelled locally.
        self.dataset.can_label = False
        self.dataset.label(0, value=np.arange(1, 10))
        self.dataset._dataset.label.assert_not_called()
        labels_next_1 = self.dataset.n_labelled
        assert labels_next_1 == labels_initial + 1
        # can_label=True but no value: nothing is labelled, wrapped label not called.
        self.dataset.can_label = True
        self.dataset.label(np.arange(0, 9))
        self.dataset._dataset.label.assert_not_called()
        labels_next_2 = self.dataset.n_labelled
        assert labels_next_1 == labels_next_2
        self.dataset.label(np.arange(0, 9), value=np.arange(1, 10))
        # NOTE(review): `called_once_with` is not a Mock assertion method —
        # accessing it auto-creates a truthy child mock, so this assert can
        # never fail. Probably `assert_called_once_with` was intended; confirm
        # the expected call count before changing it.
        assert self.dataset._dataset.label.called_once_with(np.arange(1, 10))
        # cleanup
        del self.dataset._dataset.label
        self.dataset.can_label = False
        # After the mocked calls above, only items 0 and 1 are effectively labelled.
        pool = self.dataset.pool
        assert np.equal([i for i in pool],
                        [(i, -1) for i in np.arange(2, 100)]).all()
        assert np.equal([i for i in self.dataset],
                        [(i, i) for i in np.arange(2)]).all()

    def test_get_raw(self):
        # check that get_raw returns the same thing regardless of labelling
        # status
        i_1 = self.dataset.get_raw(5)
        self.dataset.label(5)
        i_2 = self.dataset.get_raw(5)
        assert i_1 == i_2

    def test_state_dict(self):
        """state_dict exposes the labelled mask (shared with the dataset)."""
        # NOTE(review): this version uses the key "labeled"; other versions of
        # this test use "labelled" — confirm the spelling matches
        # ActiveLearningDataset.state_dict() for the library version in use.
        state_dict_1 = self.dataset.state_dict()
        assert np.equal(state_dict_1["labeled"], np.full((100, ), False)).all()
        # The state dict presumably aliases the underlying array, so labelling
        # after the fact is reflected in the previously returned dict.
        self.dataset.label(0)
        assert np.equal(
            state_dict_1["labeled"],
            np.concatenate((np.array([True]), np.full((99, ), False)))).all()

    def test_transform(self):
        """Labelled items use the train transform; pool items use the eval one."""
        train_transform = Lambda(lambda k: 1)
        test_transform = Lambda(lambda k: 0)
        dataset = ActiveLearningDataset(MyDataset(train_transform),
                                        test_transform,
                                        make_unlabelled=lambda x: (x[0], -1))
        dataset.label(np.arange(10))
        pool = dataset.pool
        # Pool items go through test_transform (always 0); labelled items
        # through train_transform (always 1).
        assert np.equal([i for i in pool],
                        [(0, -1) for i in np.arange(10, 100)]).all()
        assert np.equal([i for i in dataset],
                        [(1, i) for i in np.arange(10)]).all()

    def test_random(self):
        # Random labelling splits the 100 items between dataset and pool.
        self.dataset.label_randomly(50)
        assert len(self.dataset) == 50
        assert len(self.dataset.pool) == 50
help='Where to download the data') args.add_argument('--gpus', default=torch.cuda.device_count(), type=int) args.add_argument('--num_labeled', default=5000, type=int) args.add_argument('--seed', default=None, type=int) args = PIModel.add_model_specific_args(args) params = args.parse_args() seed = seed_everything(params.seed) active_set = ActiveLearningDataset( CIFAR10(params.data_root, train=True, transform=PIModel.train_transform, download=True), pool_specifics={'transform': PIModel.test_transform}) active_set.label_randomly(params.num_labeled) print("Active set length: {}".format(len(active_set))) print("Pool set length: {}".format(len(active_set.pool))) net = vgg11(pretrained=False, num_classes=10) weights = load_state_dict_from_url( 'https://download.pytorch.org/models/vgg11-bbd30ac9.pth') weights = {k: v for k, v in weights.items() if 'classifier.6' not in k} net.load_state_dict(weights, strict=False) system = PIModel(network=net, active_dataset=active_set, hparams=params) trainer = Trainer(num_sanity_val_steps=0, max_epochs=params.epochs,
class ActiveDatasetTest(unittest.TestCase):
    """Unit tests for ActiveLearningDataset over a 100-item dummy dataset."""

    def setUp(self):
        # Unlabelled items are served as (input, -1) so targets cannot leak.
        self.dataset = ActiveLearningDataset(
            MyDataset(), make_unlabelled=lambda x: (x[0], -1))

    def test_len(self):
        """len/pool/n_labelled bookkeeping, and seeding from a labelled mask."""
        # Freshly wrapped: nothing labelled, everything in the pool.
        assert len(self.dataset) == 0
        assert self.dataset.n_unlabelled == 100
        assert len(self.dataset.pool) == 100
        self.dataset.label(0)
        assert len(self.dataset) == self.dataset.n_labelled == 1
        assert self.dataset.n_unlabelled == 99
        assert len(self.dataset.pool) == 99
        # label() takes pool indices, so range(99) covers the remaining items.
        self.dataset.label(list(range(99)))
        assert len(self.dataset) == 100
        assert self.dataset.n_unlabelled == 0
        assert len(self.dataset.pool) == 0
        # The labelled mask can seed a new dataset (numpy bool array)...
        dummy_dataset = ActiveLearningDataset(
            MyDataset(), labelled=self.dataset._labelled,
            make_unlabelled=lambda x: (x[0], -1))
        assert len(dummy_dataset) == len(self.dataset)
        assert len(dummy_dataset.pool) == len(self.dataset.pool)
        # ...or a float torch tensor.
        dummy_lbl = torch.from_numpy(self.dataset._labelled.astype(np.float32))
        dummy_dataset = ActiveLearningDataset(
            MyDataset(), labelled=dummy_lbl,
            make_unlabelled=lambda x: (x[0], -1))
        assert len(dummy_dataset) == len(self.dataset)
        assert len(dummy_dataset.pool) == len(self.dataset.pool)

    def test_pool(self):
        """label() only forwards to the wrapped dataset when can_label and a value is given."""
        self.dataset._dataset.label = unittest.mock.MagicMock()
        labels_initial = self.dataset.n_labelled
        # can_label=False: the value is ignored, item is still marked labelled locally.
        self.dataset.can_label = False
        self.dataset.label(0, value=np.arange(1, 10))
        self.dataset._dataset.label.assert_not_called()
        labels_next_1 = self.dataset.n_labelled
        assert labels_next_1 == labels_initial + 1
        # can_label=True but no value: nothing is labelled, wrapped label not called.
        self.dataset.can_label = True
        self.dataset.label(np.arange(0, 9))
        self.dataset._dataset.label.assert_not_called()
        labels_next_2 = self.dataset.n_labelled
        assert labels_next_1 == labels_next_2
        self.dataset.label(np.arange(0, 9), value=np.arange(1, 10))
        # NOTE(review): `called_once_with` is not a Mock assertion method —
        # accessing it auto-creates a truthy child mock, so this assert can
        # never fail. Probably `assert_called_once_with` was intended; confirm
        # the expected call count before changing it.
        assert self.dataset._dataset.label.called_once_with(np.arange(1, 10))
        # cleanup
        del self.dataset._dataset.label
        self.dataset.can_label = False
        # After the mocked calls above, only items 0 and 1 are effectively labelled.
        pool = self.dataset.pool
        assert np.equal([i for i in pool],
                        [(i, -1) for i in np.arange(2, 100)]).all()
        assert np.equal([i for i in self.dataset],
                        [(i, i) for i in np.arange(2)]).all()

    def test_get_raw(self):
        # check that get_raw returns the same thing regardless of labelling
        # status
        i_1 = self.dataset.get_raw(5)
        self.dataset.label(5)
        i_2 = self.dataset.get_raw(5)
        assert i_1 == i_2

    def test_types(self):
        """Index-conversion helpers accept scalars and sequences interchangeably."""
        self.dataset.label_randomly(2)
        assert self.dataset._pool_to_oracle_index(
            1) == self.dataset._pool_to_oracle_index([1])
        assert self.dataset._oracle_to_pool_index(
            1) == self.dataset._oracle_to_pool_index([1])

    def test_state_dict(self):
        """state_dict exposes the labelled mask (shared with the dataset)."""
        state_dict_1 = self.dataset.state_dict()
        assert np.equal(state_dict_1["labelled"],
                        np.full((100, ), False)).all()
        # The state dict presumably aliases the underlying array, so labelling
        # after the fact is reflected in the previously returned dict.
        self.dataset.label(0)
        assert np.equal(
            state_dict_1["labelled"],
            np.concatenate((np.array([True]), np.full((99, ), False)))).all()

    def test_load_state_dict(self):
        """load_state_dict restores both the labelled mask and the RNG state."""
        dataset_1 = ActiveLearningDataset(MyDataset(), random_state=50)
        dataset_1.label_randomly(10)
        state_dict1 = dataset_1.state_dict()
        dataset_2 = ActiveLearningDataset(MyDataset(), random_state=None)
        assert dataset_2.n_labelled == 0
        dataset_2.load_state_dict(state_dict1)
        assert dataset_2.n_labelled == 10
        # test if the second label_randomly call has the same behaviour
        dataset_1.label_randomly(5)
        dataset_2.label_randomly(5)
        assert np.allclose(dataset_1._labelled, dataset_2._labelled)

    def test_transform(self):
        """Labelled items use the train transform; pool items use pool_specifics."""
        train_transform = Lambda(lambda k: 1)
        test_transform = Lambda(lambda k: 0)
        dataset = ActiveLearningDataset(
            MyDataset(train_transform),
            pool_specifics={'transform': test_transform},
            make_unlabelled=lambda x: (x[0], -1))
        dataset.label(np.arange(10))
        pool = dataset.pool
        # Pool items go through test_transform (always 0); labelled items
        # through train_transform (always 1).
        assert np.equal([i for i in pool],
                        [(0, -1) for i in np.arange(10, 100)]).all()
        assert np.equal([i for i in dataset],
                        [(1, i) for i in np.arange(10)]).all()
        # The legacy eval_transform argument is deprecated but still accepted.
        with pytest.warns(DeprecationWarning) as e:
            ActiveLearningDataset(MyDataset(train_transform),
                                  eval_transform=train_transform)
        assert len(e) == 1
        # Unknown pool_specifics keys raise when the pool is built.
        with pytest.raises(ValueError) as e:
            ActiveLearningDataset(MyDataset(train_transform),
                                  pool_specifics={
                                      'whatever': 123
                                  }).pool

    def test_random(self):
        # Random labelling splits the 100 items between dataset and pool.
        self.dataset.label_randomly(50)
        assert len(self.dataset) == 50
        assert len(self.dataset.pool) == 50

    def test_random_state(self):
        """label_randomly reproducibility for the supported random_state inputs."""
        # No seed: two datasets label different items.
        seed = None
        dataset_1 = ActiveLearningDataset(MyDataset(), random_state=seed)
        dataset_1.label_randomly(10)
        dataset_2 = ActiveLearningDataset(MyDataset(), random_state=seed)
        dataset_2.label_randomly(10)
        assert not np.allclose(dataset_1._labelled, dataset_2._labelled)
        # Same integer seed: identical selections.
        seed = 50
        dataset_1 = ActiveLearningDataset(MyDataset(), random_state=seed)
        dataset_1.label_randomly(10)
        dataset_2 = ActiveLearningDataset(MyDataset(), random_state=seed)
        dataset_2.label_randomly(10)
        assert np.allclose(dataset_1._labelled, dataset_2._labelled)
        # A shared RandomState instance is consumed by the first dataset,
        # so the second draws different samples.
        seed = np.random.RandomState(50)
        dataset_1 = ActiveLearningDataset(MyDataset(), random_state=seed)
        dataset_1.label_randomly(10)
        dataset_2 = ActiveLearningDataset(MyDataset(), random_state=seed)
        dataset_2.label_randomly(10)
        assert not np.allclose(dataset_1._labelled, dataset_2._labelled)

    def test_label_randomly_full(self):
        """Labelling the entire dataset leaves an empty pool."""
        dataset_1 = ActiveLearningDataset(MyDataset())
        dataset_1.label_randomly(99)
        assert dataset_1.n_unlabelled == 1
        assert len(dataset_1.pool) == 1
        dataset_1.label_randomly(1)
        assert dataset_1.n_unlabelled == 0
        assert dataset_1.n_labelled == 100
def test_random_state(self):
    """label_randomly reproducibility across the supported random_state inputs."""
    # No seed: two datasets label different items.
    first = ActiveLearningDataset(MyDataset(), random_state=None)
    first.label_randomly(10)
    second = ActiveLearningDataset(MyDataset(), random_state=None)
    second.label_randomly(10)
    assert not np.allclose(first._labelled, second._labelled)

    # Same integer seed: identical selections.
    first = ActiveLearningDataset(MyDataset(), random_state=50)
    first.label_randomly(10)
    second = ActiveLearningDataset(MyDataset(), random_state=50)
    second.label_randomly(10)
    assert np.allclose(first._labelled, second._labelled)

    # A shared RandomState instance is consumed by the first dataset,
    # so the second draws different samples.
    rng = np.random.RandomState(50)
    first = ActiveLearningDataset(MyDataset(), random_state=rng)
    first.label_randomly(10)
    second = ActiveLearningDataset(MyDataset(), random_state=rng)
    second.label_randomly(10)
    assert not np.allclose(first._labelled, second._labelled)