def test_active_learning_mixin():
    """The Lightning mixin exposes the unlabelled pool through `pool_loader`."""
    active_set = ActiveLearningDataset(DummyDataset())
    active_set.label_randomly(10)
    lightning_model = DummyPytorchLightning(active_set, None)
    # The remaining pool fits in exactly two batches.
    assert len(lightning_model.pool_loader()) == 2
# Example #2 (snippet separator)
def main(hparams):
    """Run an active-learning experiment on CIFAR10 with a VGG16 model.

    `hparams` must provide `data_root`, `n_gpus` and `query_size`.
    """
    train_transform = transforms.Compose(
        [transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    test_transform = transforms.Compose([transforms.ToTensor()])

    # The pool is served with the deterministic test transform instead of
    # the augmented training one.
    active_set = ActiveLearningDataset(
        CIFAR10(hparams.data_root, train=True, transform=train_transform, download=True),
        pool_specifics={'transform': test_transform})
    active_set.label_randomly(10)

    heuristic = BALD()
    model = VGG16(active_set, hparams)
    backend = 'dp' if hparams.n_gpus > 1 else None
    # The weights of the model will change as it gets trained; we keep a
    # copy (deepcopy) so that ResetCallback can restore them each step.
    trainer = BaalTrainer(max_epochs=3, default_root_dir=hparams.data_root,
                          gpus=hparams.n_gpus, distributed_backend=backend,
                          callbacks=[ResetCallback(copy.deepcopy(model.state_dict()))])
    loop = ActiveLearningLoop(active_set,
                              get_probabilities=trainer.predict_on_dataset_generator,
                              heuristic=heuristic,
                              ndata_to_label=hparams.query_size)

    AL_STEPS = 100
    for al_step in range(AL_STEPS):
        print(f'Step {al_step} Dataset size {len(active_set)}')
        trainer.fit(model)
        # `step` returns False once the pool is exhausted.
        if not loop.step():
            break
def get_datasets(initial_pool):
    """Return (active CIFAR10 training set, CIFAR10 test set).

    Args:
        initial_pool (int): number of training samples to label at random
            before the first active-learning step.
    """
    train_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(30),
        transforms.ToTensor(),
        transforms.Normalize(3 * [0.5], 3 * [0.5]),
    ])
    eval_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(3 * [0.5], 3 * [0.5]),
    ])
    # Note: We use the test set here as an example. You should make your own validation set.
    train_ds = datasets.CIFAR10(".", train=True, transform=train_transform,
                                target_transform=None, download=True)
    test_set = datasets.CIFAR10(".", train=False, transform=eval_transform,
                                target_transform=None, download=True)

    # The pool is served with the deterministic eval transform.
    active_set = ActiveLearningDataset(
        train_ds, pool_specifics={"transform": eval_transform})

    # We start labeling randomly.
    active_set.label_randomly(initial_pool)
    return active_set, test_set
# Example #4 (snippet separator)
    def test_len(self):
        """Labelling moves items from the pool into the labelled set."""
        # Fresh dataset: nothing labelled yet.
        assert len(self.dataset) == 0
        assert self.dataset.n_unlabelled == 100
        assert len(self.dataset.pool) == 100
        # Label a single item.
        self.dataset.label(0)
        assert len(self.dataset) == self.dataset.n_labelled == 1
        assert self.dataset.n_unlabelled == 99
        assert len(self.dataset.pool) == 99
        # Label everything that remains.
        self.dataset.label(list(range(99)))
        assert len(self.dataset) == 100
        assert self.dataset.n_unlabelled == 0
        assert len(self.dataset.pool) == 0

        # `labelled` can be provided as a numpy array...
        clone = ActiveLearningDataset(MyDataset(),
                                      labelled=self.dataset._labelled,
                                      make_unlabelled=lambda x: (x[0], -1))
        assert len(clone) == len(self.dataset)
        assert len(clone.pool) == len(self.dataset.pool)

        # ...or as a float tensor; both reproduce the same split.
        as_tensor = torch.from_numpy(self.dataset._labelled.astype(np.float32))
        clone = ActiveLearningDataset(MyDataset(),
                                      labelled=as_tensor,
                                      make_unlabelled=lambda x: (x[0], -1))
        assert len(clone) == len(self.dataset)
        assert len(clone.pool) == len(self.dataset.pool)
# Example #5 (snippet separator)
class SSLModuleTest(unittest.TestCase):
    """Semi-supervised training alternates labelled and unlabelled batches."""

    def setUp(self):
        labeled_len, unlabeled_len = 100, 1000
        combined = ConcatDataset([
            SSLTestDataset(labeled=True, length=labeled_len),
            SSLTestDataset(labeled=False, length=unlabeled_len),
        ])
        print(len(combined))

        self.al_dataset = ActiveLearningDataset(combined)
        # Label data from d1 (even numbers).
        self.al_dataset.label(list(range(labeled_len)))

    def test_epoch(self):
        hparams = {'p': None, 'num_steps': None, 'batch_size': 10, 'workers': 0}
        module = TestSSLModule(self.al_dataset, Namespace(**hparams))
        trainer = Trainer(max_epochs=1,
                          num_sanity_val_steps=0,
                          progress_bar_refresh_rate=0,
                          logger=False,
                          checkpoint_callback=False)
        trainer.fit(module)

        # Labelled and unlabelled samples are consumed in equal amounts,
        # and each side only ever sees its own parity of indices.
        assert len(module.labeled_data) == len(module.unlabeled_data)
        assert torch.all(torch.tensor(module.labeled_data) % 2 == 0)
        assert torch.all(torch.tensor(module.unlabeled_data) % 2 != 0)
# Example #6 (snippet separator)
def test_arrowds():
    """ActiveLearningDataset can wrap a HuggingFace (Arrow-backed) dataset."""
    dataset = HFdata.load_dataset('glue', 'sst2')['test']
    dataset = ActiveLearningDataset(dataset)
    dataset.label(np.arange(10))
    assert len(dataset) == 10
    assert len(dataset.pool) == 1811
    data = dataset.pool[0]
    # Fixed: the original `assert [k in ... for k, v in data.items()]` was a
    # non-empty list and therefore always truthy; actually verify every key.
    assert all(k in ['idx', 'label', 'sentence'] for k in data)
# Example #7 (snippet separator)
def test_calibration_integration():
    """End-to-end check of DirichletCalibrator: calibration must not modify
    the wrapped model's weights, and the calibrated model must contain
    additional modules on top of the original one."""
    transform_pipeline = Compose([Resize((64, 64)), ToTensor()])
    cifar10_train = DummyDataset(transform_pipeline)
    cifar10_test = DummyDataset(transform_pipeline)

    # we don't create different trainset for calibration since the goal is not
    # to calibrate
    al_dataset = ActiveLearningDataset(
        cifar10_train, pool_specifics={'transform': transform_pipeline})
    al_dataset.label_randomly(10)
    use_cuda = False
    model = vgg.vgg16(pretrained=False, num_classes=10)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=0.001,
                          momentum=0.9,
                          weight_decay=0.0005)

    wrapper = ModelWrapper(model, criterion)
    calibrator = DirichletCalibrator(wrapper=wrapper,
                                     num_classes=10,
                                     lr=0.001,
                                     reg_factor=0.01)

    # Two train/test/calibrate rounds to make sure the invariant holds
    # across repeated calibrations.
    for step in range(2):
        wrapper.train_on_dataset(al_dataset,
                                 optimizer=optimizer,
                                 batch_size=10,
                                 epoch=1,
                                 use_cuda=use_cuda,
                                 workers=0)

        wrapper.test_on_dataset(cifar10_test,
                                batch_size=10,
                                use_cuda=use_cuda,
                                workers=0)

        # Snapshot the weights before calibrating (clone so later updates
        # don't alias these tensors).
        before_calib_param = list(
            map(lambda x: x.clone(), wrapper.model.parameters()))

        calibrator.calibrate(al_dataset,
                             cifar10_test,
                             batch_size=10,
                             epoch=5,
                             use_cuda=use_cuda,
                             double_fit=False,
                             workers=0)

        after_calib_param = list(map(lambda x: x.clone(), model.parameters()))

        # Calibration must leave the underlying model's parameters untouched.
        assert all([
            np.allclose(i.detach(), j.detach())
            for i, j in zip(before_calib_param, after_calib_param)
        ])

        # The calibrated model carries extra (calibration) modules.
        assert len(list(wrapper.model.modules())) < len(
            list(calibrator.calibrated_model.modules()))
# Example #8 (snippet separator)
def test_integration():
    """End-to-end active-learning loop: each step must update the model's
    weights, and the loop must stop once the pool is exhausted."""
    transform_pipeline = Compose([Resize((64, 64)), ToTensor()])
    cifar10_train = DummyDataset(transform_pipeline)
    cifar10_test = DummyDataset(transform_pipeline)

    al_dataset = ActiveLearningDataset(
        cifar10_train, pool_specifics={'transform': transform_pipeline})
    al_dataset.label_randomly(10)

    use_cuda = False
    model = vgg.vgg16(pretrained=False, num_classes=10)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=0.001,
                          momentum=0.9,
                          weight_decay=0.0005)

    # We can now use BaaL to create the active learning loop.

    model = ModelWrapper(model, criterion)
    # We create an ActiveLearningLoop that will automatically label the most uncertain samples.
    # In this case, we use the widely used BALD heuristic.

    active_loop = ActiveLearningLoop(al_dataset,
                                     model.predict_on_dataset,
                                     heuristic=heuristics.BALD(),
                                     ndata_to_label=10,
                                     batch_size=10,
                                     iterations=10,
                                     use_cuda=use_cuda,
                                     workers=4)

    # We're all set!
    num_steps = 10
    for step in range(num_steps):
        # Snapshot the weights so we can verify that training changed them.
        old_param = list(map(lambda x: x.clone(), model.model.parameters()))
        model.train_on_dataset(al_dataset,
                               optimizer=optimizer,
                               batch_size=10,
                               epoch=5,
                               use_cuda=use_cuda,
                               workers=2)
        model.test_on_dataset(cifar10_test,
                              batch_size=10,
                              use_cuda=use_cuda,
                              workers=2)

        # `step` returns False once there is nothing left to label.
        if not active_loop.step():
            break
        new_param = list(map(lambda x: x.clone(), model.model.parameters()))
        # At least one parameter tensor must have changed during training.
        assert any([
            not np.allclose(i.detach(), j.detach())
            for i, j in zip(old_param, new_param)
        ])
    assert step == 4  # 10 + (4 * 10) = 50, so it stops at iterations 4
# Example #9 (snippet separator)
    def setUp(self):
        """Build an active dataset over a labelled + unlabelled concatenation."""
        labeled_len, unlabeled_len = 100, 1000
        combined = ConcatDataset([
            SSLTestDataset(labeled=True, length=labeled_len),
            SSLTestDataset(labeled=False, length=unlabeled_len),
        ])
        print(len(combined))

        self.al_dataset = ActiveLearningDataset(combined)
        # Label data from d1 (even numbers).
        self.al_dataset.label(list(range(labeled_len)))
# Example #10 (snippet separator)
def test_sad(max_sample, expected):
    """`max_sample` caps how many pool items one loop step may label."""
    al_dataset = ActiveLearningDataset(MyDataset(), make_unlabelled=lambda x: -1)
    loop = ActiveLearningLoop(al_dataset,
                              get_probs_iter,
                              heuristics.Random(),
                              max_sample=max_sample,
                              query_size=10,
                              dummy_param=1)
    al_dataset.label_randomly(10)
    loop.step()
    # Starting from 10 labelled items, the step adds `expected` more.
    assert len(al_dataset) == expected + 10
# Example #11 (snippet separator)
 def test_transform(self):
     """Pool items use the eval transform; labelled items keep the train one."""
     train_transform = Lambda(lambda k: 1)
     test_transform = Lambda(lambda k: 0)
     dataset = ActiveLearningDataset(MyDataset(train_transform),
                                     test_transform,
                                     make_unlabelled=lambda x: (x[0], -1))
     dataset.label(np.arange(10))
     pool = dataset.pool
     # Unlabelled items: eval transform output (0) with target -1.
     assert np.equal(list(pool),
                     [(0, -1) for _ in np.arange(10, 100)]).all()
     # Labelled items: train transform output (1) with their index target.
     assert np.equal(list(dataset),
                     [(1, i) for i in np.arange(10)]).all()
# Example #12 (snippet separator)
    def test_transform(self):
        """`pool_specifics` swaps the pool transform; unknown keys raise."""
        train_transform = Lambda(lambda k: 1)
        test_transform = Lambda(lambda k: 0)
        dataset = ActiveLearningDataset(
            MyDataset(train_transform),
            make_unlabelled=lambda x: (x[0], -1),
            pool_specifics={'transform': test_transform})
        dataset.label(np.arange(10))
        pool = dataset.pool
        # The pool applies the eval transform; labelled data keep the train one.
        assert np.equal(list(pool), [(0, -1) for _ in np.arange(10, 100)]).all()
        assert np.equal(list(dataset), [(1, i) for i in np.arange(10)]).all()

        # Requesting an attribute the wrapped dataset lacks must fail.
        with pytest.raises(ValueError) as e:
            ActiveLearningDataset(MyDataset(train_transform),
                                  pool_specifics={'whatever': 123}).pool
def test_on_load_checkpoint():
    """The active dataset state survives a checkpoint save/load round-trip."""
    dataset = DummyDataset()
    active_set = ActiveLearningDataset(dataset)
    active_set.label_randomly(10)
    model = DummyPytorchLightning(active_set, None)
    ckpt = {}
    model.on_save_checkpoint(ckpt)
    assert 'active_dataset' in ckpt
    # Loading into a fresh dataset reproduces the labelled state.
    restored_set = ActiveLearningDataset(dataset)
    restored_model = DummyPytorchLightning(restored_set, None)
    restored_model.on_load_checkpoint(ckpt)
    assert len(active_set) == len(restored_set)
# Example #14 (snippet separator)
 def setUp(self):
     """Build a FileDataset and wrap it in a label-mask-driven active dataset."""
     self.lbls = None
     self.transform = Compose([Resize(60), RandomRotation(90), ToTensor()])
     testtransform = Compose([Resize(32), ToTensor()])
     # Build once without labels, then rebuild after generating them
     # (mirrors the unlabelled -> labelled workflow of the original).
     self.dataset = FileDataset(self.paths, self.lbls,
                                transform=self.transform)
     self.lbls = self.generate_labels(len(self.paths), 10)
     self.dataset = FileDataset(self.paths, self.lbls,
                                transform=self.transform)
     # A label of -1 marks an unlabelled file.
     self.active = ActiveLearningDataset(
         self.dataset,
         labelled=(np.array(self.lbls) != -1),
         pool_specifics={'transform': testtransform})
# Example #15 (snippet separator)
def active_pascal(
        path="/tmp",
        *args,
        transform=transforms.ToTensor(),
        test_transform=transforms.ToTensor(),
        **kwargs,
):
    """Get active Pascal-VOC 2012 datasets.

    (Fixed docstring typo: the original said "2102".)

    Arguments:
        path : str
            The root folder for the Pascal dataset
    Returns:
        ActiveLearningDataset
            the active learning dataset, training data
        Dataset
            the evaluation dataset
    """
    train_set = datasets.VOCSegmentation(path,
                                         image_set='train',
                                         transform=transform,
                                         download=False,
                                         *args,
                                         **kwargs)
    eval_set = datasets.VOCSegmentation(path,
                                        image_set='val',
                                        transform=test_transform,
                                        download=False,
                                        *args,
                                        **kwargs)
    return ActiveLearningDataset(train_set), eval_set
# Example #16 (snippet separator)
def test_should_stop_iter(heur):
    """The loop signals a stop once the pool is exhausted."""
    dataset = ActiveLearningDataset(MyDataset(), make_unlabelled=lambda x: -1)
    loop = ActiveLearningLoop(dataset,
                              get_probs_iter,
                              heur,
                              query_size=10,
                              dummy_param=1)
    dataset.label_randomly(10)
    steps_taken = 0
    for _ in range(15):
        steps_taken += 1
        if not loop.step():
            break

    # 100 items, 10 labelled up front, 10 labelled per step -> 10 steps.
    assert steps_taken == 10
def test_on_load_checkpoint(a_data_module, a_dataset, hparams):
    """DataModule checkpoints carry the active dataset's labelling state."""
    ckpt = {}
    a_data_module.on_save_checkpoint(ckpt)
    assert 'active_dataset' in ckpt
    # Loading into a fresh module reproduces the labelled-set size.
    restored_set = ActiveLearningDataset(a_dataset)
    restored_module = MyDataModule(restored_set, hparams['batch_size'])
    restored_module.on_load_checkpoint(ckpt)
    assert len(a_data_module.active_dataset) == len(restored_set)
# Example #18 (snippet separator)
    def setUp(self):
        """Create the active dataset plus a SemiSupervisedIterator over it."""
        labeled_len, unlabeled_len = 100, 1000
        combined = ConcatDataset([
            SSLTestDataset(labeled=True, length=labeled_len),
            SSLTestDataset(labeled=False, length=unlabeled_len),
        ])
        print(len(combined))

        self.al_dataset = ActiveLearningDataset(combined)
        # Label data from d1 (even numbers).
        self.al_dataset.label(list(range(labeled_len)))

        self.ss_iterator = SemiSupervisedIterator(self.al_dataset,
                                                  p=None,
                                                  num_steps=None,
                                                  batch_size=10)
# Example #19 (snippet separator)
def test_labelled_map():
    """`labelled_map` records the AL step at which each item was labelled."""
    active = ActiveLearningDataset(MyDataset())
    assert active.current_al_step == 0
    active.label_randomly(10)
    assert active.current_al_step == 1
    active.label_randomly(10)
    # Two rounds -> highest step id is 2; boolean mask agrees with the map.
    assert active.labelled_map.max() == 2
    assert np.equal(active.labelled, active.labelled_map > 0).all()

    state = active.state_dict()
    restored = ActiveLearningDataset(MyDataset(), labelled=state["labelled"])
    assert restored.current_al_step == active.current_al_step
def test_pl_step():
    """One BaalTrainer step labels exactly `query_size` new items."""
    hparams = HParams()
    active_set = ActiveLearningDataset(DummyDataset())
    active_set.label_randomly(10)
    model = DummyPytorchLightning(active_set, hparams)
    ckpt = {}
    save_chkp = model.on_save_checkpoint(ckpt)
    # ResetCallback restores the saved checkpoint between fits.
    trainer = BaalTrainer(dataset=active_set,
                          max_epochs=3, default_root_dir='/tmp',
                          ndata_to_label=hparams.query_size,
                          callbacks=[ResetCallback(copy.deepcopy(save_chkp))])
    trainer.model = model

    size_before = len(active_set)
    trainer.step()
    size_after = len(active_set)

    assert size_after - size_before == hparams.query_size
# Example #21 (snippet separator)
def test_file_saving(tmpdir):
    """Each AL step writes one uncertainty pickle into `uncertainty_folder`."""
    tmpdir = str(tmpdir)
    heur = heuristics.BALD()
    ds = MyDataset()
    dataset = ActiveLearningDataset(ds, make_unlabelled=lambda x: -1)
    active_loop = ActiveLearningLoop(dataset,
                                     get_probs_iter,
                                     heur,
                                     uncertainty_folder=tmpdir,
                                     query_size=10,
                                     dummy_param=1)
    dataset.label_randomly(10)
    _ = active_loop.step()
    assert len(os.listdir(tmpdir)) == 1
    file = pjoin(tmpdir, os.listdir(tmpdir)[0])
    # The file name encodes the pool/labelled sizes at the time of the step.
    assert "pool=90" in file and "labelled=10" in file
    # Fixed: use a context manager so the handle is closed deterministically
    # (the original `pickle.load(open(file, 'rb'))` leaked the file handle).
    with open(file, 'rb') as fd:
        data = pickle.load(fd)
    assert len(data['uncertainty']) == 90
    # The diff between the current state and the step before is the newly labelled item.
    assert (data['dataset']['labelled'] != dataset.labelled).sum() == 10
# Example #22 (snippet separator)
def test_deprecation():
    """Passing `ndata_to_label` emits a DeprecationWarning naming the argument."""
    heur = heuristics.BALD()
    dataset = ActiveLearningDataset(MyDataset(), make_unlabelled=lambda x: -1)
    with warnings.catch_warnings(record=True) as caught:
        active_loop = ActiveLearningLoop(dataset,
                                         get_probs_iter,
                                         heur,
                                         ndata_to_label=10,
                                         dummy_param=1)
        last = caught[-1]
        assert issubclass(last.category, DeprecationWarning)
        assert "ndata_to_label" in str(last.message)
 def __init__(self, data_root, batch_size):
     """Build the CIFAR10 active datamodule with train/eval transforms."""
     train_transform = transforms.Compose([transforms.RandomHorizontalFlip(),
                                           transforms.ToTensor()])
     test_transform = transforms.Compose([transforms.ToTensor()])
     # The pool is served with the deterministic test transform.
     active_set = ActiveLearningDataset(
         CIFAR10(data_root, train=True, transform=train_transform, download=True),
         pool_specifics={'transform': test_transform})
     self.test_set = CIFAR10(data_root, train=False, transform=test_transform,
                             download=True)
     super().__init__(active_dataset=active_set, batch_size=batch_size,
                      train_transforms=train_transform,
                      test_transforms=test_transform)
# Example #24 (snippet separator)
 def test_label_randomly_full(self):
     """Random labelling can consume the pool completely, one item at a time."""
     dataset = ActiveLearningDataset(MyDataset())
     dataset.label_randomly(99)
     assert dataset.n_unlabelled == 1
     assert len(dataset.pool) == 1
     # Labelling the final item empties the pool.
     dataset.label_randomly(1)
     assert dataset.n_unlabelled == 0
     assert dataset.n_labelled == 100
# Example #25 (snippet separator)
def test_last_active_step():
    """With `last_active_steps=1`, iteration covers only the newest round."""
    active = ActiveLearningDataset(MyDataset(), last_active_steps=1)
    assert len(active) == 0
    active.label_randomly(10)
    assert len(active) == 10
    active.label_randomly(10)
    # Still 10: only the items labelled at step 2 are exposed.
    assert len(active) == 10
    for idx, _ in active:
        assert active.labelled_map[idx] == 2
# Example #26 (snippet separator)
    def test_no_pool(self):
        """With everything labelled, only labelled batches are produced."""
        fully_labeled = SSLTestDataset(labeled=True, length=100)
        al_dataset = ActiveLearningDataset(fully_labeled)
        al_dataset.label_randomly(100)
        ss_iterator = SemiSupervisedIterator(al_dataset,
                                             p=0.1,
                                             num_steps=None,
                                             batch_size=10)

        labeled_data, unlabeled_data = [], []
        for batch in ss_iterator:
            bucket = (labeled_data
                      if SemiSupervisedIterator.is_labeled(batch)
                      else unlabeled_data)
            bucket.extend(SemiSupervisedIterator.get_batch(batch))

        total = len(labeled_data) + len(unlabeled_data)
        # The pool is empty, so every sample comes from the labelled set.
        assert len(labeled_data) / total == 1
        assert len(unlabeled_data) / total == 0
# Example #27 (snippet separator)
 def setUp(self):
     """Build a FileDataset and an active dataset with an eval transform."""
     self.lbls = None
     self.transform = Compose([Resize(60), RandomRotation(90), ToTensor()])
     testtransform = Compose([Resize(32), ToTensor()])
     # Build once without labels, then rebuild after generating them
     # (mirrors the unlabelled -> labelled workflow of the original).
     self.dataset = FileDataset(self.paths, self.lbls,
                                transform=self.transform)
     self.lbls = self.generate_labels(len(self.paths), 10)
     self.dataset = FileDataset(self.paths, self.lbls,
                                transform=self.transform)
     # A label of -1 marks an unlabelled file.
     self.active = ActiveLearningDataset(
         self.dataset,
         eval_transform=testtransform,
         labelled=torch.from_numpy(
             (np.array(self.lbls) != -1).astype(np.uint8)))
# Example #28 (snippet separator)
def test_warning_raised_on_label():
    """Wrapping a dataset whose `label` is a plain attribute (not a method)
    must mark it as not labellable and emit a warning."""
    class DS(Dataset):
        def __init__(self):
            self.x = [1, 2, 3]
            # `label` being a data attribute (not callable) is the point of
            # this test.
            self.label = [1, 1, 1]

        def __len__(self):
            return len(self.x)

        def __getitem__(self, item):
            # Fixed: the original returned `self.y[item]`, but `self.y` is
            # never defined -- item access would raise AttributeError.
            return self.x[item], self.label[item]

    with warnings.catch_warnings(record=True) as w:
        al = ActiveLearningDataset(DS())
        assert not al.can_label
        assert len(w) == 1
        assert "label" in str(w[-1].message)
# Example #29 (snippet separator)
def active_huggingface_dataset(dataset,
                               tokenizer=None,
                               target_key: str = "label",
                               input_key: str = "sentence",
                               max_seq_len: int = 128,
                               **kwargs):
    """Wrap huggingface.datasets with baal.active.ActiveLearningDataset.

    Args:
        dataset (torch.utils.data.Dataset): a dataset provided by huggingface.
        tokenizer (transformers.PreTrainedTokenizer): a tokenizer provided by huggingface.
        target_key (str): target key used in the dataset's dictionary.
        input_key (str): input key used in the dataset's dictionary.
        max_seq_len (int): max length of a sequence to be used for padding the shorter sequences.
        kwargs (Dict): Parameters forwarded to 'ActiveLearningDataset'.

    Returns:
        an baal.active.ActiveLearningDataset object.
    """
    wrapped = HuggingFaceDatasets(dataset, tokenizer, target_key,
                                  input_key, max_seq_len)
    return ActiveLearningDataset(wrapped, **kwargs)
def get_dataset(split, dataset_dict):
    """Build the Synbols dataset described by `dataset_dict` for `split`.

    The 'train' split is wrapped in an ActiveLearningDataset with an
    initial random labelling; other splits are returned as-is.
    """
    if dataset_dict["name"] != "active_learning":
        raise ValueError("Dataset %s not found" % dataset_dict["name"])

    transform = tt.Compose([tt.ToPILImage(), tt.ToTensor()])
    # Default class count depends on the task: 52 for 'char', 1002 otherwise.
    n_classes = dataset_dict.get('n_classes') or (
        52 if dataset_dict['task'] == 'char' else 1002)
    path = get_data_path_or_download(dataset_dict["path"], DATA_ROOT)
    dataset = AleatoricSynbols(
        path=path,
        split=split,
        key=dataset_dict["task"],
        transform=transform,
        p=dataset_dict.get('p', 0.0),
        seed=dataset_dict.get('seed', 666),
        n_classes=n_classes,
    )
    if split == 'train':
        # Make an AL dataset and label randomly.
        dataset = ActiveLearningDataset(
            dataset, pool_specifics={'transform': transform})
        dataset.label_randomly(dataset_dict['initial_pool'])
    return dataset