Ejemplo n.º 1
0
 def DataBundler(self, add=True, name=None):
     """Wrap this object in a freshly created ``thexp.DataBundler``.

     :param add: if truthy the object is registered through the bundler's
         ``add`` method, otherwise through its ``cycle`` method.
     :param name: forwarded unchanged as the second positional argument.
     :return: whatever the selected bundler method returns.
     """
     from thexp import DataBundler

     wrapper = DataBundler()
     register = wrapper.add if add else wrapper.cycle
     return register(self, name)
Ejemplo n.º 2
0
    def datasets(self, params: SemiSupervisedParams):
        """Create the sup/unsup/val/test loaders and register them.

        The training split of ``params.dataset`` is partitioned by
        ``splits.semi_split`` into supervised, unsupervised and validation
        indices. Side effects: sets ``self.cl_set``, ``self.sup_dataloader``
        and ``self.unsup_dataloader``, registers the train/eval/test
        bundlers and finally calls ``self.to(self.device)``.

        :param params: experiment parameters; reads ``dataset``, ``n_percls``,
            ``val_size``, ``batch_size``, ``uratio`` and ``num_workers``.
        """
        load_split = datasets[params.dataset]

        test_x, test_y = load_split(False)
        train_x, train_y = load_split(True)

        indexs, un_indexs, val_indexs = splits.semi_split(
            train_y,
            n_percls=params.n_percls,
            val_size=params.val_size,
            repeat_sup=False,
        )
        split_sizes = (len(indexs), len(un_indexs), len(val_indexs))
        self.logger.info('sup/unsup/val : {}'.format(split_sizes))

        mean, std = norm_val.get(params.dataset, [None, None])
        toTensor = ToNormTensor(mean, std)
        weak = Weak(mean, std)
        strong = Strong(mean, std)

        # Supervised subset: one weakly transformed view plus the label.
        sup_set = DatasetBuilder(train_x, train_y)
        sup_set = sup_set.add_x(transform=weak).add_y()
        sup_set = sup_set.subset(indexs)
        if len(sup_set) < params.batch_size:
            # The subset is smaller than a single batch.
            sup_set.virtual_sample(params.batch_size)

        # Unsupervised subset: a weak and a strong view of every sample.
        unsup_set = DatasetBuilder(train_x, train_y).toggle_id()
        unsup_set = unsup_set.add_x(transform=weak).add_x(transform=strong)
        unsup_set = unsup_set.add_y().subset(un_indexs)
        self.cl_set = unsup_set

        sup_dataloader = sup_set.DataLoader(
            batch_size=params.batch_size,
            num_workers=params.num_workers,
            shuffle=True,
        )
        self.sup_dataloader = sup_dataloader

        unsup_dataloader = unsup_set.DataLoader(
            batch_size=params.batch_size * params.uratio,
            num_workers=1,
            shuffle=True,
        )
        self.unsup_dataloader = (
            DataBundler().add(unsup_dataloader).to(self.device))

        # Validation and test loaders are configured identically.
        eval_loader_kwargs = dict(
            batch_size=params.batch_size,
            num_workers=params.num_workers,
            shuffle=True,
        )
        val_set = DatasetBuilder(train_x[val_indexs], train_y[val_indexs])
        val_dataloader = (
            val_set.add_x(transform=toTensor).add_y()
            .DataLoader(**eval_loader_kwargs))

        test_set = DatasetBuilder(test_x, test_y)
        test_dataloader = (
            test_set.add_x(transform=toTensor).add_y()
            .DataLoader(**eval_loader_kwargs))

        train_bundle = DataBundler().cycle(sup_dataloader)
        train_bundle = train_bundle.add(unsup_dataloader).zip_mode()
        self.regist_databundler(train=train_bundle,
                                eval=val_dataloader,
                                test=test_dataloader)
        self.to(self.device)
Ejemplo n.º 3
0
    def on_train_epoch_begin(self, trainer: Trainer, func,
                             params: DivideMixParams, *args, **kwargs):
        """Rebuild the training loaders at the start of a post-warm-up epoch.

        Nothing happens while ``params.eidx <= params.warm_up``. Afterwards
        ``self.eval_train`` is run with the first model on odd epochs and the
        second model on even epochs. Samples whose returned probability
        exceeds ``params.p_threshold`` go into ``self.labeled_dataloader``,
        the rest into ``self.unlabeled_dataloader``. Only the labeled loader
        is bundled and registered as the new ``train`` bundler.

        :param trainer: unused here.
        :param func: unused here.
        :param params: reads ``eidx``, ``warm_up``, ``p_threshold``,
            ``dataset``, ``batch_size`` and ``num_workers``.
        """
        if params.eidx <= params.warm_up:
            return

        self.logger.info('create semi dataset')
        if params.eidx % 2 == 1:
            net = self.model1
            memories = dict(target_mem=self.target_mem1,
                            plabel_mem=self.plabel_mem1,
                            false_pred_mem=self.false_pred_mem1,
                            noisy_cls_mem=self.noisy_cls_mem1)
        else:
            net = self.model2
            memories = dict(target_mem=self.target_mem2,
                            plabel_mem=self.plabel_mem2,
                            false_pred_mem=self.false_pred_mem2,
                            noisy_cls_mem=self.noisy_cls_mem2)
        prob = self.eval_train(net, **memories)  # type: np.ndarray

        pred = (prob > params.p_threshold)
        pred_idx = pred.nonzero()[0]
        # ``1 - pred`` flips the boolean mask, selecting the complement.
        unpred_idx = (1 - pred).nonzero()[0]

        train_x, train_y = self.train_set_pack

        mean, std = norm_val.get(params.dataset, [None, None])
        weak = BigWeak(mean, std)

        loader_kwargs = dict(shuffle=True,
                             drop_last=True,
                             num_workers=params.num_workers)

        # Labeled part: two weak views, the label and the 'nprob' source.
        labeled_set = DatasetBuilder(train_x, train_y)
        labeled_set = labeled_set.add_labels(prob, source_name='nprob')
        labeled_set = labeled_set.add_x(transform=weak).add_x(transform=weak)
        labeled_set = labeled_set.add_y().add_y(source='nprob')
        labeled_set = labeled_set.subset(pred_idx)
        self.labeled_dataloader = labeled_set.DataLoader(
            params.batch_size, **loader_kwargs)

        # Unlabeled part: two weak views and the label only.
        unlabeled_set = DatasetBuilder(train_x, train_y)
        unlabeled_set = unlabeled_set.add_x(transform=weak)
        unlabeled_set = unlabeled_set.add_x(transform=weak)
        unlabeled_set = unlabeled_set.add_y().subset(unpred_idx)
        self.unlabeled_dataloader = unlabeled_set.DataLoader(
            params.batch_size, **loader_kwargs)
        self.unlabeled_dataloader_iter = None

        # The unlabeled loader is deliberately not added to the bundler here.
        bundler = DataBundler()
        bundler.add(self.labeled_dataloader)
        self.logger.info('new training dataset', bundler,
                         len(self.unlabeled_dataloader))
        self.regist_databundler(train=bundler.to(self.device))
Ejemplo n.º 4
0
    def on_train_epoch_begin(self, trainer: Trainer, func,
                             params: DivideMixParams, *args, **kwargs):
        """Re-split the training data into labeled/unlabeled loaders.

        Nothing happens while ``params.eidx < params.warm_up``. Afterwards
        ``self.eval_train`` is run with ``self.model2`` (even epochs, loss
        slot 1) or ``self.model`` (odd epochs, loss slot 0). Samples whose
        probability exceeds ``params.p_threshold`` form the labeled loader,
        the remainder the unlabeled loader; both are bundled and registered
        as the new ``train`` bundler.

        :param trainer: unused here.
        :param func: unused here.
        :param params: reads ``eidx``, ``warm_up``, ``p_threshold``,
            ``dataset``, ``batch_size`` and ``num_workers``.
        """
        if params.eidx < params.warm_up:
            return

        if params.eidx % 2 == 0:
            slot, net = 1, self.model2
        else:
            slot, net = 0, self.model
        prob, self.all_loss[slot] = self.eval_train(
            net, self.all_loss[slot])  # type: np.ndarray, list
        pred = (prob > params.p_threshold)

        pred_idx = pred.nonzero()[0]
        # ``1 - pred`` flips the boolean mask, selecting the complement.
        unpred_idx = (1 - pred).nonzero()[0]

        train_x, train_y, noisy_y = self.train_set_pack
        clean = (noisy_y == train_y)
        # NOTE(review): this is the share of clean samples that were selected;
        # the share of selected samples that are clean would be
        # ``clean[pred].mean()`` - confirm which one the log should report.
        acc = (pred[clean]).mean()
        self.logger.info('Numer of labeled samples', pred.sum(),
                         'clean ratio = {}'.format(acc))

        mean, std = norm_val.get(params.dataset, [None, None])
        weak = Weak(mean, std)

        loader_kwargs = dict(shuffle=True,
                             drop_last=True,
                             num_workers=params.num_workers)

        # Labeled part: two weak views, then y, 'nys' and 'nprob' targets.
        labeled_set = DatasetBuilder(train_x, train_y)
        labeled_set = labeled_set.add_labels(noisy_y, source_name='nys')
        labeled_set = labeled_set.add_labels(prob, source_name='nprob')
        labeled_set = labeled_set.add_x(transform=weak).add_x(transform=weak)
        labeled_set = labeled_set.add_y().add_y(source='nys')
        labeled_set = labeled_set.add_y(source='nprob').subset(pred_idx)
        labeled_dataloader = labeled_set.DataLoader(params.batch_size,
                                                    **loader_kwargs)

        # Unlabeled part: two weak views, then y and 'nys' targets.
        unlabeled_set = DatasetBuilder(train_x, train_y)
        unlabeled_set = unlabeled_set.add_labels(noisy_y, source_name='nys')
        unlabeled_set = unlabeled_set.add_x(transform=weak)
        unlabeled_set = unlabeled_set.add_x(transform=weak)
        unlabeled_set = unlabeled_set.add_y().add_y(source='nys')
        unlabeled_set = unlabeled_set.subset(unpred_idx)
        unlabeled_dataloader = unlabeled_set.DataLoader(params.batch_size,
                                                        **loader_kwargs)

        bundler = DataBundler()
        staged = bundler.add(labeled_dataloader)
        staged.cycle(unlabeled_dataloader).zip_mode()
        self.logger.info('new training dataset', bundler)
        self.regist_databundler(train=bundler.to(self.device))
Ejemplo n.º 5
0
 def _regist_databundler(self, key, val):
     """Store ``val`` under ``key`` in ``self._databundler_dict``.

     :param key: dictionary key to register the bundler under.
     :param val: a ``DataBundler`` or a torch ``DataLoader``; a bare
         ``DataLoader`` is first wrapped with ``DataBundler().add``.
     :raises AssertionError: if ``val`` is neither of the two types.
     """
     from torch.utils.data import DataLoader

     assert isinstance(val, (DataBundler, DataLoader))
     bundle = DataBundler().add(val) if isinstance(val, DataLoader) else val

     registry = self._databundler_dict
     # Remove any existing entry before storing the new one.
     if key in registry:
         del registry[key]
     registry[key] = bundle
Ejemplo n.º 6
0
    def datasets(self, params: DivideMixParams):
        """Build loaders for training with synthetically corrupted labels.

        Labels of the training split are corrupted with the generator
        selected by ``params.noisy_type`` ('asymmetric' or 'symmetric').
        Side effects: sets ``self.train_set_pack`` and
        ``self.eval_train_dataloader``, registers the train/test bundlers
        and calls ``self.to(self.device)``.

        :param params: reads ``dataset``, ``noisy_type``, ``noisy_ratio``,
            ``n_classes``, ``batch_size`` and ``num_workers``.
        :raises AssertionError: for any other ``params.noisy_type``.
        """
        from data.dataxy import datasets
        load_split = datasets[params.dataset]

        test_x, test_y = load_split(False)
        train_x, train_y = load_split(True)

        mean, std = norm_val.get(params.dataset, [None, None])
        toTensor = ToNormTensor(mean, std)
        weak = Weak(mean, std)
        strong = Strong(mean, std)

        # Import only the noise generator that is actually requested.
        if params.noisy_type == 'asymmetric':
            from data.noisy import asymmetric_noisy as make_noisy
        elif params.noisy_type == 'symmetric':
            from data.noisy import symmetric_noisy as make_noisy
        else:
            assert False, params.noisy_type
        noisy_y = make_noisy(train_y,
                             params.noisy_ratio,
                             n_classes=params.n_classes)
        self.train_set_pack = [train_x, np.array(train_y), noisy_y]

        label_agreement = (train_y == noisy_y).mean()
        self.logger.info('noisy acc = {}'.format(label_agreement))

        # Training samples: weak view, strong view, y, then 'noisy_y'.
        train_set = DatasetBuilder(train_x, train_y)
        train_set = train_set.add_labels(noisy_y, 'noisy_y').toggle_id()
        train_set = train_set.add_x(transform=weak).add_x(transform=strong)
        train_set = train_set.add_y().add_y(source='noisy_y')
        train_dataloader = train_set.DataLoader(
            batch_size=params.batch_size * 2,
            num_workers=params.num_workers,
            shuffle=True,
        )
        from thexp import DataBundler

        # Unshuffled loaders use half of the configured workers.
        plain_loader_kwargs = dict(batch_size=params.batch_size,
                                   num_workers=params.num_workers // 2,
                                   shuffle=False)

        eval_bundle = DataBundler()
        eval_train_set = DatasetBuilder(train_x, noisy_y).toggle_id()
        eval_train_set = eval_train_set.add_x(transform=toTensor).add_y()
        eval_train_loader = eval_train_set.DataLoader(**plain_loader_kwargs)
        self.eval_train_dataloader = (
            eval_bundle.add(eval_train_loader).to(self.device))

        test_set = DatasetBuilder(test_x, test_y)
        test_set = test_set.add_x(transform=toTensor).add_y()
        test_dataloader = test_set.DataLoader(**plain_loader_kwargs)

        self.regist_databundler(train=train_dataloader, test=test_dataloader)
        self.to(self.device)
Ejemplo n.º 7
0
    def datasets(self, params: DivideMixParams):
        """Build the clothing1m train / eval-train / test loaders.

        Side effects: sets ``self.train_set_pack``, ``self.train_set``,
        ``self.train_size`` and ``self.eval_train_dataloader``, registers the
        train/test bundlers and calls ``self.to(self.device)``.

        :param params: reads ``cut_size``, ``batch_size`` and ``num_workers``.
        """
        from data.dataxy_noisylabel import clothing1m_balance
        load_split = clothing1m_balance

        test_x, test_y = load_split(False)
        train_x, train_y = load_split(True, params.cut_size)

        mean, std = norm_val.get('clothing1m', [None, None])
        toTensor = BigToTensor(mean, std)
        weak = BigWeak(mean, std)

        self.train_set_pack = [train_x, np.array(train_y)]

        # Training samples carry a single weakly transformed view.
        train_set = DatasetBuilder(train_x, train_y).toggle_id()
        train_set = train_set.add_x(transform=weak).add_y()
        train_dataloader = train_set.DataLoader(
            batch_size=params.batch_size * 2,
            num_workers=params.num_workers,
            shuffle=True,
        )
        from thexp import DataBundler
        self.train_set = train_set
        self.train_size = len(train_set)

        # Unshuffled loaders use half of the configured workers.
        plain_loader_kwargs = dict(batch_size=params.batch_size,
                                   num_workers=params.num_workers // 2,
                                   shuffle=False)

        eval_bundle = DataBundler()
        eval_train_set = DatasetBuilder(train_x, train_y).toggle_id()
        eval_train_set = eval_train_set.add_x(transform=toTensor).add_y()
        eval_train_loader = eval_train_set.DataLoader(**plain_loader_kwargs)
        self.eval_train_dataloader = (
            eval_bundle.add(eval_train_loader).to(self.device))

        test_set = DatasetBuilder(test_x, test_y)
        test_set = test_set.add_x(transform=toTensor).add_y()
        test_dataloader = test_set.DataLoader(**plain_loader_kwargs)

        self.regist_databundler(train=train_dataloader,
                                test=test_dataloader)
        self.to(self.device)
Ejemplo n.º 8
0
    def datasets(self, params: SemiSupervisedParams):
        """Create sup/unsup/val/test loaders with ``params.K`` weak views.

        The training split is partitioned by ``splits.semi_split`` with a
        fixed validation size of 5000. Each unsupervised sample receives
        ``params.K`` weakly transformed views (``K`` is set through
        ``params.default(2, True)``). Registers the train/eval/test bundlers
        and calls ``self.to(self.device)``.

        :param params: reads ``dataset``, ``n_percls``, ``batch_size`` and
            ``num_workers``; ``K`` is assigned here.
        """
        load_split = datasets[params.dataset]

        test_x, test_y = load_split(False)
        train_x, train_y = load_split(True)

        indexs, un_indexs, val_indexs = splits.semi_split(
            train_y,
            n_percls=params.n_percls,
            val_size=5000,
            repeat_sup=False,
        )

        mean, std = norm_val.get(params.dataset, [None, None])
        toTensor = ToNormTensor(mean, std)
        weak = Weak(mean, std)

        sup_set = DatasetBuilder(train_x, train_y)
        sup_set = sup_set.add_x(transform=weak).add_y()
        sup_set = sup_set.subset(indexs)

        params.K = params.default(2, True)
        unsup_set = DatasetBuilder(train_x, train_y)
        for _ in range(params.K):
            # Return value intentionally ignored, as in the builder loop idiom
            # used here: the builder is updated by the call itself.
            unsup_set.add_x(transform=weak)
        unsup_set = unsup_set.add_y().subset(un_indexs)

        # All four loaders share the same configuration.
        loader_kwargs = dict(batch_size=params.batch_size,
                             num_workers=params.num_workers,
                             shuffle=True)

        sup_dataloader = sup_set.DataLoader(**loader_kwargs)
        unsup_dataloader = unsup_set.DataLoader(**loader_kwargs)

        val_set = DatasetBuilder(train_x[val_indexs], train_y[val_indexs])
        val_dataloader = (
            val_set.add_x(transform=toTensor).add_y()
            .DataLoader(**loader_kwargs))

        test_set = DatasetBuilder(test_x, test_y)
        test_dataloader = (
            test_set.add_x(transform=toTensor).add_y()
            .DataLoader(**loader_kwargs))

        train_bundle = DataBundler().cycle(sup_dataloader)
        train_bundle = train_bundle.add(unsup_dataloader).zip_mode()
        self.regist_databundler(train=train_bundle,
                                eval=val_dataloader,
                                test=test_dataloader)
        self.to(self.device)
Ejemplo n.º 9
0
 def datasets(self, params: ImblanceParams):
     """Register MNIST train/test bundlers, pairing train with validation.

     The training loader comes from ``get_mnist_loader`` in "train" mode.
     Its dataset's ``data_val`` / ``labels_val`` are wrapped in a ``ValSet``
     loader that is cycled alongside the training loader. The test loader
     uses a fixed proportion of 0.5.

     :param params: reads ``batch_size``, ``classes`` and
         ``train_proportion``.
     """
     super().datasets(params)

     data_loader = get_mnist_loader(
         params.batch_size,
         classes=params.classes,
         proportion=params.train_proportion,
         mode="train",
     )
     test_loader = get_mnist_loader(
         params.batch_size,
         classes=params.classes,
         proportion=0.5,
         mode="test",
     )

     source = data_loader.dataset
     val_dataset = ValSet(source.data_val, source.labels_val)
     val_loader = DataLoader(
         val_dataset,
         params.batch_size,
         drop_last=False,
         shuffle=True,
     )

     train_loader = DataBundler().add(data_loader)
     train_loader = train_loader.cycle(val_loader).zip_mode()
     self.regist_databundler(train=train_loader, test=test_loader)
     self.to(self.device)
Ejemplo n.º 10
0
"""

"""
from torch.utils.data.dataloader import DataLoader
from torchvision.datasets.fakedata import FakeData
from torchvision.transforms import ToTensor

from thexp import DataBundler

bundler = DataBundler()

# Inner bundle: two FakeData loaders (default image size and 3x32x32).
sub = DataBundler()
default_loader = DataLoader(FakeData(transform=ToTensor()), batch_size=10)
loader_32 = DataLoader(
    FakeData(image_size=(3, 32, 32), transform=ToTensor()),
    batch_size=10,
)
(sub.add(default_loader)
    .add(loader_32)
    .zip_mode())

# Outer bundle: the inner bundle next to a 3x28x28 FakeData loader.
loader_28 = DataLoader(
    FakeData(image_size=(3, 28, 28), transform=ToTensor()),
    batch_size=10,
)
(bundler.add(sub)
    .add(loader_28)
    .zip_mode())

# Each step unpacks as ((batch, batch), batch), mirroring the nesting above.
for ((i1, l1), (i2, l2)), (i3, l3) in bundler:
    print(i1.shape, l1.shape, i2.shape, l2.shape, i3.shape, l3.shape)

# A 10-sample loader registered with ``cycle`` next to a 1000-sample loader.
short_loader = DataLoader(
    FakeData(size=10, image_size=(3, 28, 28), transform=ToTensor()),
    batch_size=10,
)
long_loader = DataLoader(
    FakeData(size=1000, image_size=(3, 28, 28), transform=ToTensor()),
    batch_size=10,
)
bundler = DataBundler().cycle(short_loader).add(long_loader).zip_mode()
Ejemplo n.º 11
0
    def datasets(self, params: MnistImblanceParams):
        """Build an imbalanced MNIST subset and register its loaders.

        Steps: keep only the samples whose label is in
        ``params.train_classes`` and relabel them ``0..n-1`` in that order;
        split train/val; truncate class 1 of the training part to
        ``int((1 - params.train_proportion) * <size of class 0>)`` samples;
        register train (paired with the cycled validation loader), eval and
        test bundlers.

        :param params: reads ``train_classes``, ``val_size``,
            ``train_proportion``, ``batch_size`` and ``num_workers``.
        """
        super().datasets(params)
        from data.dataxy import mnist

        test_x, test_y = mnist(False)
        train_x, train_y = mnist(True)

        train_y = np.array(train_y, dtype=np.float32)
        test_y = np.array(test_y, dtype=np.float32)

        # Locate every selected class before relabelling, so that the new
        # ids written below cannot be mistaken for original labels.
        train_hits = [np.where(train_y == c)[0] for c in params.train_classes]
        test_hits = [np.where(test_y == c)[0] for c in params.train_classes]

        for new_cls, (tr_idx, te_idx) in enumerate(zip(train_hits, test_hits)):
            train_y[tr_idx] = new_cls
            test_y[te_idx] = new_cls

        train_keep = np.concatenate(train_hits)
        test_keep = np.concatenate(test_hits)

        test_x, test_y = test_x[test_keep], test_y[test_keep]
        train_x, train_y = train_x[train_keep], train_y[train_keep]

        # Split off the validation part.
        train_ids, val_ids = splits.train_val_split(train_y,
                                                    val_size=params.val_size)

        train_x, val_x = train_x[train_ids], train_x[val_ids]
        train_y, val_y = train_y[train_ids], train_y[val_ids]

        # Shrink the second class relative to the size of the first one.
        n_classes = len(params.train_classes)
        per_class = [np.where(train_y == c)[0] for c in range(n_classes)]
        sec_cls_size = int((1 - params.train_proportion) * len(per_class[0]))
        per_class[1] = per_class[1][:sec_cls_size]
        train_keep = np.concatenate(per_class)
        train_x, train_y = train_x[train_keep], train_y[train_keep]

        toTensor = ToNormTensor((0.1307, ), (0.3081, ))

        # All three loaders share the same configuration.
        loader_kwargs = dict(batch_size=params.batch_size,
                             num_workers=params.num_workers,
                             shuffle=True)

        train_set = DatasetBuilder(train_x, train_y).add_x(toTensor).add_y()
        train_dataloader = train_set.DataLoader(**loader_kwargs)

        val_set = DatasetBuilder(val_x, val_y)
        val_set = val_set.add_x(transform=toTensor).add_y()
        val_dataloader = val_set.DataLoader(**loader_kwargs)

        test_set = DatasetBuilder(test_x, test_y)
        test_set = test_set.add_x(transform=toTensor).add_y()
        test_dataloader = test_set.DataLoader(**loader_kwargs)

        train_bundle = DataBundler().add(train_dataloader)
        train_bundle = train_bundle.cycle(val_dataloader).zip_mode()
        self.regist_databundler(train=train_bundle,
                                eval=val_dataloader,
                                test=test_dataloader)
        self.to(self.device)
Ejemplo n.º 12
0
    def datasets(self, params: NoisyParams):
        """Build noisy-train / query / eval / test loaders and register them.

        The training split is partitioned by ``splits.semi_split`` into a
        query part, a training part and an evaluation part. Labels of the
        training part are corrupted according to ``params.noisy_type``. The
        training loader is paired with the cycled query loader and registered
        as ``train``; the evaluation and test loaders are registered as
        ``eval`` and ``test``.

        Side effects: sets ``params.noisy_type``, ``params.noisy_ratio`` and
        ``params.K`` through ``params.default``, sets ``self.train_size`` and,
        when ``params.distributed`` is true, ``self.sampler_a`` (training
        sampler) and ``self.sampler_b`` (query sampler).

        :param params: reads ``dataset``, ``query_size``, ``n_classes``,
            ``val_size``, ``batch_size``, ``num_workers`` and ``distributed``.
        :raises AssertionError: if ``params.noisy_type`` is neither
            'asymmetric' nor 'symmetric'.
        """
        params.noisy_type = params.default('symmetric', True)
        params.noisy_ratio = params.default(0.2, True)

        dataset_fn = datasets[params.dataset]
        test_x, test_y = dataset_fn(False)
        train_x, train_y = dataset_fn(True)

        query_ids, train_ids, eval_ids = splits.semi_split(
            train_y,
            params.query_size // params.n_classes,
            val_size=params.val_size,
            repeat_sup=False)

        self.train_size = len(train_ids)
        # Single tuple assignments: every right-hand side indexes the
        # original, not yet reduced, arrays.
        train_x, query_x, eval_x = (train_x[train_ids], train_x[query_ids],
                                    train_x[eval_ids])
        train_y, query_y, eval_y = (train_y[train_ids], train_y[query_ids],
                                    train_y[eval_ids])

        mean, std = norm_val.get(params.dataset, [None, None])
        toTensor = ToNormTensor(mean, std)
        weak = Weak(mean, std)
        strong = Strong(mean, std)

        if params.noisy_type == 'asymmetric':
            from data.noisy import asymmetric_noisy
            noisy_y = asymmetric_noisy(train_y,
                                       params.noisy_ratio,
                                       n_classes=params.n_classes)

        elif params.noisy_type == 'symmetric':
            from data.noisy import symmetric_noisy
            noisy_y = symmetric_noisy(train_y,
                                      params.noisy_ratio,
                                      n_classes=params.n_classes)

        else:
            assert False

        self.logger.info('noisy acc = {}'.format((train_y == noisy_y).mean()))
        self.rnd.shuffle()

        self.logger.info(len(train_y), len(train_x), len(noisy_y))
        train_set = (DatasetBuilder(train_x, train_y).add_labels(
            noisy_y, 'noisy_y').toggle_id().add_x(transform=strong))

        # Optionally add K extra weakly transformed views (0 by default).
        params.K = params.default(0, True)
        for _ in range(params.K):
            train_set.add_x(transform=weak)

        train_set = (train_set.add_y().add_y(source='noisy_y'))

        if params.distributed:
            from torch.utils.data.distributed import DistributedSampler
            train_sampler = DistributedSampler(train_set, num_replicas=4)
            self.sampler_a = train_sampler
        else:
            train_sampler = None
        # A sampler and shuffle=True are mutually exclusive in DataLoader,
        # hence shuffling is only enabled without a sampler.
        train_loader = train_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            sampler=train_sampler,
                                            shuffle=not params.distributed)

        query_set = (DatasetBuilder(query_x,
                                    query_y).add_x(transform=strong).add_y())
        if params.distributed:
            from torch.utils.data.distributed import DistributedSampler
            # The sampler must index the query dataset itself. It was
            # previously built from the training DataLoader, so its indices
            # ranged over the number of training batches.
            query_sampler = DistributedSampler(query_set, num_replicas=4)
            self.sampler_b = query_sampler
        else:
            query_sampler = None
        query_loader = query_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            sampler=query_sampler,
                                            shuffle=not params.distributed)

        # The evaluation loader is deliberately left unshuffled.
        val_dataloader = (
            DatasetBuilder(eval_x, eval_y).add_x(transform=toTensor).add_y().
            DataLoader(batch_size=params.batch_size,
                       shuffle=False,
                       num_workers=params.num_workers))

        train_dataloader = DataBundler().add(train_loader).cycle(
            query_loader).zip_mode()

        test_dataloader = (DatasetBuilder(
            test_x, test_y).add_x(transform=toTensor).add_y().DataLoader(
                batch_size=params.batch_size,
                num_workers=params.num_workers,
                shuffle=True))

        self.regist_databundler(train=train_dataloader,
                                eval=val_dataloader,
                                test=test_dataloader)
        self.to(self.device)
Ejemplo n.º 13
0
    def change_dataset(self):
        """
        Split the training data into a supervised and an unsupervised part
        according to the current ``filter_mem``: samples whose filter
        probability is above 0.5 count as clean (supervised), the others as
        noisy (unsupervised).

        Returns early without changing anything when every sample is clean.
        Sets ``self.ssl_dataloader`` to ``None`` when either loader is empty;
        otherwise stores the paired loader in ``self.ssl_dataloader`` and a
        fresh iterator over it in ``self.ssl_loaderiter``.

        :return: None
        """
        from data.constant import norm_val
        from data.transforms import ToNormTensor, Weak, Strong

        train_x, train_y, noisy_y = self.train_set

        filter_prob = self.filter_mem.cpu().numpy()
        clean_mask = filter_prob > 0.5
        noisy_mask = np.logical_not(clean_mask)
        self.logger.info('sup size', clean_mask.sum())
        # NOTE(review): both operands are true exactly when every sample is
        # clean; the "no sample is clean" case is not caught here - confirm
        # whether ``not clean_mask.any()`` was intended.
        if clean_mask.all() or not noisy_mask.any():
            return

        clean_ids = np.where(clean_mask)[0]
        noisy_ids = np.where(noisy_mask)[0]

        # NOTE(review): ``params`` is neither a parameter nor a local of this
        # method; it has to resolve from an enclosing scope - verify.
        mean, std = norm_val.get(params.dataset, [None, None])
        weak = Weak(mean, std)
        strong = Strong(mean, std)

        # Supervised part: one strong view, then y and 'ny' targets.
        sup_set = DatasetBuilder(train_x, train_y)
        sup_set = sup_set.add_labels(noisy_y, source_name='ny').toggle_id()
        sup_set = sup_set.add_x(strong)
        sup_set = sup_set.add_y().add_y(source='ny')
        sup_set = sup_set.subset(clean_ids)
        supervised_dataloader = sup_set.DataLoader(params.batch_size // 2,
                                                   shuffle=True,
                                                   num_workers=0,
                                                   drop_last=True)

        # Unsupervised part: two strong views, then y, 'ny' and 'nprob'.
        unsup_set = DatasetBuilder(train_x, train_y)
        unsup_set = unsup_set.add_labels(noisy_y, source_name='ny')
        unsup_set = unsup_set.add_labels(filter_prob, source_name='nprob')
        unsup_set = unsup_set.toggle_id()
        unsup_set = unsup_set.add_x(strong).add_x(strong)
        unsup_set = unsup_set.add_y().add_y(source='ny')
        unsup_set = unsup_set.add_y(source='nprob')
        unsup_set = unsup_set.subset(noisy_ids)
        unsupervised_dataloader = unsup_set.DataLoader(params.batch_size // 2,
                                                       shuffle=True,
                                                       num_workers=0,
                                                       drop_last=True)

        # The loader with fewer batches is the one registered via ``cycle``.
        sup_is_longer = (
            len(supervised_dataloader) > len(unsupervised_dataloader))
        train_dataloader = DataBundler()
        if sup_is_longer:
            train_dataloader = train_dataloader.add(supervised_dataloader)
            train_dataloader = train_dataloader.cycle(unsupervised_dataloader)
        else:
            train_dataloader = train_dataloader.cycle(supervised_dataloader)
            train_dataloader = train_dataloader.add(unsupervised_dataloader)

        if len(unsupervised_dataloader) == 0 or len(supervised_dataloader) == 0:
            self.ssl_dataloader = None
            return

        self.ssl_dataloader = train_dataloader.zip_mode().to(self.device)
        self.logger.info('ssl loader size', train_dataloader)
        self.ssl_loaderiter = iter(self.ssl_dataloader)