Example #1
0
    def on_train_epoch_begin(self, trainer: Trainer, func,
                             params: DivideMixParams, *args, **kwargs):
        """Rebuild the semi-supervised loaders at the start of each epoch after warm-up.

        Alternates between model1 and model2 (co-training) to estimate a
        per-sample clean probability, then splits the training data into a
        labeled subset (prob > p_threshold) and an unlabeled subset, and
        registers the new labeled bundler as the training source.

        Args:
            trainer: the Trainer driving the loop (unused here).
            func: the hooked function (unused here).
            params: experiment parameters (eidx, warm_up, p_threshold, ...).
        """
        # Warm-up phase: keep the current (noisy) training loader untouched.
        if params.eidx <= params.warm_up:
            return

        self.logger.info('create semi dataset')
        # Co-divide: on odd epochs model1 scores the data, on even epochs model2.
        if params.eidx % 2 == 1:
            prob = self.eval_train(self.model1,
                                   target_mem=self.target_mem1,
                                   plabel_mem=self.plabel_mem1,
                                   false_pred_mem=self.false_pred_mem1,
                                   noisy_cls_mem=self.noisy_cls_mem1)  # type: np.ndarray
        else:
            prob = self.eval_train(self.model2,
                                   target_mem=self.target_mem2,
                                   plabel_mem=self.plabel_mem2,
                                   false_pred_mem=self.false_pred_mem2,
                                   noisy_cls_mem=self.noisy_cls_mem2)  # type: np.ndarray

        pred = (prob > params.p_threshold)
        pred_idx = pred.nonzero()[0]          # samples treated as clean / labeled
        unpred_idx = (1 - pred).nonzero()[0]  # remaining samples -> unlabeled

        train_x, train_y = self.train_set_pack

        mean, std = norm_val.get(params.dataset, [None, None])
        weak = BigWeak(mean, std)

        # Labeled loader: two weak views per sample, plus label and clean-prob.
        self.labeled_dataloader = (DatasetBuilder(train_x, train_y)
                                   .add_labels(prob, source_name='nprob')
                                   .add_x(transform=weak)
                                   .add_x(transform=weak)
                                   .add_y()
                                   .add_y(source='nprob')
                                   .subset(pred_idx)
                                   .DataLoader(params.batch_size,
                                               shuffle=True,
                                               drop_last=True,
                                               num_workers=params.num_workers))

        # Unlabeled loader: two weak views per sample only.
        self.unlabeled_dataloader = (DatasetBuilder(train_x, train_y)
                                     .add_x(transform=weak)
                                     .add_x(transform=weak)
                                     .add_y()
                                     .subset(unpred_idx)
                                     .DataLoader(params.batch_size,
                                                 shuffle=True,
                                                 drop_last=True,
                                                 num_workers=params.num_workers))
        # Reset the lazy iterator so the next fetch starts from the new loader.
        self.unlabeled_dataloader_iter = None

        bundler = DataBundler()
        bundler.add(self.labeled_dataloader)  # .cycle(self.unlabeled_dataloader).zip_mode()
        self.logger.info('new training dataset', bundler,
                         len(self.unlabeled_dataloader))
        self.regist_databundler(train=bundler.to(self.device))
Example #2
0
    def datasets(self, params: DivideMixParams):
        """Build and register the clothing1m train / eval-train / test dataloaders."""

        from data.dataxy_noisylabel import clothing1m_balance
        dataset_fn = clothing1m_balance

        test_x, test_y = dataset_fn(False)
        train_x, train_y = dataset_fn(True, params.cut_size)

        mean, std = norm_val.get('clothing1m', [None, None])
        toTensor = BigToTensor(mean, std)
        weak = BigWeak(mean, std)
        # strong = BigStrong(mean, std)

        # Keep raw data around so later epochs can rebuild loaders from it.
        self.train_set_pack = [train_x, np.array(train_y)]

        # Training set: sample id + one weakly augmented view + label.
        builder = DatasetBuilder(train_x, train_y)
        builder = builder.toggle_id()
        builder = builder.add_x(transform=weak)
        # builder = builder.add_x(transform=strong)
        builder = builder.add_y()
        train_set = builder
        train_dataloader = train_set.DataLoader(batch_size=params.batch_size * 2,
                                                num_workers=params.num_workers,
                                                shuffle=True)
        from thexp import DataBundler
        self.train_set = train_set
        self.train_size = len(train_set)

        # Deterministic (un-shuffled, un-augmented) pass over the train data,
        # used to estimate per-sample losses/probabilities.
        eval_builder = (DatasetBuilder(train_x, train_y)
                        .toggle_id()
                        .add_x(transform=toTensor)
                        .add_y())
        eval_loader = eval_builder.DataLoader(batch_size=params.batch_size,
                                              num_workers=params.num_workers // 2,
                                              shuffle=False)
        self.eval_train_dataloader = DataBundler().add(eval_loader).to(self.device)

        # Plain test loader.
        test_builder = DatasetBuilder(test_x, test_y).add_x(transform=toTensor).add_y()
        test_dataloader = test_builder.DataLoader(batch_size=params.batch_size,
                                                  num_workers=params.num_workers // 2,
                                                  shuffle=False)

        self.regist_databundler(train=train_dataloader,
                                test=test_dataloader)
        self.to(self.device)
    def on_train_epoch_begin(self, trainer: Trainer, func,
                             params: DivideMixParams, *args, **kwargs):
        """Split the train set into labeled/unlabeled loaders after warm-up.

        Alternates which network scores the data each epoch, thresholds the
        resulting clean probability, logs how pure the labeled split is, and
        registers a zipped labeled+unlabeled bundler as the training source.

        Args:
            trainer: the Trainer driving the loop (unused here).
            func: the hooked function (unused here).
            params: experiment parameters (eidx, warm_up, p_threshold, ...).
        """
        # Warm-up phase: keep training on the plain noisy loader.
        if params.eidx < params.warm_up:
            return

        # Alternate which network produces the clean-probability estimate.
        if params.eidx % 2 == 0:
            prob = self.eval_train(self.model2)  # type: np.ndarray
        else:
            prob = self.eval_train(self.model)  # type: np.ndarray
        # Threshold once for both branches (was duplicated in each branch).
        pred = (prob > params.p_threshold)

        pred_idx = pred.nonzero()[0]          # labeled (deemed clean) indices
        unpred_idx = (1 - pred).nonzero()[0]  # unlabeled indices

        train_x, train_y, noisy_y = self.train_set_pack
        # Fraction of the truly-clean samples that the split kept as labeled.
        clean = (noisy_y == train_y)
        acc = (pred[clean]).mean()
        self.logger.info('Numer of labeled samples', pred.sum(),
                         'clean ratio = {}'.format(acc))

        mean, std = norm_val.get(params.dataset, [None, None])
        weak = BigWeak(mean, std)

        # Labeled loader: two weak views + true label + noisy label + clean-prob.
        labeled_dataloader = (DatasetBuilder(train_x, train_y)
                              .add_labels(noisy_y, source_name='nys')
                              .add_labels(prob, source_name='nprob')
                              .add_x(transform=weak)
                              .add_x(transform=weak)
                              .add_y()
                              .add_y(source='nys')
                              .add_y(source='nprob')
                              .subset(pred_idx)
                              .DataLoader(params.batch_size,
                                          shuffle=True,
                                          drop_last=True,
                                          num_workers=params.num_workers))

        # Unlabeled loader: two weak views + true label + noisy label.
        unlabeled_dataloader = (DatasetBuilder(train_x, train_y)
                                .add_labels(noisy_y, source_name='nys')
                                .add_x(transform=weak)
                                .add_x(transform=weak)
                                .add_y()
                                .add_y(source='nys')
                                .subset(unpred_idx)
                                .DataLoader(params.batch_size,
                                            shuffle=True,
                                            drop_last=True,
                                            num_workers=params.num_workers))

        bundler = DataBundler()
        # Cycle the unlabeled loader so both streams yield batches in lockstep.
        bundler.add(labeled_dataloader).cycle(unlabeled_dataloader).zip_mode()
        self.logger.info('new training dataset', bundler)
        self.regist_databundler(train=bundler.to(self.device))
Example #4
0
    def datasets(self, params: GlobalParams):
        """Register train/test dataloaders built from the clean-EMA clothing1m split."""
        self.rnd.mark('assign')
        from data.dataxy_noisylabel import (clothing1m_clean_train,
                                            clothing1m_balance,
                                            clothing1m_clean_dividimix_train,
                                            clothing1m_clean_ema_train)
        from data.transforms import BigStrong, BigWeak, BigToTensor

        # Fill in default hyper-parameters when not supplied externally.
        params.noisy_type = params.default('symmetric', True)
        params.noisy_ratio = params.default(0.2, True)
        params.cut_size = params.default(3360, True)

        mean, std = norm_val.get('clothing1m', [None, None])
        toTensor = BigToTensor(mean, std)
        weak = BigWeak(mean, std)
        strong = BigStrong(mean, std)

        test_x, test_y = clothing1m_balance(False, per_cls=3360)
        # train_x, noisy_y = clothing1m_clean_train()
        train_x, noisy_y = clothing1m_clean_ema_train()
        val_dataloader = None
        self.logger.info(train_x[:2])

        # Shuffle once here; the loader itself then iterates in fixed order.
        sub_ids = np.random.permutation(len(train_x))

        # Training set: sample id + weak view + strong view + label.
        train_builder = (DatasetBuilder(train_x, noisy_y)
                         .toggle_id()
                         .add_x(transform=weak)
                         .add_x(transform=strong)
                         .add_y()
                         .subset(sub_ids))
        self.train_set = train_builder
        self.train_size = len(train_builder)
        train_dataloader = train_builder.DataLoader(batch_size=params.batch_size,
                                                    num_workers=params.num_workers,
                                                    drop_last=True,
                                                    shuffle=False)

        # Plain test loader.
        test_builder = DatasetBuilder(test_x, test_y).add_x(transform=toTensor).add_y()
        test_dataloader = test_builder.DataLoader(batch_size=params.batch_size,
                                                  num_workers=params.num_workers,
                                                  shuffle=False)

        self.regist_databundler(train=train_dataloader,
                                eval=val_dataloader,
                                test=test_dataloader)
        self.to(self.device)