def on_train_epoch_begin(self, trainer: Trainer, func, params: DivideMixParams, *args, **kwargs):
    """Rebuild the semi-supervised loaders at the start of each post-warm-up epoch.

    Alternates which co-trained network scores the training samples: odd
    epochs use ``model1``'s memory banks, even epochs use ``model2``'s.
    Samples whose estimated clean probability exceeds ``params.p_threshold``
    become the labeled subset; the remainder become the unlabeled subset.
    During warm-up nothing is rebuilt.
    """
    if params.eidx > params.warm_up:
        self.logger.info('create semi dataset')

        # Co-divide: the peer model estimates a per-sample clean probability.
        if params.eidx % 2 == 1:
            prob = self.eval_train(self.model1,
                                   target_mem=self.target_mem1,
                                   plabel_mem=self.plabel_mem1,
                                   false_pred_mem=self.false_pred_mem1,
                                   noisy_cls_mem=self.noisy_cls_mem1)  # type: np.ndarray
        else:
            prob = self.eval_train(self.model2,
                                   target_mem=self.target_mem2,
                                   plabel_mem=self.plabel_mem2,
                                   false_pred_mem=self.false_pred_mem2,
                                   noisy_cls_mem=self.noisy_cls_mem2)  # type: np.ndarray

        pred = (prob > params.p_threshold)
        pred_idx = pred.nonzero()[0]
        unpred_idx = (1 - pred).nonzero()[0]  # indices not selected as clean

        train_x, train_y = self.train_set_pack

        mean, std = norm_val.get(params.dataset, [None, None])
        weak = BigWeak(mean, std)

        # Labeled loader: two weak views + target + the clean probability.
        labeled_builder = (DatasetBuilder(train_x, train_y)
                           .add_labels(prob, source_name='nprob')
                           .add_x(transform=weak)
                           .add_x(transform=weak)
                           .add_y()
                           .add_y(source='nprob')
                           .subset(pred_idx))
        self.labeled_dataloader = labeled_builder.DataLoader(params.batch_size,
                                                             shuffle=True,
                                                             drop_last=True,
                                                             num_workers=params.num_workers)

        # Unlabeled loader: two weak views; target kept for bookkeeping only.
        unlabeled_builder = (DatasetBuilder(train_x, train_y)
                             .add_x(transform=weak)
                             .add_x(transform=weak)
                             .add_y()
                             .subset(unpred_idx))
        self.unlabeled_dataloader = unlabeled_builder.DataLoader(params.batch_size,
                                                                 shuffle=True,
                                                                 drop_last=True,
                                                                 num_workers=params.num_workers)
        self.unlabeled_dataloader_iter = None

        # NOTE(review): only the labeled loader is bundled here; the cycled
        # unlabeled loader is deliberately commented out in the original.
        bundler = DataBundler()
        bundler.add(self.labeled_dataloader)  # .cycle(self.unlabeled_dataloader).zip_mode()
        self.logger.info('new training dataset', bundler, len(self.unlabeled_dataloader))
        self.regist_databundler(train=bundler.to(self.device))
def datasets(self, params: DivideMixParams):
    """Create the warm-up train loader, the sequential eval-train loader,
    and the test loader for the balanced Clothing1M noisy-label dataset.

    Also stashes ``(train_x, np.array(train_y))`` in ``self.train_set_pack``
    so the epoch hook can rebuild semi-supervised subsets later.
    """
    from data.dataxy_noisylabel import clothing1m_balance
    from thexp import DataBundler

    test_x, test_y = clothing1m_balance(False)
    train_x, train_y = clothing1m_balance(True, params.cut_size)

    mean, std = norm_val.get('clothing1m', [None, None])
    toTensor = BigToTensor(mean, std)
    weak = BigWeak(mean, std)
    # strong = BigStrong(mean, std)

    self.train_set_pack = [train_x, np.array(train_y)]

    # Warm-up training set: ids + one weak view + target.
    train_set = (DatasetBuilder(train_x, train_y)
                 .toggle_id()
                 .add_x(transform=weak)
                 # .add_x(transform=strong)
                 .add_y())
    self.train_set = train_set
    self.train_size = len(train_set)

    train_dataloader = train_set.DataLoader(batch_size=params.batch_size * 2,
                                            num_workers=params.num_workers,
                                            shuffle=True)

    # Deterministic pass over the training data, used to score samples.
    eval_loader = (DatasetBuilder(train_x, train_y)
                   .toggle_id()
                   .add_x(transform=toTensor)
                   .add_y()
                   .DataLoader(batch_size=params.batch_size,
                               num_workers=params.num_workers // 2,
                               shuffle=False))
    self.eval_train_dataloader = DataBundler().add(eval_loader).to(self.device)

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers // 2,
                                   shuffle=False))

    self.regist_databundler(train=train_dataloader, test=test_dataloader)
    self.to(self.device)
def on_train_epoch_begin(self, trainer: Trainer, func, params: DivideMixParams, *args, **kwargs):
    """Rebuild the DivideMix labeled/unlabeled loaders before each post-warm-up epoch.

    On even epochs the split is computed from ``self.model2``'s per-sample
    clean probabilities, on odd epochs from ``self.model``'s.  Samples with
    probability above ``params.p_threshold`` form the labeled set, the rest
    the unlabeled set.  Because ground-truth labels are available in
    ``self.train_set_pack``, the purity of the labeled selection is logged.
    """
    if params.eidx < params.warm_up:
        pass
    else:
        # Co-divide: alternate which network scores the training samples.
        if params.eidx % 2 == 0:
            prob = self.eval_train(self.model2)  # type: np.ndarray
            pred = (prob > params.p_threshold)
        else:
            prob = self.eval_train(self.model)  # type: np.ndarray
            pred = (prob > params.p_threshold)

        pred_idx = pred.nonzero()[0]
        unpred_idx = (~pred).nonzero()[0]  # complement of the labeled selection

        train_x, train_y, noisy_y = self.train_set_pack
        clean = (noisy_y == train_y)
        # Fraction of truly-clean samples that were selected as labeled
        # (logged as 'clean ratio').
        acc = (pred[clean]).mean()
        self.logger.info('Number of labeled samples', pred.sum(),
                         'clean ratio = {}'.format(acc))

        mean, std = norm_val.get(params.dataset, [None, None])
        weak = BigWeak(mean, std)

        # Labeled loader: two weak views + true target + noisy label + probability.
        labeled_dataloader = (DatasetBuilder(train_x, train_y)
                              .add_labels(noisy_y, source_name='nys')
                              .add_labels(prob, source_name='nprob')
                              .add_x(transform=weak)
                              .add_x(transform=weak)
                              .add_y()
                              .add_y(source='nys')
                              .add_y(source='nprob')
                              .subset(pred_idx)
                              .DataLoader(params.batch_size,
                                          shuffle=True,
                                          drop_last=True,
                                          num_workers=params.num_workers))

        # Unlabeled loader: two weak views + true target + noisy label.
        unlabeled_dataloader = (DatasetBuilder(train_x, train_y)
                                .add_labels(noisy_y, source_name='nys')
                                .add_x(transform=weak)
                                .add_x(transform=weak)
                                .add_y()
                                .add_y(source='nys')
                                .subset(unpred_idx)
                                .DataLoader(params.batch_size,
                                            shuffle=True,
                                            drop_last=True,
                                            num_workers=params.num_workers))

        # Zip the unlabeled loader cyclically against the labeled one so every
        # batch yields one labeled and one unlabeled mini-batch.
        bundler = DataBundler()
        bundler.add(labeled_dataloader).cycle(unlabeled_dataloader).zip_mode()
        self.logger.info('new training dataset', bundler)
        self.regist_databundler(train=bundler.to(self.device))
def datasets(self, params: GlobalParams):
    """Build the train and test loaders for Clothing1M with (EMA-cleaned) labels.

    The training set carries sample ids plus one weak and one strong view per
    sample; it is pre-shuffled once via a random permutation, so the loader
    itself iterates with ``shuffle=False``.
    """
    self.rnd.mark('assign')
    from data.dataxy_noisylabel import (clothing1m_clean_train, clothing1m_balance,
                                        clothing1m_clean_dividimix_train,
                                        clothing1m_clean_ema_train)
    from data.transforms import BigStrong, BigWeak, BigToTensor

    # Fill in defaults only when the params object does not already set them.
    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)
    params.cut_size = params.default(3360, True)

    mean, std = norm_val.get('clothing1m', [None, None])
    to_tensor = BigToTensor(mean, std)
    weak_aug = BigWeak(mean, std)
    strong_aug = BigStrong(mean, std)

    test_x, test_y = clothing1m_balance(False, per_cls=3360)
    # train_x, noisy_y = clothing1m_clean_train()
    train_x, noisy_y = clothing1m_clean_ema_train()
    val_dataloader = None
    self.logger.info(train_x[:2])

    # One-off random permutation replaces per-epoch shuffling below.
    sub_ids = np.random.permutation(len(train_x))
    train_set = (DatasetBuilder(train_x, noisy_y)
                 .toggle_id()
                 .add_x(transform=weak_aug)
                 .add_x(transform=strong_aug)
                 .add_y()
                 .subset(sub_ids))
    self.train_set = train_set
    self.train_size = len(train_set)

    train_dataloader = train_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            drop_last=True,
                                            shuffle=False)

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=to_tensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=False))

    self.regist_databundler(train=train_dataloader,
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)