def DataBundler(self, add=True, name=None):
    from thexp import DataBundler
    bundler = DataBundler()
    if add:
        return bundler.add(self, name)
    else:
        return bundler.cycle(self, name)
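# A minimal usage sketch of the helper above, assuming it is attached to
# torch's DataLoader (which the `self` parameter suggests); FakeData is used
# only to build an illustrative loader.
#
#   from torch.utils.data import DataLoader
#   from torchvision.datasets.fakedata import FakeData
#   from torchvision.transforms import ToTensor
#
#   loader = DataLoader(FakeData(transform=ToTensor()), batch_size=10)
#   bundler = loader.DataBundler()           # DataBundler().add(loader)
#   cycled = loader.DataBundler(add=False)   # DataBundler().cycle(loader)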
def datasets(self, params: SemiSupervisedParams):
    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    indexs, un_indexs, val_indexs = splits.semi_split(
        train_y,
        n_percls=params.n_percls,
        val_size=params.val_size,
        repeat_sup=False)
    self.logger.info('sup/unsup/val : {}'.format(
        (len(indexs), len(un_indexs), len(val_indexs))))

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    sup_set = (DatasetBuilder(train_x, train_y)
               .add_x(transform=weak)
               .add_y()
               .subset(indexs))
    if len(sup_set) < params.batch_size:
        sup_set.virtual_sample(params.batch_size)

    unsup_set = (DatasetBuilder(train_x, train_y)
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_x(transform=strong)
                 .add_y()
                 .subset(un_indexs))
    self.cl_set = unsup_set

    sup_dataloader = sup_set.DataLoader(batch_size=params.batch_size,
                                        num_workers=params.num_workers,
                                        shuffle=True)
    self.sup_dataloader = sup_dataloader

    unsup_dataloader = unsup_set.DataLoader(batch_size=params.batch_size * params.uratio,
                                            num_workers=1,
                                            shuffle=True)
    self.unsup_dataloader = DataBundler().add(unsup_dataloader).to(self.device)

    val_dataloader = (DatasetBuilder(train_x[val_indexs], train_y[val_indexs])
                      .add_x(transform=toTensor)
                      .add_y()
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers,
                                  shuffle=True))

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=True))

    self.regist_databundler(train=DataBundler().cycle(sup_dataloader).add(unsup_dataloader).zip_mode(),
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)
def on_train_epoch_begin(self, trainer: Trainer, func,
                         params: DivideMixParams, *args, **kwargs):
    if params.eidx <= params.warm_up:
        pass
    else:
        self.logger.info('create semi dataset')
        # alternate which model evaluates the training set each epoch
        if params.eidx % 2 == 1:
            prob = self.eval_train(self.model1,
                                   target_mem=self.target_mem1,
                                   plabel_mem=self.plabel_mem1,
                                   false_pred_mem=self.false_pred_mem1,
                                   noisy_cls_mem=self.noisy_cls_mem1)  # type: np.ndarray
        else:
            prob = self.eval_train(self.model2,
                                   target_mem=self.target_mem2,
                                   plabel_mem=self.plabel_mem2,
                                   false_pred_mem=self.false_pred_mem2,
                                   noisy_cls_mem=self.noisy_cls_mem2)  # type: np.ndarray

        pred = (prob > params.p_threshold)
        pred_idx = pred.nonzero()[0]
        unpred_idx = (1 - pred).nonzero()[0]

        train_x, train_y = self.train_set_pack
        mean, std = norm_val.get(params.dataset, [None, None])
        weak = BigWeak(mean, std)

        self.labeled_dataloader = (DatasetBuilder(train_x, train_y)
                                   .add_labels(prob, source_name='nprob')
                                   .add_x(transform=weak)
                                   .add_x(transform=weak)
                                   .add_y()
                                   .add_y(source='nprob')
                                   .subset(pred_idx)
                                   .DataLoader(params.batch_size,
                                               shuffle=True,
                                               drop_last=True,
                                               num_workers=params.num_workers))
        self.unlabeled_dataloader = (DatasetBuilder(train_x, train_y)
                                     .add_x(transform=weak)
                                     .add_x(transform=weak)
                                     .add_y()
                                     .subset(unpred_idx)
                                     .DataLoader(params.batch_size,
                                                 shuffle=True,
                                                 drop_last=True,
                                                 num_workers=params.num_workers))
        self.unlabeled_dataloader_iter = None

        bundler = DataBundler()
        bundler.add(self.labeled_dataloader)  # .cycle(self.unlabeled_dataloader).zip_mode()
        self.logger.info('new training dataset', bundler,
                         len(self.unlabeled_dataloader))
        self.regist_databundler(train=bundler.to(self.device))
def on_train_epoch_begin(self, trainer: Trainer, func,
                         params: DivideMixParams, *args, **kwargs):
    if params.eidx < params.warm_up:
        pass
    else:
        # co-divide: the two models take turns evaluating the training set
        if params.eidx % 2 == 0:
            prob, self.all_loss[1] = self.eval_train(
                self.model2, self.all_loss[1])  # type: np.ndarray, list
            pred = (prob > params.p_threshold)
        else:
            prob, self.all_loss[0] = self.eval_train(
                self.model, self.all_loss[0])  # type: np.ndarray, list
            pred = (prob > params.p_threshold)

        pred_idx = pred.nonzero()[0]
        unpred_idx = (1 - pred).nonzero()[0]

        train_x, train_y, noisy_y = self.train_set_pack
        clean = (noisy_y == train_y)
        acc = (pred[clean]).mean()  # fraction of truly-clean samples selected as labeled
        self.logger.info('Number of labeled samples', pred.sum(),
                         'clean ratio = {}'.format(acc))

        mean, std = norm_val.get(params.dataset, [None, None])
        weak = Weak(mean, std)

        labeled_dataloader = (DatasetBuilder(train_x, train_y)
                              .add_labels(noisy_y, source_name='nys')
                              .add_labels(prob, source_name='nprob')
                              .add_x(transform=weak)
                              .add_x(transform=weak)
                              .add_y()
                              .add_y(source='nys')
                              .add_y(source='nprob')
                              .subset(pred_idx)
                              .DataLoader(params.batch_size,
                                          shuffle=True,
                                          drop_last=True,
                                          num_workers=params.num_workers))
        unlabeled_dataloader = (DatasetBuilder(train_x, train_y)
                                .add_labels(noisy_y, source_name='nys')
                                .add_x(transform=weak)
                                .add_x(transform=weak)
                                .add_y()
                                .add_y(source='nys')
                                .subset(unpred_idx)
                                .DataLoader(params.batch_size,
                                            shuffle=True,
                                            drop_last=True,
                                            num_workers=params.num_workers))

        bundler = DataBundler()
        bundler.add(labeled_dataloader).cycle(unlabeled_dataloader).zip_mode()
        self.logger.info('new training dataset', bundler)
        self.regist_databundler(train=bundler.to(self.device))
def _regist_databundler(self, key, val):
    from torch.utils.data import DataLoader
    assert isinstance(val, (DataBundler, DataLoader))
    if isinstance(val, DataLoader):
        val = DataBundler().add(val)
    if key in self._databundler_dict:
        del self._databundler_dict[key]
    self._databundler_dict[key] = val
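# A hedged sketch of how this helper behaves: a raw DataLoader is promoted to
# a single-loader DataBundler before being stored, so callers can pass either
# type. The public `regist_databundler(train=..., eval=..., test=...)` used in
# the methods below is assumed to forward each keyword argument through this
# method; `some_loader` is a hypothetical DataLoader.
#
#   trainer._regist_databundler('train', some_loader)                    # wrapped on the fly
#   trainer._regist_databundler('test', DataBundler().add(some_loader))  # stored as-is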
def datasets(self, params: DivideMixParams):
    from data.dataxy import datasets

    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    if params.noisy_type == 'asymmetric':
        from data.noisy import asymmetric_noisy
        noisy_y = asymmetric_noisy(train_y, params.noisy_ratio,
                                   n_classes=params.n_classes)
    elif params.noisy_type == 'symmetric':
        from data.noisy import symmetric_noisy
        noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                                  n_classes=params.n_classes)
    else:
        assert False, params.noisy_type

    self.train_set_pack = [train_x, np.array(train_y), noisy_y]
    self.logger.info('noisy acc = {}'.format((train_y == noisy_y).mean()))

    train_set = (DatasetBuilder(train_x, train_y)
                 .add_labels(noisy_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_x(transform=strong)
                 .add_y()
                 .add_y(source='noisy_y'))
    train_dataloader = train_set.DataLoader(batch_size=params.batch_size * 2,
                                            num_workers=params.num_workers,
                                            shuffle=True)

    from thexp import DataBundler
    self.eval_train_dataloader = (DataBundler()
                                  .add(DatasetBuilder(train_x, noisy_y)
                                       .toggle_id()
                                       .add_x(transform=toTensor)
                                       .add_y()
                                       .DataLoader(batch_size=params.batch_size,
                                                   num_workers=params.num_workers // 2,
                                                   shuffle=False))
                                  .to(self.device))

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers // 2,
                                   shuffle=False))

    self.regist_databundler(train=train_dataloader, test=test_dataloader)
    self.to(self.device)
def datasets(self, params: DivideMixParams):
    from data.dataxy_noisylabel import clothing1m_balance

    dataset_fn = clothing1m_balance
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True, params.cut_size)

    mean, std = norm_val.get('clothing1m', [None, None])
    toTensor = BigToTensor(mean, std)
    weak = BigWeak(mean, std)
    # strong = BigStrong(mean, std)

    self.train_set_pack = [train_x, np.array(train_y)]

    train_set = (DatasetBuilder(train_x, train_y)
                 .toggle_id()
                 .add_x(transform=weak)
                 # .add_x(transform=strong)
                 .add_y())
    train_dataloader = train_set.DataLoader(batch_size=params.batch_size * 2,
                                            num_workers=params.num_workers,
                                            shuffle=True)

    from thexp import DataBundler
    self.train_set = train_set
    self.train_size = len(train_set)
    self.eval_train_dataloader = (DataBundler()
                                  .add(DatasetBuilder(train_x, train_y)
                                       .toggle_id()
                                       .add_x(transform=toTensor)
                                       .add_y()
                                       .DataLoader(batch_size=params.batch_size,
                                                   num_workers=params.num_workers // 2,
                                                   shuffle=False))
                                  .to(self.device))

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers // 2,
                                   shuffle=False))

    self.regist_databundler(train=train_dataloader, test=test_dataloader)
    self.to(self.device)
def datasets(self, params: SemiSupervisedParams):
    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    indexs, un_indexs, val_indexs = splits.semi_split(
        train_y, n_percls=params.n_percls, val_size=5000, repeat_sup=False)

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)

    sup_set = (DatasetBuilder(train_x, train_y)
               .add_x(transform=weak)
               .add_y()
               .subset(indexs))

    params.K = params.default(2, True)
    unsup_set = DatasetBuilder(train_x, train_y)
    for _ in range(params.K):  # K weakly-augmented views per unsupervised sample
        unsup_set.add_x(transform=weak)
    unsup_set = unsup_set.add_y().subset(un_indexs)

    sup_dataloader = sup_set.DataLoader(batch_size=params.batch_size,
                                        num_workers=params.num_workers,
                                        shuffle=True)
    unsup_dataloader = unsup_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            shuffle=True)

    val_dataloader = (DatasetBuilder(train_x[val_indexs], train_y[val_indexs])
                      .add_x(transform=toTensor)
                      .add_y()
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers,
                                  shuffle=True))

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=True))

    self.regist_databundler(train=DataBundler().cycle(sup_dataloader).add(unsup_dataloader).zip_mode(),
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)
def datasets(self, params: ImblanceParams):
    super().datasets(params)
    data_loader = get_mnist_loader(params.batch_size,
                                   classes=params.classes,
                                   proportion=params.train_proportion,
                                   mode="train")
    test_loader = get_mnist_loader(params.batch_size,
                                   classes=params.classes,
                                   proportion=0.5,
                                   mode="test")

    val_dataset = ValSet(data_loader.dataset.data_val,
                         data_loader.dataset.labels_val)
    val_loader = DataLoader(val_dataset,
                            params.batch_size,
                            drop_last=False,
                            shuffle=True)

    train_loader = DataBundler().add(data_loader).cycle(val_loader).zip_mode()
    self.regist_databundler(train=train_loader, test=test_loader)
    self.to(self.device)
""" """ from torch.utils.data.dataloader import DataLoader from torchvision.datasets.fakedata import FakeData from torchvision.transforms import ToTensor from thexp import DataBundler bundler = DataBundler() sub = DataBundler() sub.add(DataLoader(FakeData(transform=ToTensor()), batch_size=10)) \ .add(DataLoader(FakeData(image_size=(3, 32, 32), transform=ToTensor()), batch_size=10)) \ .zip_mode() bundler.add(sub) \ .add(DataLoader(FakeData(image_size=(3, 28, 28), transform=ToTensor()), batch_size=10)) \ .zip_mode() for ((i1, l1), (i2, l2)), (i3, l3) in bundler: print(i1.shape, l1.shape, i2.shape, l2.shape, i3.shape, l3.shape) bundler = (DataBundler().cycle( DataLoader(FakeData(size=10, image_size=(3, 28, 28), transform=ToTensor()), batch_size=10)).add( DataLoader(FakeData(size=1000, image_size=(3, 28, 28), transform=ToTensor()), batch_size=10)).zip_mode())
def datasets(self, params: MnistImblanceParams):
    super().datasets(params)
    from data.dataxy import mnist

    test_x, test_y = mnist(False)
    train_x, train_y = mnist(True)
    train_y = np.array(train_y, dtype=np.float32)
    test_y = np.array(test_y, dtype=np.float32)

    # select the samples belonging to params.train_classes (e.g. [4, 9])
    # and relabel them as 0, 1, ...
    train_mask_lis = [np.where(train_y == i)[0] for i in params.train_classes]
    test_mask_lis = [np.where(test_y == i)[0] for i in params.train_classes]
    for new_cls, i in enumerate(params.train_classes):
        train_y[train_mask_lis[new_cls]] = new_cls
        test_y[test_mask_lis[new_cls]] = new_cls

    train_mask = np.concatenate(train_mask_lis)
    test_mask = np.concatenate(test_mask_lis)
    test_x, test_y = test_x[test_mask], test_y[test_mask]
    train_x, train_y = train_x[train_mask], train_y[train_mask]

    # split train/val dataset
    train_ids, val_ids = splits.train_val_split(train_y, val_size=params.val_size)
    train_x, val_x = train_x[train_ids], train_x[val_ids]
    train_y, val_y = train_y[train_ids], train_y[val_ids]

    # shrink the second class to create the class imbalance
    train_mask_lis = [np.where(train_y == i)[0]
                      for i in range(len(params.train_classes))]
    sec_cls_size = int((1 - params.train_proportion) * len(train_mask_lis[0]))
    train_mask_lis[1] = train_mask_lis[1][:sec_cls_size]
    train_mask = np.concatenate(train_mask_lis)
    train_x, train_y = train_x[train_mask], train_y[train_mask]

    toTensor = ToNormTensor((0.1307,), (0.3081,))

    train_dataloader = (DatasetBuilder(train_x, train_y)
                        .add_x(toTensor)
                        .add_y()
                        .DataLoader(batch_size=params.batch_size,
                                    num_workers=params.num_workers,
                                    shuffle=True))
    val_dataloader = (DatasetBuilder(val_x, val_y)
                      .add_x(transform=toTensor)
                      .add_y()
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers,
                                  shuffle=True))
    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=True))

    self.regist_databundler(train=DataBundler().add(train_dataloader).cycle(val_dataloader).zip_mode(),
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)
def datasets(self, params: NoisyParams):
    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)
    import numpy as np

    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    # train_ids, query_ids = splits.train_val_split(train_y, val_size=params.val_size)
    query_ids, train_ids, eval_ids = splits.semi_split(
        train_y,
        params.query_size // params.n_classes,
        val_size=params.val_size,
        repeat_sup=False)
    # train_ids = train_ids[:3000]
    self.train_size = len(train_ids)

    train_x, query_x, eval_x = train_x[train_ids], train_x[query_ids], train_x[eval_ids]
    train_y, query_y, eval_y = train_y[train_ids], train_y[query_ids], train_y[eval_ids]

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    if params.noisy_type == 'asymmetric':
        from data.noisy import asymmetric_noisy
        noisy_y = asymmetric_noisy(train_y, params.noisy_ratio,
                                   n_classes=params.n_classes)
    elif params.noisy_type == 'symmetric':
        from data.noisy import symmetric_noisy
        noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                                  n_classes=params.n_classes)
    else:
        assert False, params.noisy_type

    self.logger.info('noisy acc = {}'.format((train_y == noisy_y).mean()))
    self.rnd.shuffle()
    self.logger.info(len(train_y), len(train_x), len(noisy_y))

    train_set = (DatasetBuilder(train_x, train_y)
                 .add_labels(noisy_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=strong))
    params.K = params.default(0, True)
    for _ in range(params.K):
        train_set.add_x(transform=weak)
    train_set = train_set.add_y().add_y(source='noisy_y')

    if params.distributed:
        from torch.utils.data.distributed import DistributedSampler
        sampler = DistributedSampler(train_set, num_replicas=4)
        self.sampler_a = sampler
    else:
        sampler = None
    train_set = train_set.DataLoader(batch_size=params.batch_size,
                                     num_workers=params.num_workers,
                                     sampler=sampler,
                                     shuffle=not params.distributed)

    query_set = (DatasetBuilder(query_x, query_y)
                 .add_x(transform=strong)
                 .add_y())
    if params.distributed:
        from torch.utils.data.distributed import DistributedSampler
        sampler = DistributedSampler(query_set, num_replicas=4)
        self.sampler_b = sampler
    else:
        sampler = None
    query_set = query_set.DataLoader(batch_size=params.batch_size,
                                     num_workers=params.num_workers,
                                     sampler=sampler,
                                     shuffle=not params.distributed)

    val_dataloader = (DatasetBuilder(eval_x, eval_y)
                      .add_x(transform=toTensor)
                      .add_y()
                      .DataLoader(batch_size=params.batch_size,
                                  shuffle=False,  # keep the probe loader unshuffled so batches stay class balanced
                                  num_workers=params.num_workers))

    train_dataloader = DataBundler().add(train_set).cycle(query_set).zip_mode()

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=True))

    self.regist_databundler(train=train_dataloader,
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)
def change_dataset(self):
    """
    Split the training data into a supervised and an unsupervised part by
    thresholding the current filter_mem.
    """
    from data.constant import norm_val
    from data.transforms import ToNormTensor, Weak, Strong

    params = self.params  # this method takes no params argument, so read it from the trainer
    train_x, train_y, noisy_y = self.train_set
    filter_prob = self.filter_mem.cpu().numpy()
    clean_mask = filter_prob > 0.5
    self.logger.info('sup size', clean_mask.sum())
    if clean_mask.all() or not clean_mask.any():
        # nothing to split: every sample fell on the same side of the threshold
        return

    clean_ids = np.where(clean_mask)[0]
    noisy_ids = np.where(np.logical_not(clean_mask))[0]

    mean, std = norm_val.get(params.dataset, [None, None])
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    supervised_dataloader = (DatasetBuilder(train_x, train_y)
                             .add_labels(noisy_y, source_name='ny')
                             .toggle_id()
                             .add_x(strong)
                             .add_y()
                             .add_y(source='ny')
                             .subset(clean_ids)
                             .DataLoader(params.batch_size // 2,
                                         shuffle=True,
                                         num_workers=0,
                                         drop_last=True))
    unsupervised_dataloader = (DatasetBuilder(train_x, train_y)
                               .add_labels(noisy_y, source_name='ny')
                               .add_labels(filter_prob, source_name='nprob')
                               .toggle_id()
                               .add_x(strong)
                               .add_x(strong)
                               .add_y()
                               .add_y(source='ny')
                               .add_y(source='nprob')
                               .subset(noisy_ids)
                               .DataLoader(params.batch_size // 2,
                                           shuffle=True,
                                           num_workers=0,
                                           drop_last=True))

    if len(unsupervised_dataloader) == 0 or len(supervised_dataloader) == 0:
        self.ssl_dataloader = None
        return

    # cycle the shorter loader so one pass covers the longer one
    if len(supervised_dataloader) > len(unsupervised_dataloader):
        train_dataloader = (DataBundler()
                            .add(supervised_dataloader)
                            .cycle(unsupervised_dataloader))
    else:
        train_dataloader = (DataBundler()
                            .cycle(supervised_dataloader)
                            .add(unsupervised_dataloader))

    self.ssl_dataloader = train_dataloader.zip_mode().to(self.device)
    self.logger.info('ssl loader size', train_dataloader)
    self.ssl_loaderiter = iter(self.ssl_dataloader)