def datasets(self, params: SemiSupervisedParams):
    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    indexs, un_indexs, val_indexs = splits.semi_split(
        train_y,
        n_percls=params.n_percls,
        val_size=params.val_size,
        repeat_sup=False)
    self.logger.info('sup/unsup/val : {}'.format(
        (len(indexs), len(un_indexs), len(val_indexs))))

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    # Supervised subset: one weakly augmented view plus the label.
    sup_set = (DatasetBuilder(train_x, train_y)
               .add_x(transform=weak)
               .add_y()
               .subset(indexs))
    # Pad the supervised subset so it can fill at least one batch.
    if len(sup_set) < params.batch_size:
        sup_set.virtual_sample(params.batch_size)

    # Unsupervised subset: sample id, a weak view, and a strong view.
    unsup_set = (DatasetBuilder(train_x, train_y)
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_x(transform=strong)
                 .add_y()
                 .subset(un_indexs))
    self.cl_set = unsup_set

    sup_dataloader = sup_set.DataLoader(batch_size=params.batch_size,
                                        num_workers=params.num_workers,
                                        shuffle=True)
    self.sup_dataloader = sup_dataloader

    unsup_dataloader = unsup_set.DataLoader(
        batch_size=params.batch_size * params.uratio,
        num_workers=1,
        shuffle=True)
    self.unsup_dataloader = DataBundler().add(unsup_dataloader).to(self.device)

    val_dataloader = (DatasetBuilder(train_x[val_indexs], train_y[val_indexs])
                      .add_x(transform=toTensor)
                      .add_y()
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers,
                                  shuffle=True))

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=True))

    # The supervised loader is shorter, so cycle it against the unsupervised one.
    self.regist_databundler(
        train=DataBundler().cycle(sup_dataloader).add(unsup_dataloader).zip_mode(),
        eval=val_dataloader,
        test=test_dataloader)
    self.to(self.device)
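# A minimal sketch of how a train step might consume the zipped bundler built
# above. The batch layout follows the builders: the cycled sup loader yields
# (xs, ys) and the unsup loader yields (ids, weak_xs, strong_xs, ys). The
# model/optimizer arguments, the confidence threshold, and the consistency
# loss are illustrative assumptions, not part of this codebase.
import torch
import torch.nn.functional as F

def train_step_sketch(model, optimizer, batch, thresh=0.95):
    (sup_xs, sup_ys), (ids, weak_xs, strong_xs, _) = batch
    sup_loss = F.cross_entropy(model(sup_xs), sup_ys)

    with torch.no_grad():
        probs = torch.softmax(model(weak_xs), dim=-1)
        max_probs, pseudo_ys = probs.max(dim=-1)
        mask = (max_probs >= thresh).float()  # keep only confident pseudo labels

    unsup_loss = (F.cross_entropy(model(strong_xs), pseudo_ys,
                                  reduction='none') * mask).mean()
    loss = sup_loss + unsup_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()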
def datasets(self, params: DivideMixParams):
    import numpy as np
    from data.dataxy import datasets
    from thexp import DataBundler

    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    # Corrupt the training labels with the configured noise type.
    if params.noisy_type == 'asymmetric':
        from data.noisy import asymmetric_noisy
        noisy_y = asymmetric_noisy(train_y, params.noisy_ratio,
                                   n_classes=params.n_classes)
    elif params.noisy_type == 'symmetric':
        from data.noisy import symmetric_noisy
        noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                                  n_classes=params.n_classes)
    else:
        raise ValueError('unknown noisy type: {}'.format(params.noisy_type))

    self.train_set_pack = [train_x, np.array(train_y), noisy_y]
    self.logger.info('noisy acc = {}'.format((train_y == noisy_y).mean()))

    # Each sample yields: id, weak view, strong view, clean label, noisy label.
    train_set = (DatasetBuilder(train_x, train_y)
                 .add_labels(noisy_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_x(transform=strong)
                 .add_y()
                 .add_y(source='noisy_y'))
    train_dataloader = train_set.DataLoader(batch_size=params.batch_size * 2,
                                            num_workers=params.num_workers,
                                            shuffle=True)

    # Un-shuffled loader over the noisy labels, used to score every training
    # sample in a fixed order.
    self.eval_train_dataloader = (
        DataBundler()
        .add(DatasetBuilder(train_x, noisy_y)
             .toggle_id()
             .add_x(transform=toTensor)
             .add_y()
             .DataLoader(batch_size=params.batch_size,
                         num_workers=params.num_workers // 2,
                         shuffle=False))
        .to(self.device))

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers // 2,
                                   shuffle=False))

    self.regist_databundler(train=train_dataloader, test=test_dataloader)
    self.to(self.device)
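# `data.noisy.symmetric_noisy` is not shown in this section; the sketch below
# is one common definition of symmetric label noise (an assumption, not
# necessarily the project's exact implementation): a `noisy_ratio` fraction of
# samples is reassigned to a uniformly random class, which may occasionally be
# the original label.
import numpy as np

def symmetric_noisy_sketch(train_y, noisy_ratio, n_classes, seed=1):
    rng = np.random.RandomState(seed)
    noisy_y = np.asarray(train_y).copy()
    n_noisy = int(len(noisy_y) * noisy_ratio)
    idx = rng.choice(len(noisy_y), n_noisy, replace=False)
    noisy_y[idx] = rng.randint(0, n_classes, size=n_noisy)  # uniform flips
    return noisy_y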
def datasets(self, params: GlobalParams):
    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    train_idx, val_idx = splits.train_val_split(train_y,
                                                val_size=params.val_size)

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers))

    # Build the training subset first; the distributed sampler must wrap the
    # dataset, not the DataLoader.
    train_set = (DatasetBuilder(train_x, train_y)
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_x(transform=strong)
                 .add_y()
                 .subset(train_idx))

    if params.distributed:
        from torch.utils.data.distributed import DistributedSampler
        sampler = DistributedSampler(train_set)
    else:
        sampler = None
    self.train_size = len(train_set)

    # A custom sampler and shuffle=True are mutually exclusive, so shuffle
    # only in the non-distributed case.
    train_dataloader = train_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            sampler=sampler,
                                            shuffle=not params.distributed)

    val_dataloader = (DatasetBuilder(train_x, train_y)
                      .add_x(transform=toTensor)
                      .add_y()
                      .subset(val_idx)
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers))

    self.regist_databundler(train=train_dataloader,
                            eval=val_dataloader,
                            test=test_dataloader)

    # Debug output for verifying per-rank dataloader construction.
    print('dataloader in rank {}'.format(self.params.local_rank))
    print(self.params.local_rank, self.train_dataloader)
    print(self.params.local_rank, self._databundler_dict)
    print(self.params.local_rank, train_dataloader)
    self.to(self.device)
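# When a DistributedSampler is used, as above, its epoch counter must be
# advanced between epochs, or every epoch re-uses the same per-rank shard
# ordering. A minimal sketch of the hook a training loop would need (the hook
# name is an assumption; `set_epoch` is the real torch API):
def on_epoch_begin_sketch(sampler, epoch):
    if sampler is not None:
        sampler.set_epoch(epoch)  # reseeds the per-rank shuffle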
def datasets(self, params: NoisyParams):
    import numpy as np

    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)

    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    # Split train data into a class-balanced query set, the train set, and an
    # eval set.
    query_ids, train_ids, eval_ids = splits.semi_split(
        train_y,
        params.query_size // params.n_classes,
        val_size=params.val_size,
        repeat_sup=False)
    self.train_size = len(train_ids)

    train_x, query_x, eval_x = (train_x[train_ids], train_x[query_ids],
                                train_x[eval_ids])
    train_y, query_y, eval_y = (train_y[train_ids], train_y[query_ids],
                                train_y[eval_ids])

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    if params.noisy_type == 'asymmetric':
        from data.noisy import asymmetric_noisy
        noisy_y = asymmetric_noisy(train_y, params.noisy_ratio,
                                   n_classes=params.n_classes)
    elif params.noisy_type == 'symmetric':
        from data.noisy import symmetric_noisy
        noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                                  n_classes=params.n_classes)
    else:
        raise ValueError('unknown noisy type: {}'.format(params.noisy_type))

    self.logger.info('noisy acc = {}'.format((train_y == noisy_y).mean()))
    self.rnd.shuffle()
    self.logger.info(len(train_y), len(train_x), len(noisy_y))

    train_set = (DatasetBuilder(train_x, train_y)
                 .add_labels(noisy_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=strong))

    # Optionally append K extra weakly augmented views per sample.
    params.K = params.default(0, True)
    for _ in range(params.K):
        train_set.add_x(transform=weak)
    train_set = train_set.add_y().add_y(source='noisy_y')

    if params.distributed:
        from torch.utils.data.distributed import DistributedSampler
        sampler = DistributedSampler(train_set, num_replicas=4)
        self.sampler_a = sampler
    else:
        sampler = None
    train_dataloader = train_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            sampler=sampler,
                                            shuffle=not params.distributed)

    query_set = (DatasetBuilder(query_x, query_y)
                 .add_x(transform=strong)
                 .add_y())
    if params.distributed:
        from torch.utils.data.distributed import DistributedSampler
        # The second sampler must wrap the query set, not the train loader.
        sampler = DistributedSampler(query_set, num_replicas=4)
        self.sampler_b = sampler
    else:
        sampler = None
    query_dataloader = query_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            sampler=sampler,
                                            shuffle=not params.distributed)

    # No shuffle for the probe loader, so each batch stays class balanced.
    val_dataloader = (DatasetBuilder(eval_x, eval_y)
                      .add_x(transform=toTensor)
                      .add_y()
                      .DataLoader(batch_size=params.batch_size,
                                  shuffle=False,
                                  num_workers=params.num_workers))

    train_dataloader = DataBundler().add(train_dataloader).cycle(
        query_dataloader).zip_mode()

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=True))

    self.regist_databundler(train=train_dataloader,
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)
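# `data.noisy.asymmetric_noisy` is also external to this section. As a hedged
# reference, the sketch below implements the common CIFAR-10 convention for
# asymmetric noise (truck->automobile, bird->airplane, deer->horse, cat<->dog);
# the project's mapping may differ.
import numpy as np

def asymmetric_noisy_sketch(train_y, noisy_ratio, seed=1):
    flip_to = {9: 1, 2: 0, 4: 7, 3: 5, 5: 3}  # class-dependent flip targets
    rng = np.random.RandomState(seed)
    base = np.asarray(train_y)
    noisy_y = base.copy()
    for src, dst in flip_to.items():
        src_idx = np.where(base == src)[0]  # index on the clean labels
        picked = rng.choice(src_idx, int(len(src_idx) * noisy_ratio),
                            replace=False)
        noisy_y[picked] = dst
    return noisy_y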
def datasets(self, params: NoisyParams):
    self.rnd.mark('fix_noisy')
    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)

    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    train_ids, val_ids = splits.train_val_split(train_y, val_size=5000)
    train_x, val_x = train_x[train_ids], train_x[val_ids]
    train_y, val_y = train_y[train_ids], train_y[val_ids]

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    if params.noisy_type == 'asymmetric':
        from data.noisy import asymmetric_noisy
        noisy_y = asymmetric_noisy(train_y, params.noisy_ratio,
                                   n_classes=params.n_classes)
    elif params.noisy_type == 'symmetric':
        from data.noisy import symmetric_noisy
        noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                                  n_classes=params.n_classes)
    else:
        raise ValueError('unknown noisy type: {}'.format(params.noisy_type))

    clean_mask = train_y == noisy_y
    self.logger.info('noisy acc = {}'.format(clean_mask.mean()))
    self.rnd.shuffle()

    train_set = (DatasetBuilder(train_x, train_y)
                 .add_labels(noisy_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_x(transform=strong)
                 .add_y()
                 .add_y(source='noisy_y'))

    from thextra.noisy_sampler import NoisySampler
    sampler = None
    if params.order_sampler:
        sampler = NoisySampler(train_set, clean_mask)

    self.train_size = len(train_set)
    # shuffle and a custom sampler are mutually exclusive in DataLoader, so
    # shuffle only when no sampler is supplied.
    train_dataloader = train_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            drop_last=True,
                                            sampler=sampler,
                                            shuffle=sampler is None)

    val_dataloader = (DatasetBuilder(val_x, val_y)
                      .add_x(transform=toTensor)
                      .add_y()
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers,
                                  shuffle=True))

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=True))

    self.regist_databundler(train=train_dataloader,
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)
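# `thextra.noisy_sampler.NoisySampler` is project-specific and not shown here.
# One plausible reading of its (dataset, clean_mask) signature -- purely an
# assumption for illustration -- is a Sampler that orders clean-marked samples
# before noisy ones, shuffling within each group:
import numpy as np
from torch.utils.data import Sampler

class NoisySamplerSketch(Sampler):
    def __init__(self, data_source, clean_mask):
        mask = np.asarray(clean_mask)
        self.clean_ids = np.where(mask)[0]
        self.noisy_ids = np.where(~mask)[0]

    def __iter__(self):
        # Clean-first ordering, randomized within each group.
        order = np.concatenate([np.random.permutation(self.clean_ids),
                                np.random.permutation(self.noisy_ids)])
        return iter(order.tolist())

    def __len__(self):
        return len(self.clean_ids) + len(self.noisy_ids)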
def change_dataset(self):
    """
    Split the training samples into a supervised (clean) part and an
    unsupervised (noisy) part by thresholding the current filter_mem.
    """
    import numpy as np
    from data.constant import norm_val
    from data.transforms import ToNormTensor, Weak, Strong

    params = self.params
    train_x, train_y, noisy_y = self.train_set
    filter_prob = self.filter_mem.cpu().numpy()
    clean_mask = filter_prob > 0.5
    self.logger.info('sup size', clean_mask.sum())

    # Nothing to split when every sample (or no sample) is judged clean.
    if clean_mask.all() or not clean_mask.any():
        return

    clean_ids = np.where(clean_mask)[0]
    noisy_ids = np.where(np.logical_not(clean_mask))[0]

    mean, std = norm_val.get(params.dataset, [None, None])
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    supervised_dataloader = (
        DatasetBuilder(train_x, train_y)
        .add_labels(noisy_y, source_name='ny')
        .toggle_id()
        .add_x(strong)
        .add_y()
        .add_y(source='ny')
        .subset(clean_ids)
        .DataLoader(params.batch_size // 2,
                    shuffle=True, num_workers=0, drop_last=True))

    unsupervised_dataloader = (
        DatasetBuilder(train_x, train_y)
        .add_labels(noisy_y, source_name='ny')
        .add_labels(filter_prob, source_name='nprob')
        .toggle_id()
        .add_x(strong)
        .add_x(strong)
        .add_y()
        .add_y(source='ny')
        .add_y(source='nprob')
        .subset(noisy_ids)
        .DataLoader(params.batch_size // 2,
                    shuffle=True, num_workers=0, drop_last=True))

    # drop_last can leave an empty loader; bail out before bundling.
    if len(unsupervised_dataloader) == 0 or len(supervised_dataloader) == 0:
        self.ssl_dataloader = None
        return

    # Cycle the shorter loader so one pass covers the longer one.
    if len(supervised_dataloader) > len(unsupervised_dataloader):
        train_dataloader = (DataBundler()
                            .add(supervised_dataloader)
                            .cycle(unsupervised_dataloader))
    else:
        train_dataloader = (DataBundler()
                            .cycle(supervised_dataloader)
                            .add(unsupervised_dataloader))

    self.ssl_dataloader = train_dataloader.zip_mode().to(self.device)
    self.logger.info('ssl loader size', train_dataloader)
    self.ssl_loaderiter = iter(self.ssl_dataloader)
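# A small sketch of how the cached iterator above is typically consumed from a
# train step: pull one zipped (supervised, unsupervised) batch and rebuild the
# iterator when a pass over the bundler is exhausted. The method name is an
# assumption for illustration.
def next_ssl_batch_sketch(self):
    if self.ssl_dataloader is None:
        return None
    try:
        return next(self.ssl_loaderiter)
    except StopIteration:
        self.ssl_loaderiter = iter(self.ssl_dataloader)
        return next(self.ssl_loaderiter)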