def datasets(self, params: FitEvalParams):
    """Build two half-sized, heavily-noised training loaders.

    Takes the first ``params.train_size`` training samples, splits them into
    two halves, applies 0.9-ratio symmetric label noise to each half
    independently, and registers one DataLoader per half on
    ``self.noisy_loader1`` / ``self.noisy_loader2``.
    """
    from thexp import DatasetBuilder
    from data.constant import norm_val
    from data.transforms import Weak  # ToNormTensor/Strong were imported but unused
    from data.noisy import symmetric_noisy

    mean, std = norm_val.get(params.dataset, [None, None])
    weak = Weak(mean, std)

    dataset_fn = datasets.datasets[params.dataset]
    train_x, train_y = dataset_fn(True)
    train_y = np.array(train_y)

    train_x, train_y = train_x[:params.train_size], train_y[:params.train_size]
    part_size = params.train_size // 2
    noisy_x1, noisy_true_y1 = train_x[:part_size], train_y[:part_size]
    noisy_x2, noisy_true_y2 = train_x[part_size:], train_y[part_size:]

    noisy_ratio = 0.9  # fixed high-noise setting for this evaluation
    noisy_y1 = symmetric_noisy(noisy_true_y1, noisy_ratio, n_classes=params.n_classes)
    noisy_y2 = symmetric_noisy(noisy_true_y2, noisy_ratio, n_classes=params.n_classes)

    self.logger.info('noisy dataset ratio: ',
                     (noisy_true_y1 == noisy_y1).mean(),
                     (noisy_true_y2 == noisy_y2).mean())

    noisy_set1 = (
        DatasetBuilder(noisy_x1, noisy_true_y1)
        # BUG FIX: the first half must carry its own noisy labels; the
        # original attached noisy_y2 here, silently mislabelling the set
        # (no crash, since both halves have the same length).
        .add_labels(noisy_y1, source_name='noisy_y')
        .toggle_id()
        .add_x(transform=weak)
        .add_y()
        .add_y(source='noisy_y')
    )
    noisy_set2 = (
        DatasetBuilder(noisy_x2, noisy_true_y2)
        .add_labels(noisy_y2, source_name='noisy_y')
        .toggle_id()
        .add_x(transform=weak)
        .add_y()
        .add_y(source='noisy_y')
    )

    self.noisy_loader1 = noisy_set1.DataLoader(batch_size=params.batch_size,
                                               num_workers=params.num_workers,
                                               drop_last=True,
                                               shuffle=True)
    self.noisy_loader2 = noisy_set2.DataLoader(batch_size=params.batch_size,
                                               num_workers=params.num_workers,
                                               drop_last=True,
                                               shuffle=True)

    # eval_state is 0 only in 'clean' mode; toggle_dataset(True) then selects
    # the clean-side dataset.
    self.eval_state = 1 - int(params.eval_mode == 'clean')
    self.toggle_dataset((self.eval_state % 2) == 0)
    self.to(self.device)
def datasets(self, params: DivideMixParams):
    """Create the DivideMix loaders: noisy train, eval-train, and test.

    Side effects: stores ``self.train_set_pack``, ``self.eval_train_dataloader``,
    and registers train/test bundlers before moving to ``self.device``.
    """
    from data.dataxy import datasets
    from thexp import DataBundler

    fetch = datasets[params.dataset]
    test_x, test_y = fetch(False)
    train_x, train_y = fetch(True)

    mean, std = norm_val.get(params.dataset, [None, None])
    to_tensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    # Corrupt the training labels according to the configured noise model.
    if params.noisy_type == 'asymmetric':
        from data.noisy import asymmetric_noisy
        noisy_y = asymmetric_noisy(train_y, params.noisy_ratio,
                                   n_classes=params.n_classes)
    elif params.noisy_type == 'symmetric':
        from data.noisy import symmetric_noisy
        noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                                  n_classes=params.n_classes)
    else:
        assert False, params.noisy_type

    self.train_set_pack = [train_x, np.array(train_y), noisy_y]
    self.logger.info('noisy acc = {}'.format((train_y == noisy_y).mean()))

    # Train samples carry an id, a weak and a strong view, plus both labels.
    train_builder = (DatasetBuilder(train_x, train_y)
                     .add_labels(noisy_y, 'noisy_y')
                     .toggle_id()
                     .add_x(transform=weak)
                     .add_x(transform=strong)
                     .add_y()
                     .add_y(source='noisy_y'))
    train_dataloader = train_builder.DataLoader(batch_size=params.batch_size * 2,
                                                num_workers=params.num_workers,
                                                shuffle=True)

    # Unshuffled pass over the (noisy-labelled) train set, used for per-sample
    # loss evaluation.
    eval_train_builder = (DatasetBuilder(train_x, noisy_y)
                          .toggle_id()
                          .add_x(transform=to_tensor)
                          .add_y())
    self.eval_train_dataloader = (
        DataBundler()
        .add(eval_train_builder.DataLoader(batch_size=params.batch_size,
                                           num_workers=params.num_workers // 2,
                                           shuffle=False))
        .to(self.device))

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=to_tensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers // 2,
                                   shuffle=False))

    self.regist_databundler(train=train_dataloader, test=test_dataloader)
    self.to(self.device)
def datasets(self, params: NoisyParams):
    """Build two overlapping noisy training loaders.

    Both sets share the clean-labelled samples; the first set adds one half
    of the corrupted samples ("raw"), the second set adds the other half
    ("change"), each carrying both true and noisy labels.  The first loader
    is registered as the training bundler.
    """
    self.rnd.mark('kk')
    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)

    from data.constant import norm_val
    from data.transforms import Weak  # ToNormTensor/Strong were imported but unused
    from thexp import DatasetBuilder
    from data.noisy import symmetric_noisy

    mean, std = norm_val.get(params.dataset, [None, None])
    weak = Weak(mean, std)

    dataset_fn = datasets.datasets[params.dataset]
    train_x, train_y = dataset_fn(True)
    train_y = np.array(train_y)

    noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                              n_classes=params.n_classes)

    clean_mask = (train_y == noisy_y)
    # Indices of corrupted samples, split into two disjoint halves.
    noisy_mask = np.where(np.logical_not(clean_mask))[0]
    nmask_a = noisy_mask[:len(noisy_mask) // 2]
    nmask_b = noisy_mask[len(noisy_mask) // 2:]

    clean_x, clean_y = train_x[clean_mask], noisy_y[clean_mask]
    clean_true_y = train_y[clean_mask]
    raw_x, raw_true_y = train_x[nmask_a], train_y[nmask_a]
    raw_y = noisy_y[nmask_a]
    change_x, change_true_y, change_y = (train_x[nmask_b], train_y[nmask_b],
                                         noisy_y[nmask_b])

    # NOTE(review): `clean_x + raw_x` assumes train_x concatenates under `+`
    # (a list-like container); with a numpy array this would element-wise add
    # instead — confirm the dataset's x container type.
    first_x, first_y, first_true_y = (
        clean_x + raw_x,
        np.concatenate([clean_y, raw_y]),
        np.concatenate([clean_true_y, raw_true_y]),
    )
    second_x, second_y, second_true_y = (
        clean_x + change_x,
        np.concatenate([clean_y, change_y]),
        np.concatenate([clean_true_y, change_true_y]),
    )

    first_set = (DatasetBuilder(first_x, first_true_y)
                 .add_labels(first_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_y()
                 .add_y(source='noisy_y'))
    second_set = (DatasetBuilder(second_x, second_true_y)
                  .add_labels(second_y, 'noisy_y')
                  .toggle_id()
                  .add_x(transform=weak)
                  .add_y()
                  .add_y(source='noisy_y'))

    self.first_dataloader = first_set.DataLoader(batch_size=params.batch_size,
                                                 num_workers=params.num_workers,
                                                 drop_last=True,
                                                 shuffle=True)
    # BUG FIX: the original constructed this loader twice with identical
    # arguments; one construction is sufficient.
    self.second_dataloader = second_set.DataLoader(batch_size=params.batch_size,
                                                   num_workers=params.num_workers,
                                                   drop_last=True,
                                                   shuffle=True)

    self.second = False
    self.regist_databundler(train=self.first_dataloader)
    self.cur_set = 0
    self.to(self.device)
def datasets(self, params: NoisyParams):
    """Build semi-supervised noisy loaders: zipped train+query, eval, test.

    Splits the training data into a small clean query set, a noisy train set,
    and an eval set via ``splits.semi_split``, applies the configured label
    noise to the train split, and registers the three bundlers.
    """
    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)
    import numpy as np

    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    query_ids, train_ids, eval_ids = splits.semi_split(
        train_y,
        params.query_size // params.n_classes,
        val_size=params.val_size,
        repeat_sup=False)
    self.train_size = len(train_ids)

    train_x, query_x, eval_x = (train_x[train_ids], train_x[query_ids],
                                train_x[eval_ids])
    train_y, query_y, eval_y = (train_y[train_ids], train_y[query_ids],
                                train_y[eval_ids])

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    # Corrupt only the train-split labels; query/eval labels stay clean.
    if params.noisy_type == 'asymmetric':
        from data.noisy import asymmetric_noisy
        noisy_y = asymmetric_noisy(train_y, params.noisy_ratio,
                                   n_classes=params.n_classes)
    elif params.noisy_type == 'symmetric':
        from data.noisy import symmetric_noisy
        noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                                  n_classes=params.n_classes)
    else:
        assert False, params.noisy_type

    self.logger.info('noisy acc = {}'.format((train_y == noisy_y).mean()))
    self.rnd.shuffle()
    self.logger.info(len(train_y), len(train_x), len(noisy_y))

    train_set = (DatasetBuilder(train_x, train_y)
                 .add_labels(noisy_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=strong))
    params.K = params.default(0, True)
    for _ in range(params.K):  # K extra weakly-augmented views per sample
        train_set.add_x(transform=weak)
    train_set = train_set.add_y().add_y(source='noisy_y')

    if params.distributed:
        from torch.utils.data.distributed import DistributedSampler
        sampler = DistributedSampler(train_set, num_replicas=4)
        self.sampler_a = sampler
    else:
        sampler = None
    train_set = train_set.DataLoader(batch_size=params.batch_size,
                                     num_workers=params.num_workers,
                                     sampler=sampler,
                                     shuffle=not params.distributed)

    query_set = (DatasetBuilder(query_x, query_y)
                 .add_x(transform=strong)
                 .add_y())
    if params.distributed:
        from torch.utils.data.distributed import DistributedSampler
        # BUG FIX: the original built this sampler over `train_set`, which is
        # already a DataLoader at this point; the sampler must shard the
        # query dataset it is passed to below.
        sampler = DistributedSampler(query_set, num_replicas=4)
        self.sampler_b = sampler
    else:
        sampler = None
    query_set = query_set.DataLoader(batch_size=params.batch_size,
                                     num_workers=params.num_workers,
                                     sampler=sampler,
                                     shuffle=not params.distributed)

    val_dataloader = (DatasetBuilder(eval_x, eval_y)
                      .add_x(transform=toTensor)
                      .add_y()
                      .DataLoader(
                          batch_size=params.batch_size,
                          shuffle=False,  # no shuffle for probe, so a batch is class balanced.(?)
                          num_workers=params.num_workers))

    # Query batches cycle endlessly alongside the (larger) train set.
    train_dataloader = DataBundler().add(train_set).cycle(query_set).zip_mode()

    # NOTE(review): the test loader shuffles; harmless for aggregate metrics,
    # but confirm it is intentional.
    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=True))

    self.regist_databundler(train=train_dataloader,
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)
def datasets(self, params: NoisyParams):
    """Build train/val/test loaders with label noise and an optional ordered sampler.

    A 5000-sample validation split is carved from the training set; the
    remaining train labels are corrupted per ``params.noisy_type``.  When
    ``params.order_sampler`` is set, a ``NoisySampler`` ordered by the
    clean/noisy mask drives the train loader.
    """
    self.rnd.mark('fix_noisy')
    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)

    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    train_ids, val_ids = splits.train_val_split(train_y, val_size=5000)
    train_x, val_x = train_x[train_ids], train_x[val_ids]
    train_y, val_y = train_y[train_ids], train_y[val_ids]

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    if params.noisy_type == 'asymmetric':
        from data.noisy import asymmetric_noisy
        noisy_y = asymmetric_noisy(train_y, params.noisy_ratio,
                                   n_classes=params.n_classes)
    elif params.noisy_type == 'symmetric':
        from data.noisy import symmetric_noisy
        noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                                  n_classes=params.n_classes)
    else:
        assert False, params.noisy_type

    clean_mask = train_y == noisy_y
    self.logger.info('noisy acc = {}'.format((train_y == noisy_y).mean()))
    self.rnd.shuffle()

    train_set = (DatasetBuilder(train_x, train_y)
                 .add_labels(noisy_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_x(transform=strong)
                 .add_y()
                 .add_y(source='noisy_y'))

    from thextra.noisy_sampler import NoisySampler
    sampler = None
    if params.order_sampler:
        sampler = NoisySampler(train_set, clean_mask)
    self.train_size = len(train_set)

    # BUG FIX: torch's DataLoader raises ValueError when both a custom
    # sampler and shuffle=True are supplied; shuffle only without a sampler.
    train_dataloader = train_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            drop_last=True,
                                            sampler=sampler,
                                            shuffle=sampler is None)

    val_dataloader = (DatasetBuilder(val_x, val_y)
                      .add_x(transform=toTensor)
                      .add_y()
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers,
                                  shuffle=True))
    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=True))

    self.regist_databundler(train=train_dataloader,
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)
def datasets(self, params: EvalParams):
    """Prepare a first loader (clean subset or full set, per eval_mode) and a
    second loader holding only the corrupted samples.

    Registers the first loader for training; in 'direct' mode immediately
    switches datasets via ``self.change_dataset()``.
    """
    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)

    from data.constant import norm_val
    mean, std = norm_val.get(params.dataset, [None, None])
    from data.transforms import ToNormTensor
    toTensor = ToNormTensor(mean, std)
    from data.transforms import Weak
    weak = Weak(mean, std)
    from data.transforms import Strong

    dataset_fn = datasets.datasets[params.dataset]
    train_x, train_y = dataset_fn(True)
    train_y = np.array(train_y)

    from thexp import DatasetBuilder
    from data.noisy import symmetric_noisy
    noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                              n_classes=params.n_classes)

    clean_mask = (train_y == noisy_y)
    noisy_mask = np.logical_not(clean_mask)

    if params.eval_mode in ['full', 'same_epoch', 'same_acc']:
        # First set: only samples whose noisy label agrees with the truth.
        first_x, first_y = train_x[clean_mask], noisy_y[clean_mask]
        first_true_y = train_y[clean_mask]
    elif params.eval_mode in ['mix', 'raw', 'direct']:
        # First set: the entire training split, noise included.
        first_x, first_y = train_x, noisy_y
        first_true_y = train_y
    else:
        assert False

    second_x, second_true_y = train_x[noisy_mask], train_y[noisy_mask]
    second_y = noisy_y[noisy_mask]

    self.logger.info('noisy acc = {}'.format(
        (first_true_y == first_y).mean()))
    self.logger.info('noisy acc = {}'.format(
        (second_true_y == second_y).mean()))
    self.rnd.shuffle()

    def _build(xs, true_ys, noisy_ys):
        # Shared recipe: sample id + weak view + true label + noisy label.
        return (DatasetBuilder(xs, true_ys)
                .add_labels(noisy_ys, 'noisy_y')
                .toggle_id()
                .add_x(transform=weak)
                .add_y()
                .add_y(source='noisy_y'))

    first_set = _build(first_x, first_true_y, first_y)
    noisy_set = _build(second_x, second_true_y, second_y)

    first_dataloader = first_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            drop_last=True,
                                            shuffle=True)
    self.second_dataloader = noisy_set.DataLoader(
        batch_size=params.batch_size,
        num_workers=params.num_workers,
        drop_last=True,
        shuffle=True)

    self.second = False
    self.regist_databundler(train=first_dataloader)
    self.to(self.device)

    if params.eval_mode == 'direct':
        self.change_dataset()