def get_affinity(aug, aff_bases, config, augment):
    C.get()
    C.get().conf = config
    # setup - provided augmentation rules
    C.get()['aug'] = aug
    load_paths = augment['load_paths']
    cv_num = augment['cv_num']

    aug_loaders = []
    for cv_id in range(cv_num):
        _, tl, validloader, tl2 = get_dataloaders(C.get()['dataset'], C.get()['batch'],
                                                  augment['dataroot'], augment['cv_ratio_test'],
                                                  split_idx=cv_id)
        aug_loaders.append(validloader)
        del tl, tl2

    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    aug_accs = []
    for cv_id, loader in enumerate(aug_loaders):
        # eval
        model = get_model(C.get()['model'], num_class(C.get()['dataset']))
        ckpt = torch.load(load_paths[cv_id])
        if 'model' in ckpt:
            model.load_state_dict(ckpt['model'])
        else:
            model.load_state_dict(ckpt)
        model.eval()

        metrics = Accumulator()
        for data, label in loader:
            data = data.cuda()
            label = label.cuda()

            pred = model(data)
            loss = loss_fn(pred, label)  # (N,)

            _, pred = pred.topk(1, 1, True, True)
            pred = pred.t()
            correct = pred.eq(label.view(1, -1).expand_as(pred)).detach().cpu().numpy()  # (1, N)

            metrics.add_dict({
                'minus_loss': -1 * np.sum(loss.detach().cpu().numpy()),
                'correct': np.sum(correct),
                'cnt': len(data)
            })
            del loss, correct, pred, data, label
        aug_accs.append(metrics['correct'] / metrics['cnt'])
    del model

    affs = []
    for aug_valid, clean_valid in zip(aug_accs, aff_bases):
        affs.append(aug_valid - clean_valid)
    return affs
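# NOTE: the eval/search helpers in this file rely on an `Accumulator` metrics
# helper supporting `add_dict`, key access, and division by a key name or a
# number (e.g. `metrics / 'cnt'`, `metrics / cnt`). The sketch below is only a
# minimal stand-in consistent with that usage, not necessarily the exact
# implementation in FastAutoAugment/metrics.py.
from collections import defaultdict


class AccumulatorSketch:
    def __init__(self):
        self.metrics = defaultdict(float)

    def add_dict(self, d):
        for k, v in d.items():
            self.metrics[k] += v

    def __getitem__(self, key):
        return self.metrics[key]

    def __truediv__(self, other):
        out = AccumulatorSketch()
        for k, v in self.metrics.items():
            # dividing by a key name (e.g. 'cnt') normalizes every entry by that entry
            out.metrics[k] = v / (self.metrics[other] if isinstance(other, str) else other)
        return out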
def eval_tta3(config, augment, reporter):
    C.get()
    C.get().conf = config
    save_path = augment['save_path']
    cv_id, gr_id = augment['cv_id'], augment['gr_id']
    gr_ids = augment['gr_ids']

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    del ckpt
    model.eval()

    loader = get_post_dataloader(C.get()['dataset'], C.get()['batch'],
                                 augment['dataroot'], augment['cv_ratio_test'],
                                 cv_id, gr_id, gr_ids)

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    for data, label in loader:
        data = data.cuda()
        label = label.cuda()

        pred = model(data)
        loss = loss_fn(pred, label)  # (N,)

        _, pred = pred.topk(1, 1, True, True)
        pred = pred.t()
        correct = pred.eq(label.view(1, -1).expand_as(pred)).detach().cpu().numpy()  # (1, N)

        metrics.add_dict({
            'loss': np.sum(loss.detach().cpu().numpy()),
            'correct': np.sum(correct),
            'cnt': len(data)
        })
        del loss, correct, pred, data, label
    del model, loader

    metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(loss=metrics['loss'], top1_valid=metrics['correct'],
             elapsed_time=gpu_secs, done=True)
    return metrics['correct']
def eval_tta2(config, augment, reporter):
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_id, save_path = augment['cv_ratio_test'], augment['cv_id'], augment['save_path']
    gr_id = augment['gr_id']
    num_repeat = 1

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model.eval()

    loaders = []
    for i in range(num_repeat):
        _, tl, validloader, tl2 = get_dataloaders(C.get()['dataset'], C.get()['batch'],
                                                  augment['dataroot'], cv_ratio_test,
                                                  split_idx=cv_id,
                                                  gr_assign=augment['gr_assign'],
                                                  gr_id=gr_id)
        loaders.append(validloader)
        del tl, tl2

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    for loader in loaders:
        for data, label in loader:
            data = data.cuda()
            label = label.cuda()

            pred = model(data)
            loss = loss_fn(pred, label)  # (N,)

            _, pred = pred.topk(1, 1, True, True)
            pred = pred.t()
            correct = pred.eq(label.view(1, -1).expand_as(pred)).detach().cpu().numpy()  # (1, N)

            metrics.add_dict({
                'minus_loss': -1 * np.sum(loss.detach().cpu().numpy()),
                'correct': np.sum(correct),
                'cnt': len(data)
            })
            del loss, correct, pred, data, label
    del model

    metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'],
             elapsed_time=gpu_secs, done=True)
    return metrics['correct']
def eval_tta(config, augment, reporter, num_class, get_model, get_dataloaders):
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_fold, save_path = (
        augment['cv_ratio_test'],
        augment['cv_fold'],
        augment['save_path'],
    )

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model.eval()

    loaders = []
    for _ in range(augment['num_policy']):  # TODO
        _, tl, validloader, tl2 = get_dataloaders(
            C.get()['dataset'],
            C.get()['batch'],
            augment['dataroot'],
            cv_ratio_test,
            split_idx=cv_fold,
        )
        loaders.append(iter(validloader))
        del tl, tl2

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    try:
        while True:
            losses = []
            corrects = []
            for loader in loaders:
                data, label = next(loader)
                data = data.cuda()
                label = label.cuda()

                pred = model(data)
                loss = loss_fn(pred, label)
                # reshape to (1, N) so the min below runs across policies,
                # not across the flattened samples
                losses.append(loss.detach().cpu().numpy().reshape(1, -1))

                _, pred = pred.topk(1, 1, True, True)
                pred = pred.t()
                correct = pred.eq(label.view(1, -1).expand_as(pred)).detach().cpu().numpy()  # (1, N)
                corrects.append(correct)
                del loss, correct, pred, data, label

            losses = np.concatenate(losses)                    # (num_policy, N)
            losses_min = np.min(losses, axis=0).squeeze()      # (N,)
            corrects = np.concatenate(corrects)                # (num_policy, N)
            corrects_max = np.max(corrects, axis=0).squeeze()  # (N,)
            metrics.add_dict({
                'minus_loss': -1 * np.sum(losses_min),
                'correct': np.sum(corrects_max),
                'cnt': len(corrects_max),
            })
            del corrects, corrects_max
    except StopIteration:
        pass

    del model
    metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(
        minus_loss=metrics['minus_loss'],
        top1_valid=metrics['correct'],
        elapsed_time=gpu_secs,
        done=True,
    )
    return metrics['correct']
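# NOTE: toy illustration (made-up numbers) of the aggregation above. Each of
# the num_policy loaders evaluates the validation samples under a different
# sampled policy, so per sample we keep the minimum loss and count the sample
# correct if any augmented pass classified it correctly.
import numpy as np

toy_losses = np.array([[0.9, 0.2, 1.5],   # policy pass 1, shape (num_policy, N)
                       [0.4, 0.8, 1.1]])  # policy pass 2
toy_corrects = np.array([[0, 1, 0],
                         [1, 1, 0]])

print(np.min(toy_losses, axis=0))    # [0.4 0.2 1.1] -> best loss per sample
print(np.max(toy_corrects, axis=0))  # [1 1 0]       -> correct under any policy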
def train_and_eval(tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None,
                   metric='last', save_path=None, only_eval=False,
                   local_rank=-1, evaluation_interval=5):
    total_batch = C.get()['batch']
    if local_rank >= 0:
        dist.init_process_group(backend='nccl', init_method='env://',
                                world_size=int(os.environ['WORLD_SIZE']))
        device = torch.device('cuda', local_rank)
        torch.cuda.set_device(device)

        C.get()['lr'] *= dist.get_world_size()
        logger.info(f'local batch={C.get()["batch"]} world_size={dist.get_world_size()} '
                    f'----> total batch={C.get()["batch"] * dist.get_world_size()}')
        total_batch = C.get()['batch'] * dist.get_world_size()

    is_master = local_rank < 0 or dist.get_rank() == 0
    if is_master:
        add_filehandler(logger, 'master' + '.log')

    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = C.get()['epoch']
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(
        C.get()['dataset'], C.get()['batch'], dataroot, test_ratio,
        split_idx=cv_fold, multinode=(local_rank >= 0))

    # create a model & an optimizer
    model = get_model(C.get()['model'], num_class(C.get()['dataset']), local_rank=local_rank)
    model_ema = get_model(C.get()['model'], num_class(C.get()['dataset']), local_rank=-1)
    model_ema.eval()

    criterion_ce = criterion = CrossEntropyLabelSmooth(
        num_class(C.get()['dataset']), C.get().conf.get('lb_smooth', 0))
    if C.get().conf.get('mixup', 0.0) > 0.0:
        criterion = CrossEntropyMixUpLabelSmooth(
            num_class(C.get()['dataset']), C.get().conf.get('lb_smooth', 0))

    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(
            model.parameters(),
            lr=C.get()['lr'],
            momentum=C.get()['optimizer'].get('momentum', 0.9),
            weight_decay=0.0,
            nesterov=C.get()['optimizer'].get('nesterov', True))
    elif C.get()['optimizer']['type'] == 'rmsprop':
        optimizer = RMSpropTF(model.parameters(), lr=C.get()['lr'], weight_decay=0.0,
                              alpha=0.9, momentum=0.9, eps=0.001)
    else:
        raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type'])

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=C.get()['epoch'], eta_min=0.)
    elif lr_scheduler_type == 'resnet':
        scheduler = adjust_learning_rate_resnet(optimizer)
    elif lr_scheduler_type == 'efficientnet':
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lambda x: 0.97 ** int((x + C.get()['lr_schedule']['warmup']['epoch']) / 2.4))
    else:
        raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

    if C.get()['lr_schedule'].get('warmup', None) and C.get()['lr_schedule']['warmup']['epoch'] > 0:
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler)

    if not tag or not is_master:
        from FastAutoAugment.metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [SummaryWriter(log_dir='./logs/%s/%s' % (tag, x)) for x in ['train', 'valid', 'test']]

    if C.get()['optimizer']['ema'] > 0.0 and is_master:
        # https://discuss.pytorch.org/t/how-to-apply-exponential-moving-average-decay-for-variables/10856/4?u=ildoonet
        ema = EMA(C.get()['optimizer']['ema'])
    else:
        ema = None

    result = OrderedDict()
    epoch_start = 1
    # TODO: change only_eval=False when no save_path is given?
    if save_path != 'test.pth':  # and is_master: --> should load all data (not able to be broadcasted)
        if save_path and not os.path.exists(save_path):
            import torch.utils.model_zoo as model_zoo
            data = model_zoo.load_url(
                'https://download.pytorch.org/models/resnet50-19c8e357.pth',
                model_dir=os.path.join(os.getcwd(), 'FastAutoAugment/models'))
            if C.get()['dataset'] == 'cifar10':
                # the ImageNet head does not match CIFAR-10; drop the final FC weights
                data.pop('fc.weight')
                data.pop('fc.bias')
            model_dict = model.state_dict()
            model_dict.update(data)
            model.load_state_dict(model_dict)
            torch.save(model_dict, save_path)

        logger.info('%s file found. loading...' % save_path)
        data = torch.load(save_path)
        key = 'model' if 'model' in data else 'state_dict'

        if 'epoch' not in data:
            model.load_state_dict(data)
        else:
            logger.info('checkpoint epoch@%d' % data['epoch'])
            if not isinstance(model, (DataParallel, DistributedDataParallel)):
                model.load_state_dict({k.replace('module.', ''): v for k, v in data[key].items()})
            else:
                model.load_state_dict({k if 'module.' in k else 'module.' + k: v
                                       for k, v in data[key].items()})
            logger.info('optimizer.load_state_dict+')
            optimizer.load_state_dict(data['optimizer'])
            if data['epoch'] < C.get()['epoch']:
                epoch_start = data['epoch']
            else:
                only_eval = True
            if ema is not None:
                ema.shadow = data.get('ema', {}) if isinstance(data.get('ema', {}), dict) \
                    else data['ema'].state_dict()
        del data

    if local_rank >= 0:
        for name, x in model.state_dict().items():
            dist.broadcast(x, 0)
        logger.info(f'multinode init. local_rank={dist.get_rank()} is_master={is_master}')
        torch.cuda.synchronize()

    tqdm_disabled = bool(os.environ.get('TASK_NAME', '')) and local_rank != 0  # KakaoBrain environment

    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, None, desc_default='train',
                                epoch=0, writer=writers[0], is_master=is_master)

        with torch.no_grad():
            rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid',
                                    epoch=0, writer=writers[1], is_master=is_master)
            rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test',
                                   epoch=0, writer=writers[2], is_master=is_master)
            if ema is not None and len(ema) > 0:
                model_ema.load_state_dict({k.replace('module.', ''): v
                                           for k, v in ema.state_dict().items()})
                rs['valid'] = run_epoch(model_ema, validloader, criterion_ce, None,
                                        desc_default='valid(EMA)', epoch=0, writer=writers[1],
                                        verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(model_ema, testloader_, criterion_ce, None,
                                       desc_default='*test(EMA)', epoch=0, writer=writers[2],
                                       verbose=is_master, tqdm_disabled=tqdm_disabled)

        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
            if setname not in rs:
                continue
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    best_top1 = 0
    for epoch in range(epoch_start, max_epoch + 1):
        if local_rank >= 0:
            trainsampler.set_epoch(epoch)

        model.train()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, optimizer, desc_default='train',
                                epoch=epoch, writer=writers[0],
                                verbose=(is_master and local_rank <= 0), scheduler=scheduler,
                                ema=ema, wd=C.get()['optimizer']['decay'],
                                tqdm_disabled=tqdm_disabled)
        model.eval()

        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        if ema is not None and C.get()['optimizer']['ema_interval'] > 0 \
                and epoch % C.get()['optimizer']['ema_interval'] == 0:
            logger.info(f'ema synced+ rank={dist.get_rank()}')
            if ema is not None:
                model.load_state_dict(ema.state_dict())
            for name, x in model.state_dict().items():
                # print(name)
                dist.broadcast(x, 0)
            torch.cuda.synchronize()
            logger.info(f'ema synced- rank={dist.get_rank()}')

        if is_master and (epoch % evaluation_interval == 0 or epoch == max_epoch):
            with torch.no_grad():
                rs['valid'] = run_epoch(model, validloader, criterion_ce, None, desc_default='valid',
                                        epoch=epoch, writer=writers[1], verbose=is_master,
                                        tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(model, testloader_, criterion_ce, None, desc_default='*test',
                                       epoch=epoch, writer=writers[2], verbose=is_master,
                                       tqdm_disabled=tqdm_disabled)

                if ema is not None:
                    model_ema.load_state_dict({k.replace('module.', ''): v
                                               for k, v in ema.state_dict().items()})
                    rs['valid'] = run_epoch(model_ema, validloader, criterion_ce, None,
                                            desc_default='valid(EMA)', epoch=epoch,
                                            writer=writers[1], verbose=is_master,
                                            tqdm_disabled=tqdm_disabled)
                    rs['test'] = run_epoch(model_ema, testloader_, criterion_ce, None,
                                           desc_default='*test(EMA)', epoch=epoch,
                                           writer=writers[2], verbose=is_master,
                                           tqdm_disabled=tqdm_disabled)

            logger.info(
                f'epoch={epoch} '
                f'[train] loss={rs["train"]["loss"]:.4f} top1={rs["train"]["top1"]:.4f} '
                f'[valid] loss={rs["valid"]["loss"]:.4f} top1={rs["valid"]["top1"]:.4f} '
                f'[test] loss={rs["test"]["loss"]:.4f} top1={rs["test"]["top1"]:.4f} '
            )

            if metric == 'last' or rs[metric]['top1'] > best_top1:
                if metric != 'last':
                    best_top1 = rs[metric]['top1']
                for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                                      ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('valid_top1/best', rs['valid']['top1'], epoch)
                writers[2].add_scalar('test_top1/best', rs['test']['top1'], epoch)

                reporter(loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'],
                         loss_test=rs['test']['loss'], top1_test=rs['test']['top1'])

                # save checkpoint
                if is_master and save_path:
                    logger.info('save model@%d to %s, err=%.4f' % (epoch, save_path, 1 - best_top1))
                    torch.save({
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'valid': rs['valid'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict(),
                        'ema': ema.state_dict() if ema is not None else None,
                    }, save_path)

    del model
    result['top1_test'] = best_top1
    return result
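# NOTE: `EMA` above is touched only through `ema.shadow`, `ema.state_dict()`,
# and `len(ema)` (plus being passed into run_epoch). The sketch below is an
# assumed minimal implementation consistent with that interface, not the
# repo's actual FastAutoAugment class.
class EMASketch:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}  # parameter name -> smoothed copy

    def __len__(self):
        return len(self.shadow)

    def state_dict(self):
        return self.shadow

    def update(self, model):
        # shadow <- decay * shadow + (1 - decay) * current weights
        for name, param in model.state_dict().items():
            if name not in self.shadow:
                self.shadow[name] = param.detach().clone()
            else:
                self.shadow[name] = self.shadow[name] * self.decay \
                    + param.detach() * (1.0 - self.decay)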
def eval_tta(config, augment):
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_fold, save_path = augment['cv_ratio_test'], augment['cv_fold'], augment['save_path']
    print(augment)

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model.eval()

    loaders = []
    for _ in range(augment['num_policy']):  # TODO
        _, tl, validloader, tl2 = get_dataloaders(C.get()['dataset'], C.get()['batch'],
                                                  augment['dataroot'], cv_ratio_test,
                                                  split_idx=cv_fold)
        loaders.append(iter(validloader))
        del tl, tl2

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    try:
        while True:
            losses = []
            corrects = []
            for loader in loaders:
                data, label = next(loader)
                data = data.cuda()
                label = label.cuda()

                pred = model(data)
                loss = loss_fn(pred, label)
                # reshape to (1, N) so the min below is taken across policies
                losses.append(loss.detach().cpu().numpy().reshape(1, -1))

                _, pred = pred.topk(1, 1, True, True)
                pred = pred.t()
                correct = pred.eq(label.view(1, -1).expand_as(pred)).detach().cpu().numpy()  # (1, N)
                corrects.append(correct)
                del loss, correct, pred, data, label

            losses = np.concatenate(losses)                    # (num_policy, N)
            losses_min = np.min(losses, axis=0).squeeze()      # (N,)
            corrects = np.concatenate(corrects)                # (num_policy, N)
            corrects_max = np.max(corrects, axis=0).squeeze()  # (N,)
            metrics.add_dict({
                'minus_loss': -1 * np.sum(losses_min),
                'correct': np.sum(corrects_max),
                'cnt': len(corrects_max)
            })
            del corrects, corrects_max
    except StopIteration:
        pass

    del model
    metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    # reporter(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    tune.track.log(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'],
                   elapsed_time=gpu_secs, done=True)
    return metrics['correct']
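# NOTE: `tune.track.log` is the reporting API of pre-1.0 Ray Tune. If this
# script is run against a newer Ray release, the equivalent call (an
# assumption about your installed Ray version; verify locally) would be:
#     from ray import tune
#     tune.report(minus_loss=..., top1_valid=..., elapsed_time=..., done=True)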
def train_and_eval(tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None,
                   metric='last', save_path=None, only_eval=False, horovod=False):
    if horovod:
        import horovod.torch as hvd
        hvd.init()
        device = torch.device('cuda', hvd.local_rank())
        torch.cuda.set_device(device)

    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = C.get()['epoch']
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(
        C.get()['dataset'], C.get()['batch'], dataroot, test_ratio,
        split_idx=cv_fold, horovod=horovod)

    # create a model & an optimizer
    model = get_model(C.get()['model'], num_class(C.get()['dataset']), data_parallel=(not horovod))

    criterion = nn.CrossEntropyLoss()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(
            model.parameters(),
            lr=C.get()['lr'],
            momentum=C.get()['optimizer'].get('momentum', 0.9),
            weight_decay=C.get()['optimizer']['decay'],
            nesterov=C.get()['optimizer']['nesterov']
        )
    else:
        raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type'])

    is_master = True
    if horovod:
        optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
        optimizer._requires_update = set()  # issue : https://github.com/horovod/horovod/issues/1099
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        if hvd.rank() != 0:
            is_master = False
    logger.debug('is_master=%s' % is_master)

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=C.get()['epoch'], eta_min=0.)
    elif lr_scheduler_type == 'resnet':
        scheduler = adjust_learning_rate_resnet(optimizer)
    elif lr_scheduler_type == 'pyramid':
        scheduler = adjust_learning_rate_pyramid(optimizer, C.get()['epoch'])
    else:
        raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

    if C.get()['lr_schedule'].get('warmup', None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler
        )

    if not tag or not is_master:
        from FastAutoAugment.metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [SummaryWriter(log_dir='./logs/%s/%s' % (tag, x)) for x in ['train', 'valid', 'test']]

    result = OrderedDict()
    epoch_start = 1
    if save_path and os.path.exists(save_path):
        logger.info('%s file found. loading...' % save_path)
        data = torch.load(save_path)
        if 'model' in data:
            logger.info('checkpoint epoch@%d' % data['epoch'])
            if not isinstance(model, DataParallel):
                model.load_state_dict({k.replace('module.', ''): v for k, v in data['model'].items()})
            else:
                model.load_state_dict({k if 'module.' in k else 'module.' + k: v
                                       for k, v in data['model'].items()})
            optimizer.load_state_dict(data['optimizer'])
            if data['epoch'] < C.get()['epoch']:
                epoch_start = data['epoch']
            else:
                only_eval = True
        else:
            model.load_state_dict({k: v for k, v in data.items()})
        del data
    else:
        logger.info('"%s" file not found. skip to pretrain weights...' % save_path)
        if only_eval:
            logger.warning('model checkpoint not found. only-evaluation mode is off.')
            only_eval = False

    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, None, desc_default='train', epoch=0, writer=writers[0])
        rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid', epoch=0, writer=writers[1])
        rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test', epoch=0, writer=writers[2])
        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
            if setname not in rs:
                continue
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    best_top1 = 0
    for epoch in range(epoch_start, max_epoch + 1):
        if horovod:
            trainsampler.set_epoch(epoch)

        model.train()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, optimizer, desc_default='train',
                                epoch=epoch, writer=writers[0], verbose=is_master, scheduler=scheduler)
        model.eval()

        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        if epoch % 5 == 0 or epoch == max_epoch:
            rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid',
                                    epoch=epoch, writer=writers[1], verbose=is_master)
            rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test',
                                   epoch=epoch, writer=writers[2], verbose=is_master)

            if metric == 'last' or rs[metric]['top1'] > best_top1:
                if metric != 'last':
                    best_top1 = rs[metric]['top1']
                for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                                      ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('valid_top1/best', rs['valid']['top1'], epoch)
                writers[2].add_scalar('test_top1/best', rs['test']['top1'], epoch)

                reporter(
                    loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'],
                    loss_test=rs['test']['loss'], top1_test=rs['test']['top1']
                )

                # save checkpoint
                if is_master and save_path:
                    logger.info('save model@%d to %s' % (epoch, save_path))
                    torch.save({
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'valid': rs['valid'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict()
                    }, save_path)

    del model

    result['top1_test'] = best_top1
    return result
def train_and_eval(tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None,
                   metric='last', save_path=None, only_eval=False, horovod=False):
    if horovod:
        import horovod.torch as hvd
        hvd.init()
        device = torch.device('cuda', hvd.local_rank())
        torch.cuda.set_device(device)

    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = C.get()['epoch']
    # trainsampler, trainloader, validloader, testloader_ = get_dataloaders(dataroot, C.get()['batch'], horovod=horovod)
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(
        C.get()['dataset'], C.get()['batch'], dataroot, test_ratio,
        split_idx=cv_fold, horovod=horovod)

    # create a model & an optimizer
    model = get_model(C.get()['model'], num_class(C.get()['dataset']), data_parallel=(not horovod))

    criterion = nn.CrossEntropyLoss()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(
            model.parameters(),
            lr=C.get()['lr'],
            momentum=C.get()['optimizer'].get('momentum', 0.9),
            weight_decay=C.get()['optimizer']['decay'],
            nesterov=C.get()['optimizer']['nesterov']
        )
    else:
        raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type'])

    is_master = True
    if horovod:
        optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
        optimizer._requires_update = set()  # issue : https://github.com/horovod/horovod/issues/1099
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        if hvd.rank() != 0:
            is_master = False
    logger.debug('is_master=%s' % is_master)

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        t_max = C.get()['epoch']
        if C.get()['lr_schedule'].get('warmup', None):
            # warmup epochs are handled by GradualWarmupScheduler below
            t_max -= C.get()['lr_schedule']['warmup']['epoch']
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max, eta_min=0.)
    elif lr_scheduler_type == 'resnet':
        scheduler = adjust_learning_rate_resnet(optimizer)
    elif lr_scheduler_type == 'pyramid':
        scheduler = adjust_learning_rate_pyramid(optimizer, C.get()['epoch'])
    else:
        raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

    if C.get()['lr_schedule'].get('warmup', None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler
        )

    if not tag.strip() or not is_master:
        from FastAutoAugment.metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [SummaryWriter(log_dir='/app/results/logs/%s/%s' % (tag, x))
               for x in ['train', 'valid', 'test']]

    result = OrderedDict()
    epoch_start = 1
    # if save_path and os.path.exists(save_path):
    #     data = torch.load(save_path)
    #     if 'model' in data:
    #         # TODO : patch, horovod trained checkpoint
    #         new_state_dict = {}
    #         for k, v in data['model'].items():
    #             if not horovod and 'module.' not in k:
    #                 new_state_dict['module.' + k] = v
    #             else:
    #                 new_state_dict[k] = v
    #         model.load_state_dict(new_state_dict)
    #         optimizer.load_state_dict(data['optimizer'])
    #         logger.info('ckpt epoch@%d' % data['epoch'])
    #         if data['epoch'] < C.get()['epoch']:
    #             epoch_start = data['epoch']
    #         else:
    #             only_eval = True
    #             logger.info('epoch=%d' % data['epoch'])
    #     else:
    #         model.load_state_dict(data)
    #     del data

    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, None, desc_default='train', epoch=0, writer=writers[0])
        rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid', epoch=0, writer=writers[1])
        rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test', epoch=0, writer=writers[2])
        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    best_valid_loss = 10e10
    for epoch in range(epoch_start, max_epoch + 1):
        if horovod:
            trainsampler.set_epoch(epoch)

        model.train()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, optimizer, desc_default='train',
                                epoch=epoch, writer=writers[0], verbose=is_master, scheduler=scheduler)
        AugmentationPba.epoch += 1  # advance the PBA augmentation schedule
        scheduler.step(epoch)
        model.eval()

        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        if epoch % (10 if 'cifar' in C.get()['dataset'] else 30) == 0 or epoch == max_epoch:
            rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid',
                                    epoch=epoch, writer=writers[1], verbose=is_master)
            rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test',
                                   epoch=epoch, writer=writers[2], verbose=is_master)

            if metric == 'last' or rs[metric]['loss'] < best_valid_loss:  # TODO
                if metric != 'last':
                    best_valid_loss = rs[metric]['loss']
                for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                                      ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('valid_top1/best', rs['valid']['top1'], epoch)
                writers[2].add_scalar('test_top1/best', rs['test']['top1'], epoch)

                reporter(
                    loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'],
                    loss_test=rs['test']['loss'], top1_test=rs['test']['top1']
                )

                # save checkpoint
                if is_master and save_path:
                    model_name = C.get()['model']['type']
                    if 'skip' in model_name:
                        # e.g. alpha=0.001 -> int(log10(1/0.001)) = 3
                        alpha = int(np.log10(1 / C.get()['alpha']))
                        filename = '{}/{}_last_epoch_alpha_{}.pth'.format(save_path, model_name, alpha)
                    else:
                        filename = '{}/{}_last_epoch.pth'.format(save_path, model_name)
                    logger.info('save model@%d to %s' % (epoch, filename))
                    torch.save({
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'valid': rs['valid'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict()
                    }, filename)

    del model
    return result
def train(self, policy, config):
    # gr: assigns each sample to the group whose optimal policy yields the
    # highest average reward
    self.model.train()
    gr_num = self.model.module.gr_num
    cv_id = config['cv_id']
    load_path = config['load_path']
    max_step = config['max_step']

    childnet = get_model(C.get()['model'], num_class(C.get()['dataset'])).cuda()
    ckpt = torch.load(load_path)
    if 'model' in ckpt:
        childnet.load_state_dict(ckpt['model'])
    else:
        childnet.load_state_dict(ckpt)
    childnet = nn.DataParallel(childnet).cuda()
    childnet.eval()

    pol_losses = []
    ori_aug = C.get()['aug']
    C.get()['aug'] = 'clean'
    _, _, dataloader, _ = get_dataloaders(C.get()['dataset'], C.get()['batch'],
                                          config['dataroot'], config['cv_ratio_test'],
                                          split_idx=cv_id, rand_val=True)
    loader_iter = iter(dataloader)

    reports = []
    for step in range(max_step):
        try:
            data, label = next(loader_iter)
        except StopIteration:
            loader_iter = iter(dataloader)
            data, label = next(loader_iter)
        data = data.cuda()
        label = label.cuda()

        logits = self.model(data, label)
        if self.mode == 'supervised':
            # pick, per sample, the group whose policy gives the lowest childnet loss
            with torch.no_grad():
                losses = torch.zeros(gr_num, data.size(0)).cuda()
                for i in range(gr_num):
                    aug_data = self.augmentation(data, i * torch.ones(data.size(0)), policy)
                    losses[i] = self.loss_fn(childnet(aug_data), label)
                optimal_gr_ids = losses.min(0)[1]
            loss = self.loss_fn(logits, optimal_gr_ids).mean()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            report_number = loss
        else:
            m = Categorical(logits)
            gr_ids = m.sample()
            log_probs = m.log_prob(gr_ids)
            entropys = m.entropy()
            with torch.no_grad():
                probs = m.log_prob(torch.tensor([[i] for i in range(gr_num)]).cuda()).exp()
                rewards_list = torch.zeros(gr_num, data.size(0)).cuda()
                for i in range(gr_num):
                    aug_data = self.augmentation(data, i * torch.ones_like(gr_ids), policy)
                    rewards_list[i] = 1. / (self.loss_fn(childnet(aug_data), label) + self.eps)
            rewards = torch.tensor([rewards_list[gr_id][idx]
                                    for idx, gr_id in enumerate(gr_ids)]).cuda().detach()
            # value function as baseline
            baselines = sum([prob * reward for prob, reward in zip(probs, rewards_list)])
            advantages = rewards - baselines

            if self.mode == 'reinforce':
                loss = (-log_probs * advantages).mean()
            elif self.mode == 'ppo':
                old_log_probs = log_probs.detach()
                gr_ids = m.sample()
                log_probs = m.log_prob(gr_ids)
                ratios = (log_probs - old_log_probs).exp()
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
                loss = -torch.min(surr1, surr2).mean()
            loss -= self.ent_w * entropys.mean()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            report_number = advantages.mean()

        self.optimizer.step()
        self.optimizer.zero_grad()
        reports.append(float(report_number.cpu().detach()))
        if step % self.eval_step == 0 or step == max_step - 1:
            if self.mode == 'supervised':
                entropy = 0.
            else:
                entropy = (self.ent_w * entropys.mean()).cpu().detach().data
            print(f'[step{step}/{max_step}] objective {np.mean(reports):.4f}, entropy {entropy:.4f}')

    C.get()['aug'] = ori_aug
    return reports
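# NOTE: toy illustration (dummy tensors) of the clipped PPO surrogate used in
# the "ppo" branch of train() above; self-contained and independent of the
# class state.
import torch

adv = torch.tensor([0.5, -0.2])        # advantages
old_logp = torch.tensor([-1.0, -0.7])  # log-probs at sampling time
new_logp = torch.tensor([-0.8, -1.1])  # log-probs under the current policy
eps_clip = 0.2

ratios = (new_logp - old_logp).exp()
surr1 = ratios * adv
surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * adv
ppo_loss = -torch.min(surr1, surr2).mean()  # pessimistic (clipped) objective
print(ppo_loss)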
def train_controller(controller, dataloaders, save_path, ctl_save_path):
    dataset = C.get()['test_dataset']
    ctl_train_steps = 1500
    ctl_num_aggre = 10
    ctl_entropy_w = 1e-5
    ctl_ema_weight = 0.95

    metrics = Accumulator()
    cnt = 0

    controller.train()
    test_ratio = 0.
    _, _, dataloader, _ = dataloaders  # validloader
    optimizer = optim.SGD(controller.parameters(), lr=0.00035, momentum=0.9,
                          weight_decay=0.0, nesterov=True)
    # optimizer = optim.Adam(controller.parameters(), lr=0.00035)

    # create a model & a criterion
    model = get_model(C.get()['model'], num_class(dataset), local_rank=-1)
    criterion = CrossEntropyLabelSmooth(num_class(dataset),
                                        C.get().conf.get('lb_smooth', 0),
                                        reduction='batched_sum').cuda()

    # load model weights
    data = torch.load(save_path)
    key = 'model' if 'model' in data else 'state_dict'

    if 'epoch' not in data:
        model.load_state_dict(data)
    else:
        logger.info('checkpoint epoch@%d' % data['epoch'])
        if not isinstance(model, (DataParallel, DistributedDataParallel)):
            model.load_state_dict({k.replace('module.', ''): v for k, v in data[key].items()})
        else:
            model.load_state_dict({k if 'module.' in k else 'module.' + k: v
                                   for k, v in data[key].items()})
    del data
    model.eval()

    loader_iter = iter(dataloader)  # [(image)->ToTensor->Normalize]
    baseline = None

    if os.path.isfile(ctl_save_path):
        logger.info('------Controller load------')
        checkpoint = torch.load(ctl_save_path)
        controller.load_state_dict(checkpoint['ctl_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        cnt = checkpoint['cnt']
        mean_probs = checkpoint['mean_probs']
        accs = checkpoint['accs']
        metrics_dict = checkpoint['metrics']
        metrics.metrics = metrics_dict
        init_step = checkpoint['step']
    else:
        logger.info('------Train Controller from scratch------')
        mean_probs = []
        accs = []
        init_step = 0

    for step in tqdm(range(init_step + 1, ctl_train_steps * ctl_num_aggre + 1)):
        try:
            inputs, labels = next(loader_iter)
        except StopIteration:
            loader_iter = iter(dataloader)
            inputs, labels = next(loader_iter)
        batch_size = len(labels)
        inputs, labels = inputs.cuda(), labels.cuda()
        log_probs, entropys, sampled_policies = controller(inputs)

        # evaluate model with augmented validation dataset
        with torch.no_grad():
            # compare Accuracy before/after augmentation
            # ori_preds = model(inputs)
            # ori_top1, ori_top5 = accuracy(ori_preds, labels, (1, 5))
            batch_policies = batch_policy_decoder(sampled_policies)  # (list:list:list:tuple) [batch, num_policy, n_op, 3]
            aug_inputs, applied_policy = augment_data(inputs, batch_policies)
            aug_inputs = aug_inputs.cuda()
            # assert type(aug_inputs) == torch.Tensor, "Augmented Input Type Error: {}".format(type(aug_inputs))
            preds = model(aug_inputs)
            model_losses = criterion(preds, labels)  # (tensor)[batch]
            top1, top5 = accuracy(preds, labels, (1, 5))
            # logger.info("Acc B/A Aug, {:.2f}->{:.2f}".format(ori_top1, top1))

        # assert model_losses.shape == entropys.shape == log_probs.shape, \
        #     "[Size miss match] loss: {}, entropy: {}, log_prob: {}".format(model_losses.shape, entropys.shape, log_probs.shape)
        rewards = -model_losses + ctl_entropy_w * entropys  # (tensor)[batch]
        if baseline is None:
            baseline = -model_losses.mean()  # scalar tensor
        else:
            # exponential moving average of the batch-mean reward
            baseline = baseline - (1 - ctl_ema_weight) * (baseline - rewards.mean().detach())
        # baseline = 0.
        loss = -1 * (log_probs * (rewards - baseline)).mean()  # scalar tensor
        # average gradient over ctl_num_aggre samples
        loss = loss / ctl_num_aggre
        loss.backward(retain_graph=True)

        metrics.add_dict({
            'loss': loss.item() * batch_size,
            'top1': top1.item() * batch_size,
            'top5': top5.item() * batch_size,
        })
        cnt += batch_size

        if (step + 1) % ctl_num_aggre == 0:
            torch.nn.utils.clip_grad_norm_(controller.parameters(), 5.0)
            optimizer.step()
            controller.zero_grad()
            # torch.cuda.empty_cache()
            logger.info('\n[Train Controller %03d/%03d] log_prob %02f, %s',
                        step, ctl_train_steps * ctl_num_aggre,
                        log_probs.mean().item(), metrics / cnt)

        if step % 100 == 0 or step == ctl_train_steps * ctl_num_aggre:
            save_pic(inputs, aug_inputs, labels, applied_policy, batch_policies, step)
            ps = []
            for pol in batch_policies:  # (list:list:list:tuple) [batch, num_policy, n_op, 3]
                for ops in pol:
                    for op in ops:
                        p = op[1]
                        ps.append(p)
            mean_prob = np.mean(ps)
            mean_probs.append(mean_prob)
            accs.append(top1.item())
            print('Mean probability: {:.2f}'.format(mean_prob))
            torch.save({
                'step': step,
                'ctl_state_dict': controller.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'metrics': dict(metrics.metrics),
                'cnt': cnt,
                'mean_probs': mean_probs,
                'accs': accs
            }, ctl_save_path)

    return metrics, None  # baseline.item()
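# NOTE: the controller baseline above is an exponential moving average of the
# batch-mean reward (ctl_ema_weight = 0.95):
#     baseline <- baseline - (1 - w) * (baseline - mean(rewards))
# A toy trace of the update (made-up numbers):
w = 0.95
baseline = -2.30  # initialized from -mean(model_losses) on the first batch
for reward_mean in (-2.1, -1.9, -2.0):
    baseline = baseline - (1 - w) * (baseline - reward_mean)
print(round(baseline, 4))  # drifts slowly toward recent reward means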
def eval_tta(config, augment, reporter):
    C.get()
    C.get().conf = config
    save_path = augment['save_path']
    cv_id, gr_id = augment['cv_id'], augment['gr_id']
    gr_ids = augment['gr_ids']

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model.eval()

    loaders = []
    for _ in range(augment['num_policy']):  # TODO
        loader = get_post_dataloader(C.get()['dataset'], C.get()['batch'],
                                     augment['dataroot'], augment['cv_ratio_test'],
                                     cv_id, gr_id, gr_ids)
        loaders.append(iter(loader))

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    try:
        while True:
            losses = []
            corrects = []
            for loader in loaders:
                data, label = next(loader)
                data = data.cuda()
                label = label.cuda()

                pred = model(data)
                loss = loss_fn(pred, label)
                losses.append(loss.detach().cpu().numpy().reshape(1, -1))  # (1, N)

                _, pred = pred.topk(1, 1, True, True)
                pred = pred.t()
                correct = pred.eq(label.view(1, -1).expand_as(pred)).detach().cpu().numpy()  # (1, N)
                corrects.append(correct)
                del loss, correct, pred, data, label

            losses = np.concatenate(losses)
            losses_min = np.min(losses, axis=0).squeeze()  # (N,)
            corrects = np.concatenate(corrects)
            corrects_max = np.max(corrects, axis=0).squeeze()  # (N,)
            metrics.add_dict({
                'loss': np.sum(losses_min),
                'correct': np.sum(corrects_max),
                'cnt': corrects_max.size
            })
            del corrects, corrects_max
    except StopIteration:
        pass

    del model
    metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(loss=metrics['loss'], top1_valid=metrics['correct'],
             elapsed_time=gpu_secs, done=True)
    return metrics['correct']
ops = augment_list(False)
space = {}
for i in range(args.num_policy):
    for j in range(args.num_op):
        space['policy_%d_%d' % (i, j)] = hp.choice('policy_%d_%d' % (i, j), list(range(0, len(ops))))
        space['prob_%d_%d' % (i, j)] = hp.uniform('prob_%d_%d' % (i, j), 0.0, 1.0)
        space['level_%d_%d' % (i, j)] = hp.uniform('level_%d_%d' % (i, j), 0.0, 1.0)

num_process_per_gpu = 2
total_computation = 0
reward_attr = 'top1_valid'  # top1_valid or minus_loss

# load childnet for g
childnet = get_model(C.get()['model'], num_class(C.get()['dataset']))
ckpt = torch.load(paths[0])
if 'model' in ckpt:
    childnet.load_state_dict(ckpt['model'])
else:
    childnet.load_state_dict(ckpt)

# g definition
gr_spliter = GrSpliter(childnet, gr_num=args.gr_num, mode=args.mode)
del childnet, ckpt

gr_results = []
gr_dist_collector = defaultdict(list)
# best_configs = defaultdict(lambda: None)
# result_to_save = ['timestamp', 'top1_valid', 'loss']
final_policy_group = defaultdict(lambda: [])

for r in range(args.repeat):  # run multiple times.
    for cv_id in range(cv_num):