def train_and_eval(tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None, metric='last',
                   save_path=None, only_eval=False, local_rank=-1, evaluation_interval=5):
    total_batch = C.get()["batch"]
    if local_rank >= 0:
        dist.init_process_group(backend='nccl', init_method='env://',
                                world_size=int(os.environ['WORLD_SIZE']))
        device = torch.device('cuda', local_rank)
        torch.cuda.set_device(device)

        C.get()['lr'] *= dist.get_world_size()
        logger.info(f'local batch={C.get()["batch"]} world_size={dist.get_world_size()} '
                    f'----> total batch={C.get()["batch"] * dist.get_world_size()}')
        total_batch = C.get()["batch"] * dist.get_world_size()

    is_master = local_rank < 0 or dist.get_rank() == 0
    if is_master:
        add_filehandler(logger, 'master' + '.log')

    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = C.get()['epoch']
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(
        C.get()['dataset'], C.get()['batch'], dataroot, test_ratio,
        split_idx=cv_fold, multinode=(local_rank >= 0))

    # create a model & an optimizer
    model = get_model(C.get()['model'], num_class(C.get()['dataset']), local_rank=local_rank)
    model_ema = get_model(C.get()['model'], num_class(C.get()['dataset']), local_rank=-1)
    model_ema.eval()

    criterion_ce = criterion = CrossEntropyLabelSmooth(
        num_class(C.get()['dataset']), C.get().conf.get('lb_smooth', 0))
    if C.get().conf.get('mixup', 0.0) > 0.0:
        criterion = CrossEntropyMixUpLabelSmooth(
            num_class(C.get()['dataset']), C.get().conf.get('lb_smooth', 0))

    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(
            model.parameters(),
            lr=C.get()['lr'],
            momentum=C.get()['optimizer'].get('momentum', 0.9),
            weight_decay=0.0,
            nesterov=C.get()['optimizer'].get('nesterov', True))
    elif C.get()['optimizer']['type'] == 'rmsprop':
        optimizer = RMSpropTF(model.parameters(), lr=C.get()['lr'],
                              weight_decay=0.0, alpha=0.9, momentum=0.9, eps=0.001)
    else:
        raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type'])

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=C.get()['epoch'], eta_min=0.)
    elif lr_scheduler_type == 'resnet':
        scheduler = adjust_learning_rate_resnet(optimizer)
    elif lr_scheduler_type == 'efficientnet':
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lambda x: 0.97 ** int((x + C.get()['lr_schedule']['warmup']['epoch']) / 2.4))
    else:
        raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

    if C.get()['lr_schedule'].get('warmup', None) and C.get()['lr_schedule']['warmup']['epoch'] > 0:
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler)

    if not tag or not is_master:
        from FastAutoAugment.metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [SummaryWriter(log_dir='./logs/%s/%s' % (tag, x)) for x in ['train', 'valid', 'test']]

    if C.get()['optimizer']['ema'] > 0.0 and is_master:
        # https://discuss.pytorch.org/t/how-to-apply-exponential-moving-average-decay-for-variables/10856/4?u=ildoonet
        ema = EMA(C.get()['optimizer']['ema'])
    else:
        ema = None

    result = OrderedDict()
    epoch_start = 1
    # TODO: change only_eval=False when without save_path ??
    if save_path != 'test.pth':  # and is_master: --> should load all data (not able to be broadcasted)
        if save_path and not os.path.exists(save_path):
            import torch.utils.model_zoo as model_zoo
            data = model_zoo.load_url(
                'https://download.pytorch.org/models/resnet50-19c8e357.pth',
                model_dir=os.path.join(os.getcwd(), 'FastAutoAugment/models'))
            if C.get()['dataset'] == 'cifar10':
                data.pop('fc.weight')
                data.pop('fc.bias')
            model_dict = model.state_dict()
            model_dict.update(data)
            model.load_state_dict(model_dict)
            torch.save(model_dict, save_path)

        logger.info('%s file found. loading...' % save_path)
        data = torch.load(save_path)
        key = 'model' if 'model' in data else 'state_dict'

        if 'epoch' not in data:
            model.load_state_dict(data)
        else:
            logger.info('checkpoint epoch@%d' % data['epoch'])
            if not isinstance(model, (DataParallel, DistributedDataParallel)):
                model.load_state_dict({k.replace('module.', ''): v for k, v in data[key].items()})
            else:
                model.load_state_dict({k if 'module.' in k else 'module.' + k: v
                                       for k, v in data[key].items()})
            logger.info('optimizer.load_state_dict+')
            optimizer.load_state_dict(data['optimizer'])
            if data['epoch'] < C.get()['epoch']:
                epoch_start = data['epoch']
            else:
                only_eval = True
            if ema is not None:
                ema.shadow = data.get('ema', {}) if isinstance(data.get('ema', {}), dict) \
                    else data['ema'].state_dict()
        del data

    if local_rank >= 0:
        for name, x in model.state_dict().items():
            dist.broadcast(x, 0)
        logger.info(f'multinode init. local_rank={dist.get_rank()} is_master={is_master}')
        torch.cuda.synchronize()

    tqdm_disabled = bool(os.environ.get('TASK_NAME', '')) and local_rank != 0  # KakaoBrain Environment

    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, None,
                                desc_default='train', epoch=0, writer=writers[0], is_master=is_master)

        with torch.no_grad():
            rs['valid'] = run_epoch(model, validloader, criterion, None,
                                    desc_default='valid', epoch=0, writer=writers[1], is_master=is_master)
            rs['test'] = run_epoch(model, testloader_, criterion, None,
                                   desc_default='*test', epoch=0, writer=writers[2], is_master=is_master)

            if ema is not None and len(ema) > 0:
                model_ema.load_state_dict({k.replace('module.', ''): v
                                           for k, v in ema.state_dict().items()})
                rs['valid'] = run_epoch(model_ema, validloader, criterion_ce, None,
                                        desc_default='valid(EMA)', epoch=0, writer=writers[1],
                                        verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(model_ema, testloader_, criterion_ce, None,
                                       desc_default='*test(EMA)', epoch=0, writer=writers[2],
                                       verbose=is_master, tqdm_disabled=tqdm_disabled)

        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
            if setname not in rs:
                continue
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    best_top1 = 0
    for epoch in range(epoch_start, max_epoch + 1):
        if local_rank >= 0:
            trainsampler.set_epoch(epoch)

        model.train()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, optimizer,
                                desc_default='train', epoch=epoch, writer=writers[0],
                                verbose=(is_master and local_rank <= 0), scheduler=scheduler,
                                ema=ema, wd=C.get()['optimizer']['decay'], tqdm_disabled=tqdm_disabled)
        model.eval()

        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        if ema is not None and C.get()['optimizer']['ema_interval'] > 0 \
                and epoch % C.get()['optimizer']['ema_interval'] == 0:
            logger.info(f'ema synced+ rank={dist.get_rank()}')
            if ema is not None:
                model.load_state_dict(ema.state_dict())
            for name, x in model.state_dict().items():
                # print(name)
                dist.broadcast(x, 0)
            torch.cuda.synchronize()
            logger.info(f'ema synced- rank={dist.get_rank()}')

        if is_master and (epoch % evaluation_interval == 0 or epoch == max_epoch):
            with torch.no_grad():
                rs['valid'] = run_epoch(model, validloader, criterion_ce, None,
                                        desc_default='valid', epoch=epoch, writer=writers[1],
                                        verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(model, testloader_, criterion_ce, None,
                                       desc_default='*test', epoch=epoch, writer=writers[2],
                                       verbose=is_master, tqdm_disabled=tqdm_disabled)

                if ema is not None:
                    model_ema.load_state_dict({k.replace('module.', ''): v
                                               for k, v in ema.state_dict().items()})
                    rs['valid'] = run_epoch(model_ema, validloader, criterion_ce, None,
                                            desc_default='valid(EMA)', epoch=epoch, writer=writers[1],
                                            verbose=is_master, tqdm_disabled=tqdm_disabled)
                    rs['test'] = run_epoch(model_ema, testloader_, criterion_ce, None,
                                           desc_default='*test(EMA)', epoch=epoch, writer=writers[2],
                                           verbose=is_master, tqdm_disabled=tqdm_disabled)

            logger.info(
                f'epoch={epoch} '
                f'[train] loss={rs["train"]["loss"]:.4f} top1={rs["train"]["top1"]:.4f} '
                f'[valid] loss={rs["valid"]["loss"]:.4f} top1={rs["valid"]["top1"]:.4f} '
                f'[test] loss={rs["test"]["loss"]:.4f} top1={rs["test"]["top1"]:.4f} '
            )

            if metric == 'last' or rs[metric]['top1'] > best_top1:
                if metric != 'last':
                    best_top1 = rs[metric]['top1']
                for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                                      ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('valid_top1/best', rs['valid']['top1'], epoch)
                writers[2].add_scalar('test_top1/best', rs['test']['top1'], epoch)

                reporter(loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'],
                         loss_test=rs['test']['loss'], top1_test=rs['test']['top1'])

                # save checkpoint
                if is_master and save_path:
                    logger.info('save model@%d to %s, err=%.4f' % (epoch, save_path, 1 - best_top1))
                    torch.save({
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'valid': rs['valid'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict(),
                        'ema': ema.state_dict() if ema is not None else None,
                    }, save_path)

    del model
    result['top1_test'] = best_top1
    return result
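
# The EMA helper used by train_and_eval above is not defined in this file. The sketch
# below shows the behavior assumed by the calls that do appear (ema.shadow, len(ema),
# ema.state_dict(), and the per-step update): shadow weights maintained as an
# exponential moving average of the live model's weights, with the decay value taken
# from C.get()['optimizer']['ema']. This is a hypothetical stand-in, not the repo's
# actual EMA class, and it assumes floating-point parameters.
class EMASketch:
    def __init__(self, mu):
        self.mu = mu          # decay factor, e.g. 0.999
        self.shadow = {}      # parameter name -> averaged tensor

    def __len__(self):
        return len(self.shadow)

    def state_dict(self):
        return self.shadow

    def __call__(self, model):
        # shadow = mu * shadow + (1 - mu) * param, parameter by parameter;
        # the exact call convention inside run_epoch is an assumption here.
        for name, x in model.state_dict().items():
            if name not in self.shadow:
                self.shadow[name] = x.clone()
            else:
                self.shadow[name].mul_(self.mu).add_(x, alpha=1.0 - self.mu)
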
def main():
    w = PyStopwatch()

    parser = ConfigArgumentParser(conflict_handler="resolve")
    parser.add_argument("--dataroot", type=str, default="/data/private/pretrainedmodels",
                        help="torchvision data folder")
    parser.add_argument("--until", type=int, default=5)
    parser.add_argument("--num-op", type=int, default=2)
    parser.add_argument("--num-policy", type=int, default=5)
    parser.add_argument("--num-search", type=int, default=200)
    parser.add_argument("--cv-ratio", type=float, default=0.4)
    parser.add_argument("--decay", type=float, default=-1)
    parser.add_argument("--redis", type=str, default="gpu-cloud-vnode30.dakao.io:23655")
    parser.add_argument("--per-class", action="store_true")
    parser.add_argument("--resume", action="store_true")
    parser.add_argument("--smoke-test", action="store_true")
    args = parser.parse_args()

    if args.decay > 0:
        logger.info("decay=%.4f" % args.decay)
        C.get()["optimizer"]["decay"] = args.decay
    add_filehandler(logger, os.path.join(
        "models",
        "%s_%s_cv%.1f.log" % (C.get()["dataset"], C.get()["model"]["type"], args.cv_ratio)))

    logger.info("configuration...")
    logger.info(json.dumps(C.get().conf, sort_keys=True, indent=4))
    logger.info("initialize ray...")
    ray.init(address=args.redis)

    num_result_per_cv = 10
    cv_num = 5
    copied_c = copy.deepcopy(C.get().conf)

    logger.info("search augmentation policies, dataset=%s model=%s"
                % (C.get()["dataset"], C.get()["model"]["type"]))
    logger.info("----- Train without Augmentations cv=%d ratio(test)=%.1f -----"
                % (cv_num, args.cv_ratio))
    w.start(tag="train_no_aug")
    paths = [
        _get_path(C.get()["dataset"], C.get()["model"]["type"],
                  "ratio%.1f_fold%d" % (args.cv_ratio, i))
        for i in range(cv_num)
    ]
    print(paths)
    reqs = [
        train_model.remote(copy.deepcopy(copied_c), args.dataroot, C.get()["aug"],
                           args.cv_ratio, i, save_path=paths[i], skip_exist=True)
        for i in range(cv_num)
    ]

    tqdm_epoch = tqdm(range(C.get()["epoch"]))
    is_done = False
    for epoch in tqdm_epoch:
        while True:
            epochs_per_cv = OrderedDict()
            for cv_idx in range(cv_num):
                try:
                    latest_ckpt = torch.load(paths[cv_idx])
                    if "epoch" not in latest_ckpt:
                        epochs_per_cv["cv%d" % (cv_idx + 1)] = C.get()["epoch"]
                        continue
                    epochs_per_cv["cv%d" % (cv_idx + 1)] = latest_ckpt["epoch"]
                except Exception:
                    continue
            tqdm_epoch.set_postfix(epochs_per_cv)
            if len(epochs_per_cv) == cv_num and min(epochs_per_cv.values()) >= C.get()["epoch"]:
                is_done = True
            if len(epochs_per_cv) == cv_num and min(epochs_per_cv.values()) >= epoch:
                break
            time.sleep(10)
        if is_done:
            break

    logger.info("getting results...")
    pretrain_results = ray.get(reqs)
    for r_model, r_cv, r_dict in pretrain_results:
        logger.info("model=%s cv=%d top1_train=%.4f top1_valid=%.4f"
                    % (r_model, r_cv + 1, r_dict["top1_train"], r_dict["top1_valid"]))
    logger.info("processed in %.4f secs" % w.pause("train_no_aug"))

    if args.until == 1:
        sys.exit(0)

    logger.info("----- Search Test-Time Augmentation Policies -----")
    w.start(tag="search")

    ops = augment_list(False)
    space = {}
    for i in range(args.num_policy):
        for j in range(args.num_op):
            space["policy_%d_%d" % (i, j)] = hp.choice("policy_%d_%d" % (i, j),
                                                       list(range(0, len(ops))))
            space["prob_%d_%d" % (i, j)] = hp.uniform("prob_%d_%d" % (i, j), 0.0, 1.0)
            space["level_%d_%d" % (i, j)] = hp.uniform("level_%d_%d" % (i, j), 0.0, 1.0)

    final_policy_set = []
    total_computation = 0
    reward_attr = "top1_valid"  # top1_valid or minus_loss
    for _ in range(1):  # run multiple times.
        for cv_fold in range(cv_num):
            name = "search_%s_%s_fold%d_ratio%.1f" % (
                C.get()["dataset"], C.get()["model"]["type"], cv_fold, args.cv_ratio)
            print(name)

            # def train(augs, rpt):
            def train(config, reporter):
                return eval_tta(copy.deepcopy(copied_c), config, reporter,
                                num_class, get_model, get_dataloaders)

            register_trainable(name, train)
            algo = HyperOptSearch(space, max_concurrent=4 * 20, metric=reward_attr, mode="max")

            results = run(
                train,
                name=name,
                config={
                    "dataroot": args.dataroot,
                    "save_path": paths[cv_fold],
                    "cv_ratio_test": args.cv_ratio,
                    "cv_fold": cv_fold,
                    "num_op": args.num_op,
                    "num_policy": args.num_policy,
                },
                num_samples=4 if args.smoke_test else args.num_search,
                resources_per_trial={"gpu": 1},
                stop={"training_iteration": args.num_policy},
                search_alg=algo,
                scheduler=None,
                verbose=0,
                queue_trials=True,
                resume=args.resume,
                raise_on_failed_trial=False,
            )
            print()
            df = results.results_df
            import pickle
            with open("results.pickle", "wb") as fp:
                pickle.dump(results, fp)
            df.to_csv("df.csv")
            results = df.sort_values(by=reward_attr, ascending=False)
            # results = [x for x in results if x.last_result is not None]
            # results = sorted(results, key=lambda x: x.last_result[reward_attr], reverse=True)

            # calculate computation usage
            for _, result in results.iterrows():
                total_computation += result["elapsed_time"]

            for _, result in results.iloc[:num_result_per_cv].iterrows():
                final_policy = policy_decoder(result, args.num_policy, args.num_op,
                                              prefix="config.")
                logger.info("loss=%.12f top1_valid=%.4f %s"
                            % (result["minus_loss"], result["top1_valid"], final_policy))

                final_policy = remove_deplicates(final_policy)
                final_policy_set.extend(final_policy)

    logger.info(json.dumps(final_policy_set))
    logger.info("final_policy=%d" % len(final_policy_set))
    logger.info("processed in %.4f secs, gpu hours=%.4f"
                % (w.pause("search"), total_computation / 3600.0))
    logger.info("----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----"
                % (C.get()["model"]["type"], C.get()["dataset"], C.get()["aug"], args.cv_ratio))
    w.start(tag="train_aug")

    num_experiments = 5
    default_path = [
        _get_path(C.get()["dataset"], C.get()["model"]["type"],
                  "ratio%.1f_default%d" % (args.cv_ratio, _))
        for _ in range(num_experiments)
    ]
    augment_path = [
        _get_path(C.get()["dataset"], C.get()["model"]["type"],
                  "ratio%.1f_augment%d" % (args.cv_ratio, _))
        for _ in range(num_experiments)
    ]
    reqs = [
        train_model.remote(copy.deepcopy(copied_c), args.dataroot, C.get()["aug"], 0.0, 0,
                           save_path=default_path[_], skip_exist=True)
        for _ in range(num_experiments)
    ] + [
        train_model.remote(copy.deepcopy(copied_c), args.dataroot, final_policy_set, 0.0, 0,
                           save_path=augment_path[_])
        for _ in range(num_experiments)
    ]

    tqdm_epoch = tqdm(range(C.get()["epoch"]))
    is_done = False
    for epoch in tqdm_epoch:
        while True:
            epochs = OrderedDict()
            for exp_idx in range(num_experiments):
                try:
                    if os.path.exists(default_path[exp_idx]):
                        latest_ckpt = torch.load(default_path[exp_idx])
                        epochs["default_exp%d" % (exp_idx + 1)] = latest_ckpt["epoch"]
                except Exception:
                    pass
                try:
                    if os.path.exists(augment_path[exp_idx]):
                        latest_ckpt = torch.load(augment_path[exp_idx])
                        epochs["augment_exp%d" % (exp_idx + 1)] = latest_ckpt["epoch"]
                except Exception:
                    pass

            tqdm_epoch.set_postfix(epochs)
            if len(epochs) == num_experiments * 2 and min(epochs.values()) >= C.get()["epoch"]:
                is_done = True
            if len(epochs) == num_experiments * 2 and min(epochs.values()) >= epoch:
                break
            time.sleep(10)
        if is_done:
            break

    logger.info("getting results...")
    final_results = ray.get(reqs)

    for train_mode in ["default", "augment"]:
        avg = 0.0
        for _ in range(num_experiments):
            r_model, r_cv, r_dict = final_results.pop(0)
            logger.info("[%s] top1_train=%.4f top1_test=%.4f"
                        % (train_mode, r_dict["top1_train"], r_dict["top1_test"]))
            avg += r_dict["top1_test"]
        avg /= num_experiments
        logger.info("[%s] top1_test average=%.4f (#experiments=%d)"
                    % (train_mode, avg, num_experiments))
    logger.info("processed in %.4f secs" % w.pause("train_aug"))

    logger.info(w)
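
# The search loop above turns each top-ranked trial back into an augmentation policy
# via policy_decoder. For reference, a minimal sketch of that decoding step, assuming
# the FastAutoAugment convention that a policy is a list of sub-policies, each a list
# of (op_name, prob, level) triples, and that augment_list(False) returns the op table
# as (op_fn, low, high) entries. This is an illustrative reimplementation, not
# necessarily the repo's exact function.
def policy_decoder_sketch(trial_result, num_policy, num_op, prefix=''):
    ops = augment_list(False)
    policies = []
    for i in range(num_policy):
        subpolicy = []
        for j in range(num_op):
            op_idx = trial_result['%spolicy_%d_%d' % (prefix, i, j)]
            prob = trial_result['%sprob_%d_%d' % (prefix, i, j)]
            level = trial_result['%slevel_%d_%d' % (prefix, i, j)]
            subpolicy.append((ops[op_idx][0].__name__, prob, level))
        policies.append(subpolicy)
    return policies
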
def train_and_eval(args_save, tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None,
                   metric="last", save_path=None, only_eval=False, local_rank=-1,
                   evaluation_interval=5, get_dataloaders=None, num_class=None, get_model=None):
    assert get_dataloaders is not None
    assert num_class is not None
    assert get_model is not None

    total_batch = C.get()["batch"]
    if local_rank >= 0:
        dist.init_process_group(backend="nccl", init_method="env://",
                                world_size=int(os.environ["WORLD_SIZE"]))
        device = torch.device("cuda", local_rank)
        torch.cuda.set_device(device)

        C.get()["lr"] *= dist.get_world_size()
        logger.info(f'local batch={C.get()["batch"]} world_size={dist.get_world_size()} '
                    f'----> total batch={C.get()["batch"] * dist.get_world_size()}')
        total_batch = C.get()["batch"] * dist.get_world_size()

    is_master = local_rank < 0 or dist.get_rank() == 0
    if is_master:
        add_filehandler(logger, args_save + ".log")

    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = C.get()["epoch"]
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(
        C.get()["dataset"], C.get()["batch"], dataroot, test_ratio,
        split_idx=cv_fold, multinode=(local_rank >= 0))

    # create a model & an optimizer
    model = get_model(C.get()["model"], num_class(C.get()["dataset"]), local_rank=local_rank)
    model_ema = get_model(C.get()["model"], num_class(C.get()["dataset"]), local_rank=-1)
    model_ema.eval()

    criterion_ce = criterion = CrossEntropyLabelSmooth(
        num_class(C.get()["dataset"]), C.get().conf.get("lb_smooth", 0))
    if C.get().conf.get("mixup", 0.0) > 0.0:
        criterion = CrossEntropyMixUpLabelSmooth(
            num_class(C.get()["dataset"]), C.get().conf.get("lb_smooth", 0))

    if C.get()["optimizer"]["type"] == "sgd":
        optimizer = optim.SGD(
            model.parameters(),
            lr=C.get()["lr"],
            momentum=C.get()["optimizer"].get("momentum", 0.9),
            weight_decay=0.0,
            nesterov=C.get()["optimizer"].get("nesterov", True))
    elif C.get()["optimizer"]["type"] == "rmsprop":
        optimizer = RMSpropTF(model.parameters(), lr=C.get()["lr"],
                              weight_decay=0.0, alpha=0.9, momentum=0.9, eps=0.001)
    else:
        raise ValueError("invalid optimizer type=%s" % C.get()["optimizer"]["type"])

    lr_scheduler_type = C.get()["lr_schedule"].get("type", "cosine")
    if lr_scheduler_type == "cosine":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=C.get()["epoch"], eta_min=0.0)
    elif lr_scheduler_type == "resnet":
        scheduler = adjust_learning_rate_resnet(optimizer)
    elif lr_scheduler_type == "efficientnet":
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lambda x: 0.97 ** int((x + C.get()["lr_schedule"]["warmup"]["epoch"]) / 2.4))
    else:
        raise ValueError("invalid lr_scheduler=%s" % lr_scheduler_type)

    if C.get()["lr_schedule"].get("warmup", None) and C.get()["lr_schedule"]["warmup"]["epoch"] > 0:
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()["lr_schedule"]["warmup"]["multiplier"],
            total_epoch=C.get()["lr_schedule"]["warmup"]["epoch"],
            after_scheduler=scheduler)

    if not tag or not is_master:
        from FastAutoAugment.metrics import SummaryWriterDummy as SummaryWriter
        logger.warning("tag not provided, no tensorboard log.")
    else:
        from tensorboardX import SummaryWriter
    writers = [SummaryWriter(log_dir="./logs/%s/%s" % (tag, x)) for x in ["train", "valid", "test"]]

    if C.get()["optimizer"]["ema"] > 0.0 and is_master:
        # https://discuss.pytorch.org/t/how-to-apply-exponential-moving-average-decay-for-variables/10856/4?u=ildoonet
        ema = EMA(C.get()["optimizer"]["ema"])
    else:
        ema = None

    result = OrderedDict()
    epoch_start = 1
    if save_path != "test.pth":  # and is_master: --> should load all data (not able to be broadcasted)
        if save_path and os.path.exists(save_path):
            logger.info("%s file found. loading..." % save_path)
            data = torch.load(save_path)
            key = "model" if "model" in data else "state_dict"

            if "epoch" not in data:
                model.load_state_dict(data)
            else:
                logger.info("checkpoint epoch@%d" % data["epoch"])
                if not isinstance(model, (DataParallel, DistributedDataParallel)):
                    model.load_state_dict({k.replace("module.", ""): v
                                           for k, v in data[key].items()})
                else:
                    model.load_state_dict({k if "module." in k else "module." + k: v
                                           for k, v in data[key].items()})
                logger.info("optimizer.load_state_dict+")
                optimizer.load_state_dict(data["optimizer"])
                if data["epoch"] < C.get()["epoch"]:
                    epoch_start = data["epoch"]
                else:
                    only_eval = True
                if ema is not None:
                    ema.shadow = data.get("ema", {}) if isinstance(data.get("ema", {}), dict) \
                        else data["ema"].state_dict()
            del data
        else:
            logger.info('"%s" file not found. skip to pretrain weights...' % save_path)
            if only_eval:
                logger.warning("model checkpoint not found. only-evaluation mode is off.")
                only_eval = False

    if local_rank >= 0:
        for name, x in model.state_dict().items():
            dist.broadcast(x, 0)
        logger.info(f"multinode init. local_rank={dist.get_rank()} is_master={is_master}")
        torch.cuda.synchronize()

    tqdm_disabled = bool(os.environ.get("TASK_NAME", "")) and local_rank != 0  # KakaoBrain Environment

    if only_eval:
        logger.info("evaluation only+")
        model.eval()
        rs = dict()
        rs["train"] = run_epoch(model, trainloader, criterion, None,
                                desc_default="train", epoch=0, writer=writers[0], is_master=is_master)

        with torch.no_grad():
            rs["valid"] = run_epoch(model, validloader, criterion, None,
                                    desc_default="valid", epoch=0, writer=writers[1], is_master=is_master)
            rs["test"] = run_epoch(model, testloader_, criterion, None,
                                   desc_default="*test", epoch=0, writer=writers[2], is_master=is_master)

            if ema is not None and len(ema) > 0:
                model_ema.load_state_dict({k.replace("module.", ""): v
                                           for k, v in ema.state_dict().items()})
                rs["valid"] = run_epoch(model_ema, validloader, criterion_ce, None,
                                        desc_default="valid(EMA)", epoch=0, writer=writers[1],
                                        verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs["test"] = run_epoch(model_ema, testloader_, criterion_ce, None,
                                       desc_default="*test(EMA)", epoch=0, writer=writers[2],
                                       verbose=is_master, tqdm_disabled=tqdm_disabled)

        for key, setname in itertools.product(["loss", "top1", "top5"], ["train", "valid", "test"]):
            if setname not in rs:
                continue
            result["%s_%s" % (key, setname)] = rs[setname][key]
        result["epoch"] = 0
        return result

    # train loop
    best_top1 = 0
    for epoch in range(epoch_start, max_epoch + 1):
        if local_rank >= 0:
            trainsampler.set_epoch(epoch)

        model.train()
        rs = dict()
        rs["train"] = run_epoch(model, trainloader, criterion, optimizer,
                                desc_default="train", epoch=epoch, writer=writers[0],
                                verbose=(is_master and local_rank <= 0), scheduler=scheduler,
                                ema=ema, wd=C.get()["optimizer"]["decay"], tqdm_disabled=tqdm_disabled)
        model.eval()

        if math.isnan(rs["train"]["loss"]):
            raise Exception("train loss is NaN.")

        if ema is not None and C.get()["optimizer"]["ema_interval"] > 0 \
                and epoch % C.get()["optimizer"]["ema_interval"] == 0:
            logger.info(f"ema synced+ rank={dist.get_rank()}")
            if ema is not None:
                model.load_state_dict(ema.state_dict())
            for name, x in model.state_dict().items():
                # print(name)
                dist.broadcast(x, 0)
            torch.cuda.synchronize()
            logger.info(f"ema synced- rank={dist.get_rank()}")

        if is_master and (epoch % evaluation_interval == 0 or epoch == max_epoch):
            with torch.no_grad():
                rs["valid"] = run_epoch(model, validloader, criterion_ce, None,
                                        desc_default="valid", epoch=epoch, writer=writers[1],
                                        verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs["test"] = run_epoch(model, testloader_, criterion_ce, None,
                                       desc_default="*test", epoch=epoch, writer=writers[2],
                                       verbose=is_master, tqdm_disabled=tqdm_disabled)

                if ema is not None:
                    model_ema.load_state_dict({k.replace("module.", ""): v
                                               for k, v in ema.state_dict().items()})
                    rs["valid"] = run_epoch(model_ema, validloader, criterion_ce, None,
                                            desc_default="valid(EMA)", epoch=epoch, writer=writers[1],
                                            verbose=is_master, tqdm_disabled=tqdm_disabled)
                    rs["test"] = run_epoch(model_ema, testloader_, criterion_ce, None,
                                           desc_default="*test(EMA)", epoch=epoch, writer=writers[2],
                                           verbose=is_master, tqdm_disabled=tqdm_disabled)

            logger.info(
                f"epoch={epoch} "
                f'[train] loss={rs["train"]["loss"]:.4f} top1={rs["train"]["top1"]:.4f} '
                f'[valid] loss={rs["valid"]["loss"]:.4f} top1={rs["valid"]["top1"]:.4f} '
                f'[test] loss={rs["test"]["loss"]:.4f} top1={rs["test"]["top1"]:.4f} '
            )

            if metric == "last" or rs[metric]["top1"] > best_top1:
                if metric != "last":
                    best_top1 = rs[metric]["top1"]
                for key, setname in itertools.product(["loss", "top1", "top5"],
                                                      ["train", "valid", "test"]):
                    result["%s_%s" % (key, setname)] = rs[setname][key]
                result["epoch"] = epoch

                writers[1].add_scalar("valid_top1/best", rs["valid"]["top1"], epoch)
                writers[2].add_scalar("test_top1/best", rs["test"]["top1"], epoch)

                reporter(loss_valid=rs["valid"]["loss"], top1_valid=rs["valid"]["top1"],
                         loss_test=rs["test"]["loss"], top1_test=rs["test"]["top1"])

                # save checkpoint
                if is_master and save_path:
                    logger.info("save model@%d to %s, err=%.4f" % (epoch, save_path, 1 - best_top1))
                    torch.save({
                        "epoch": epoch,
                        "log": {
                            "train": rs["train"].get_dict(),
                            "valid": rs["valid"].get_dict(),
                            "test": rs["test"].get_dict(),
                        },
                        "optimizer": optimizer.state_dict(),
                        "model": model.state_dict(),
                        "ema": ema.state_dict() if ema is not None else None,
                    }, save_path)

    del model
    result["top1_test"] = best_top1
    return result
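
# Both train_and_eval variants wrap the base scheduler in GradualWarmupScheduler.
# A minimal sketch of the assumed behavior (ildoonet's gradual-warmup convention:
# the LR ramps linearly from base_lr to base_lr * multiplier over total_epoch epochs,
# after which the wrapped after_scheduler takes over). Illustrative only.
def warmup_lr_sketch(base_lr, multiplier, total_epoch, epoch):
    if epoch >= total_epoch:
        return None  # past the warmup window: defer to after_scheduler
    return base_lr * (1.0 + (multiplier - 1.0) * epoch / total_epoch)
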
    parser.add_argument('--smoke-test', action='store_true')  # ?
    parser.add_argument('--remote', action='store_true', help='whether to use distributed training')
    args = parser.parse_args()
    # print('args is ', args)
    # sys.exit(0)

    # if the command line provides a valid decay, use it
    if args.decay > 0:
        logger.info('decay=%.4f' % args.decay)
        C.get()['optimizer']['decay'] = args.decay
    add_filehandler(logger, os.path.join(
        'models',
        '%s_%s_cv%.1f.log' % (C.get()['dataset'], C.get()['model']['type'],
                              args.cv_ratio)))  # add a file handler to the logger

    logger.info('configuration...')
    logger.info(json.dumps(C.get().conf, sort_keys=True, indent=4))  # print config
    logger.info('initialize ray...')
    # sys.exit(0)
    if args.remote:
        ray.init(redis_address=args.redis)  # start distributed mode

    num_result_per_cv = 10  # ? probably the cross-validation one
    # cv_num = 5  # ? probably the cross-validation one
    cv_num = 1  # ! temp change
    copied_c = copy.deepcopy(C.get().conf)  # make a copy of the config
def prepare() -> argparse.Namespace:
    parser = ConfigArgumentParser(conflict_handler='resolve')
    # parser.add_argument('--dataroot', type=str, default='~/datasets', help='torchvision data folder')
    parser.add_argument('--until', type=int, default=5)
    parser.add_argument('--num_fold', type=int, default=5)
    parser.add_argument('--num_result_per_fold', type=int, default=10)
    parser.add_argument('--num_op', type=int, default=2)
    parser.add_argument('--num_policy', type=int, default=5)
    parser.add_argument('--num_search', type=int, default=200)
    parser.add_argument('--retrain_times', type=int, default=5)
    parser.add_argument('--cv_ratio', type=float, default=0.4)
    parser.add_argument('--decay', type=float, default=-1)
    parser.add_argument('--redis', type=str, default='')
    # parser.add_argument('--per_class', action='store_true')
    parser.add_argument('--resume', action='store_true')
    parser.add_argument('--smoke_test', action='store_true')
    args: argparse.Namespace = parser.parse_args()

    add_filehandler(logger, '%s_%s_cv%.1f.log' % (
        Config.get()['dataset'], Config.get()['model']['type'], args.cv_ratio))
    logger.info('args type: %s' % str(type(args)))

    global EXEC_ROOT, MODEL_ROOT, MODEL_PATHS, DATASET_ROOT
    EXEC_ROOT = os.getcwd()  # fast-autoaugment/experiments/xxx
    logger.info('EXEC_ROOT: %s' % EXEC_ROOT)
    MODEL_ROOT = os.path.join(EXEC_ROOT, 'models')  # fast-autoaugment/experiments/xxx/models
    logger.info('MODEL_ROOT: %s' % MODEL_ROOT)
    DATASET_ROOT = os.path.abspath(os.path.join(
        os.path.expanduser('~'), 'datasets',
        Config.get()['dataset'].lower()))  # ~/datasets/cifar10
    logger.info('DATASET_ROOT: %s' % DATASET_ROOT)
    _check_directory(MODEL_ROOT)
    _check_directory(DATASET_ROOT)

    MODEL_PATHS = [
        _get_model_path(
            dataset=Config.get()['dataset'],
            model=Config.get()['model']['type'],
            config='ratio%.1f_fold%d' % (args.cv_ratio, i)  # without_aug
        ) for i in range(args.num_fold)
    ]
    print('MODEL_PATHS:', MODEL_PATHS)
    logger.info('MODEL_PATHS: %s' % MODEL_PATHS)

    if args.decay > 0:
        logger.info('decay=%.4f' % args.decay)
        Config.get()['optimizer']['decay'] = args.decay

    logger.info('configuration...')
    logger.info(json.dumps(Config.get().conf, sort_keys=True, indent=4))

    logger.info('initialize ray...')
    # ray.init(redis_address=args.redis)
    address_info = ray.init(include_webui=True)
    logger.info('ray initialization: address information:')
    logger.info(str(address_info))

    logger.info('start searching augmentation policies, dataset=%s model=%s'
                % (Config.get()['dataset'], Config.get()['model']['type']))
    return args
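
# _check_directory is called above but not shown here. A minimal sketch under the
# assumption that it simply ensures the directory exists (the name and behavior are
# inferred, not confirmed by this file):
def _check_directory_sketch(path):
    os.makedirs(path, exist_ok=True)
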
    def __init__(self, args=None, paths_ls=None):
        if args is None:
            d = yaml.load(
                open('/home/noam/ZazuML/augmentations_tuner/fastautoaugment/confs/resnet50.yaml'),
                Loader=yaml.FullLoader)
            from argparse import Namespace
            args = Namespace(**d)
        args.redis = 'gpu-cloud-vnode30.dakao.io:23655'
        args.per_class = True
        args.resume = True
        args.smoke_test = True
        if args.decay > 0:
            logger.info('decay=%.4f' % args.decay)
            C.get()['optimizer']['decay'] = args.decay
        add_filehandler(logger, os.path.join(
            'FastAutoAugment/models',
            '%s_%s_cv%.1f.log' % (C.get()['dataset'], C.get()['model']['type'], args.cv_ratio)))
        logger.info('initialize ray...')
        ray.init(num_cpus=1, num_gpus=1)

        num_result_per_cv = 10 if not args.smoke_test else 2
        cv_num = 5 if paths_ls is None else len(paths_ls)
        args.version = 1
        args._timestamp = '2020/08/30 20:40:10'
        args.config = '/home/noam/ZazuML/augmentations_tuner/fastautoaugment/confs/resnet50.yaml'
        copied_c = copy.deepcopy(args)
        self.copied_c = copied_c

        logger.info('search augmentation policies, dataset=%s model=%s'
                    % (C.get()['dataset'], C.get()['model']['type']))
        logger.info('----- Train without Augmentations ratio(test)=%.1f -----' % (args.cv_ratio))
        w.start(tag='train_no_aug')
        if paths_ls is None:
            paths_ls = [
                _get_path(C.get()['dataset'], C.get()['model']['type'],
                          'ratio%.1f_fold%d' % (args.cv_ratio, i))
                for i in range(cv_num)
            ]
        print(paths_ls)
        logger.info('getting results...')
        pretrain_results = [
            train_model(copy.deepcopy(copied_c), args.dataroot, C.get()['aug'], args.cv_ratio, i,
                        save_path=paths_ls[i], skip_exist=args.smoke_test)
            for i in range(cv_num)
        ]
        for r_model, r_cv, r_dict in pretrain_results:
            logger.info('model=%s cv=%d top1_train=%.4f top1_valid=%.4f'
                        % (r_model, r_cv + 1, r_dict['top1_train'], r_dict['top1_valid']))
        logger.info('processed in %.4f secs' % w.pause('train_no_aug'))

        if args.until == 1:
            sys.exit(0)

        logger.info('----- Search Test-Time Augmentation Policies -----')
        w.start(tag='search')
        ops = augment_list(False)
        space = {}
        for i in range(args.num_policy):
            for j in range(args.num_op):
                space['policy_%d_%d' % (i, j)] = hp.choice('policy_%d_%d' % (i, j),
                                                           list(range(0, len(ops))))
                space['prob_%d_%d' % (i, j)] = hp.uniform('prob_%d_%d' % (i, j), 0.0, 1.0)
                space['level_%d_%d' % (i, j)] = hp.uniform('level_%d_%d' % (i, j), 0.0, 1.0)

        def eval_t(augs):
            print(augs)
            return eval_tta(copy.deepcopy(copied_c), augs)

        final_policy_set = []
        total_computation = 0
        reward_attr = 'top1_valid'  # top1_valid or minus_loss
        for _ in range(1):  # run multiple times.
            for cv_fold in range(cv_num):
                name = "search_%s_%s_fold%d_ratio%.1f" % (
                    C.get()['dataset'], C.get()['model']['type'], cv_fold, args.cv_ratio)
                print(name)
                algo = HyperOptSearch(space, max_concurrent=1, metric=reward_attr)
                aug_config = {
                    'dataroot': args.dataroot,
                    'save_path': paths_ls[cv_fold],
                    'cv_ratio_test': args.cv_ratio,
                    'cv_fold': cv_fold,
                    'num_op': args.num_op,
                    'num_policy': args.num_policy,
                }
                num_samples = 4 if args.smoke_test else args.num_search
                print(aug_config)
                eval_t(aug_config)
                results = run(eval_t, search_alg=algo, config=aug_config,
                              num_samples=num_samples, resources_per_trial={'gpu': 1},
                              stop={'training_iteration': args.num_policy})
                dataframe = results.dataframe().sort_values(reward_attr, ascending=False)
                total_computation += dataframe['elapsed_time'].sum()  # accumulate across folds
                for i in range(num_result_per_cv):
                    config_dict = dataframe.loc[i].filter(like='config').to_dict()
                    new_keys = [x.replace('config/', '') for x in config_dict.keys()]
                    new_config_dict = {}
                    for key in new_keys:
                        new_config_dict[key] = config_dict['config/' + key]
                    final_policy = policy_decoder(new_config_dict, args.num_policy, args.num_op)
                    logger.info('loss=%.12f top1_valid=%.4f %s'
                                % (dataframe.loc[i]['minus_loss'].item(),
                                   dataframe.loc[i]['top1_valid'].item(), final_policy))

                    final_policy = remove_deplicates(final_policy)
                    final_policy_set.extend(final_policy)

        logger.info(json.dumps(final_policy_set))
        logger.info('final_policy=%d' % len(final_policy_set))
        logger.info('processed in %.4f secs, gpu hours=%.4f'
                    % (w.pause('search'), total_computation / 3600.))
        logger.info('----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----'
                    % (C.get()['model']['type'], C.get()['dataset'], C.get()['aug'], args.cv_ratio))
        w.start(tag='train_aug')
        self.final_policy_set = final_policy_set
        self.args = args
        self.paths_ls = paths_ls
def search(args, paths=None):
    args.redis = 'gpu-cloud-vnode30.dakao.io:23655'
    args.per_class = True
    args.resume = True
    args.smoke_test = True
    if args.decay > 0:
        logger.info('decay=%.4f' % args.decay)
        C.get()['optimizer']['decay'] = args.decay
    add_filehandler(logger, os.path.join(
        'FastAutoAugment/models',
        '%s_%s_cv%.1f.log' % (C.get()['dataset'], C.get()['model']['type'], args.cv_ratio)))
    logger.info('configuration...')
    logger.info(json.dumps(C.get().conf, sort_keys=True, indent=4))
    logger.info('initialize ray...')
    ray.init(num_cpus=1, num_gpus=1)

    num_result_per_cv = 10 if not args.smoke_test else 2
    cv_num = 5
    copied_c = copy.deepcopy(C.get().conf)

    logger.info('search augmentation policies, dataset=%s model=%s'
                % (C.get()['dataset'], C.get()['model']['type']))
    logger.info('----- Train without Augmentations cv=%d ratio(test)=%.1f -----'
                % (cv_num, args.cv_ratio))
    w.start(tag='train_no_aug')
    if paths is None:
        paths = [
            _get_path(C.get()['dataset'], C.get()['model']['type'],
                      'ratio%.1f_fold%d' % (args.cv_ratio, i))
            for i in range(cv_num)
        ]
    print(paths)
    tqdm_epoch = tqdm(range(C.get()['epoch']))

    logger.info('getting results...')
    # pretrain_results = [
    #     train_model(copy.deepcopy(copied_c), args.dataroot, C.get()['aug'], args.cv_ratio, i,
    #                 save_path=paths[i], skip_exist=True) for i in range(cv_num)]
    pretrain_results = [
        train_model(copy.deepcopy(copied_c), args.dataroot, C.get()['aug'], args.cv_ratio, i,
                    save_path=paths[i])
        for i in range(cv_num)
    ]
    for r_model, r_cv, r_dict in pretrain_results:
        logger.info('model=%s cv=%d top1_train=%.4f top1_valid=%.4f'
                    % (r_model, r_cv + 1, r_dict['top1_train'], r_dict['top1_valid']))
    logger.info('processed in %.4f secs' % w.pause('train_no_aug'))

    if args.until == 1:
        sys.exit(0)

    logger.info('----- Search Test-Time Augmentation Policies -----')
    w.start(tag='search')
    ops = augment_list(False)
    space = {}
    for i in range(args.num_policy):
        for j in range(args.num_op):
            space['policy_%d_%d' % (i, j)] = hp.choice('policy_%d_%d' % (i, j),
                                                       list(range(0, len(ops))))
            space['prob_%d_%d' % (i, j)] = hp.uniform('prob_%d_%d' % (i, j), 0.0, 1.0)
            space['level_%d_%d' % (i, j)] = hp.uniform('level_%d_%d' % (i, j), 0.0, 1.0)

    def eval_t(augs):
        print(augs)
        return eval_tta(copy.deepcopy(copied_c), augs)

    final_policy_set = []
    total_computation = 0
    reward_attr = 'top1_valid'  # top1_valid or minus_loss
    for _ in range(1):  # run multiple times.
        for cv_fold in range(cv_num):
            name = "search_%s_%s_fold%d_ratio%.1f" % (
                C.get()['dataset'], C.get()['model']['type'], cv_fold, args.cv_ratio)
            print(name)
            algo = HyperOptSearch(space, max_concurrent=1, metric=reward_attr)
            aug_config = {
                'dataroot': args.dataroot,
                'save_path': paths[cv_fold],
                'cv_ratio_test': args.cv_ratio,
                'cv_fold': cv_fold,
                'num_op': args.num_op,
                'num_policy': args.num_policy,
            }
            num_samples = 4 if args.smoke_test else args.num_search
            print(aug_config)
            eval_t(aug_config)
            results = run(eval_t, search_alg=algo, config=aug_config,
                          num_samples=num_samples, resources_per_trial={'gpu': 1},
                          stop={'training_iteration': args.num_policy})
            dataframe = results.dataframe().sort_values(reward_attr, ascending=False)
            total_computation += dataframe['elapsed_time'].sum()  # accumulate across folds
            for i in range(num_result_per_cv):
                config_dict = dataframe.loc[i].filter(like='config').to_dict()
                new_keys = [x.replace('config/', '') for x in config_dict.keys()]
                new_config_dict = {}
                for key in new_keys:
                    new_config_dict[key] = config_dict['config/' + key]
                final_policy = policy_decoder(new_config_dict, args.num_policy, args.num_op)
                logger.info('loss=%.12f top1_valid=%.4f %s'
                            % (dataframe.loc[i]['minus_loss'].item(),
                               dataframe.loc[i]['top1_valid'].item(), final_policy))

                final_policy = remove_deplicates(final_policy)
                final_policy_set.extend(final_policy)

    logger.info(json.dumps(final_policy_set))
    logger.info('final_policy=%d' % len(final_policy_set))
    logger.info('processed in %.4f secs, gpu hours=%.4f'
                % (w.pause('search'), total_computation / 3600.))
    logger.info('----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----'
                % (C.get()['model']['type'], C.get()['dataset'], C.get()['aug'], args.cv_ratio))
    w.start(tag='train_aug')

    num_experiments = 5
    default_path = [
        _get_path(C.get()['dataset'], C.get()['model']['type'],
                  'ratio%.1f_default%d' % (args.cv_ratio, _))
        for _ in range(num_experiments)
    ]
    augment_path = [
        _get_path(C.get()['dataset'], C.get()['model']['type'],
                  'ratio%.1f_augment%d' % (args.cv_ratio, _))
        for _ in range(num_experiments)
    ]

    tqdm_epoch = tqdm(range(C.get()['epoch']))
    is_done = False
    for epoch in tqdm_epoch:
        while True:
            epochs = OrderedDict()
            for exp_idx in range(num_experiments):
                try:
                    if os.path.exists(default_path[exp_idx]):
                        latest_ckpt = torch.load(default_path[exp_idx])
                        if 'epoch' not in latest_ckpt:
                            epochs['default_exp%d' % (exp_idx + 1)] = C.get()['epoch']
                        else:
                            epochs['default_exp%d' % (exp_idx + 1)] = latest_ckpt['epoch']
                except Exception:
                    pass
                try:
                    if os.path.exists(augment_path[exp_idx]):
                        latest_ckpt = torch.load(augment_path[exp_idx])
                        if 'epoch' not in latest_ckpt:
                            epochs['augment_exp%d' % (exp_idx + 1)] = C.get()['epoch']
                        else:
                            epochs['augment_exp%d' % (exp_idx + 1)] = latest_ckpt['epoch']
                except Exception:
                    pass

            tqdm_epoch.set_postfix(epochs)
            if len(epochs) == num_experiments * 2 and min(epochs.values()) >= C.get()['epoch']:
                is_done = True
            if len(epochs) == num_experiments * 2 and min(epochs.values()) >= epoch:
                break
            time.sleep(10)  # time.sleep takes only a duration; the extra message argument was a bug
        if is_done:
            break

    logger.info('getting results...')
    final_results = [train_model(copy.deepcopy(copied_c), args.dataroot, C.get()['aug'], 0.0, 0,
                                 save_path=default_path[_], skip_exist=True)
                     for _ in range(num_experiments)] + \
                    [train_model(copy.deepcopy(copied_c), args.dataroot, final_policy_set, 0.0, 0,
                                 save_path=augment_path[_])
                     for _ in range(num_experiments)]

    for train_mode in ['default', 'augment']:
        avg = 0.
        for _ in range(num_experiments):
            r_model, r_cv, r_dict = final_results.pop(0)
            logger.info('[%s] top1_train=%.4f top1_test=%.4f'
                        % (train_mode, r_dict['top1_train'], r_dict['top1_test']))
            avg += r_dict['top1_test']
        avg /= num_experiments
        logger.info('[%s] top1_test average=%.4f (#experiments=%d)'
                    % (train_mode, avg, num_experiments))
    logger.info('processed in %.4f secs' % w.pause('train_aug'))

    logger.info(w)