def eval_tta3(config, augment, reporter):
    """Evaluate one candidate augmentation policy on a (cv_id, gr_id) split.

    Installs the trial ``config`` and the decoded policy into the global
    config singleton, restores the pretrained model from
    ``augment['save_path']``, runs a single pass over the post-dataloader,
    and reports mean loss / top-1 accuracy through ``reporter``.

    Returns:
        Mean top-1 accuracy over the evaluated split.
    """
    C.get()
    C.get().conf = config  # swap this trial's configuration into the global singleton
    save_path = augment['save_path']
    cv_id, gr_id = augment["cv_id"], augment["gr_id"]
    gr_ids = augment["gr_ids"]

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])  # full training checkpoint
    else:
        model.load_state_dict(ckpt)  # bare state_dict
    del ckpt  # free checkpoint memory before evaluation
    model.eval()

    loader = get_post_dataloader(C.get()["dataset"], C.get()['batch'], augment["dataroot"], augment['cv_ratio_test'], cv_id, gr_id, gr_ids)

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')  # per-sample losses
    for data, label in loader:
        data = data.cuda()
        label = label.cuda()

        pred = model(data)
        loss = loss_fn(pred, label)  # (N)

        _, pred = pred.topk(1, 1, True, True)
        pred = pred.t()
        correct = pred.eq(label.view(1, -1).expand_as(pred)).detach().cpu().numpy()  # (1,N)

        metrics.add_dict({
            'loss': np.sum(loss.detach().cpu().numpy()),
            'correct': np.sum(correct),
            'cnt': len(data)
        })
        del loss, correct, pred, data, label  # release GPU tensors eagerly
    del model, loader

    metrics = metrics / 'cnt'  # convert accumulated sums into per-sample means
    # wall-clock scaled by visible device count: approximate GPU-seconds used
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(loss=metrics['loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    return metrics['correct']
def eval_tta2(config, augment, reporter):
    """Evaluate one candidate augmentation policy for a (cv_id, gr_id) group.

    Installs the trial ``config`` and the decoded policy into the global
    config singleton, restores the pretrained model from
    ``augment['save_path']``, and measures per-sample loss / top-1 accuracy
    over ``num_repeat`` validation loader passes.  Results go through the
    ray-tune style ``reporter`` callback.

    Returns:
        Mean top-1 accuracy on the validation split.
    """
    C.get()
    C.get().conf = config  # swap this trial's configuration into the global singleton
    cv_ratio_test, cv_id, save_path = augment['cv_ratio_test'], augment['cv_id'], augment['save_path']
    gr_id = augment["gr_id"]
    num_repeat = 1  # number of full validation passes to accumulate

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])  # full training checkpoint
    else:
        model.load_state_dict(ckpt)  # bare state_dict
    model.eval()

    loaders = []
    for i in range(num_repeat):
        _, tl, validloader, tl2 = get_dataloaders(C.get()['dataset'], C.get()['batch'], augment['dataroot'], cv_ratio_test, split_idx=cv_id, gr_assign=augment["gr_assign"], gr_id=gr_id)
        loaders.append(validloader)
        del tl, tl2  # only the validation loader is needed

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')  # per-sample losses
    for loader in loaders:
        for data, label in loader:
            data = data.cuda()
            label = label.cuda()

            pred = model(data)
            loss = loss_fn(pred, label)  # (N)

            _, pred = pred.topk(1, 1, True, True)
            pred = pred.t()
            correct = pred.eq(label.view(1, -1).expand_as(pred)).detach().cpu().numpy()  # (1,N)

            metrics.add_dict({
                'minus_loss': -1 * np.sum(loss.detach().cpu().numpy()),
                'correct': np.sum(correct),
                'cnt': len(data)
            })
            del loss, correct, pred, data, label  # release GPU tensors eagerly
    del model

    metrics = metrics / 'cnt'  # convert accumulated sums into per-sample means
    # wall-clock scaled by visible device count: approximate GPU-seconds used
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    return metrics['correct']
def eval_tta(config, augment, reporter, num_class, get_model, get_dataloaders):
    """Evaluate one candidate TTA policy set on the validation split.

    Loads the pretrained fold model from ``augment['save_path']``, builds one
    validation loader per policy (the decoded policy is installed in the
    global config before the loaders are built, so each loader presumably
    applies it as test-time augmentation — TODO confirm in get_dataloaders),
    then walks all loaders in lockstep.  For every sample the minimum loss
    and the maximum correctness over the ``num_policy`` augmented views are
    accumulated, i.e. a sample counts as correct if ANY view is classified
    correctly.

    Bug fix: per-batch loss vectors are now reshaped to (1, N) before
    concatenation, matching the (1, N) shape of ``correct``.  Previously the
    1-D (N,) vectors were concatenated into a flat (num_policy * N,) array,
    so ``np.min(..., axis=0)`` collapsed everything into a single scalar
    instead of the per-sample minimum across policies (the sibling variant
    of this function already handles shapes this way).

    Returns:
        Mean top-1 accuracy (best-over-policies) on the validation split.
    """
    C.get()
    C.get().conf = config  # install this trial's configuration globally

    cv_ratio_test, cv_fold, save_path = (
        augment["cv_ratio_test"],
        augment["cv_fold"],
        augment["save_path"],
    )

    # setup - provided augmentation rules
    C.get()["aug"] = policy_decoder(augment, augment["num_policy"], augment["num_op"])

    # eval
    model = get_model(C.get()["model"], num_class(C.get()["dataset"]))
    ckpt = torch.load(save_path)
    if "model" in ckpt:
        model.load_state_dict(ckpt["model"])  # full training checkpoint
    else:
        model.load_state_dict(ckpt)  # bare state_dict
    model.eval()

    # One independent validation loader per policy; iterated in lockstep below.
    loaders = []
    for _ in range(augment["num_policy"]):  # TODO
        _, tl, validloader, tl2 = get_dataloaders(
            C.get()["dataset"],
            C.get()["batch"],
            augment["dataroot"],
            cv_ratio_test,
            split_idx=cv_fold,
        )
        loaders.append(iter(validloader))
        del tl, tl2

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction="none")  # per-sample losses
    try:
        while True:
            losses = []
            corrects = []
            for loader in loaders:
                data, label = next(loader)  # StopIteration ends the outer loop
                data = data.cuda()
                label = label.cuda()

                pred = model(data)
                loss = loss_fn(pred, label)
                # reshape to (1, N) so concatenation stacks policies on axis 0
                losses.append(loss.detach().cpu().numpy().reshape(1, -1))

                _, pred = pred.topk(1, 1, True, True)
                pred = pred.t()
                correct = (
                    pred.eq(label.view(1, -1).expand_as(pred)).detach().cpu().numpy()
                )  # (1, N)
                corrects.append(correct)
                del loss, correct, pred, data, label  # release GPU tensors eagerly

            losses = np.concatenate(losses)                    # (num_policy, N)
            losses_min = np.min(losses, axis=0).squeeze()      # (N,) per-sample best loss
            corrects = np.concatenate(corrects)                # (num_policy, N)
            corrects_max = np.max(corrects, axis=0).squeeze()  # (N,) best-over-policies
            metrics.add_dict(
                {
                    "minus_loss": -1 * np.sum(losses_min),
                    "correct": np.sum(corrects_max),
                    "cnt": len(corrects_max),
                }
            )
            del corrects, corrects_max
    except StopIteration:
        pass

    del model
    metrics = metrics / "cnt"  # convert accumulated sums into per-sample means
    # wall-clock scaled by visible device count: approximate GPU-seconds used
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(
        minus_loss=metrics["minus_loss"],
        top1_valid=metrics["correct"],
        elapsed_time=gpu_secs,
        done=True,
    )
    return metrics["correct"]
def main():
    """End-to-end augmentation-policy search pipeline.

    Phase 1: train ``cv_num`` fold models without augmentation via ray remote
    workers, polling their checkpoint files until every fold reaches the
    configured epoch count.  Phase 2: search test-time augmentation policies
    per fold with Ray Tune + HyperOpt, keeping the best ``num_result_per_cv``
    policies of each fold.  Phase 3: retrain with and without the collected
    policy set and log the final test accuracies.
    """
    w = PyStopwatch()

    # ---- command-line arguments ----
    parser = ConfigArgumentParser(conflict_handler="resolve")
    parser.add_argument(
        "--dataroot",
        type=str,
        default="/data/private/pretrainedmodels",
        help="torchvision data folder",
    )
    parser.add_argument("--until", type=int, default=5)
    parser.add_argument("--num-op", type=int, default=2)
    parser.add_argument("--num-policy", type=int, default=5)
    parser.add_argument("--num-search", type=int, default=200)
    parser.add_argument("--cv-ratio", type=float, default=0.4)
    parser.add_argument("--decay", type=float, default=-1)
    parser.add_argument("--redis", type=str, default="gpu-cloud-vnode30.dakao.io:23655")
    parser.add_argument("--per-class", action="store_true")
    parser.add_argument("--resume", action="store_true")
    parser.add_argument("--smoke-test", action="store_true")
    args = parser.parse_args()

    if args.decay > 0:
        # positive --decay overrides the weight decay from the loaded config
        logger.info("decay=%.4f" % args.decay)
        C.get()["optimizer"]["decay"] = args.decay
    add_filehandler(
        logger,
        os.path.join(
            "models",
            "%s_%s_cv%.1f.log"
            % (C.get()["dataset"], C.get()["model"]["type"], args.cv_ratio),
        ),
    )
    logger.info("configuration...")
    logger.info(json.dumps(C.get().conf, sort_keys=True, indent=4))
    logger.info("initialize ray...")
    ray.init(address=args.redis)  # attach to an existing ray cluster

    num_result_per_cv = 10  # best policies kept per fold
    cv_num = 5              # number of cross-validation folds
    copied_c = copy.deepcopy(C.get().conf)

    # ---- Phase 1: train fold models without augmentation ----
    logger.info(
        "search augmentation policies, dataset=%s model=%s"
        % (C.get()["dataset"], C.get()["model"]["type"])
    )
    logger.info(
        "----- Train without Augmentations cv=%d ratio(test)=%.1f -----"
        % (cv_num, args.cv_ratio)
    )
    w.start(tag="train_no_aug")
    paths = [
        _get_path(
            C.get()["dataset"],
            C.get()["model"]["type"],
            "ratio%.1f_fold%d" % (args.cv_ratio, i),
        )
        for i in range(cv_num)
    ]
    print(paths)
    # launch one remote training task per fold; skip_exist reuses finished runs
    reqs = [
        train_model.remote(
            copy.deepcopy(copied_c),
            args.dataroot,
            C.get()["aug"],
            args.cv_ratio,
            i,
            save_path=paths[i],
            skip_exist=True,
        )
        for i in range(cv_num)
    ]

    # Poll checkpoint files to track training progress of the remote workers.
    tqdm_epoch = tqdm(range(C.get()["epoch"]))
    is_done = False
    for epoch in tqdm_epoch:
        while True:
            epochs_per_cv = OrderedDict()
            for cv_idx in range(cv_num):
                try:
                    latest_ckpt = torch.load(paths[cv_idx])
                    if "epoch" not in latest_ckpt:
                        # old-format checkpoint: treat as fully trained
                        epochs_per_cv["cv%d" % (cv_idx + 1)] = C.get()["epoch"]
                        continue
                    epochs_per_cv["cv%d" % (cv_idx + 1)] = latest_ckpt["epoch"]
                except Exception as e:
                    # checkpoint missing or mid-write; retry on next poll
                    continue
            tqdm_epoch.set_postfix(epochs_per_cv)
            if (
                len(epochs_per_cv) == cv_num
                and min(epochs_per_cv.values()) >= C.get()["epoch"]
            ):
                is_done = True
            if len(epochs_per_cv) == cv_num and min(epochs_per_cv.values()) >= epoch:
                break
            time.sleep(10)
        if is_done:
            break

    logger.info("getting results...")
    pretrain_results = ray.get(reqs)
    for r_model, r_cv, r_dict in pretrain_results:
        logger.info(
            "model=%s cv=%d top1_train=%.4f top1_valid=%.4f"
            % (r_model, r_cv + 1, r_dict["top1_train"], r_dict["top1_valid"])
        )
    logger.info("processed in %.4f secs" % w.pause("train_no_aug"))
    if args.until == 1:
        sys.exit(0)

    # ---- Phase 2: hyperopt search over augmentation policies ----
    logger.info("----- Search Test-Time Augmentation Policies -----")
    w.start(tag="search")

    ops = augment_list(False)
    space = {}
    # one (op, prob, level) triple per policy slot
    for i in range(args.num_policy):
        for j in range(args.num_op):
            space["policy_%d_%d" % (i, j)] = hp.choice(
                "policy_%d_%d" % (i, j), list(range(0, len(ops)))
            )
            space["prob_%d_%d" % (i, j)] = hp.uniform("prob_%d_ %d" % (i, j), 0.0, 1.0)
            space["level_%d_%d" % (i, j)] = hp.uniform(
                "level_%d_ %d" % (i, j), 0.0, 1.0
            )

    final_policy_set = []
    total_computation = 0
    reward_attr = "top1_valid"  # top1_valid or minus_loss
    for _ in range(1):  # run multiple times.
        for cv_fold in range(cv_num):
            name = "search_%s_%s_fold%d_ratio%.1f" % (
                C.get()["dataset"],
                C.get()["model"]["type"],
                cv_fold,
                args.cv_ratio,
            )
            print(name)

            # def train(augs, rpt):
            def train(config, reporter):
                # trainable: evaluate one sampled policy on this fold's model
                return eval_tta(
                    copy.deepcopy(copied_c), config, reporter, num_class, get_model, get_dataloaders
                )

            register_trainable(name, train)
            algo = HyperOptSearch(
                space, max_concurrent=4 * 20, metric=reward_attr, mode="max"
            )

            results = run(
                train,
                name=name,
                config={
                    "dataroot": args.dataroot,
                    "save_path": paths[cv_fold],
                    "cv_ratio_test": args.cv_ratio,
                    "cv_fold": cv_fold,
                    "num_op": args.num_op,
                    "num_policy": args.num_policy,
                },
                num_samples=4 if args.smoke_test else args.num_search,
                resources_per_trial={"gpu": 1},
                stop={"training_iteration": args.num_policy},
                search_alg=algo,
                scheduler=None,
                verbose=0,
                queue_trials=True,
                resume=args.resume,
                raise_on_failed_trial=False,
            )
            print()
            df = results.results_df
            import pickle

            # persist raw tune results for offline inspection
            with open("results.pickle", "wb") as fp:
                pickle.dump(results, fp)
            df.to_csv("df.csv")
            results = df.sort_values(by=reward_attr, ascending=False)
            # results = [x for x in results if x.last_result is not None]
            # results = sorted(results, key=lambda x: x.last_result[reward_attr], reverse=True)

            # calculate computation usage
            for _, result in results.iterrows():
                total_computation += result["elapsed_time"]

            # keep the top policies of this fold
            for _, result in results.iloc[:num_result_per_cv].iterrows():
                final_policy = policy_decoder(
                    result, args.num_policy, args.num_op, prefix="config."
                )
                logger.info(
                    "loss=%.12f top1_valid=%.4f %s"
                    % (result["minus_loss"], result["top1_valid"], final_policy)
                )

                final_policy = remove_deplicates(final_policy)
                final_policy_set.extend(final_policy)

    logger.info(json.dumps(final_policy_set))
    logger.info("final_policy=%d" % len(final_policy_set))
    logger.info(
        "processed in %.4f secs, gpu hours=%.4f"
        % (w.pause("search"), total_computation / 3600.0)
    )

    # ---- Phase 3: retrain with / without the found policy set ----
    logger.info(
        "----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----"
        % (C.get()["model"]["type"], C.get()["dataset"], C.get()["aug"], args.cv_ratio)
    )
    w.start(tag="train_aug")

    num_experiments = 5
    default_path = [
        _get_path(
            C.get()["dataset"],
            C.get()["model"]["type"],
            "ratio%.1f_default%d" % (args.cv_ratio, _),
        )
        for _ in range(num_experiments)
    ]
    augment_path = [
        _get_path(
            C.get()["dataset"],
            C.get()["model"]["type"],
            "ratio%.1f_augment%d" % (args.cv_ratio, _),
        )
        for _ in range(num_experiments)
    ]
    # baseline runs (existing checkpoints reused) followed by augmented runs
    reqs = [
        train_model.remote(
            copy.deepcopy(copied_c),
            args.dataroot,
            C.get()["aug"],
            0.0,
            0,
            save_path=default_path[_],
            skip_exist=True,
        )
        for _ in range(num_experiments)
    ] + [
        train_model.remote(
            copy.deepcopy(copied_c),
            args.dataroot,
            final_policy_set,
            0.0,
            0,
            save_path=augment_path[_],
        )
        for _ in range(num_experiments)
    ]

    # Poll checkpoints of all 2*num_experiments runs until completion.
    tqdm_epoch = tqdm(range(C.get()["epoch"]))
    is_done = False
    for epoch in tqdm_epoch:
        while True:
            epochs = OrderedDict()
            for exp_idx in range(num_experiments):
                try:
                    if os.path.exists(default_path[exp_idx]):
                        latest_ckpt = torch.load(default_path[exp_idx])
                        epochs["default_exp%d" % (exp_idx + 1)] = latest_ckpt["epoch"]
                except:
                    pass
                try:
                    if os.path.exists(augment_path[exp_idx]):
                        latest_ckpt = torch.load(augment_path[exp_idx])
                        epochs["augment_exp%d" % (exp_idx + 1)] = latest_ckpt["epoch"]
                except:
                    pass

            tqdm_epoch.set_postfix(epochs)
            if (
                len(epochs) == num_experiments * 2
                and min(epochs.values()) >= C.get()["epoch"]
            ):
                is_done = True
            if len(epochs) == num_experiments * 2 and min(epochs.values()) >= epoch:
                break
            time.sleep(10)
        if is_done:
            break

    logger.info("getting results...")
    final_results = ray.get(reqs)

    # first num_experiments results are 'default', the rest 'augment'
    for train_mode in ["default", "augment"]:
        avg = 0.0
        for _ in range(num_experiments):
            r_model, r_cv, r_dict = final_results.pop(0)
            logger.info(
                "[%s] top1_train=%.4f top1_test=%.4f"
                % (train_mode, r_dict["top1_train"], r_dict["top1_test"])
            )
            avg += r_dict["top1_test"]
        avg /= num_experiments
        logger.info(
            "[%s] top1_test average=%.4f (#experiments=%d)"
            % (train_mode, avg, num_experiments)
        )
    logger.info("processed in %.4f secs" % w.pause("train_aug"))

    logger.info(w)
queue_trials=True, resume=args.resume, raise_on_failed_trial=False) # 参数过时了 #results = run(exp_config, search_alg=algo, scheduler=None, verbose=0, queue_trials=True, resume=args.resume, raise_on_failed_trial=False) results = [x for x in results if x.last_result is not None] results = sorted(results, key=lambda x: x.last_result[reward_attr], reverse=True) # calculate computation usage for result in results: total_computation += result.last_result['elapsed_time'] for result in results[:num_result_per_cv]: final_policy = policy_decoder(result.config, args.num_policy, args.num_op) logger.info('loss=%.12f top1_valid=%.4f %s' % (result.last_result['minus_loss'], result.last_result['top1_valid'], final_policy)) final_policy = remove_deplicates(final_policy) final_policy_set.extend(final_policy) logger.info(json.dumps(final_policy_set)) logger.info('final_policy=%d' % len(final_policy_set)) w.pause('search') print(w) #logger.info('processed in %.4f secs, gpu hours=%.4f' % (w.pause('search'), total_computation / 3600.)) logger.info( '----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----' % (C.get()['model']['type'], C.get()['dataset'], C.get()['aug'],
def eval_tta(config, augment):
    """Evaluate one candidate TTA policy set (tune.track.log variant).

    Loads the pretrained fold model from ``augment['save_path']``, builds one
    validation loader per policy (the decoded policy is installed in the
    global config before the loaders are built — TODO confirm the loaders
    pick it up), and walks all loaders in lockstep, accumulating the
    per-sample minimum loss and maximum correctness over the
    ``num_policy`` augmented views.  Metrics are reported via
    ``tune.track.log``.

    Bug fix: per-batch loss vectors are now reshaped to (1, N) before
    concatenation, matching the (1, N) shape of ``correct``.  Previously the
    1-D (N,) vectors were concatenated into a flat (num_policy * N,) array,
    so ``np.min(..., axis=0)`` collapsed everything into a single scalar
    instead of the per-sample minimum across policies (the sibling variant
    of this function already handles shapes this way).

    Returns:
        Mean top-1 accuracy (best-over-policies) on the validation split.
    """
    C.get()
    C.get().conf = config  # install this trial's configuration globally
    cv_ratio_test, cv_fold, save_path = augment['cv_ratio_test'], augment[
        'cv_fold'], augment['save_path']
    print(augment)

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])  # full training checkpoint
    else:
        model.load_state_dict(ckpt)  # bare state_dict
    model.eval()

    # One independent validation loader per policy; iterated in lockstep below.
    loaders = []
    for _ in range(augment['num_policy']):  # TODO
        _, tl, validloader, tl2 = get_dataloaders(C.get()['dataset'], C.get()['batch'], augment['dataroot'], cv_ratio_test, split_idx=cv_fold)
        loaders.append(iter(validloader))
        del tl, tl2

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')  # per-sample losses
    try:
        while True:
            losses = []
            corrects = []
            for loader in loaders:
                data, label = next(loader)  # StopIteration ends the outer loop
                data = data.cuda()
                label = label.cuda()

                pred = model(data)
                loss = loss_fn(pred, label)
                # reshape to (1, N) so concatenation stacks policies on axis 0
                losses.append(loss.detach().cpu().numpy().reshape(1, -1))

                _, pred = pred.topk(1, 1, True, True)
                pred = pred.t()
                correct = pred.eq(label.view(
                    1, -1).expand_as(pred)).detach().cpu().numpy()  # (1, N)
                corrects.append(correct)
                del loss, correct, pred, data, label  # release GPU tensors eagerly

            losses = np.concatenate(losses)                    # (num_policy, N)
            losses_min = np.min(losses, axis=0).squeeze()      # (N,) per-sample best loss
            corrects = np.concatenate(corrects)                # (num_policy, N)
            corrects_max = np.max(corrects, axis=0).squeeze()  # (N,) best-over-policies
            metrics.add_dict({
                'minus_loss': -1 * np.sum(losses_min),
                'correct': np.sum(corrects_max),
                'cnt': len(corrects_max)
            })
            del corrects, corrects_max
    except StopIteration:
        pass

    del model
    metrics = metrics / 'cnt'  # convert accumulated sums into per-sample means
    # wall-clock scaled by visible device count: approximate GPU-seconds used
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    # reporter(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    tune.track.log(minus_loss=metrics['minus_loss'],
                   top1_valid=metrics['correct'],
                   elapsed_time=gpu_secs,
                   done=True)
    return metrics['correct']
config=aug_config, num_samples=num_samples, resources_per_trial={'gpu': 1}, stop={'training_iteration': args.num_policy}) dataframe = results.dataframe().sort_values(reward_attr, ascending=False) total_computation = dataframe['elapsed_time'].sum() for i in range(num_result_per_cv): config_dict = dataframe.loc[i].filter(like='config').to_dict() new_keys = [ x.replace('config/', '') for x in config_dict.keys() ] new_config_dict = {} for key in new_keys: new_config_dict[key] = config_dict['config/' + key] final_policy = policy_decoder(new_config_dict, args.num_policy, args.num_op) logger.info( 'loss=%.12f top1_valid=%.4f %s' % (dataframe.loc[i]['minus_loss'].item(), dataframe.loc[i]['top1_valid'].item(), final_policy)) final_policy = remove_deplicates(final_policy) final_policy_set.extend(final_policy) logger.info(json.dumps(final_policy_set)) logger.info('final_policy=%d' % len(final_policy_set)) logger.info('processed in %.4f secs, gpu hours=%.4f' % (w.pause('search'), total_computation / 3600.)) logger.info( '----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----' % (C.get()['model']['type'], C.get()['dataset'], C.get()['aug'],
def search_aug_policy(copied_conf, cv_ratio, num_fold, num_result_per_fold,
                      num_policy, num_op, smoke_test, num_search, resume) -> list:
    """Search TTA policies per fold with hyperopt over ray `run_experiments`.

    Builds a hyperopt space of (op, prob, level) triples, runs one tune
    experiment per cross-validation fold against the pretrained fold model
    in ``MODEL_PATHS``, and collects the best ``num_result_per_fold``
    decoded policies of each fold.

    Returns:
        List of de-duplicated policies aggregated over all folds.
    """
    global MODEL_PATHS, DATASET_ROOT
    global logger, watcher
    logger.info(
        '----- [Phase 2.] Search Test-Time Augmentation Policies -----')
    watcher.start(tag='search')

    ops = augment_list(False)
    space = {}
    # one (op, prob, level) triple per policy slot
    for i in range(num_policy):
        for j in range(num_op):
            space['policy_%d_%d' % (i, j)] = hp.choice(
                'policy_%d_%d' % (i, j), list(range(0, len(ops))))
            space['prob_%d_%d' % (i, j)] = hp.uniform('prob_%d_ %d' % (i, j), 0.0, 1.0)
            space['level_%d_%d' % (i, j)] = hp.uniform('level_%d_ %d' % (i, j), 0.0, 1.0)

    final_policy_set = []
    total_computation = 0
    reward_attr = 'top1_valid'  # top1_valid or minus_loss
    for _ in range(1):  # run multiple times.
        for cv_fold in range(num_fold):
            name = "search_%s_%s_fold%d_ratio%.1f" % (Config.get(
            )['dataset'], Config.get()['model']['type'], cv_fold, cv_ratio)
            print(name)
            register_trainable(name, lambda augs, rpt: eval_tta(
                copy.deepcopy(copied_conf), augs, rpt)
            )  # augs: a dict, just like the 'exp_config'
            algo = HyperOptSearch(space, max_concurrent=4 * 20, reward_attr=reward_attr)

            exp_config = {
                name: {
                    'run': name,
                    'num_samples': 4 if smoke_test else num_search,
                    'resources_per_trial': {
                        'gpu': 1
                    },
                    'stop': {
                        'training_iteration': num_policy
                    },
                    'config': {
                        'dataroot': DATASET_ROOT,
                        'save_path': MODEL_PATHS[cv_fold],
                        'cv_ratio_test': cv_ratio,
                        'cv_fold': cv_fold,
                        'num_op': num_op,
                        'num_policy': num_policy
                    },
                }
            }
            results = run_experiments(exp_config,
                                      search_alg=algo,
                                      scheduler=None,
                                      verbose=0,
                                      queue_trials=True,
                                      resume=resume,
                                      raise_on_failed_trial=False)
            print()
            # drop failed trials, rank the rest by the reward metric
            results = [x for x in results if x.last_result is not None]
            results = sorted(results,
                             key=lambda x: x.last_result[reward_attr],
                             reverse=True)

            # calculate computation usage
            for result in results:
                total_computation += result.last_result['elapsed_time']

            # keep the best policies of this fold
            for result in results[:num_result_per_fold]:
                final_policy = policy_decoder(result.config, num_policy, num_op)
                logger.info('loss=%.12f top1_valid=%.4f %s' %
                            (result.last_result['minus_loss'],
                             result.last_result['top1_valid'], final_policy))

                final_policy = remove_deplicates(final_policy)
                final_policy_set.extend(final_policy)

    logger.info(json.dumps(final_policy_set))
    logger.info('final_policy=%d' % len(final_policy_set))
    logger.info('processed in %.4f secs, gpu hours=%.4f' %
                (watcher.pause('search'), total_computation / 3600.))
    return final_policy_set
def __init__(self, args=None, paths_ls=None):
    """Run pretraining + policy search and store the results on the instance.

    Args:
        args: argparse-style namespace of run options; when None, options
            are loaded from a hard-coded resnet50.yaml file.
        paths_ls: optional list of pretrained fold checkpoint paths; when
            None, cv_num paths are derived from the current config.

    Side effects: initializes ray, trains fold models synchronously, runs a
    tune/hyperopt policy search, and may call ``sys.exit`` when
    ``args.until == 1``.  Sets ``self.copied_c``, ``self.final_policy_set``,
    ``self.args`` and ``self.paths_ls``.
    """
    if args is None:
        # fall back to a hard-coded config file when no namespace is given
        d = yaml.load(open(
            '/home/noam/ZazuML/augmentations_tuner/fastautoaugment/confs/resnet50.yaml'
        ), Loader=yaml.FullLoader)
        from argparse import Namespace
        args = Namespace(**d)
    # NOTE(review): these four options are forcibly overwritten regardless
    # of what the caller passed in.
    args.redis = 'gpu-cloud-vnode30.dakao.io:23655'
    args.per_class = True
    args.resume = True
    args.smoke_test = True
    if args.decay > 0:
        # positive decay overrides the weight decay from the config
        logger.info('decay=%.4f' % args.decay)
        C.get()['optimizer']['decay'] = args.decay
    add_filehandler(
        logger,
        os.path.join(
            'FastAutoAugment/models', '%s_%s_cv%.1f.log' %
            (C.get()['dataset'], C.get()['model']['type'], args.cv_ratio)))
    logger.info('initialize ray...')
    ray.init(num_cpus=1, num_gpus=1)

    num_result_per_cv = 10 if not args.smoke_test else 2  # policies kept per fold
    cv_num = 5 if paths_ls is None else len(paths_ls)
    args.version = 1
    args._timestamp = '2020/08/30 20:40:10'
    args.config = '/home/noam/ZazuML/augmentations_tuner/fastautoaugment/confs/resnet50.yaml'
    copied_c = copy.deepcopy(args)
    self.copied_c = copied_c
    logger.info('search augmentation policies, dataset=%s model=%s' %
                (C.get()['dataset'], C.get()['model']['type']))
    logger.info(
        '----- Train without Augmentations ratio(test)=%.1f -----' %
        (args.cv_ratio))
    w.start(tag='train_no_aug')
    if paths_ls is None:
        paths_ls = [
            _get_path(C.get()['dataset'],
                      C.get()['model']['type'],
                      'ratio%.1f_fold%d' % (args.cv_ratio, i))
            for i in range(cv_num)
        ]
    print(paths_ls)
    logger.info('getting results...')
    # synchronous per-fold pretraining (no ray remote in this variant)
    pretrain_results = [
        train_model(copy.deepcopy(copied_c),
                    args.dataroot,
                    C.get()['aug'],
                    args.cv_ratio,
                    i,
                    save_path=paths_ls[i],
                    skip_exist=args.smoke_test) for i in range(cv_num)
    ]
    for r_model, r_cv, r_dict in pretrain_results:
        logger.info('model=%s cv=%d top1_train=%.4f top1_valid=%.4f' %
                    (r_model, r_cv + 1, r_dict['top1_train'],
                     r_dict['top1_valid']))
    logger.info('processed in %.4f secs' % w.pause('train_no_aug'))
    if args.until == 1:
        sys.exit(0)

    logger.info('----- Search Test-Time Augmentation Policies -----')
    w.start(tag='search')

    ops = augment_list(False)
    space = {}
    # one (op, prob, level) triple per policy slot
    for i in range(args.num_policy):
        for j in range(args.num_op):
            space['policy_%d_%d' % (i, j)] = hp.choice(
                'policy_%d_%d' % (i, j), list(range(0, len(ops))))
            space['prob_%d_%d' % (i, j)] = hp.uniform(
                'prob_%d_ %d' % (i, j), 0.0, 1.0)
            space['level_%d_%d' % (i, j)] = hp.uniform(
                'level_%d_ %d' % (i, j), 0.0, 1.0)

    def eval_t(augs):
        # tune trainable: evaluate one sampled policy configuration
        print(augs)
        return eval_tta(copy.deepcopy(copied_c), augs)

    final_policy_set = []
    total_computation = 0
    reward_attr = 'top1_valid'  # top1_valid or minus_loss
    for _ in range(1):  # run multiple times.
        for cv_fold in range(cv_num):
            name = "search_%s_%s_fold%d_ratio%.1f" % (C.get(
            )['dataset'], C.get()['model']['type'], cv_fold, args.cv_ratio)
            print(name)
            algo = HyperOptSearch(space, max_concurrent=1, metric=reward_attr)
            aug_config = {
                'dataroot': args.dataroot,
                'save_path': paths_ls[cv_fold],
                'cv_ratio_test': args.cv_ratio,
                'cv_fold': cv_fold,
                'num_op': args.num_op,
                'num_policy': args.num_policy
            }
            num_samples = 4 if args.smoke_test else args.num_search
            print(aug_config)
            eval_t(aug_config)  # one warm-up/sanity evaluation outside tune
            results = run(eval_t,
                          search_alg=algo,
                          config=aug_config,
                          num_samples=num_samples,
                          resources_per_trial={'gpu': 1},
                          stop={'training_iteration': args.num_policy})
            dataframe = results.dataframe().sort_values(reward_attr,
                                                        ascending=False)
            total_computation = dataframe['elapsed_time'].sum()
            # decode the top policies from the 'config/*' dataframe columns
            for i in range(num_result_per_cv):
                config_dict = dataframe.loc[i].filter(
                    like='config').to_dict()
                new_keys = [
                    x.replace('config/', '') for x in config_dict.keys()
                ]
                new_config_dict = {}
                for key in new_keys:
                    new_config_dict[key] = config_dict['config/' + key]
                final_policy = policy_decoder(new_config_dict,
                                              args.num_policy, args.num_op)
                logger.info(
                    'loss=%.12f top1_valid=%.4f %s' %
                    (dataframe.loc[i]['minus_loss'].item(),
                     dataframe.loc[i]['top1_valid'].item(), final_policy))

                final_policy = remove_deplicates(final_policy)
                final_policy_set.extend(final_policy)

    logger.info(json.dumps(final_policy_set))
    logger.info('final_policy=%d' % len(final_policy_set))
    logger.info('processed in %.4f secs, gpu hours=%.4f' %
                (w.pause('search'), total_computation / 3600.))
    logger.info(
        '----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----'
        % (C.get()['model']['type'], C.get()['dataset'], C.get()['aug'],
           args.cv_ratio))
    w.start(tag='train_aug')
    self.final_policy_set = final_policy_set
    self.args = args
    self.paths_ls = paths_ls
def eval_tta(config, augment, reporter):
    """Evaluate one candidate TTA policy set on a (cv_id, gr_id) split.

    Installs the trial config and decoded policy globally, restores the
    pretrained model from ``augment['save_path']``, builds one post-dataloader
    per policy, and walks all loaders in lockstep: for each sample the
    minimum loss and maximum correctness over the ``num_policy`` augmented
    views are accumulated (a sample counts as correct if ANY view is
    classified correctly).

    Returns:
        Mean top-1 accuracy (best-over-policies) on the evaluated split.
    """
    C.get()
    C.get().conf = config  # swap this trial's configuration into the global singleton
    save_path = augment['save_path']
    cv_id, gr_id = augment["cv_id"], augment["gr_id"]
    gr_ids = augment["gr_ids"]

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])  # full training checkpoint
    else:
        model.load_state_dict(ckpt)  # bare state_dict
    model.eval()

    # one independent loader per policy; iterated in lockstep below
    loaders = []
    for _ in range(augment['num_policy']):  # TODO
        loader = get_post_dataloader(C.get()["dataset"], C.get()['batch'], augment["dataroot"], augment['cv_ratio_test'], cv_id, gr_id, gr_ids)
        loaders.append(iter(loader))

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')  # per-sample losses
    try:
        while True:
            losses = []
            corrects = []
            for loader in loaders:
                data, label = next(loader)  # StopIteration ends the outer loop
                data = data.cuda()
                label = label.cuda()

                pred = model(data)
                loss = loss_fn(pred, label)
                losses.append(loss.detach().cpu().numpy().reshape(1, -1))  # (1,N)

                _, pred = pred.topk(1, 1, True, True)
                pred = pred.t()
                correct = pred.eq(label.view(
                    1, -1).expand_as(pred)).detach().cpu().numpy()  # (1,N)
                corrects.append(correct)
                del loss, correct, pred, data, label  # release GPU tensors eagerly

            losses = np.concatenate(losses)
            losses_min = np.min(losses, axis=0).squeeze()  # (N,)
            corrects = np.concatenate(corrects)
            corrects_max = np.max(corrects, axis=0).squeeze()  # (N,)
            metrics.add_dict({
                'loss': np.sum(losses_min),
                'correct': np.sum(corrects_max),
                'cnt': corrects_max.size
            })
            del corrects, corrects_max
    except StopIteration:
        pass

    del model
    metrics = metrics / 'cnt'  # convert accumulated sums into per-sample means
    # wall-clock scaled by visible device count: approximate GPU-seconds used
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(loss=metrics['loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    return metrics['correct']
def search(args, paths=None):
    """Synchronous variant of the full pipeline: pretrain folds, search TTA
    policies with hyperopt, then retrain with / without the found policies.

    Args:
        args: argparse-style namespace of run options (several fields are
            forcibly overwritten below).
        paths: optional list of pretrained fold checkpoint paths; derived
            from the config when None.

    Bug fixes vs. the previous revision:
      * ``time.sleep(10, '-- sleeping for 10 seconds --')`` passed a second
        argument to ``time.sleep``, which accepts exactly one — this raised
        ``TypeError`` at runtime; now ``time.sleep(10)``.
      * ``paths == None`` replaced with the idiomatic ``paths is None``.
    """
    # NOTE(review): these options are forcibly overwritten regardless of the
    # caller's values.
    args.redis = 'gpu-cloud-vnode30.dakao.io:23655'
    args.per_class = True
    args.resume = True
    args.smoke_test = True
    if args.decay > 0:
        # positive --decay overrides the weight decay from the config
        logger.info('decay=%.4f' % args.decay)
        C.get()['optimizer']['decay'] = args.decay
    add_filehandler(
        logger,
        os.path.join(
            'FastAutoAugment/models', '%s_%s_cv%.1f.log' %
            (C.get()['dataset'], C.get()['model']['type'], args.cv_ratio)))
    logger.info('configuration...')
    logger.info(json.dumps(C.get().conf, sort_keys=True, indent=4))
    logger.info('initialize ray...')
    ray.init(num_cpus=1, num_gpus=1)

    num_result_per_cv = 10 if not args.smoke_test else 2  # policies kept per fold
    cv_num = 5
    copied_c = copy.deepcopy(C.get().conf)

    # ---- Phase 1: synchronous per-fold pretraining ----
    logger.info('search augmentation policies, dataset=%s model=%s' %
                (C.get()['dataset'], C.get()['model']['type']))
    logger.info(
        '----- Train without Augmentations cv=%d ratio(test)=%.1f -----' %
        (cv_num, args.cv_ratio))
    w.start(tag='train_no_aug')
    if paths is None:
        paths = [
            _get_path(C.get()['dataset'],
                      C.get()['model']['type'],
                      'ratio%.1f_fold%d' % (args.cv_ratio, i))
            for i in range(cv_num)
        ]
    print(paths)
    tqdm_epoch = tqdm(range(C.get()['epoch']))

    logger.info('getting results...')
    # pretrain_results = [
    #     train_model(copy.deepcopy(copied_c), args.dataroot, C.get()['aug'], args.cv_ratio, i, save_path=paths[i],
    #                 skip_exist=True) for i in range(cv_num)]
    pretrain_results = [
        train_model(copy.deepcopy(copied_c),
                    args.dataroot,
                    C.get()['aug'],
                    args.cv_ratio,
                    i,
                    save_path=paths[i]) for i in range(cv_num)
    ]
    for r_model, r_cv, r_dict in pretrain_results:
        logger.info(
            'model=%s cv=%d top1_train=%.4f top1_valid=%.4f' %
            (r_model, r_cv + 1, r_dict['top1_train'], r_dict['top1_valid']))
    logger.info('processed in %.4f secs' % w.pause('train_no_aug'))
    if args.until == 1:
        sys.exit(0)

    # ---- Phase 2: hyperopt search over augmentation policies ----
    logger.info('----- Search Test-Time Augmentation Policies -----')
    w.start(tag='search')

    ops = augment_list(False)
    space = {}
    # one (op, prob, level) triple per policy slot
    for i in range(args.num_policy):
        for j in range(args.num_op):
            space['policy_%d_%d' % (i, j)] = hp.choice(
                'policy_%d_%d' % (i, j), list(range(0, len(ops))))
            space['prob_%d_%d' % (i, j)] = hp.uniform('prob_%d_ %d' % (i, j), 0.0, 1.0)
            space['level_%d_%d' % (i, j)] = hp.uniform('level_%d_ %d' % (i, j), 0.0, 1.0)

    def eval_t(augs):
        # tune trainable: evaluate one sampled policy configuration
        print(augs)
        return eval_tta(copy.deepcopy(copied_c), augs)

    final_policy_set = []
    total_computation = 0
    reward_attr = 'top1_valid'  # top1_valid or minus_loss
    for _ in range(1):  # run multiple times.
        for cv_fold in range(cv_num):
            name = "search_%s_%s_fold%d_ratio%.1f" % (C.get()['dataset'],
                                                      C.get()['model']['type'],
                                                      cv_fold, args.cv_ratio)
            print(name)
            algo = HyperOptSearch(space, max_concurrent=1, metric=reward_attr)
            aug_config = {
                'dataroot': args.dataroot,
                'save_path': paths[cv_fold],
                'cv_ratio_test': args.cv_ratio,
                'cv_fold': cv_fold,
                'num_op': args.num_op,
                'num_policy': args.num_policy
            }
            num_samples = 4 if args.smoke_test else args.num_search
            print(aug_config)
            eval_t(aug_config)  # one warm-up/sanity evaluation outside tune
            results = run(eval_t,
                          search_alg=algo,
                          config=aug_config,
                          num_samples=num_samples,
                          resources_per_trial={'gpu': 1},
                          stop={'training_iteration': args.num_policy})
            dataframe = results.dataframe().sort_values(reward_attr,
                                                        ascending=False)
            total_computation = dataframe['elapsed_time'].sum()
            # decode the top policies from the 'config/*' dataframe columns
            for i in range(num_result_per_cv):
                config_dict = dataframe.loc[i].filter(like='config').to_dict()
                new_keys = [
                    x.replace('config/', '') for x in config_dict.keys()
                ]
                new_config_dict = {}
                for key in new_keys:
                    new_config_dict[key] = config_dict['config/' + key]
                final_policy = policy_decoder(new_config_dict,
                                              args.num_policy, args.num_op)
                logger.info(
                    'loss=%.12f top1_valid=%.4f %s' %
                    (dataframe.loc[i]['minus_loss'].item(),
                     dataframe.loc[i]['top1_valid'].item(), final_policy))

                final_policy = remove_deplicates(final_policy)
                final_policy_set.extend(final_policy)

    logger.info(json.dumps(final_policy_set))
    logger.info('final_policy=%d' % len(final_policy_set))
    logger.info('processed in %.4f secs, gpu hours=%.4f' %
                (w.pause('search'), total_computation / 3600.))

    # ---- Phase 3: retrain with / without the found policy set ----
    logger.info(
        '----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----'
        % (C.get()['model']['type'], C.get()['dataset'], C.get()['aug'],
           args.cv_ratio))
    w.start(tag='train_aug')

    num_experiments = 5
    default_path = [
        _get_path(C.get()['dataset'],
                  C.get()['model']['type'],
                  'ratio%.1f_default%d' % (args.cv_ratio, _))
        for _ in range(num_experiments)
    ]
    augment_path = [
        _get_path(C.get()['dataset'],
                  C.get()['model']['type'],
                  'ratio%.1f_augment%d' % (args.cv_ratio, _))
        for _ in range(num_experiments)
    ]

    # NOTE(review): this polling loop runs BEFORE the synchronous training
    # calls below, so it only makes progress when checkpoints from a prior
    # run already exist — verify the intended ordering.
    tqdm_epoch = tqdm(range(C.get()['epoch']))
    is_done = False
    for epoch in tqdm_epoch:
        while True:
            epochs = OrderedDict()
            for exp_idx in range(num_experiments):
                try:
                    if os.path.exists(default_path[exp_idx]):
                        latest_ckpt = torch.load(default_path[exp_idx])
                        if 'epoch' not in latest_ckpt:
                            # old-format checkpoint: treat as fully trained
                            epochs['default_exp%d' % (exp_idx + 1)] = C.get()['epoch']
                        else:
                            epochs['default_exp%d' % (exp_idx + 1)] = latest_ckpt['epoch']
                except Exception as e:
                    pass
                try:
                    if os.path.exists(augment_path[exp_idx]):
                        latest_ckpt = torch.load(augment_path[exp_idx])
                        if 'epoch' not in latest_ckpt:
                            epochs['augment_exp%d' % (exp_idx + 1)] = C.get()['epoch']
                        else:
                            epochs['augment_exp%d' % (exp_idx + 1)] = latest_ckpt['epoch']
                except:
                    pass

            tqdm_epoch.set_postfix(epochs)
            if len(epochs) == num_experiments * 2 and min(
                    epochs.values()) >= C.get()['epoch']:
                is_done = True
            if len(epochs) == num_experiments * 2 and min(
                    epochs.values()) >= epoch:
                break
            # fixed: time.sleep takes a single argument (was a TypeError)
            time.sleep(10)
        if is_done:
            break

    logger.info('getting results...')
    # baseline runs (checkpoints reused via skip_exist) followed by augmented runs
    final_results = [train_model(copy.deepcopy(copied_c), args.dataroot, C.get()['aug'], 0.0, 0,
                                 save_path=default_path[_], skip_exist=True)
                     for _ in range(num_experiments)] + \
                    [train_model(copy.deepcopy(copied_c), args.dataroot, final_policy_set, 0.0, 0,
                                 save_path=augment_path[_])
                     for _ in range(num_experiments)]

    # first num_experiments results are 'default', the rest 'augment'
    for train_mode in ['default', 'augment']:
        avg = 0.
        for _ in range(num_experiments):
            r_model, r_cv, r_dict = final_results.pop(0)
            logger.info(
                '[%s] top1_train=%.4f top1_test=%.4f' %
                (train_mode, r_dict['top1_train'], r_dict['top1_test']))
            avg += r_dict['top1_test']
        avg /= num_experiments
        logger.info('[%s] top1_test average=%.4f (#experiments=%d)' %
                    (train_mode, avg, num_experiments))
    logger.info('processed in %.4f secs' % w.pause('train_aug'))

    logger.info(w)