def main():
    torch.set_num_threads(3)
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    np.random.seed(args.seed)
    gpu = ig_utils.pick_gpu_lowest_memory() if args.gpu == 'auto' else int(args.gpu)
    torch.cuda.set_device(gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info("args = %s", args)
    logging.info('gpu device = %d' % gpu)

    if not args.fast:
        api = API('../data/NAS-Bench-201-v1_0-e61699.pth')

    #### model
    criterion = nn.CrossEntropyLoss()
    search_space = SearchSpaceNames[args.search_space]
    if args.method in ['darts', 'blank']:
        model = TinyNetworkDarts(C=args.init_channels, N=5, max_nodes=4, num_classes=n_classes,
                                 criterion=criterion, search_space=search_space, args=args)
    elif args.method in ['darts-proj', 'blank-proj']:
        model = TinyNetworkDartsProj(C=args.init_channels, N=5, max_nodes=4, num_classes=n_classes,
                                     criterion=criterion, search_space=search_space, args=args)
    model = model.cuda()
    logging.info("param size = %fMB", ig_utils.count_parameters_in_MB(model))
    architect = Architect(model, args)

    #### data
    if args.dataset == 'cifar10':
        train_transform, valid_transform = ig_utils._data_transforms_cifar10(args)
        train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform)
        valid_data = dset.CIFAR10(root=args.data, train=False, download=True, transform=valid_transform)
    elif args.dataset == 'cifar100':
        train_transform, valid_transform = ig_utils._data_transforms_cifar100(args)
        train_data = dset.CIFAR100(root=args.data, train=True, download=True, transform=train_transform)
        valid_data = dset.CIFAR100(root=args.data, train=False, download=True, transform=valid_transform)
    elif args.dataset == 'imagenet16-120':
        import torchvision.transforms as transforms
        from nasbench201.DownsampledImageNet import ImageNet16
        mean = [x / 255 for x in [122.68, 116.66, 104.01]]
        std = [x / 255 for x in [63.22, 61.26, 65.09]]
        lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(16, padding=2),
                 transforms.ToTensor(), transforms.Normalize(mean, std)]
        train_transform = transforms.Compose(lists)
        train_data = ImageNet16(root=os.path.join(args.data, 'imagenet16'), train=True,
                                transform=train_transform, use_num_of_class_only=120)
        valid_data = ImageNet16(root=os.path.join(args.data, 'imagenet16'), train=False,
                                transform=train_transform, use_num_of_class_only=120)
        assert len(train_data) == 151700

    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True)
    valid_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True)

    #### scheduler
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        model.optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    #### resume
    start_epoch = 0
    if args.resume_epoch != 0:
        logging.info('loading checkpoint from {}'.format(expid))
        file = 'checkpoint.pth.tar' if args.resume_epoch == -1 else 'checkpoint_{}.pth.tar'.format(args.resume_epoch)
        filename = os.path.join(args.save, file)
        if os.path.isfile(filename):
            logging.info("=> loading checkpoint '{}'".format(filename))
            checkpoint = torch.load(filename, map_location='cpu')
            start_epoch = checkpoint['epoch']  # epoch
            model_state_dict = checkpoint['state_dict']
            if '_arch_parameters' in model_state_dict:
                del model_state_dict['_arch_parameters']
            model.load_state_dict(model_state_dict)  # model
            saved_arch_parameters = checkpoint['alpha']  # arch
            model.set_arch_parameters(saved_arch_parameters)
            scheduler.load_state_dict(checkpoint['scheduler'])
            model.optimizer.load_state_dict(checkpoint['optimizer'])  # optimizer
            architect.optimizer.load_state_dict(checkpoint['arch_optimizer'])
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(filename, start_epoch - 1))
        else:
            print("=> no checkpoint found at '{}'".format(filename))

    #### training
    for epoch in range(start_epoch, args.epochs):
        lr = scheduler.get_lr()[0]

        ## data aug
        if args.cutout:
            train_transform.transforms[-1].cutout_prob = args.cutout_prob * epoch / (args.epochs - 1)
            logging.info('epoch %d lr %e cutout_prob %e', epoch, lr,
                         train_transform.transforms[-1].cutout_prob)
        else:
            logging.info('epoch %d lr %e', epoch, lr)

        ## pre logging
        genotype = model.genotype()
        logging.info('genotype = %s', genotype)
        model.printing(logging)

        ## training
        train_acc, train_obj = train(train_queue, valid_queue, model, architect,
                                     model.optimizer, lr, epoch)
        logging.info('train_acc %f', train_acc)
        logging.info('train_loss %f', train_obj)

        ## eval
        valid_acc, valid_obj = infer(valid_queue, model, criterion, log=False)
        logging.info('valid_acc %f', valid_acc)
        logging.info('valid_loss %f', valid_obj)

        ## logging
        if not args.fast:
            # nasbench201
            cifar10_train, cifar10_test, cifar100_train, cifar100_valid, \
                cifar100_test, imagenet16_train, imagenet16_valid, imagenet16_test = \
                query(api, model.genotype(), logging)
            # tensorboard
            writer.add_scalars('accuracy', {'train': train_acc, 'valid': valid_acc}, epoch)
            writer.add_scalars('loss', {'train': train_obj, 'valid': valid_obj}, epoch)
            writer.add_scalars('nasbench201/cifar10',
                               {'train': cifar10_train, 'test': cifar10_test}, epoch)
            writer.add_scalars('nasbench201/cifar100',
                               {'train': cifar100_train, 'valid': cifar100_valid, 'test': cifar100_test}, epoch)
            writer.add_scalars('nasbench201/imagenet16',
                               {'train': imagenet16_train, 'valid': imagenet16_valid, 'test': imagenet16_test}, epoch)

        #### scheduling
        scheduler.step()

        #### saving
        save_state = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'alpha': model.arch_parameters(),
            'optimizer': model.optimizer.state_dict(),
            'arch_optimizer': architect.optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
        }
        if save_state['epoch'] % args.ckpt_interval == 0:
            ig_utils.save_checkpoint(save_state, False, args.save, per_epoch=True)

    #### architecture selection / projection
    if args.dev == 'proj':
        pt_project(train_queue, valid_queue, model, architect, criterion,
                   model.optimizer, start_epoch, args, infer, query)

    writer.close()
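# A minimal sketch of the `query` helper assumed above (it is not defined in
# this file): it looks the genotype up in NAS-Bench-201 and returns the eight
# accuracies the tensorboard block consumes. The dataset names and positional
# get_more_info arguments mirror their use elsewhere in this file set; the
# function name and the exact accuracy keys are assumptions, so treat this as
# an illustration rather than the original implementation.
def query_sketch(api, genotype, logging):
    index = api.query_index_by_arch(genotype)
    logging.info('nasbench201 index = %d', index)
    cf10_valid = api.get_more_info(index, 'cifar10-valid', None, False, True)
    cf10 = api.get_more_info(index, 'cifar10', None, False, True)
    cf100 = api.get_more_info(index, 'cifar100', None, False, True)
    im16 = api.get_more_info(index, 'ImageNet16-120', None, False, True)
    return (cf10_valid['train-accuracy'], cf10['test-accuracy'],
            cf100['train-accuracy'], cf100['valid-accuracy'], cf100['test-accuracy'],
            im16['train-accuracy'], im16['valid-accuracy'], im16['test-accuracy'])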
        'avoid loading the NASBench2 api and instead load a pickle file with tuple (index, arch_str)')
    args = parser.parse_args()
    args.device = torch.device("cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")
    return args


if __name__ == '__main__':
    args = parse_arguments()

    if args.noacc:
        api = pickle.load(open(args.api_loc, 'rb'))
    else:
        from nas_201_api import NASBench201API as API
        api = API(args.api_loc)

    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    train_loader, val_loader = get_cifar_dataloaders(args.batch_size, args.batch_size,
                                                     args.dataset, args.num_data_workers)

    cached_res = []
    pre = 'cf' if 'cifar' in args.dataset else 'im'
    pfn = f'nb2_{pre}{get_num_classes(args)}_seed{args.seed}_dl{args.dataload}_dlinfo{args.dataload_info}_initw{args.init_w_type}_initb{args.init_b_type}.p'
    op = os.path.join(args.outdir, pfn)
type=str, help="Folder to save checkpoints and log.") parser.add_argument( "--arch_nas_dataset", type=str, help="The path to load the architecture dataset (tiny-nas-benchmark).", ) parser.add_argument("--print_freq", type=int, help="print frequency (default: 200)") parser.add_argument("--rand_seed", type=int, help="manual seed") args = parser.parse_args() # if args.rand_seed is None or args.rand_seed < 0: args.rand_seed = random.randint(1, 100000) if args.arch_nas_dataset is None or not os.path.isfile( args.arch_nas_dataset): nas_bench = None else: print("{:} build NAS-Benchmark-API from {:}".format( time_string(), args.arch_nas_dataset)) nas_bench = API(args.arch_nas_dataset) if args.rand_seed < 0: save_dir, all_indexes, num = None, [], 500 for i in range(num): print("{:} : {:03d}/{:03d}".format(time_string(), i, num)) args.rand_seed = random.randint(1, 100000) save_dir, index = main(args, nas_bench) all_indexes.append(index) torch.save(all_indexes, save_dir / "results.pth") else: main(args, nas_bench)
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(xargs.dataset, xargs.data_path, -1)
    # config_path = 'configs/nas-benchmark/algos/DARTS.config'
    config = load_config(xargs.config_path, {'class_num': class_num, 'xshape': xshape}, logger)
    search_loader, _, valid_loader = get_nas_search_loaders(train_data, valid_data, xargs.dataset,
                                                            'configs/nas-benchmark/', config.batch_size, xargs.workers)
    logger.log('||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'.format(
        xargs.dataset, len(search_loader), len(valid_loader), config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    if xargs.model_config is None:
        model_config = dict2config({'name': 'DARTS-V1', 'C': xargs.channel, 'N': xargs.num_cells,
                                    'max_nodes': xargs.max_nodes, 'num_classes': class_num,
                                    'space': search_space, 'affine': False,
                                    'track_running_stats': bool(xargs.track_running_stats)}, None)
    else:
        model_config = load_config(xargs.model_config,
                                   {'num_classes': class_num, 'space': search_space, 'affine': False,
                                    'track_running_stats': bool(xargs.track_running_stats)}, None)
    search_model = get_cell_based_tiny_net(model_config)
    logger.log('search-model :\n{:}'.format(search_model))

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(), lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999), weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    # logger.log('{:}'.format(search_model))
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))

    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path('info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(search_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint = torch.load(last_info['last_checkpoint'])
        genotypes = checkpoint['genotypes']
        valid_accuracies = checkpoint['valid_accuracies']
        search_model.load_state_dict(checkpoint['search_model'])
        w_scheduler.load_state_dict(checkpoint['w_scheduler'])
        w_optimizer.load_state_dict(checkpoint['w_optimizer'])
        a_optimizer.load_state_dict(checkpoint['a_optimizer'])
        logger.log("=> loading checkpoint of the last-info '{:}' start with {:}-th epoch.".format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes = 0, {'best': -1}, {-1: search_model.genotype()}

    # start training
    start_time, search_time, epoch_time, total_epoch = time.time(), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        logger.log('\n[Search the {:}-th epoch] {:}, LR={:}'.format(epoch_str, need_time, min(w_scheduler.get_lr())))

        search_w_loss, search_w_top1, search_w_top5 = search_func(
            search_loader, network, criterion, w_scheduler, w_optimizer, a_optimizer,
            epoch_str, xargs.print_freq, logger, xargs.gradient_clip)
        search_time.update(time.time() - start_time)
        logger.log('[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s'.format(
            epoch_str, search_w_loss, search_w_top1, search_w_top5, search_time.sum))
        valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(valid_loader, network, criterion)
        logger.log('[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(
            epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))

        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1
        if valid_a_top1 > valid_accuracies['best']:
            valid_accuracies['best'] = valid_a_top1
            genotypes['best'] = search_model.genotype()
            find_best = True
        else:
            find_best = False

        genotypes[epoch] = search_model.genotype()
        logger.log('<<<--->>> The {:}-th epoch : {:}'.format(epoch_str, genotypes[epoch]))

        # save checkpoint
        save_path = save_checkpoint({'epoch': epoch + 1,
                                     'args': deepcopy(xargs),
                                     'search_model': search_model.state_dict(),
                                     'w_optimizer': w_optimizer.state_dict(),
                                     'a_optimizer': a_optimizer.state_dict(),
                                     'w_scheduler': w_scheduler.state_dict(),
                                     'genotypes': genotypes,
                                     'valid_accuracies': valid_accuracies},
                                    model_base_path, logger)
        last_info = save_checkpoint({'epoch': epoch + 1,
                                     'args': deepcopy(args),
                                     'last_checkpoint': save_path},
                                    logger.path('info'), logger)
        if find_best:
            logger.log('<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.'.format(
                epoch_str, valid_a_top1))
            copy_checkpoint(model_base_path, model_best_path, logger)
        with torch.no_grad():
            # logger.log('arch-parameters :\n{:}'.format(nn.functional.softmax(search_model.arch_parameters, dim=-1).cpu()))
            logger.log('{:}'.format(search_model.show_alphas()))
        if api is not None:
            logger.log('{:}'.format(api.query_by_arch(genotypes[epoch])))

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log('\n' + '-' * 100)
    logger.log('DARTS-V1 : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(
        total_epoch, search_time.sum, genotypes[total_epoch - 1]))
    if api is not None:
        logger.log('{:}'.format(api.query_by_arch(genotypes[total_epoch - 1])))
    logger.close()
def check_cor_for_bandit(meta_file, test_epoch, use_less_or_not, is_rand=True, need_print=False):
    if isinstance(meta_file, API):
        api = meta_file
    else:
        api = API(str(meta_file))
    cifar10_currs = []
    cifar10_valid = []
    cifar10_test = []
    cifar100_valid = []
    cifar100_test = []
    imagenet_test = []
    imagenet_valid = []
    for idx, arch in enumerate(api):
        results = api.get_more_info(idx, "cifar10-valid", test_epoch - 1, use_less_or_not, is_rand)
        cifar10_currs.append(results["valid-accuracy"])  # --->>>>>
        results = api.get_more_info(idx, "cifar10-valid", None, False, is_rand)
        cifar10_valid.append(results["valid-accuracy"])
        results = api.get_more_info(idx, "cifar10", None, False, is_rand)
        cifar10_test.append(results["test-accuracy"])
        results = api.get_more_info(idx, "cifar100", None, False, is_rand)
        cifar100_test.append(results["test-accuracy"])
        cifar100_valid.append(results["valid-accuracy"])
        results = api.get_more_info(idx, "ImageNet16-120", None, False, is_rand)
        imagenet_test.append(results["test-accuracy"])
        imagenet_valid.append(results["valid-accuracy"])

    def get_cor(A, B):
        return float(np.corrcoef(A, B)[0, 1])

    cors = []
    for basestr, xlist in zip(
            ["C-010-V", "C-010-T", "C-100-V", "C-100-T", "I16-V", "I16-T"],
            [cifar10_valid, cifar10_test, cifar100_valid, cifar100_test, imagenet_valid, imagenet_test]):
        correlation = get_cor(cifar10_currs, xlist)
        if need_print:
            print("With {:3d}/{:}-epochs-training, the correlation between cifar10-valid and {:} is : {:}".format(
                test_epoch, "012" if use_less_or_not else "200", basestr, correlation))
        cors.append(correlation)
        # print('With {:3d}/200-epochs-training, the correlation between cifar10-valid and {:} is : {:}'.format(test_epoch, basestr, get_cor(cifar10_valid_200, xlist)))
        # print('-'*200)
    # print('*'*230)
    return cors
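# A hedged usage sketch for the function above: sweep a couple of
# early-stopping epochs and print how well each truncated-training estimate
# correlates with the final benchmark numbers. The .pth path and the choice
# of epochs are placeholders, not values taken from the original script.
if __name__ == '__main__':
    meta_api = API('NAS-Bench-201-v1_0-e61699.pth')
    for test_epoch in (6, 12):
        check_cor_for_bandit(meta_api, test_epoch, use_less_or_not=True,
                             is_rand=True, need_print=True)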
def main():
    api = API(None)
    info = api.get_more_info(100, 'cifar100', 199, False, True)
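# A hedged inspection sketch: in this version of nas_201_api the positional
# arguments above read as (arch_index, dataset, iepoch, use_less_or_not,
# is_random), and the returned dict carries accuracy/loss entries such as
# 'valid-accuracy' and 'test-accuracy' (the keys used by the other snippets
# in this file set). The helper name is our own, not part of the library.
def show_info(api, index=100, dataset='cifar100', epoch=199):
    info = api.get_more_info(index, dataset, epoch, False, True)
    for key in sorted(info):
        print('{:20s} : {:}'.format(key, info[key]))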
def main(xargs):
    PID = os.getpid()
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    prepare_seed(xargs.rand_seed)

    if xargs.timestamp == 'none':
        xargs.timestamp = "{:}".format(time.strftime('%h-%d-%C_%H-%M-%s', time.gmtime(time.time())))

    train_data, valid_data, xshape, class_num = get_datasets(xargs.dataset, xargs.data_path, -1)

    ##### config & logging #####
    config = edict()
    config.class_num = class_num
    config.xshape = xshape
    config.batch_size = xargs.batch_size
    xargs.save_dir = xargs.save_dir + \
        "/repeat%d-prunNum%d-prec%d-%s-batch%d" % (
            xargs.repeat, xargs.prune_number, xargs.precision, xargs.init, config["batch_size"]) + \
        "/{:}/seed{:}".format(xargs.timestamp, xargs.rand_seed)
    config.save_dir = xargs.save_dir
    logger = prepare_logger(xargs)
    ###############

    if xargs.dataset != 'imagenet-1k':
        search_loader, train_loader, valid_loader = get_nas_search_loaders(
            train_data, valid_data, xargs.dataset, 'configs/', config.batch_size, xargs.workers)
    else:
        train_loader = torch.utils.data.DataLoader(train_data, batch_size=xargs.batch_size, shuffle=True,
                                                   num_workers=args.workers, pin_memory=True)
    logger.log('||||||| {:10s} ||||||| Train-Loader-Num={:}, batch size={:}'.format(
        xargs.dataset, len(train_loader), config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    if xargs.search_space_name == 'nas-bench-201':
        model_config = edict({'name': 'DARTS-V1', 'C': 3, 'N': 1, 'depth': -1, 'use_stem': True,
                              'max_nodes': xargs.max_nodes, 'num_classes': class_num,
                              'space': search_space, 'affine': True,
                              'track_running_stats': bool(xargs.track_running_stats)})
        model_config_thin = edict({'name': 'DARTS-V1', 'C': 1, 'N': 1, 'depth': 1, 'use_stem': False,
                                   'max_nodes': xargs.max_nodes, 'num_classes': class_num,
                                   'space': search_space, 'affine': True,
                                   'track_running_stats': bool(xargs.track_running_stats)})
    elif xargs.search_space_name == 'darts':
        model_config = edict({'name': 'DARTS-V1', 'C': 1, 'N': 1, 'depth': 2, 'use_stem': True,
                              'stem_multiplier': 1, 'num_classes': class_num, 'space': search_space,
                              'affine': True, 'track_running_stats': bool(xargs.track_running_stats),
                              'super_type': xargs.super_type, 'steps': 4, 'multiplier': 4})
        model_config_thin = edict({'name': 'DARTS-V1', 'C': 1, 'N': 1, 'depth': 2, 'use_stem': False,
                                   'stem_multiplier': 1, 'max_nodes': xargs.max_nodes,
                                   'num_classes': class_num, 'space': search_space, 'affine': True,
                                   'track_running_stats': bool(xargs.track_running_stats),
                                   'super_type': xargs.super_type, 'steps': 4, 'multiplier': 4})
    network = get_cell_based_tiny_net(model_config)
    logger.log('model-config : {:}'.format(model_config))
    arch_parameters = [alpha.detach().clone() for alpha in network.get_alphas()]
    for alpha in arch_parameters:
        alpha[:, :] = 0

    # TODO Linear_Region_Collector
    lrc_model = Linear_Region_Collector(input_size=(1000, 1, 3, 3), sample_batch=3,
                                        dataset=xargs.dataset, data_path=xargs.data_path,
                                        seed=xargs.rand_seed)

    # ### all params trainable (except train_bn) #########################
    flop, param = get_model_infos(network, xshape)
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    logger.log('search-space [{:} ops] : {:}'.format(len(search_space), search_space))
    if xargs.arch_nas_dataset is None or xargs.search_space_name == 'darts':
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    network = network.cuda()

    genotypes = {}
    genotypes['arch'] = {-1: network.genotype()}

    arch_parameters_history = []
    arch_parameters_history_npy = []
    start_time = time.time()
    epoch = -1

    for alpha in arch_parameters:
        alpha[:, 0] = -INF
    arch_parameters_history.append([alpha.detach().clone() for alpha in arch_parameters])
    arch_parameters_history_npy.append([alpha.detach().clone().cpu().numpy() for alpha in arch_parameters])
    np.save(os.path.join(xargs.save_dir, "arch_parameters_history.npy"), arch_parameters_history_npy)

    while not is_single_path(network):
        epoch += 1
        torch.cuda.empty_cache()
        print("<< ============== JOB (PID = %d) %s ============== >>" % (
            PID, '/'.join(xargs.save_dir.split("/")[-6:])))

        arch_parameters, op_pruned = prune_func_rank(xargs, arch_parameters, model_config,
                                                     model_config_thin, train_loader, lrc_model,
                                                     search_space, precision=xargs.precision,
                                                     prune_number=xargs.prune_number)
        # rebuild supernet
        network = get_cell_based_tiny_net(model_config)
        network = network.cuda()
        network.set_alphas(arch_parameters)

        arch_parameters_history.append([alpha.detach().clone() for alpha in arch_parameters])
        arch_parameters_history_npy.append([alpha.detach().clone().cpu().numpy() for alpha in arch_parameters])
        np.save(os.path.join(xargs.save_dir, "arch_parameters_history.npy"), arch_parameters_history_npy)
        genotypes['arch'][epoch] = network.genotype()

        logger.log('operators remaining (1s) and pruned (0s)\n{:}'.format(
            '\n'.join([str((alpha > -INF).int()) for alpha in network.get_alphas()])))

    if xargs.search_space_name == 'darts':
        print("===>>> Prune Edge Groups...")
        arch_parameters = prune_func_rank_group(xargs, arch_parameters, model_config,
                                                model_config_thin, train_loader, lrc_model,
                                                search_space,
                                                edge_groups=[(0, 2), (2, 5), (5, 9), (9, 14)],
                                                num_per_group=2, precision=xargs.precision)
        network = get_cell_based_tiny_net(model_config)
        network = network.cuda()
        network.set_alphas(arch_parameters)
        arch_parameters_history.append([alpha.detach().clone() for alpha in arch_parameters])
        arch_parameters_history_npy.append([alpha.detach().clone().cpu().numpy() for alpha in arch_parameters])
        np.save(os.path.join(xargs.save_dir, "arch_parameters_history.npy"), arch_parameters_history_npy)

    logger.log('<<<--->>> End: {:}'.format(network.genotype()))
    logger.log('operators remaining (1s) and pruned (0s)\n{:}'.format(
        '\n'.join([str((alpha > -INF).int()) for alpha in network.get_alphas()])))

    end_time = time.time()
    logger.log('\n' + '-' * 100)
    logger.log("Time spent: %d s" % (end_time - start_time))
    # check the performance from the architecture dataset
    if api is not None:
        logger.log('{:}'.format(api.query_by_arch(genotypes['arch'][epoch])))
    logger.close()
def __init__(self, dataset, apiloc):
    self.dataset = dataset
    self.api = API(apiloc, verbose=False)
    self.epochs = '12'
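# A hedged sketch of how this wrapper might be queried (the method name is an
# assumption; the positional get_more_info arguments mirror their use in the
# other snippets here, with self.epochs == '12' selecting the 12-epoch
# statistics via the use_less_or_not flag):
def query_accuracy(self, arch_index, is_random=False):
    info = self.api.get_more_info(arch_index, self.dataset, None,
                                  self.epochs == '12', is_random)
    # prefer the validation accuracy when the dataset split provides one
    return info.get('valid-accuracy', info.get('test-accuracy'))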
def make(ww="without"): from nas_201_api import NASBench201API as API api = API('/Users/madoibito80/NAS-Bench-201-v1_0-e61699.pth') macc = {} nparams = {} for mode in modes: for way in ways: if mode == "P" and way == "best": continue for trial in trials: fname = str(trial) + "_" + mode arcs = opener("./snapshot/" + ww + "/" + way + "/" + fname + ".txt", freeze=50, mode=mode) accs = [] for i in range(len(arcs)): if i % 100 == 0: print(mode, way, trial, i) index = api.query_index_by_arch(arcs[i]) flg = True try: info = api.query_meta_info_by_index(index) except: print("error: ", mode, way, trial, i) flg = False if flg: res = info.get_metrics('cifar10', 'ori-test', None, False) # cifar10 : training the model on the CIFAR-10 training + validation set. # ,criteria , , False=3average of NAS-Bench) acc = res['accuracy'] accs.append(float(acc)) else: accs.append(accs[-1]) if trial == 0: macc[mode + way] = np.zeros((len(trials), len(accs))) nparams[mode + way] = np.zeros((len(trials), 1)) macc[mode + way][trial] = np.array(accs) # get final performance for table try: print(dir(info)) except: print("no dir") metric = info.get_comput_costs('cifar10') flop, param, latency = metric['flops'], metric[ 'params'], metric['latency'] nparams[mode + way][trial] = float(param) f = open("./" + ww + ".pickle", "wb") pickle.dump(macc, f) pickle.dump(nparams, f) f.close() print(macc) print(nparams) return macc
def __init__(self, data_dir, task='cifar10-valid', log_scale=True, negative=True,
             use_12_epochs_result=False, seed=None):
    """
    data_dir: data directory that contains the NAS-Bench-201-v1_0-e61699.pth file
    task: the target image task. Options: cifar10-valid, cifar100, ImageNet16-120
    log_scale: whether to output the objective in log scale
    negative: whether to output the objective in negative form
    use_12_epochs_result: whether to use the statistics at the end of the 12th
        training epoch instead of those at the end of full training
    seed: the random seed used to access trained model performance. Options: 0, 1, 2;
        seed=None selects the seed randomly
    """
    self.api = API(os.path.join(data_dir, 'NAS-Bench-201-v1_1-096897.pth'))
    if isinstance(task, list):
        task = task[0]
    self.task = task
    self.use_12_epochs_result = use_12_epochs_result

    if task == 'cifar10-valid':
        best_val_arch_index = 6111
        best_val_acc = 91.60666665039064 / 100
        best_test_arch_index = 1459
        best_test_acc = 91.52333333333333 / 100
    elif task == 'cifar100':
        best_val_arch_index = 9930
        best_val_acc = 73.49333323567708 / 100
        best_test_arch_index = 9930
        best_test_acc = 73.51333326009114 / 100
    elif task == 'ImageNet16-120':
        best_val_arch_index = 10676
        best_val_acc = 46.766666727701825 / 100
        best_test_arch_index = 857
        best_test_acc = 47.311111097547744 / 100
    else:
        raise NotImplementedError("task " + str(task) + " is not implemented in the dataset.")

    if log_scale:
        best_val_acc = np.log(best_val_acc)
    best_val_err = 1. - best_val_acc
    best_test_err = 1. - best_test_acc
    if log_scale:
        best_val_err = np.log(best_val_err)
        best_test_err = np.log(best_test_err)
    if negative:
        best_val_err = -best_val_err
        best_test_err = -best_test_err
    self.best_val_err = best_val_err
    self.best_test_err = best_test_err
    self.best_val_acc = best_val_acc
    self.best_test_acc = best_test_acc

    super(NAS201, self).__init__(dim=None, optimum_location=best_test_arch_index,
                                 optimal_val=best_test_err, bounds=None)

    self.log_scale = log_scale
    self.seed = seed
    self.X = []
    self.y_valid_acc = []
    self.y_test_acc = []
    self.costs = []
    self.negative = negative
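# A minimal sketch of how a queried accuracy would be mapped onto this
# objective, mirroring the log_scale/negative conventions set up above
# (the helper name is an assumption, not part of the original class):
def _to_objective(acc, log_scale=True, negative=True):
    err = 1.0 - acc          # accuracy in [0, 1] -> error
    if log_scale:
        err = np.log(err)    # log-scale objective
    return -err if negative else err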
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(xargs.dataset, xargs.data_path, -1)
    # config_path = 'configs/nas-benchmark/algos/GDAS.config'
    config = load_config(xargs.config_path, {'class_num': class_num, 'xshape': xshape}, logger)
    search_loader, _, valid_loader = get_nas_search_loaders(train_data, valid_data, xargs.dataset,
                                                            'configs/nas-benchmark/', config.batch_size, xargs.workers)
    logger.log('||||||| {:10s} ||||||| Search-Loader-Num={:}, batch size={:}'.format(
        xargs.dataset, len(search_loader), config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(xargs.dataset, config))

    search_space = get_search_spaces('cell', xargs.search_space_name)
    if xargs.model_config is None and not args.constrain:
        model_config = dict2config({'name': 'SNAS', 'C': xargs.channel, 'N': xargs.num_cells,
                                    'max_nodes': xargs.max_nodes, 'num_classes': class_num,
                                    'space': search_space, 'inp_size': 0, 'affine': False,
                                    'track_running_stats': bool(xargs.track_running_stats)}, None)
    elif xargs.model_config is None:
        model_config = dict2config({'name': 'SNAS', 'C': xargs.channel, 'N': xargs.num_cells,
                                    'max_nodes': xargs.max_nodes, 'num_classes': class_num,
                                    'space': search_space, 'inp_size': 32, 'affine': False,
                                    'track_running_stats': bool(xargs.track_running_stats)}, None)
    else:
        model_config = load_config(xargs.model_config,
                                   {'num_classes': class_num, 'space': search_space, 'affine': False,
                                    'track_running_stats': bool(xargs.track_running_stats)}, None)
    search_model = get_cell_based_tiny_net(model_config)
    # logger.log('search-model :\n{:}'.format(search_model))
    logger.log('model-config : {:}'.format(model_config))

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(), lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999), weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    # logger.log('{:}'.format(search_model))
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    logger.log('search-space [{:} ops] : {:}'.format(len(search_space), search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path('info'), logger.path('model'), logger.path('best')
    network, criterion = torch.nn.DataParallel(search_model).cuda(), criterion.cuda()
    # network, criterion = search_model.cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint = torch.load(last_info['last_checkpoint'])
        genotypes = checkpoint['genotypes']
        valid_accuracies = checkpoint['valid_accuracies']
        search_model.load_state_dict(checkpoint['search_model'])
        w_scheduler.load_state_dict(checkpoint['w_scheduler'])
        w_optimizer.load_state_dict(checkpoint['w_optimizer'])
        a_optimizer.load_state_dict(checkpoint['a_optimizer'])
        logger.log("=> loading checkpoint of the last-info '{:}' start with {:}-th epoch.".format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes = 0, {'best': -1}, {-1: search_model.genotype()}

    # start training
    start_time, search_time, epoch_time, total_epoch = time.time(), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    sampled_weights = []
    for epoch in range(start_epoch, total_epoch + config.t_epochs):
        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch + config.t_epochs), True))
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        search_model.set_tau(xargs.tau_max - (xargs.tau_max - xargs.tau_min) * epoch / (total_epoch - 1))
        logger.log('\n[Search the {:}-th epoch] {:}, tau={:}, LR={:}'.format(
            epoch_str, need_time, search_model.get_tau(), min(w_scheduler.get_lr())))

        if epoch < total_epoch:
            search_w_loss, search_w_top1, search_w_top5, valid_a_loss, valid_a_top1, valid_a_top5 = \
                search_func(search_loader, network, criterion, w_scheduler, w_optimizer,
                            a_optimizer, epoch_str, xargs.print_freq, logger, xargs.bilevel)
        else:
            try:
                search_w_loss, search_w_top1, search_w_top5, valid_a_loss, valid_a_top1, valid_a_top5, arch_iter = \
                    train_func(search_loader, network, criterion, w_scheduler, w_optimizer,
                               epoch_str, xargs.print_freq, sampled_weights[0], arch_iter, logger)
            except IndexError:
                weights = search_model.sample_weights(100)
                sampled_weights.append(weights)
                arch_iter = iter(weights)
                search_w_loss, search_w_top1, search_w_top5, valid_a_loss, valid_a_top1, valid_a_top5, arch_iter = \
                    train_func(search_loader, network, criterion, w_scheduler, w_optimizer,
                               epoch_str, xargs.print_freq, sampled_weights[0], arch_iter, logger)

        search_time.update(time.time() - start_time)
        logger.log('[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s'.format(
            epoch_str, search_w_loss, search_w_top1, search_w_top5, search_time.sum))
        logger.log('[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(
            epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))

        if (epoch + 1) % 50 == 0 and not config.t_epochs:
            weights = search_model.sample_weights(100)
            sampled_weights.append(weights)
        elif (epoch + 1) == total_epoch and config.t_epochs:
            weights = search_model.sample_weights(100)
            sampled_weights.append(weights)
            arch_iter = iter(weights)

        # validate with single arch
        single_weight = search_model.sample_weights(1)[0]
        single_valid_acc = AverageMeter()
        network.eval()
        for i in range(10):
            try:
                val_input, val_target = next(valid_iter)
            except Exception as e:
                valid_iter = iter(valid_loader)
                val_input, val_target = next(valid_iter)
            n_val = val_input.size(0)
            with torch.no_grad():
                val_target = val_target.cuda(non_blocking=True)
                _, logits, _ = network(val_input, weights=single_weight)
                val_acc1, val_acc5 = obtain_accuracy(logits.data, val_target.data, topk=(1, 5))
                single_valid_acc.update(val_acc1.item(), n_val)
        logger.log('[{:}] valid : accuracy = {:.2f}'.format(epoch_str, single_valid_acc.avg))

        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1
        if valid_a_top1 > valid_accuracies['best']:
            valid_accuracies['best'] = valid_a_top1
            genotypes['best'] = search_model.genotype()
            find_best = True
        else:
            find_best = False

        if epoch < total_epoch:
            genotypes[epoch] = search_model.genotype()
            logger.log('<<<--->>> The {:}-th epoch : {:}'.format(epoch_str, genotypes[epoch]))

        # save checkpoint
        save_path = save_checkpoint({'epoch': epoch + 1,
                                     'args': deepcopy(xargs),
                                     'search_model': search_model.state_dict(),
                                     'w_optimizer': w_optimizer.state_dict(),
                                     'a_optimizer': a_optimizer.state_dict(),
                                     'w_scheduler': w_scheduler.state_dict(),
                                     'genotypes': genotypes,
                                     'valid_accuracies': valid_accuracies},
                                    model_base_path, logger)
        last_info = save_checkpoint({'epoch': epoch + 1,
                                     'args': deepcopy(args),
                                     'last_checkpoint': save_path},
                                    logger.path('info'), logger)
        if find_best:
            logger.log('<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.'.format(
                epoch_str, valid_a_top1))
            copy_checkpoint(model_base_path, model_best_path, logger)
        with torch.no_grad():
            logger.log('{:}'.format(search_model.show_alphas()))
        if api is not None and epoch < total_epoch:
            logger.log('{:}'.format(api.query_by_arch(genotypes[epoch])))

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    network.eval()
    # Evaluate the architectures sampled throughout the search
    for i in range(len(sampled_weights) - 1):
        logger.log('Sample eval : epoch {}'.format((i + 1) * 50 - 1))
        for w in sampled_weights[i]:
            sample_valid_acc = AverageMeter()
            for i in range(10):
                try:
                    val_input, val_target = next(valid_iter)
                except Exception as e:
                    valid_iter = iter(valid_loader)
                    val_input, val_target = next(valid_iter)
                n_val = val_input.size(0)
                with torch.no_grad():
                    val_target = val_target.cuda(non_blocking=True)
                    _, logits, _ = network(val_input, weights=w)
                    val_acc1, val_acc5 = obtain_accuracy(logits.data, val_target.data, topk=(1, 5))
                    sample_valid_acc.update(val_acc1.item(), n_val)
            w_gene = search_model.genotype(w)
            if api is not None:
                ind = api.query_index_by_arch(w_gene)
                info = api.query_meta_info_by_index(ind)
                metrics = info.get_metrics('cifar10', 'ori-test')
                acc = metrics['accuracy']
            else:
                acc = 0.0
            logger.log('sample valid : val_acc = {:.2f} test_acc = {:.2f}'.format(sample_valid_acc.avg, acc))

    # Evaluate the final sampling separately to find the top 10 architectures
    logger.log('Final sample eval')
    final_archs = []
    for w in sampled_weights[-1]:
        sample_valid_acc = AverageMeter()
        for i in range(10):
            try:
                val_input, val_target = next(valid_iter)
            except Exception as e:
                valid_iter = iter(valid_loader)
                val_input, val_target = next(valid_iter)
            n_val = val_input.size(0)
            with torch.no_grad():
                val_target = val_target.cuda(non_blocking=True)
                _, logits, _ = network(val_input, weights=w)
                val_acc1, val_acc5 = obtain_accuracy(logits.data, val_target.data, topk=(1, 5))
                sample_valid_acc.update(val_acc1.item(), n_val)
        w_gene = search_model.genotype(w)
        if api is not None:
            ind = api.query_index_by_arch(w_gene)
            info = api.query_meta_info_by_index(ind)
            metrics = info.get_metrics('cifar10', 'ori-test')
            acc = metrics['accuracy']
        else:
            acc = 0.0
        logger.log('sample valid : val_acc = {:.2f} test_acc = {:.2f}'.format(sample_valid_acc.avg, acc))
        final_archs.append((w, sample_valid_acc.avg))
    top_10 = sorted(final_archs, key=lambda x: x[1], reverse=True)[:10]

    # Evaluate the top 10 architectures on the entire validation set
    logger.log('Evaluating top archs')
    for w, prev_acc in top_10:
        full_valid_acc = AverageMeter()
        for val_input, val_target in valid_loader:
            n_val = val_input.size(0)
            with torch.no_grad():
                val_target = val_target.cuda(non_blocking=True)
                _, logits, _ = network(val_input, weights=w)
                val_acc1, val_acc5 = obtain_accuracy(logits.data, val_target.data, topk=(1, 5))
                full_valid_acc.update(val_acc1.item(), n_val)
        w_gene = search_model.genotype(w)
        logger.log('genotype {}'.format(w_gene))
        if api is not None:
            ind = api.query_index_by_arch(w_gene)
            info = api.query_meta_info_by_index(ind)
            metrics = info.get_metrics('cifar10', 'ori-test')
            acc = metrics['accuracy']
        else:
            acc = 0.0
        logger.log('full valid : val_acc = {:.2f} test_acc = {:.2f} pval_acc = {:.2f}'.format(
            full_valid_acc.avg, acc, prev_acc))

    logger.log('\n' + '-' * 100)
    # check the performance from the architecture dataset
    logger.log('SNAS : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(
        total_epoch, search_time.sum, genotypes[total_epoch - 1]))
    if api is not None:
        logger.log('{:}'.format(api.query_by_arch(genotypes[total_epoch - 1])))
    logger.close()
def main():
    torch.set_num_threads(3)
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    if 'debug' not in args.save:
        api = API('pth file path')

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    if args.method == 'gdas' or args.method == 'snas':
        # Create the decrease step for the gumbel softmax temperature
        tau_step = (args.tau_min - args.tau_max) / args.epochs
        tau_epoch = args.tau_max
        if args.method == 'gdas':
            model = TinyNetworkGDAS(C=args.init_channels, N=5, max_nodes=4, num_classes=n_classes,
                                    criterion=criterion, search_space=NAS_BENCH_201)
        else:
            model = TinyNetwork(C=args.init_channels, N=5, max_nodes=4, num_classes=n_classes,
                                criterion=criterion, search_space=NAS_BENCH_201, k=args.k,
                                species='gumbel')
    elif args.method == 'dirichlet':
        model = TinyNetwork(C=args.init_channels, N=5, max_nodes=4, num_classes=n_classes,
                            criterion=criterion, search_space=NAS_BENCH_201, k=args.k,
                            species='dirichlet')
    elif args.method == 'darts':
        model = TinyNetwork(C=args.init_channels, N=5, max_nodes=4, num_classes=n_classes,
                            criterion=criterion, search_space=NAS_BENCH_201, k=args.k,
                            species='softmax')
    model = model.cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    optimizer = torch.optim.SGD(model.get_weights(), args.learning_rate,
                                momentum=args.momentum, weight_decay=args.weight_decay)

    if args.dataset == 'cifar10':
        train_transform, valid_transform = utils._data_transforms_cifar10(args)
        train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform)
    elif args.dataset == 'cifar100':
        train_transform, valid_transform = utils._data_transforms_cifar100(args)
        train_data = dset.CIFAR100(root=args.data, train=True, download=True, transform=train_transform)
    elif args.dataset == 'svhn':
        train_transform, valid_transform = utils._data_transforms_svhn(args)
        train_data = dset.SVHN(root=args.data, split='train', download=True, transform=train_transform)
    elif args.dataset == 'imagenet16-120':
        import torchvision.transforms as transforms
        from nasbench201.DownsampledImageNet import ImageNet16
        mean = [x / 255 for x in [122.68, 116.66, 104.01]]
        std = [x / 255 for x in [63.22, 61.26, 65.09]]
        lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(16, padding=2),
                 transforms.ToTensor(), transforms.Normalize(mean, std)]
        train_transform = transforms.Compose(lists)
        train_data = ImageNet16(root=os.path.join(args.data, 'imagenet16'), train=True,
                                transform=train_transform, use_num_of_class_only=120)
        assert len(train_data) == 151700

    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True)
    valid_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    architect = Architect(model, args)

    for epoch in range(args.epochs):
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        genotype = model.genotype()
        logging.info('genotype = %s', genotype)
        model.show_arch_parameters()

        # training
        train_acc, train_obj = train(train_queue, valid_queue, model, architect,
                                     criterion, optimizer, lr, epoch)
        logging.info('train_acc %f', train_acc)

        # validation
        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc %f', valid_acc)

        if 'debug' not in args.save:
            # nasbench201
            result = api.query_by_arch(model.genotype())
            logging.info('{:}'.format(result))
            cifar10_train, cifar10_test, cifar100_train, cifar100_valid, \
                cifar100_test, imagenet16_train, imagenet16_valid, imagenet16_test = distill(result)
            logging.info('cifar10 train %f test %f', cifar10_train, cifar10_test)
            logging.info('cifar100 train %f valid %f test %f',
                         cifar100_train, cifar100_valid, cifar100_test)
            logging.info('imagenet16 train %f valid %f test %f',
                         imagenet16_train, imagenet16_valid, imagenet16_test)

            # tensorboard
            writer.add_scalars('accuracy', {'train': train_acc, 'valid': valid_acc}, epoch)
            writer.add_scalars('loss', {'train': train_obj, 'valid': valid_obj}, epoch)
            writer.add_scalars('nasbench201/cifar10',
                               {'train': cifar10_train, 'test': cifar10_test}, epoch)
            writer.add_scalars('nasbench201/cifar100',
                               {'train': cifar100_train, 'valid': cifar100_valid, 'test': cifar100_test}, epoch)
            writer.add_scalars('nasbench201/imagenet16',
                               {'train': imagenet16_train, 'valid': imagenet16_valid, 'test': imagenet16_test}, epoch)

        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optimizer': optimizer.state_dict(),
                               'alpha': model.arch_parameters()},
                              False, args.save)

        scheduler.step()
        if args.method == 'gdas' or args.method == 'snas':
            # Decrease the temperature for the gumbel softmax linearly
            tau_epoch += tau_step
            logging.info('tau %f', tau_epoch)
            model.set_tau(tau_epoch)

    writer.close()
def __init__(self): """Construct the Nasbench201 class.""" super(Nasbench201, self).__init__() self.args.data_path = FileOps.download_dataset(self.args.data_path) self.nasbench201_api = API('self.args.data_path')
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(xargs.dataset, xargs.data_path, -1)
    if xargs.overwite_epochs is None:
        extra_info = {'class_num': class_num, 'xshape': xshape}
    else:
        extra_info = {'class_num': class_num, 'xshape': xshape, 'epochs': xargs.overwite_epochs}
    config = load_config(xargs.config_path, extra_info, logger)
    search_loader, train_loader, valid_loader = get_nas_search_loaders(
        train_data, valid_data, xargs.dataset, 'configs/nas-benchmark/',
        (config.batch_size, config.test_batch_size), xargs.workers)
    logger.log('||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'.format(
        xargs.dataset, len(search_loader), len(valid_loader), config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(xargs.dataset, config))

    search_space = get_search_spaces(xargs.search_space, 'nas-bench-301')
    model_config = dict2config(
        dict(name='generic', super_type='search-shape',
             candidate_Cs=search_space['candidates'], max_num_Cs=search_space['numbers'],
             num_classes=class_num, genotype=args.genotype, affine=bool(xargs.affine),
             track_running_stats=bool(xargs.track_running_stats)), None)
    logger.log('search space : {:}'.format(search_space))
    logger.log('model config : {:}'.format(model_config))
    search_model = get_cell_based_tiny_net(model_config)
    search_model.set_algo(xargs.algo)
    logger.log('{:}'.format(search_model))

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(search_model.weights, config)
    a_optimizer = torch.optim.Adam(search_model.alphas, lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999), weight_decay=xargs.arch_weight_decay,
                                   eps=xargs.arch_eps)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion : {:}'.format(criterion))
    params = count_parameters_in_MB(search_model)
    logger.log('The parameters of the search model = {:.2f} MB'.format(params))
    logger.log('search-space : {:}'.format(search_space))
    if bool(xargs.use_api):
        api = API(verbose=False)
    else:
        api = None
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path('info'), logger.path('model'), logger.path('best')
    network, criterion = search_model.cuda(), criterion.cuda()  # use a single GPU

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint = torch.load(last_info['last_checkpoint'])
        genotypes = checkpoint['genotypes']
        valid_accuracies = checkpoint['valid_accuracies']
        search_model.load_state_dict(checkpoint['search_model'])
        w_scheduler.load_state_dict(checkpoint['w_scheduler'])
        w_optimizer.load_state_dict(checkpoint['w_optimizer'])
        a_optimizer.load_state_dict(checkpoint['a_optimizer'])
        logger.log("=> loading checkpoint of the last-info '{:}' start with {:}-th epoch.".format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes = 0, {'best': -1}, {-1: network.random}

    # start training
    start_time, search_time, epoch_time, total_epoch = time.time(), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        logger.log('\n[Search the {:}-th epoch] {:}, LR={:}'.format(epoch_str, need_time, min(w_scheduler.get_lr())))

        if xargs.algo == 'fbv2' or xargs.algo == 'tas':
            network.set_tau(xargs.tau_max - (xargs.tau_max - xargs.tau_min) * epoch / (total_epoch - 1))
            logger.log('[RESET tau as : {:}]'.format(network.tau))

        search_w_loss, search_w_top1, search_w_top5, search_a_loss, search_a_top1, search_a_top5 = \
            search_func(search_loader, network, criterion, w_scheduler, w_optimizer,
                        a_optimizer, xargs.algo, epoch_str, xargs.print_freq, logger)
        search_time.update(time.time() - start_time)
        logger.log('[{:}] search [base] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s'.format(
            epoch_str, search_w_loss, search_w_top1, search_w_top5, search_time.sum))
        logger.log('[{:}] search [arch] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(
            epoch_str, search_a_loss, search_a_top1, search_a_top5))

        genotype = network.genotype
        logger.log('[{:}] - [get_best_arch] : {:}'.format(epoch_str, genotype))
        valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(valid_loader, network, criterion, logger)
        logger.log('[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}% | {:}'.format(
            epoch_str, valid_a_loss, valid_a_top1, valid_a_top5, genotype))
        valid_accuracies[epoch] = valid_a_top1
        genotypes[epoch] = genotype
        logger.log('<<<--->>> The {:}-th epoch : {:}'.format(epoch_str, genotypes[epoch]))

        # save checkpoint
        save_path = save_checkpoint({'epoch': epoch + 1,
                                     'args': deepcopy(xargs),
                                     'search_model': search_model.state_dict(),
                                     'w_optimizer': w_optimizer.state_dict(),
                                     'a_optimizer': a_optimizer.state_dict(),
                                     'w_scheduler': w_scheduler.state_dict(),
                                     'genotypes': genotypes,
                                     'valid_accuracies': valid_accuracies},
                                    model_base_path, logger)
        last_info = save_checkpoint({'epoch': epoch + 1,
                                     'args': deepcopy(args),
                                     'last_checkpoint': save_path},
                                    logger.path('info'), logger)
        with torch.no_grad():
            logger.log('{:}'.format(search_model.show_alphas()))
        if api is not None:
            logger.log('{:}'.format(api.query_by_arch(genotypes[epoch], '90')))

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    # the final post procedure : count the time
    start_time = time.time()
    genotype = network.genotype
    search_time.update(time.time() - start_time)
    valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(valid_loader, network, criterion, logger)
    logger.log('Last : the genotype is : {:}, with the validation accuracy of {:.3f}%.'.format(
        genotype, valid_a_top1))

    logger.log('\n' + '-' * 100)
    # check the performance from the architecture dataset
    logger.log('[{:}] run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(
        xargs.algo, total_epoch, search_time.sum, genotype))
    if api is not None:
        logger.log('{:}'.format(api.query_by_arch(genotype, '90')))
    logger.close()
def main(xargs):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(xargs.dataset, xargs.data_path, -1)
    # config_path = 'configs/nas-benchmark/algos/GDAS.config'
    config = load_config(xargs.config_path, {'class_num': class_num, 'xshape': xshape}, logger)
    search_loader, _, valid_loader = get_nas_search_loaders(train_data, valid_data, xargs.dataset,
                                                            'configs/nas-benchmark/', config.batch_size, xargs.workers)

    if xargs.ood_inner or xargs.ood_outer:
        mean = [x / 255 for x in [125.3, 123.0, 113.9]]
        std = [x / 255 for x in [63.0, 62.1, 66.7]]
        # lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4),
        #          transforms.ToTensor(), transforms.Normalize(mean, std)]
        lists = [transforms.ToTensor(), transforms.Normalize(mean, std)]
        # lists += [CUTOUT(-1)]
        ood_transform = transforms.Compose(lists)
        ood_data = dset.SVHN(root=args.data_path, split='train', download=True, transform=ood_transform)
        ood_loader = torch.utils.data.DataLoader(
            ood_data, batch_size=config.batch_size,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(
                list(range(len(ood_data)))[:len(train_data)]),
            pin_memory=True, num_workers=xargs.workers)
    else:
        ood_loader = None

    logger.log('||||||| {:10s} ||||||| Search-Loader-Num={:}, batch size={:}'.format(
        xargs.dataset, len(search_loader), config.batch_size))
    logger.log('||||||| {:10s} ||||||| Config={:}'.format(xargs.dataset, config))

    global search_space
    search_space = get_search_spaces('cell', xargs.search_space_name)
    if xargs.model_config is None:
        model_config = dict2config({'name': 'GDAS', 'C': xargs.channel, 'N': xargs.num_cells,
                                    'max_nodes': xargs.max_nodes, 'num_classes': class_num,
                                    'space': search_space, 'affine': False,
                                    'track_running_stats': bool(xargs.track_running_stats)}, None)
    else:
        model_config = load_config(xargs.model_config,
                                   {'num_classes': class_num, 'space': search_space, 'affine': False,
                                    'track_running_stats': bool(xargs.track_running_stats)}, None)
    search_model = get_cell_based_tiny_net(model_config)
    # logger.log('search-model :\n{:}'.format(search_model))
    logger.log('model-config : {:}'.format(model_config))

    w_optimizer, w_scheduler, criterion = get_optim_scheduler(search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(), lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999), weight_decay=xargs.arch_weight_decay)
    logger.log('w-optimizer : {:}'.format(w_optimizer))
    logger.log('a-optimizer : {:}'.format(a_optimizer))
    logger.log('w-scheduler : {:}'.format(w_scheduler))
    logger.log('criterion : {:}'.format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    logger.log('search-space [{:} ops] : {:}'.format(len(search_space), search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log('{:} create API = {:} done'.format(time_string(), api))

    last_info, model_base_path, model_best_path = logger.path('info'), logger.path('model'), logger.path('best')
    # network, criterion = torch.nn.DataParallel(search_model).cuda(), criterion.cuda()
    network, criterion = search_model.cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch']
        checkpoint = torch.load(last_info['last_checkpoint'])
        genotypes = checkpoint['genotypes']
        valid_accuracies = checkpoint['valid_accuracies']
        search_model.load_state_dict(checkpoint['search_model'])
        w_scheduler.load_state_dict(checkpoint['w_scheduler'])
        w_optimizer.load_state_dict(checkpoint['w_optimizer'])
        a_optimizer.load_state_dict(checkpoint['a_optimizer'])
        logger.log("=> loading checkpoint of the last-info '{:}' start with {:}-th epoch.".format(last_info, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes = 0, {'best': -1}, {-1: search_model.genotype()}

    # start training
    start_time, search_time, epoch_time, total_epoch = time.time(), AverageMeter(), AverageMeter(), config.epochs + config.warmup
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = 'Time Left: {:}'.format(convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch)
        search_model.set_tau(xargs.tau_max - (xargs.tau_max - xargs.tau_min) * epoch / (total_epoch - 1))
        logger.log('\n[Search the {:}-th epoch] {:}, tau={:}, LR={:}'.format(
            epoch_str, need_time, search_model.get_tau(), min(w_scheduler.get_lr())))

        search_w_loss, search_w_top1, search_w_top5, valid_a_loss, valid_a_top1, valid_a_top5 = \
            search_func(search_loader, network, criterion, w_scheduler, w_optimizer,
                        a_optimizer, epoch_str, xargs, logger, ood_loader)
        search_time.update(time.time() - start_time)
        logger.log('[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s'.format(
            epoch_str, search_w_loss, search_w_top1, search_w_top5, search_time.sum))
        logger.log('[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(
            epoch_str, valid_a_loss, valid_a_top1, valid_a_top5))

        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1
        if valid_a_top1 > valid_accuracies['best']:
            valid_accuracies['best'] = valid_a_top1
            genotypes['best'] = search_model.genotype()
            find_best = True
        else:
            find_best = False

        genotypes[epoch] = search_model.genotype()
        logger.log('<<<--->>> The {:}-th epoch : {:}'.format(epoch_str, genotypes[epoch]))

        # save checkpoint
        save_path = save_checkpoint({'epoch': epoch + 1,
                                     'args': deepcopy(xargs),
                                     'search_model': search_model.state_dict(),
                                     'w_optimizer': w_optimizer.state_dict(),
                                     'a_optimizer': a_optimizer.state_dict(),
                                     'w_scheduler': w_scheduler.state_dict(),
                                     'genotypes': genotypes,
                                     'valid_accuracies': valid_accuracies},
                                    model_base_path, logger)
        last_info = save_checkpoint({'epoch': epoch + 1,
                                     'args': deepcopy(args),
                                     'last_checkpoint': save_path},
                                    logger.path('info'), logger)
        if find_best:
            logger.log('<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.'.format(
                epoch_str, valid_a_top1))
            copy_checkpoint(model_base_path, model_best_path, logger)
        with torch.no_grad():
            logger.log('{:}'.format(search_model.show_alphas()))
        if api is not None:
            logger.log('{:}'.format(api.query_by_arch(genotypes[epoch], '200')))

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log('\n' + '-' * 100)
    # check the performance from the architecture dataset
    logger.log('GDAS : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(
        total_epoch, search_time.sum, genotypes[total_epoch - 1]))
    if api is not None:
        logger.log('{:}'.format(api.query_by_arch(genotypes[total_epoch - 1], '200')))
    logger.close()
import xnas.core.optimizer as optim import xnas.datasets.loader as loader from xnas.core.config import cfg from xnas.search_space.cell_based_nasben1shot import INPUT, OUTPUT, CONV1X1, CONV3X3, MAXPOOL3X3, OUTPUT_NODE import sys from nas_201_api import NASBench201API as API from nasbench import api import ConfigSpace logger = logging.get_logger(__name__) nasbench1shot1_path = 'benchmark/nasbench_full.tfrecord' nasbench201_path = 'benchmark/NAS-Bench-102-v1_0-e61699.pth' api_nasben201 = API(nasbench201_path, verbose=False) nasbench = api.NASBench(nasbench1shot1_path) def setup_env(): """Sets up environment for training or testing.""" if dist.is_master_proc(): # Ensure that the output dir exists os.makedirs(cfg.OUT_DIR, exist_ok=True) # Save the config config.dump_cfg() # Setup logging logging.setup_logging() # Log the config as both human readable and as a json logger.info("Config:\n{}".format(cfg)) logger.info(logging.dump_log_data(cfg, "cfg"))
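For reference, the NAS-Bench-201 API loaded above can be queried directly by index or by architecture string; a small usage sketch follows (the architecture string is only an illustrative genotype, and exact method signatures vary slightly across nas_201_api versions).

# Number of architectures in the full NAS-Bench-201 benchmark (15625).
print(len(api_nasben201))
# Query by index returns the stored results object for that architecture.
info = api_nasben201.query_by_index(0)
# Query by architecture string returns a human-readable summary.
arch = '|nor_conv_3x3~0|+|nor_conv_3x3~0|skip_connect~1|+|skip_connect~0|skip_connect~1|nor_conv_1x1~2|'
print(api_nasben201.query_by_arch(arch))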
def main(): torch.set_num_threads(3) if not torch.cuda.is_available(): logging.info('no gpu device available') sys.exit(1) np.random.seed(args.seed) torch.cuda.set_device(args.gpu) cudnn.benchmark = True torch.manual_seed(args.seed) cudnn.enabled = True torch.cuda.manual_seed(args.seed) logging.info('gpu device = %d' % args.gpu) logging.info("args = %s", args) if args.perturb_alpha == 'none': perturb_alpha = None elif args.perturb_alpha == 'pgd_linf': perturb_alpha = Linf_PGD_alpha elif args.perturb_alpha == 'random': perturb_alpha = Random_alpha api = API('/nfs/data/xiangning/data/NAS-Bench-201-v1_0-e61699.pth') criterion = nn.CrossEntropyLoss() criterion = criterion.cuda() model = Network(C=args.init_channels, N=5, max_nodes=4, num_classes=n_classes, criterion=criterion) model = model.cuda() logging.info("param size = %fMB", utils.count_parameters_in_MB(model)) optimizer = torch.optim.SGD( model.parameters(), args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) if args.dataset == 'cifar10': train_transform, valid_transform = utils._data_transforms_cifar10(args) train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform) elif args.dataset == 'cifar100': train_transform, valid_transform = utils._data_transforms_cifar100(args) train_data = dset.CIFAR100(root=args.data, train=True, download=True, transform=train_transform) elif args.dataset == 'svhn': train_transform, valid_transform = utils._data_transforms_svhn(args) train_data = dset.SVHN(root=args.data, split='train', download=True, transform=train_transform) num_train = len(train_data) indices = list(range(num_train)) split = int(np.floor(args.train_portion * num_train)) if 'debug' in args.save: split = args.batch_size num_train = 2 * args.batch_size train_queue = torch.utils.data.DataLoader( train_data, batch_size=args.batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]), pin_memory=True) valid_queue = torch.utils.data.DataLoader( train_data, batch_size=args.batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]), pin_memory=True) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, float(args.epochs), eta_min=args.learning_rate_min) architect = Architect(model, args) for epoch in range(args.epochs): scheduler.step() lr = scheduler.get_lr()[0] if args.cutout: # increase the cutout probability linearly throughout search train_transform.transforms[-1].cutout_prob = args.cutout_prob * epoch / (args.epochs - 1) logging.info('epoch %d lr %e cutout_prob %e', epoch, lr, train_transform.transforms[-1].cutout_prob) else: logging.info('epoch %d lr %e', epoch, lr) if args.perturb_alpha: epsilon_alpha = 0.03 + (args.epsilon_alpha - 0.03) * epoch / args.epochs logging.info('epoch %d epsilon_alpha %e', epoch, epsilon_alpha) genotype = model.genotype() logging.info('genotype = %s', genotype) model.show_alphas() # training train_acc, train_obj = train(train_queue, valid_queue, model, architect, criterion, optimizer, lr, perturb_alpha, epsilon_alpha) logging.info('train_acc %f', train_acc) writer.add_scalar('Acc/train', train_acc, epoch) writer.add_scalar('Obj/train', train_obj, epoch) # validation valid_acc, valid_obj = infer(valid_queue, model, criterion) logging.info('valid_acc %f', valid_acc) writer.add_scalar('Acc/valid', valid_acc, epoch) writer.add_scalar('Obj/valid', valid_obj, epoch) # nasbench201 result = api.query_by_arch(model.genotype()) logging.info('{:}'.format(result)) cifar10_train, cifar10_test, 
cifar100_train, cifar100_valid, \ cifar100_test, imagenet16_train, imagenet16_valid, imagenet16_test = distill(result) writer.add_scalars('nasbench201/cifar10', {'train':cifar10_train,'test':cifar10_test}, epoch) writer.add_scalars('nasbench201/cifar100', {'train':cifar100_train,'valid':cifar100_valid, 'test':cifar100_test}, epoch) writer.add_scalars('nasbench201/imagenet16', {'train':imagenet16_train,'valid':imagenet16_valid, 'test':imagenet16_test}, epoch) utils.save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'alpha': model.arch_parameters() }, False, args.save) writer.close()
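The distill helper used above is defined elsewhere in this file; conceptually it unpacks the summary string returned by api.query_by_arch into the eight accuracy numbers written to TensorBoard. A hypothetical stand-in under that assumption (distill_sketch and its field order are illustrative, not the original implementation, which may parse specific lines instead):

import re

def distill_sketch(result):
    # Pull every 'accuracy=XX.XX' figure out of the benchmark summary string,
    # in order of appearance.
    accs = [float(x) for x in re.findall(r'accuracy=([\d.]+)', result)]
    # Assumed order: cifar10 train/test, cifar100 train/valid/test,
    # imagenet16 train/valid/test.
    return accs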
default=None, help='The path to the NAS-Bench-201 benchmark file.') args = parser.parse_args() vis_save_dir = Path(args.save_dir) vis_save_dir.mkdir(parents=True, exist_ok=True) meta_file = Path(args.api_path) assert meta_file.exists(), 'invalid path for api : {:}'.format(meta_file) #visualize_rank_over_time(str(meta_file), vis_save_dir / 'over-time') #write_video(vis_save_dir / 'over-time') #visualize_info(str(meta_file), 'cifar10' , vis_save_dir) #visualize_info(str(meta_file), 'cifar100', vis_save_dir) #visualize_info(str(meta_file), 'ImageNet16-120', vis_save_dir) #visualize_relative_ranking(vis_save_dir) api = API(args.api_path) #show_reinforce(api, vis_save_dir, 'cifar10-valid' , 'x-valid', 'REINFORCE-CIFAR-10', (85, 92, 2)) #show_rea (api, vis_save_dir, 'cifar10-valid' , 'x-valid', 'REA-CIFAR-10', (88, 92, 1)) #plot_results_nas_v2(api, ('cifar10-valid' , 'x-valid'), ('cifar10' , 'ori-test'), vis_save_dir, 'nas-com-v2-cifar010.pdf', (85,95, 1)) #plot_results_nas_v2(api, ('cifar100' , 'x-valid'), ('cifar100' , 'x-test' ), vis_save_dir, 'nas-com-v2-cifar100.pdf', (60,75, 3)) #plot_results_nas_v2(api, ('ImageNet16-120', 'x-valid'), ('ImageNet16-120', 'x-test' ), vis_save_dir, 'nas-com-v2-imagenet.pdf', (35,48, 2)) show_nas_sharing_w_v2(api, ('cifar10-valid', 'x-valid'), ('cifar10', 'ori-test'), vis_save_dir, 'BN0', 'BN0-DARTS-CIFAR010.pdf', (0, 100, 10), 50) show_nas_sharing_w_v2(api, ('cifar100', 'x-valid'), ('cifar100', 'x-test'), vis_save_dir, 'BN0', 'BN0-DARTS-CIFAR100.pdf', (0, 100, 10), 50) show_nas_sharing_w_v2(api, ('ImageNet16-120', 'x-valid'), ('ImageNet16-120', 'x-test'), vis_save_dir, 'BN0',
def main(xargs): assert torch.cuda.is_available(), 'CUDA is not available.' torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(xargs.workers) prepare_seed(xargs.rand_seed) logger = prepare_logger(args) train_data, valid_data, xshape, class_num = get_datasets( xargs.dataset, xargs.data_path, -1) if xargs.dataset == 'cifar10' or xargs.dataset == 'cifar100': split_Fpath = 'configs/nas-benchmark/cifar-split.txt' cifar_split = load_config(split_Fpath, None, None) train_split, valid_split = cifar_split.train, cifar_split.valid logger.log('Load split file from {:}'.format(split_Fpath)) elif xargs.dataset.startswith('ImageNet16'): split_Fpath = 'configs/nas-benchmark/{:}-split.txt'.format( xargs.dataset) imagenet16_split = load_config(split_Fpath, None, None) train_split, valid_split = imagenet16_split.train, imagenet16_split.valid logger.log('Load split file from {:}'.format(split_Fpath)) else: raise ValueError('invalid dataset : {:}'.format(xargs.dataset)) config_path = 'configs/nas-benchmark/algos/DARTS.config' config = load_config(config_path, { 'class_num': class_num, 'xshape': xshape }, logger) # To split data train_data_v2 = deepcopy(train_data) train_data_v2.transform = valid_data.transform valid_data = train_data_v2 search_data = SearchDataset(xargs.dataset, train_data, train_split, valid_split) # data loader search_loader = torch.utils.data.DataLoader(search_data, batch_size=config.batch_size, shuffle=True, num_workers=xargs.workers, pin_memory=True) valid_loader = torch.utils.data.DataLoader( valid_data, batch_size=config.batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split), num_workers=xargs.workers, pin_memory=True) logger.log( '||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}' .format(xargs.dataset, len(search_loader), len(valid_loader), config.batch_size)) logger.log('||||||| {:10s} ||||||| Config={:}'.format( xargs.dataset, config)) search_space = get_search_spaces('cell', xargs.search_space_name) model_config = dict2config( { 'name': 'DARTS-V2', 'C': xargs.channel, 'N': xargs.num_cells, 'max_nodes': xargs.max_nodes, 'num_classes': class_num, 'space': search_space }, None) search_model = get_cell_based_tiny_net(model_config) logger.log('search-model :\n{:}'.format(search_model)) w_optimizer, w_scheduler, criterion = get_optim_scheduler( search_model.get_weights(), config) a_optimizer = torch.optim.Adam(search_model.get_alphas(), lr=xargs.arch_learning_rate, betas=(0.5, 0.999), weight_decay=xargs.arch_weight_decay) logger.log('w-optimizer : {:}'.format(w_optimizer)) logger.log('a-optimizer : {:}'.format(a_optimizer)) logger.log('w-scheduler : {:}'.format(w_scheduler)) logger.log('criterion : {:}'.format(criterion)) flop, param = get_model_infos(search_model, xshape) #logger.log('{:}'.format(search_model)) logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param)) if xargs.arch_nas_dataset is None: api = None else: api = API(xargs.arch_nas_dataset) logger.log('{:} create API = {:} done'.format(time_string(), api)) last_info, model_base_path, model_best_path = logger.path( 'info'), logger.path('model'), logger.path('best') network, criterion = torch.nn.DataParallel( search_model).cuda(), criterion.cuda() logger.close()
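dict2config (from the surrounding codebase) wraps a plain dict so its keys read as attributes; a rough stand-in under that assumption:

from types import SimpleNamespace

def dict2config_sketch(d, logger=None):
    # Expose dict keys as attributes so callers can write cfg.name, cfg.C, ...
    cfg = SimpleNamespace(**d)
    if logger is not None:
        logger.log('Config: {:}'.format(cfg))
    return cfg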
def visualize_info(meta_file, dataset, vis_save_dir): print('{:} start to visualize {:} information'.format( time_string(), dataset)) cache_file_path = vis_save_dir / '{:}-cache-info.pth'.format(dataset) if not cache_file_path.exists(): print('Do not find cache file : {:}'.format(cache_file_path)) nas_bench = API(str(meta_file)) params, flops, train_accs, valid_accs, test_accs, otest_accs = [], [], [], [], [], [] for index in range(len(nas_bench)): info = nas_bench.query_by_index(index, use_12epochs_result=False) resx = info.get_comput_costs(dataset) flop, param = resx['flops'], resx['params'] if dataset == 'cifar10': res = info.get_metrics('cifar10', 'train') train_acc = res['accuracy'] res = info.get_metrics('cifar10-valid', 'x-valid') valid_acc = res['accuracy'] res = info.get_metrics('cifar10', 'ori-test') test_acc = res['accuracy'] res = info.get_metrics('cifar10', 'ori-test') otest_acc = res['accuracy'] else: res = info.get_metrics(dataset, 'train') train_acc = res['accuracy'] res = info.get_metrics(dataset, 'x-valid') valid_acc = res['accuracy'] res = info.get_metrics(dataset, 'x-test') test_acc = res['accuracy'] res = info.get_metrics(dataset, 'ori-test') otest_acc = res['accuracy'] if index == 11472: # resnet resnet = { 'params': param, 'flops': flop, 'index': 11472, 'train_acc': train_acc, 'valid_acc': valid_acc, 'test_acc': test_acc, 'otest_acc': otest_acc } flops.append(flop) params.append(param) train_accs.append(train_acc) valid_accs.append(valid_acc) test_accs.append(test_acc) otest_accs.append(otest_acc) #resnet = {'params': 0.559, 'flops': 78.56, 'index': 11472, 'train_acc': 99.99, 'valid_acc': 90.84, 'test_acc': 93.97} info = { 'params': params, 'flops': flops, 'train_accs': train_accs, 'valid_accs': valid_accs, 'test_accs': test_accs, 'otest_accs': otest_accs } info['resnet'] = resnet torch.save(info, cache_file_path) else: print('Find cache file : {:}'.format(cache_file_path)) info = torch.load(cache_file_path) params, flops, train_accs, valid_accs, test_accs, otest_accs = info[ 'params'], info['flops'], info['train_accs'], info[ 'valid_accs'], info['test_accs'], info['otest_accs'] resnet = info['resnet'] print('{:} collect data done.'.format(time_string())) indexes = list(range(len(params))) dpi, width, height = 300, 2600, 2600 figsize = width / float(dpi), height / float(dpi) LabelSize, LegendFontsize = 22, 22 resnet_scale, resnet_alpha = 120, 0.5 fig = plt.figure(figsize=figsize) ax = fig.add_subplot(111) plt.xticks(np.arange(0, 1.6, 0.3), fontsize=LegendFontsize) if dataset == 'cifar10': plt.ylim(50, 100) plt.yticks(np.arange(50, 101, 10), fontsize=LegendFontsize) elif dataset == 'cifar100': plt.ylim(25, 75) plt.yticks(np.arange(25, 76, 10), fontsize=LegendFontsize) else: plt.ylim(0, 50) plt.yticks(np.arange(0, 51, 10), fontsize=LegendFontsize) ax.scatter(params, valid_accs, marker='o', s=0.5, c='tab:blue') ax.scatter([resnet['params']], [resnet['valid_acc']], marker='*', s=resnet_scale, c='tab:orange', label='resnet', alpha=0.4) plt.grid(zorder=0) ax.set_axisbelow(True) plt.legend(loc=4, fontsize=LegendFontsize) ax.set_xlabel('#parameters (MB)', fontsize=LabelSize) ax.set_ylabel('the validation accuracy (%)', fontsize=LabelSize) save_path = (vis_save_dir / '{:}-param-vs-valid.pdf'.format(dataset)).resolve() fig.savefig(save_path, dpi=dpi, bbox_inches='tight', format='pdf') save_path = (vis_save_dir / '{:}-param-vs-valid.png'.format(dataset)).resolve() fig.savefig(save_path, dpi=dpi, bbox_inches='tight', format='png') print('{:} save into {:}'.format(time_string(), 
save_path)) fig = plt.figure(figsize=figsize) ax = fig.add_subplot(111) plt.xticks(np.arange(0, 1.6, 0.3), fontsize=LegendFontsize) if dataset == 'cifar10': plt.ylim(50, 100) plt.yticks(np.arange(50, 101, 10), fontsize=LegendFontsize) elif dataset == 'cifar100': plt.ylim(25, 75) plt.yticks(np.arange(25, 76, 10), fontsize=LegendFontsize) else: plt.ylim(0, 50) plt.yticks(np.arange(0, 51, 10), fontsize=LegendFontsize) ax.scatter(params, test_accs, marker='o', s=0.5, c='tab:blue') ax.scatter([resnet['params']], [resnet['test_acc']], marker='*', s=resnet_scale, c='tab:orange', label='resnet', alpha=resnet_alpha) plt.grid() ax.set_axisbelow(True) plt.legend(loc=4, fontsize=LegendFontsize) ax.set_xlabel('#parameters (MB)', fontsize=LabelSize) ax.set_ylabel('the test accuracy (%)', fontsize=LabelSize) save_path = (vis_save_dir / '{:}-param-vs-test.pdf'.format(dataset)).resolve() fig.savefig(save_path, dpi=dpi, bbox_inches='tight', format='pdf') save_path = (vis_save_dir / '{:}-param-vs-test.png'.format(dataset)).resolve() fig.savefig(save_path, dpi=dpi, bbox_inches='tight', format='png') print('{:} save into {:}'.format(time_string(), save_path)) fig = plt.figure(figsize=figsize) ax = fig.add_subplot(111) plt.xticks(np.arange(0, 1.6, 0.3), fontsize=LegendFontsize) if dataset == 'cifar10': plt.ylim(50, 100) plt.yticks(np.arange(50, 101, 10), fontsize=LegendFontsize) elif dataset == 'cifar100': plt.ylim(20, 100) plt.yticks(np.arange(20, 101, 10), fontsize=LegendFontsize) else: plt.ylim(25, 76) plt.yticks(np.arange(25, 76, 10), fontsize=LegendFontsize) ax.scatter(params, train_accs, marker='o', s=0.5, c='tab:blue') ax.scatter([resnet['params']], [resnet['train_acc']], marker='*', s=resnet_scale, c='tab:orange', label='resnet', alpha=resnet_alpha) plt.grid() ax.set_axisbelow(True) plt.legend(loc=4, fontsize=LegendFontsize) ax.set_xlabel('#parameters (MB)', fontsize=LabelSize) ax.set_ylabel('the training accuracy (%)', fontsize=LabelSize) save_path = (vis_save_dir / '{:}-param-vs-train.pdf'.format(dataset)).resolve() fig.savefig(save_path, dpi=dpi, bbox_inches='tight', format='pdf') save_path = (vis_save_dir / '{:}-param-vs-train.png'.format(dataset)).resolve() fig.savefig(save_path, dpi=dpi, bbox_inches='tight', format='png') print('{:} save into {:}'.format(time_string(), save_path)) fig = plt.figure(figsize=figsize) ax = fig.add_subplot(111) plt.xlim(0, max(indexes)) plt.xticks(np.arange(min(indexes), max(indexes), max(indexes) // 5), fontsize=LegendFontsize) if dataset == 'cifar10': plt.ylim(50, 100) plt.yticks(np.arange(50, 101, 10), fontsize=LegendFontsize) elif dataset == 'cifar100': plt.ylim(25, 75) plt.yticks(np.arange(25, 76, 10), fontsize=LegendFontsize) else: plt.ylim(0, 50) plt.yticks(np.arange(0, 51, 10), fontsize=LegendFontsize) ax.scatter(indexes, test_accs, marker='o', s=0.5, c='tab:blue') ax.scatter([resnet['index']], [resnet['test_acc']], marker='*', s=resnet_scale, c='tab:orange', label='resnet', alpha=resnet_alpha) plt.grid() ax.set_axisbelow(True) plt.legend(loc=4, fontsize=LegendFontsize) ax.set_xlabel('architecture ID', fontsize=LabelSize) ax.set_ylabel('the test accuracy (%)', fontsize=LabelSize) save_path = (vis_save_dir / '{:}-test-over-ID.pdf'.format(dataset)).resolve() fig.savefig(save_path, dpi=dpi, bbox_inches='tight', format='pdf') save_path = (vis_save_dir / '{:}-test-over-ID.png'.format(dataset)).resolve() fig.savefig(save_path, dpi=dpi, bbox_inches='tight', format='png') print('{:} save into {:}'.format(time_string(), save_path)) plt.close('all')
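The PDF/PNG double save above recurs for every plot; one way to factor it out is sketched below (save_both is not part of the original script; it assumes the time_string helper and the Path-based vis_save_dir from the surrounding function).

def save_both(fig, vis_save_dir, stem, dpi=300):
    # Save one figure under both extensions, logging each path.
    for ext in ('pdf', 'png'):
        path = (vis_save_dir / '{:}.{:}'.format(stem, ext)).resolve()
        fig.savefig(path, dpi=dpi, bbox_inches='tight', format=ext)
        print('{:} save into {:}'.format(time_string(), path))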
def main(): api = API(RAWPATH) r, e = create_record(outpath=OUTPATH, api=api) shuffle_data_n_times_and_store(OUTPATH, len(api))
def visualize_rank_over_time(meta_file, vis_save_dir): print('\n' + '-' * 150) vis_save_dir.mkdir(parents=True, exist_ok=True) print('{:} start to visualize rank-over-time into {:}'.format( time_string(), vis_save_dir)) cache_file_path = vis_save_dir / 'rank-over-time-cache-info.pth' if not cache_file_path.exists(): print('Do not find cache file : {:}'.format(cache_file_path)) nas_bench = API(str(meta_file)) print('{:} load nas_bench done'.format(time_string())) params, flops, train_accs, valid_accs, test_accs, otest_accs = [], [], defaultdict( list), defaultdict(list), defaultdict(list), defaultdict(list) #for iepoch in range(200): for index in range( len(nas_bench) ): for index in tqdm(range(len(nas_bench))): info = nas_bench.query_by_index(index, use_12epochs_result=False) for iepoch in range(200): res = info.get_metrics('cifar10', 'train', iepoch) train_acc = res['accuracy'] res = info.get_metrics('cifar10-valid', 'x-valid', iepoch) valid_acc = res['accuracy'] res = info.get_metrics('cifar10', 'ori-test', iepoch) test_acc = res['accuracy'] res = info.get_metrics('cifar10', 'ori-test', iepoch) otest_acc = res['accuracy'] train_accs[iepoch].append(train_acc) valid_accs[iepoch].append(valid_acc) test_accs[iepoch].append(test_acc) otest_accs[iepoch].append(otest_acc) if iepoch == 0: res = info.get_comput_costs('cifar10') flop, param = res['flops'], res['params'] flops.append(flop) params.append(param) info = { 'params': params, 'flops': flops, 'train_accs': train_accs, 'valid_accs': valid_accs, 'test_accs': test_accs, 'otest_accs': otest_accs } torch.save(info, cache_file_path) else: print('Find cache file : {:}'.format(cache_file_path)) info = torch.load(cache_file_path) params, flops, train_accs, valid_accs, test_accs, otest_accs = info[ 'params'], info['flops'], info['train_accs'], info[ 'valid_accs'], info['test_accs'], info['otest_accs'] print('{:} collect data done.'.format(time_string())) #selected_epochs = [0, 100, 150, 180, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199] selected_epochs = list(range(200)) x_xtests = test_accs[199] indexes = list(range(len(x_xtests))) ord_idxs = sorted(indexes, key=lambda i: x_xtests[i]) for sepoch in selected_epochs: x_valids = valid_accs[sepoch] valid_ord_idxs = sorted(indexes, key=lambda i: x_valids[i]) valid_ord_lbls = [] for idx in ord_idxs: valid_ord_lbls.append(valid_ord_idxs.index(idx)) # labeled data dpi, width, height = 300, 2600, 2600 figsize = width / float(dpi), height / float(dpi) LabelSize, LegendFontsize = 18, 18 fig = plt.figure(figsize=figsize) ax = fig.add_subplot(111) plt.xlim(min(indexes), max(indexes)) plt.ylim(min(indexes), max(indexes)) plt.yticks(np.arange(min(indexes), max(indexes), max(indexes) // 6), fontsize=LegendFontsize, rotation='vertical') plt.xticks(np.arange(min(indexes), max(indexes), max(indexes) // 6), fontsize=LegendFontsize) ax.scatter(indexes, valid_ord_lbls, marker='^', s=0.5, c='tab:green', alpha=0.8) ax.scatter(indexes, indexes, marker='o', s=0.5, c='tab:blue', alpha=0.8) ax.scatter([-1], [-1], marker='^', s=100, c='tab:green', label='CIFAR-10 validation') ax.scatter([-1], [-1], marker='o', s=100, c='tab:blue', label='CIFAR-10 test') plt.grid(zorder=0) ax.set_axisbelow(True) plt.legend(loc='upper left', fontsize=LegendFontsize) ax.set_xlabel('architecture ranking in the final test accuracy', fontsize=LabelSize) ax.set_ylabel('architecture ranking in the validation set', fontsize=LabelSize) save_path = (vis_save_dir / 'time-{:03d}.pdf'.format(sepoch)).resolve() fig.savefig(save_path, dpi=dpi, 
bbox_inches='tight', format='pdf') save_path = (vis_save_dir / 'time-{:03d}.png'.format(sepoch)).resolve() fig.savefig(save_path, dpi=dpi, bbox_inches='tight', format='png') print('{:} save into {:}'.format(time_string(), save_path)) plt.close('all')
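The rank-over-time plots visualize how well the validation ranking at each epoch agrees with the final test ranking; the same agreement can be summarized numerically with a rank correlation. A sketch using scipy (not part of the original script):

from scipy.stats import kendalltau, spearmanr

def rank_agreement(valid_accs_at_epoch, final_test_accs):
    # Kendall's tau and Spearman's rho between validation accuracy at some
    # epoch and final test accuracy, across all architectures.
    tau, _ = kendalltau(valid_accs_at_epoch, final_test_accs)
    rho, _ = spearmanr(valid_accs_at_epoch, final_test_accs)
    return tau, rho

# e.g. rank_agreement(valid_accs[10], test_accs[199])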
default="./output/search-cell-nas-bench-201/visuals", help="The base-name of folder to save checkpoints and log.", ) parser.add_argument( "--api_path", type=str, default=None, help="The path to the NAS-Bench-201 benchmark file.", ) args = parser.parse_args() vis_save_dir = Path(args.save_dir) vis_save_dir.mkdir(parents=True, exist_ok=True) meta_file = Path(args.api_path) assert meta_file.exists(), "invalid path for api : {:}".format(meta_file) # check_unique_arch(meta_file) api = API(str(meta_file)) # for iepoch in [11, 25, 50, 100, 150, 175, 200]: # check_cor_for_bandit(api, 6, iepoch) # check_cor_for_bandit(api, 12, iepoch) check_cor_for_bandit_v2(api, 6, True, True) check_cor_for_bandit_v2(api, 12, True, True) check_cor_for_bandit_v2(api, 12, False, True) check_cor_for_bandit_v2(api, 24, False, True) check_cor_for_bandit_v2(api, 100, False, True) check_cor_for_bandit_v2(api, 150, False, True) check_cor_for_bandit_v2(api, 175, False, True) check_cor_for_bandit_v2(api, 200, False, True) print("----")
def main(xargs): assert torch.cuda.is_available(), "CUDA is not available." torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(xargs.workers) prepare_seed(xargs.rand_seed) logger = prepare_logger(args) train_data, valid_data, xshape, class_num = get_datasets( xargs.dataset, xargs.data_path, -1) config = load_config(xargs.config_path, { "class_num": class_num, "xshape": xshape }, logger) search_loader, _, valid_loader = get_nas_search_loaders( train_data, valid_data, xargs.dataset, "configs/nas-benchmark/", (config.batch_size, config.test_batch_size), xargs.workers, ) logger.log( "||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}" .format(xargs.dataset, len(search_loader), len(valid_loader), config.batch_size)) logger.log("||||||| {:10s} ||||||| Config={:}".format( xargs.dataset, config)) search_space = get_search_spaces("cell", xargs.search_space_name) model_config = dict2config( { "name": "RANDOM", "C": xargs.channel, "N": xargs.num_cells, "max_nodes": xargs.max_nodes, "num_classes": class_num, "space": search_space, "affine": False, "track_running_stats": bool(xargs.track_running_stats), }, None, ) search_model = get_cell_based_tiny_net(model_config) w_optimizer, w_scheduler, criterion = get_optim_scheduler( search_model.parameters(), config) logger.log("w-optimizer : {:}".format(w_optimizer)) logger.log("w-scheduler : {:}".format(w_scheduler)) logger.log("criterion : {:}".format(criterion)) if xargs.arch_nas_dataset is None: api = None else: api = API(xargs.arch_nas_dataset) logger.log("{:} create API = {:} done".format(time_string(), api)) last_info, model_base_path, model_best_path = ( logger.path("info"), logger.path("model"), logger.path("best"), ) network, criterion = torch.nn.DataParallel( search_model).cuda(), criterion.cuda() if last_info.exists(): # automatically resume from previous checkpoint logger.log("=> loading checkpoint of the last-info '{:}' start".format( last_info)) last_info = torch.load(last_info) start_epoch = last_info["epoch"] checkpoint = torch.load(last_info["last_checkpoint"]) genotypes = checkpoint["genotypes"] valid_accuracies = checkpoint["valid_accuracies"] search_model.load_state_dict(checkpoint["search_model"]) w_scheduler.load_state_dict(checkpoint["w_scheduler"]) w_optimizer.load_state_dict(checkpoint["w_optimizer"]) logger.log( "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch." 
.format(last_info, start_epoch)) else: logger.log("=> do not find the last-info file : {:}".format(last_info)) start_epoch, valid_accuracies, genotypes = 0, {"best": -1}, {} # start training start_time, search_time, epoch_time, total_epoch = ( time.time(), AverageMeter(), AverageMeter(), config.epochs + config.warmup, ) for epoch in range(start_epoch, total_epoch): w_scheduler.update(epoch, 0.0) need_time = "Time Left: {:}".format( convert_secs2time(epoch_time.val * (total_epoch - epoch), True)) epoch_str = "{:03d}-{:03d}".format(epoch, total_epoch) logger.log("\n[Search the {:}-th epoch] {:}, LR={:}".format( epoch_str, need_time, min(w_scheduler.get_lr()))) # selected_arch = search_find_best(valid_loader, network, criterion, xargs.select_num) search_w_loss, search_w_top1, search_w_top5 = search_func( search_loader, network, criterion, w_scheduler, w_optimizer, epoch_str, xargs.print_freq, logger, ) search_time.update(time.time() - start_time) logger.log( "[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s" .format(epoch_str, search_w_loss, search_w_top1, search_w_top5, search_time.sum)) valid_a_loss, valid_a_top1, valid_a_top5 = valid_func( valid_loader, network, criterion) logger.log( "[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%" .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5)) cur_arch, cur_valid_acc = search_find_best(valid_loader, network, xargs.select_num) logger.log("[{:}] find-the-best : {:}, accuracy@1={:.2f}%".format( epoch_str, cur_arch, cur_valid_acc)) genotypes[epoch] = cur_arch # check the best accuracy valid_accuracies[epoch] = valid_a_top1 if valid_a_top1 > valid_accuracies["best"]: valid_accuracies["best"] = valid_a_top1 find_best = True else: find_best = False # save checkpoint save_path = save_checkpoint( { "epoch": epoch + 1, "args": deepcopy(xargs), "search_model": search_model.state_dict(), "w_optimizer": w_optimizer.state_dict(), "w_scheduler": w_scheduler.state_dict(), "genotypes": genotypes, "valid_accuracies": valid_accuracies, }, model_base_path, logger, ) last_info = save_checkpoint( { "epoch": epoch + 1, "args": deepcopy(args), "last_checkpoint": save_path, }, logger.path("info"), logger, ) if find_best: logger.log( "<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%." .format(epoch_str, valid_a_top1)) copy_checkpoint(model_base_path, model_best_path, logger) if api is not None: logger.log("{:}".format(api.query_by_arch(genotypes[epoch], "200"))) # measure elapsed time epoch_time.update(time.time() - start_time) start_time = time.time() logger.log("\n" + "-" * 200) logger.log("Pre-searching costs {:.1f} s".format(search_time.sum)) start_time = time.time() best_arch, best_acc = search_find_best(valid_loader, network, xargs.select_num) search_time.update(time.time() - start_time) logger.log( "RANDOM-NAS finds the best one : {:} with accuracy={:.2f}%, with {:.1f} s." .format(best_arch, best_acc, search_time.sum)) if api is not None: logger.log("{:}".format(api.query_by_arch(best_arch, "200"))) logger.close()
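search_find_best is imported from the shared search utilities; conceptually it evaluates select_num randomly sampled architectures on one validation batch and keeps the best. A hypothetical sketch under those assumptions, where sample_arch/set_arch stand in for the real model hooks and the supernet is assumed to return (features, logits) as elsewhere in these scripts:

import torch

def search_find_best_sketch(valid_loader, network, n_samples, sample_arch, set_arch):
    inputs, targets = next(iter(valid_loader))
    inputs, targets = inputs.cuda(non_blocking=True), targets.cuda(non_blocking=True)
    best_arch, best_acc = None, -1.0
    network.eval()
    with torch.no_grad():
        for _ in range(n_samples):
            arch = sample_arch()       # hypothetical: draw a random genotype
            set_arch(network, arch)    # hypothetical: activate it in the supernet
            _, logits = network(inputs)
            acc = (logits.argmax(dim=1) == targets).float().mean().item() * 100.0
            if acc > best_acc:
                best_arch, best_acc = arch, acc
    return best_arch, best_acc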
def main(xargs): assert torch.cuda.is_available(), 'CUDA is not available.' torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(xargs.workers) prepare_seed(xargs.rand_seed) logger = prepare_logger(args) train_data, valid_data, xshape, class_num = get_datasets( xargs.dataset, xargs.data_path, -1) config = load_config(xargs.config_path, { 'class_num': class_num, 'xshape': xshape }, logger) search_loader, _, valid_loader = get_nas_search_loaders(train_data, valid_data, xargs.dataset, 'configs/nas-benchmark/', \ (config.batch_size, config.test_batch_size), xargs.workers) logger.log( '||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}' .format(xargs.dataset, len(search_loader), len(valid_loader), config.batch_size)) logger.log('||||||| {:10s} ||||||| Config={:}'.format( xargs.dataset, config)) search_space = get_search_spaces('cell', xargs.search_space_name) model_config = dict2config( { 'name': 'RANDOM', 'C': xargs.channel, 'N': xargs.num_cells, 'max_nodes': xargs.max_nodes, 'num_classes': class_num, 'space': search_space, 'affine': False, 'track_running_stats': bool(xargs.track_running_stats) }, None) search_model = get_cell_based_tiny_net(model_config) w_optimizer, w_scheduler, criterion = get_optim_scheduler( search_model.parameters(), config) logger.log('w-optimizer : {:}'.format(w_optimizer)) logger.log('w-scheduler : {:}'.format(w_scheduler)) logger.log('criterion : {:}'.format(criterion)) if xargs.arch_nas_dataset is None: api = None else: api = API(xargs.arch_nas_dataset) logger.log('{:} create API = {:} done'.format(time_string(), api)) last_info, model_base_path, model_best_path = logger.path( 'info'), logger.path('model'), logger.path('best') network, criterion = torch.nn.DataParallel( search_model).cuda(), criterion.cuda() if last_info.exists(): # automatically resume from previous checkpoint logger.log("=> loading checkpoint of the last-info '{:}' start".format( last_info)) last_info = torch.load(last_info) start_epoch = last_info['epoch'] checkpoint = torch.load(last_info['last_checkpoint']) genotypes = checkpoint['genotypes'] valid_accuracies = checkpoint['valid_accuracies'] search_model.load_state_dict(checkpoint['search_model']) w_scheduler.load_state_dict(checkpoint['w_scheduler']) w_optimizer.load_state_dict(checkpoint['w_optimizer']) logger.log( "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch." 
.format(last_info, start_epoch)) else: logger.log("=> do not find the last-info file : {:}".format(last_info)) start_epoch, valid_accuracies, genotypes = 0, {'best': -1}, {} # start training start_time, search_time, epoch_time, total_epoch = time.time( ), AverageMeter(), AverageMeter(), config.epochs + config.warmup for epoch in range(start_epoch, total_epoch): w_scheduler.update(epoch, 0.0) need_time = 'Time Left: {:}'.format( convert_secs2time(epoch_time.val * (total_epoch - epoch), True)) epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch) logger.log('\n[Search the {:}-th epoch] {:}, LR={:}'.format( epoch_str, need_time, min(w_scheduler.get_lr()))) # selected_arch = search_find_best(valid_loader, network, criterion, xargs.select_num) search_w_loss, search_w_top1, search_w_top5 = search_func( search_loader, network, criterion, w_scheduler, w_optimizer, epoch_str, xargs.print_freq, logger) search_time.update(time.time() - start_time) logger.log( '[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s' .format(epoch_str, search_w_loss, search_w_top1, search_w_top5, search_time.sum)) valid_a_loss, valid_a_top1, valid_a_top5 = valid_func( valid_loader, network, criterion) logger.log( '[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%' .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5)) cur_arch, cur_valid_acc = search_find_best(valid_loader, network, xargs.select_num) logger.log('[{:}] find-the-best : {:}, accuracy@1={:.2f}%'.format( epoch_str, cur_arch, cur_valid_acc)) genotypes[epoch] = cur_arch # check the best accuracy valid_accuracies[epoch] = valid_a_top1 if valid_a_top1 > valid_accuracies['best']: valid_accuracies['best'] = valid_a_top1 find_best = True else: find_best = False # save checkpoint save_path = save_checkpoint( { 'epoch': epoch + 1, 'args': deepcopy(xargs), 'search_model': search_model.state_dict(), 'w_optimizer': w_optimizer.state_dict(), 'w_scheduler': w_scheduler.state_dict(), 'genotypes': genotypes, 'valid_accuracies': valid_accuracies }, model_base_path, logger) last_info = save_checkpoint( { 'epoch': epoch + 1, 'args': deepcopy(args), 'last_checkpoint': save_path, }, logger.path('info'), logger) if find_best: logger.log( '<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.' .format(epoch_str, valid_a_top1)) copy_checkpoint(model_base_path, model_best_path, logger) if api is not None: logger.log('{:}'.format(api.query_by_arch(genotypes[epoch], '200'))) # measure elapsed time epoch_time.update(time.time() - start_time) start_time = time.time() logger.log('\n' + '-' * 200) logger.log('Pre-searching costs {:.1f} s'.format(search_time.sum)) start_time = time.time() best_arch, best_acc = search_find_best(valid_loader, network, xargs.select_num) search_time.update(time.time() - start_time) logger.log( 'RANDOM-NAS finds the best one : {:} with accuracy={:.2f}%, with {:.1f} s.' .format(best_arch, best_acc, search_time.sum)) if api is not None: logger.log('{:}'.format(api.query_by_arch(best_arch, '200'))) logger.close()
def main(xargs): assert torch.cuda.is_available(), "CUDA is not available." torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(xargs.workers) prepare_seed(xargs.rand_seed) logger = prepare_logger(args) train_data, test_data, xshape, class_num = get_datasets( xargs.dataset, xargs.data_path, -1) logger.log("use config from : {:}".format(xargs.config_path)) config = load_config(xargs.config_path, { "class_num": class_num, "xshape": xshape }, logger) _, train_loader, valid_loader = get_nas_search_loaders( train_data, test_data, xargs.dataset, "configs/nas-benchmark/", config.batch_size, xargs.workers, ) # since ENAS will train the controller on valid-loader, we need to use train transformation for valid-loader valid_loader.dataset.transform = deepcopy(train_loader.dataset.transform) if hasattr(valid_loader.dataset, "transforms"): valid_loader.dataset.transforms = deepcopy( train_loader.dataset.transforms) # data loader logger.log( "||||||| {:10s} ||||||| Train-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}" .format(xargs.dataset, len(train_loader), len(valid_loader), config.batch_size)) logger.log("||||||| {:10s} ||||||| Config={:}".format( xargs.dataset, config)) search_space = get_search_spaces("cell", xargs.search_space_name) model_config = dict2config( { "name": "ENAS", "C": xargs.channel, "N": xargs.num_cells, "max_nodes": xargs.max_nodes, "num_classes": class_num, "space": search_space, "affine": False, "track_running_stats": bool(xargs.track_running_stats), }, None, ) shared_cnn = get_cell_based_tiny_net(model_config) controller = shared_cnn.create_controller() w_optimizer, w_scheduler, criterion = get_optim_scheduler( shared_cnn.parameters(), config) a_optimizer = torch.optim.Adam( controller.parameters(), lr=config.controller_lr, betas=config.controller_betas, eps=config.controller_eps, ) logger.log("w-optimizer : {:}".format(w_optimizer)) logger.log("a-optimizer : {:}".format(a_optimizer)) logger.log("w-scheduler : {:}".format(w_scheduler)) logger.log("criterion : {:}".format(criterion)) # flop, param = get_model_infos(shared_cnn, xshape) # logger.log('{:}'.format(shared_cnn)) # logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param)) logger.log("search-space : {:}".format(search_space)) if xargs.arch_nas_dataset is None: api = None else: api = API(xargs.arch_nas_dataset) logger.log("{:} create API = {:} done".format(time_string(), api)) shared_cnn, controller, criterion = ( torch.nn.DataParallel(shared_cnn).cuda(), controller.cuda(), criterion.cuda(), ) last_info, model_base_path, model_best_path = ( logger.path("info"), logger.path("model"), logger.path("best"), ) if last_info.exists(): # automatically resume from previous checkpoint logger.log("=> loading checkpoint of the last-info '{:}' start".format( last_info)) last_info = torch.load(last_info) start_epoch = last_info["epoch"] checkpoint = torch.load(last_info["last_checkpoint"]) genotypes = checkpoint["genotypes"] baseline = checkpoint["baseline"] valid_accuracies = checkpoint["valid_accuracies"] shared_cnn.load_state_dict(checkpoint["shared_cnn"]) controller.load_state_dict(checkpoint["controller"]) w_scheduler.load_state_dict(checkpoint["w_scheduler"]) w_optimizer.load_state_dict(checkpoint["w_optimizer"]) a_optimizer.load_state_dict(checkpoint["a_optimizer"]) logger.log( "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch." 
.format(last_info, start_epoch)) else: logger.log("=> do not find the last-info file : {:}".format(last_info)) start_epoch, valid_accuracies, genotypes, baseline = 0, { "best": -1 }, {}, None # start training start_time, search_time, epoch_time, total_epoch = ( time.time(), AverageMeter(), AverageMeter(), config.epochs + config.warmup, ) for epoch in range(start_epoch, total_epoch): w_scheduler.update(epoch, 0.0) need_time = "Time Left: {:}".format( convert_secs2time(epoch_time.val * (total_epoch - epoch), True)) epoch_str = "{:03d}-{:03d}".format(epoch, total_epoch) logger.log( "\n[Search the {:}-th epoch] {:}, LR={:}, baseline={:}".format( epoch_str, need_time, min(w_scheduler.get_lr()), baseline)) cnn_loss, cnn_top1, cnn_top5 = train_shared_cnn( train_loader, shared_cnn, controller, criterion, w_scheduler, w_optimizer, epoch_str, xargs.print_freq, logger, ) logger.log( "[{:}] shared-cnn : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%" .format(epoch_str, cnn_loss, cnn_top1, cnn_top5)) ctl_loss, ctl_acc, ctl_baseline, ctl_reward, baseline = train_controller( valid_loader, shared_cnn, controller, criterion, a_optimizer, dict2config( { "baseline": baseline, "ctl_train_steps": xargs.controller_train_steps, "ctl_num_aggre": xargs.controller_num_aggregate, "ctl_entropy_w": xargs.controller_entropy_weight, "ctl_bl_dec": xargs.controller_bl_dec, }, None, ), epoch_str, xargs.print_freq, logger, ) search_time.update(time.time() - start_time) logger.log( "[{:}] controller : loss={:.2f}, accuracy={:.2f}%, baseline={:.2f}, reward={:.2f}, current-baseline={:.4f}, time-cost={:.1f} s" .format( epoch_str, ctl_loss, ctl_acc, ctl_baseline, ctl_reward, baseline, search_time.sum, )) best_arch, _ = get_best_arch(controller, shared_cnn, valid_loader) shared_cnn.module.update_arch(best_arch) _, best_valid_acc, _ = valid_func(valid_loader, shared_cnn, criterion) genotypes[epoch] = best_arch # check the best accuracy valid_accuracies[epoch] = best_valid_acc if best_valid_acc > valid_accuracies["best"]: valid_accuracies["best"] = best_valid_acc genotypes["best"] = best_arch find_best = True else: find_best = False logger.log("<<<--->>> The {:}-th epoch : {:}".format( epoch_str, genotypes[epoch])) # save checkpoint save_path = save_checkpoint( { "epoch": epoch + 1, "args": deepcopy(xargs), "baseline": baseline, "shared_cnn": shared_cnn.state_dict(), "controller": controller.state_dict(), "w_optimizer": w_optimizer.state_dict(), "a_optimizer": a_optimizer.state_dict(), "w_scheduler": w_scheduler.state_dict(), "genotypes": genotypes, "valid_accuracies": valid_accuracies, }, model_base_path, logger, ) last_info = save_checkpoint( { "epoch": epoch + 1, "args": deepcopy(args), "last_checkpoint": save_path, }, logger.path("info"), logger, ) if find_best: logger.log( "<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%." 
.format(epoch_str, best_valid_acc)) copy_checkpoint(model_base_path, model_best_path, logger) if api is not None: logger.log("{:}".format(api.query_by_arch(genotypes[epoch], "200"))) # measure elapsed time epoch_time.update(time.time() - start_time) start_time = time.time() logger.log("\n" + "-" * 100) logger.log("During searching, the best architecture is {:}".format( genotypes["best"])) logger.log("Its accuracy is {:.2f}%".format(valid_accuracies["best"])) logger.log("Randomly select {:} architectures and select the best.".format( xargs.controller_num_samples)) start_time = time.time() final_arch, _ = get_best_arch(controller, shared_cnn, valid_loader, xargs.controller_num_samples) search_time.update(time.time() - start_time) shared_cnn.module.update_arch(final_arch) final_loss, final_top1, final_top5 = valid_func(valid_loader, shared_cnn, criterion) logger.log("The Selected Final Architecture : {:}".format(final_arch)) logger.log("Loss={:.3f}, Accuracy@1={:.2f}%, Accuracy@5={:.2f}%".format( final_loss, final_top1, final_top5)) logger.log( "ENAS : run {:} epochs, cost {:.1f} s, last-geno is {:}.".format( total_epoch, search_time.sum, final_arch)) if api is not None: logger.log("{:}".format(api.query_by_arch(final_arch))) logger.close()
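train_controller carries baseline between epochs as an exponential moving average of the controller's reward; a sketch of that update rule, assuming the usual ENAS formulation with ctl_bl_dec as the decay factor:

def update_baseline(baseline, reward, bl_dec):
    # The first reward initializes the baseline; afterwards it decays toward
    # each new reward: b <- b * dec + r * (1 - dec).
    if baseline is None:
        return reward
    return baseline * bl_dec + reward * (1.0 - bl_dec)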
def main(xargs): assert torch.cuda.is_available(), "CUDA is not available." torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(xargs.workers) prepare_seed(xargs.rand_seed) logger = prepare_logger(args) train_data, valid_data, xshape, class_num = get_datasets( xargs.dataset, xargs.data_path, -1 ) # config_path = 'configs/nas-benchmark/algos/GDAS.config' config = load_config( xargs.config_path, {"class_num": class_num, "xshape": xshape}, logger ) search_loader, _, valid_loader = get_nas_search_loaders( train_data, valid_data, xargs.dataset, "configs/nas-benchmark/", config.batch_size, xargs.workers, ) logger.log( "||||||| {:10s} ||||||| Search-Loader-Num={:}, batch size={:}".format( xargs.dataset, len(search_loader), config.batch_size ) ) logger.log("||||||| {:10s} ||||||| Config={:}".format(xargs.dataset, config)) search_space = get_search_spaces("cell", xargs.search_space_name) if xargs.model_config is None: model_config = dict2config( { "name": "GDAS", "C": xargs.channel, "N": xargs.num_cells, "max_nodes": xargs.max_nodes, "num_classes": class_num, "space": search_space, "affine": False, "track_running_stats": bool(xargs.track_running_stats), }, None, ) else: model_config = load_config( xargs.model_config, { "num_classes": class_num, "space": search_space, "affine": False, "track_running_stats": bool(xargs.track_running_stats), }, None, ) search_model = get_cell_based_tiny_net(model_config) logger.log("search-model :\n{:}".format(search_model)) logger.log("model-config : {:}".format(model_config)) w_optimizer, w_scheduler, criterion = get_optim_scheduler( search_model.get_weights(), config ) a_optimizer = torch.optim.Adam( search_model.get_alphas(), lr=xargs.arch_learning_rate, betas=(0.5, 0.999), weight_decay=xargs.arch_weight_decay, ) logger.log("w-optimizer : {:}".format(w_optimizer)) logger.log("a-optimizer : {:}".format(a_optimizer)) logger.log("w-scheduler : {:}".format(w_scheduler)) logger.log("criterion : {:}".format(criterion)) flop, param = get_model_infos(search_model, xshape) logger.log("FLOP = {:.2f} M, Params = {:.2f} MB".format(flop, param)) logger.log("search-space [{:} ops] : {:}".format(len(search_space), search_space)) if xargs.arch_nas_dataset is None: api = None else: api = API(xargs.arch_nas_dataset) logger.log("{:} create API = {:} done".format(time_string(), api)) last_info, model_base_path, model_best_path = ( logger.path("info"), logger.path("model"), logger.path("best"), ) network, criterion = torch.nn.DataParallel(search_model).cuda(), criterion.cuda() if last_info.exists(): # automatically resume from previous checkpoint logger.log( "=> loading checkpoint of the last-info '{:}' start".format(last_info) ) last_info = torch.load(last_info) start_epoch = last_info["epoch"] checkpoint = torch.load(last_info["last_checkpoint"]) genotypes = checkpoint["genotypes"] valid_accuracies = checkpoint["valid_accuracies"] search_model.load_state_dict(checkpoint["search_model"]) w_scheduler.load_state_dict(checkpoint["w_scheduler"]) w_optimizer.load_state_dict(checkpoint["w_optimizer"]) a_optimizer.load_state_dict(checkpoint["a_optimizer"]) logger.log( "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch.".format( last_info, start_epoch ) ) else: logger.log("=> do not find the last-info file : {:}".format(last_info)) start_epoch, valid_accuracies, genotypes = ( 0, {"best": -1}, {-1: search_model.genotype()}, ) # start training start_time, search_time, epoch_time, 
total_epoch = ( time.time(), AverageMeter(), AverageMeter(), config.epochs + config.warmup, ) for epoch in range(start_epoch, total_epoch): w_scheduler.update(epoch, 0.0) need_time = "Time Left: {:}".format( convert_secs2time(epoch_time.val * (total_epoch - epoch), True) ) epoch_str = "{:03d}-{:03d}".format(epoch, total_epoch) search_model.set_tau( xargs.tau_max - (xargs.tau_max - xargs.tau_min) * epoch / (total_epoch - 1) ) logger.log( "\n[Search the {:}-th epoch] {:}, tau={:}, LR={:}".format( epoch_str, need_time, search_model.get_tau(), min(w_scheduler.get_lr()) ) ) ( search_w_loss, search_w_top1, search_w_top5, valid_a_loss, valid_a_top1, valid_a_top5, ) = search_func( search_loader, network, criterion, w_scheduler, w_optimizer, a_optimizer, epoch_str, xargs.print_freq, logger, ) search_time.update(time.time() - start_time) logger.log( "[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s".format( epoch_str, search_w_loss, search_w_top1, search_w_top5, search_time.sum ) ) logger.log( "[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%".format( epoch_str, valid_a_loss, valid_a_top1, valid_a_top5 ) ) # check the best accuracy valid_accuracies[epoch] = valid_a_top1 if valid_a_top1 > valid_accuracies["best"]: valid_accuracies["best"] = valid_a_top1 genotypes["best"] = search_model.genotype() find_best = True else: find_best = False genotypes[epoch] = search_model.genotype() logger.log( "<<<--->>> The {:}-th epoch : {:}".format(epoch_str, genotypes[epoch]) ) # save checkpoint save_path = save_checkpoint( { "epoch": epoch + 1, "args": deepcopy(xargs), "search_model": search_model.state_dict(), "w_optimizer": w_optimizer.state_dict(), "a_optimizer": a_optimizer.state_dict(), "w_scheduler": w_scheduler.state_dict(), "genotypes": genotypes, "valid_accuracies": valid_accuracies, }, model_base_path, logger, ) last_info = save_checkpoint( { "epoch": epoch + 1, "args": deepcopy(args), "last_checkpoint": save_path, }, logger.path("info"), logger, ) if find_best: logger.log( "<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.".format( epoch_str, valid_a_top1 ) ) copy_checkpoint(model_base_path, model_best_path, logger) with torch.no_grad(): logger.log("{:}".format(search_model.show_alphas())) if api is not None: logger.log("{:}".format(api.query_by_arch(genotypes[epoch], "200"))) # measure elapsed time epoch_time.update(time.time() - start_time) start_time = time.time() logger.log("\n" + "-" * 100) # check the performance from the architecture dataset logger.log( "GDAS : run {:} epochs, cost {:.1f} s, last-geno is {:}.".format( total_epoch, search_time.sum, genotypes[total_epoch - 1] ) ) if api is not None: logger.log("{:}".format(api.query_by_arch(genotypes[total_epoch - 1], "200"))) logger.close()
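GDAS anneals the Gumbel-softmax temperature tau linearly from tau_max down to tau_min (the set_tau call above); the sampling it controls matches PyTorch's built-in gumbel_softmax. An illustrative sketch, not the model's internal code (the epoch and tau values below are assumptions):

import torch
import torch.nn.functional as F

epoch, total_epoch = 120, 250     # illustrative values
tau_max, tau_min = 10.0, 0.1      # typical GDAS defaults (assumption)
tau = tau_max - (tau_max - tau_min) * epoch / (total_epoch - 1)

logits = torch.randn(6, 5)        # e.g. 6 edges x 5 candidate operations
one_hot = F.gumbel_softmax(logits, tau=tau, hard=True)  # straight-through one-hot draw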
args = parser.parse_args() args.verbose = True if args.verbose == 'True' else False args.fix_seed = True if args.fix_seed == 'True' else False max_budget = args.max_budget dataset = args.dataset # Directory where files will be written; note 'async' is a reserved word in Python 3.7+, so the attribute is read via getattr if getattr(args, 'async') is None: folder = "de_pop{}".format(args.pop_size) else: folder = "ade_{}_pop{}".format(getattr(args, 'async'), args.pop_size) output_path = os.path.join(args.output_path, args.dataset, folder) os.makedirs(output_path, exist_ok=True) # Loading NAS-201 api = API(args.data_dir) search_space = get_search_spaces('cell', 'nas-bench-201') # Parameter space to be used by DE cs = get_configuration_space(args.max_nodes, search_space) dimensions = len(cs.get_hyperparameters()) config2structure = config2structure_func(args.max_nodes) y_star_valid, y_star_test = find_nas201_best(api, dataset) inc_config = cs.get_default_configuration().get_array().tolist() # Custom objective function for DE to interface NASBench-201 def f(config, budget=max_budget): global dataset, api structure = config2structure(config)
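The body of f is truncated above; a plausible completion under the usual DE-for-NAS-Bench-201 setup is sketched below. f_sketch is an assumption, not the original code: it decodes the config with the surrounding script's config2structure, looks the cell up in the benchmark, and returns validation error for DE to minimize. Exact get_more_info keywords and result keys differ across nas_201_api versions.

def f_sketch(config, budget=max_budget):
    structure = config2structure(config)
    arch_index = api.query_index_by_arch(structure)   # benchmark lookup
    info = api.get_more_info(arch_index, dataset, iepoch=int(budget) - 1)
    return 1.0 - info['valid-accuracy'] / 100.0       # fitness: validation error in [0, 1]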
def main(xargs, myargs): assert torch.cuda.is_available(), 'CUDA is not available.' torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(xargs.workers) prepare_seed(xargs.rand_seed) logger = prepare_logger(xargs) train_data, valid_data, xshape, class_num = get_datasets( xargs.dataset, xargs.data_path, -1) config = load_config(xargs.config_path, { 'class_num': class_num, 'xshape': xshape }, logger) search_loader, _, valid_loader = get_nas_search_loaders( train_data, valid_data, xargs.dataset, 'AutoDL-Projects/configs/nas-benchmark/', (config.batch_size, config.test_batch_size), xargs.num_worker) logger.log( '||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}' .format(xargs.dataset, len(search_loader), len(valid_loader), config.batch_size)) logger.log('||||||| {:10s} ||||||| Config={:}'.format( xargs.dataset, config)) search_space = get_search_spaces('cell', xargs.search_space_name) if not hasattr(xargs, 'model_config') or xargs.model_config is None: model_config = dict2config( dict(name='SETN', C=xargs.channel, N=xargs.num_cells, max_nodes=xargs.max_nodes, num_classes=class_num, space=search_space, affine=False, track_running_stats=bool(xargs.track_running_stats)), None) else: model_config = load_config( xargs.model_config, dict(num_classes=class_num, space=search_space, affine=False, track_running_stats=bool(xargs.track_running_stats)), None) logger.log('search space : {:}'.format(search_space)) search_model = get_cell_based_tiny_net(model_config) w_optimizer, w_scheduler, criterion = get_optim_scheduler( search_model.get_weights(), config) a_optimizer = torch.optim.Adam(search_model.get_alphas(), lr=xargs.arch_learning_rate, betas=(0.5, 0.999), weight_decay=xargs.arch_weight_decay) logger.log('w-optimizer : {:}'.format(w_optimizer)) logger.log('a-optimizer : {:}'.format(a_optimizer)) logger.log('w-scheduler : {:}'.format(w_scheduler)) logger.log('criterion : {:}'.format(criterion)) flop, param = get_model_infos(search_model, xshape) logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param)) logger.log('search-space : {:}'.format(search_space)) if xargs.arch_nas_dataset is None: api = None else: api = API(xargs.arch_nas_dataset) logger.log('{:} create API = {:} done'.format(time_string(), api)) last_info, model_base_path, model_best_path = logger.path( 'info'), logger.path('model'), logger.path('best') network, criterion = torch.nn.DataParallel( search_model).cuda(), criterion.cuda() if last_info.exists(): # automatically resume from previous checkpoint logger.log("=> loading checkpoint of the last-info '{:}' start".format( last_info)) last_info = torch.load(last_info) start_epoch = last_info['epoch'] checkpoint = torch.load(last_info['last_checkpoint']) genotypes = checkpoint['genotypes'] valid_accuracies = checkpoint['valid_accuracies'] search_model.load_state_dict(checkpoint['search_model']) w_scheduler.load_state_dict(checkpoint['w_scheduler']) w_optimizer.load_state_dict(checkpoint['w_optimizer']) a_optimizer.load_state_dict(checkpoint['a_optimizer']) logger.log( "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch." 
.format(last_info, start_epoch)) else: logger.log("=> do not find the last-info file : {:}".format(last_info)) init_genotype, _ = get_best_arch(valid_loader, network, xargs.select_num) start_epoch, valid_accuracies, genotypes = 0, { 'best': -1 }, { -1: init_genotype } # start training start_time, search_time, epoch_time, total_epoch = time.time( ), AverageMeter(), AverageMeter(), config.epochs + config.warmup for epoch in range(start_epoch, total_epoch): w_scheduler.update(epoch, 0.0) need_time = 'Time Left: {:}'.format( convert_secs2time(epoch_time.val * (total_epoch - epoch), True)) epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch) logger.log('\n[Search the {:}-th epoch] {:}, LR={:}'.format( epoch_str, need_time, min(w_scheduler.get_lr()))) search_w_loss, search_w_top1, search_w_top5, search_a_loss, search_a_top1, search_a_top5 \ = search_func(search_loader, network, criterion, w_scheduler, w_optimizer, a_optimizer, epoch_str, xargs.print_freq, logger) search_time.update(time.time() - start_time) logger.log( '[{:}] search [base] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s' .format(epoch_str, search_w_loss, search_w_top1, search_w_top5, search_time.sum)) logger.log( '[{:}] search [arch] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%' .format(epoch_str, search_a_loss, search_a_top1, search_a_top5)) genotype, temp_accuracy = get_best_arch(valid_loader, network, xargs.select_num) network.module.set_cal_mode('dynamic', genotype) valid_a_loss, valid_a_top1, valid_a_top5 = valid_func( valid_loader, network, criterion) logger.log( '[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}% | {:}' .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5, genotype)) #search_model.set_cal_mode('urs') #valid_a_loss , valid_a_top1 , valid_a_top5 = valid_func(valid_loader, network, criterion) #logger.log('[{:}] URS---evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5)) #search_model.set_cal_mode('joint') #valid_a_loss , valid_a_top1 , valid_a_top5 = valid_func(valid_loader, network, criterion) #logger.log('[{:}] JOINT-evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5)) #search_model.set_cal_mode('select') #valid_a_loss , valid_a_top1 , valid_a_top5 = valid_func(valid_loader, network, criterion) #logger.log('[{:}] Selec-evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5)) # check the best accuracy valid_accuracies[epoch] = valid_a_top1 genotypes[epoch] = genotype logger.log('<<<--->>> The {:}-th epoch : {:}'.format( epoch_str, genotypes[epoch])) # save checkpoint save_path = save_checkpoint( { 'epoch': epoch + 1, 'args': deepcopy(xargs), 'search_model': search_model.state_dict(), 'w_optimizer': w_optimizer.state_dict(), 'a_optimizer': a_optimizer.state_dict(), 'w_scheduler': w_scheduler.state_dict(), 'genotypes': genotypes, 'valid_accuracies': valid_accuracies }, model_base_path, logger) last_info = save_checkpoint( { 'epoch': epoch + 1, 'args': deepcopy(xargs), 'last_checkpoint': save_path, }, logger.path('info'), logger) with torch.no_grad(): logger.log('{:}'.format(search_model.show_alphas())) if api is not None: logger.log('{:}'.format(api.query_by_arch(genotypes[epoch], '200'))) # measure elapsed time epoch_time.update(time.time() - start_time) start_time = time.time() # the final post procedure : count the time start_time = 
time.time() genotype, temp_accuracy = get_best_arch(valid_loader, network, xargs.select_num) search_time.update(time.time() - start_time) network.module.set_cal_mode('dynamic', genotype) valid_a_loss, valid_a_top1, valid_a_top5 = valid_func( valid_loader, network, criterion) logger.log( 'Last : the genotype is : {:}, with the validation accuracy of {:.3f}%.' .format(genotype, valid_a_top1)) logger.log('\n' + '-' * 100) # check the performance from the architecture dataset logger.log( 'SETN : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format( total_epoch, search_time.sum, genotype)) if api is not None: logger.log('{:}'.format(api.query_by_arch(genotype, '200'))) logger.close()
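valid_func reports the top-1/top-5 numbers logged above; the standard top-k computation it relies on looks like the following self-contained sketch (topk_accuracy is illustrative, not the codebase's own helper):

import torch

def topk_accuracy(logits, targets, ks=(1, 5)):
    # Percentage of samples whose true label is among the top-k predictions.
    maxk = max(ks)
    _, pred = logits.topk(maxk, dim=1)          # (batch, maxk) predicted indices
    correct = pred.eq(targets.view(-1, 1))      # broadcast compare against labels
    return [correct[:, :k].any(dim=1).float().mean().item() * 100.0 for k in ks]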
def main(xargs): assert torch.cuda.is_available(), 'CUDA is not available.' torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads( xargs.workers ) prepare_seed(xargs.rand_seed) logger = prepare_logger(args) train_data, test_data, xshape, class_num = get_datasets(xargs.dataset, xargs.data_path, -1) logger.log('use config from : {:}'.format(xargs.config_path)) config = load_config(xargs.config_path, {'class_num': class_num, 'xshape': xshape}, logger) _, train_loader, valid_loader = get_nas_search_loaders(train_data, test_data, xargs.dataset, 'configs/nas-benchmark/', config.batch_size, xargs.workers) # since ENAS will train the controller on valid-loader, we need to use train transformation for valid-loader valid_loader.dataset.transform = deepcopy(train_loader.dataset.transform) if hasattr(valid_loader.dataset, 'transforms'): valid_loader.dataset.transforms = deepcopy(train_loader.dataset.transforms) # data loader logger.log('||||||| {:10s} ||||||| Train-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}'.format(xargs.dataset, len(train_loader), len(valid_loader), config.batch_size)) logger.log('||||||| {:10s} ||||||| Config={:}'.format(xargs.dataset, config)) search_space = get_search_spaces('cell', xargs.search_space_name) model_config = dict2config({'name': 'ENAS', 'C': xargs.channel, 'N': xargs.num_cells, 'max_nodes': xargs.max_nodes, 'num_classes': class_num, 'space' : search_space, 'affine' : False, 'track_running_stats': bool(xargs.track_running_stats)}, None) shared_cnn = get_cell_based_tiny_net(model_config) controller = shared_cnn.create_controller() w_optimizer, w_scheduler, criterion = get_optim_scheduler(shared_cnn.parameters(), config) a_optimizer = torch.optim.Adam(controller.parameters(), lr=config.controller_lr, betas=config.controller_betas, eps=config.controller_eps) logger.log('w-optimizer : {:}'.format(w_optimizer)) logger.log('a-optimizer : {:}'.format(a_optimizer)) logger.log('w-scheduler : {:}'.format(w_scheduler)) logger.log('criterion : {:}'.format(criterion)) #flop, param = get_model_infos(shared_cnn, xshape) #logger.log('{:}'.format(shared_cnn)) #logger.log('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param)) logger.log('search-space : {:}'.format(search_space)) if xargs.arch_nas_dataset is None: api = None else: api = API(xargs.arch_nas_dataset) logger.log('{:} create API = {:} done'.format(time_string(), api)) shared_cnn, controller, criterion = torch.nn.DataParallel(shared_cnn).cuda(), controller.cuda(), criterion.cuda() last_info, model_base_path, model_best_path = logger.path('info'), logger.path('model'), logger.path('best') if last_info.exists(): # automatically resume from previous checkpoint logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info)) last_info = torch.load(last_info) start_epoch = last_info['epoch'] checkpoint = torch.load(last_info['last_checkpoint']) genotypes = checkpoint['genotypes'] baseline = checkpoint['baseline'] valid_accuracies = checkpoint['valid_accuracies'] shared_cnn.load_state_dict( checkpoint['shared_cnn'] ) controller.load_state_dict( checkpoint['controller'] ) w_scheduler.load_state_dict ( checkpoint['w_scheduler'] ) w_optimizer.load_state_dict ( checkpoint['w_optimizer'] ) a_optimizer.load_state_dict ( checkpoint['a_optimizer'] ) logger.log("=> loading checkpoint of the last-info '{:}' start with {:}-th epoch.".format(last_info, start_epoch)) else: logger.log("=> do not find the last-info file : 
{:}".format(last_info)) start_epoch, valid_accuracies, genotypes, baseline = 0, {'best': -1}, {}, None # start training start_time, search_time, epoch_time, total_epoch = time.time(), AverageMeter(), AverageMeter(), config.epochs + config.warmup for epoch in range(start_epoch, total_epoch): w_scheduler.update(epoch, 0.0) need_time = 'Time Left: {:}'.format( convert_secs2time(epoch_time.val * (total_epoch-epoch), True) ) epoch_str = '{:03d}-{:03d}'.format(epoch, total_epoch) logger.log('\n[Search the {:}-th epoch] {:}, LR={:}, baseline={:}'.format(epoch_str, need_time, min(w_scheduler.get_lr()), baseline)) cnn_loss, cnn_top1, cnn_top5 = train_shared_cnn(train_loader, shared_cnn, controller, criterion, w_scheduler, w_optimizer, epoch_str, xargs.print_freq, logger) logger.log('[{:}] shared-cnn : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%'.format(epoch_str, cnn_loss, cnn_top1, cnn_top5)) ctl_loss, ctl_acc, ctl_baseline, ctl_reward, baseline \ = train_controller(valid_loader, shared_cnn, controller, criterion, a_optimizer, \ dict2config({'baseline': baseline, 'ctl_train_steps': xargs.controller_train_steps, 'ctl_num_aggre': xargs.controller_num_aggregate, 'ctl_entropy_w': xargs.controller_entropy_weight, 'ctl_bl_dec' : xargs.controller_bl_dec}, None), \ epoch_str, xargs.print_freq, logger) search_time.update(time.time() - start_time) logger.log('[{:}] controller : loss={:.2f}, accuracy={:.2f}%, baseline={:.2f}, reward={:.2f}, current-baseline={:.4f}, time-cost={:.1f} s'.format(epoch_str, ctl_loss, ctl_acc, ctl_baseline, ctl_reward, baseline, search_time.sum)) best_arch, _ = get_best_arch(controller, shared_cnn, valid_loader) shared_cnn.module.update_arch(best_arch) _, best_valid_acc, _ = valid_func(valid_loader, shared_cnn, criterion) genotypes[epoch] = best_arch # check the best accuracy valid_accuracies[epoch] = best_valid_acc if best_valid_acc > valid_accuracies['best']: valid_accuracies['best'] = best_valid_acc genotypes['best'] = best_arch find_best = True else: find_best = False logger.log('<<<--->>> The {:}-th epoch : {:}'.format(epoch_str, genotypes[epoch])) # save checkpoint save_path = save_checkpoint({'epoch' : epoch + 1, 'args' : deepcopy(xargs), 'baseline' : baseline, 'shared_cnn' : shared_cnn.state_dict(), 'controller' : controller.state_dict(), 'w_optimizer' : w_optimizer.state_dict(), 'a_optimizer' : a_optimizer.state_dict(), 'w_scheduler' : w_scheduler.state_dict(), 'genotypes' : genotypes, 'valid_accuracies' : valid_accuracies}, model_base_path, logger) last_info = save_checkpoint({ 'epoch': epoch + 1, 'args' : deepcopy(args), 'last_checkpoint': save_path, }, logger.path('info'), logger) if find_best: logger.log('<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.'.format(epoch_str, best_valid_acc)) copy_checkpoint(model_base_path, model_best_path, logger) if api is not None: logger.log('{:}'.format(api.query_by_arch( genotypes[epoch] ))) # measure elapsed time epoch_time.update(time.time() - start_time) start_time = time.time() logger.log('\n' + '-'*100) logger.log('During searching, the best architecture is {:}'.format(genotypes['best'])) logger.log('Its accuracy is {:.2f}%'.format(valid_accuracies['best'])) logger.log('Randomly select {:} architectures and select the best.'.format(xargs.controller_num_samples)) start_time = time.time() final_arch, _ = get_best_arch(controller, shared_cnn, valid_loader, xargs.controller_num_samples) search_time.update(time.time() - start_time) shared_cnn.module.update_arch(final_arch) final_loss, final_top1, 
    logger.log('The Selected Final Architecture : {:}'.format(final_arch))
    logger.log('Loss={:.3f}, Accuracy@1={:.2f}%, Accuracy@5={:.2f}%'.format(final_loss, final_top1, final_top5))
    logger.log('ENAS : run {:} epochs, cost {:.1f} s, last-geno is {:}.'.format(total_epoch, search_time.sum, final_arch))
    if api is not None:
        logger.log('{:}'.format(api.query_by_arch(final_arch)))
    logger.close()
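# For reference, a minimal sketch of the `get_best_arch` selection step used
# above: sample candidate architectures from the controller and keep the one
# with the highest accuracy on a held-out batch. The real helper lives
# elsewhere in this repo; the controller's (log-prob, entropy, sampled_arch)
# return contract and the single-batch scoring are assumptions for illustration.
def get_best_arch_sketch(controller, shared_cnn, valid_loader, n_samples=100):
    controller.eval()
    shared_cnn.eval()
    best_arch, best_acc = None, -1.0
    with torch.no_grad():
        inputs, targets = next(iter(valid_loader))  # score all candidates on one batch
        inputs = inputs.cuda(non_blocking=True)
        targets = targets.cuda(non_blocking=True)
        for _ in range(n_samples):
            _, _, sampled_arch = controller()       # assumed sampling interface
            shared_cnn.module.update_arch(sampled_arch)
            _, logits = shared_cnn(inputs)
            acc = (logits.argmax(dim=1) == targets).float().mean().item()
            if acc > best_acc:
                best_arch, best_acc = sampled_arch, acc
    return best_arch, best_acc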