def account_one_arch(arch_index, arch_str, checkpoints, datasets, dataloader_dict):
    """Gather the results of one architecture from its per-seed checkpoints.

    Each checkpoint is loaded onto the CPU and must hold one entry per
    requested dataset.  The seed is parsed from the checkpoint file name (the
    text after the last ``-`` and before the next ``.``).  For ``cifar100``
    and ``ImageNet16-120`` the stored network is rebuilt, moved to the GPU
    and re-evaluated on the ``<dataset>@valid`` and ``<dataset>@test`` loaders.

    Args:
        arch_index: index of the architecture (also used in error messages).
        arch_str: architecture string, parsed by ``CellStructure.str2structure``.
        checkpoints: iterable of checkpoint paths; each must expose ``.name``.
        datasets: dataset names to read from every checkpoint.
        dataloader_dict: mapping from ``'<dataset>@<split>'`` to a data loader.

    Returns:
        The ``ArchResults`` object, updated once per (dataset, seed) pair.

    Raises:
        AssertionError: a dataset is missing from a checkpoint, or its
            ``'finish-train'`` flag is falsy.
        ValueError: a dataset name is not one of the four handled names.
    """
    information = ArchResults(arch_index, arch_str)

    for ckpt_path in checkpoints:
        ckpt = torch.load(ckpt_path, map_location='cpu')
        used_seed = ckpt_path.name.split('-')[-1].split('.')[0]

        for dataset in datasets:
            assert dataset in ckpt, 'Can not find {:} in arch-{:} from {:}'.format(
                dataset, arch_index, ckpt_path)
            results = ckpt[dataset]
            assert results['finish-train'], \
                'This {:} arch seed={:} does not finish train on {:} ::: {:}'.format(
                    arch_index, used_seed, dataset, ckpt_path)

            arch_config = {
                'channel': results['channel'],
                'num_cells': results['num_cells'],
                'arch_str': arch_str,
                'class_num': results['config']['class_num'],
            }
            xresult = ResultsCount(
                dataset, results['net_state_dict'], results['train_acc1es'],
                results['train_losses'], results['param'], results['flop'],
                arch_config, used_seed, results['total_epoch'], None)

            if dataset not in ('cifar10-valid', 'cifar10', 'cifar100', 'ImageNet16-120'):
                raise ValueError('invalid dataset name : {:}'.format(dataset))

            # The stored validation curve is 'x-valid' only for the
            # cifar10-valid split; every other dataset records it as 'ori-test'.
            eval_name = 'x-valid' if dataset == 'cifar10-valid' else 'ori-test'
            xresult.update_eval(eval_name, results['valid_acc1es'], results['valid_losses'])

            if dataset in ('cifar100', 'ImageNet16-120'):
                net_config = dict2config(
                    {
                        'name': 'infer.tiny',
                        'C': arch_config['channel'],
                        'N': arch_config['num_cells'],
                        'genotype': CellStructure.str2structure(arch_config['arch_str']),
                        'num_classes': arch_config['class_num'],
                    },
                    None)
                network = get_cell_based_tiny_net(net_config)
                network.load_state_dict(xresult.get_net_param())
                network = network.cuda()
                last_epoch = results['total_epoch'] - 1

                valid_loader = dataloader_dict['{:}@{:}'.format(dataset, 'valid')]
                loss, top1, top5, latencies = pure_evaluate(valid_loader, network)
                xresult.update_eval('x-valid', {last_epoch: top1}, {last_epoch: loss})

                test_loader = dataloader_dict['{:}@{:}'.format(dataset, 'test')]
                loss, top1, top5, latencies = pure_evaluate(test_loader, network)
                xresult.update_eval('x-test', {last_epoch: top1}, {last_epoch: loss})
                # Only the latencies measured on the test loader are recorded.
                xresult.update_latency(latencies)

            information.update(dataset, int(used_seed), xresult)

    return information
def evaluate_one_shot(model, xloader, api, cal_mode, seed=111):
    """Correlate one-shot estimates with benchmark CIFAR-10 accuracies.

    All architectures from ``CellStructure.gen_all`` are shuffled with
    ``seed``.  For each one, the summed log-softmax of the chosen
    architecture parameters and the accuracy on a single batch from
    ``xloader`` are compared (Pearson correlation via ``np.corrcoef``) with
    the ``'valid-accuracy'`` / ``'test-accuracy'`` values returned by ``api``.
    The model weights are snapshotted first and reloaded before returning.

    Args:
        model: super-network exposing ``arch_parameters``, ``op_names``,
            ``max_nodes``, ``edge2index`` and ``set_cal_mode``.
        xloader: iterable of ``(inputs, targets)`` batches; restarted when
            exhausted.
        api: benchmark API with ``query_index_by_arch`` and ``get_more_info``.
        cal_mode: passed to ``model.train``; also selects the ``Train`` /
            ``Eval`` label in the progress message.
        seed: seed for the architecture shuffle.

    Returns:
        Tuple ``(archs, probs, accuracies)`` in the shuffled order.
    """
    print(
        "This is an old version of codes to use NAS-Bench-API, and should be modified to align with the new version. Please contact me for more details if you use this function."
    )
    weights = deepcopy(model.state_dict())
    model.train(cal_mode)
    with torch.no_grad():
        arch_logits = nn.functional.log_softmax(model.arch_parameters, dim=-1)
        archs = CellStructure.gen_all(model.op_names, model.max_nodes, False)
        probs, accuracies, gt_accs_10_valid, gt_accs_10_test = [], [], [], []
        loader_iter = iter(xloader)
        random.seed(seed)
        random.shuffle(archs)

        for idx, arch in enumerate(archs):
            arch_index = api.query_index_by_arch(arch)
            metrics = api.get_more_info(arch_index, "cifar10-valid", None, False, False)
            gt_accs_10_valid.append(metrics["valid-accuracy"])
            metrics = api.get_more_info(arch_index, "cifar10", None, False, False)
            gt_accs_10_test.append(metrics["test-accuracy"])
            select_logits = []
            for i, node_info in enumerate(arch.nodes):
                for op, xin in node_info:
                    node_str = "{:}<-{:}".format(i + 1, xin)
                    op_index = model.op_names.index(op)
                    select_logits.append(arch_logits[model.edge2index[node_str], op_index])
            probs.append(sum(select_logits).item())

        cor_prob_valid = np.corrcoef(probs, gt_accs_10_valid)[0, 1]
        cor_prob_test = np.corrcoef(probs, gt_accs_10_test)[0, 1]
        print(
            "{:} correlation for probabilities : {:.6f} on CIFAR-10 validation and {:.6f} on CIFAR-10 test"
            .format(time_string(), cor_prob_valid, cor_prob_test))

        for idx, arch in enumerate(archs):
            model.set_cal_mode("dynamic", arch)
            try:
                inputs, targets = next(loader_iter)
            except StopIteration:
                # Only an exhausted loader should trigger a restart; any other
                # error (or Ctrl-C) must propagate instead of being retried.
                loader_iter = iter(xloader)
                inputs, targets = next(loader_iter)
            _, outputs = model(inputs.cuda())
            _, preds = torch.max(outputs, dim=-1)
            correct = (preds == targets.cuda()).float()
            accuracies.append(correct.mean().item())
            if idx != 0 and (idx % 500 == 0 or idx + 1 == len(archs)):
                cor_accs_valid = np.corrcoef(accuracies, gt_accs_10_valid[:idx + 1])[0, 1]
                cor_accs_test = np.corrcoef(accuracies, gt_accs_10_test[:idx + 1])[0, 1]
                print(
                    "{:} {:05d}/{:05d} mode={:5s}, correlation : accs={:.5f} for CIFAR-10 valid, {:.5f} for CIFAR-10 test."
                    .format(time_string(), idx, len(archs), "Train" if cal_mode else "Eval",
                            cor_accs_valid, cor_accs_test))
    model.load_state_dict(weights)
    return archs, probs, accuracies
def create_result_count(used_seed, dataset, arch_config, results, dataloader_dict):
    """Build a ``ResultsCount`` for one (seed, dataset) entry of a checkpoint.

    The network described by ``arch_config`` is always rebuilt and loaded
    with the stored parameters.  If ``results`` has a ``'train_times'`` key
    (new checkpoint format) the recorded curves are copied as they are.
    Otherwise the legacy path re-evaluates the network on the GPU with the
    loaders from ``dataloader_dict``.

    Args:
        used_seed: seed forwarded to ``ResultsCount``.
        dataset: dataset name; the legacy path handles ``cifar10-valid``,
            ``cifar10``, ``cifar100`` and ``ImageNet16-120``.
        arch_config: dict with ``channel``, ``num_cells``, ``arch_str`` and
            ``class_num``.
        results: per-dataset checkpoint dictionary.
        dataloader_dict: mapping from ``'<dataset>@<split>'`` to a data loader.

    Returns:
        The populated ``ResultsCount``.

    Raises:
        ValueError: legacy checkpoint with an unhandled dataset name.
    """
    xresult = ResultsCount(
        dataset, results['net_state_dict'], results['train_acc1es'], results['train_losses'],
        results['param'], results['flop'], arch_config, used_seed, results['total_epoch'], None)
    net_config = dict2config(
        {
            'name': 'infer.tiny',
            'C': arch_config['channel'],
            'N': arch_config['num_cells'],
            'genotype': CellStructure.str2structure(arch_config['arch_str']),
            'num_classes': arch_config['class_num'],
        },
        None)
    network = get_cell_based_tiny_net(net_config)
    network.load_state_dict(xresult.get_net_param())

    if 'train_times' in results:  # new version
        xresult.update_train_info(
            results['train_acc1es'], results['train_acc5es'],
            results['train_losses'], results['train_times'])
        xresult.update_eval(
            results['valid_acc1es'], results['valid_losses'], results['valid_times'])
        return xresult

    # Legacy checkpoints: part of the evaluation has to be recomputed.
    last_epoch = results['total_epoch'] - 1
    if dataset == 'cifar10-valid':
        xresult.update_OLD_eval('x-valid', results['valid_acc1es'], results['valid_losses'])
        loss, top1, top5, latencies = pure_evaluate(
            dataloader_dict['{:}@{:}'.format('cifar10', 'test')], network.cuda())
        xresult.update_OLD_eval('ori-test', {last_epoch: top1}, {last_epoch: loss})
        xresult.update_latency(latencies)
    elif dataset == 'cifar10':
        xresult.update_OLD_eval('ori-test', results['valid_acc1es'], results['valid_losses'])
        # This pass is used only for its latency measurement.
        loss, top1, top5, latencies = pure_evaluate(
            dataloader_dict['{:}@{:}'.format(dataset, 'test')], network.cuda())
        xresult.update_latency(latencies)
    elif dataset in ('cifar100', 'ImageNet16-120'):
        xresult.update_OLD_eval('ori-test', results['valid_acc1es'], results['valid_losses'])
        loss, top1, top5, latencies = pure_evaluate(
            dataloader_dict['{:}@{:}'.format(dataset, 'valid')], network.cuda())
        xresult.update_OLD_eval('x-valid', {last_epoch: top1}, {last_epoch: loss})
        loss, top1, top5, latencies = pure_evaluate(
            dataloader_dict['{:}@{:}'.format(dataset, 'test')], network.cuda())
        xresult.update_OLD_eval('x-test', {last_epoch: top1}, {last_epoch: loss})
        xresult.update_latency(latencies)
    else:
        raise ValueError('invalid dataset name : {:}'.format(dataset))
    return xresult
def generate_meta_info(save_dir, max_node, divide=40):
    """Create the benchmark meta file plus the training / post-processing scripts.

    All architectures of the ``nas-bench-201`` cell space are generated and
    shuffled with a fixed seed, and a fixed 50/50 split of 50000 indexes is
    built.  Both go into ``meta-node-<max_node>.pth``.  Three script files
    are then written, each covering the architectures in ``divide`` chunks.

    Args:
        save_dir: output directory (created if missing).
        max_node: number of nodes passed to ``CellStructure.gen_all``.
        divide: number of chunks the architecture range is split into.

    Raises:
        AssertionError: the shuffled architectures or splits differ from the
            hard-coded reference values, or the meta file already exists.
    """
    aa_nas_bench_ss = get_search_spaces('cell', 'nas-bench-201')
    archs = CellStructure.gen_all(aa_nas_bench_ss, max_node, False)
    print ('There are {:} archs vs {:}.'.format(len(archs), len(aa_nas_bench_ss) ** ((max_node-1)*max_node/2)))

    random.seed( 88 ) # please do not change this line for reproducibility
    random.shuffle( archs )
    # to test fixed-random shuffle
    assert archs[0 ].tostr() == '|avg_pool_3x3~0|+|nor_conv_1x1~0|skip_connect~1|+|nor_conv_1x1~0|skip_connect~1|skip_connect~2|', 'please check the 0-th architecture : {:}'.format(archs[0])
    assert archs[9 ].tostr() == '|avg_pool_3x3~0|+|none~0|none~1|+|skip_connect~0|none~1|nor_conv_3x3~2|', 'please check the 9-th architecture : {:}'.format(archs[9])
    assert archs[123].tostr() == '|avg_pool_3x3~0|+|avg_pool_3x3~0|nor_conv_1x1~1|+|none~0|avg_pool_3x3~1|nor_conv_3x3~2|', 'please check the 123-th architecture : {:}'.format(archs[123])
    total_arch = len(archs)

    num = 50000
    indexes_5W = list(range(num))
    random.seed( 1021 )
    random.shuffle( indexes_5W )
    train_split = sorted( list(set(indexes_5W[:num//2])) )
    valid_split = sorted( list(set(indexes_5W[num//2:])) )
    assert len(train_split) + len(valid_split) == num
    assert train_split[0] == 0 and train_split[10] == 26 and train_split[111] == 203 and valid_split[0] == 1 and valid_split[10] == 18 and valid_split[111] == 242, '{:} {:} {:} - {:} {:} {:}'.format(train_split[0], train_split[10], train_split[111], valid_split[0], valid_split[10], valid_split[111])
    splits = {num: {'train': train_split, 'valid': valid_split} }

    info = {'archs' : [x.tostr() for x in archs],
            'total' : total_arch,
            'max_node' : max_node,
            'splits': splits}

    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    save_name = save_dir / 'meta-node-{:}.pth'.format(max_node)
    assert not save_name.exists(), '{:} already exist'.format(save_name)
    torch.save(info, save_name)
    print ('save the meta file into {:}'.format(save_name))

    script_name_full = save_dir / 'BENCH-201-N{:}.opt-full.script'.format(max_node)
    script_name_less = save_dir / 'BENCH-201-N{:}.opt-less.script'.format(max_node)
    # A chunk size of 0 (fewer architectures than `divide`) would make
    # range() raise, so fall back to one architecture per chunk.
    gaps = max(total_arch // divide, 1)
    # Context managers guarantee both files are closed even if a write fails.
    with open(str(script_name_full), 'w') as full_file, \
            open(str(script_name_less), 'w') as less_file:
        for start in range(0, total_arch, gaps):
            xend = min(start+gaps, total_arch)
            full_file.write('bash ./scripts-search/NAS-Bench-201/train-models.sh 0 {:5d} {:5d} -1 \'777 888 999\'\n'.format(start, xend-1))
            less_file.write('bash ./scripts-search/NAS-Bench-201/train-models.sh 1 {:5d} {:5d} -1 \'777 888 999\'\n'.format(start, xend-1))
    print ('save the training script into {:} and {:}'.format(script_name_full, script_name_less))

    script_name = save_dir / 'meta-node-{:}.cal-script.txt'.format(max_node)
    macro = 'OMP_NUM_THREADS=6 CUDA_VISIBLE_DEVICES=0'
    with open(str(script_name), 'w') as cfile:
        for start in range(0, total_arch, gaps):
            xend = min(start+gaps, total_arch)
            cfile.write('{:} python exps/NAS-Bench-201/statistics.py --mode cal --target_dir {:06d}-{:06d}-C16-N5\n'.format(macro, start, xend-1))
    print ('save the post-processing script into {:}'.format(script_name))
def evaluate_one_shot(model, xloader, api, cal_mode, seed=111):
    """Correlate one-shot estimates with benchmark CIFAR-10 accuracies.

    All architectures from ``CellStructure.gen_all`` are shuffled with
    ``seed``.  For each one, the summed log-softmax of the chosen
    architecture parameters and the accuracy on a single batch from
    ``xloader`` are compared (Pearson correlation via ``np.corrcoef``) with
    the ``'valid-accuracy'`` / ``'test-accuracy'`` values returned by ``api``.
    The model weights are snapshotted first and reloaded before returning.

    Args:
        model: super-network exposing ``arch_parameters``, ``op_names``,
            ``max_nodes``, ``edge2index`` and ``set_cal_mode``.
        xloader: iterable of ``(inputs, targets)`` batches; restarted when
            exhausted.
        api: benchmark API with ``query_index_by_arch`` and ``get_more_info``.
        cal_mode: passed to ``model.train``; also selects the ``Train`` /
            ``Eval`` label in the progress message.
        seed: seed for the architecture shuffle.

    Returns:
        Tuple ``(archs, probs, accuracies)`` in the shuffled order.
    """
    weights = deepcopy(model.state_dict())
    model.train(cal_mode)
    with torch.no_grad():
        arch_logits = nn.functional.log_softmax(model.arch_parameters, dim=-1)
        archs = CellStructure.gen_all(model.op_names, model.max_nodes, False)
        probs, accuracies, gt_accs_10_valid, gt_accs_10_test = [], [], [], []
        loader_iter = iter(xloader)
        random.seed(seed)
        random.shuffle(archs)

        for idx, arch in enumerate(archs):
            arch_index = api.query_index_by_arch(arch)
            metrics = api.get_more_info(arch_index, 'cifar10-valid', None, False, False)
            gt_accs_10_valid.append(metrics['valid-accuracy'])
            metrics = api.get_more_info(arch_index, 'cifar10', None, False, False)
            gt_accs_10_test.append(metrics['test-accuracy'])
            select_logits = []
            for i, node_info in enumerate(arch.nodes):
                for op, xin in node_info:
                    node_str = '{:}<-{:}'.format(i + 1, xin)
                    op_index = model.op_names.index(op)
                    select_logits.append(arch_logits[model.edge2index[node_str], op_index])
            probs.append(sum(select_logits).item())

        cor_prob_valid = np.corrcoef(probs, gt_accs_10_valid)[0, 1]
        cor_prob_test = np.corrcoef(probs, gt_accs_10_test)[0, 1]
        print(
            '{:} correlation for probabilities : {:.6f} on CIFAR-10 validation and {:.6f} on CIFAR-10 test'
            .format(time_string(), cor_prob_valid, cor_prob_test))

        for idx, arch in enumerate(archs):
            model.set_cal_mode('dynamic', arch)
            try:
                inputs, targets = next(loader_iter)
            except StopIteration:
                # Only an exhausted loader should trigger a restart; any other
                # error (or Ctrl-C) must propagate instead of being retried.
                loader_iter = iter(xloader)
                inputs, targets = next(loader_iter)
            _, outputs = model(inputs.cuda())
            _, preds = torch.max(outputs, dim=-1)
            correct = (preds == targets.cuda()).float()
            accuracies.append(correct.mean().item())
            if idx != 0 and (idx % 500 == 0 or idx + 1 == len(archs)):
                cor_accs_valid = np.corrcoef(accuracies, gt_accs_10_valid[:idx + 1])[0, 1]
                cor_accs_test = np.corrcoef(accuracies, gt_accs_10_test[:idx + 1])[0, 1]
                print(
                    '{:} {:05d}/{:05d} mode={:5s}, correlation : accs={:.5f} for CIFAR-10 valid, {:.5f} for CIFAR-10 test.'
                    .format(time_string(), idx, len(archs), 'Train' if cal_mode else 'Eval',
                            cor_accs_valid, cor_accs_test))
    model.load_state_dict(weights)
    return archs, probs, accuracies
def generate_arch(self, actions):
    """Turn one chosen operation index per edge into a ``CellStructure``.

    For every node ``i`` in ``1 .. self.max_nodes - 1`` and every earlier
    node ``j``, the edge ``'i<-j'`` is mapped through ``self.edge2index`` to
    a position in ``actions``; that entry selects the operation name from
    ``self.search_space``.
    """
    genotypes = []
    for node in range(1, self.max_nodes):
        incoming = tuple(
            (self.search_space[actions[self.edge2index['{:}<-{:}'.format(node, src)]]], src)
            for src in range(node)
        )
        genotypes.append(incoming)
    return CellStructure(genotypes)
def config2structure(config):
    """Convert an edge-to-operation mapping into a ``CellStructure``.

    ``config`` is indexed with edge keys of the form ``'i<-j'`` for every
    node ``i`` in ``1 .. max_nodes - 1`` and every earlier node ``j``.
    ``max_nodes`` is taken from the enclosing scope.
    """
    genotypes = []
    for node in range(1, max_nodes):
        incoming = tuple(
            (config["{:}<-{:}".format(node, src)], src) for src in range(node)
        )
        genotypes.append(incoming)
    return CellStructure(genotypes)
def random_architecture():
    """Sample a ``CellStructure`` with a uniformly random operation per edge.

    One ``random.choice(op_names)`` draw is made for every edge, in
    node-major then source-major order.  ``max_nodes`` and ``op_names`` are
    taken from the enclosing scope.
    """
    genotypes = []
    for node in range(1, max_nodes):
        # The generator is consumed in order, so the draws happen in the same
        # sequence as an explicit inner loop over the source nodes.
        incoming = tuple((random.choice(op_names), src) for src in range(node))
        genotypes.append(incoming)
    return CellStructure(genotypes)
def get_an_arch():
    """Return the 4-node ``CellStructure`` whose every edge is ``nor_conv_3x3``."""
    genotypes = [
        tuple(('nor_conv_3x3', src) for src in range(node))
        for node in range(1, 4)
    ]
    return CellStructure(genotypes)
def main(save_dir: Path, workers: int, datasets: List[Text], xpaths: List[Text],
         splits: List[int], seeds: List[int], nets: List[str], opt_config: Dict[Text, Any],
         to_evaluate_indexes: tuple, cover_mode: bool, arch_config: Dict[Text, Any]):
    """Train and evaluate the selected architectures for every seed.

    For each index in ``to_evaluate_indexes`` and each seed, the result of
    ``evaluate_all_datasets`` is saved to
    ``save_dir / 'arch-<index>-seed-<seed>.pth'``.  An existing file is
    removed and recomputed when ``cover_mode`` is true and skipped otherwise.
    Progress and a remaining-time estimate are written to a ``Logger`` under
    ``save_dir / 'logs'``.
    """
    log_dir = save_dir / 'logs'
    log_dir.mkdir(parents=True, exist_ok=True)
    logger = Logger(str(log_dir), os.getpid(), False)

    num_todo, num_nets = len(to_evaluate_indexes), len(nets)

    logger.log('xargs : seeds = {:}'.format(seeds))
    logger.log('xargs : cover_mode = {:}'.format(cover_mode))
    logger.log('-' * 100)
    range_msg = 'Start evaluating range =: {:06d} - {:06d}'.format(
        min(to_evaluate_indexes), max(to_evaluate_indexes))
    total_msg = '({:} in total) / {:06d} with cover-mode={:}'.format(
        num_todo, num_nets, cover_mode)
    logger.log(range_msg + total_msg)
    for pos, (dataset, xpath, split) in enumerate(zip(datasets, xpaths, splits)):
        logger.log('--->>> Evaluate {:}/{:} : dataset={:9s}, path={:}, split={:}'.format(
            pos, len(datasets), dataset, xpath, split))
    logger.log('--->>> optimization config : {:}'.format(opt_config))

    start_time, epoch_time = time.time(), AverageMeter()
    for i, index in enumerate(to_evaluate_indexes):
        arch = nets[index]
        logger.log('\n{:} evaluate {:06d}/{:06d} ({:06d}/{:06d})-th arch [seeds={:}] {:}'.format(
            time_string(), i, num_todo, index, num_nets, seeds, '-' * 15))
        logger.log('{:} {:} {:}'.format('-' * 15, arch, '-' * 15))

        # test this arch on different datasets with different seeds
        skipped_any = False
        for seed in seeds:
            to_save_name = save_dir / 'arch-{:06d}-seed-{:04d}.pth'.format(index, seed)
            if to_save_name.exists():
                if not cover_mode:
                    logger.log('Find existing file : {:}, skip this evaluation'.format(to_save_name))
                    skipped_any = True
                    continue
                logger.log('Find existing file : {:}, remove it before evaluation'.format(to_save_name))
                os.remove(str(to_save_name))
            results = evaluate_all_datasets(
                CellStructure.str2structure(arch), datasets, xpaths, splits,
                opt_config, seed, arch_config, workers, logger)
            torch.save(results, to_save_name)
            logger.log('\n{:} evaluate {:06d}/{:06d} ({:06d}/{:06d})-th arch [seeds={:}] ===>>> {:}'.format(
                time_string(), i, num_todo, index, num_nets, seeds, to_save_name))

        # measure elapsed time; an architecture with a skipped seed is left
        # out of the running average
        if not skipped_any:
            epoch_time.update(time.time() - start_time)
        start_time = time.time()
        remaining = convert_secs2time(epoch_time.avg * (num_todo - i - 1), True)
        need_time = 'Time Left: {:}'.format(remaining)
        logger.log('This arch costs : {:}'.format(convert_secs2time(epoch_time.val, True)))
        logger.log('{:}'.format('*' * 100))
        done_msg = '{:06d}/{:06d} ({:06d}/{:06d})-th done, left {:}'.format(
            i, num_todo, index, num_nets, need_time)
        logger.log('{:} {:74s} {:}'.format('*' * 10, done_msg, '*' * 10))
        logger.log('{:}'.format('*' * 100))

    logger.close()
def traverse_net(max_node):
    """Return the architecture strings of the ``nats-bench`` cell space.

    The architectures from ``CellStructure.gen_all`` are shuffled with the
    fixed seed 88.  Three entries are checked against hard-coded reference
    strings so that a changed ordering is detected.

    Raises:
        AssertionError: entry 0, 9 or 123 differs from its reference string.
    """
    search_space = get_search_spaces('cell', 'nats-bench')
    archs = CellStructure.gen_all(search_space, max_node, False)
    expected_count = len(search_space) ** ((max_node-1)*max_node/2)
    print ('There are {:} archs vs {:}.'.format(len(archs), expected_count))

    random.seed( 88 ) # please do not change this line for reproducibility
    random.shuffle( archs )
    assert archs[0].tostr() == '|avg_pool_3x3~0|+|nor_conv_1x1~0|skip_connect~1|+|nor_conv_1x1~0|skip_connect~1|skip_connect~2|', \
        'please check the 0-th architecture : {:}'.format(archs[0])
    assert archs[9].tostr() == '|avg_pool_3x3~0|+|none~0|none~1|+|skip_connect~0|none~1|nor_conv_3x3~2|', \
        'please check the 9-th architecture : {:}'.format(archs[9])
    assert archs[123].tostr() == '|avg_pool_3x3~0|+|avg_pool_3x3~0|nor_conv_1x1~1|+|none~0|avg_pool_3x3~1|nor_conv_3x3~2|', \
        'please check the 123-th architecture : {:}'.format(archs[123])

    arch_strings = []
    for arch in archs:
        arch_strings.append(arch.tostr())
    return arch_strings
def genotype(self):
    """Return the ``CellStructure`` made of the arg-max operation on every edge.

    For each edge ``'i<-j'`` the matching row of ``self.arch_parameters`` is
    read under ``torch.no_grad()`` and the index of its largest entry picks
    the operation name from ``self.search_space``.
    """
    genotypes = []
    for node in range(1, self.max_nodes):
        incoming = []
        for src in range(node):
            edge_key = '{:}<-{:}'.format(node, src)
            with torch.no_grad():
                edge_weights = self.arch_parameters[self.edge2index[edge_key]]
                best_op = self.search_space[edge_weights.argmax().item()]
            incoming.append((best_op, src))
        genotypes.append(tuple(incoming))
    return CellStructure(genotypes)
def train_single_model(save_dir, workers, datasets, xpaths, splits, use_less, seeds, model_str, arch_config):
    """Train and evaluate one architecture for every seed and log the results.

    ``model_str`` is looked up in ``CellArchitectures`` first and otherwise
    parsed with ``CellStructure.str2structure``.  Per seed, an existing
    ``seed-<seed>.pth`` checkpoint is loaded; otherwise the model goes
    through ``evaluate_all_datasets`` and the result is saved.  The
    last-epoch accuracies of every dataset are then logged.

    Args:
        save_dir: root directory; results go to
            ``<save_dir>/specifics/<LESS|FULL>-<model_str>-<channel>-<num_cells>``.
        workers: value passed to ``torch.set_num_threads`` and the evaluation.
        datasets, xpaths, splits: forwarded to ``evaluate_all_datasets``.
        use_less: forwarded to the evaluation; also selects the ``LESS`` /
            ``FULL`` directory prefix.
        seeds: iterable of integer seeds.
        model_str: architecture name or architecture string.
        arch_config: dict with at least ``channel`` and ``num_cells``.

    Raises:
        AssertionError: CUDA is unavailable or the architecture contains an
            operation outside the ``full`` cell search space.
        ValueError: ``model_str`` is neither predefined nor parseable.
    """
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True
    #torch.backends.cudnn.benchmark = True
    torch.set_num_threads( workers )

    save_dir = Path(save_dir) / 'specifics' / '{:}-{:}-{:}-{:}'.format('LESS' if use_less else 'FULL', model_str, arch_config['channel'], arch_config['num_cells'])
    logger = Logger(str(save_dir), 0, False)
    if model_str in CellArchitectures:
        arch = CellArchitectures[model_str]
        logger.log('The model string is found in pre-defined architecture dict : {:}'.format(model_str))
    else:
        try:
            arch = CellStructure.str2structure(model_str)
        except Exception as err:
            # `except Exception` lets KeyboardInterrupt / SystemExit through,
            # and chaining keeps the original parsing error in the traceback.
            raise ValueError('Invalid model string : {:}. It can not be found or parsed.'.format(model_str)) from err
    assert arch.check_valid_op(get_search_spaces('cell', 'full')), '{:} has the invalid op.'.format(arch)
    logger.log('Start train-evaluate {:}'.format(arch.tostr()))
    logger.log('arch_config : {:}'.format(arch_config))

    start_time, seed_time = time.time(), AverageMeter()
    for _is, seed in enumerate(seeds):
        logger.log('\nThe {:02d}/{:02d}-th seed is {:} ----------------------<.>----------------------'.format(_is, len(seeds), seed))
        to_save_name = save_dir / 'seed-{:04d}.pth'.format(seed)
        if to_save_name.exists():
            logger.log('Find the existing file {:}, directly load!'.format(to_save_name))
            checkpoint = torch.load(to_save_name)
        else:
            logger.log('Does not find the existing file {:}, train and evaluate!'.format(to_save_name))
            checkpoint = evaluate_all_datasets(arch, datasets, xpaths, splits, use_less, seed, arch_config, workers, logger)
            torch.save(checkpoint, to_save_name)
        # log information
        logger.log('{:}'.format(checkpoint['info']))
        all_dataset_keys = checkpoint['all_dataset_keys']
        for dataset_key in all_dataset_keys:
            logger.log('\n{:} dataset : {:} {:}'.format('-'*15, dataset_key, '-'*15))
            dataset_info = checkpoint[dataset_key]
            #logger.log('Network ==>\n{:}'.format( dataset_info['net_string'] ))
            logger.log('Flops = {:} MB, Params = {:} MB'.format(dataset_info['flop'], dataset_info['param']))
            logger.log('config : {:}'.format(dataset_info['config']))
            logger.log('Training State (finish) = {:}'.format(dataset_info['finish-train']))
            last_epoch = dataset_info['total_epoch'] - 1
            train_acc1es, train_acc5es = dataset_info['train_acc1es'], dataset_info['train_acc5es']
            valid_acc1es, valid_acc5es = dataset_info['valid_acc1es'], dataset_info['valid_acc5es']
            logger.log('Last Info : Train = Acc@1 {:.2f}% Acc@5 {:.2f}% Error@1 {:.2f}%, Test = Acc@1 {:.2f}% Acc@5 {:.2f}% Error@1 {:.2f}%'.format(train_acc1es[last_epoch], train_acc5es[last_epoch], 100-train_acc1es[last_epoch], valid_acc1es[last_epoch], valid_acc5es[last_epoch], 100-valid_acc1es[last_epoch]))
        # measure elapsed time
        seed_time.update(time.time() - start_time)
        start_time = time.time()
        need_time = 'Time Left: {:}'.format( convert_secs2time(seed_time.avg * (len(seeds)-_is-1), True) )
        logger.log('\n<<<***>>> The {:02d}/{:02d}-th seed is {:} <finish> other procedures need {:}'.format(_is, len(seeds), seed, need_time))
    logger.close()
def generate_meta_info(save_dir, max_node, divide=40):
    """Create and save the benchmark meta file for ``max_node`` nodes.

    The ``nas-bench-201`` architectures are generated and shuffled with a
    fixed seed, and a fixed 50/50 split of 50000 indexes is built.  Both are
    stored in ``<save_dir>/meta-node-<max_node>.pth``.  ``divide`` is
    accepted but not used by this function.

    Raises:
        AssertionError: the shuffled architectures or splits differ from the
            hard-coded reference values, or the meta file already exists.
    """
    search_space = get_search_spaces('cell', 'nas-bench-201')
    archs = CellStructure.gen_all(search_space, max_node, False)
    expected_count = len(search_space)**((max_node - 1) * max_node / 2)
    print('There are {:} archs vs {:}.'.format(len(archs), expected_count))

    random.seed(88)  # please do not change this line for reproducibility
    random.shuffle(archs)
    # to test fixed-random shuffle
    assert archs[0].tostr() == '|avg_pool_3x3~0|+|nor_conv_1x1~0|skip_connect~1|+|nor_conv_1x1~0|skip_connect~1|skip_connect~2|', \
        'please check the 0-th architecture : {:}'.format(archs[0])
    assert archs[9].tostr() == '|avg_pool_3x3~0|+|none~0|none~1|+|skip_connect~0|none~1|nor_conv_3x3~2|', \
        'please check the 9-th architecture : {:}'.format(archs[9])
    assert archs[123].tostr() == '|avg_pool_3x3~0|+|avg_pool_3x3~0|nor_conv_1x1~1|+|none~0|avg_pool_3x3~1|nor_conv_3x3~2|', \
        'please check the 123-th architecture : {:}'.format(archs[123])
    total_arch = len(archs)

    num = 50000
    half = num // 2
    shuffled = list(range(num))
    random.seed(1021)
    random.shuffle(shuffled)
    train_split = sorted(set(shuffled[:half]))
    valid_split = sorted(set(shuffled[half:]))
    assert len(train_split) + len(valid_split) == num
    probes_ok = (
        train_split[0] == 0 and train_split[10] == 26 and train_split[111] == 203
        and valid_split[0] == 1 and valid_split[10] == 18 and valid_split[111] == 242
    )
    assert probes_ok, '{:} {:} {:} - {:} {:} {:}'.format(
        train_split[0], train_split[10], train_split[111],
        valid_split[0], valid_split[10], valid_split[111])

    info = {
        'archs': [arch.tostr() for arch in archs],
        'total': total_arch,
        'max_node': max_node,
        'splits': {num: {'train': train_split, 'valid': valid_split}},
    }

    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    save_name = save_dir / 'meta-node-{:}.pth'.format(max_node)
    assert not save_name.exists(), '{:} already exist'.format(save_name)
    torch.save(info, save_name)
    print('save the meta file into {:}'.format(save_name))
def check_unique_arch(meta_file):
    """Print how many architectures in ``meta_file`` are valid and how many are unique.

    The architecture strings exposed by ``API(meta_file).meta_archs`` are
    parsed, and uniqueness is counted three times, with ``to_unique_str``
    called with ``None``, ``False`` and ``True``.
    """
    api = API(str(meta_file))
    arch_strs = deepcopy(api.meta_archs)
    xarchs = [CellStructure.str2structure(x) for x in arch_strs]

    def get_unique_matrix(archs, consider_zero):
        """Group ``archs`` by unique string.

        Returns ``(sm_matrix, unique_ids, unique_num)``: a boolean matrix
        marking architectures with the same unique string, the group id of
        every architecture, and the number of groups.
        """
        unique_strs = [arch.to_unique_str(consider_zero) for arch in archs]
        print("{:} create unique-string ({:}/{:}) done".format(
            time_string(), len(set(unique_strs)), len(unique_strs)))

        groups = {}
        for index, xstr in enumerate(unique_strs):
            groups.setdefault(xstr, []).append(index)

        sm_matrix = torch.eye(len(archs)).bool()
        for members in groups.values():
            for i in members:
                for j in members:
                    sm_matrix[i, j] = True

        unique_ids = [-1] * len(archs)
        unique_num = 0
        for i in range(len(unique_ids)):
            if unique_ids[i] >= 0:
                # already labelled as a neighbour of an earlier architecture
                continue
            for nghb in sm_matrix[i].nonzero().view(-1).tolist():
                assert unique_ids[nghb] == -1, "impossible"
                unique_ids[nghb] = unique_num
            unique_num += 1
        return sm_matrix, unique_ids, unique_num

    print("There are {:} valid-archs".format(
        sum(arch.check_valid() for arch in xarchs)))

    reports = (
        (None, "{:} There are {:} unique architectures (considering nothing)."),
        (False, "{:} There are {:} unique architectures (not considering zero)."),
        (True, "{:} There are {:} unique architectures (considering zero)."),
    )
    for consider_zero, template in reports:
        sm_matrix, uniqueIDs, unique_num = get_unique_matrix(xarchs, consider_zero)
        print(template.format(time_string(), unique_num))
def test_issue_81_82(api):
    """Exercise the index-based and architecture-based queries of ``api``.

    Architecture 0 is queried on ``cifar10-valid`` with ``hp`` ``'12'`` and
    ``'200'``, and the seed-888 evaluation entries are printed.  The same
    architecture is then queried twice by architecture: once as a string and
    once as a parsed ``CellStructure``.
    """
    arch_str = '|nor_conv_3x3~0|+|skip_connect~0|nor_conv_3x3~1|+|skip_connect~0|none~1|nor_conv_3x3~2|'

    api.query_by_index(0, 'cifar10-valid', hp='12')
    results = api.query_by_index(0, 'cifar10-valid', hp='200')
    print(list(results.keys()))
    seed_result = results[888]
    print(seed_result.get_eval('valid'))
    print(results[888].get_eval('x-valid'))

    # The return value is not inspected; the call only has to succeed.
    api.get_more_info(index=0, dataset='cifar10-valid', iepoch=11, hp='200', is_random=False)

    print(api.query_by_arch(arch_str, '200'))
    structure = CellStructure.str2structure(arch_str)
    print(api.query_by_arch(structure, '200'))
def get_all_archs(operations):
    """Enumerate every 4-node cell that can be built from per-edge candidates.

    ``operations`` maps each edge ``(i, j)`` (``1 <= i <= 3``, ``0 <= j < i``)
    to an iterable of candidate operations.  The result holds one
    ``CellStructure`` per combination, with edges visited in the order
    (1,0), (2,0), (2,1), (3,0), (3,1), (3,2).
    """
    combos = []
    for node in range(1, 4):
        for src in range(node):
            if not combos:
                combos = [[(func, src)] for func in operations[(node, src)]]
            else:
                combos = [
                    prefix + [(func, src)]
                    for prefix in combos
                    for func in operations[(node, src)]
                ]

    # Regroup the six flat edges into the 1 / 2 / 3 incoming edges per node.
    return [
        CellStructure([[edges[0]], [edges[1], edges[2]], [edges[3], edges[4], edges[5]]])
        for edges in combos
    ]
def main(save_dir, workers, datasets, xpaths, splits, use_less, srange, arch_index, seeds, cover_mode, meta_info, arch_config):
    """Train and evaluate a range of benchmark architectures for every seed.

    Architectures ``srange[0] .. srange[1]`` (inclusive) of
    ``meta_info['archs']`` are evaluated, or only ``arch_index`` when it is
    not -1.  Each result is saved as ``arch-<index>-seed-<seed>.pth`` in a
    sub-directory named after the range and ``arch_config``.  An existing
    file is removed and recomputed when ``cover_mode`` is true and skipped
    otherwise.

    Raises:
        AssertionError: CUDA is unavailable, or ``srange`` / ``arch_index``
            are inconsistent with each other or with ``meta_info['total']``.
    """
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    #torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads( workers )

    assert len(srange) == 2 and 0 <= srange[0] <= srange[1], 'invalid srange : {:}'.format(srange)

    dir_template = '{:06d}-{:06d}-C{:}-N{:}-LESS' if use_less else '{:06d}-{:06d}-C{:}-N{:}'
    sub_dir = Path(save_dir) / dir_template.format(
        srange[0], srange[1], arch_config['channel'], arch_config['num_cells'])
    logger = Logger(str(sub_dir), 0, False)

    all_archs = meta_info['archs']
    assert srange[1] < meta_info['total'], 'invalid range : {:}-{:} vs. {:}'.format(srange[0], srange[1], meta_info['total'])
    assert arch_index == -1 or srange[0] <= arch_index <= srange[1], 'invalid range : {:} vs. {:} vs. {:}'.format(srange[0], arch_index, srange[1])
    to_evaluate_indexes = [arch_index] if arch_index != -1 else list(range(srange[0], srange[1]+1))
    num_todo = len(to_evaluate_indexes)

    logger.log('xargs : seeds = {:}'.format(seeds))
    logger.log('xargs : arch_index = {:}'.format(arch_index))
    logger.log('xargs : cover_mode = {:}'.format(cover_mode))
    logger.log('-'*100)

    logger.log('Start evaluating range =: {:06d} vs. {:06d} vs. {:06d} / {:06d} with cover-mode={:}'.format(
        srange[0], arch_index, srange[1], meta_info['total'], cover_mode))
    for pos, (dataset, xpath, split) in enumerate(zip(datasets, xpaths, splits)):
        logger.log('--->>> Evaluate {:}/{:} : dataset={:9s}, path={:}, split={:}'.format(
            pos, len(datasets), dataset, xpath, split))
    logger.log('--->>> architecture config : {:}'.format(arch_config))

    start_time, epoch_time = time.time(), AverageMeter()
    for i, index in enumerate(to_evaluate_indexes):
        arch = all_archs[index]
        logger.log('\n{:} evaluate {:06d}/{:06d} ({:06d}/{:06d})-th architecture [seeds={:}] {:}'.format(
            '-'*15, i, num_todo, index, meta_info['total'], seeds, '-'*15))
        #logger.log('{:} {:} {:}'.format('-'*15, arch.tostr(), '-'*15))
        logger.log('{:} {:} {:}'.format('-'*15, arch, '-'*15))

        # test this arch on different datasets with different seeds
        skipped_any = False
        for seed in seeds:
            to_save_name = sub_dir / 'arch-{:06d}-seed-{:04d}.pth'.format(index, seed)
            if to_save_name.exists():
                if not cover_mode:
                    logger.log('Find existing file : {:}, skip this evaluation'.format(to_save_name))
                    skipped_any = True
                    continue
                logger.log('Find existing file : {:}, remove it before evaluation'.format(to_save_name))
                os.remove(str(to_save_name))
            results = evaluate_all_datasets(
                CellStructure.str2structure(arch), datasets, xpaths, splits,
                use_less, seed, arch_config, workers, logger)
            torch.save(results, to_save_name)
            logger.log('{:} --evaluate-- {:06d}/{:06d} ({:06d}/{:06d})-th seed={:} done, save into {:}'.format(
                '-'*15, i, num_todo, index, meta_info['total'], seed, to_save_name))

        # measure elapsed time; an architecture with a skipped seed is left
        # out of the running average
        if not skipped_any:
            epoch_time.update(time.time() - start_time)
        start_time = time.time()
        remaining = convert_secs2time(epoch_time.avg * (num_todo-i-1), True)
        need_time = 'Time Left: {:}'.format(remaining)
        logger.log('This arch costs : {:}'.format(convert_secs2time(epoch_time.val, True)))
        logger.log('{:}'.format('*'*100))
        done_msg = '{:06d}/{:06d} ({:06d}/{:06d})-th done, left {:}'.format(
            i, num_todo, index, meta_info['total'], need_time)
        logger.log('{:} {:74s} {:}'.format('*'*10, done_msg, '*'*10))
        logger.log('{:}'.format('*'*100))

    logger.close()
def main(xargs):
    """Run a GDAS architecture search on CIFAR-10 with TensorFlow.

    The CIFAR-10 training set is shuffled and split into two interleaved
    halves: one updates the network weights, the other updates the
    architecture parameters.  The temperature ``tau`` is annealed linearly
    from ``xargs.tau_max`` to ``xargs.tau_min`` over the epochs.  After each
    epoch the current genotype and the epoch's metrics are printed.

    Args:
        xargs: namespace providing ``channel``, ``num_cells``, ``max_nodes``,
            ``w_lr``, ``w_weight_decay``, ``w_momentum``,
            ``arch_learning_rate``, ``arch_weight_decay``, ``tau_max``,
            ``tau_min`` and ``epochs``.
    """
    cifar10 = tf.keras.datasets.cifar10

    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0
    x_train, x_test = x_train.astype('float32'), x_test.astype('float32')

    # Split the training images into interleaved search-train / search-valid halves
    all_indexes = list(range(x_train.shape[0]))
    random.shuffle(all_indexes)
    s_train_idxs, s_valid_idxs = all_indexes[::2], all_indexes[1::2]
    search_train_x, search_train_y = x_train[s_train_idxs], y_train[s_train_idxs]
    search_valid_x, search_valid_y = x_train[s_valid_idxs], y_train[s_valid_idxs]
    #x_train, x_test = x_train[..., tf.newaxis], x_test[..., tf.newaxis]

    # Use tf.data
    #train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(64)
    search_ds = tf.data.Dataset.from_tensor_slices(
        (search_train_x, search_train_y, search_valid_x, search_valid_y))
    search_ds = search_ds.map(pre_process).shuffle(1000).batch(64)

    test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)

    # Create an instance of the model
    config = dict2config(
        {
            'name': 'GDAS',
            'C': xargs.channel,
            'N': xargs.num_cells,
            'max_nodes': xargs.max_nodes,
            'num_classes': 10,
            'space': 'nas-bench-102',
            'affine': True
        }, None)
    model = get_cell_based_tiny_net(config)
    #import pdb; pdb.set_trace()
    #model.build(((64, 32, 32, 3), (1,)))
    #for x in model.trainable_variables:
    #  print('{:30s} : {:}'.format(x.name, x.shape))

    # Choose optimizer
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
    w_optimizer = SGDW(learning_rate=xargs.w_lr,
                       weight_decay=xargs.w_weight_decay,
                       momentum=xargs.w_momentum,
                       nesterov=True)
    a_optimizer = AdamW(learning_rate=xargs.arch_learning_rate,
                        weight_decay=xargs.arch_weight_decay,
                        beta_1=0.5,
                        beta_2=0.999,
                        epsilon=1e-07)
    #w_optimizer = tf.keras.optimizers.SGD(learning_rate=0.025, momentum=0.9, nesterov=True)
    #a_optimizer = tf.keras.optimizers.AdamW(learning_rate=xargs.arch_learning_rate, beta_1=0.5, beta_2=0.999, epsilon=1e-07)
    ####
    # metrics
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    valid_loss = tf.keras.metrics.Mean(name='valid_loss')
    valid_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='valid_accuracy')
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

    @tf.function
    def search_step(train_images, train_labels, valid_images, valid_labels, tf_tau):
        # optimize weights
        with tf.GradientTape() as tape:
            predictions = model(train_images, tf_tau, True)
            w_loss = loss_object(train_labels, predictions)
        net_w_param = model.get_weights()
        gradients = tape.gradient(w_loss, net_w_param)
        w_optimizer.apply_gradients(zip(gradients, net_w_param))
        train_loss(w_loss)
        train_accuracy(train_labels, predictions)
        # optimize alphas
        with tf.GradientTape() as tape:
            predictions = model(valid_images, tf_tau, True)
            a_loss = loss_object(valid_labels, predictions)
        net_a_param = model.get_alphas()
        gradients = tape.gradient(a_loss, net_a_param)
        a_optimizer.apply_gradients(zip(gradients, net_a_param))
        valid_loss(a_loss)
        valid_accuracy(valid_labels, predictions)

    # TEST
    @tf.function
    def test_step(images, labels):
        predictions = model(images)
        t_loss = loss_object(labels, predictions)

        test_loss(t_loss)
        test_accuracy(labels, predictions)

    print(
        '{:} start searching with {:} epochs ({:} batches per epoch).'.format(
            time_string(), xargs.epochs,
            tf.data.experimental.cardinality(search_ds).numpy()))

    # With a single epoch there is nothing to anneal; using at least 1 as the
    # divisor keeps tau at tau_max instead of raising ZeroDivisionError.
    tau_steps = max(xargs.epochs - 1, 1)

    for epoch in range(xargs.epochs):
        # Reset the metrics at the start of the next epoch.  The validation
        # metrics must be reset too, otherwise the printed Valid-Loss and
        # Valid-Accuracy are running averages over all epochs so far.
        train_loss.reset_states()
        train_accuracy.reset_states()
        valid_loss.reset_states()
        valid_accuracy.reset_states()
        test_loss.reset_states()
        test_accuracy.reset_states()
        cur_tau = xargs.tau_max - (xargs.tau_max - xargs.tau_min) * epoch / tau_steps
        tf_tau = tf.cast(cur_tau, dtype=tf.float32, name='tau')

        for trn_imgs, trn_labels, val_imgs, val_labels in search_ds:
            search_step(trn_imgs, trn_labels, val_imgs, val_labels, tf_tau)
        genotype = model.genotype()
        genotype = CellStructure(genotype)

        #for test_images, test_labels in test_ds:
        #  test_step(test_images, test_labels)

        template = '{:} Epoch {:03d}/{:03d}, Train-Loss: {:.3f}, Train-Accuracy: {:.2f}%, Valid-Loss: {:.3f}, Valid-Accuracy: {:.2f}% | tau={:.3f}'
        print(
            template.format(time_string(), epoch + 1, xargs.epochs,
                            train_loss.result(),
                            train_accuracy.result() * 100, valid_loss.result(),
                            valid_accuracy.result() * 100, cur_tau))
        print('{:} genotype : {:}\n{:}\n'.format(time_string(), genotype,
                                                 model.get_np_alphas()))
def create_result_count(
    used_seed: int,
    dataset: Text,
    arch_config: Dict[Text, Any],
    results: Dict[Text, Any],
    dataloader_dict: Dict[Text, Any],
) -> ResultsCount:
    """Assemble the ``ResultsCount`` entry for one dataset/seed of a checkpoint.

    Two checkpoint layouts are handled:

    * ``results`` contains ``"train_times"`` (marked "new version" in the
      original code): the stored train/valid curves and timings are copied
      into the record and nothing is re-evaluated.
    * otherwise: the network is rebuilt from ``arch_config``, its stored
      weights are loaded, and it is evaluated with ``pure_evaluate`` on the
      loaders found in ``dataloader_dict`` under ``"<dataset>@<split>"`` keys.

    Args:
        used_seed: Seed the checkpoint was trained with.
        dataset: One of ``"cifar10-valid"``, ``"cifar10"``, ``"cifar100"`` or
            ``"ImageNet16-120"`` (only checked on the legacy path).
        arch_config: Needs ``"channel"``, ``"num_cells"``, ``"arch_str"`` and
            ``"class_num"``.
        results: The per-dataset dictionary read from the checkpoint.
        dataloader_dict: Loaders keyed by ``"<dataset>@<split>"``; only read
            on the legacy path.

    Returns:
        The populated ``ResultsCount``.

    Raises:
        ValueError: On the legacy path, if ``dataset`` is not a known name.
    """
    record = ResultsCount(
        dataset,
        results["net_state_dict"],
        results["train_acc1es"],
        results["train_losses"],
        results["param"],
        results["flop"],
        arch_config,
        used_seed,
        results["total_epoch"],
        None,
    )
    # The inference config is prepared up-front, before the layout is known.
    infer_spec = {
        "name": "infer.tiny",
        "C": arch_config["channel"],
        "N": arch_config["num_cells"],
        "genotype": CellStructure.str2structure(arch_config["arch_str"]),
        "num_classes": arch_config["class_num"],
    }
    net_config = dict2config(infer_spec, None)

    if "train_times" in results:
        # new version: everything needed is already stored in the checkpoint
        record.update_train_info(
            results["train_acc1es"],
            results["train_acc5es"],
            results["train_losses"],
            results["train_times"],
        )
        record.update_eval(
            results["valid_acc1es"],
            results["valid_losses"],
            results["valid_times"],
        )
        return record

    # Legacy layout: rebuild the network and measure the missing numbers.
    network = get_cell_based_tiny_net(net_config)
    network.load_state_dict(record.get_net_param())

    def _measure(owner, split):
        """Evaluate the network on one loader; return (loss, top1, latencies)."""
        loader = dataloader_dict["{:}@{:}".format(owner, split)]
        loss, top1, _top5, latencies = pure_evaluate(loader, network.cuda())
        return loss, top1, latencies

    def _store_final(tag, top1, loss):
        """Record a single measurement under the index of the last epoch."""
        final_epoch = results["total_epoch"] - 1
        record.update_OLD_eval(tag, {final_epoch: top1}, {final_epoch: loss})

    stored_accs, stored_losses = results["valid_acc1es"], results["valid_losses"]
    if dataset == "cifar10-valid":
        record.update_OLD_eval("x-valid", stored_accs, stored_losses)
        # The test numbers come from the cifar10 test loader.
        loss, top1, latencies = _measure("cifar10", "test")
        _store_final("ori-test", top1, loss)
        record.update_latency(latencies)
    elif dataset == "cifar10":
        record.update_OLD_eval("ori-test", stored_accs, stored_losses)
        # Only the latencies of this evaluation are kept.
        _, _, latencies = _measure(dataset, "test")
        record.update_latency(latencies)
    elif dataset in ("cifar100", "ImageNet16-120"):
        record.update_OLD_eval("ori-test", stored_accs, stored_losses)
        loss, top1, _ = _measure(dataset, "valid")
        _store_final("x-valid", top1, loss)
        loss, top1, latencies = _measure(dataset, "test")
        _store_final("x-test", top1, loss)
        record.update_latency(latencies)
    else:
        raise ValueError("invalid dataset name : {:}".format(dataset))
    return record
def _log_dataset_summary(logger, checkpoint, dataset_key):
    """Log the stored statistics of one dataset entry of a seed checkpoint."""
    logger.log("\n{:} dataset : {:} {:}".format("-" * 15, dataset_key, "-" * 15))
    dataset_info = checkpoint[dataset_key]
    # logger.log('Network ==>\n{:}'.format( dataset_info['net_string'] ))
    logger.log(
        "Flops = {:} MB, Params = {:} MB".format(
            dataset_info["flop"], dataset_info["param"]
        )
    )
    logger.log("config : {:}".format(dataset_info["config"]))
    logger.log("Training State (finish) = {:}".format(dataset_info["finish-train"]))
    # The per-epoch containers are indexed by epoch; report the final one.
    last_epoch = dataset_info["total_epoch"] - 1
    train_acc1es, train_acc5es = (
        dataset_info["train_acc1es"],
        dataset_info["train_acc5es"],
    )
    valid_acc1es, valid_acc5es = (
        dataset_info["valid_acc1es"],
        dataset_info["valid_acc5es"],
    )
    logger.log(
        "Last Info : Train = Acc@1 {:.2f}% Acc@5 {:.2f}% Error@1 {:.2f}%, Test = Acc@1 {:.2f}% Acc@5 {:.2f}% Error@1 {:.2f}%".format(
            train_acc1es[last_epoch],
            train_acc5es[last_epoch],
            100 - train_acc1es[last_epoch],
            valid_acc1es[last_epoch],
            valid_acc5es[last_epoch],
            100 - valid_acc1es[last_epoch],
        )
    )


def train_single_model(
    save_dir, workers, datasets, xpaths, splits, use_less, seeds, model_str, arch_config
):
    """Train and evaluate one architecture for every seed, reusing saved results.

    Results are stored under
    ``<save_dir>/specifics/<LESS|FULL>-<model_str>-<channel>-<num_cells>/``
    as ``seed-XXXX.pth``. A seed whose file already exists is loaded instead
    of being retrained. A summary of every dataset in the checkpoint is
    logged after each seed.

    Args:
        save_dir: Root output directory.
        workers: Passed to ``torch.set_num_threads`` and forwarded to
            ``evaluate_all_datasets``.
        datasets, xpaths, splits: Forwarded unchanged to
            ``evaluate_all_datasets``.
        use_less: Selects the ``LESS`` / ``FULL`` directory prefix and is
            forwarded to ``evaluate_all_datasets``.
        seeds: Iterable of integer seeds (must support ``len``).
        model_str: Either a key of ``CellArchitectures`` or a string that
            ``CellStructure.str2structure`` can parse.
        arch_config: Mapping with at least ``"channel"`` and ``"num_cells"``.

    Raises:
        AssertionError: If CUDA is unavailable or the architecture contains an
            operation outside the ``("cell", "full")`` search space.
        ValueError: If ``model_str`` is neither pre-defined nor parsable.
    """
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = True
    torch.set_num_threads(workers)

    save_dir = (
        Path(save_dir)
        / "specifics"
        / "{:}-{:}-{:}-{:}".format(
            "LESS" if use_less else "FULL",
            model_str,
            arch_config["channel"],
            arch_config["num_cells"],
        )
    )
    logger = Logger(str(save_dir), 0, False)
    # Close the logger on every exit path, including failures while parsing
    # the architecture or while training a seed.
    try:
        if model_str in CellArchitectures:
            arch = CellArchitectures[model_str]
            logger.log(
                "The model string is found in pre-defined architecture dict : {:}".format(
                    model_str
                )
            )
        else:
            try:
                arch = CellStructure.str2structure(model_str)
            except Exception as err:
                # Not a bare ``except``: KeyboardInterrupt/SystemExit must
                # propagate, and the parsing error is kept as the cause.
                raise ValueError(
                    "Invalid model string : {:}. It can not be found or parsed.".format(
                        model_str
                    )
                ) from err
        assert arch.check_valid_op(
            get_search_spaces("cell", "full")
        ), "{:} has the invalid op.".format(arch)
        logger.log("Start train-evaluate {:}".format(arch.tostr()))
        logger.log("arch_config : {:}".format(arch_config))

        start_time, seed_time = time.time(), AverageMeter()
        for _is, seed in enumerate(seeds):
            logger.log(
                "\nThe {:02d}/{:02d}-th seed is {:} ----------------------<.>----------------------".format(
                    _is, len(seeds), seed
                )
            )
            to_save_name = save_dir / "seed-{:04d}.pth".format(seed)
            if to_save_name.exists():
                logger.log(
                    "Find the existing file {:}, directly load!".format(to_save_name)
                )
                checkpoint = torch.load(to_save_name)
            else:
                logger.log(
                    "Does not find the existing file {:}, train and evaluate!".format(
                        to_save_name
                    )
                )
                checkpoint = evaluate_all_datasets(
                    arch,
                    datasets,
                    xpaths,
                    splits,
                    use_less,
                    seed,
                    arch_config,
                    workers,
                    logger,
                )
                torch.save(checkpoint, to_save_name)
            # log information
            logger.log("{:}".format(checkpoint["info"]))
            for dataset_key in checkpoint["all_dataset_keys"]:
                _log_dataset_summary(logger, checkpoint, dataset_key)
            # measure elapsed time
            seed_time.update(time.time() - start_time)
            start_time = time.time()
            # Estimate the remaining time from the average per-seed duration.
            need_time = "Time Left: {:}".format(
                convert_secs2time(seed_time.avg * (len(seeds) - _is - 1), True)
            )
            logger.log(
                "\n<<<***>>> The {:02d}/{:02d}-th seed is {:} <finish> other procedures need {:}".format(
                    _is, len(seeds), seed, need_time
                )
            )
    finally:
        logger.close()