def generate_meta_info(save_dir, max_node, divide=40):
    """Enumerate every cell of the "nas-bench-201" space and save a meta file.

    The architectures are shuffled with a fixed seed, three known positions
    are checked against hard-coded strings, a fixed 50000-index train/valid
    split is generated, and everything is stored with ``torch.save`` in
    ``<save_dir>/meta-node-<max_node>.pth``.

    Args:
        save_dir: Output directory; created (with parents) when missing.
        max_node: Node count forwarded to ``CellStructure.gen_all``.
        divide: Accepted for interface compatibility; not used here.

    Raises:
        AssertionError: If a reproducibility check fails or the meta file
            already exists.
    """
    search_space = get_search_spaces("cell", "nas-bench-201")
    archs = CellStructure.gen_all(search_space, max_node, False)
    expected_total = len(search_space)**((max_node - 1) * max_node / 2)
    print("There are {:} archs vs {:}.".format(len(archs), expected_total))

    random.seed(88)  # please do not change this line for reproducibility
    random.shuffle(archs)

    # Fixed-seed shuffle must place these architectures at these positions;
    # each entry is (position, expected string, failure message template).
    pinned = (
        (
            0,
            "|avg_pool_3x3~0|+|nor_conv_1x1~0|skip_connect~1|+|nor_conv_1x1~0|skip_connect~1|skip_connect~2|",
            "please check the 0-th architecture : {:}",
        ),
        (
            9,
            "|avg_pool_3x3~0|+|none~0|none~1|+|skip_connect~0|none~1|nor_conv_3x3~2|",
            "please check the 9-th architecture : {:}",
        ),
        (
            123,
            "|avg_pool_3x3~0|+|avg_pool_3x3~0|nor_conv_1x1~1|+|none~0|avg_pool_3x3~1|nor_conv_3x3~2|",
            "please check the 123-th architecture : {:}",
        ),
    )
    for position, expected_str, complaint in pinned:
        assert archs[position].tostr() == expected_str, complaint.format(
            archs[position])

    total_arch = len(archs)

    # Build the fixed half/half index split.
    num = 50000
    shuffled_ids = list(range(num))
    random.seed(1021)
    random.shuffle(shuffled_ids)
    half = num // 2
    train_split = sorted(set(shuffled_ids[:half]))
    valid_split = sorted(set(shuffled_ids[half:]))
    assert len(train_split) + len(valid_split) == num
    probes = (
        train_split[0],
        train_split[10],
        train_split[111],
        valid_split[0],
        valid_split[10],
        valid_split[111],
    )
    assert probes == (0, 26, 203, 1, 18, 242), \
        "{:} {:} {:} - {:} {:} {:}".format(*probes)

    info = {
        "archs": [cell.tostr() for cell in archs],
        "total": total_arch,
        "max_node": max_node,
        "splits": {num: {"train": train_split, "valid": valid_split}},
    }

    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    save_name = save_dir / "meta-node-{:}.pth".format(max_node)
    # Refuse to overwrite an existing meta file.
    assert not save_name.exists(), "{:} already exist".format(save_name)
    torch.save(info, save_name)
    print("save the meta file into {:}".format(save_name))
def config2structure(config):
    """Build a ``CellStructure`` from a mapping keyed by edge name.

    For every node ``i`` in ``1 .. max_nodes - 1`` and every predecessor
    ``j < i`` the operation is read from ``config["<i><-<j>"]``.
    ``max_nodes`` is taken from the enclosing scope.
    """
    genotypes = [
        tuple((config["{:}<-{:}".format(dst, src)], src)
              for src in range(dst))
        for dst in range(1, max_nodes)
    ]
    return CellStructure(genotypes)
def random_architecture():
    """Sample a ``CellStructure`` with one random operation per edge.

    Each edge ``j -> i`` (``j < i``, ``i`` in ``1 .. max_nodes - 1``) gets an
    operation drawn with ``random.choice(op_names)``. ``max_nodes`` and
    ``op_names`` come from the enclosing scope. Edges are visited in
    increasing ``i`` then increasing ``j``, so the sequence of random draws
    is the same as a plain nested loop.
    """
    genotypes = [
        tuple((random.choice(op_names), src) for src in range(dst))
        for dst in range(1, max_nodes)
    ]
    return CellStructure(genotypes)
def generate_arch(self, actions):
    """Turn per-edge action indices into a ``CellStructure``.

    For edge ``"<i><-<j>"`` the position ``self.edge2index[...]`` selects an
    entry of ``actions``, and that entry indexes ``self.search_space`` to get
    the operation name.
    """
    genotypes = [
        tuple(
            (self.search_space[actions[self.edge2index["{:}<-{:}".format(
                dst, src)]]], src) for src in range(dst))
        for dst in range(1, self.max_nodes)
    ]
    return CellStructure(genotypes)
def genotype(self):
    """Derive the discrete cell by taking the arg-max operation on each edge.

    For edge ``"<i><-<j>"`` the row ``self.arch_parameters[edge2index[...]]``
    is reduced with ``argmax`` and the resulting index selects the operation
    name from ``self.search_space``. Gradient tracking is disabled while the
    parameters are read.
    """
    genotypes = []
    with torch.no_grad():
        for dst in range(1, self.max_nodes):
            incoming = []
            for src in range(dst):
                edge_key = "{:}<-{:}".format(dst, src)
                edge_weights = self.arch_parameters[self.edge2index[edge_key]]
                best_op = edge_weights.argmax().item()
                incoming.append((self.search_space[best_op], src))
            genotypes.append(tuple(incoming))
    return CellStructure(genotypes)
def check_unique_arch(meta_file):
    """Print how many architectures in a meta file are valid and unique.

    The architecture strings are read from ``API(str(meta_file)).meta_archs``
    and parsed with ``CellStructure.str2structure``. Uniqueness is counted
    three times, passing ``None``, ``False`` and ``True`` as the
    ``consider_zero`` argument of ``to_unique_str``. Nothing is returned.
    """
    api = API(str(meta_file))
    arch_strs = deepcopy(api.meta_archs)
    xarchs = [CellStructure.str2structure(text) for text in arch_strs]

    def get_unique_matrix(archs, consider_zero):
        """Group ``archs`` by unique string.

        Returns ``(same_matrix, unique_ids, unique_num)`` where
        ``same_matrix[i, j]`` is True when archs i and j share a unique
        string, ``unique_ids[i]`` is the group number of arch i (numbered in
        order of first appearance) and ``unique_num`` is the group count.
        """
        keys = [arch.to_unique_str(consider_zero) for arch in archs]
        print("{:} create unique-string ({:}/{:}) done".format(
            time_string(), len(set(keys)), len(keys)))

        # unique string -> positions of all archs that produce it
        groups = dict()
        for position, key in enumerate(keys):
            groups.setdefault(key, list()).append(position)

        same_matrix = torch.eye(len(archs)).bool()
        for members in groups.values():
            for row in members:
                for col in members:
                    same_matrix[row, col] = True

        unique_ids = [-1] * len(archs)
        unique_num = 0
        for row in range(len(unique_ids)):
            if unique_ids[row] > -1:
                # already labelled as a neighbour of an earlier arch
                continue
            for nghb in same_matrix[row].nonzero().view(-1).tolist():
                assert unique_ids[nghb] == -1, "impossible"
                unique_ids[nghb] = unique_num
            unique_num += 1
        return same_matrix, unique_ids, unique_num

    valid_count = sum(arch.check_valid() for arch in xarchs)
    print("There are {:} valid-archs".format(valid_count))

    # (consider_zero flag, report template) for the three passes, in order.
    reports = (
        (None,
         "{:} There are {:} unique architectures (considering nothing)."),
        (False,
         "{:} There are {:} unique architectures (not considering zero)."),
        (True, "{:} There are {:} unique architectures (considering zero)."),
    )
    for consider_zero, template in reports:
        sm_matrix, uniqueIDs, unique_num = get_unique_matrix(
            xarchs, consider_zero)
        print(template.format(time_string(), unique_num))
def traverse_net(max_node):
    """Return all cell strings of the "nats-bench" space in fixed order.

    Every architecture produced by ``CellStructure.gen_all`` is shuffled with
    seed 88; three known positions are checked against hard-coded strings
    before the list of ``tostr()`` results is returned.

    Raises:
        AssertionError: If the shuffled order differs from the pinned one.
    """
    search_space = get_search_spaces("cell", "nats-bench")
    archs = CellStructure.gen_all(search_space, max_node, False)
    expected_total = len(search_space)**((max_node - 1) * max_node / 2)
    print("There are {:} archs vs {:}.".format(len(archs), expected_total))

    random.seed(88)  # please do not change this line for reproducibility
    random.shuffle(archs)

    # (position, expected string, failure message template)
    pinned = (
        (
            0,
            "|avg_pool_3x3~0|+|nor_conv_1x1~0|skip_connect~1|+|nor_conv_1x1~0|skip_connect~1|skip_connect~2|",
            "please check the 0-th architecture : {:}",
        ),
        (
            9,
            "|avg_pool_3x3~0|+|none~0|none~1|+|skip_connect~0|none~1|nor_conv_3x3~2|",
            "please check the 9-th architecture : {:}",
        ),
        (
            123,
            "|avg_pool_3x3~0|+|avg_pool_3x3~0|nor_conv_1x1~1|+|none~0|avg_pool_3x3~1|nor_conv_3x3~2|",
            "please check the 123-th architecture : {:}",
        ),
    )
    for position, expected_str, complaint in pinned:
        assert archs[position].tostr() == expected_str, complaint.format(
            archs[position])

    return [cell.tostr() for cell in archs]
def generate_meta_info(save_dir, max_node, divide=40):
    """Save the meta file plus the training / post-processing scripts.

    Steps:
      1. Enumerate every cell of the "nas-bench-201" space, shuffle with a
         fixed seed and verify three pinned positions.
      2. Create a fixed 50000-index train/valid split.
      3. ``torch.save`` the result to ``<save_dir>/meta-node-<max_node>.pth``.
      4. Write ``BENCH-201-N<max_node>.opt-full.script`` and
         ``...opt-less.script`` with one training command per index range.
      5. Write ``meta-node-<max_node>.cal-script.txt`` with one statistics
         command per index range.

    Args:
        save_dir: Output directory; created (with parents) when missing.
        max_node: Node count forwarded to ``CellStructure.gen_all``.
        divide: Number of roughly equal index ranges the architectures are
            divided into for the generated scripts.

    Raises:
        AssertionError: If a reproducibility check fails or the meta file
            already exists.
    """
    aa_nas_bench_ss = get_search_spaces("cell", "nas-bench-201")
    archs = CellStructure.gen_all(aa_nas_bench_ss, max_node, False)
    print("There are {:} archs vs {:}.".format(
        len(archs),
        len(aa_nas_bench_ss)**((max_node - 1) * max_node / 2)))

    random.seed(88)  # please do not change this line for reproducibility
    random.shuffle(archs)
    # The fixed-seed shuffle must reproduce these known positions.
    assert (
        archs[0].tostr() ==
        "|avg_pool_3x3~0|+|nor_conv_1x1~0|skip_connect~1|+|nor_conv_1x1~0|skip_connect~1|skip_connect~2|"
    ), "please check the 0-th architecture : {:}".format(archs[0])
    assert (
        archs[9].tostr() ==
        "|avg_pool_3x3~0|+|none~0|none~1|+|skip_connect~0|none~1|nor_conv_3x3~2|"
    ), "please check the 9-th architecture : {:}".format(archs[9])
    assert (
        archs[123].tostr() ==
        "|avg_pool_3x3~0|+|avg_pool_3x3~0|nor_conv_1x1~1|+|none~0|avg_pool_3x3~1|nor_conv_3x3~2|"
    ), "please check the 123-th architecture : {:}".format(archs[123])
    total_arch = len(archs)

    # Fixed half/half split over 50000 indices.
    num = 50000
    indexes_5W = list(range(num))
    random.seed(1021)
    random.shuffle(indexes_5W)
    train_split = sorted(set(indexes_5W[:num // 2]))
    valid_split = sorted(set(indexes_5W[num // 2:]))
    assert len(train_split) + len(valid_split) == num
    assert (train_split[0] == 0 and train_split[10] == 26
            and train_split[111] == 203 and valid_split[0] == 1
            and valid_split[10] == 18
            and valid_split[111] == 242), "{:} {:} {:} - {:} {:} {:}".format(
                train_split[0],
                train_split[10],
                train_split[111],
                valid_split[0],
                valid_split[10],
                valid_split[111],
            )
    splits = {num: {"train": train_split, "valid": valid_split}}

    info = {
        "archs": [x.tostr() for x in archs],
        "total": total_arch,
        "max_node": max_node,
        "splits": splits,
    }

    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    save_name = save_dir / "meta-node-{:}.pth".format(max_node)
    # Refuse to overwrite an existing meta file.
    assert not save_name.exists(), "{:} already exist".format(save_name)
    torch.save(info, save_name)
    print("save the meta file into {:}".format(save_name))

    script_name_full = save_dir / "BENCH-201-N{:}.opt-full.script".format(
        max_node)
    script_name_less = save_dir / "BENCH-201-N{:}.opt-less.script".format(
        max_node)
    # Clamp the step to 1: when ``divide`` exceeds ``total_arch`` the integer
    # division yields 0 and ``range`` rejects a zero step.
    gaps = max(1, total_arch // divide)
    # ``with`` guarantees both handles are closed even if a write fails.
    with open(str(script_name_full), "w") as full_file, \
            open(str(script_name_less), "w") as less_file:
        for start in range(0, total_arch, gaps):
            xend = min(start + gaps, total_arch)
            full_file.write(
                "bash ./scripts-search/NAS-Bench-201/train-models.sh 0 {:5d} {:5d} -1 '777 888 999'\n"
                .format(start, xend - 1))
            less_file.write(
                "bash ./scripts-search/NAS-Bench-201/train-models.sh 1 {:5d} {:5d} -1 '777 888 999'\n"
                .format(start, xend - 1))
    print("save the training script into {:} and {:}".format(
        script_name_full, script_name_less))

    script_name = save_dir / "meta-node-{:}.cal-script.txt".format(max_node)
    macro = "OMP_NUM_THREADS=6 CUDA_VISIBLE_DEVICES=0"
    with open(str(script_name), "w") as cfile:
        for start in range(0, total_arch, gaps):
            xend = min(start + gaps, total_arch)
            cfile.write(
                "{:} python exps/NAS-Bench-201/statistics.py --mode cal --target_dir {:06d}-{:06d}-C16-N5\n"
                .format(macro, start, xend - 1))
    print("save the post-processing script into {:}".format(script_name))
def train_single_model(save_dir, workers, datasets, xpaths, splits, use_less,
                       seeds, model_str, arch_config):
    """Train/evaluate one architecture for every seed and log the results.

    The architecture is looked up in ``CellArchitectures`` by ``model_str``
    or, failing that, parsed with ``CellStructure.str2structure``. For each
    seed the checkpoint ``seed-<seed>.pth`` is loaded when it already exists;
    otherwise ``evaluate_all_datasets`` is run and its result saved.

    Args:
        save_dir: Root output directory; results go to
            ``<save_dir>/specifics/<LESS|FULL>-<model_str>-<channel>-<num_cells>``.
        workers: Thread count for ``torch.set_num_threads``; also forwarded
            to ``evaluate_all_datasets``.
        datasets, xpaths, splits: Forwarded to ``evaluate_all_datasets``.
        use_less: Selects the "LESS" directory prefix and is forwarded to
            ``evaluate_all_datasets``.
        seeds: Iterable of integer seeds (needs ``len``).
        model_str: Key of ``CellArchitectures`` or a parsable cell string.
        arch_config: Mapping with at least ``"channel"`` and ``"num_cells"``.

    Raises:
        AssertionError: If CUDA is unavailable or the architecture contains
            an operation outside the "full" cell search space.
        ValueError: If ``model_str`` is neither pre-defined nor parsable.
    """
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = True
    torch.set_num_threads(workers)

    save_dir = (Path(save_dir) / "specifics" / "{:}-{:}-{:}-{:}".format(
        "LESS" if use_less else "FULL",
        model_str,
        arch_config["channel"],
        arch_config["num_cells"],
    ))
    logger = Logger(str(save_dir), 0, False)
    if model_str in CellArchitectures:
        arch = CellArchitectures[model_str]
        logger.log(
            "The model string is found in pre-defined architecture dict : {:}".
            format(model_str))
    else:
        try:
            arch = CellStructure.str2structure(model_str)
        except Exception as err:
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate; chaining keeps the parser's own error.
            raise ValueError(
                "Invalid model string : {:}. It can not be found or parsed.".
                format(model_str)) from err
    assert arch.check_valid_op(get_search_spaces(
        "cell", "full")), "{:} has the invalid op.".format(arch)
    logger.log("Start train-evaluate {:}".format(arch.tostr()))
    logger.log("arch_config : {:}".format(arch_config))

    start_time, seed_time = time.time(), AverageMeter()
    for _is, seed in enumerate(seeds):
        logger.log(
            "\nThe {:02d}/{:02d}-th seed is {:} ----------------------<.>----------------------"
            .format(_is, len(seeds), seed))
        to_save_name = save_dir / "seed-{:04d}.pth".format(seed)
        if to_save_name.exists():
            logger.log("Find the existing file {:}, directly load!".format(
                to_save_name))
            # NOTE: torch.load unpickles the file; it is expected to be a
            # checkpoint written by a previous run of this function.
            checkpoint = torch.load(to_save_name)
        else:
            logger.log(
                "Does not find the existing file {:}, train and evaluate!".
                format(to_save_name))
            checkpoint = evaluate_all_datasets(
                arch,
                datasets,
                xpaths,
                splits,
                use_less,
                seed,
                arch_config,
                workers,
                logger,
            )
            torch.save(checkpoint, to_save_name)

        # log information
        logger.log("{:}".format(checkpoint["info"]))
        all_dataset_keys = checkpoint["all_dataset_keys"]
        for dataset_key in all_dataset_keys:
            logger.log("\n{:} dataset : {:} {:}".format(
                "-" * 15, dataset_key, "-" * 15))
            dataset_info = checkpoint[dataset_key]
            # logger.log('Network ==>\n{:}'.format( dataset_info['net_string'] ))
            logger.log("Flops = {:} MB, Params = {:} MB".format(
                dataset_info["flop"], dataset_info["param"]))
            logger.log("config : {:}".format(dataset_info["config"]))
            logger.log("Training State (finish) = {:}".format(
                dataset_info["finish-train"]))
            # Accuracies are indexed by epoch; report the final one.
            last_epoch = dataset_info["total_epoch"] - 1
            train_acc1es, train_acc5es = (
                dataset_info["train_acc1es"],
                dataset_info["train_acc5es"],
            )
            valid_acc1es, valid_acc5es = (
                dataset_info["valid_acc1es"],
                dataset_info["valid_acc5es"],
            )
            logger.log(
                "Last Info : Train = Acc@1 {:.2f}% Acc@5 {:.2f}% Error@1 {:.2f}%, Test = Acc@1 {:.2f}% Acc@5 {:.2f}% Error@1 {:.2f}%"
                .format(
                    train_acc1es[last_epoch],
                    train_acc5es[last_epoch],
                    100 - train_acc1es[last_epoch],
                    valid_acc1es[last_epoch],
                    valid_acc5es[last_epoch],
                    100 - valid_acc1es[last_epoch],
                ))

        # measure elapsed time
        seed_time.update(time.time() - start_time)
        start_time = time.time()
        need_time = "Time Left: {:}".format(
            convert_secs2time(seed_time.avg * (len(seeds) - _is - 1), True))
        logger.log(
            "\n<<<***>>> The {:02d}/{:02d}-th seed is {:} <finish> other procedures need {:}"
            .format(_is, len(seeds), seed, need_time))
    logger.close()
def main(
    save_dir,
    workers,
    datasets,
    xpaths,
    splits,
    use_less,
    srange,
    arch_index,
    seeds,
    cover_mode,
    meta_info,
    arch_config,
):
    """Evaluate a range of architectures from ``meta_info`` for every seed.

    Results are stored as ``arch-<index>-seed-<seed>.pth`` inside
    ``<save_dir>/<start>-<end>-C<channel>-N<num_cells>[-LESS]``.

    Args:
        save_dir: Root output directory.
        workers: Thread count for ``torch.set_num_threads``; also forwarded
            to ``evaluate_all_datasets``.
        datasets, xpaths, splits: Parallel sequences forwarded to
            ``evaluate_all_datasets``.
        use_less: Adds the ``-LESS`` directory suffix and is forwarded to
            ``evaluate_all_datasets``.
        srange: ``(first, last)`` inclusive architecture index range.
        arch_index: ``-1`` to evaluate the whole range, otherwise a single
            index inside ``srange``.
        seeds: Iterable of integer seeds.
        cover_mode: When true, existing result files are removed and
            recomputed; otherwise they are skipped.
        meta_info: Mapping with ``"archs"`` (cell strings) and ``"total"``.
        arch_config: Mapping with at least ``"channel"`` and ``"num_cells"``.

    Raises:
        AssertionError: If CUDA is unavailable or the ranges are invalid.
    """
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    # torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(workers)

    assert (len(srange) == 2 and
            0 <= srange[0] <= srange[1]), "invalid srange : {:}".format(srange)

    dir_template = ("{:06d}-{:06d}-C{:}-N{:}-LESS"
                    if use_less else "{:06d}-{:06d}-C{:}-N{:}")
    sub_dir = Path(save_dir) / dir_template.format(
        srange[0], srange[1], arch_config["channel"],
        arch_config["num_cells"])
    logger = Logger(str(sub_dir), 0, False)

    all_archs = meta_info["archs"]
    assert srange[1] < meta_info[
        "total"], "invalid range : {:}-{:} vs. {:}".format(
            srange[0], srange[1], meta_info["total"])
    assert (arch_index == -1 or srange[0] <= arch_index <= srange[1]
            ), "invalid range : {:} vs. {:} vs. {:}".format(
                srange[0], arch_index, srange[1])
    to_evaluate_indexes = (list(range(srange[0], srange[1] + 1))
                           if arch_index == -1 else [arch_index])
    num_jobs = len(to_evaluate_indexes)

    logger.log("xargs : seeds = {:}".format(seeds))
    logger.log("xargs : arch_index = {:}".format(arch_index))
    logger.log("xargs : cover_mode = {:}".format(cover_mode))
    logger.log("-" * 100)
    logger.log(
        "Start evaluating range =: {:06d} vs. {:06d} vs. {:06d} / {:06d} with cover-mode={:}"
        .format(srange[0], arch_index, srange[1], meta_info["total"],
                cover_mode))
    for order, (dataset, xpath, split) in enumerate(
            zip(datasets, xpaths, splits)):
        logger.log(
            "--->>> Evaluate {:}/{:} : dataset={:9s}, path={:}, split={:}".
            format(order, len(datasets), dataset, xpath, split))
    logger.log("--->>> architecture config : {:}".format(arch_config))

    start_time, epoch_time = time.time(), AverageMeter()
    for i, index in enumerate(to_evaluate_indexes):
        arch = all_archs[index]
        logger.log(
            "\n{:} evaluate {:06d}/{:06d} ({:06d}/{:06d})-th architecture [seeds={:}] {:}"
            .format(
                "-" * 15,
                i,
                num_jobs,
                index,
                meta_info["total"],
                seeds,
                "-" * 15,
            ))
        logger.log("{:} {:} {:}".format("-" * 15, arch, "-" * 15))

        # test this arch on different datasets with different seeds
        has_continue = False
        for seed in seeds:
            to_save_name = sub_dir / "arch-{:06d}-seed-{:04d}.pth".format(
                index, seed)
            if to_save_name.exists():
                if not cover_mode:
                    logger.log(
                        "Find existing file : {:}, skip this evaluation".
                        format(to_save_name))
                    has_continue = True
                    continue
                logger.log(
                    "Find existing file : {:}, remove it before evaluation".
                    format(to_save_name))
                os.remove(str(to_save_name))
            results = evaluate_all_datasets(
                CellStructure.str2structure(arch),
                datasets,
                xpaths,
                splits,
                use_less,
                seed,
                arch_config,
                workers,
                logger,
            )
            torch.save(results, to_save_name)
            logger.log(
                "{:} --evaluate-- {:06d}/{:06d} ({:06d}/{:06d})-th seed={:} done, save into {:}"
                .format(
                    "-" * 15,
                    i,
                    num_jobs,
                    index,
                    meta_info["total"],
                    seed,
                    to_save_name,
                ))

        # measure elapsed time; skipped architectures do not feed the average
        if not has_continue:
            epoch_time.update(time.time() - start_time)
        start_time = time.time()
        remaining = epoch_time.avg * (num_jobs - i - 1)
        need_time = "Time Left: {:}".format(convert_secs2time(
            remaining, True))
        logger.log("This arch costs : {:}".format(
            convert_secs2time(epoch_time.val, True)))
        progress = "{:06d}/{:06d} ({:06d}/{:06d})-th done, left {:}".format(
            i, num_jobs, index, meta_info["total"], need_time)
        logger.log("{:}".format("*" * 100))
        logger.log("{:} {:74s} {:}".format("*" * 10, progress, "*" * 10))
        logger.log("{:}".format("*" * 100))

    logger.close()
def main(
    save_dir: Path,
    workers: int,
    datasets: List[Text],
    xpaths: List[Text],
    splits: List[int],
    seeds: List[int],
    nets: List[str],
    opt_config: Dict[Text, Any],
    to_evaluate_indexes: tuple,
    cover_mode: bool,
    arch_config: Dict[Text, Any],
):
    """Evaluate the selected architectures from ``nets`` for every seed.

    Logs are written below ``<save_dir>/logs`` and each result is saved as
    ``<save_dir>/arch-<index>-seed-<seed>.pth``.

    Args:
        save_dir: Output directory for results and the ``logs`` sub-folder.
        workers: Forwarded to ``evaluate_all_datasets``.
        datasets, xpaths, splits: Parallel sequences forwarded to
            ``evaluate_all_datasets``.
        seeds: Seeds to run for every architecture.
        nets: Architecture strings, indexed by ``to_evaluate_indexes``.
        opt_config: Optimisation config forwarded to
            ``evaluate_all_datasets``.
        to_evaluate_indexes: Positions in ``nets`` to evaluate.
        cover_mode: When true, existing result files are removed and
            recomputed; otherwise they are skipped.
        arch_config: Architecture config forwarded to
            ``evaluate_all_datasets``.
    """
    log_dir = save_dir / "logs"
    log_dir.mkdir(parents=True, exist_ok=True)
    logger = Logger(str(log_dir), os.getpid(), False)

    num_jobs = len(to_evaluate_indexes)
    logger.log("xargs : seeds = {:}".format(seeds))
    logger.log("xargs : cover_mode = {:}".format(cover_mode))
    logger.log("-" * 100)
    range_part = "Start evaluating range =: {:06d} - {:06d}".format(
        min(to_evaluate_indexes), max(to_evaluate_indexes))
    total_part = "({:} in total) / {:06d} with cover-mode={:}".format(
        num_jobs, len(nets), cover_mode)
    logger.log(range_part + total_part)
    for order, (dataset, xpath, split) in enumerate(
            zip(datasets, xpaths, splits)):
        logger.log(
            "--->>> Evaluate {:}/{:} : dataset={:9s}, path={:}, split={:}".
            format(order, len(datasets), dataset, xpath, split))
    logger.log("--->>> optimization config : {:}".format(opt_config))

    start_time, epoch_time = time.time(), AverageMeter()
    for i, index in enumerate(to_evaluate_indexes):
        arch = nets[index]
        logger.log(
            "\n{:} evaluate {:06d}/{:06d} ({:06d}/{:06d})-th arch [seeds={:}] {:}"
            .format(
                time_string(),
                i,
                num_jobs,
                index,
                len(nets),
                seeds,
                "-" * 15,
            ))
        logger.log("{:} {:} {:}".format("-" * 15, arch, "-" * 15))

        # test this arch on different datasets with different seeds
        has_continue = False
        for seed in seeds:
            to_save_name = save_dir / "arch-{:06d}-seed-{:04d}.pth".format(
                index, seed)
            if to_save_name.exists():
                if not cover_mode:
                    logger.log(
                        "Find existing file : {:}, skip this evaluation".
                        format(to_save_name))
                    has_continue = True
                    continue
                logger.log(
                    "Find existing file : {:}, remove it before evaluation".
                    format(to_save_name))
                os.remove(str(to_save_name))
            results = evaluate_all_datasets(
                CellStructure.str2structure(arch),
                datasets,
                xpaths,
                splits,
                opt_config,
                seed,
                arch_config,
                workers,
                logger,
            )
            torch.save(results, to_save_name)
            logger.log(
                "\n{:} evaluate {:06d}/{:06d} ({:06d}/{:06d})-th arch [seeds={:}] ===>>> {:}"
                .format(
                    time_string(),
                    i,
                    num_jobs,
                    index,
                    len(nets),
                    seeds,
                    to_save_name,
                ))

        # measure elapsed time; skipped architectures do not feed the average
        if not has_continue:
            epoch_time.update(time.time() - start_time)
        start_time = time.time()
        remaining = epoch_time.avg * (num_jobs - i - 1)
        need_time = "Time Left: {:}".format(convert_secs2time(
            remaining, True))
        logger.log("This arch costs : {:}".format(
            convert_secs2time(epoch_time.val, True)))
        progress = "{:06d}/{:06d} ({:06d}/{:06d})-th done, left {:}".format(
            i, num_jobs, index, len(nets), need_time)
        logger.log("{:}".format("*" * 100))
        logger.log("{:} {:74s} {:}".format("*" * 10, progress, "*" * 10))
        logger.log("{:}".format("*" * 100))

    logger.close()
def create_result_count(used_seed, dataset, arch_config, results,
                        dataloader_dict):
    """Wrap one raw training result into a ``ResultsCount`` object.

    A tiny inference network is rebuilt from ``arch_config`` and loaded with
    the stored weights. Results containing ``"train_times"`` (new format) are
    copied over directly. Older results are completed by re-evaluating the
    network on the loaders in ``dataloader_dict`` (keys ``"<name>@<split>"``),
    which moves the network to CUDA.

    Args:
        used_seed: Seed stored in the ``ResultsCount``.
        dataset: One of ``"cifar10-valid"``, ``"cifar10"``, ``"cifar100"``,
            ``"ImageNet16-120"`` (only checked for old-format results).
        arch_config: Mapping with ``"channel"``, ``"num_cells"``,
            ``"arch_str"`` and ``"class_num"``.
        results: Raw result mapping of a single run.
        dataloader_dict: Data loaders used for old-format results.

    Returns:
        The populated ``ResultsCount``.

    Raises:
        ValueError: For an old-format result with an unknown dataset name.
    """
    xresult = ResultsCount(
        dataset,
        results["net_state_dict"],
        results["train_acc1es"],
        results["train_losses"],
        results["param"],
        results["flop"],
        arch_config,
        used_seed,
        results["total_epoch"],
        None,
    )
    net_settings = {
        "name": "infer.tiny",
        "C": arch_config["channel"],
        "N": arch_config["num_cells"],
        "genotype": CellStructure.str2structure(arch_config["arch_str"]),
        "num_classes": arch_config["class_num"],
    }
    net_config = dict2config(net_settings, None)
    network = get_cell_based_tiny_net(net_config)
    network.load_state_dict(xresult.get_net_param())

    if "train_times" in results:  # new version
        xresult.update_train_info(
            results["train_acc1es"],
            results["train_acc5es"],
            results["train_losses"],
            results["train_times"],
        )
        xresult.update_eval(results["valid_acc1es"], results["valid_losses"],
                            results["valid_times"])
        return xresult

    def evaluate_on(name, split_name):
        # Run the rebuilt network on the loader registered as "<name>@<split>".
        loader = dataloader_dict["{:}@{:}".format(name, split_name)]
        return pure_evaluate(loader, network.cuda())

    def at_last_epoch(value):
        # Old-format metrics are dicts keyed by epoch; store under the last.
        return {results["total_epoch"] - 1: value}

    # Old-format results: fill in the missing evaluations per dataset.
    if dataset == "cifar10-valid":
        xresult.update_OLD_eval("x-valid", results["valid_acc1es"],
                                results["valid_losses"])
        loss, top1, top5, latencies = evaluate_on("cifar10", "test")
        xresult.update_OLD_eval("ori-test", at_last_epoch(top1),
                                at_last_epoch(loss))
        xresult.update_latency(latencies)
    elif dataset == "cifar10":
        xresult.update_OLD_eval("ori-test", results["valid_acc1es"],
                                results["valid_losses"])
        loss, top1, top5, latencies = evaluate_on(dataset, "test")
        xresult.update_latency(latencies)
    elif dataset in ("cifar100", "ImageNet16-120"):
        xresult.update_OLD_eval("ori-test", results["valid_acc1es"],
                                results["valid_losses"])
        loss, top1, top5, latencies = evaluate_on(dataset, "valid")
        xresult.update_OLD_eval("x-valid", at_last_epoch(top1),
                                at_last_epoch(loss))
        loss, top1, top5, latencies = evaluate_on(dataset, "test")
        xresult.update_OLD_eval("x-test", at_last_epoch(top1),
                                at_last_epoch(loss))
        # Latency comes from the second (test) evaluation.
        xresult.update_latency(latencies)
    else:
        raise ValueError("invalid dataset name : {:}".format(dataset))
    return xresult