def debug():
    # designed to debug the encoding scheme
    seed = 0
    np.random.seed(seed)
    budget = 2000
    B, n_ops, n_cell = 5, 7, 2
    networks = []
    design_id = 1
    while len(networks) < budget:
        bit_string = []
        for c in range(n_cell):
            for b in range(B):
                bit_string += [np.random.randint(n_ops),
                               np.random.randint(b + 2),
                               np.random.randint(n_ops),
                               np.random.randint(b + 2)]
        genome = convert(bit_string)
        # check against evaluated networks in case of duplicates
        doTrain = True
        for network in networks:
            if compare(genome, network):
                doTrain = False
                break
        if doTrain:
            genotype = decode(genome)
            model = Network(16, 10, 8, False, genotype)
            model.drop_path_prob = 0.0
            data = torch.randn(1, 3, 32, 32)
            output, output_aux = model(torch.autograd.Variable(data))
            networks.append(genome)
            design_id += 1
            print(design_id)

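# Hedged illustration (not part of the search code): each of the n_cell cells
# contributes B blocks of four integers -- (op, input, op, input) -- so the raw
# bit string sampled above has n_cell * B * 4 = 2 * 5 * 4 = 40 entries. The
# helpers convert/compare/decode are project-specific; their semantics are assumed.
def _example_bit_string_length(B=5, n_ops=7, n_cell=2):
    """Sample one bit string the same way debug() does and return its length."""
    bit_string = []
    for _ in range(n_cell):
        for b in range(B):
            bit_string += [np.random.randint(n_ops), np.random.randint(b + 2),
                           np.random.randint(n_ops), np.random.randint(b + 2)]
    return len(bit_string)  # 40 with the defaults above
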
def main(macro_genome, micro_genome, epochs, search_space='micro',
         save='Design_1', expr_root='search', seed=0, gpu=0,
         init_channels=24, layers=11, auxiliary=False, cutout=False,
         drop_path_prob=0.0, batch_size=128):
    # ---- train logger ----------------- #
    save_pth = os.path.join(expr_root, '{}'.format(save))
    utils.create_exp_dir(save_pth)
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')

    # ---- parameter values setting ----- #
    CIFAR_CLASSES = config_dict()['n_classes']
    INPUT_CHANNELS = config_dict()['n_channels']
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = '../data'
    cutout_length = 16
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        'auxiliary': auxiliary,
        'auxiliary_weight': auxiliary_weight,
        'grad_clip': grad_clip,
        'report_freq': report_freq,
    }

    if search_space == 'micro' or search_space == 'micro_garbage':
        genome = micro_genome
        genotype = micro_encoding.decode(genome)
        model = Network(init_channels, CIFAR_CLASSES, config_dict()['n_channels'],
                        layers, auxiliary, genotype)
    elif search_space == 'macro' or search_space == 'macro_garbage':
        genome = macro_genome
        genotype = macro_encoding.decode(genome)
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(genotype, channels, CIFAR_CLASSES,
                           (config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH']),
                           decoder='residual')
    elif search_space == 'micromacro':
        genome = [macro_genome, micro_genome]
        macro_genotype = macro_encoding.decode(macro_genome)
        micro_genotype = micro_encoding.decode(micro_genome)
        genotype = [macro_genotype, micro_genotype]
        set_config('micro_creator', make_micro_creator(micro_genotype, convert=False))
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(macro_genotype, channels, CIFAR_CLASSES,
                           (config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH']),
                           decoder='residual')
    else:
        raise NameError('Unknown search space type')

    # logging.info("Genome = %s", genome)
    logging.info("Architecture = %s", genotype)

    torch.cuda.set_device(gpu)
    cudnn.benchmark = True
    torch.manual_seed(seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(seed)

    n_params = (np.sum(np.prod(v.size()) for v in
                       filter(lambda p: p.requires_grad, model.parameters())) / 1e6)
    model = model.to(device)

    logging.info("param size = %fMB", n_params)

    if config_dict()['problem'] == 'classification':
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.MSELoss()
    criterion = criterion.cuda()

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.SGD(
        parameters,
        learning_rate,
        momentum=momentum,
        weight_decay=weight_decay
    )

    CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
    CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])
    if cutout:
        train_transform.transforms.append(utils.Cutout(cutout_length))
    train_transform.transforms.append(transforms.Normalize(CIFAR_MEAN, CIFAR_STD))

    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])

    train_data = my_cifar10.CIFAR10(root=data_root, train=True,
                                    download=False, transform=train_transform)
    valid_data = my_cifar10.CIFAR10(root=data_root, train=False,
                                    download=False, transform=valid_transform)

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=1)

    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True, num_workers=1)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, int(epochs))

    for epoch in range(epochs):
        scheduler.step()
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        model.droprate = drop_path_prob * epoch / epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer, train_params)
        logging.info(f'train_{config_dict()["performance_measure"]} %f', train_acc)

        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info(f'valid_{config_dict()["performance_measure"]} %f', valid_acc)

    # calculate flops
    model = add_flops_counting_methods(model)
    model.eval()
    model.start_flops_count()
    random_data = torch.randn(1, INPUT_CHANNELS,
                              config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH'])
    model(torch.autograd.Variable(random_data).to(device))
    n_flops = np.round(model.compute_average_flops_cost() / 1e6, 4)
    logging.info('flops = %f', n_flops)

    # save to file
    # os.remove(os.path.join(save_pth, 'log.txt'))
    with open(os.path.join(save_pth, 'log.txt'), "w") as file:
        file.write("Genome = {}\n".format(genome))
        file.write("Architecture = {}\n".format(genotype))
        file.write("param size = {}MB\n".format(n_params))
        file.write("flops = {}MB\n".format(n_flops))
        file.write("valid_acc = {}\n".format(valid_acc))

    # logging.info("Architecture = %s", genotype)
    return {
        'valid_acc': valid_acc,
        'params': n_params,
        'flops': n_flops,
    }

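# Hedged usage sketch (not part of the project code): how this entry point might
# be called for the 'micro' search space. The bit string is sampled the same way
# as in debug() above, and micro_encoding.convert is assumed to accept it; the
# epoch count, save name, and batch size are illustrative only.
def _example_micro_run():
    bit_string = []
    for _ in range(2):                      # two cells (normal and reduction)
        for b in range(5):                  # five blocks per cell
            bit_string += [np.random.randint(7), np.random.randint(b + 2),
                           np.random.randint(7), np.random.randint(b + 2)]
    genome = micro_encoding.convert(bit_string)
    stats = main(macro_genome=None, micro_genome=genome, epochs=1,
                 search_space='micro', save='Design_example', batch_size=64)
    print(stats)  # {'valid_acc': ..., 'params': ..., 'flops': ...}
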
def inherit_one_model(individual, expr_root: str, model=None, args=None) -> nn.Module:
    """
    Very complicated function. Handles inheritance of the common components not
    defined in the genome of the individual. Also calls the function that
    inherits weights for the cells; the cell weights are defined by the
    individual.

    TODO: Maybe document this better
    TODO: Improve Error Logging

    Args:
        individual: the individual inheriting weights
        expr_root: path as defined in args.save
        model: not used except in testing
        args: not used except in testing

    Returns:
        model
    """
    try:
        r = np.random.uniform(-0.5, 1.5)
        parent1 = projectcode.weightmanagement.common.read_parent_by_id(
            individual.parents[0], expr_root, args)
        parent2 = projectcode.weightmanagement.common.read_parent_by_id(
            individual.parents[1], expr_root, args)
        parents = common.determine_more_fit_parent(parent1, parent2)

        genotype = micro_encoding.decode(micro_encoding.convert(individual.X))
        if model is None:
            CIFAR_CLASSES = 10
            auxiliary = False
            model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                            auxiliary, genotype)
        model = common.initialize_zero(model)

        wcom = WeightComputer(parents)

        # weight merge for the stem, which every architecture shares
        model.stem[0].weight = wcom.compute_child_weight(
            "stem.0.weight", r, inherit_rules="both",
            weight_tensor=model.stem[0].weight)

        previous_reduction = False
        pp_reduction = False
        for cell_number, cell in enumerate(model.cells):
            try:
                reduction = cell_number in [
                    len(model.cells) // 3,
                    2 * len(model.cells) // 3,
                ]
                # print(reduction, previous_reduction)
                model = inherit_one_cell(
                    cell_number,
                    individual,
                    model,
                    parents,
                    reduce=reduction,
                    previous_reduce=previous_reduction,
                    weight_computer=wcom,
                    pp_reduce=pp_reduction,
                    r=r,
                )
                pp_reduction = previous_reduction
                previous_reduction = reduction
            except:
                logger.warning("error in cell %i" % cell_number)
                raise

        # decide which parent(s) the classifier weights come from, based on
        # whether the concat nodes match between parents and child
        inherit = None
        child_genome = common.decode_individual(individual)
        if parents[0]["genome"].normal_concat == parents[1]["genome"].normal_concat:
            inherit = "both"
        elif parents[0]["genome"].normal_concat == child_genome.normal_concat:
            inherit = "first"
        elif parents[1]["genome"].normal_concat == child_genome.normal_concat:
            inherit = "second"
        else:
            inherit = "concat_mismatch"
        assert inherit is not None, "could not determine classifier inheritance"

        key = "classifier.weight"
        model.classifier.weight = wcom.compute_child_weight(
            key, r, inherit, model.classifier.weight)
        key = "classifier.bias"
        model.classifier.bias = wcom.compute_child_weight(
            key, r, inherit, model.classifier.bias)

        common.assert_non_null_weights(model.state_dict())
    except:
        logger.warning(
            projectcode.weightmanagement.common.decode_individual(individual))
        logger.warning(parents[0]["genome"])
        logger.warning(parents[1]["genome"])
        logger.warning(individual.parents)
        raise

    return model

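# Hedged sketch (WeightComputer's internals are not shown in this file): with
# r ~ U(-0.5, 1.5) sampled above, a blend of the form below would interpolate
# between the two parents for r in [0, 1] and extrapolate beyond either parent
# otherwise. This is an assumption about how compute_child_weight might use r,
# not its actual implementation.
def _example_blend_weights(w_fitter, w_other, r):
    """Return a child tensor on the line through the two parent weight tensors."""
    return w_other + r * (w_fitter - w_other)
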
def main(genome, epochs, search_space='micro', save='Design_1',
         expr_root='search', seed=0, gpu=0, init_channels=24, layers=11,
         auxiliary=False, cutout=False, drop_path_prob=0.0,
         train_dataset="", val_dataset=""):
    # ---- train logger ----------------- #
    save_pth = os.path.join(expr_root, '{}'.format(save))
    utils.create_exp_dir(save_pth)
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
    # fh = logging.FileHandler(os.path.join(save_pth, 'log.txt'))
    # fh.setFormatter(logging.Formatter(log_format))
    # logging.getLogger().addHandler(fh)

    # ---- parameter values setting ----- #
    NUM_CLASSES = 4
    CIFAR_CLASSES = NUM_CLASSES
    DATA_SHAPE = (128, 128)
    INPUT_CHANNELS = 3
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = '../data'
    batch_size = 16
    cutout_length = 16
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        'auxiliary': auxiliary,
        'auxiliary_weight': auxiliary_weight,
        'grad_clip': grad_clip,
        'report_freq': report_freq,
    }

    if search_space == 'micro':
        genotype = micro_encoding.decode(genome)
        model = Network(init_channels, CIFAR_CLASSES, layers, auxiliary, genotype)
    elif search_space == 'macro':
        genotype = macro_encoding.decode(genome)
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(genotype, channels, CIFAR_CLASSES, DATA_SHAPE,
                           decoder='residual')
    else:
        raise NameError('Unknown search space type')

    # logging.info("Genome = %s", genome)
    logging.info("Architecture = %s", genotype)

    torch.cuda.set_device(gpu)
    cudnn.benchmark = True
    torch.manual_seed(seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(seed)

    n_params = (np.sum(
        np.prod(v.size()) for v in
        filter(lambda p: p.requires_grad, model.parameters())) / 1e6)
    model = model.to(device)

    logging.info("param size = %fMB", n_params)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.SGD(parameters,
                                learning_rate,
                                momentum=momentum,
                                weight_decay=weight_decay)

    # per-channel normalization statistics for the custom dataset
    # (CIFAR-10 reference values: mean [0.49139968, 0.48215827, 0.44653124],
    #  std [0.24703233, 0.24348505, 0.26158768])
    DATASET_MEAN = [0.4785047, 0.45649716, 0.42604172]
    DATASET_STD = [0.31962952, 0.3112294, 0.31206125]
    CIFAR_MEAN = DATASET_MEAN
    CIFAR_STD = DATASET_STD

    # # data augmentation
    # train_transform = transforms.Compose([
    #     transforms.RandomCrop(32, padding=4),
    #     transforms.RandomHorizontalFlip(),
    #     transforms.ToTensor()
    # ])
    # if cutout:
    #     train_transform.transforms.append(utils.Cutout(cutout_length))
    # train_transform.transforms.append(transforms.Normalize(CIFAR_MEAN, CIFAR_STD))
    # valid_transform = transforms.Compose([
    #     transforms.ToTensor(),
    #     transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    # ])
    # train_data = my_cifar10.CIFAR10(root=data_root, train=True, download=True, transform=train_transform)
    # valid_data = my_cifar10.CIFAR10(root=data_root, train=False, download=True, transform=valid_transform)
    # # num_train = len(train_data)
    # # indices = list(range(num_train))
    # # split = int(np.floor(train_portion * num_train))

    train_data = train_dataset
    valid_data = val_dataset

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=4)

    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True, num_workers=4)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, int(epochs))

    for epoch in range(epochs):
        scheduler.step()
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        model.droprate = drop_path_prob * epoch / epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer, train_params)
        logging.info('train_acc %f', train_acc)

        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc %f', valid_acc)

    # calculate flops
    model = add_flops_counting_methods(model)
    model.eval()
    model.start_flops_count()
    random_data = torch.randn(1, INPUT_CHANNELS, *DATA_SHAPE)
    model(torch.autograd.Variable(random_data).to(device))
    n_flops = np.round(model.compute_average_flops_cost() / 1e6, 4)
    logging.info('flops = %f', n_flops)

    # save to file
    # os.remove(os.path.join(save_pth, 'log.txt'))
    with open(os.path.join(save_pth, 'log.txt'), "w") as file:
        file.write("Genome = {}\n".format(genome))
        file.write("Architecture = {}\n".format(genotype))
        file.write("param size = {}MB\n".format(n_params))
        file.write("flops = {}MB\n".format(n_flops))
        file.write("valid_acc = {}\n".format(valid_acc))

    # logging.info("Architecture = %s", genotype)
    return {
        'valid_acc': valid_acc,
        'params': n_params,
        'flops': n_flops,
    }

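# Hedged sketch: the DATASET_MEAN / DATASET_STD constants above could be
# reproduced by a pass like this over the (ToTensor-transformed) training set.
# The helper name and batch size are illustrative, not part of the project code,
# and three input channels are assumed.
def _example_channel_stats(dataset, batch_size=64):
    """Compute per-channel mean and std over a dataset of (C, H, W) image tensors."""
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    n_pixels = 0
    channel_sum = torch.zeros(3)
    channel_sq_sum = torch.zeros(3)
    for images, _ in loader:
        n_pixels += images.numel() / images.size(1)      # pixels per channel
        channel_sum += images.sum(dim=[0, 2, 3])
        channel_sq_sum += (images ** 2).sum(dim=[0, 2, 3])
    mean = channel_sum / n_pixels
    std = (channel_sq_sum / n_pixels - mean ** 2).sqrt()
    return mean.tolist(), std.tolist()
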
def train_and_evaluate(
        genome: tuple,
        individual=None,
        args: argparse.Namespace = None,
        first_gen: bool = True,
        save: str = None,
        client_id: str = None,
):
    """
    Train and evaluate an individual, optionally on a TPU.

    Results are always saved in the save dir to make distributed data
    management easier.

    Args:
        genome: encoded architecture to train
        individual: the individual owning the genome (must have an ``id``)
        args: run configuration
        first_gen: whether this is the first generation (no parents yet)
        save: directory name for results
        client_id: identifier of the worker uploading weights

    Returns:
        dict with the individual's id, save path, validation accuracy,
        parameter count, flops, and the uploaded weight blob name (if any)
    """
    if args.stream == "tpu":
        # must warm up the TPU runtime
        import torch_xla

    auxiliary = False
    assert hasattr(individual, "id")
    if not first_gen:
        # this is not the first generation, so mating should have occurred
        assert hasattr(individual, "parents")

    expr_root = ""

    save_pth = os.path.join(expr_root, "{}".format(save))
    utils.create_exp_dir(save_pth)

    CIFAR_CLASSES = 10
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = "../data"
    batch_size = args.batch_size
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        "auxiliary": auxiliary,
        "auxiliary_weight": auxiliary_weight,
        "grad_clip": grad_clip,
        "report_freq": report_freq,
    }

    if args.search_space == "micro":
        genotype = micro_encoding.decode(genome)
        model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                        auxiliary, genotype)
        if not first_gen:
            # change the way the weights are set up
            model = manage_weights(model, individual, expr_root, args)
    elif args.search_space == "macro":
        raise NotImplementedError("Not supported")
    else:
        raise NameError("Unknown search space type")

    logger.info("Architecture = %s", genotype)

    try:
        max_weight = args.max_weight
    except AttributeError:
        print("Could Not Determine Maximum Weight Argument")
        max_weight = 1e20
    clip = weightClip(max_weight=max_weight, min_weight=max_weight * -1)

    if args.stream == "tpu":
        from projectcode.training.tpu import get_map_fn
        import torch_xla.distributed.xla_multiprocessing as xmp

        WRAPPED_MODEL = xmp.MpModelWrapper(model)
        logger.info("Executing TPU Training")
        map_fn = get_map_fn(model, train_params, data_root, momentum,
                            weight_decay, CIFAR_CLASSES, learning_rate,
                            args.layers, batch_size, epochs=args.epochs,
                            save_pth=save_pth, args=args,
                            WRAPPED_MODEL=WRAPPED_MODEL, clip=clip)
        FLAGS = {}
        xmp.spawn(map_fn, args=(FLAGS,), nprocs=1, start_method="fork")
        valid_acc, n_flops = torch.load("results.pt")
    elif args.stream == "gpu":
        from projectcode.training.gpu import train_gpu

        logger.info("Executing GPU Training")
        valid_acc, n_flops = train_gpu(model, train_params, data_root, momentum,
                                       weight_decay, CIFAR_CLASSES, learning_rate,
                                       args.layers, batch_size, epochs=args.epochs,
                                       save_pth=save_pth, args=args, clip=clip)
    else:
        raise NameError("Unrecognized client stream")

    n_params = (np.sum(
        np.prod(v.size()) for v in
        filter(lambda p: p.requires_grad, model.parameters())) / 1e6)

    if main_config.distributed_cloud and args.weight_init == "lammarckian":
        wt_path = f"{args.code}_{client_id}_weights_{individual.id:05d}.pt"
        torch.save(model.state_dict(), wt_path)
        blob_name = upload_blob(wt_path)
    else:
        blob_name = None
        torch.save(model.state_dict(), os.path.join(save_pth, "weights.pt"))

    result_dict = {
        "id": individual.id,
        "save_path": save_pth,
        "valid_acc": valid_acc,
        "params": n_params,
        "flops": n_flops,
        "wt_blob_name": blob_name,
    }
    dump(result_dict, os.path.join(save_pth, "result.pkl"))

    return result_dict

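# Hedged usage sketch: the args object only needs the attributes this function
# actually reads (stream, batch_size, search_space, init_channels, layers,
# epochs, max_weight, weight_init, code). The values below are illustrative, and
# the SimpleNamespace individual is a hypothetical stand-in for the real
# Individual object.
def _example_train_and_evaluate(genome):
    from types import SimpleNamespace

    args = argparse.Namespace(stream="gpu", batch_size=96, search_space="micro",
                              init_channels=24, layers=11, epochs=1,
                              max_weight=1e20, weight_init="random", code="demo")
    individual = SimpleNamespace(id=1)      # first generation: no parents required
    return train_and_evaluate(genome, individual=individual, args=args,
                              first_gen=True, save="Design_1", client_id="worker_0")
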