def __init__(self):
    super(Helper, self).__init__()
    self.args._save = copy(self.args.save)
    self.args.save = '{}/{}/{}/{}_{}-{}'.format(
        self.args.save, self.args.space, self.args.dataset,
        self.args.drop_path_prob, self.args.weight_decay, self.args.job_id)
    utils.create_exp_dir(self.args.save)

    config_filename = os.path.join(self.args._save, 'config.yaml')
    if not os.path.exists(config_filename):
        with open(config_filename, 'w') as f:
            yaml.dump(self.args_to_log, f, default_flow_style=False)

    if self.args.dataset != 'cifar100':
        self.args.n_classes = 10
    else:
        self.args.n_classes = 100

    # Disable cutout when drop_path_prob is 0.
    if self.args.drop_path_prob == 0:
        self.args.cutout = False
def run_bohb(exp_name, log_dir='EXP', iterations=20):
    run_dir = 'bohb-{}-{}'.format(log_dir, exp_name)
    if not os.path.exists(run_dir):
        utils.create_exp_dir(run_dir, scripts_to_save=glob.glob('*.py'))

    result_logger = hpres.json_result_logger(directory=run_dir, overwrite=True)

    # Start a nameserver
    NS = hpns.NameServer(run_id=exp_name, host='127.0.0.1', port=0)
    ns_host, ns_port = NS.start()

    # Start a local worker
    worker = TorchWorker(run_id=exp_name,
                         host='127.0.0.1',
                         nameserver=ns_host,
                         nameserver_port=ns_port,
                         timeout=120,
                         run_dir=run_dir)
    worker.run(background=True)

    # Initialise the optimiser
    bohb = BOHB(configspace=worker.get_configspace(),
                run_id=exp_name,
                host='127.0.0.1',
                nameserver=ns_host,
                nameserver_port=ns_port,
                result_logger=result_logger,
                min_budget=2,
                max_budget=5)
    print('Worker running')
    res = bohb.run(n_iterations=iterations)

    # Store the results
    with open(os.path.join(run_dir, 'result.pkl'), 'wb') as f:
        pickle.dump(res, f)

    # Shutdown
    bohb.shutdown(shutdown_workers=True)
    NS.shutdown()

    # Get all runs
    all_runs = res.get_all_runs()
    # Get id-to-configuration mapping as a dictionary
    id2conf = res.get_id2config_mapping()
    # Get the best (incumbent) run
    best_run = res.get_incumbent_id()
    best_config = id2conf[best_run]['config']
    print(f"Best run id: {best_run},\nConfig: {best_config}")

    # Store a summary of all run info
    with open(os.path.join(run_dir, 'summary.txt'), 'w') as f:
        f.write(f"{all_runs}")
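# A minimal driver sketch (not in the original source): how run_bohb might be
# invoked; the experiment name and iteration count below are placeholders.
if __name__ == '__main__':
    run_bohb(exp_name='darts_cifar10', log_dir='EXP', iterations=20)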
def __init__(self, config_file):
    self.args = utils.config_parser(config_file)
    utils.print_args(self.args)
    self.args._save = copy(self.args.save)
    self.args.save = '{}/{}'.format(self.args.save, self.args.dataset)
    utils.create_exp_dir(self.args.save)

    if self.args.dataset != 'cifar100':
        self.args.n_classes = 10
    else:
        self.args.n_classes = 100
def __init__(self):
    super(Helper, self).__init__()
    self.args._save = copy(self.args.save)
    self.args.save = '{}/{}/{}/{}_{}-{}'.format(
        self.args.save, self.args.space, self.args.dataset,
        self.args.search_dp, self.args.search_wd, self.args.job_id)
    utils.create_exp_dir(self.args.save)

    config_filename = os.path.join(self.args._save, 'config.yaml')
    if not os.path.exists(config_filename):
        with open(config_filename, 'w') as f:
            yaml.dump(self.args_to_log, f, default_flow_style=False)

    if self.args.dataset != 'cifar100':
        self.args.n_classes = 10
    else:
        self.args.n_classes = 100
def main(genome, epochs, search_space='micro', save='Design_1',
         expr_root='search', seed=0, gpu=0, init_channels=24, layers=11,
         auxiliary=False, cutout=False, drop_path_prob=0.0,
         data_path="../data", dataset="CIFAR10"):
    # ---- train logger ----------------- #
    save_pth = os.path.join(expr_root, '{}'.format(save))
    utils.create_exp_dir(save_pth)
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')

    # ---- parameter values setting ----- #
    if dataset == "CIFAR10":
        CLASSES = 10
    elif dataset == "CIFAR100":
        CLASSES = 100
    elif dataset == "Sport8":
        CLASSES = 8
    elif dataset == "MIT67":
        CLASSES = 67
    elif dataset == "flowers102":
        CLASSES = 102
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = data_path
    batch_size = 128
    cutout_length = 16
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        'auxiliary': auxiliary,
        'auxiliary_weight': auxiliary_weight,
        'grad_clip': grad_clip,
        'report_freq': report_freq,
    }

    if search_space == 'micro':
        genotype = micro_encoding.decode(genome)
        if dataset in ("CIFAR10", "CIFAR100"):
            model = NetworkCIFAR(init_channels, CLASSES, layers, auxiliary, genotype)
        else:
            model = NetworkImageNet(init_channels, CLASSES, layers, auxiliary, genotype)
    elif search_space == 'macro':
        genotype = macro_encoding.decode(genome)
        channels = [(3, init_channels),
                    (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(genotype, channels, CLASSES, (32, 32), decoder='residual')
    else:
        raise NameError('Unknown search space type')

    logging.info("Architecture = %s", genotype)

    torch.cuda.set_device(gpu)
    cudnn.benchmark = True
    torch.manual_seed(seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(seed)

    n_params = (np.sum(
        np.prod(v.size())
        for v in filter(lambda p: p.requires_grad, model.parameters())) / 1e6)
    model = model.to(device)
    logging.info("param size = %fMB", n_params)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.SGD(parameters,
                                learning_rate,
                                momentum=momentum,
                                weight_decay=weight_decay)

    if dataset in ("CIFAR10", "CIFAR100"):
        MEAN = [0.49139968, 0.48215827, 0.44653124]
        STD = [0.24703233, 0.24348505, 0.26158768]
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor()
        ])
        if cutout:
            train_transform.transforms.append(utils.Cutout(cutout_length))
        train_transform.transforms.append(transforms.Normalize(MEAN, STD))
        valid_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(MEAN, STD),
        ])
        if dataset == "CIFAR10":
            # Both loaders read the training split; disjoint halves are
            # selected below with SubsetRandomSampler.
            train_data = my_cifar10.CIFAR10(root=data_root, train=True,
                                            download=True, transform=train_transform)
            valid_data = my_cifar10.CIFAR10(root=data_root, train=True,
                                            download=True, transform=valid_transform)
        elif dataset == "CIFAR100":
            train_data = dset.CIFAR100(root=data_root, train=True,
                                       download=True, transform=train_transform)
            valid_data = dset.CIFAR100(root=data_root, train=True,
                                       download=True, transform=valid_transform)
    else:
        MEAN = [0.485, 0.456, 0.406]
        STD = [0.229, 0.224, 0.225]
        transf_train = [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.4, contrast=0.4,
                                   saturation=0.4, hue=0.2)
        ]
        transf_val = [
            transforms.Resize(256),
            transforms.CenterCrop(224),
        ]
        normalize = [transforms.ToTensor(), transforms.Normalize(MEAN, STD)]
        train_transform = transforms.Compose(transf_train + normalize)
        valid_transform = transforms.Compose(transf_val + normalize)
        if cutout:
            train_transform.transforms.append(utils.Cutout(cutout_length))
        train_data = dset.ImageFolder(root=data_path + "/" + dataset + "/train",
                                      transform=train_transform)
        valid_data = dset.ImageFolder(root=data_path + "/" + dataset + "/test",
                                      transform=valid_transform)

    # Split the training indices into two disjoint halves.
    n_train = len(train_data)
    split = n_train // 2
    indices = list(range(n_train))
    random.shuffle(indices)
    train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split])
    valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:])

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size,
        sampler=train_sampler,
        pin_memory=True, num_workers=4)

    valid_queue = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size,
        sampler=valid_sampler,
        pin_memory=True, num_workers=4)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, int(epochs))

    for epoch in range(epochs):
        # Note: stepping the scheduler before training follows the pre-1.1
        # PyTorch convention this codebase uses.
        scheduler.step()
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        model.droprate = drop_path_prob * epoch / epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer,
                                     train_params)
        logging.info('train_acc %f', train_acc)

    valid_acc, valid_obj = infer(valid_queue, model, criterion)
    logging.info('valid_acc %f', valid_acc)

    # Count FLOPs with a single forward pass on a dummy input.
    model = add_flops_counting_methods(model)
    model.eval()
    model.start_flops_count()
    random_data = torch.randn(1, 3, 32, 32)  # TODO: adapt for non-32x32 datasets
    model(torch.autograd.Variable(random_data).to(device))
    n_flops = np.round(model.compute_average_flops_cost() / 1e6, 4)
    logging.info('flops = %f', n_flops)

    # Save the results to file.
    with open(os.path.join(save_pth, 'log.txt'), "w") as f:
        f.write("Genome = {}\n".format(genome))
        f.write("Architecture = {}\n".format(genotype))
        f.write("param size = {}MB\n".format(n_params))
        f.write("flops = {}MB\n".format(n_flops))
        f.write("valid_acc = {}\n".format(valid_acc))

    with open(os.path.join(save_pth, 'genotype.txt'), "w") as f:
        f.write(str(genotype))

    return {
        'valid_acc': valid_acc,
        'params': n_params,
        'flops': n_flops,
    }
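# Hypothetical driver sketch (names and values assumed, not from the source):
# 'genome' would come from the evolutionary search loop in micro_encoding
# format, and main() returns the objectives used for selection.
# result = main(genome, epochs=25, search_space='micro', save='Design_1',
#               gpu=0, data_path='../data', dataset='CIFAR10')
# print(result['valid_acc'], result['params'], result['flops'])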
def main(genome, epochs, search_space='micro', save='Design_1',
         expr_root='search', seed=0, gpu=0, init_channels=24, layers=11,
         auxiliary=False, cutout=False, drop_path_prob=0.0,
         train_dataset="", val_dataset=""):
    # ---- train logger ----------------- #
    save_pth = os.path.join(expr_root, '{}'.format(save))
    utils.create_exp_dir(save_pth)
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')

    # ---- parameter values setting ----- #
    NUM_CLASSES = 4
    CIFAR_CLASSES = NUM_CLASSES
    DATA_SHAPE = (128, 128)
    INPUT_CHANNELS = 3
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = '../data'
    batch_size = 16
    cutout_length = 16
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        'auxiliary': auxiliary,
        'auxiliary_weight': auxiliary_weight,
        'grad_clip': grad_clip,
        'report_freq': report_freq,
    }

    if search_space == 'micro':
        genotype = micro_encoding.decode(genome)
        model = Network(init_channels, CIFAR_CLASSES, layers, auxiliary, genotype)
    elif search_space == 'macro':
        genotype = macro_encoding.decode(genome)
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(genotype, channels, CIFAR_CLASSES, DATA_SHAPE,
                           decoder='residual')
    else:
        raise NameError('Unknown search space type')

    logging.info("Architecture = %s", genotype)

    torch.cuda.set_device(gpu)
    cudnn.benchmark = True
    torch.manual_seed(seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(seed)

    n_params = (np.sum(
        np.prod(v.size())
        for v in filter(lambda p: p.requires_grad, model.parameters())) / 1e6)
    model = model.to(device)
    logging.info("param size = %fMB", n_params)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.SGD(parameters,
                                learning_rate,
                                momentum=momentum,
                                weight_decay=weight_decay)

    # Dataset statistics for this task (replacing the CIFAR-10 defaults).
    # The CIFAR-style augmentation pipeline from the original script is not
    # used here because the datasets are passed in ready-made.
    DATASET_MEAN = [0.4785047, 0.45649716, 0.42604172]
    DATASET_STD = [0.31962952, 0.3112294, 0.31206125]
    CIFAR_MEAN = DATASET_MEAN
    CIFAR_STD = DATASET_STD

    train_data = train_dataset
    valid_data = val_dataset

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size,
        pin_memory=True, num_workers=4)

    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=batch_size,
        pin_memory=True, num_workers=4)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, int(epochs))

    for epoch in range(epochs):
        scheduler.step()
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        model.droprate = drop_path_prob * epoch / epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer,
                                     train_params)
        logging.info('train_acc %f', train_acc)

    valid_acc, valid_obj = infer(valid_queue, model, criterion)
    logging.info('valid_acc %f', valid_acc)

    # Count FLOPs with a single forward pass on a dummy input.
    model = add_flops_counting_methods(model)
    model.eval()
    model.start_flops_count()
    random_data = torch.randn(1, INPUT_CHANNELS, *DATA_SHAPE)
    model(torch.autograd.Variable(random_data).to(device))
    n_flops = np.round(model.compute_average_flops_cost() / 1e6, 4)
    logging.info('flops = %f', n_flops)

    # Save the results to file.
    with open(os.path.join(save_pth, 'log.txt'), "w") as f:
        f.write("Genome = {}\n".format(genome))
        f.write("Architecture = {}\n".format(genotype))
        f.write("param size = {}MB\n".format(n_params))
        f.write("flops = {}MB\n".format(n_flops))
        f.write("valid_acc = {}\n".format(valid_acc))

    return {
        'valid_acc': valid_acc,
        'params': n_params,
        'flops': n_flops,
    }
logging.info("[{} Generation] {}/{} finished with validation loss: {}, prec1: {}, prec5: {}".format(
    gen, i + 1, len(population.get_population()),
    population.get_population()[i].objs.avg,
    population.get_population()[i].top1.avg,
    population.get_population()[i].top5.avg))

DIR = "search-{}-{}".format(time.strftime("%Y%m%d-%H%M%S"), args.dataset)
if args.dir is not None:
    if not os.path.exists(args.dir):
        utils.create_exp_dir(args.dir)
    DIR = os.path.join(args.dir, DIR)
else:
    DIR = os.path.join(os.getcwd(), DIR)
utils.create_exp_dir(DIR)
utils.create_exp_dir(os.path.join(DIR, "weights"))

log_format = '%(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                    format=log_format, datefmt='%m/%d %I:%M:%S %p')
fh = logging.FileHandler(os.path.join(DIR, 'log.txt'))
fh.setFormatter(logging.Formatter(log_format))
logging.getLogger().addHandler(fh)

# Initialize the summary writer
writer = SummaryWriter(os.path.join(DIR, 'runs'))

torch.manual_seed(args.seed)
def main():
    utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))
    print(args)

    seed = random.randint(1, 100000000)
    print(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    cudnn.enabled = True

    n_channels = 3
    n_bins = 2. ** args.n_bits

    # Define model and loss criteria
    model = SearchNetwork(n_channels, args.n_flow, args.n_block, n_bins,
                          affine=args.affine, conv_lu=not args.no_lu)
    model = nn.DataParallel(model, [args.gpu])
    model.load_state_dict(
        torch.load("architecture.pt", map_location="cuda:{}".format(args.gpu)))
    model = model.module

    genotype = model.sample_architecture()
    with open(args.save + '/genotype.pkl', 'wb') as fp:
        pickle.dump(genotype, fp)

    model_single = EnsembleNetwork(n_channels, args.n_flow, args.n_block, n_bins,
                                   genotype, affine=args.affine,
                                   conv_lu=not args.no_lu)
    model = model_single
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), args.learning_rate)
    dataset = iter(sample_cifar10(args.batch, args.img_size))

    # Sample latents for generated images
    z_sample = []
    z_shapes = calc_z_shapes(n_channels, args.img_size, args.n_flow, args.n_block)
    for z in z_shapes:
        z_new = torch.randn(args.n_sample, *z) * args.temp
        z_sample.append(z_new.to(device))

    with tqdm(range(args.iter)) as pbar:
        for i in pbar:
            # Training procedure
            model.train()

            # Get a random minibatch from the search queue with replacement
            input, _ = next(dataset)
            input = Variable(input, requires_grad=False).cuda(non_blocking=True)

            # Add uniform dequantization noise before computing the likelihood
            log_p, logdet, _ = model(input + torch.rand_like(input) / n_bins)
            logdet = logdet.mean()
            loss, _, _ = likelihood_loss(log_p, logdet, args.img_size, n_bins)

            # Optimize model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            pbar.set_description("Loss: {}".format(loss.item()))

            # Save generated samples
            if i % 100 == 0:
                with torch.no_grad():
                    tvutils.save_image(
                        model_single.reverse(z_sample).cpu().data,
                        "{}/samples/{}.png".format(args.save, str(i + 1).zfill(6)),
                        normalize=False,
                        nrow=10,
                    )

            # Save checkpoint
            if i % 1000 == 0:
                utils.save(model, os.path.join(args.save, 'latest_weights.pt'))
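# Sanity-check sketch (assumed, not in the source): the pickled genotype can
# be restored later with the matching pickle.load call.
# with open(args.save + '/genotype.pkl', 'rb') as fp:
#     genotype = pickle.load(fp)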
def main():
    if args.load_path:
        args.save = Path(args.load_path) / 'eval-{}-{}'.format(
            args.save, time.strftime("%Y%m%d-%H%M%S"))
    else:
        args.save = Path('logs') / 'eval-{}-{}'.format(
            args.save, time.strftime("%Y%m%d-%H%M%S"))
    utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))

    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(args.save / 'log.txt')
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info("args = %s", args)

    model = eval(args.model)
    if args.gpu:
        model = model.cuda()
    if args.load_path:
        utils.load(model, os.path.join(args.load_path, 'weights.pt'))
        print("loaded")
    # Keep a handle to the unwrapped model before DataParallel.
    direct_model = model
    if args.gpu:
        model = torch.nn.DataParallel(model)

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    train_data = dset.CIFAR10(root=args.data, train=True, download=True,
                              transform=train_transform)
    valid_data = dset.CIFAR10(root=args.data, train=False, download=True,
                              transform=valid_transform)

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size, shuffle=True,
        pin_memory=True, num_workers=args.num_workers)
    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=args.batch_size, shuffle=False,
        pin_memory=True, num_workers=args.num_workers)

    if args.eval:
        direct_model.drop_path_prob = 0
        valid_acc, valid_obj = infer(valid_queue, model, args.gpu)
        logging.info('valid_acc %f', valid_acc)
        return

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs))

    for epoch in range(args.start_epoch, args.epochs):
        scheduler.step(epoch)
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        direct_model.drop_path_prob = args.drop_path_prob * epoch / args.epochs

        train_acc, train_obj = train(train_queue, model, optimizer, args.gpu)
        logging.info('train_acc %f', train_acc)

        valid_acc, valid_obj = infer(valid_queue, model, args.gpu)
        logging.info('valid_acc %f', valid_acc)

        if epoch >= args.epochs - 50 or epoch % args.save_frequency == 0:
            utils.save(model.module,
                       os.path.join(args.save, f'weights_{epoch}.pt'))
parser.add_argument('--search_space', choices=['1', '2', '3'], default='1')
parser.add_argument('--warm_start_epochs', type=int, default=0,
                    help='Warm start one-shot model before starting architecture updates.')
parser.add_argument('--s3_bucket', type=str, default='megadarts',
                    help='s3 bucket for saving to remote')
args = parser.parse_args()

args.save = 'experiments/pc_darts/search_space_{}/search-{}-{}-{}-{}-{}'.format(
    args.search_space, args.save, time.strftime("%Y%m%d-%H%M%S"),
    args.seed, args.learning_rate, args.search_space)
utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))

# Dump the config of the run
with open(os.path.join(args.save, 'config.json'), 'w') as fp:
    json.dump(args.__dict__, fp)

# Remove any pre-existing root handlers before configuring logging.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
log_format = '%(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                    format=log_format, datefmt='%m/%d %I:%M:%S %p')
fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
fh.setFormatter(logging.Formatter(log_format))
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    parser = argparse.ArgumentParser("imagenet")
    parser.add_argument('--data', type=Path, default=DATA_DIRECTORY / 'imagenet',
                        help='location of the data corpus')
    parser.add_argument('--batchsz', type=int, default=128, help='batch size')
    parser.add_argument('--lr', type=float, default=0.1, help='init learning rate')
    parser.add_argument('--momentum', type=float, default=0.9, help='momentum')
    parser.add_argument('--wd', type=float, default=3e-5, help='weight decay')
    parser.add_argument('--report_freq', type=float, default=100, help='report frequency')
    parser.add_argument('--gpu', type=str, help='gpu device id')
    parser.add_argument('--epochs', type=int, default=250, help='num of training epochs')
    parser.add_argument('--init_ch', type=int, default=48, help='num of init channels')
    parser.add_argument('--layers', type=int, default=14, help='total number of layers')
    parser.add_argument('--checkpoint_path', type=Path, help='path to checkpoint for restart')
    parser.add_argument('--auxiliary', action='store_true', default=False, help='use auxiliary tower')
    parser.add_argument('--auxiliary_weight', type=float, default=0.4, help='weight for auxiliary loss')
    parser.add_argument('--drop_path_prob', type=float, default=0, help='drop path probability')
    parser.add_argument('--exp_path', type=Path, default=Path('exp_imagenet'), help='experiment name')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument('--arch', type=str, default='', help='which architecture to use')
    parser.add_argument('--arch_path', type=str, default='', help='which architecture of json to use')
    parser.add_argument('--grad_clip', type=float, default=5., help='gradient clipping')
    parser.add_argument('--label_smooth', type=float, default=0.1, help='label smoothing')
    parser.add_argument('--gamma', type=float, default=0.97, help='learning rate decay')
    parser.add_argument('--decay_period', type=int, default=1,
                        help='epochs between two learning rate decays')
    args = parser.parse_args()

    my_dataset = MyDataset.ImageNet
    args.save = args.exp_path / f'ImageNet-{time.strftime("%Y%m%d-%H%M%S")}'
    utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))

    genotype = (eval(f'genotypes.{args.arch}') if not args.arch_path
                else utils.load_genotype(args.arch_path))

    trainer = Trainer(args, genotype, my_dataset)
    _, _, _ = trainer.train()

    args.seed = 0
    test_model = trainer.model.module if isinstance(
        trainer.model, DataParallel) else trainer.model
    tester = Tester(test_args=args, my_dataset=my_dataset, model=test_model)
    valid_acc_top1, valid_acc_top5, valid_obj = tester.infer()
    logging.info('valid_acc_top1 %f', valid_acc_top1)
    logging.info('valid_acc_top5 %f', valid_acc_top5)
    logging.info('valid_err_top1 %f', 100 - valid_acc_top1)
    logging.info('valid_err_top5 %f', 100 - valid_acc_top5)
def initialize_run(self):
    """Set up directories, logging, seeds, and the CIFAR-10 data queues.

    TODO: this is the same as the NAO version.
    """
    args = self.args
    utils = project_utils
    if not self.args.continue_train:
        self.sub_directory_path = 'WeightSharingNasBenchNetRandom-{}_SEED_{}'.format(
            self.args.save, self.args.seed)
        self.exp_dir = os.path.join(self.args.main_path, self.sub_directory_path)
        utils.create_exp_dir(self.exp_dir)

    if self.args.visualize:
        self.viz_dir_path = utils.create_viz_dir(self.exp_dir)

    if self.args.tensorboard:
        self.tb_dir = self.exp_dir
        tboard_dir = os.path.join(self.args.tboard_dir, self.sub_directory_path)
        self.writer = SummaryWriter(tboard_dir)

    if self.args.debug:
        torch.autograd.set_detect_anomaly(True)

    self.nasbench = self.search_space.nasbench

    # Set logger.
    self.logger = utils.get_logger(
        "train_search",
        file_handler=utils.get_file_handler(os.path.join(self.exp_dir, 'log.txt')),
        level=logging.INFO if not args.debug else logging.DEBUG)

    logging.info(f"setting random seed as {args.seed}")
    utils.torch_random_seed(args.seed)
    logging.info('gpu number = %d' % args.gpus)
    logging.info("args = %s", args)

    criterion = nn.CrossEntropyLoss().cuda()
    eval_criterion = nn.CrossEntropyLoss().cuda()
    self.eval_loss = eval_criterion

    train_transform, valid_transform = utils._data_transforms_cifar10(
        args.cutout_length if args.cutout else None)
    train_data = dset.CIFAR10(root=args.data, train=True, download=True,
                              transform=train_transform)
    valid_data = dset.CIFAR10(root=args.data, train=True, download=True,
                              transform=valid_transform)
    test_data = dset.CIFAR10(root=args.data, train=False, download=True,
                             transform=valid_transform)

    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(args.enas_search_config.ratio * num_train))

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=2)

    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=args.enas_search_config.child_eval_batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True, num_workers=2)

    test_queue = torch.utils.data.DataLoader(
        test_data, batch_size=args.evaluate_batch_size,
        shuffle=False, pin_memory=True, num_workers=8)

    repeat_valid_loader = RepeatedDataLoader(valid_queue)
    return train_queue, valid_queue, test_queue, repeat_valid_loader, criterion, eval_criterion
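# Illustrative numbers (assumed, not from the source): with num_train = 50000
# and enas_search_config.ratio = 0.9, split = 45000, so weight training draws
# from indices[:45000] and architecture evaluation from indices[45000:].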
def main():
    '''
    data: directory of the dataset
    batch_size: must not be set too large
    learning_rate / learning_rate_min / momentum / weight_decay: the four optimizer settings
    report_freq: how often to print progress reports
    epochs: 50 by default
    init_channels: initial number of feature channels; the channel count doubles
        as the network deepens
    layers: number of cells the network skeleton consists of during cell search
    cutout, cutout_length: whether to use cutout augmentation and its parameter (TODO)
    drop_path_prob: a parameter that reduces computation time and memory usage
        during the search
    save: name of the save directory
    seed: random seed
    grad_clip: gradient clipping, used against exploding gradients
    train_portion: fraction of the training data used for training; the remainder
        is treated as "validation" data (though it is not the held-out validation set)
    unrolled: one-step unrolled validation loss (TODO)
    arch_learning_rate / arch_weight_decay: learning rate and weight decay for
        updating the architecture parameters
    '''
    parser = argparse.ArgumentParser("cifar")
    parser.add_argument('--data', type=str, default='../data', help='location of the data corpus')
    parser.add_argument('--batch_size', type=int, default=64, help='batch size')
    parser.add_argument('--learning_rate', type=float, default=0.025, help='init learning rate')
    parser.add_argument('--learning_rate_min', type=float, default=0.001, help='min learning rate')
    parser.add_argument('--momentum', type=float, default=0.9, help='momentum')
    parser.add_argument('--weight_decay', type=float, default=3e-4, help='weight decay')
    parser.add_argument('--report_freq', type=float, default=50, help='report frequency')
    parser.add_argument('--gpu', type=str, default='0,1', help='gpu device id, split with ","')
    parser.add_argument('--epochs', type=int, default=50, help='num of training epochs')
    parser.add_argument('--init_channels', type=int, default=16, help='num of init channels')
    parser.add_argument('--layers', type=int, default=8, help='total number of layers')
    parser.add_argument('--model_path', type=str, default='saved_models', help='path to save the model')
    parser.add_argument('--cutout', action='store_true', default=False, help='use cutout')
    parser.add_argument('--cutout_length', type=int, default=16, help='cutout length')
    parser.add_argument('--drop_path_prob', type=float, default=0.3, help='drop path probability')
    parser.add_argument('--save', type=str, default='EXP', help='experiment name')
    parser.add_argument('--seed', type=int, default=12450, help='random seed')
    parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping')
    parser.add_argument('--train_portion', type=float, default=0.5, help='portion of training data')
    parser.add_argument('--unrolled', action='store_true', default=False,
                        help='use one-step unrolled validation loss')
    parser.add_argument('--arch_learning_rate', type=float, default=3e-4,
                        help='learning rate for arch encoding')
    parser.add_argument('--arch_weight_decay', type=float, default=1e-3,
                        help='weight decay for arch encoding')
    # Arguments for distributed data parallel
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of nodes (default: 1)')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    args = parser.parse_args()

    # Create the search directory and copy all *.py scripts into it
    # (glob.glob matches file paths against a pattern).
    args.save = 'search-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
    utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))

    # Logging: %(asctime)s is the current time, %(message)s the user's message.
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    args.CIFAR_CLASSES = 10

    # Setting GPU devices
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    gpus = [int(i) for i in args.gpu.split(',')]  # argparse string -> int list
    num_gpu = len(gpus)
    if num_gpu == 1:
        torch.cuda.set_device(int(args.gpu))
    logging.info('gpu device = %s' % args.gpu)
    logging.info("args = %s", args)

    # One process per GPU across all nodes.
    args.world_size = num_gpu * args.nodes
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '23456'
    mp.spawn(train_search, nprocs=num_gpu, args=(args,))
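# For reference, mp.spawn calls its target as train_search(local_rank, args).
# A minimal worker skeleton (assumed; the real train_search lives elsewhere):
#
# def train_search(gpu, args):
#     rank = args.nr * len(args.gpu.split(',')) + gpu  # global rank
#     torch.distributed.init_process_group(backend='nccl', init_method='env://',
#                                          world_size=args.world_size, rank=rank)
#     torch.cuda.set_device(gpu)
#     ...  # build the supernet, wrap it in DistributedDataParallel, run the search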
def main(macro_genome, micro_genome, epochs, search_space='micro',
         save='Design_1', expr_root='search', seed=0, gpu=0,
         init_channels=24, layers=11, auxiliary=False, cutout=False,
         drop_path_prob=0.0, batch_size=128):
    # ---- train logger ----------------- #
    save_pth = os.path.join(expr_root, '{}'.format(save))
    utils.create_exp_dir(save_pth)
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')

    # ---- parameter values setting ----- #
    CIFAR_CLASSES = config_dict()['n_classes']
    INPUT_CHANNELS = config_dict()['n_channels']
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = '../data'
    cutout_length = 16
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        'auxiliary': auxiliary,
        'auxiliary_weight': auxiliary_weight,
        'grad_clip': grad_clip,
        'report_freq': report_freq,
    }

    if search_space in ('micro', 'micro_garbage'):
        genome = micro_genome
        genotype = micro_encoding.decode(genome)
        model = Network(init_channels, CIFAR_CLASSES,
                        config_dict()['n_channels'], layers, auxiliary, genotype)
    elif search_space in ('macro', 'macro_garbage'):
        genome = macro_genome
        genotype = macro_encoding.decode(genome)
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(genotype, channels, CIFAR_CLASSES,
                           (config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH']),
                           decoder='residual')
    elif search_space == 'micromacro':
        genome = [macro_genome, micro_genome]
        macro_genotype = macro_encoding.decode(macro_genome)
        micro_genotype = micro_encoding.decode(micro_genome)
        genotype = [macro_genotype, micro_genotype]
        set_config('micro_creator', make_micro_creator(micro_genotype, convert=False))
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(macro_genotype, channels, CIFAR_CLASSES,
                           (config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH']),
                           decoder='residual')
    else:
        raise NameError('Unknown search space type')

    logging.info("Architecture = %s", genotype)

    torch.cuda.set_device(gpu)
    cudnn.benchmark = True
    torch.manual_seed(seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(seed)

    n_params = (np.sum(np.prod(v.size()) for v in
                       filter(lambda p: p.requires_grad, model.parameters())) / 1e6)
    model = model.to(device)
    logging.info("param size = %fMB", n_params)

    if config_dict()['problem'] == 'classification':
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.MSELoss()
    criterion = criterion.cuda()

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.SGD(parameters, learning_rate,
                                momentum=momentum, weight_decay=weight_decay)

    CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
    CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]
    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])
    if cutout:
        train_transform.transforms.append(utils.Cutout(cutout_length))
    train_transform.transforms.append(transforms.Normalize(CIFAR_MEAN, CIFAR_STD))
    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])

    train_data = my_cifar10.CIFAR10(root=data_root, train=True,
                                    download=False, transform=train_transform)
    valid_data = my_cifar10.CIFAR10(root=data_root, train=False,
                                    download=False, transform=valid_transform)

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size,
        pin_memory=True, num_workers=1)

    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=batch_size,
        pin_memory=True, num_workers=1)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, int(epochs))

    for epoch in range(epochs):
        scheduler.step()
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        model.droprate = drop_path_prob * epoch / epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer, train_params)
        logging.info(f'train_{config_dict()["performance_measure"]} %f', train_acc)

    valid_acc, valid_obj = infer(valid_queue, model, criterion)
    logging.info(f'valid_{config_dict()["performance_measure"]} %f', valid_acc)

    # Count FLOPs with a single forward pass on a dummy input.
    model = add_flops_counting_methods(model)
    model.eval()
    model.start_flops_count()
    random_data = torch.randn(1, INPUT_CHANNELS,
                              config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH'])
    model(torch.autograd.Variable(random_data).to(device))
    n_flops = np.round(model.compute_average_flops_cost() / 1e6, 4)
    logging.info('flops = %f', n_flops)

    # Save the results to file.
    with open(os.path.join(save_pth, 'log.txt'), "w") as f:
        f.write("Genome = {}\n".format(genome))
        f.write("Architecture = {}\n".format(genotype))
        f.write("param size = {}MB\n".format(n_params))
        f.write("flops = {}MB\n".format(n_flops))
        f.write("valid_acc = {}\n".format(valid_acc))

    return {
        'valid_acc': valid_acc,
        'params': n_params,
        'flops': n_flops,
    }
def main():
    if args.load_checkpoint:
        args.save = Path(args.load_checkpoint) / 'eval-imagenet-{}-{}'.format(
            args.save, time.strftime("%Y%m%d-%H%M%S"))
    else:
        args.save = Path('logs') / 'eval-imagenet-{}-{}'.format(
            args.save, time.strftime("%Y%m%d-%H%M%S"))
    utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))

    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(args.save / 'log.txt')
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    model = eval(args.model)

    if args.load_checkpoint:
        dictionary = torch.load(args.load_checkpoint)
        start_epoch = dictionary['epoch'] if args.start_epoch == -1 else args.start_epoch
        model.load_state_dict(dictionary['state_dict'])
    else:
        start_epoch = 0 if args.start_epoch == -1 else args.start_epoch

    # Keep a handle to the unwrapped model before DataParallel.
    direct_model = model
    if args.gpu:
        model = nn.DataParallel(model)

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # Note: the optimizer state is not restored from the checkpoint.
    # if args.load_checkpoint:
    #     optimizer.load_state_dict(dictionary['optimizer'])
    # del dictionary

    traindir = os.path.join(args.data, 'train')
    validdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_data = dset.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.4, contrast=0.4,
                                   saturation=0.4, hue=0.2),
            transforms.ToTensor(),
            normalize,
        ]))
    valid_data = dset.ImageFolder(
        validdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]))

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size, shuffle=True,
        pin_memory=True, num_workers=args.num_workers)
    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=args.batch_size, shuffle=False,
        pin_memory=True, num_workers=args.num_workers)

    if args.eval:
        direct_model.drop_path_prob = 0
        valid_acc_top1, valid_acc_top5, valid_obj = infer(valid_queue, model, args.gpu)
        logging.info('valid_acc_top1 %f', valid_acc_top1)
        logging.info('valid_acc_top5 %f', valid_acc_top5)
        return

    if args.period is not None:
        # Piecewise cosine annealing: restart the schedule at each period boundary.
        periods = [int(p) for p in args.period.split(',')]
        totals = []
        total = 0
        for p in periods:
            total += p
            totals.append(total)
        scheduler = CosineAnnealingLR(optimizer, periods[0])
    else:
        periods = None
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.decay_period,
                                                    gamma=args.gamma)

    best_acc_top1 = 0
    for epoch in range(start_epoch, args.epochs):
        if args.period is None:
            scheduler.step(epoch)
        else:
            assert len(periods) > 0
            index = bisect.bisect_left(totals, epoch)
            scheduler.T_max = periods[index]
            e = epoch if index == 0 else epoch - totals[index - 1]
            scheduler.step(e % periods[index])
            logging.info("schedule epoch:" + str(e % periods[index]))
            logging.info("schedule period:" + str(periods[index]))
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        direct_model.drop_path_prob = args.drop_path_prob * epoch / args.epochs

        train_acc, train_obj = train(train_queue, model, optimizer, args.gpu)
        logging.info('train_acc %f', train_acc)

        valid_acc_top1, valid_acc_top5, valid_obj = infer(valid_queue, model, args.gpu)
        logging.info('valid_acc_top1 %f', valid_acc_top1)
        logging.info('valid_acc_top5 %f', valid_acc_top5)

        is_best = False
        if valid_acc_top1 > best_acc_top1:
            best_acc_top1 = valid_acc_top1
            is_best = True

        utils.save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.module.state_dict(),
            'best_acc_top1': best_acc_top1,
            'optimizer': optimizer.state_dict(),
        }, is_best, args.save)
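# Hypothetical resume sketch: the keys mirror the dict passed to
# utils.save_checkpoint above; the checkpoint filename is an assumption.
# ckpt = torch.load(os.path.join(args.save, 'checkpoint.pth.tar'))
# model.module.load_state_dict(ckpt['state_dict'])
# optimizer.load_state_dict(ckpt['optimizer'])
# start_epoch = ckpt['epoch']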
def main():
    utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))
    print(args)

    # Basic setup
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.set_device(2)  # NOTE: GPU ids are hard-coded here and below
    cudnn.benchmark = True
    cudnn.enabled = True

    n_channels = 3
    n_bins = 2. ** args.n_bits
    approx_samples = 4

    # Define model
    model_single = Network(n_channels, args.n_flow, args.n_block, n_bins,
                           affine=args.affine, conv_lu=not args.no_lu)
    model = nn.DataParallel(model_single, device_ids=[2, 3])
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), args.learning_rate)
    dataset = iter(sample_cifar10(args.batch, args.img_size))

    # Sample latents for generated images
    z_sample = []
    z_shapes = calc_z_shapes(n_channels, args.img_size, args.n_flow, args.n_block)
    for z in z_shapes:
        z_new = torch.randn(args.n_sample, *z) * args.temp
        z_sample.append(z_new.to(device))

    with tqdm(range(args.iter)) as pbar:
        for i in pbar:
            # Training procedure
            model.train()

            # Get a random minibatch from the search queue with replacement
            input, _ = next(dataset)
            input = Variable(input, requires_grad=False).cuda(non_blocking=True)
            # Repeat each image so the likelihood variance can be estimated
            input = input.repeat(approx_samples, 1, 1, 1)

            log_p, logdet, _ = model(input + torch.rand_like(input) / n_bins)
            loss, _, _ = likelihood_loss(log_p, logdet, args.img_size, n_bins)
            loss_variance = likelihood_loss_variance(log_p, logdet, args.img_size,
                                                     n_bins, approx_samples)
            loss = loss + loss_variance

            # Optimize model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            pbar.set_description("Loss: {}".format(loss.item()))

            # Save generated samples
            if i % 100 == 0:
                with torch.no_grad():
                    tvutils.save_image(
                        model_single.reverse(z_sample).cpu().data,
                        "{}/samples/{}.png".format(args.save, str(i + 1).zfill(6)),
                        normalize=False,
                        nrow=10,
                    )

            # Save checkpoint
            if i % 1000 == 0:
                model_single.genotype()
                torch.save(
                    model.state_dict(),
                    "{}/checkpoint/model_{}.pt".format(args.save, str(i + 1).zfill(6)))
                # Save latest weights
                utils.save(model, os.path.join(args.save, 'latest_weights.pt'))
default=False, help="trans the embedding or not!") parser.add_argument('--first_order', action='store_true', default=False, help="use first order or not!") args = parser.parse_args() print("args ofm:", args.ofm) print("embedding_num:", args.embedding_num) save_name = 'experiments/{}/search-{}-{}-{}-{}-{}-{}-{}-{}'.format( args.dataset, time.strftime("%Y%m%d-%H%M%S"), args.mode, args.save, args.embedding_dim, args.opt, args.lr, args.arch_lr, args.seed) if args.unrolled: save_name += '-unrolled' save_name += '-' + str(np.random.randint(10000)) utils.create_exp_dir(save_name, scripts_to_save=glob.glob('*.py')) log_format = '%(asctime)s %(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%m/%d %I:%M:%S %p') fh = logging.FileHandler(os.path.join(save_name, 'log.txt')) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) def main(): if not torch.cuda.is_available(): logging.info('no gpu device available') np.random.seed(args.seed)
args.world_size = torch.distributed.get_world_size()

# Set up primitives.
from darts.genotypes import set_primitives
set_primitives(-1)

from util.datasets import imagenet_lmdb_dataset
from darts import genotypes
from util import utils
from darts.model import NetworkImageNet as Network
from darts.compute_flops import find_max_channels

# Set up logging.
assert args.root_dir
args.save = args.root_dir + '/eval_imagenet-{}'.format(args.save)
if args.local_rank == 0:
    utils.create_exp_dir(args.save)
logging = utils.Logger(args.local_rank, args.save)
writer = utils.Writer(args.local_rank, args.save)

CLASSES = 1000


class CrossEntropyLabelSmooth(nn.Module):
    """Smoothed cross-entropy loss."""

    def __init__(self, num_classes, epsilon):
        super(CrossEntropyLabelSmooth, self).__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, targets):
        # Standard label-smoothing forward (reconstructed; the original body is
        # cut off here): mix the one-hot targets with a uniform distribution.
        log_probs = self.logsoftmax(inputs)
        targets = torch.zeros_like(log_probs).scatter_(1, targets.unsqueeze(1), 1)
        targets = (1 - self.epsilon) * targets + self.epsilon / self.num_classes
        loss = (-targets * log_probs).mean(0).sum()
        return loss
parser.add_argument('--name', type=str, default="runs", help='name for log')
parser.add_argument('--train_portion', type=float, default=0.9,
                    help='portion of training data')
parser.add_argument('-j', '--workers', default=1, type=int, metavar='N',
                    help='number of data loading workers (default: 1)')
args = parser.parse_args()

args.save = 'eval-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'),
                     exec_script=args.exec_script)

# Logging configuration
utils.setup_logger(args)

# tensorboard_logger configuration
configure('{}/{}'.format(args.save, args.name))

CIFAR_CLASSES = 10
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)


def main():
    root = logging.getLogger()