def default_mutator_test_pipeline(self, mutator_cls):
    """Smoke-test a mutator class against every default model class.

    For each (model class, cuda mode) pair: sample-and-forward with a live
    mutator, then rebuild a fresh model, freeze it to the exported
    architecture, and forward once more to check the fixed path.
    """
    for net_cls in self.default_cls:
        for cuda_level in self.cuda_test:
            # Reset the global counter so mutable keys match between the
            # searched model and the rebuilt fixed model.
            _reset_global_mutable_counting()
            net = net_cls(self)
            mutator = mutator_cls(net)
            if cuda_level:
                net.cuda()
                mutator.cuda()
            if cuda_level > 1:
                # cuda_level > 1 means multi-GPU: wrap in DataParallel.
                net = nn.DataParallel(net)
            self.iterative_sample_and_forward(net, mutator, use_cuda=cuda_level)

            # Second pass: identical fresh model, fixed to the exported arch.
            _reset_global_mutable_counting()
            frozen = net_cls(self)
            if cuda_level:
                frozen.cuda()
            if cuda_level > 1:
                frozen = nn.DataParallel(frozen)
            with torch.no_grad():
                apply_fixed_architecture(frozen, mutator.export())
            self.iterative_sample_and_forward(frozen, n_iters=1, use_cuda=cuda_level)
def get_model(embedding, num_layers):
    """Build the child model with the architecture fixed from FLAGS.

    Requires FLAGS.child_fixed_arc to point at an exported architecture;
    all remaining hyper-parameters are read from FLAGS.
    """
    logger.info("num layers: {0}".format(num_layers))
    assert FLAGS.child_fixed_arc is not None, "Architecture should be provided."

    # Collect constructor arguments first so the Model call stays readable.
    model_kwargs = dict(
        embedding=embedding,
        hidden_units=FLAGS.child_out_filters_scale * FLAGS.child_out_filters,
        num_layers=num_layers,
        num_classes=FLAGS.class_num,
        # multi-path search widens the candidate pool per connection
        choose_from_k=5 if FLAGS.multi_path else 1,
        lstm_keep_prob=FLAGS.lstm_out_keep_prob,
        cnn_keep_prob=FLAGS.cnn_keep_prob,
        att_keep_prob=FLAGS.attention_keep_prob,
        att_mask=FLAGS.is_mask,
        embed_keep_prob=FLAGS.embed_keep_prob,
        final_output_keep_prob=FLAGS.final_output_keep_prob,
        global_pool=FLAGS.output_type,
    )
    child_model = Model(**model_kwargs)
    apply_fixed_architecture(child_model, FLAGS.child_fixed_arc)
    return child_model
def compare(self):
    """Train every candidate architecture briefly and return the best one.

    Splits the training set 80/20 into train/validation parts, retrains each
    architecture in ``self.arcs`` for ``self.train_epochs`` epochs, and
    returns the entry (size / val_acc / arc path) with the highest
    validation save_metric average.
    """
    self.logger.info("=" * 20)
    self.logger.info("Selecting the best architecture ...")
    self.enable_writter = False

    # Carve an 80/20 train/valid split out of the full training set.
    n_train = int(0.8 * len(self.train_dataset))
    n_valid = len(self.train_dataset) - n_train
    self.train_dataset_part, self.valid_dataset_part = torch.utils.data.random_split(
        self.train_dataset, [n_train, n_valid])

    # Both loaders share the same configuration.
    loader_kwargs = dict(
        batch_size=self.cfg.dataset.batch_size,
        shuffle=True,
        num_workers=self.cfg.dataset.workers,
        pin_memory=True)
    self.train_loader_part = torch.utils.data.DataLoader(
        self.train_dataset_part, **loader_kwargs)
    self.valid_loader_part = torch.utils.data.DataLoader(
        self.valid_dataset_part, **loader_kwargs)

    # Retrain each candidate briefly and record its validation accuracy.
    for arc in self.arcs:
        self.reset()
        self.mutator = apply_fixed_architecture(self.model, arc)
        size = self.model_size()
        arc_name = os.path.basename(arc)
        # size is a parameter count; *4 bytes (fp32) / 1024**2 -> MB
        self.logger.info(f"{arc} Model size={size*4/1024**2} MB")
        for epoch in range(self.train_epochs):
            self.train_one_epoch(epoch, self.train_loader_part)
        val_acc = self.valid_one_epoch(-1, self.valid_loader_part)
        self.size_acc[arc_name] = {'size': size, 'val_acc': val_acc, 'arc': arc}

    # Pick the entry with the highest save_metric average (ties resolve to
    # the earliest-inserted entry, matching a stable reverse sort).
    best_name, best_entry = max(
        self.size_acc.items(),
        key=lambda item: item[1]['val_acc']['save_metric'].avg)
    return best_entry
# CLI flags for retraining a fixed (exported) architecture on CIFAR-10.
parser.add_argument("--batch-size", default=96, type=int)
parser.add_argument("--log-frequency", default=10, type=int)
parser.add_argument("--epochs", default=600, type=int)
parser.add_argument("--aux-weight", default=0.4, type=float)  # weight of the auxiliary-head loss
parser.add_argument("--drop-path-prob", default=0.2, type=float)
parser.add_argument("--workers", default=4)  # NOTE(review): no type=int; only works because the default is already an int
parser.add_argument("--grad-clip", default=5., type=float)
parser.add_argument("--arc-checkpoint", default="./checkpoints/epoch_0.json")
args = parser.parse_args()

dataset_train, dataset_valid = datasets.get_dataset("cifar10", cutout_length=16)
# CNN args presumably (input_size=32, in_channels=3, channels=36, n_classes=10,
# n_layers, auxiliary head enabled) -- TODO confirm against the CNN signature.
model = CNN(32, 3, 36, 10, args.layers, auxiliary=True)
# Replace mutable choices with the fixed architecture from the JSON checkpoint.
apply_fixed_architecture(model, args.arc_checkpoint, device=device)
criterion = nn.CrossEntropyLoss()
model.to(device)
criterion.to(device)
optimizer = torch.optim.SGD(model.parameters(), 0.025, momentum=0.9, weight_decay=3.0E-4)
# Cosine decay over the full training run down to eta_min.
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=1E-6)
train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=args.batch_size,
# Remaining CLI flags, then model / optimizer / scheduler setup for retraining.
parser.add_argument('--ending_lr', default=0, type=float, help='ending learning rate')
parser.add_argument('--cutout', default=0, type=int, help='cutout length in data augmentation')
parser.add_argument("--channels", default=16, type=int)
args = parser.parse_args()

dataset_train, dataset_valid = datasets.get_dataset(
    "cifar10", cutout_length=args.cutout)
# CNN args presumably (input_size=32, in_channels=3, channels, n_classes=10,
# n_layers, auxiliary head enabled) -- TODO confirm against the CNN signature.
model = CNN(32, 3, args.channels, 10, args.layers, auxiliary=True)
# Freeze the searched architecture from the exported JSON checkpoint.
apply_fixed_architecture(model, args.arc_checkpoint)
criterion = nn.CrossEntropyLoss()
model.to(device)
criterion.to(device)
optimizer = torch.optim.SGD(model.parameters(), args.initial_lr,
                            momentum=0.9, weight_decay=args.weight_decay)
# Cosine decay from initial_lr down to ending_lr over all epochs.
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, args.epochs, eta_min=args.ending_lr)
train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True,
def main():
    """Retrain a fixed architecture, optionally distributed over NCCL.

    Loads the exported architecture JSON, applies it to the model, adjusts
    weight decay / drop-path by parameter size (CIFAR only), then runs the
    SGD + cosine-annealing training loop and logs the best top-1/top-5.
    """
    config = RetrainConfig()
    # Only rank 0 (or the single non-distributed process) logs and writes files.
    main_proc = not config.distributed or config.local_rank == 0
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method=config.dist_url,
                                             rank=config.local_rank, world_size=config.world_size)
    if main_proc:
        os.makedirs(config.output_path, exist_ok=True)
    if config.distributed:
        # Wait for rank 0 to create the output directory before proceeding.
        torch.distributed.barrier()
    logger = utils.get_logger(os.path.join(config.output_path, 'search.log'))
    if main_proc:
        config.print_params(logger.info)
    utils.reset_seed(config.seed)
    loaders, samplers = get_augment_datasets(config)
    train_loader, valid_loader = loaders
    train_sampler, valid_sampler = samplers
    model = Model(config.dataset, config.layers, in_channels=config.input_channels,
                  channels=config.init_channels, retrain=True).cuda()
    if config.label_smooth > 0:
        criterion = utils.CrossEntropyLabelSmooth(config.n_classes, config.label_smooth)
    else:
        criterion = nn.CrossEntropyLoss()
    # Load the searched architecture and fix the model to it.
    fixed_arc_path = os.path.join(config.output_path, config.arc_checkpoint)
    with open(fixed_arc_path, "r") as f:
        fixed_arc = json.load(f)
    fixed_arc = utils.encode_tensor(fixed_arc, torch.device("cuda"))
    genotypes = utils.parse_results(fixed_arc, n_nodes=4)
    genotypes_dict = {i: genotypes for i in range(3)}  # NOTE(review): appears unused below -- confirm before removing
    apply_fixed_architecture(model, fixed_arc_path)
    param_size = utils.param_size(
        model, criterion, [3, 32, 32] if 'cifar' in config.dataset else [3, 224, 224])
    if main_proc:
        logger.info("Param size: %.6f", param_size)
        logger.info("Genotype: %s", genotypes)
    # change training hyper parameters according to cell type
    if 'cifar' in config.dataset:
        if param_size < 3.0:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.2
        # NOTE(review): param_size == 3.0 exactly falls through to the else branch
        elif 3.0 < param_size < 3.5:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.3
        else:
            config.weight_decay = 5e-4
            config.drop_path_prob = 0.3
    if config.distributed:
        apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(model, delay_allreduce=True)
    optimizer = torch.optim.SGD(model.parameters(), config.lr, momentum=config.momentum,
                                weight_decay=config.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, config.epochs, eta_min=1E-6)
    best_top1 = best_top5 = 0.
    for epoch in range(config.epochs):
        # Linearly ramp drop-path probability over the course of training.
        drop_prob = config.drop_path_prob * epoch / config.epochs
        if config.distributed:
            model.module.drop_path_prob(drop_prob)
        else:
            model.drop_path_prob(drop_prob)
        # training
        if config.distributed:
            # Re-shuffle the distributed sampler each epoch.
            train_sampler.set_epoch(epoch)
        train(logger, config, train_loader, model, optimizer, criterion, epoch, main_proc)
        # validation
        top1, top5 = validate(logger, config, valid_loader, model, criterion, epoch, main_proc)
        best_top1 = max(best_top1, top1)
        best_top5 = max(best_top5, top5)
        lr_scheduler.step()
    logger.info("Final best Prec@1 = %.4f Prec@5 = %.4f", best_top1, best_top5)
        num_modules_per_stack=args.num_modules_per_stack,
        bn_affine=args.bn_affine,
        bn_momentum=args.bn_momentum,
        bn_track_running_stats=args.bn_track_running_stats)
optim = torch.optim.SGD(model.parameters(), 0.025)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, args.epochs, eta_min=0.001)
criterion = nn.CrossEntropyLoss()
# If a fixed architecture is supplied, retrain it and exit; otherwise fall
# through to ENAS architecture search below.
if args.arch is not None:
    logger.info('model retraining...')
    with open(args.arch, 'r') as f:
        arch = json.load(f)
    # Print the benchmark's recorded stats for this architecture
    # (200-epoch cifar100 entries) for reference.
    for trial in query_nb201_trial_stats(arch, 200, 'cifar100'):
        pprint.pprint(trial)
    apply_fixed_architecture(model, args.arch)
    dataloader_train = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, num_workers=0)
    dataloader_valid = DataLoader(dataset_valid, batch_size=args.batch_size, shuffle=True, num_workers=0)
    train(args, model, dataloader_train, dataloader_valid, criterion, optim,
          torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    exit(0)
trainer = enas.EnasTrainer(model,
                           loss=criterion,
                           metrics=lambda output, target: accuracy(output, target, topk=(1,)),
                           reward_function=reward_accuracy,
                           optimizer=optim,
                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./checkpoints")],
                           batch_size=args.batch_size,
                           num_epochs=args.epochs,
                           dataset_train=dataset_train,
# Dispatch on train_mode: 'search' runs ProxylessNAS architecture search and
# exports the best architecture; 'retrain' loads an exported architecture and
# retrains it from scratch.
if args.train_mode == 'search':
    # this is architecture search
    logger.info('Creating ProxylessNasTrainer...')
    trainer = ProxylessNasTrainer(model,
                                  model_optim=optimizer,
                                  train_loader=data_provider.train,
                                  valid_loader=data_provider.valid,
                                  device=device,
                                  warmup=args.warmup,
                                  ckpt_path=args.checkpoint_path,
                                  arch_path=args.arch_path)
    logger.info('Start to train with ProxylessNasTrainer...')
    trainer.train()
    logger.info('Training done')
    trainer.export(args.arch_path)
    logger.info('Best architecture exported in %s', args.arch_path)
elif args.train_mode == 'retrain':
    # this is retrain
    from nni.nas.pytorch.fixed import apply_fixed_architecture
    assert os.path.isfile(args.exported_arch_path), \
        "exported_arch_path {} should be a file.".format(args.exported_arch_path)
    apply_fixed_architecture(model, args.exported_arch_path, device=device)
    trainer = Retrain(model, optimizer, device, data_provider, n_epochs=300)
    trainer.run()
def run(self, arc, validate=True, test=False):
    '''Retrain the best-performing architecture from scratch.

    arc: the json file path of the best-performing arch
    validate: if True, run validation after each training epoch
    test: if True, skip training entirely and return test-set metrics
    '''
    self.logger.info("=" * 20)
    self.logger.info("Retraining the best architecture ...")
    self.enable_writter = True
    self.reset()
    # init model and mutator: fix the model to the exported architecture
    self.mutator = apply_fixed_architecture(self.model, arc)
    size = self.model_size()
    arc_name = os.path.basename(arc)
    # size is a parameter count; *4 bytes (fp32) / 1024**2 -> MB
    self.logger.info(f"{arc_name} Model size={size*4/1024**2} MB")
    # callbacks
    for callback in self.callbacks:
        callback.build(self.model, self.mutator, self)
    # resume from a previous checkpoint if one exists
    self.start_epoch = 0
    self.resume()
    # finetune
    # todo: improve robustness, bug of optimizer resume
    # if self.cfg.model.finetune:
    #     self.logger.info("Freezing params of conv part ...")
    #     for name, param in self.model.named_parameters():
    #         if 'dense' not in name:
    #             param.requires_grad = False
    # dataparallel: wrap model (and distillation teacher, if any) over GPUs
    if len(self.cfg.trainer.device_ids) > 1:
        device_ids = self.cfg.trainer.device_ids
        num_gpus_available = torch.cuda.device_count()
        assert num_gpus_available >= len(
            device_ids), "you can only use {} device(s)".format(
                num_gpus_available)
        self.model = torch.nn.DataParallel(self.model, device_ids=device_ids)
        if self.kd_model:
            self.kd_model = torch.nn.DataParallel(self.kd_model, device_ids=device_ids)
    if test:
        # Evaluation-only path: one pass over the test loader, no training.
        meters = self.test_one_epoch(-1, self.test_loader)
        self.logger.info(f"Final test metrics= {meters}")
        return meters
    # start training
    for epoch in range(self.start_epoch, self.cfg.evaluator.num_epochs):
        for callback in self.callbacks:
            callback.on_epoch_begin(epoch)
        self.logger.info("Epoch %d Training", epoch)
        self.train_one_epoch(epoch, self.train_loader)
        if validate:
            # NOTE(review): validation runs on self.test_loader, not a held-out
            # validation split -- confirm this is intentional.
            self.logger.info("Epoch %d Validating", epoch)
            self.valid_one_epoch(epoch, self.test_loader)
        self.lr_scheduler.step()
        # Prefer validation meters when they exist, else fall back to training meters.
        self.cur_meters = getattr(self, 'valid_meters', self.train_meters)
        for callback in self.callbacks:
            if isinstance(callback, CheckpointCallback):
                callback.update_best_metric(
                    self.cur_meters.meters['save_metric'].avg)
            callback.on_epoch_end(epoch)
    self.logger.info("Final best Prec@1 = {:.4%}".format(self.best_metric))
def main():
    """Retrain a fixed architecture with AdamW, resuming from a checkpoint
    when one is available.

    Loads the exported architecture JSON, applies it to the model, optionally
    restores model/optimizer state from ``config.model_checkpoint``, then runs
    the training loop and logs the best top-1.
    """
    config = RetrainConfig()
    # Only rank 0 (or the single non-distributed process) logs and writes files.
    main_proc = not config.distributed or config.local_rank == 0
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method=config.dist_url,
                                             rank=config.local_rank, world_size=config.world_size)
    if main_proc:
        os.makedirs(config.output_path, exist_ok=True)
    if config.distributed:
        # Wait for rank 0 to create the output directory before proceeding.
        torch.distributed.barrier()
    logger = utils.get_logger(os.path.join(config.output_path, 'search.log'))
    if main_proc:
        config.print_params(logger.info)
    utils.reset_seed(config.seed)
    loaders, samplers = get_augment_datasets(config)
    train_loader, valid_loader = loaders
    train_sampler, valid_sampler = samplers
    train_loader = CyclicIterator(train_loader, train_sampler)
    # valid_loader = CyclicIterator(valid_loader, valid_sampler, False)
    model = Model(config.dataset, config.layers, in_channels=config.input_channels,
                  channels=config.init_channels, retrain=True).cuda()
    if config.label_smooth > 0:
        criterion = utils.CrossEntropyLabelSmooth(config.n_classes, config.label_smooth)
    else:
        criterion = nn.CrossEntropyLoss()
    # Load the searched architecture and fix the model to it.
    fixed_arc_path = os.path.join('', config.arc_checkpoint)
    with open(fixed_arc_path, "r") as f:
        fixed_arc = json.load(f)
    fixed_arc = utils.encode_tensor(fixed_arc, torch.device("cuda"))
    genotypes = utils.parse_results(fixed_arc, n_nodes=4)
    genotypes_dict = {i: genotypes for i in range(3)}
    apply_fixed_architecture(model, fixed_arc_path)
    param_size = utils.param_size(model, criterion, [3, 512, 512])
    if main_proc:
        logger.info("Param size: %.6f", param_size)
        logger.info("Genotype: %s", genotypes)
    # change training hyper parameters according to cell type
    if 'cifar' in config.dataset:
        if param_size < 3.0:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.2
        elif 3.0 < param_size < 3.5:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.3
        else:
            config.weight_decay = 5e-4
            config.drop_path_prob = 0.3
    if config.distributed:
        apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(model, delay_allreduce=True)
    optimizer = torch.optim.AdamW(model.parameters(), config.lr)
    # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, config.epochs, eta_min=1E-6)
    best_top1 = 0.
    epoch = 0
    try:
        checkpoint = torch.load(config.model_checkpoint)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        loss = checkpoint['loss']  # also validates that the checkpoint has the expected keys
        # NOTE(review): model is left in eval mode here; presumably train()
        # switches it back to train mode -- confirm.
        model.eval()
        print("----------------------------")
        print("MODEL LOADED FROM CHECKPOINT" + config.model_checkpoint)
        print("----------------------------")
    # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt and
    # SystemExit. Restrict to Exception so the "best effort resume" intent is
    # kept without hiding interpreter-level signals.
    except Exception:
        print("----------------------------")
        print("MODEL NOT LOADED FROM CHECKPOINT")
        print("----------------------------")
    # for epoch in range(0, epoch):
    #     lr_scheduler.step()
    for epoch in range(epoch, config.epochs):
        # Linearly ramp drop-path probability over the course of training.
        drop_prob = config.drop_path_prob * epoch / config.epochs
        if config.distributed:
            model.module.drop_path_prob(drop_prob)
        else:
            model.drop_path_prob(drop_prob)
        # training
        if config.distributed:
            # Re-shuffle the distributed sampler each epoch.
            train_sampler.set_epoch(epoch)
        train(logger, config, train_loader, model, optimizer, criterion, epoch, main_proc)
        if (epoch % config.log_frequency == 0):
            # validation
            top1 = validate(logger, config, valid_loader, model, criterion, epoch, main_proc)
            best_top1 = max(best_top1, top1)
        # lr_scheduler.step()
    logger.info("Final best Prec@1 = %.4f", best_top1)
def main(args):
    """Train either a NAS model (search or fixed-architecture retrain) or a
    plain torchvision-style model, reporting accuracy to NNI.

    args: parsed CLI namespace (model, fix_arch, optimizer, lr settings, ...).
    """
    reset_seed(args.seed)
    prepare_logger(args)
    logger.info("These are the hyper-parameters you want to tune:\n%s", pprint.pformat(vars(args)))

    if args.model == 'nas':
        logger.info("Using NAS.\n")
        if args.fix_arch:
            if not os.path.exists(args.arc_checkpoint):
                # FIX: message was garbled ("don not fix archetect").
                print(args.arc_checkpoint, 'does not exist, will not fix the architecture')
                args.fix_arch = False

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if args.model == 'nas':
        if not args.fix_arch:
            # Search mode: DartsTrainer below needs raw datasets, not loaders.
            model = CNN(32, 3, args.channels, 10, args.layers)
            trainset, testset = data_preprocess(args)
        else:
            # Retrain mode: freeze the exported architecture onto the model.
            model = CNN(32, 3, args.channels, 10, args.layers)
            apply_fixed_architecture(model, args.arc_checkpoint)
            model.to(device)
            train_loader, test_loader = data_preprocess(args)
    else:
        train_loader, test_loader = data_preprocess(args)
        model = models.__dict__[args.model]()
        model.to(device)

    criterion = nn.CrossEntropyLoss()
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.initial_lr,
                               weight_decay=args.weight_decay)
    else:
        if args.optimizer == 'sgd':
            optimizer_cls = optim.SGD
        elif args.optimizer == 'rmsprop':
            optimizer_cls = optim.RMSprop
        else:
            # FIX: an unsupported value previously crashed later with an
            # opaque NameError on optimizer_cls; fail fast with a clear error.
            raise ValueError("unsupported optimizer: {}".format(args.optimizer))
        optimizer = optimizer_cls(model.parameters(), lr=args.initial_lr,
                                  momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=args.ending_lr)

    if args.model == 'nas' and not args.fix_arch:
        # Architecture search with DARTS; exports the best architecture at the end.
        trainer = DartsTrainer(model,
                               loss=criterion,
                               metrics=lambda output, target: accuracyTopk(
                                   output, target, topk=(1, )),
                               optimizer=optimizer,
                               num_epochs=args.epochs,
                               dataset_train=trainset,
                               dataset_valid=testset,
                               batch_size=args.batch_size,
                               log_frequency=args.log_frequency,
                               unrolled=args.unrolled,
                               callbacks=[
                                   LRSchedulerCallback(scheduler),
                                   ArchitectureCheckpoint("./checkpoints")
                               ])
        if args.visualization:
            trainer.enable_visualization()
        trainer.train()
        trainer.export("final_arch.json")
    else:
        # Plain supervised training loop with per-epoch NNI reporting.
        for epoch in range(1, args.epochs + 1):
            train(model, train_loader, criterion, optimizer, scheduler, args, epoch, device)
            top1, _ = test(model, test_loader, criterion, args, epoch, device)
            nni.report_intermediate_result(top1)
        logger.info("Final accuracy is: %.6f", top1)
        nni.report_final_result(top1)
    # Parameters matched by `keys` are excluded from weight decay.
    {'params': get_parameters(model, keys, mode='include'), 'weight_decay': 0},
], lr=0.05, momentum=momentum, nesterov=nesterov)
else:
    # Single parameter group: uniform weight decay across the model.
    optimizer = torch.optim.SGD(get_parameters(model), lr=0.05, momentum=momentum,
                                nesterov=nesterov, weight_decay=4e-5)

# Dispatch on train_mode: 'search' runs ProxylessNAS architecture search and
# exports the best architecture; 'retrain' loads an exported architecture and
# retrains it from scratch.
if args.train_mode == 'search':
    # this is architecture search
    logger.info('Creating ProxylessNasTrainer...')
    trainer = ProxylessNasTrainer(model,
                                  model_optim=optimizer,
                                  train_loader=data_provider.train,
                                  valid_loader=data_provider.valid,
                                  device=device,
                                  warmup=args.warmup,
                                  ckpt_path=args.checkpoint_path,
                                  arch_path=args.arch_path)
    logger.info('Start to train with ProxylessNasTrainer...')
    trainer.train()
    logger.info('Training done')
    trainer.export(args.arch_path)
    logger.info('Best architecture exported in %s', args.arch_path)
elif args.train_mode == 'retrain':
    # this is retrain
    from nni.nas.pytorch.fixed import apply_fixed_architecture
    assert os.path.isfile(args.exported_arch_path), \
        "exported_arch_path {} should be a file.".format(args.exported_arch_path)
    # NOTE(review): unlike the sibling retrain script, no device= kwarg is
    # passed to apply_fixed_architecture here -- confirm the default matches.
    apply_fixed_architecture(model, args.exported_arch_path)
    trainer = Retrain(model, optimizer, device, data_provider, n_epochs=300)
    trainer.run()