def new_epoch(self, epoch):
    # We sample as many architectures as we need to fill the population
    if epoch < self.population_size:
        logger.info("Start sampling architectures to fill the population")
        # If there is no scope defined, let's use the search space default one
        model = torch.nn.Module()  # hacky way to get arch and accuracy checkpointable
        model.arch = sample_random_architecture(self.search_space, self.scope)
        model.accuracy = model.arch.query(self.performance_metric, self.dataset)

        self.population.append(model)
        self._update_history(model)
        log_every_n_seconds(
            logging.INFO,
            "Population size {}".format(len(self.population)))
    else:
        # Tournament selection: draw sample_size candidates and mutate the fittest
        sample = []
        while len(sample) < self.sample_size:
            candidate = np.random.choice(list(self.population))
            sample.append(candidate)

        parent = max(sample, key=lambda x: x.accuracy)

        child = torch.nn.Module()  # hacky way to get arch and accuracy checkpointable
        child.arch = mutate(parent.arch)
        child.accuracy = child.arch.query(self.performance_metric, self.dataset)

        self.population.append(child)
        self._update_history(child)
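# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the optimizer above): a self-contained
# version of the aging-evolution loop that new_epoch() implements one step of,
# after Real et al.'s "Regularized Evolution". The toy_* helpers are
# hypothetical stand-ins for sample_random_architecture, mutate and
# arch.query used above.
# ---------------------------------------------------------------------------
import collections
import random


def toy_sample():
    return [random.randint(0, 4) for _ in range(6)]  # a random "architecture"


def toy_mutate(arch):
    child = list(arch)
    child[random.randrange(len(child))] = random.randint(0, 4)
    return child


def toy_fitness(arch):
    return sum(arch)  # stand-in for arch.query(metric, dataset)


def regularized_evolution(cycles=100, population_size=20, sample_size=5):
    # deque(maxlen=...) drops the oldest member on every append once full:
    # this aging is the "regularized" part of regularized evolution.
    population = collections.deque(maxlen=population_size)
    history = []
    for _ in range(population_size):  # fill phase, cf. epoch < self.population_size
        arch = toy_sample()
        population.append((arch, toy_fitness(arch)))
        history.append(population[-1])
    for _ in range(cycles):  # evolve phase, cf. the else branch above
        sample = [random.choice(list(population)) for _ in range(sample_size)]
        parent = max(sample, key=lambda x: x[1])  # tournament winner
        child = toy_mutate(parent[0])
        population.append((child, toy_fitness(child)))
        history.append(population[-1])
    return max(history, key=lambda x: x[1])  # best architecture ever seen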
def search(self, resume_from=""):
    """
    Start the architecture search.

    Generates a json file with training statistics.

    Args:
        resume_from (str): Checkpoint file to resume from. If not given then
            train from scratch.
    """
    logger.info("Start training")
    self.optimizer.before_training()
    checkpoint_freq = self.config.search.checkpoint_freq
    if self.optimizer.using_step_function:
        self.scheduler = self.build_search_scheduler(self.optimizer.op_optimizer, self.config)
        start_epoch = self._setup_checkpointers(resume_from, period=checkpoint_freq, scheduler=self.scheduler)
    else:
        start_epoch = self._setup_checkpointers(resume_from, period=checkpoint_freq)

    self.train_queue, self.valid_queue, _ = self.build_search_dataloaders(self.config)

    for e in range(start_epoch, self.epochs):
        self.optimizer.new_epoch(e)

        start_time = time.time()
        if self.optimizer.using_step_function:
            for step, (data_train, data_val) in enumerate(zip(self.train_queue, self.valid_queue)):
                data_train = (data_train[0].to(self.device),
                              data_train[1].to(self.device, non_blocking=True))
                data_val = (data_val[0].to(self.device),
                            data_val[1].to(self.device, non_blocking=True))

                stats = self.optimizer.step(data_train, data_val)
                logits_train, logits_val, train_loss, val_loss = stats

                self._store_accuracies(logits_train, data_train[1], 'train')
                self._store_accuracies(logits_val, data_val[1], 'val')

                log_every_n_seconds(
                    logging.INFO,
                    "Epoch {}-{}, Train loss: {:.5f}, validation loss: {:.5f}, learning rate: {}".format(
                        e, step, train_loss, val_loss, self.scheduler.get_last_lr()), n=5)

                if torch.cuda.is_available():
                    log_first_n(logging.INFO, "cuda consumption\n {}".format(
                        torch.cuda.memory_summary()), n=3)

                self.train_loss.update(float(train_loss.detach().cpu()))
                self.val_loss.update(float(val_loss.detach().cpu()))

            self.scheduler.step()

            end_time = time.time()

            self.errors_dict.train_acc.append(self.train_top1.avg)
            self.errors_dict.train_loss.append(self.train_loss.avg)
            self.errors_dict.valid_acc.append(self.val_top1.avg)
            self.errors_dict.valid_loss.append(self.val_loss.avg)
            self.errors_dict.runtime.append(end_time - start_time)
        else:
            end_time = time.time()
            train_acc, train_loss, valid_acc, valid_loss = self.optimizer.train_statistics()
            self.errors_dict.train_acc.append(train_acc)
            self.errors_dict.train_loss.append(train_loss)
            self.errors_dict.valid_acc.append(valid_acc)
            self.errors_dict.valid_loss.append(valid_loss)
            self.errors_dict.runtime.append(end_time - start_time)
            self.train_top1.avg = train_acc
            self.val_top1.avg = valid_acc

        self.periodic_checkpointer.step(e)

        anytime_results = self.optimizer.test_statistics()
        if anytime_results:
            # record anytime performance
            self.errors_dict.arch_eval.append(anytime_results)
            log_every_n_seconds(
                logging.INFO,
                "Epoch {}, Anytime results: {}".format(e, anytime_results), n=5)

        self._log_to_json()
        self._log_and_reset_accuracies(e)

    self.optimizer.after_training()
    logger.info("Training finished")
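# ---------------------------------------------------------------------------
# Illustrative sketch: how search() is typically driven. The surrounding
# setup (get_config_from_args, a search space class, adapt_search_space)
# follows NASLib conventions but is an assumption here, so it is kept as a
# commented example rather than live code.
#
#     config = utils.get_config_from_args()        # hypothetical config loader
#     search_space = NasBench201SearchSpace()      # any queryable search space
#     optimizer = RegularizedEvolution(config)
#     optimizer.adapt_search_space(search_space)
#
#     trainer = Trainer(optimizer, config)
#     trainer.search(resume_from="")   # runs the loop above, logs stats to json
#     trainer.evaluate()               # queries or retrains the final architecture
# ---------------------------------------------------------------------------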
def evaluate(
        self,
        retrain=True,
        search_model="",
        resume_from="",
        best_arch=None,
):
    """
    Evaluate the final architecture as given from the optimizer.

    If the search space has an interface to a benchmark then query that.
    Otherwise train as defined in the config.

    Args:
        retrain (bool): Reset the weights from the architecture search
        search_model (str): Path to checkpoint file that was created during
            search. If not provided, then try to load 'model_final.pth' from search
        resume_from (str): Resume retraining from the given checkpoint file.
        best_arch: Parsed model you want to directly evaluate and ignore the
            final model from the optimizer.
    """
    logger.info("Start evaluation")
    if not best_arch:
        if not search_model:
            search_model = os.path.join(self.config.save, "search", "model_final.pth")
        self._setup_checkpointers(search_model)  # required to load the architecture
        best_arch = self.optimizer.get_final_architecture()
    logger.info("Final architecture:\n" + best_arch.modules_str())

    if best_arch.QUERYABLE:
        metric = Metric.TEST_ACCURACY
        result = best_arch.query(metric=metric, dataset=self.config.dataset)
        logger.info("Queried results ({}): {}".format(metric, result))
    else:
        best_arch.to(self.device)
        if retrain:
            logger.info("Starting retraining from scratch")
            best_arch.reset_weights(inplace=True)

            self.train_queue, self.valid_queue, self.test_queue = self.build_eval_dataloaders(self.config)

            optim = self.build_eval_optimizer(best_arch.parameters(), self.config)
            scheduler = self.build_eval_scheduler(optim, self.config)

            start_epoch = self._setup_checkpointers(
                resume_from,
                search=False,
                period=self.config.evaluation.checkpoint_freq,
                model=best_arch,  # checkpointables start here
                optim=optim,
                scheduler=scheduler)

            grad_clip = self.config.evaluation.grad_clip
            loss = torch.nn.CrossEntropyLoss()

            best_arch.train()
            self.train_top1.reset()
            self.train_top5.reset()
            self.val_top1.reset()
            self.val_top5.reset()

            # Enable drop path
            best_arch.update_edges(
                update_func=lambda edge: edge.data.set('op', DropPathWrapper(edge.data.op)),
                scope=best_arch.OPTIMIZER_SCOPE,
                private_edge_data=True)

            # train from scratch
            epochs = self.config.evaluation.epochs
            for e in range(start_epoch, epochs):
                if torch.cuda.is_available():
                    log_first_n(logging.INFO, "cuda consumption\n {}".format(
                        torch.cuda.memory_summary()), n=20)

                # update drop path probability (linearly increased over the epochs)
                drop_path_prob = self.config.evaluation.drop_path_prob * e / epochs
                best_arch.update_edges(
                    update_func=lambda edge: edge.data.set('drop_path_prob', drop_path_prob),
                    scope=best_arch.OPTIMIZER_SCOPE,
                    private_edge_data=True)

                # Train queue
                for i, (input_train, target_train) in enumerate(self.train_queue):
                    input_train = input_train.to(self.device)
                    target_train = target_train.to(self.device, non_blocking=True)

                    optim.zero_grad()
                    logits_train = best_arch(input_train)
                    train_loss = loss(logits_train, target_train)
                    if hasattr(best_arch, 'auxilary_logits'):  # darts specific stuff
                        log_first_n(logging.INFO, "Auxiliary is used", n=10)
                        auxiliary_loss = loss(best_arch.auxilary_logits(), target_train)
                        train_loss += self.config.evaluation.auxiliary_weight * auxiliary_loss
                    train_loss.backward()
                    if grad_clip:
                        torch.nn.utils.clip_grad_norm_(best_arch.parameters(), grad_clip)
                    optim.step()

                    self._store_accuracies(logits_train, target_train, 'train')
                    log_every_n_seconds(
                        logging.INFO,
                        "Epoch {}-{}, Train loss: {:.5}, learning rate: {}".format(
                            e, i, train_loss, scheduler.get_last_lr()), n=5)

                # Validation queue
                if self.valid_queue:
                    for i, (input_valid, target_valid) in enumerate(self.valid_queue):
                        # use self.device rather than bare .cuda() so CPU runs also work
                        input_valid = input_valid.to(self.device).float()
                        target_valid = target_valid.to(self.device, non_blocking=True).float()

                        # just log the validation accuracy
                        with torch.no_grad():
                            logits_valid = best_arch(input_valid)
                            self._store_accuracies(logits_valid, target_valid, 'val')

                scheduler.step()
                self.periodic_checkpointer.step(e)
                self._log_and_reset_accuracies(e)

        # Disable drop path
        best_arch.update_edges(
            update_func=lambda edge: edge.data.set('op', edge.data.op.get_embedded_ops()),
            scope=best_arch.OPTIMIZER_SCOPE,
            private_edge_data=True)

        # measure final test accuracy
        top1 = utils.AverageMeter()
        top5 = utils.AverageMeter()

        best_arch.eval()

        for i, data_test in enumerate(self.test_queue):
            input_test, target_test = data_test
            input_test = input_test.to(self.device)
            target_test = target_test.to(self.device, non_blocking=True)

            n = input_test.size(0)

            with torch.no_grad():
                logits = best_arch(input_test)

                prec1, prec5 = utils.accuracy(logits, target_test, topk=(1, 5))
                top1.update(prec1.data.item(), n)
                top5.update(prec5.data.item(), n)

            log_every_n_seconds(
                logging.INFO,
                "Inference batch {} of {}.".format(i, len(self.test_queue)), n=5)

        logger.info(
            "Evaluation finished. Test accuracies: top-1 = {:.5}, top-5 = {:.5}".format(
                top1.avg, top5.avg))
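# ---------------------------------------------------------------------------
# Illustrative sketch: the linear drop-path schedule used above, together
# with a generic stochastic-depth module showing what a DropPathWrapper-style
# op does at train time. This is a minimal sketch, not NASLib's actual
# DropPathWrapper implementation.
# ---------------------------------------------------------------------------
import torch


class ToyDropPath(torch.nn.Module):
    def __init__(self, op):
        super().__init__()
        self.op = op
        self.drop_path_prob = 0.0  # set from outside, cf. update_edges above

    def forward(self, x):
        x = self.op(x)
        if self.training and self.drop_path_prob > 0.0:
            keep = 1.0 - self.drop_path_prob
            # Per-sample binary mask, rescaled so the expected value is unchanged
            shape = (x.size(0),) + (1,) * (x.dim() - 1)
            mask = torch.bernoulli(torch.full(shape, keep, device=x.device))
            x = x * mask / keep
        return x


# The schedule above ramps the probability linearly from 0 towards the
# configured maximum, e.g. with epochs=600 and drop_path_prob=0.2:
#     epoch 0   -> 0.2 * 0 / 600   = 0.0
#     epoch 300 -> 0.2 * 300 / 600 = 0.1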
def evaluate(
        self,
        retrain=True,
        search_model="",
        resume_from="",
        best_arch=None,
):
    """
    Evaluate the final architecture as given from the optimizer.

    If the search space has an interface to a benchmark then query that.
    Otherwise train as defined in the config, distributing over multiple GPUs
    if so configured.

    Args:
        retrain (bool): Reset the weights from the architecture search
        search_model (str): Path to checkpoint file that was created during
            search. If not provided, then try to load 'model_final.pth' from search
        resume_from (str): Resume retraining from the given checkpoint file.
        best_arch: Parsed model you want to directly evaluate and ignore the
            final model from the optimizer.
    """
    self.config.evaluation.resume_from = resume_from

    if retrain:
        if self.config.gpu is not None:
            logger.warning('You have chosen a specific GPU. This will completely disable data parallelism.')

        if self.config.evaluation.dist_url == "env://" and self.config.evaluation.world_size == -1:
            self.config.evaluation.world_size = int(os.environ["WORLD_SIZE"])

        self.config.evaluation.distributed = \
            self.config.evaluation.world_size > 1 or self.config.evaluation.multiprocessing_distributed

        ngpus_per_node = torch.cuda.device_count()
        if self.config.evaluation.multiprocessing_distributed:
            # Since we have ngpus_per_node processes per node, the total
            # world_size needs to be adjusted accordingly
            self.config.evaluation.world_size = ngpus_per_node * self.config.evaluation.world_size
            # Use torch.multiprocessing.spawn to launch distributed processes:
            # one main_worker process per GPU
            mp.spawn(self.main_worker,
                     nprocs=ngpus_per_node,
                     args=(ngpus_per_node, self.config.evaluation, search_model, best_arch))
        else:
            # Simply call the main_worker function
            self.main_worker(self.config.gpu, ngpus_per_node, self.config.evaluation,
                             search_model, best_arch)

    # self.QUERYABLE is only set by main_worker when the search space has a
    # benchmark interface, so default to False if the attribute is absent
    if not getattr(self, "QUERYABLE", False):
        # Disable drop path
        best_arch.update_edges(
            update_func=lambda edge: edge.data.set('op', edge.data.op.get_embedded_ops()),
            scope=best_arch.OPTIMIZER_SCOPE,
            private_edge_data=True)

        # measure final test accuracy
        top1 = utils.AverageMeter()
        top5 = utils.AverageMeter()

        best_arch.eval()

        for i, data_test in enumerate(self.test_queue):
            input_test, target_test = data_test
            input_test = input_test.to(self.device)
            target_test = target_test.to(self.device, non_blocking=True)

            n = input_test.size(0)

            with torch.no_grad():
                logits = best_arch(input_test)

                prec1, prec5 = utils.accuracy(logits, target_test, topk=(1, 5))
                top1.update(prec1.data.item(), n)
                top5.update(prec5.data.item(), n)

            log_every_n_seconds(
                logging.INFO,
                "Inference batch {} of {}.".format(i, len(self.test_queue)), n=5)

        logger.info(
            "Evaluation finished. Test accuracies: top-1 = {:.5}, top-5 = {:.5}".format(
                top1.avg, top5.avg))
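# ---------------------------------------------------------------------------
# Illustrative sketch: the world-size/rank arithmetic behind the mp.spawn
# call above and the rank adjustment in main_worker below, as a standalone
# helper (the function and its example values are hypothetical).
# ---------------------------------------------------------------------------
def global_rank_and_world_size(node_rank, nodes, ngpus_per_node, local_gpu):
    # evaluate() rescales world_size from "number of nodes" to "number of
    # processes" (one per GPU); main_worker then derives the global rank of
    # each spawned process from its node rank and local GPU index.
    world_size = nodes * ngpus_per_node
    rank = node_rank * ngpus_per_node + local_gpu
    return rank, world_size


# Two nodes with 4 GPUs each: the process driving GPU 2 on node 1 gets
# global_rank_and_world_size(1, 2, 4, 2) -> (6, 8)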
def main_worker(self, gpu, ngpus_per_node, args, search_model, best_arch):
    logger.info("Start evaluation")
    if not best_arch:
        if not search_model:
            search_model = os.path.join(self.config.save, "search", "model_final.pth")
        self._setup_checkpointers(search_model)  # required to load the architecture
        best_arch = self.optimizer.get_final_architecture()
    logger.info("Final architecture:\n" + best_arch.modules_str())

    if best_arch.QUERYABLE:
        metric = Metric.TEST_ACCURACY
        result = best_arch.query(metric=metric, dataset=self.config.dataset)
        logger.info("Queried results ({}): {}".format(metric, result))
        self.QUERYABLE = True
        return

    best_arch.reset_weights(inplace=True)
    logger.info("Starting retraining from scratch")

    args.gpu = gpu
    if gpu is not None:
        logger.info("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    if not torch.cuda.is_available():
        logger.warning("Using CPU, this will be slow!")
    elif args.distributed:
        # For multiprocessing distributed, the DistributedDataParallel
        # constructor should always set the single device scope, otherwise
        # DistributedDataParallel will use all available devices
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            best_arch.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            best_arch = torch.nn.parallel.DistributedDataParallel(best_arch, device_ids=[args.gpu])
        else:
            best_arch.cuda()
            # DistributedDataParallel will divide and allocate batch_size to
            # all available GPUs if device_ids are not set
            best_arch = torch.nn.parallel.DistributedDataParallel(best_arch)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        best_arch = best_arch.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        best_arch = torch.nn.DataParallel(best_arch).cuda()

    cudnn.benchmark = True

    self.train_queue, self.valid_queue, self.test_queue = self.build_eval_dataloaders(self.config)

    optim = self.build_eval_optimizer(best_arch.parameters(), self.config)
    scheduler = self.build_eval_scheduler(optim, self.config)

    start_epoch = self._setup_checkpointers(
        args.resume_from,
        search=False,
        period=self.config.evaluation.checkpoint_freq,
        model=best_arch,  # checkpointables start here
        optim=optim,
        scheduler=scheduler)

    grad_clip = self.config.evaluation.grad_clip
    loss = torch.nn.CrossEntropyLoss()

    best_arch.train()
    self.train_top1.reset()
    self.train_top5.reset()
    self.val_top1.reset()
    self.val_top5.reset()

    # Enable drop path. Both DataParallel and DistributedDataParallel wrap the
    # model, so the graph methods live on .module in those cases (the original
    # check only covered DataParallel, which DDP is not a subclass of).
    _parallel_wrappers = (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)
    if isinstance(best_arch, _parallel_wrappers):
        best_arch.module.update_edges(
            update_func=lambda edge: edge.data.set('op', DropPathWrapper(edge.data.op)),
            scope=best_arch.module.OPTIMIZER_SCOPE,
            private_edge_data=True)
    else:
        best_arch.update_edges(
            update_func=lambda edge: edge.data.set('op', DropPathWrapper(edge.data.op)),
            scope=best_arch.OPTIMIZER_SCOPE,
            private_edge_data=True)

    # train from scratch
    epochs = self.config.evaluation.epochs
    for e in range(start_epoch, epochs):
        # update drop path probability (linearly increased over the epochs)
        drop_path_prob = self.config.evaluation.drop_path_prob * e / epochs
        if isinstance(best_arch, _parallel_wrappers):
            best_arch.module.update_edges(
                update_func=lambda edge: edge.data.set('drop_path_prob', drop_path_prob),
                scope=best_arch.module.OPTIMIZER_SCOPE,
                private_edge_data=True)
        else:
            best_arch.update_edges(
                update_func=lambda edge: edge.data.set('drop_path_prob', drop_path_prob),
                scope=best_arch.OPTIMIZER_SCOPE,
                private_edge_data=True)

        # Train queue
        for i, (input_train, target_train) in enumerate(self.train_queue):
            input_train = input_train.to(self.device)
            target_train = target_train.to(self.device, non_blocking=True)

            optim.zero_grad()
            logits_train = best_arch(input_train)
            train_loss = loss(logits_train, target_train)
            if hasattr(best_arch, 'auxilary_logits'):  # darts specific stuff
                log_first_n(logging.INFO, "Auxiliary is used", n=10)
                auxiliary_loss = loss(best_arch.auxilary_logits(), target_train)
                train_loss += self.config.evaluation.auxiliary_weight * auxiliary_loss
            train_loss.backward()
            if grad_clip:
                torch.nn.utils.clip_grad_norm_(best_arch.parameters(), grad_clip)
            optim.step()

            self._store_accuracies(logits_train, target_train, 'train')
            log_every_n_seconds(
                logging.INFO,
                "Epoch {}-{}, Train loss: {:.5}, learning rate: {}".format(
                    e, i, train_loss, scheduler.get_last_lr()), n=5)

            if torch.cuda.is_available():
                log_first_n(logging.INFO, "cuda consumption\n {}".format(
                    torch.cuda.memory_summary()), n=3)

        # Validation queue
        if self.valid_queue:
            for i, (input_valid, target_valid) in enumerate(self.valid_queue):
                input_valid = input_valid.to(self.device).float()
                target_valid = target_valid.to(self.device, non_blocking=True).float()

                # just log the validation accuracy; no gradients needed here
                with torch.no_grad():
                    logits_valid = best_arch(input_valid)
                    self._store_accuracies(logits_valid, target_valid, 'val')

        scheduler.step()
        self.periodic_checkpointer.step(e)
        self._log_and_reset_accuracies(e)
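# ---------------------------------------------------------------------------
# Illustrative sketch: main_worker repeats an isinstance check before every
# update_edges call because DataParallel and DistributedDataParallel expose
# the wrapped network under .module. A small unwrap helper (hypothetical,
# not part of NASLib) would factor that pattern out:
# ---------------------------------------------------------------------------
import torch


def unwrap_model(model):
    # Both wrappers keep the original network in .module; plain modules are
    # returned unchanged.
    if isinstance(model, (torch.nn.DataParallel,
                          torch.nn.parallel.DistributedDataParallel)):
        return model.module
    return model


# unwrap_model(best_arch).update_edges(
#     update_func=..., scope=unwrap_model(best_arch).OPTIMIZER_SCOPE,
#     private_edge_data=True)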