def train(cfg):
    # Set up environment.
    init_distributed_training(cfg)
    local_rank_id = get_local_rank()

    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED + 10 * local_rank_id)
    torch.manual_seed(cfg.RNG_SEED + 10 * local_rank_id)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)
    logger.info('init start')

    # Epoch counting starts from 1.
    arguments = {"cur_epoch": 1}

    device = get_device(local_rank_id)
    model = build_recognizer(cfg, device)
    criterion = build_criterion(cfg, device)
    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = CheckPointer(model,
                                optimizer=optimizer,
                                scheduler=lr_scheduler,
                                save_dir=cfg.OUTPUT_DIR,
                                save_to_disk=True)
    if cfg.TRAIN.RESUME:
        logger.info('resume start')
        extra_checkpoint_data = checkpointer.load(map_location=device)
        if isinstance(extra_checkpoint_data, dict):
            arguments['cur_epoch'] = extra_checkpoint_data['cur_epoch']
            if cfg.LR_SCHEDULER.IS_WARMUP:
                logger.info('warmup start')
                if lr_scheduler.finished:
                    optimizer.load_state_dict(lr_scheduler.after_scheduler.optimizer.state_dict())
                else:
                    optimizer.load_state_dict(lr_scheduler.optimizer.state_dict())
                lr_scheduler.optimizer = optimizer
                lr_scheduler.after_scheduler.optimizer = optimizer
                logger.info('warmup end')
        logger.info('resume end')

    data_loader = build_dataloader(cfg, is_train=True)

    logger.info('init end')
    synchronize()
    do_train(cfg, arguments,
             data_loader, model, criterion, optimizer, lr_scheduler,
             checkpointer, device)
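# The resume branch above touches lr_scheduler.finished and lr_scheduler.after_scheduler,
# which implies build_lr_scheduler returns a warmup wrapper around the base scheduler.
# A minimal, hypothetical sketch of such a wrapper (the real builder may differ):
import torch


class WarmupWrapper(torch.optim.lr_scheduler._LRScheduler):
    """Linearly scale the LR for `warmup_epochs`, then hand over to `after_scheduler`."""

    def __init__(self, optimizer, warmup_epochs, after_scheduler):
        self.warmup_epochs = warmup_epochs
        self.after_scheduler = after_scheduler
        self.finished = False  # becomes True once warmup has completed
        super().__init__(optimizer)

    def get_lr(self):
        if self.last_epoch >= self.warmup_epochs:
            self.finished = True
            return self.after_scheduler.get_last_lr()
        scale = (self.last_epoch + 1) / self.warmup_epochs
        return [base_lr * scale for base_lr in self.base_lrs]

    def step(self, epoch=None):
        if self.finished:
            self.after_scheduler.step(epoch)
        else:
            super().step(epoch)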
def test(cfg):
    # Set up environment.
    init_distributed_training(cfg)

    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True

    device = get_device(local_rank=get_local_rank())
    model = build_recognizer(cfg, device=device)

    synchronize()
    do_evaluation(cfg, model, device)
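# For context, a hypothetical entry point that dispatches to train()/test() above.
# The argument names and the get_cfg_defaults() helper are assumptions, not part of
# the original code; cfg is assumed to be a yacs-style config node.
import argparse


def main():
    parser = argparse.ArgumentParser(description='Train or test a recognizer')
    parser.add_argument('--config-file', type=str, default='', help='path to a YAML config')
    parser.add_argument('--eval-only', action='store_true', help='run evaluation instead of training')
    args = parser.parse_args()

    cfg = get_cfg_defaults()  # assumed helper returning the default config
    if args.config_file:
        cfg.merge_from_file(args.config_file)
    cfg.freeze()

    if args.eval_only:
        test(cfg)
    else:
        train(cfg)


if __name__ == '__main__':
    main()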
def train(gpu, args, cfg):
    rank = args.nr * args.gpus + gpu
    setup(rank, args.world_size)

    logger = setup_logger(cfg.TRAIN.NAME)
    arguments = {"iteration": 0}

    torch.cuda.set_device(gpu)
    device = torch.device(f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu')
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}

    model = build_model(cfg, gpu, map_location=map_location)
    criterion = build_criterion(cfg)
    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = CheckPointer(model,
                                optimizer=optimizer,
                                scheduler=lr_scheduler,
                                save_dir=cfg.OUTPUT.DIR,
                                save_to_disk=True,
                                logger=logger)
    if args.resume:
        if is_master_proc():
            logger.info('resume ...')
        extra_checkpoint_data = checkpointer.load(map_location=map_location, rank=rank)
        if extra_checkpoint_data != dict():
            arguments['iteration'] = extra_checkpoint_data['iteration']
            if cfg.LR_SCHEDULER.IS_WARMUP:
                if is_master_proc():
                    logger.info('warmup ...')
                if lr_scheduler.finished:
                    optimizer.load_state_dict(lr_scheduler.after_scheduler.optimizer.state_dict())
                else:
                    optimizer.load_state_dict(lr_scheduler.optimizer.state_dict())
                lr_scheduler.optimizer = optimizer
                lr_scheduler.after_scheduler.optimizer = optimizer

    data_loader = build_dataloader(cfg, is_train=True, start_iter=arguments['iteration'])

    synchronize()
    do_train(args, cfg, arguments,
             data_loader, model, criterion, optimizer, lr_scheduler,
             checkpointer, device, logger)
    cleanup()
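# train(gpu, args, cfg) above is written to be spawned once per local GPU. The setup()
# and cleanup() it calls are the usual process-group boilerplate; a sketch of them and
# of a launcher, with placeholder MASTER_ADDR/MASTER_PORT values and assumed
# parse_args()/load_config() helpers:
import os

import torch.distributed as dist
import torch.multiprocessing as mp


def setup(rank, world_size):
    # Rendezvous address; in multi-node runs this must point at node 0.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '12355')
    dist.init_process_group('nccl', rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


if __name__ == '__main__':
    args = parse_args()      # assumed: provides nodes, nr, gpus, resume, ...
    cfg = load_config(args)  # assumed config loader
    args.world_size = args.gpus * args.nodes
    # One worker per local GPU; mp.spawn passes the local index as the first argument.
    mp.spawn(train, nprocs=args.gpus, args=(args, cfg))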
def do_train(args, cfg, arguments,
             data_loader, model, criterion, optimizer, lr_scheduler,
             checkpointer, device, logger):
    meters = MetricLogger()
    summary_writer = None
    if is_master_proc():
        logger.info("Start training ...")
        if args.use_tensorboard:
            from torch.utils.tensorboard import SummaryWriter
            summary_writer = SummaryWriter(log_dir=os.path.join(cfg.OUTPUT.DIR, 'tf_logs'))

    model.train()
    start_iter = arguments['iteration']
    max_iter = cfg.TRAIN.MAX_ITER

    synchronize()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets) in enumerate(data_loader, start_iter):
        synchronize()
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = targets.to(device)

        outputs = model(images)
        loss = criterion(outputs, targets)

        # Compute top-k accuracy.
        topk_list = topk_accuracy(outputs, targets, topk=(1, 5))
        meters.update(loss=loss / len(targets), acc_1=topk_list[0], acc_5=topk_list[1])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        if iteration % len(data_loader) == 0 and hasattr(data_loader.batch_sampler, "set_epoch"):
            data_loader.batch_sampler.set_epoch(iteration)

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if is_master_proc():
            if iteration % args.log_step == 0:
                eta_seconds = meters.time.global_avg * (max_iter - iteration)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                logger.info(
                    meters.delimiter.join([
                        "iter: {iter:06d}",
                        "lr: {lr:.5f}",
                        '{meters}',
                        "eta: {eta}",
                        'mem: {mem}M',
                    ]).format(
                        iter=iteration,
                        lr=optimizer.param_groups[0]['lr'],
                        meters=str(meters),
                        eta=eta_string,
                        mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0),
                    ))
                if summary_writer:
                    global_step = iteration
                    for name, meter in meters.meters.items():
                        summary_writer.add_scalar('{}/avg'.format(name), float(meter.avg),
                                                  global_step=global_step)
                        summary_writer.add_scalar('{}/global_avg'.format(name), meter.global_avg,
                                                  global_step=global_step)
                    summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                              global_step=global_step)

            if not args.stop_save and iteration % args.save_step == 0:
                checkpointer.save("model_{:06d}".format(iteration), **arguments)

            if not args.stop_eval and args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
                eval_results = do_evaluation(cfg, model, device, iteration=iteration)
                if summary_writer:
                    for key, value in eval_results.items():
                        summary_writer.add_scalar(f'eval/{key}', value, global_step=iteration)
                model.train()

    if is_master_proc() and not args.stop_eval:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)

        if summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}', value, global_step=iteration)
            summary_writer.close()

    checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    if is_master_proc():
        logger.info("Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / max_iter))
    return model
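# topk_accuracy() used above is not shown; a common implementation, sketched under the
# assumption that outputs are raw per-class scores of shape (N, C) and targets are class
# indices of shape (N,):
import torch


def topk_accuracy(outputs, targets, topk=(1,)):
    """Return one accuracy value (fraction in [0, 1]) per k in `topk`."""
    with torch.no_grad():
        maxk = max(topk)
        # Indices of the maxk highest-scoring classes per sample, shape (N, maxk).
        _, pred = outputs.topk(maxk, dim=1, largest=True, sorted=True)
        correct = pred.eq(targets.view(-1, 1))  # (N, maxk) boolean matrix
        return [correct[:, :k].float().sum() / targets.size(0) for k in topk]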
def do_train(cfg, arguments,
             data_loader, model, criterion, optimizer, lr_scheduler,
             checkpointer, device):
    logger = logging.setup_logging(__name__)
    meters = MetricLogger()
    summary_writer = None

    use_tensorboard = cfg.TRAIN.USE_TENSORBOARD
    log_step = cfg.TRAIN.LOG_STEP
    save_step = cfg.TRAIN.SAVE_STEP
    eval_step = cfg.TRAIN.EVAL_STEP
    max_iter = cfg.TRAIN.MAX_ITER
    start_iter = arguments['iteration']

    if is_master_proc() and use_tensorboard:
        from torch.utils.tensorboard import SummaryWriter
        summary_writer = SummaryWriter(log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))

    evaluator = data_loader.dataset.evaluator

    synchronize()
    start_training_time = time.time()
    end = time.time()
    logger.info("Start training ...")
    model.train()
    for iteration, (images, targets) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device=device, non_blocking=True)
        targets = targets.to(device=device, non_blocking=True)

        output_dict = model(images)
        loss_dict = criterion(output_dict, targets)
        loss = loss_dict['loss']

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        acc_list = evaluator.evaluate_train(output_dict, targets)
        update_meters(cfg.NUM_GPUS, meters, loss_dict, acc_list)

        if iteration % len(data_loader) == 0 and hasattr(data_loader.batch_sampler, "set_epoch"):
            data_loader.batch_sampler.set_epoch(iteration)

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    '{meters}',
                    "eta: {eta}",
                    'mem: {mem}M',
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]['lr'],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0),
                ))

        if is_master_proc():
            if summary_writer:
                global_step = iteration
                for name, meter in meters.meters.items():
                    summary_writer.add_scalar('{}/avg'.format(name), float(meter.avg),
                                              global_step=global_step)
                    summary_writer.add_scalar('{}/global_avg'.format(name), meter.global_avg,
                                              global_step=global_step)
                summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

            if save_step > 0 and iteration % save_step == 0:
                checkpointer.save("model_{:06d}".format(iteration), **arguments)

        if eval_step > 0 and iteration % eval_step == 0 and not iteration == max_iter:
            eval_results = do_evaluation(cfg, model, device, iteration=iteration)
            model.train()
            if is_master_proc() and summary_writer:
                for key, value in eval_results.items():
                    summary_writer.add_scalar(f'eval/{key}', value, global_step=iteration)

    if eval_step > 0:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)

        if is_master_proc() and summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}', value, global_step=arguments["iteration"])
            summary_writer.close()

    checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
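# update_meters() reduces the per-GPU statistics before they reach the MetricLogger.
# A hedged sketch, assuming every value in loss_dict / acc_list is a CUDA scalar tensor
# and that averaging over num_gpus processes is the desired behaviour (the real helper
# may differ):
import torch.distributed as dist


def update_meters(num_gpus, meters, loss_dict, acc_list):
    values = list(loss_dict.values()) + list(acc_list)
    if num_gpus > 1:
        # Sum across processes, then divide, so every rank logs identical numbers.
        values = [v.clone().detach() for v in values]
        for v in values:
            dist.all_reduce(v, op=dist.ReduceOp.SUM)
        values = [v / num_gpus for v in values]

    # Assumed naming: loss keys as-is, accuracies as acc_top1, acc_top2, ...
    names = list(loss_dict.keys()) + ['acc_top{}'.format(i) for i, _ in enumerate(acc_list, 1)]
    meters.update(**{name: value.item() for name, value in zip(names, values)})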
def do_train(cfg, arguments,
             data_loader, model, criterion, optimizer, lr_scheduler,
             checkpointer, device):
    meters = MetricLogger()
    evaluator = data_loader.dataset.evaluator
    summary_writer = None

    use_tensorboard = cfg.TRAIN.USE_TENSORBOARD
    if is_master_proc() and use_tensorboard:
        from torch.utils.tensorboard import SummaryWriter
        summary_writer = SummaryWriter(log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))

    log_step = cfg.TRAIN.LOG_STEP
    save_epoch = cfg.TRAIN.SAVE_EPOCH
    eval_epoch = cfg.TRAIN.EVAL_EPOCH
    max_epoch = cfg.TRAIN.MAX_EPOCH

    start_epoch = arguments['cur_epoch']
    epoch_iters = len(data_loader)
    max_iter = (max_epoch - start_epoch) * epoch_iters

    synchronize()
    model.train()
    logger.info("Start training ...")
    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch))
    start_training_time = time.time()
    end = time.time()
    for cur_epoch in range(start_epoch, max_epoch + 1):
        shuffle_dataset(data_loader, cur_epoch)
        for iteration, (images, targets) in enumerate(data_loader):
            images = images.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)

            output_dict = model(images)
            loss_dict = criterion(output_dict, targets)
            loss = loss_dict['loss']

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            acc_list = evaluator.evaluate_train(output_dict, targets)
            update_stats(cfg.NUM_GPUS, meters, loss_dict, acc_list)

            batch_time = time.time() - end
            end = time.time()
            meters.update(time=batch_time)
            if (iteration + 1) % log_step == 0:
                logger.info(
                    log_iter_stats(iteration, epoch_iters, cur_epoch, max_epoch,
                                   optimizer.param_groups[0]['lr'], meters))
            if is_master_proc() and summary_writer:
                global_step = (cur_epoch - 1) * epoch_iters + (iteration + 1)
                for name, meter in meters.meters.items():
                    summary_writer.add_scalar('{}/avg'.format(name), float(meter.avg),
                                              global_step=global_step)
                    summary_writer.add_scalar('{}/global_avg'.format(name), meter.global_avg,
                                              global_step=global_step)
                summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

        logger.info(
            log_epoch_stats(epoch_iters, cur_epoch, max_epoch,
                            optimizer.param_groups[0]['lr'], meters))
        arguments["cur_epoch"] = cur_epoch
        lr_scheduler.step()

        if is_master_proc() and save_epoch > 0 and cur_epoch % save_epoch == 0 and cur_epoch != max_epoch:
            checkpointer.save("model_{:04d}".format(cur_epoch), **arguments)
        if eval_epoch > 0 and cur_epoch % eval_epoch == 0 and cur_epoch != max_epoch:
            eval_results = do_evaluation(cfg, model, device, cur_epoch=cur_epoch)
            model.train()
            if is_master_proc() and summary_writer:
                for key, value in eval_results.items():
                    summary_writer.add_scalar(f'eval/{key}', value, global_step=cur_epoch + 1)

    if eval_epoch > 0:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)

        if is_master_proc() and summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}', value, global_step=arguments["cur_epoch"])
            summary_writer.close()

    if is_master_proc():
        checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
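# shuffle_dataset() called at the top of each epoch is not shown; with a DistributedSampler
# it typically just reseeds the sampler so every epoch (and every rank) sees a different
# shuffle. A sketch under that assumption:
from torch.utils.data.distributed import DistributedSampler


def shuffle_dataset(data_loader, cur_epoch):
    sampler = data_loader.sampler
    if isinstance(sampler, DistributedSampler):
        sampler.set_epoch(cur_epoch)
    elif hasattr(data_loader.batch_sampler, 'set_epoch'):
        # Fall back to a custom batch sampler that exposes the same hook.
        data_loader.batch_sampler.set_epoch(cur_epoch)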