def inference(cfg, model, device, **kwargs):
    """Evaluate ``model`` on the configured test dataset.

    Runs every batch through ``compute_on_dataset``, collects metrics from the
    dataset's evaluator, and (on the master process only) writes the textual
    result summary to a file under ``cfg.OUTPUT_DIR``.

    Args:
        cfg: project config node (reads DATASETS.TEST.NAME, NUM_GPUS, OUTPUT_DIR).
        model: the model to evaluate.
        device: device the batches are processed on.
        **kwargs: optional ``cur_epoch`` — when given, the result file is named
            after the epoch instead of a timestamp.

    Returns:
        dict: accuracy metrics produced by the evaluator.
    """
    cur_epoch = kwargs.get('cur_epoch', None)
    dataset_name = cfg.DATASETS.TEST.NAME
    num_gpus = cfg.NUM_GPUS

    data_loader = build_dataloader(cfg, is_train=False)
    dataset = data_loader.dataset
    evaluator = data_loader.dataset.evaluator
    evaluator.clean()

    logger.info("Evaluating {} dataset({} video clips):".format(dataset_name, len(dataset)))

    # Only the master process wraps the loader in a tqdm progress bar; every
    # process runs the same per-batch computation.
    batch_source = tqdm(data_loader) if is_master_proc() else data_loader
    for images, targets in batch_source:
        compute_on_dataset(images, targets, device, model, num_gpus, evaluator)

    result_str, acc_dict = evaluator.get()
    logger.info(result_str)

    if is_master_proc():
        output_dir = cfg.OUTPUT_DIR
        if cur_epoch is None:
            stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
            result_path = os.path.join(output_dir, 'result_{}.txt'.format(stamp))
        else:
            result_path = os.path.join(output_dir, 'result_{:04d}.txt'.format(cur_epoch))
        with open(result_path, "w") as f:
            f.write(result_str)

    return acc_dict
def setup_logging(output_dir=None):
    """
    Sets up the logging for multiple processes. Only enable the logging for
    the master process, and suppress logging for the non-master processes.

    Args:
        output_dir (str, optional): when given and this process is the master
            of the whole world, a ``stdout.log`` file handler is also attached.
    """
    # Fix: removed the unused ``_FORMAT`` local that was assigned but never
    # referenced (the real format lives in ``plain_formatter`` below).
    if du.is_master_proc():
        # Enable logging for the master process.
        logging.root.handlers = []
    else:
        # Suppress logging for non-master processes.
        _suppress_print()

    # Configure the root logger; propagation is disabled so records are not
    # duplicated by ancestor handlers.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
    plain_formatter = logging.Formatter(
        "[%(asctime)s][%(levelname)s] %(filename)s: %(lineno)3d: %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )

    if du.is_master_proc():
        # Master process echoes everything to stdout.
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(plain_formatter)
        logger.addHandler(ch)

    if output_dir is not None and du.is_master_proc(du.get_world_size()):
        # Only the global master writes the on-disk log file.
        filename = os.path.join(output_dir, "stdout.log")
        fh = logging.FileHandler(filename)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(plain_formatter)
        logger.addHandler(fh)
def setup_logging(name, output_dir=None):
    """
    Sets up the logging for multiple processes. Only enable the logging for
    the master process, and suppress logging for the non-master processes.

    Args:
        name (str): logger name.
        output_dir (str, optional): when given, a ``log.txt`` file handler is
            attached in addition to the stdout handler.

    Returns:
        A real configured logger on the master process; a no-op ``NllLogger``
        on every other process.
    """
    if not du.is_master_proc(du.get_world_size()):
        # Suppress logging for non-master processes.
        _suppress_print()
        logger = NllLogger(f'{name}.{du.get_rank()}')
        return logger

    logger = logging.getLogger(name)
    logging.root.handlers = []
    # Fix: iterate over a *copy* of the handler list. removeHandler() mutates
    # logger.handlers in place, so iterating it directly skips every other
    # handler and can leave stale handlers attached.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    plain_formatter = logging.Formatter(
        "[%(asctime)s][%(levelname)s] %(filename)s: %(lineno)3d: %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )

    # Console handler for the master process.
    ch = logging.StreamHandler(stream=sys.stdout)
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(plain_formatter)
    logger.addHandler(ch)

    if output_dir:
        # Persist the same records to <output_dir>/log.txt.
        fh = logging.FileHandler(os.path.join(output_dir, 'log.txt'))
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(plain_formatter)
        logger.addHandler(fh)

    return logger
def train(gpu, args, cfg):
    """Worker entry point for one distributed training process (one GPU).

    Initializes the process group, builds model/criterion/optimizer/scheduler,
    optionally resumes from a checkpoint, then hands off to ``do_train``.

    Args:
        gpu: local GPU index of this worker.
        args: launcher namespace (reads ``nr``, ``gpus``, ``world_size``,
            ``resume``).
        cfg: project config node.
    """
    # Global rank = node offset * gpus-per-node + local gpu index.
    rank = args.nr * args.gpus + gpu
    setup(rank, args.world_size)
    logger = setup_logger(cfg.TRAIN.NAME)

    arguments = {"iteration": 0}

    torch.cuda.set_device(gpu)
    device = torch.device(f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu')
    # Remap tensors saved on cuda:0 onto this worker's device when loading.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}

    model = build_model(cfg, gpu, map_location=map_location)
    criterion = build_criterion(cfg)
    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)
    checkpointer = CheckPointer(model,
                                optimizer=optimizer,
                                scheduler=lr_scheduler,
                                save_dir=cfg.OUTPUT.DIR,
                                save_to_disk=True,
                                logger=logger)

    if args.resume:
        if is_master_proc():
            logger.info('resume ...')
        extra_checkpoint_data = checkpointer.load(map_location=map_location, rank=rank)
        if extra_checkpoint_data != dict():
            arguments['iteration'] = extra_checkpoint_data['iteration']
            if cfg.LR_SCHEDULER.IS_WARMUP:
                if is_master_proc():
                    logger.info('warmup ...')
                # Rebind the freshly-built optimizer to the restored scheduler
                # state, picking the post-warmup scheduler when warmup is done.
                if lr_scheduler.finished:
                    optimizer.load_state_dict(lr_scheduler.after_scheduler.optimizer.state_dict())
                else:
                    optimizer.load_state_dict(lr_scheduler.optimizer.state_dict())
                lr_scheduler.optimizer = optimizer
                lr_scheduler.after_scheduler.optimizer = optimizer

    data_loader = build_dataloader(cfg, is_train=True, start_iter=arguments['iteration'])

    synchronize()
    do_train(args, cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device, logger)
    cleanup()
def build_model(cfg, gpu, map_location=None, logger=None):
    """Build the recognizer model for one worker.

    Instantiates the configured recognizer on ``gpu``, optionally converts its
    BatchNorm layers to synchronized BN, loads pretrained weights when
    configured, and wraps the model in DistributedDataParallel when running
    with more than one process.

    Args:
        cfg: project config node (reads MODEL.RECOGNIZER.NAME, MODEL.SYNC_BN,
            MODEL.PRETRAINED).
        gpu: local GPU index for this process.
        map_location: checkpoint device remapping passed to the loader.
        logger: optional logger; only the master process logs through it.

    Returns:
        The (possibly DDP-wrapped) model.
    """
    recognizer_cls = registry.RECOGNIZER[cfg.MODEL.RECOGNIZER.NAME]
    model = recognizer_cls(cfg, map_location=map_location).cuda(gpu)

    num_procs = du.get_world_size()
    proc_rank = du.get_rank()

    # Sync-BN only makes sense with multiple processes.
    if cfg.MODEL.SYNC_BN and num_procs > 1:
        group = simple_group_split(num_procs, proc_rank, 1)
        convert_sync_bn(model, group, gpu=gpu)

    pretrained = cfg.MODEL.PRETRAINED
    if pretrained != "":
        if du.is_master_proc() and logger:
            logger.info(f'load pretrained: {cfg.MODEL.PRETRAINED}')
        CheckPointer(model, logger=logger).load(pretrained, map_location=map_location, rank=proc_rank)

    if du.get_world_size() > 1:
        model = DDP(model,
                    device_ids=[gpu],
                    output_device=gpu,
                    find_unused_parameters=True)

    return model
def do_train(args, cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device, logger):
    """Iteration-based training loop (argparse-driven variant).

    Trains ``model`` over ``data_loader`` starting from
    ``arguments['iteration']`` up to ``cfg.TRAIN.MAX_ITER``, logging metrics,
    periodically checkpointing and evaluating, then running a final
    evaluation and saving ``model_final``.

    Returns:
        The trained model.
    """
    meters = MetricLogger()
    summary_writer = None

    if is_master_proc():
        logger.info("Start training ...")
        # TensorBoard writer exists only on the master process.
        if args.use_tensorboard:
            from torch.utils.tensorboard import SummaryWriter
            summary_writer = SummaryWriter(
                log_dir=os.path.join(cfg.OUTPUT.DIR, 'tf_logs'))

    model.train()
    start_iter = arguments['iteration']
    max_iter = cfg.TRAIN.MAX_ITER

    synchronize()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets) in enumerate(data_loader, start_iter):
        # NOTE(review): synchronize() every iteration keeps the processes in
        # lockstep but adds per-step overhead — presumably intentional here.
        synchronize()
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = targets.to(device)

        outputs = model(images)
        loss = criterion(outputs, targets)

        # compute top-k accuracy
        topk_list = topk_accuracy(outputs, targets, topk=(1, 5))
        # Loss is normalized by batch size before being tracked.
        meters.update(loss=loss / len(targets), acc_1=topk_list[0], acc_5=topk_list[1])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Advance the sampler's epoch once per full pass over the loader so
        # shuffling differs across epochs (if the sampler supports it).
        if iteration % len(data_loader) == 0 and hasattr(
                data_loader.batch_sampler, "set_epoch"):
            data_loader.batch_sampler.set_epoch(iteration)

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if is_master_proc():
            if iteration % args.log_step == 0:
                # ETA from the global average time per iteration.
                eta_seconds = meters.time.global_avg * (max_iter - iteration)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                logger.info(
                    meters.delimiter.join([
                        "iter: {iter:06d}",
                        "lr: {lr:.5f}",
                        '{meters}',
                        "eta: {eta}",
                        'mem: {mem}M',
                    ]).format(
                        iter=iteration,
                        lr=optimizer.param_groups[0]['lr'],
                        meters=str(meters),
                        eta=eta_string,
                        mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0),
                    ))
                if summary_writer:
                    global_step = iteration
                    for name, meter in meters.meters.items():
                        summary_writer.add_scalar('{}/avg'.format(name),
                                                  float(meter.avg),
                                                  global_step=global_step)
                        summary_writer.add_scalar('{}/global_avg'.format(name),
                                                  meter.global_avg,
                                                  global_step=global_step)
                    summary_writer.add_scalar('lr',
                                              optimizer.param_groups[0]['lr'],
                                              global_step=global_step)
            if not args.stop_save and iteration % args.save_step == 0:
                checkpointer.save("model_{:06d}".format(iteration), **arguments)
            # Skip intermediate eval on the very last iteration; the final
            # evaluation below covers it.
            if not args.stop_eval and args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
                eval_results = do_evaluation(cfg, model, device, iteration=iteration)
                if summary_writer:
                    for key, value in eval_results.items():
                        summary_writer.add_scalar(f'eval/{key}', value, global_step=iteration)
                # do_evaluation puts the model in eval mode; switch back.
                model.train()

    if is_master_proc() and not args.stop_eval:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)

        if summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}', value, global_step=iteration)
            summary_writer.close()
        checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    if is_master_proc():
        logger.info("Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / max_iter))
    return model
def do_train(cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device):
    """Iteration-based training loop (config-driven variant).

    Trains from ``arguments['iteration']`` to ``cfg.TRAIN.MAX_ITER``, logging
    every ``cfg.TRAIN.LOG_STEP`` iterations, checkpointing every
    ``cfg.TRAIN.SAVE_STEP``, evaluating every ``cfg.TRAIN.EVAL_STEP``, then
    running a final evaluation and saving ``model_final``.

    Returns:
        The trained model.
    """
    # NOTE(review): ``logging`` here is presumably a project module exposing
    # setup_logging (stdlib logging has no such function) — confirm import.
    logger = logging.setup_logging(__name__)
    meters = MetricLogger()
    summary_writer = None

    use_tensorboard = cfg.TRAIN.USE_TENSORBOARD
    log_step = cfg.TRAIN.LOG_STEP
    save_step = cfg.TRAIN.SAVE_STEP
    eval_step = cfg.TRAIN.EVAL_STEP
    max_iter = cfg.TRAIN.MAX_ITER
    start_iter = arguments['iteration']

    # TensorBoard writer exists only on the master process.
    if is_master_proc() and use_tensorboard:
        from torch.utils.tensorboard import SummaryWriter
        summary_writer = SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))

    evaluator = data_loader.dataset.evaluator

    synchronize()
    start_training_time = time.time()
    end = time.time()
    logger.info("Start training ...")
    model.train()
    for iteration, (images, targets) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device=device, non_blocking=True)
        targets = targets.to(device=device, non_blocking=True)

        output_dict = model(images)
        loss_dict = criterion(output_dict, targets)
        loss = loss_dict['loss']

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Training-time accuracy from the dataset's evaluator; meters are
        # reduced across GPUs inside update_meters.
        acc_list = evaluator.evaluate_train(output_dict, targets)
        update_meters(cfg.NUM_GPUS, meters, loss_dict, acc_list)

        # Advance the sampler's epoch once per full pass over the loader.
        if iteration % len(data_loader) == 0 and hasattr(
                data_loader.batch_sampler, "set_epoch"):
            data_loader.batch_sampler.set_epoch(iteration)

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % log_step == 0:
            # ETA from the global average time per iteration.
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    '{meters}',
                    "eta: {eta}",
                    'mem: {mem}M',
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]['lr'],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0),
                ))
            if is_master_proc():
                if summary_writer:
                    global_step = iteration
                    for name, meter in meters.meters.items():
                        summary_writer.add_scalar('{}/avg'.format(name),
                                                  float(meter.avg),
                                                  global_step=global_step)
                        summary_writer.add_scalar('{}/global_avg'.format(name),
                                                  meter.global_avg,
                                                  global_step=global_step)
                    summary_writer.add_scalar('lr',
                                              optimizer.param_groups[0]['lr'],
                                              global_step=global_step)
        if save_step > 0 and iteration % save_step == 0:
            checkpointer.save("model_{:06d}".format(iteration), **arguments)
        # Skip intermediate eval on the last iteration; final eval covers it.
        if eval_step > 0 and iteration % eval_step == 0 and not iteration == max_iter:
            eval_results = do_evaluation(cfg, model, device, iteration=iteration)
            # do_evaluation leaves the model in eval mode; switch back.
            model.train()
            if is_master_proc() and summary_writer:
                for key, value in eval_results.items():
                    summary_writer.add_scalar(f'eval/{key}', value, global_step=iteration)

    if eval_step > 0:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)
        if is_master_proc() and summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}', value, global_step=arguments["iteration"])
            summary_writer.close()
    checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
def do_train(cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device):
    """Epoch-based training loop.

    Trains from ``arguments['cur_epoch']`` to ``cfg.TRAIN.MAX_EPOCH``
    (inclusive), logging every ``cfg.TRAIN.LOG_STEP`` iterations,
    checkpointing every ``cfg.TRAIN.SAVE_EPOCH`` epochs, evaluating every
    ``cfg.TRAIN.EVAL_EPOCH`` epochs, then running a final evaluation and
    saving ``model_final`` on the master process.

    Returns:
        The trained model.
    """
    meters = MetricLogger()
    evaluator = data_loader.dataset.evaluator
    summary_writer = None

    # TensorBoard writer exists only on the master process.
    use_tensorboard = cfg.TRAIN.USE_TENSORBOARD
    if is_master_proc() and use_tensorboard:
        from torch.utils.tensorboard import SummaryWriter
        summary_writer = SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))

    log_step = cfg.TRAIN.LOG_STEP
    save_epoch = cfg.TRAIN.SAVE_EPOCH
    eval_epoch = cfg.TRAIN.EVAL_EPOCH
    max_epoch = cfg.TRAIN.MAX_EPOCH
    start_epoch = arguments['cur_epoch']
    epoch_iters = len(data_loader)
    # Total remaining iterations (used for the s/it average at the end).
    max_iter = (max_epoch - start_epoch) * epoch_iters

    synchronize()
    model.train()
    logger.info("Start training ...")
    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch))
    start_training_time = time.time()
    end = time.time()
    for cur_epoch in range(start_epoch, max_epoch + 1):
        # Reseed the sampler so shuffling differs per epoch.
        shuffle_dataset(data_loader, cur_epoch)
        for iteration, (images, targets) in enumerate(data_loader):
            images = images.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)

            output_dict = model(images)
            loss_dict = criterion(output_dict, targets)
            loss = loss_dict['loss']

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Training-time accuracy from the dataset's evaluator; stats are
            # reduced across GPUs inside update_stats.
            acc_list = evaluator.evaluate_train(output_dict, targets)
            update_stats(cfg.NUM_GPUS, meters, loss_dict, acc_list)

            batch_time = time.time() - end
            end = time.time()
            meters.update(time=batch_time)
            if (iteration + 1) % log_step == 0:
                logger.info(
                    log_iter_stats(iteration, epoch_iters, cur_epoch, max_epoch,
                                   optimizer.param_groups[0]['lr'], meters))
            if is_master_proc() and summary_writer:
                # Global step counts iterations across epochs (1-based epochs).
                global_step = (cur_epoch - 1) * epoch_iters + (iteration + 1)
                for name, meter in meters.meters.items():
                    summary_writer.add_scalar('{}/avg'.format(name),
                                              float(meter.avg),
                                              global_step=global_step)
                    summary_writer.add_scalar('{}/global_avg'.format(name),
                                              meter.global_avg,
                                              global_step=global_step)
                summary_writer.add_scalar('lr',
                                          optimizer.param_groups[0]['lr'],
                                          global_step=global_step)
        logger.info(
            log_epoch_stats(epoch_iters, cur_epoch, max_epoch,
                            optimizer.param_groups[0]['lr'], meters))

        arguments["cur_epoch"] = cur_epoch
        # Scheduler steps once per epoch in this variant.
        lr_scheduler.step()
        if is_master_proc(
        ) and save_epoch > 0 and cur_epoch % save_epoch == 0 and cur_epoch != max_epoch:
            checkpointer.save("model_{:04d}".format(cur_epoch), **arguments)
        # Skip intermediate eval on the last epoch; final eval covers it.
        if eval_epoch > 0 and cur_epoch % eval_epoch == 0 and cur_epoch != max_epoch:
            eval_results = do_evaluation(cfg, model, device, cur_epoch=cur_epoch)
            # do_evaluation leaves the model in eval mode; switch back.
            model.train()
            if is_master_proc() and summary_writer:
                for key, value in eval_results.items():
                    summary_writer.add_scalar(f'eval/{key}', value, global_step=cur_epoch + 1)

    if eval_epoch > 0:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)
        if is_master_proc() and summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}', value, global_step=arguments["cur_epoch"])
            summary_writer.close()
    if is_master_proc():
        checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model