def setup_logging(output_dir=None):
    """
    Sets up the logging for multiple processes. Only enable the logging for the
    master process, and suppress logging for the non-master processes.
    """
    # Set up logging format.
    _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s"

    if du.is_master_proc():
        # Enable logging for the master process.
        logging.root.handlers = []
    else:
        # Suppress logging for non-master processes.
        _suppress_print()

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
    plain_formatter = logging.Formatter(
        "[%(asctime)s][%(levelname)s] %(filename)s: %(lineno)3d: %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )

    if du.is_master_proc():
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(plain_formatter)
        logger.addHandler(ch)

    if output_dir is not None and du.is_master_proc(du.get_world_size()):
        filename = os.path.join(output_dir, "stdout.log")
        fh = logging.FileHandler(filename)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(plain_formatter)
        logger.addHandler(fh)
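# A minimal, single-process usage sketch of setup_logging(). In the real
# distributed launch it is assumed to be called once per process right after
# the process group is initialized, and that the `du` helpers fall back
# gracefully when no process group exists. "exp/logs" is a hypothetical output
# directory, not a path taken from this repo.
if __name__ == "__main__":
    os.makedirs("exp/logs", exist_ok=True)
    setup_logging(output_dir="exp/logs")
    logging.getLogger(__name__).info("Logging configured on the master process.")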
def inference(cfg, model, device, **kwargs):
    cur_epoch = kwargs.get('cur_epoch', None)
    dataset_name = cfg.DATASET.NAME
    num_gpus = cfg.NUM_GPUS

    data_loader = build_dataloader(cfg, is_train=False)
    dataset = data_loader.dataset
    evaluator = data_loader.dataset.evaluator
    evaluator.clean()

    logger.info("Evaluating {} dataset({} video clips):".format(dataset_name, len(dataset)))

    if is_master_proc():
        for images, targets in tqdm(data_loader):
            compute_on_dataset(images, targets, device, model, num_gpus, evaluator)
    else:
        for images, targets in data_loader:
            compute_on_dataset(images, targets, device, model, num_gpus, evaluator)

    result_str, acc_dict = evaluator.get()
    logger.info(result_str)

    if is_master_proc():
        output_dir = cfg.OUTPUT_DIR
        if cur_epoch is None:
            result_path = os.path.join(
                output_dir,
                'result_{}.txt'.format(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))
        else:
            result_path = os.path.join(output_dir, 'result_{:04d}.txt'.format(cur_epoch))
        with open(result_path, "w") as f:
            f.write(result_str)

    return acc_dict
def inference(cfg, model, test_data_loader, device, **kwargs):
    cur_epoch = kwargs.get('cur_epoch', None)
    dataset_name = cfg.DATASET.NAME
    num_gpus = cfg.NUM_GPUS

    dataset = test_data_loader.dataset
    evaluator = test_data_loader.dataset.evaluator
    evaluator.clean()

    logger.info("Evaluating {} dataset({} video clips):".format(
        dataset_name, len(dataset)))

    # Optionally wrap the loader so batches arrive already resident on the device.
    data_loader = Prefetcher(test_data_loader, device) \
        if cfg.DATALOADER.PREFETCHER else test_data_loader

    if is_master_proc():
        # Only the master process shows a progress bar.
        for images, targets in tqdm(data_loader):
            if not cfg.DATALOADER.PREFETCHER:
                images = images.to(device=device, non_blocking=True)
                targets = targets.to(device=device, non_blocking=True)
            compute_on_dataset(images, targets, model, num_gpus, evaluator)
    else:
        for images, targets in data_loader:
            if not cfg.DATALOADER.PREFETCHER:
                images = images.to(device=device, non_blocking=True)
                targets = targets.to(device=device, non_blocking=True)
            compute_on_dataset(images, targets, model, num_gpus, evaluator)

    if cfg.DATALOADER.PREFETCHER:
        data_loader.release()

    result_str, acc_dict = evaluator.get()
    logger.info(result_str)

    if is_master_proc():
        output_dir = cfg.OUTPUT_DIR
        if cur_epoch is None:
            result_path = os.path.join(
                output_dir,
                'result_{}.txt'.format(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))
        else:
            result_path = os.path.join(output_dir, 'result_{:04d}.txt'.format(cur_epoch))
        with open(result_path, "w") as f:
            f.write(result_str)

    return acc_dict
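# `Prefetcher` above is repo-specific. The class below is only a minimal
# sketch of what such a wrapper is assumed to do: yield batches already moved
# to the target device (the real implementation presumably also overlaps these
# copies with compute, e.g. via a side CUDA stream) and expose release() for
# cleanup. The name _SketchPrefetcher and its internals are illustrative, not
# this repo's implementation.
class _SketchPrefetcher:
    def __init__(self, loader, device):
        self.loader = loader
        self.device = device

    def __iter__(self):
        # Yield device-resident batches, which is why the loops above skip
        # their explicit .to(device) calls when cfg.DATALOADER.PREFETCHER is on.
        for images, targets in self.loader:
            yield (images.to(self.device, non_blocking=True),
                   targets.to(self.device, non_blocking=True))

    def release(self):
        # Nothing to free in this sketch; the real class may hold pinned
        # buffers or CUDA streams that need explicit cleanup.
        pass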
def do_train(cfg, arguments, train_data_loader, test_data_loader, model, criterion,
             optimizer, lr_scheduler, check_pointer, device):
    meters = MetricLogger()
    evaluator = train_data_loader.dataset.evaluator
    summary_writer = None

    use_tensorboard = cfg.TRAIN.USE_TENSORBOARD
    if is_master_proc() and use_tensorboard:
        from torch.utils.tensorboard import SummaryWriter
        summary_writer = SummaryWriter(log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))

    log_step = cfg.TRAIN.LOG_STEP
    save_epoch = cfg.TRAIN.SAVE_EPOCH
    eval_epoch = cfg.TRAIN.EVAL_EPOCH
    max_epoch = cfg.TRAIN.MAX_EPOCH
    gradient_accumulate_step = cfg.TRAIN.GRADIENT_ACCUMULATE_STEP

    start_epoch = arguments['cur_epoch']
    epoch_iters = len(train_data_loader)
    max_iter = (max_epoch - start_epoch) * epoch_iters
    current_iterations = 0

    if cfg.TRAIN.HYBRID_PRECISION:
        # Creates a GradScaler once at the beginning of training.
        scaler = GradScaler()

    synchronize()
    model.train()
    logger.info("Start training ...")

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch))
    start_training_time = time.time()
    end = time.time()
    for cur_epoch in range(start_epoch, max_epoch + 1):
        shuffle_dataset(train_data_loader, cur_epoch)
        data_loader = Prefetcher(train_data_loader, device) \
            if cfg.DATALOADER.PREFETCHER else train_data_loader

        for iteration, (images, targets) in enumerate(data_loader):
            if not cfg.DATALOADER.PREFETCHER:
                images = images.to(device=device, non_blocking=True)
                targets = targets.to(device=device, non_blocking=True)

            if cfg.TRAIN.HYBRID_PRECISION:
                # Runs the forward pass with autocasting.
                with autocast():
                    output_dict = model(images)
                    loss_dict = criterion(output_dict, targets)
                    loss = loss_dict[KEY_LOSS] / gradient_accumulate_step

                current_iterations += 1
                if current_iterations % gradient_accumulate_step != 0:
                    # Accumulation step: backward only, skipping the gradient
                    # all-reduce for DDP models.
                    if isinstance(model, DistributedDataParallel):
                        # multi-gpu distributed training
                        with model.no_sync():
                            scaler.scale(loss).backward()
                    else:
                        scaler.scale(loss).backward()
                else:
                    # Final accumulation step: backward, then update the weights.
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                    current_iterations = 0
                    optimizer.zero_grad()
            else:
                output_dict = model(images)
                loss_dict = criterion(output_dict, targets)
                loss = loss_dict[KEY_LOSS] / gradient_accumulate_step

                current_iterations += 1
                if current_iterations % gradient_accumulate_step != 0:
                    if isinstance(model, DistributedDataParallel):
                        # multi-gpu distributed training
                        with model.no_sync():
                            loss.backward()
                    else:
                        loss.backward()
                else:
                    loss.backward()
                    optimizer.step()
                    current_iterations = 0
                    optimizer.zero_grad()

            acc_list = evaluator.evaluate_train(output_dict, targets)
            update_stats(cfg.NUM_GPUS, meters, loss_dict, acc_list)

            batch_time = time.time() - end
            end = time.time()
            meters.update(time=batch_time)
            if (iteration + 1) % log_step == 0:
                logger.info(log_iter_stats(
                    iteration, epoch_iters, cur_epoch, max_epoch,
                    optimizer.param_groups[0]['lr'], meters))
            if is_master_proc() and summary_writer:
                global_step = (cur_epoch - 1) * epoch_iters + (iteration + 1)
                for name, meter in meters.meters.items():
                    summary_writer.add_scalar('{}/avg'.format(name), float(meter.avg),
                                              global_step=global_step)
                    summary_writer.add_scalar('{}/global_avg'.format(name), meter.global_avg,
                                              global_step=global_step)
                summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

        if cfg.DATALOADER.PREFETCHER:
            # Release the prefetcher's resources at the end of each epoch.
            data_loader.release()

        logger.info(log_epoch_stats(epoch_iters, cur_epoch, max_epoch,
                                    optimizer.param_groups[0]['lr'], meters))
        arguments["cur_epoch"] = cur_epoch
        lr_scheduler.step()

        if is_master_proc() and save_epoch > 0 and cur_epoch % save_epoch == 0 \
                and cur_epoch != max_epoch:
            check_pointer.save("model_{:04d}".format(cur_epoch), **arguments)
        if eval_epoch > 0 and cur_epoch % eval_epoch == 0 and cur_epoch != max_epoch:
            if cfg.MODEL.NORM.PRECISE_BN:
                # Re-estimate BN statistics with precise-BN before evaluating.
                calculate_and_update_precise_bn(
                    train_data_loader,
                    model,
                    min(cfg.MODEL.NORM.NUM_BATCHES_PRECISE, len(train_data_loader)),
                    cfg.NUM_GPUS > 0,
                )
            eval_results = do_evaluation(cfg, model, test_data_loader, device, cur_epoch=cur_epoch)
            model.train()
            if is_master_proc() and summary_writer:
                for key, value in eval_results.items():
                    summary_writer.add_scalar(f'eval/{key}', value, global_step=cur_epoch + 1)

    if eval_epoch > 0:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, test_data_loader, device)

        if is_master_proc() and summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}', value, global_step=arguments["cur_epoch"])
            summary_writer.close()
    if is_master_proc():
        check_pointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
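# A hedged sketch of how do_train might be wired up from a top-level training
# script. Every build_* helper (other than build_dataloader, which appears
# above) and the CheckPointer class are hypothetical placeholders, not names
# taken from this repo; only do_train's own signature comes from the code above.
#
#   device = torch.device('cuda' if cfg.NUM_GPUS > 0 else 'cpu')
#   model = build_model(cfg).to(device)                  # hypothetical builder
#   criterion = build_criterion(cfg)                     # hypothetical builder
#   optimizer = build_optimizer(cfg, model)              # hypothetical builder
#   lr_scheduler = build_lr_scheduler(cfg, optimizer)    # hypothetical builder
#   train_loader = build_dataloader(cfg, is_train=True)
#   test_loader = build_dataloader(cfg, is_train=False)
#   check_pointer = CheckPointer(model, optimizer, lr_scheduler, cfg.OUTPUT_DIR)  # hypothetical
#   arguments = {'cur_epoch': 1}
#   do_train(cfg, arguments, train_loader, test_loader, model, criterion,
#            optimizer, lr_scheduler, check_pointer, device)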