Esempio n. 1
0
def inference(cfg, model, device, **kwargs):
    """
    Run the model over the test data loader and report the evaluator result.

    Args:
        cfg: project config; DATASETS.TEST.NAME, NUM_GPUS and OUTPUT_DIR are
            read here.
        model: forwarded to compute_on_dataset for every batch.
        device: forwarded to compute_on_dataset for every batch.
        **kwargs: optional 'cur_epoch'; when present it is used to name the
            result file, otherwise a timestamp is used.

    Returns:
        The second element returned by evaluator.get() (the accuracy dict).
    """
    epoch = kwargs.get('cur_epoch', None)
    test_name = cfg.DATASETS.TEST.NAME
    gpu_count = cfg.NUM_GPUS

    loader = build_dataloader(cfg, is_train=False)
    test_set = loader.dataset
    evaluator = test_set.evaluator
    evaluator.clean()

    logger.info("Evaluating {} dataset({} video clips):".format(test_name, len(test_set)))

    # Only the master process wraps the loader in a progress bar.
    batches = tqdm(loader) if is_master_proc() else loader
    for images, targets in batches:
        compute_on_dataset(images, targets, device, model, gpu_count, evaluator)

    result_str, acc_dict = evaluator.get()
    logger.info(result_str)

    if is_master_proc():
        out_dir = cfg.OUTPUT_DIR
        if epoch is None:
            stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
            file_name = 'result_{}.txt'.format(stamp)
        else:
            file_name = 'result_{:04d}.txt'.format(epoch)
        result_path = os.path.join(out_dir, file_name)

        with open(result_path, "w") as f:
            f.write(result_str)

    return acc_dict
Esempio n. 2
0
def setup_logging(output_dir=None):
    """
    Sets up the logging for multiple processes. Only enable the logging for the
    master process, and suppress logging for the non-master processes.

    The root logger is set to DEBUG. A stdout handler is attached when
    du.is_master_proc() is true; otherwise _suppress_print() is called.
    A file handler writing to "<output_dir>/stdout.log" is attached only when
    output_dir is given and du.is_master_proc(du.get_world_size()) is true.

    Args:
        output_dir: directory for the "stdout.log" file, or None to skip
            file logging.
    """
    # Query once; the same answer decides both root-handler reset and the
    # console handler below.
    is_master = du.is_master_proc()

    if is_master:
        # Enable logging for the master process.
        logging.root.handlers = []
    else:
        # Suppress logging for non-master processes.
        _suppress_print()

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
    plain_formatter = logging.Formatter(
        "[%(asctime)s][%(levelname)s] %(filename)s: %(lineno)3d: %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )

    if is_master:
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(plain_formatter)
        logger.addHandler(ch)

    # NOTE(review): this check passes the world size, unlike the one above;
    # presumably it restricts file logging to the global master -- confirm
    # against du.is_master_proc.
    if output_dir is not None and du.is_master_proc(du.get_world_size()):
        filename = os.path.join(output_dir, "stdout.log")
        fh = logging.FileHandler(filename)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(plain_formatter)
        logger.addHandler(fh)
Esempio n. 3
0
def setup_logging(name, output_dir=None):
    """
    Sets up the logging for multiple processes. Only enable the logging for the
    master process, and suppress logging for the non-master processes.

    Args:
        name: logger name passed to logging.getLogger on the master process;
            on other processes it is combined with the rank to name the
            NllLogger that is returned instead.
        output_dir: when truthy, a file handler writing to
            "<output_dir>/log.txt" is attached in addition to stdout.

    Returns:
        The configured logger on the master process, otherwise an NllLogger.
    """
    if not du.is_master_proc(du.get_world_size()):
        # Suppress logging for non-master processes.
        _suppress_print()
        logger = NllLogger(f'{name}.{du.get_rank()}')
        return logger

    logger = logging.getLogger(name)
    logging.root.handlers = []
    # Iterate over a copy: removeHandler() mutates logger.handlers, and
    # removing while iterating the live list skips every other handler,
    # leaving stale handlers (and duplicated log lines) on repeated calls.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)

    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    plain_formatter = logging.Formatter(
        "[%(asctime)s][%(levelname)s] %(filename)s: %(lineno)3d: %(message)s",
        datefmt="%m/%d %H:%M:%S",
    )
    ch = logging.StreamHandler(stream=sys.stdout)
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(plain_formatter)
    logger.addHandler(ch)

    if output_dir:
        fh = logging.FileHandler(os.path.join(output_dir, 'log.txt'))
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(plain_formatter)
        logger.addHandler(fh)

    return logger
Esempio n. 4
0
def train(gpu, args, cfg):
    """
    Per-process training entry point.

    Computes the process rank as ``args.nr * args.gpus + gpu``, calls setup()
    with it, builds model / criterion / optimizer / scheduler / checkpointer,
    optionally resumes from a checkpoint, then runs do_train() and cleanup().

    Args:
        gpu: CUDA device index used by this process (passed to
            torch.cuda.set_device and build_model).
        args: namespace providing nr, gpus, world_size and resume.
        cfg: project config; TRAIN.NAME, OUTPUT.DIR and
            LR_SCHEDULER.IS_WARMUP are read here.
    """
    rank = args.nr * args.gpus + gpu
    setup(rank, args.world_size)

    logger = setup_logger(cfg.TRAIN.NAME)
    arguments = {"iteration": 0}

    torch.cuda.set_device(gpu)
    device = torch.device(f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu')
    # Remap tensors saved from cuda:0 onto the device this process actually
    # uses. The target must be the local index `gpu` (the model is placed on
    # cuda:gpu), not the global `rank`: they only coincide when args.nr == 0.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % gpu}

    model = build_model(cfg, gpu, map_location=map_location)
    criterion = build_criterion(cfg)
    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = CheckPointer(model, optimizer=optimizer, scheduler=lr_scheduler, save_dir=cfg.OUTPUT.DIR,
                                save_to_disk=True, logger=logger)
    if args.resume:
        if is_master_proc():
            logger.info('resume ...')
        extra_checkpoint_data = checkpointer.load(map_location=map_location, rank=rank)
        # An empty result means nothing was restored, so keep iteration at 0.
        if extra_checkpoint_data:
            arguments['iteration'] = extra_checkpoint_data['iteration']
            if cfg.LR_SCHEDULER.IS_WARMUP:
                if is_master_proc():
                    logger.info('warmup ...')
                # Take the optimizer state from whichever scheduler stage is
                # active, then point both stages back at the shared optimizer.
                if lr_scheduler.finished:
                    optimizer.load_state_dict(lr_scheduler.after_scheduler.optimizer.state_dict())
                else:
                    optimizer.load_state_dict(lr_scheduler.optimizer.state_dict())
                lr_scheduler.optimizer = optimizer
                lr_scheduler.after_scheduler.optimizer = optimizer

    data_loader = build_dataloader(cfg, is_train=True, start_iter=arguments['iteration'])

    synchronize()
    do_train(args, cfg, arguments,
             data_loader, model, criterion, optimizer, lr_scheduler,
             checkpointer, device, logger)
    cleanup()
Esempio n. 5
0
def build_model(cfg, gpu, map_location=None, logger=None):
    """
    Build the recognizer named in the config and move it to the given GPU.

    Args:
        cfg: project config; MODEL.RECOGNIZER.NAME, MODEL.SYNC_BN and
            MODEL.PRETRAINED are read here.
        gpu: CUDA device index the model is moved to.
        map_location: forwarded to the recognizer constructor and to the
            checkpoint loader.
        logger: optional logger; also handed to CheckPointer.

    Returns:
        The model, wrapped in DDP when the world size is greater than one.
    """
    recognizer_cls = registry.RECOGNIZER[cfg.MODEL.RECOGNIZER.NAME]
    model = recognizer_cls(cfg, map_location=map_location).cuda(gpu)

    world_size = du.get_world_size()
    rank = du.get_rank()

    # Synchronized batch norm is only converted in a multi-process run.
    if cfg.MODEL.SYNC_BN and world_size > 1:
        convert_sync_bn(model, simple_group_split(world_size, rank, 1), gpu=gpu)

    if cfg.MODEL.PRETRAINED != "":
        if du.is_master_proc() and logger:
            logger.info(f'load pretrained: {cfg.MODEL.PRETRAINED}')
        loader = CheckPointer(model, logger=logger)
        loader.load(cfg.MODEL.PRETRAINED, map_location=map_location, rank=rank)

    if not du.get_world_size() > 1:
        return model

    return DDP(model,
               device_ids=[gpu],
               output_device=gpu,
               find_unused_parameters=True)
Esempio n. 6
0
def do_train(args, cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device, logger):
    """
    Iteration-based training loop with periodic logging, saving and evaluation.

    Args:
        args: namespace providing use_tensorboard, log_step, save_step,
            eval_step, stop_save and stop_eval.
        cfg: project config; OUTPUT.DIR and TRAIN.MAX_ITER are read here.
        arguments: mutable dict; 'iteration' is read as the start iteration
            and updated on every step (it is also passed to checkpointer.save).
        data_loader: yields (images, targets) batches.
        model, criterion, optimizer, lr_scheduler: training components.
        checkpointer: object whose save(name, **arguments) is called.
        device: device the batches are moved to.
        logger: logger used on the master process.

    Returns:
        The trained model.
    """
    meters = MetricLogger()
    summary_writer = None

    if is_master_proc():
        logger.info("Start training ...")
        if args.use_tensorboard:
            from torch.utils.tensorboard import SummaryWriter
            summary_writer = SummaryWriter(
                log_dir=os.path.join(cfg.OUTPUT.DIR, 'tf_logs'))

    model.train()
    start_iter = arguments['iteration']
    max_iter = cfg.TRAIN.MAX_ITER

    synchronize()
    start_training_time = time.time()
    end = time.time()

    for iteration, (images, targets) in enumerate(data_loader, start_iter):
        # Evaluation below runs on the master process only, so all processes
        # are re-aligned at the top of every iteration.
        synchronize()
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = targets.to(device)

        outputs = model(images)
        loss = criterion(outputs, targets)
        # compute top-k accuracy
        topk_list = topk_accuracy(outputs, targets, topk=(1, 5))
        meters.update(loss=loss / len(targets),
                      acc_1=topk_list[0],
                      acc_5=topk_list[1])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        if iteration % len(data_loader) == 0 and hasattr(
                data_loader.batch_sampler, "set_epoch"):
            data_loader.batch_sampler.set_epoch(iteration)

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if is_master_proc():
            if iteration % args.log_step == 0:
                eta_seconds = meters.time.global_avg * (max_iter - iteration)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                logger.info(
                    meters.delimiter.join([
                        "iter: {iter:06d}",
                        "lr: {lr:.5f}",
                        '{meters}',
                        "eta: {eta}",
                        'mem: {mem}M',
                    ]).format(
                        iter=iteration,
                        lr=optimizer.param_groups[0]['lr'],
                        meters=str(meters),
                        eta=eta_string,
                        mem=round(torch.cuda.max_memory_allocated() / 1024.0 /
                                  1024.0),
                    ))
                if summary_writer:
                    global_step = iteration
                    for name, meter in meters.meters.items():
                        summary_writer.add_scalar('{}/avg'.format(name),
                                                  float(meter.avg),
                                                  global_step=global_step)
                        summary_writer.add_scalar('{}/global_avg'.format(name),
                                                  meter.global_avg,
                                                  global_step=global_step)
                    summary_writer.add_scalar('lr',
                                              optimizer.param_groups[0]['lr'],
                                              global_step=global_step)

            if not args.stop_save and iteration % args.save_step == 0:
                checkpointer.save("model_{:06d}".format(iteration),
                                  **arguments)
            if not args.stop_eval and args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
                eval_results = do_evaluation(cfg,
                                             model,
                                             device,
                                             iteration=iteration)
                if summary_writer:
                    for key, value in eval_results.items():
                        summary_writer.add_scalar(f'eval/{key}',
                                                  value,
                                                  global_step=iteration)
                model.train()

    if is_master_proc() and not args.stop_eval:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)

        if summary_writer:
            # Use the tracked iteration rather than the loop variable, which
            # is unbound when the data loader yields no batch.
            final_step = arguments["iteration"]
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}',
                                          value,
                                          global_step=final_step)
        checkpointer.save("model_final", **arguments)
    # Close the writer regardless of stop_eval so pending events are flushed.
    if summary_writer:
        summary_writer.close()
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    if is_master_proc():
        # max(..., 1) avoids a ZeroDivisionError when MAX_ITER is 0.
        logger.info("Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / max(max_iter, 1)))
    return model
Esempio n. 7
0
File: trainer.py Progetto: ZJCV/TSM
def do_train(cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device):
    """
    Iteration-based training loop driven entirely by the config.

    Args:
        cfg: project config; TRAIN.USE_TENSORBOARD, TRAIN.LOG_STEP,
            TRAIN.SAVE_STEP, TRAIN.EVAL_STEP, TRAIN.MAX_ITER, OUTPUT_DIR and
            NUM_GPUS are read here.
        arguments: mutable dict; 'iteration' is read as the start iteration
            and updated on every step (it is also passed to checkpointer.save).
        data_loader: yields (images, targets) batches; its dataset provides
            the evaluator used for training accuracy.
        model, criterion, optimizer, lr_scheduler: training components. The
            criterion result is indexed with 'loss'.
        checkpointer: object whose save(name, **arguments) is called.
        device: device the batches are moved to.

    Returns:
        The trained model.
    """
    logger = logging.setup_logging(__name__)
    meters = MetricLogger()
    summary_writer = None

    use_tensorboard = cfg.TRAIN.USE_TENSORBOARD
    log_step = cfg.TRAIN.LOG_STEP
    save_step = cfg.TRAIN.SAVE_STEP
    eval_step = cfg.TRAIN.EVAL_STEP
    max_iter = cfg.TRAIN.MAX_ITER
    start_iter = arguments['iteration']

    if is_master_proc() and use_tensorboard:
        from torch.utils.tensorboard import SummaryWriter
        summary_writer = SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    evaluator = data_loader.dataset.evaluator

    synchronize()
    start_training_time = time.time()
    end = time.time()
    logger.info("Start training ...")
    model.train()
    for iteration, (images, targets) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device=device, non_blocking=True)
        targets = targets.to(device=device, non_blocking=True)

        output_dict = model(images)
        loss_dict = criterion(output_dict, targets)
        loss = loss_dict['loss']

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        acc_list = evaluator.evaluate_train(output_dict, targets)
        update_meters(cfg.NUM_GPUS, meters, loss_dict, acc_list)

        if iteration % len(data_loader) == 0 and hasattr(
                data_loader.batch_sampler, "set_epoch"):
            data_loader.batch_sampler.set_epoch(iteration)

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    '{meters}',
                    "eta: {eta}",
                    'mem: {mem}M',
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]['lr'],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 /
                              1024.0),
                ))
        if is_master_proc():
            if summary_writer:
                global_step = iteration
                for name, meter in meters.meters.items():
                    summary_writer.add_scalar('{}/avg'.format(name),
                                              float(meter.avg),
                                              global_step=global_step)
                    summary_writer.add_scalar('{}/global_avg'.format(name),
                                              meter.global_avg,
                                              global_step=global_step)
                summary_writer.add_scalar('lr',
                                          optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

            if save_step > 0 and iteration % save_step == 0:
                checkpointer.save("model_{:06d}".format(iteration),
                                  **arguments)
        # Evaluation is entered by every process (not only the master).
        if eval_step > 0 and iteration % eval_step == 0 and not iteration == max_iter:
            eval_results = do_evaluation(cfg,
                                         model,
                                         device,
                                         iteration=iteration)
            model.train()
            if is_master_proc() and summary_writer:
                for key, value in eval_results.items():
                    summary_writer.add_scalar(f'eval/{key}',
                                              value,
                                              global_step=iteration)

    if eval_step > 0:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)

        if is_master_proc() and summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}',
                                          value,
                                          global_step=arguments["iteration"])
    # Close the writer even when evaluation is disabled so events are flushed.
    if summary_writer:
        summary_writer.close()
    # Match the periodic save above: only the master process writes the
    # checkpoint, so processes do not race on the same "model_final" file.
    if is_master_proc():
        checkpointer.save("model_final", **arguments)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    # max(..., 1) avoids a ZeroDivisionError when MAX_ITER is 0.
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max(max_iter, 1)))
    return model
Esempio n. 8
0
def do_train(cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device):
    """
    Epoch-based training loop with periodic logging, saving and evaluation.

    Epochs run over ``range(start_epoch, max_epoch + 1)``, i.e. both ends are
    included; the learning-rate scheduler is stepped once per epoch.

    Args:
        cfg: project config; TRAIN.USE_TENSORBOARD, TRAIN.LOG_STEP,
            TRAIN.SAVE_EPOCH, TRAIN.EVAL_EPOCH, TRAIN.MAX_EPOCH, OUTPUT_DIR
            and NUM_GPUS are read here.
        arguments: mutable dict; 'cur_epoch' is read as the start epoch and
            updated after every epoch (it is also passed to checkpointer.save).
        data_loader: yields (images, targets) batches; its dataset provides
            the evaluator used for training accuracy.
        model, criterion, optimizer, lr_scheduler: training components. The
            criterion result is indexed with 'loss'.
        checkpointer: object whose save(name, **arguments) is called.
        device: device the batches are moved to.

    Returns:
        The trained model.
    """
    meters = MetricLogger()
    evaluator = data_loader.dataset.evaluator
    summary_writer = None
    use_tensorboard = cfg.TRAIN.USE_TENSORBOARD
    if is_master_proc() and use_tensorboard:
        from torch.utils.tensorboard import SummaryWriter
        summary_writer = SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))

    log_step = cfg.TRAIN.LOG_STEP
    save_epoch = cfg.TRAIN.SAVE_EPOCH
    eval_epoch = cfg.TRAIN.EVAL_EPOCH
    max_epoch = cfg.TRAIN.MAX_EPOCH
    start_epoch = arguments['cur_epoch']
    epoch_iters = len(data_loader)
    # The loop below includes max_epoch, so it runs
    # (max_epoch - start_epoch + 1) epochs; count all of them so the
    # per-iteration time is right and is not divided by zero when
    # start_epoch == max_epoch.
    num_epochs = max(max_epoch - start_epoch + 1, 0)
    max_iter = num_epochs * epoch_iters

    synchronize()
    model.train()
    logger.info("Start training ...")
    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch))
    start_training_time = time.time()
    end = time.time()
    for cur_epoch in range(start_epoch, max_epoch + 1):
        shuffle_dataset(data_loader, cur_epoch)
        for iteration, (images, targets) in enumerate(data_loader):
            images = images.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)

            output_dict = model(images)
            loss_dict = criterion(output_dict, targets)
            loss = loss_dict['loss']

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            acc_list = evaluator.evaluate_train(output_dict, targets)
            update_stats(cfg.NUM_GPUS, meters, loss_dict, acc_list)

            batch_time = time.time() - end
            end = time.time()
            meters.update(time=batch_time)
            if (iteration + 1) % log_step == 0:
                logger.info(
                    log_iter_stats(iteration, epoch_iters, cur_epoch,
                                   max_epoch, optimizer.param_groups[0]['lr'],
                                   meters))
            if is_master_proc() and summary_writer:
                # Step formula treats cur_epoch as 1-based.
                global_step = (cur_epoch - 1) * epoch_iters + (iteration + 1)
                for name, meter in meters.meters.items():
                    summary_writer.add_scalar('{}/avg'.format(name),
                                              float(meter.avg),
                                              global_step=global_step)
                    summary_writer.add_scalar('{}/global_avg'.format(name),
                                              meter.global_avg,
                                              global_step=global_step)
                summary_writer.add_scalar('lr',
                                          optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

        logger.info(
            log_epoch_stats(epoch_iters, cur_epoch, max_epoch,
                            optimizer.param_groups[0]['lr'], meters))
        arguments["cur_epoch"] = cur_epoch
        lr_scheduler.step()
        # The last epoch is skipped here; it is covered by "model_final".
        if is_master_proc(
        ) and save_epoch > 0 and cur_epoch % save_epoch == 0 and cur_epoch != max_epoch:
            checkpointer.save("model_{:04d}".format(cur_epoch), **arguments)
        if eval_epoch > 0 and cur_epoch % eval_epoch == 0 and cur_epoch != max_epoch:
            eval_results = do_evaluation(cfg,
                                         model,
                                         device,
                                         cur_epoch=cur_epoch)
            model.train()
            if is_master_proc() and summary_writer:
                # NOTE(review): interim results are logged at cur_epoch + 1
                # while the final evaluation is logged at cur_epoch; an
                # evaluation at max_epoch - 1 would share a step with the
                # final one -- confirm the intended step.
                for key, value in eval_results.items():
                    summary_writer.add_scalar(f'eval/{key}',
                                              value,
                                              global_step=cur_epoch + 1)

    if eval_epoch > 0:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)

        if is_master_proc() and summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}',
                                          value,
                                          global_step=arguments["cur_epoch"])
    # Close the writer even when evaluation is disabled so events are flushed.
    if summary_writer:
        summary_writer.close()
    if is_master_proc():
        checkpointer.save("model_final", **arguments)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    # max(..., 1) keeps the report safe when no iteration was run.
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max(max_iter, 1)))
    return model