Beispiel #1
0
def train(cfg):
    # Set up environment.
    init_distributed_training(cfg)
    local_rank_id = get_local_rank()

    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED + 10 * local_rank_id)
    torch.manual_seed(cfg.RNG_SEED + 10 * local_rank_id)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)
    logger.info('init start')
    # 迭代轮数从1开始计数
    arguments = {"cur_epoch": 1}

    device = get_device(local_rank_id)
    model = build_recognizer(cfg, device)
    criterion = build_criterion(cfg, device)
    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = CheckPointer(model,
                                optimizer=optimizer,
                                scheduler=lr_scheduler,
                                save_dir=cfg.OUTPUT_DIR,
                                save_to_disk=True)
    if cfg.TRAIN.RESUME:
        logger.info('resume start')
        extra_checkpoint_data = checkpointer.load(map_location=device)
        if isinstance(extra_checkpoint_data, dict):
            arguments['cur_epoch'] = extra_checkpoint_data['cur_epoch']
            if cfg.LR_SCHEDULER.IS_WARMUP:
                logger.info('warmup start')
                if lr_scheduler.finished:
                    optimizer.load_state_dict(
                        lr_scheduler.after_scheduler.optimizer.state_dict())
                else:
                    optimizer.load_state_dict(
                        lr_scheduler.optimizer.state_dict())
                lr_scheduler.optimizer = optimizer
                lr_scheduler.after_scheduler.optimizer = optimizer
                logger.info('warmup end')
        logger.info('resume end')

    data_loader = build_dataloader(cfg, is_train=True)

    logger.info('init end')
    synchronize()
    do_train(cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device)
Beispiel #2
0
def test(cfg):
    # Set up environment.
    init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True

    device = get_device(local_rank=get_local_rank())
    model = build_recognizer(cfg, device=device)

    synchronize()
    do_evaluation(cfg, model, device)
Beispiel #3
0
def train(gpu, args, cfg):
    rank = args.nr * args.gpus + gpu
    setup(rank, args.world_size)

    logger = setup_logger(cfg.TRAIN.NAME)
    arguments = {"iteration": 0}

    torch.cuda.set_device(gpu)
    device = torch.device(f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu')
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}

    model = build_model(cfg, gpu, map_location=map_location)
    criterion = build_criterion(cfg)
    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = CheckPointer(model, optimizer=optimizer, scheduler=lr_scheduler, save_dir=cfg.OUTPUT.DIR,
                                save_to_disk=True, logger=logger)
    if args.resume:
        if is_master_proc():
            logger.info('resume ...')
        extra_checkpoint_data = checkpointer.load(map_location=map_location, rank=rank)
        if extra_checkpoint_data != dict():
            arguments['iteration'] = extra_checkpoint_data['iteration']
            if cfg.LR_SCHEDULER.IS_WARMUP:
                if is_master_proc():
                    logger.info('warmup ...')
                if lr_scheduler.finished:
                    optimizer.load_state_dict(lr_scheduler.after_scheduler.optimizer.state_dict())
                else:
                    optimizer.load_state_dict(lr_scheduler.optimizer.state_dict())
                lr_scheduler.optimizer = optimizer
                lr_scheduler.after_scheduler.optimizer = optimizer

    data_loader = build_dataloader(cfg, is_train=True, start_iter=arguments['iteration'])

    synchronize()
    do_train(args, cfg, arguments,
             data_loader, model, criterion, optimizer, lr_scheduler,
             checkpointer, device, logger)
    cleanup()
Beispiel #4
0
def do_train(args, cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device, logger):
    meters = MetricLogger()
    summary_writer = None

    if is_master_proc():
        logger.info("Start training ...")
        if args.use_tensorboard:
            from torch.utils.tensorboard import SummaryWriter
            summary_writer = SummaryWriter(
                log_dir=os.path.join(cfg.OUTPUT.DIR, 'tf_logs'))

    model.train()
    start_iter = arguments['iteration']
    max_iter = cfg.TRAIN.MAX_ITER

    synchronize()
    start_training_time = time.time()
    end = time.time()

    for iteration, (images, targets) in enumerate(data_loader, start_iter):
        synchronize()
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = targets.to(device)

        outputs = model(images)
        loss = criterion(outputs, targets)
        # compute top-k accuray
        topk_list = topk_accuracy(outputs, targets, topk=(1, 5))
        meters.update(loss=loss / len(targets),
                      acc_1=topk_list[0],
                      acc_5=topk_list[1])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        if iteration % len(data_loader) == 0 and hasattr(
                data_loader.batch_sampler, "set_epoch"):
            data_loader.batch_sampler.set_epoch(iteration)

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if is_master_proc():
            if iteration % args.log_step == 0:
                eta_seconds = meters.time.global_avg * (max_iter - iteration)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                logger.info(
                    meters.delimiter.join([
                        "iter: {iter:06d}",
                        "lr: {lr:.5f}",
                        '{meters}',
                        "eta: {eta}",
                        'mem: {mem}M',
                    ]).format(
                        iter=iteration,
                        lr=optimizer.param_groups[0]['lr'],
                        meters=str(meters),
                        eta=eta_string,
                        mem=round(torch.cuda.max_memory_allocated() / 1024.0 /
                                  1024.0),
                    ))
                if summary_writer:
                    global_step = iteration
                    for name, meter in meters.meters.items():
                        summary_writer.add_scalar('{}/avg'.format(name),
                                                  float(meter.avg),
                                                  global_step=global_step)
                        summary_writer.add_scalar('{}/global_avg'.format(name),
                                                  meter.global_avg,
                                                  global_step=global_step)
                    summary_writer.add_scalar('lr',
                                              optimizer.param_groups[0]['lr'],
                                              global_step=global_step)

            if not args.stop_save and iteration % args.save_step == 0:
                checkpointer.save("model_{:06d}".format(iteration),
                                  **arguments)
            if not args.stop_eval and args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
                eval_results = do_evaluation(cfg,
                                             model,
                                             device,
                                             iteration=iteration)
                if summary_writer:
                    for key, value in eval_results.items():
                        summary_writer.add_scalar(f'eval/{key}',
                                                  value,
                                                  global_step=iteration)
                model.train()

    if is_master_proc() and not args.stop_eval:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)

        if summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}',
                                          value,
                                          global_step=iteration)
            summary_writer.close()
        checkpointer.save("model_final", **arguments)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    if is_master_proc():
        logger.info("Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / max_iter))
    return model
Beispiel #5
0
def do_train(cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device):
    logger = logging.setup_logging(__name__)
    meters = MetricLogger()
    summary_writer = None

    use_tensorboard = cfg.TRAIN.USE_TENSORBOARD
    log_step = cfg.TRAIN.LOG_STEP
    save_step = cfg.TRAIN.SAVE_STEP
    eval_step = cfg.TRAIN.EVAL_STEP
    max_iter = cfg.TRAIN.MAX_ITER
    start_iter = arguments['iteration']

    if is_master_proc() and use_tensorboard:
        from torch.utils.tensorboard import SummaryWriter
        summary_writer = SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    evaluator = data_loader.dataset.evaluator

    synchronize()
    start_training_time = time.time()
    end = time.time()
    logger.info("Start training ...")
    model.train()
    for iteration, (images, targets) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device=device, non_blocking=True)
        targets = targets.to(device=device, non_blocking=True)

        output_dict = model(images)
        loss_dict = criterion(output_dict, targets)
        loss = loss_dict['loss']

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        acc_list = evaluator.evaluate_train(output_dict, targets)
        update_meters(cfg.NUM_GPUS, meters, loss_dict, acc_list)

        if iteration % len(data_loader) == 0 and hasattr(
                data_loader.batch_sampler, "set_epoch"):
            data_loader.batch_sampler.set_epoch(iteration)

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    '{meters}',
                    "eta: {eta}",
                    'mem: {mem}M',
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]['lr'],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 /
                              1024.0),
                ))
        if is_master_proc():
            if summary_writer:
                global_step = iteration
                for name, meter in meters.meters.items():
                    summary_writer.add_scalar('{}/avg'.format(name),
                                              float(meter.avg),
                                              global_step=global_step)
                    summary_writer.add_scalar('{}/global_avg'.format(name),
                                              meter.global_avg,
                                              global_step=global_step)
                summary_writer.add_scalar('lr',
                                          optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

            if save_step > 0 and iteration % save_step == 0:
                checkpointer.save("model_{:06d}".format(iteration),
                                  **arguments)
        if eval_step > 0 and iteration % eval_step == 0 and not iteration == max_iter:
            eval_results = do_evaluation(cfg,
                                         model,
                                         device,
                                         iteration=iteration)
            model.train()
            if is_master_proc() and summary_writer:
                for key, value in eval_results.items():
                    summary_writer.add_scalar(f'eval/{key}',
                                              value,
                                              global_step=iteration)

    if eval_step > 0:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)

        if is_master_proc() and summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}',
                                          value,
                                          global_step=arguments["iteration"])
            summary_writer.close()
    checkpointer.save("model_final", **arguments)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
Beispiel #6
0
def do_train(cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device):
    meters = MetricLogger()
    evaluator = data_loader.dataset.evaluator
    summary_writer = None
    use_tensorboard = cfg.TRAIN.USE_TENSORBOARD
    if is_master_proc() and use_tensorboard:
        from torch.utils.tensorboard import SummaryWriter
        summary_writer = SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))

    log_step = cfg.TRAIN.LOG_STEP
    save_epoch = cfg.TRAIN.SAVE_EPOCH
    eval_epoch = cfg.TRAIN.EVAL_EPOCH
    max_epoch = cfg.TRAIN.MAX_EPOCH
    start_epoch = arguments['cur_epoch']
    epoch_iters = len(data_loader)
    max_iter = (max_epoch - start_epoch) * epoch_iters

    synchronize()
    model.train()
    logger.info("Start training ...")
    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch))
    start_training_time = time.time()
    end = time.time()
    for cur_epoch in range(start_epoch, max_epoch + 1):
        shuffle_dataset(data_loader, cur_epoch)
        for iteration, (images, targets) in enumerate(data_loader):
            images = images.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)

            output_dict = model(images)
            loss_dict = criterion(output_dict, targets)
            loss = loss_dict['loss']

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            acc_list = evaluator.evaluate_train(output_dict, targets)
            update_stats(cfg.NUM_GPUS, meters, loss_dict, acc_list)

            batch_time = time.time() - end
            end = time.time()
            meters.update(time=batch_time)
            if (iteration + 1) % log_step == 0:
                logger.info(
                    log_iter_stats(iteration, epoch_iters, cur_epoch,
                                   max_epoch, optimizer.param_groups[0]['lr'],
                                   meters))
            if is_master_proc() and summary_writer:
                global_step = (cur_epoch - 1) * epoch_iters + (iteration + 1)
                for name, meter in meters.meters.items():
                    summary_writer.add_scalar('{}/avg'.format(name),
                                              float(meter.avg),
                                              global_step=global_step)
                    summary_writer.add_scalar('{}/global_avg'.format(name),
                                              meter.global_avg,
                                              global_step=global_step)
                summary_writer.add_scalar('lr',
                                          optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

        logger.info(
            log_epoch_stats(epoch_iters, cur_epoch, max_epoch,
                            optimizer.param_groups[0]['lr'], meters))
        arguments["cur_epoch"] = cur_epoch
        lr_scheduler.step()
        if is_master_proc(
        ) and save_epoch > 0 and cur_epoch % save_epoch == 0 and cur_epoch != max_epoch:
            checkpointer.save("model_{:04d}".format(cur_epoch), **arguments)
        if eval_epoch > 0 and cur_epoch % eval_epoch == 0 and cur_epoch != max_epoch:
            eval_results = do_evaluation(cfg,
                                         model,
                                         device,
                                         cur_epoch=cur_epoch)
            model.train()
            if is_master_proc() and summary_writer:
                for key, value in eval_results.items():
                    summary_writer.add_scalar(f'eval/{key}',
                                              value,
                                              global_step=cur_epoch + 1)

    if eval_epoch > 0:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(cfg, model, device)

        if is_master_proc() and summary_writer:
            for key, value in eval_results.items():
                summary_writer.add_scalar(f'eval/{key}',
                                          value,
                                          global_step=arguments["cur_epoch"])
            summary_writer.close()
    if is_master_proc():
        checkpointer.save("model_final", **arguments)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model