Beispiel #1
0
def load_model(cfg):
    """Build the model for inference and restore its weights from a checkpoint.

    Args:
        cfg: config providing ``data_parallel`` and ``infer_cfg.ckpt``.

    Returns:
        The model with checkpoint weights loaded (wrapped in
        ``torch.nn.DataParallel`` when ``cfg.data_parallel`` is set).
    """
    model = build_model(cfg)
    if cfg.data_parallel:
        # NOTE(review): wrapping *before* load_state_dict assumes the
        # checkpoint keys carry the DataParallel 'module.' prefix — confirm
        # against how save_model serializes the model.
        model = torch.nn.DataParallel(model)
    # map_location='cpu' makes the load robust on hosts without the GPU the
    # checkpoint was saved on; load_state_dict then copies each tensor onto
    # whatever device the model parameters already live on.
    checkpoint = torch.load(cfg.infer_cfg.ckpt, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    return model
Beispiel #2
0
def load_model(cfg):
    """Rebuild the model and optimizer and restore both from a training checkpoint.

    Args:
        cfg: config providing ``train_cfg.ckpt``, ``train_cfg.data_parallel``
            and ``train_cfg.optimizer``.

    Returns:
        Tuple ``(checkpoint, model, optimizer)`` — the raw checkpoint dict is
        returned as well so the caller can read bookkeeping fields (epoch,
        best loss) before discarding it.
    """
    # map_location='cpu' so a GPU-saved checkpoint also loads on CPU-only or
    # differently-numbered-GPU hosts. load_state_dict copies model tensors
    # onto the model's device, and the optimizer's load_state_dict casts its
    # state to the parameters' device, so placement is unaffected.
    checkpoint = torch.load(cfg.train_cfg.ckpt, map_location='cpu')
    model = build_model(cfg)
    if cfg.train_cfg.data_parallel:
        # NOTE(review): wrapping before load assumes 'module.'-prefixed
        # checkpoint keys — confirm against save_model.
        model = torch.nn.DataParallel(model)
    model.load_state_dict(checkpoint['model'])

    optimizer = build_optimizer(cfg.train_cfg.optimizer, model.parameters())
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint, model, optimizer
Beispiel #3
0
def main():
    """Training entry point.

    Builds (or resumes) the model and optimizer, then runs epochs of
    train / evaluate / visualize, checkpointing after every epoch and
    logging losses and timings to TensorBoard and the standard logger.
    """
    opts = parse_args()
    conf = Config.fromfile(opts.cfg)

    # The experiment directory holds checkpoints and TensorBoard summaries.
    exp_dir = pathlib.Path(conf.exp_dir)
    exp_dir.mkdir(parents=True, exist_ok=True)
    summary_writer = SummaryWriter(exp_dir / 'summary')

    # Start fresh unless resuming from a saved checkpoint.
    if not conf.train_cfg.resume:
        model = build_model(conf)
        if conf.train_cfg.data_parallel:
            model = torch.nn.DataParallel(model)
        optimizer = build_optimizer(conf.train_cfg.optimizer,
                                    model.parameters())
        best_dev_loss = 1e9
        start_epoch = 0
    else:
        checkpoint, model, optimizer = load_model(conf)
        best_dev_loss = checkpoint['best_dev_loss']
        start_epoch = checkpoint['epoch']
        del checkpoint  # drop the large state dicts once bookkeeping is read

    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                **conf.train_cfg.lr_scheduler)

    train_loader, dev_loader, display_loader = create_data_loaders(conf)

    for epoch in range(start_epoch, conf.train_cfg.num_epochs):
        print('Epoch %d' % epoch)
        # Legacy explicit-epoch step: sets the LR for this epoch number,
        # which keeps the schedule consistent across resumes.
        scheduler.step(epoch)

        train_loss, train_time = train_epoch(conf, epoch, model, train_loader,
                                             optimizer, summary_writer)
        dev_loss, dev_time = evaluate(conf, epoch, model, dev_loader,
                                      summary_writer)
        visualize(conf, epoch, model, display_loader, summary_writer)

        # Track the best validation loss and flag new-best checkpoints.
        is_new_best = dev_loss < best_dev_loss
        if is_new_best:
            best_dev_loss = dev_loss
        save_model(exp_dir, epoch, model, optimizer, best_dev_loss,
                   is_new_best)
        logging.info(
            f'Epoch = [{epoch:4d}/{conf.train_cfg.num_epochs:4d}] TrainLoss = {train_loss:.4g} '
            f'DevLoss = {dev_loss:.4g} TrainTime = {train_time:.4f}s DevTime = {dev_time:.4f}s',
        )
        # Five-minute pause every 5 epochs — presumably a cool-down / resource
        # sharing measure; NOTE(review): confirm this is still wanted.
        if (epoch + 1) % 5 == 0:
            time.sleep(60 * 5)
    summary_writer.close()