Example #1
def train(**kwargs):
    """
    训练
    训练的主要步骤如下:
    - 定义网络
    - 定义数据
    - 定义损失函数和优化器
    - 计算重要指标
    - 开始训练
      - 训练网络
      - 可视化各种指标
      - 计算在验证集上的指标
    :param kwargs:
    :return:
    """
    # update the configuration from the command line
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # Step 1: define the network
    model = getattr(models, opt.model)()
    # model = models.ResNet34()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # Step 2: data
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers)

    # Step 3: loss function and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(),
                             lr=lr,
                             weight_decay=opt.weight_decay)

    # Step 4: statistics: the smoothed loss and the confusion matrix
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100  # large sentinel so the first epoch never triggers lr decay

    # train
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in enumerate(train_dataloader):
            if opt.use_gpu:
                data = data.cuda()
                label = label.cuda()
            optimizer.zero_grad()
            score = model(data)
            loss = criterion(score, label.long())
            loss.backward()
            optimizer.step()

            # update the statistics and the visualization
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, label.data)
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot("loss", loss_meter.value()[0])

        model.save()

        # compute metrics on the validation set and visualize them
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch,
                    loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    lr=lr))

        # if the loss no longer decreases, lower the learning rate
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = loss_meter.value()[0]
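
The loop above calls a val helper that is not shown. A minimal sketch of what it plausibly looks like, assuming the same torchnet meter.ConfusionMeter API and the two-class DogCat setup; only the (val_cm, val_accuracy) return contract is implied by the call site, the rest is an assumption:

@t.no_grad()
def val(model, dataloader):
    """Sketch of the unseen val helper: confusion matrix and accuracy on
    the validation set (assumed, not the author's actual implementation)."""
    model.eval()
    confusion_matrix = meter.ConfusionMeter(2)
    for data, label in dataloader:
        if opt.use_gpu:
            data, label = data.cuda(), label.cuda()
        score = model(data)
        confusion_matrix.add(score.data, label.long().data)
    model.train()  # restore training mode
    cm_value = confusion_matrix.value()
    # accuracy (%) = correctly classified / total
    accuracy = 100.0 * (cm_value[0][0] + cm_value[1][1]) / cm_value.sum()
    return confusion_matrix, accuracy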
Example #2
def main():
    # load data
    train_loader = torch.utils.data.DataLoader(NYUDepthDataset(
        cfg.trainval_data_root,
        'train',
        sample_num=cfg.sample_num,
        superpixel=False,
        relative=True,
        transform=True),
                                               batch_size=cfg.batch_size,
                                               shuffle=True,
                                               num_workers=cfg.num_workers,
                                               drop_last=True)
    print('Train Batches:', len(train_loader))

    # val_loader = torch.utils.data.DataLoader(NYUDepthDataset(cfg.trainval_data_root, 'val', transform=True),
    #                                          batch_size=cfg.batch_size, shuffle=True,
    #                                          num_workers=cfg.num_workers, drop_last=True)
    # print('Validation Batches:', len(val_loader))

    test_set = NyuDepthMat(
        cfg.test_data_root,
        '/home/ans/PycharmProjects/SDFCN/data/testIdxs.txt')
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=cfg.batch_size,
                                              shuffle=True,
                                              drop_last=True)

    # train_set = NyuDepthMat(cfg.test_data_root, '/home/ans/PycharmProjects/SDFCN/data/trainIdxs.txt')
    # train_loader = torch.utils.data.DataLoader(train_set,
    #                                           batch_size=cfg.batch_size,
    #                                           shuffle=True, drop_last=True)
    # train_loader = test_loader
    #
    # use the test split as a stand-in validation set
    val_loader = test_loader
    # load model and weights
    # model = FCRN(cfg.batch_size)
    model = DUCNet(model=torchvision.models.resnet50(pretrained=True))
    init_upsample = False
    # print(model)

    # loss_fn = berHu()

    if cfg.use_gpu:
        print('Use CUDA')
        model = model.cuda()
        berhu_loss = berHu().cuda()  # instantiated but unused in the loop below
        rela_loss = relativeloss().cuda()
        loss_fn = torch.nn.MSELoss().cuda()
    else:
        print('This script requires CUDA.')
        exit(1)

    start_epoch = 0
    # resume_from_file = False
    best_val_err = float('inf')  # best validation error seen so far

    vis = Visualizer(cfg.env)
    print('Created visdom environment:', cfg.env)
    # optimizer and lr scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
    print("optimizer set.")
    scheduler = lr_scheduler.StepLR(optimizer, step_size=cfg.step, gamma=0.1)

    for epoch in range(cfg.num_epochs):

        print('Starting train epoch %d / %d, lr=%f' %
              (start_epoch + epoch + 1, cfg.num_epochs,
               optimizer.state_dict()['param_groups'][0]['lr']))

        model.train()
        running_loss = 0
        count = 0
        epoch_loss = 0

        for i_batch, sample_batched in enumerate(train_loader):
            # Variable is deprecated; tensors carry autograd state directly
            input_var = sample_batched['rgb'].type(dtype)
            depth_var = sample_batched['depth'].type(dtype)

            optimizer.zero_grad()
            output = model(input_var)
            # loss = loss_fn(output, depth_var)
            loss1 = loss_fn(output, depth_var)
            Ah, Aw, Bh, Bw = generate_relative_pos(sample_batched['center'])

            loss2 = rela_loss(output[..., 0, Ah, Aw], output[..., 0, Bh, Bw],
                              sample_batched['ord'])
            loss = loss1 + loss2

            if i_batch % cfg.print_freq == cfg.print_freq - 1:
                print('{0} batches, loss:{1}, mse:{2}, relative:{3}'.format(
                    i_batch + 1,
                    loss.item(),
                    loss1.item(),
                    loss2.item()))
                vis.plot('loss', loss.item())

            if i_batch % (cfg.print_freq * 10) == (cfg.print_freq * 10) - 1:
                vis.depth('pred', output)
                # vis.imshow('img', sample_batched['rgb'].type(dtype))
                vis.depth('depth', sample_batched['depth'].type(dtype))

            count += 1
            running_loss += loss.item()

            loss.backward()
            optimizer.step()

        # step the scheduler once per epoch, after the optimizer updates
        # (the required ordering in PyTorch >= 1.1)
        scheduler.step()

        epoch_loss = running_loss / count
        print('epoch loss:', epoch_loss)

        val_error, val_rmse = validate(val_loader, model, loss_fn, vis=vis)
        vis.plot('val_error', val_error)
        vis.plot('val_rmse', val_rmse)
        vis.log('epoch:{epoch},lr={lr},epoch_loss:{loss},val_error:{val_error}'.
                format(epoch=start_epoch + epoch + 1,
                       loss=epoch_loss,
                       val_error=val_error,
                       lr=optimizer.state_dict()['param_groups'][0]['lr']))

        if val_error < best_val_err:
            best_val_err = val_error
            if not os.path.exists(cfg.checkpoint_dir):
                os.mkdir(cfg.checkpoint_dir)

            torch.save(
                {
                    'epoch': start_epoch + epoch + 1,
                    'state_dict': model.state_dict(),
                    # 'optimizer': optimizer.state_dict(),
                },
                os.path.join(
                    cfg.checkpoint_dir,
                    '{}_{}_epoch_{}_{}'.format(cfg.checkpoint, cfg.env,
                                               start_epoch + epoch + 1,
                                               cfg.checkpoint_postfix)))

    torch.save(
        {
            'epoch': start_epoch + epoch + 1,
            'state_dict': model.state_dict(),
            # 'optimizer': optimizer.state_dict(),
        },
        os.path.join(
            cfg.checkpoint_dir,
            '{}_{}_epoch_{}_{}'.format(cfg.checkpoint, cfg.env,
                                       start_epoch + epoch + 1,
                                       cfg.checkpoint_postfix)))
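
Example #2 instantiates a berHu loss but, as noted in the code, trains with MSE plus a relative-depth term. For reference, a minimal sketch of the standard reverse Huber (berHu) loss from Laina et al. 2016, written as a plain torch module; the repo's own berHu class is not shown and may differ:

import torch

class berHu(torch.nn.Module):
    """Reverse Huber (berHu): L1 for small residuals, scaled L2 beyond the
    threshold c = 0.2 * max|residual| over the batch (a sketch, not the
    repo's actual implementation)."""
    def forward(self, pred, target):
        diff = (pred - target).abs()
        c = (0.2 * diff.max()).clamp(min=1e-6)  # guard against division by zero
        l2_part = (diff ** 2 + c ** 2) / (2 * c)
        # elementwise: |x| where |x| <= c, otherwise (x^2 + c^2) / (2c)
        return torch.where(diff <= c, diff, l2_part).mean()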
Example #3
def main():
    # load data
    train_loader = torch.utils.data.DataLoader(NYUDepthDataset(
        cfg.trainval_data_root,
        'train',
        sample_num=cfg.sample_num,
        superpixel=False,
        relative=False,
        transform=True),
                                               batch_size=cfg.batch_size,
                                               shuffle=True,
                                               num_workers=cfg.num_workers,
                                               drop_last=True)
    print('Train Batches:', len(train_loader))

    # val_loader = torch.utils.data.DataLoader(NYUDepthDataset(cfg.trainval_data_root, 'val', transform=True),
    #                                          batch_size=cfg.batch_size, shuffle=True,
    #                                          num_workers=cfg.num_workers, drop_last=True)
    # print('Validation Batches:', len(val_loader))

    test_set = NyuDepthMat(
        cfg.test_data_root,
        '/home/ans/PycharmProjects/SDFCN/data/testIdxs.txt')
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=cfg.batch_size,
                                              shuffle=True,
                                              drop_last=True)

    # train_set = NyuDepthMat(cfg.test_data_root, '/home/ans/PycharmProjects/SDFCN/data/trainIdxs.txt')
    # train_loader = torch.utils.data.DataLoader(train_set,
    #                                           batch_size=cfg.batch_size,
    #                                           shuffle=True, drop_last=True)
    # train_loader = test_loader
    #
    # use the test split as a stand-in validation set
    val_loader = test_loader
    # load model and weights
    # model = FCRN(cfg.batch_size)
    model = ResDUCNet(model=torchvision.models.resnet50(pretrained=False))
    init_upsample = False
    # print(model)

    loss_fn = berHu()  # CPU fallback; replaced by L1 below when CUDA is used

    if cfg.use_gpu:
        print('Use CUDA')
        model = model.cuda()
        # loss_fn = berHu().cuda()
        # loss_fn = torch.nn.MSELoss().cuda()
        loss_fn = torch.nn.L1Loss().cuda()

    start_epoch = 0
    best_val_err = float('inf')  # best validation error seen so far

    if cfg.resume_from_file:
        if os.path.isfile(cfg.resume_file):
            print("=> loading checkpoint '{}'".format(cfg.resume_file))
            checkpoint = torch.load(cfg.resume_file)
            # start_epoch = checkpoint['epoch']
            start_epoch = 0
            # model.load_state_dict(checkpoint['state_dict'])
            model.load_state_dict(checkpoint['model_state'])
            # print("=> loaded checkpoint '{}' (epoch {})"
            #       .format(cfg.resume_file, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(cfg.resume_file))
    # else:
    #     if init_upsample:
    #         print('Loading weights from ', cfg.weights_file)
    #         # bone_state_dict = load_weights(model, cfg.weights_file, dtype)
    #         model.load_state_dict(load_weights(model, cfg.weights_file, dtype))
    #     else:
    #         print('Loading weights from ', cfg.resnet50_file)
    #         pretrained_dict = torch.load(cfg.resnet50_file)
    #         model_dict = model.state_dict()
    #         pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    #         model_dict.update(pretrained_dict)
    #         model.load_state_dict(model_dict)
    #     print('Weights loaded.')

    # val_error, val_rmse = validate(val_loader, model, loss_fn)
    # print('before train: val_error %f, rmse: %f' % (val_error, val_rmse))

    vis = Visualizer(cfg.env)
    # optimizer and lr scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
    print("optimizer set.")
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=cfg.step,
                                    gamma=cfg.lr_decay)

    for epoch in range(cfg.num_epochs):

        # print(optimizer.state_dict()['param_groups'][0]['lr'])
        print('Starting train epoch %d / %d, lr=%f' %
              (start_epoch + epoch + 1, cfg.num_epochs,
               optimizer.state_dict()['param_groups'][0]['lr']))

        model.train()
        running_loss = 0
        count = 0
        epoch_loss = 0

        for i_batch, sample_batched in enumerate(train_loader):
            # Variable is deprecated; tensors carry autograd state directly
            input_var = sample_batched['rgb'].type(dtype)
            depth_var = sample_batched['depth'].type(dtype)

            optimizer.zero_grad()
            output = model(input_var)
            loss = loss_fn(output, depth_var)

            if i_batch % cfg.print_freq == cfg.print_freq - 1:
                print('{0} batches, loss:{1}'.format(i_batch + 1,
                                                     loss.item()))
                vis.plot('loss', loss.item())

            if i_batch % (cfg.print_freq * 10) == (cfg.print_freq * 10) - 1:
                vis.depth('pred', output)
                # vis.imshow('img', sample_batched['rgb'].type(dtype))
                vis.depth('depth', sample_batched['depth'].type(dtype))

            count += 1
            running_loss += loss.item()

            loss.backward()
            optimizer.step()

        # step the scheduler once per epoch, after the optimizer updates
        # (the required ordering in PyTorch >= 1.1)
        scheduler.step()

        epoch_loss = running_loss / count
        print('epoch loss:', epoch_loss)

        val_error, val_rmse = validate(val_loader, model, loss_fn, vis=vis)
        vis.plot('val_error', val_error)
        vis.plot('val_rmse', val_rmse)
        vis.log('epoch:{epoch},lr={lr},epoch_loss:{loss},val_error:{val_error}'.
                format(epoch=start_epoch + epoch + 1,
                       loss=epoch_loss,
                       val_error=val_error,
                       lr=optimizer.state_dict()['param_groups'][0]['lr']))

        if val_error < best_val_err:
            best_val_err = val_error
            if not os.path.exists(cfg.checkpoint_dir):
                os.mkdir(cfg.checkpoint_dir)

            torch.save(
                {
                    'epoch': start_epoch + epoch + 1,
                    'state_dict': model.state_dict(),
                    # 'optimizer': optimizer.state_dict(),
                },
                os.path.join(
                    cfg.checkpoint_dir,
                    '{}_{}_epoch_{}_{}'.format(cfg.checkpoint, cfg.env,
                                               start_epoch + epoch + 1,
                                               cfg.checkpoint_postfix)))

    torch.save(
        {
            'epoch': start_epoch + epoch + 1,
            'state_dict': model.state_dict(),
            # 'optimizer': optimizer.state_dict(),
        },
        os.path.join(
            cfg.checkpoint_dir,
            '{}_{}_epoch_{}_{}'.format(cfg.checkpoint, cfg.env,
                                       start_epoch + epoch + 1,
                                       cfg.checkpoint_postfix)))
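
Both depth scripts call a validate helper that is not shown. A minimal sketch inferred from the call sites, assuming the same module-level dtype and batch dict keys as the training loops; the error metric inside the real helper is unknown, so mean loss is used here as a stand-in:

@torch.no_grad()
def validate(val_loader, model, loss_fn, vis=None):
    """Sketch of the unseen validate helper; only the (val_error, val_rmse)
    return signature is implied by the training loops above."""
    model.eval()
    total_loss, total_sq_err, n_batches = 0.0, 0.0, 0
    for sample_batched in val_loader:
        input_var = sample_batched['rgb'].type(dtype)
        depth_var = sample_batched['depth'].type(dtype)
        output = model(input_var)
        total_loss += loss_fn(output, depth_var).item()
        total_sq_err += torch.mean((output - depth_var) ** 2).item()
        n_batches += 1
    model.train()
    # mean validation loss and root-mean-square error over all batches
    return total_loss / n_batches, (total_sq_err / n_batches) ** 0.5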
Example #4
def train(opt):
    # update the configuration
    vis = Visualizer(opt.env)

    # Step 1: load the model
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # Step 2: data
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    # the validation data is not augmented
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers)

    # Step 3: loss function and optimizer
    # cross-entropy loss
    criterion = t.nn.CrossEntropyLoss()
    # learning rate
    lr = opt.lr
    # Adam optimizer
    optimizer = t.optim.Adam(model.parameters(),
                             lr=lr,
                             weight_decay=opt.weight_decay)

    # Step 4: statistics: the smoothed loss and the confusion matrix
    # AverageValueMeter tracks the mean (and std) of the values added to it,
    # here the average loss over an epoch
    loss_meter = meter.AverageValueMeter()
    # ConfusionMeter records classification results as a confusion matrix
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    # train
    for epoch in range(opt.max_epoch):

        loss_meter.reset()
        confusion_matrix.reset()

        # index,(data, label)
        for ii, (data, label) in enumerate(train_dataloader):

            # train the model on one batch
            input = data
            target = label
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            # zero the gradients
            optimizer.zero_grad()
            score = model(input)
            # compute the loss
            loss = criterion(score, target)
            loss.backward()
            # optimizer step
            optimizer.step()

            # update the statistics and the visualization
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, target.data)

            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])

        # checkpoint
        model.save()

        # compute metrics on the validation set and visualize them
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch,
                    loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    lr=lr))

        # if the loss no longer decreases, lower the learning rate
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = loss_meter.value()[0]
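
The manual decay-on-plateau at the end of Examples #1 and #4 can also be expressed with PyTorch's built-in ReduceLROnPlateau scheduler. A minimal sketch of the equivalent wiring, reusing the names from Example #4:

scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',           # the monitored quantity (the loss) should decrease
    factor=opt.lr_decay,  # same multiplicative decay as the manual version
    patience=0)           # decay as soon as the loss fails to improve

for epoch in range(opt.max_epoch):
    ...  # training loop as in Example #4
    scheduler.step(loss_meter.value()[0])  # pass the smoothed epoch loss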