def train_eval_model(model,
                     criterion,
                     optimizer,
                     dataloader,
                     tfboard_writer,
                     num_epochs=25,
                     resume=False,
                     start_epoch=0):
    print('Start training...')

    since = time.time()
    dataset_size = len(dataloader['train'].dataset)

    device = next(model.parameters()).device
    print('model on device: {}'.format(device))

    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    if resume:
        assert start_epoch != 0
        model_path = str(checkpoint_path /
                         'params_{:04}.pt'.format(start_epoch))
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path)

        optim_path = str(checkpoint_path /
                         'optim_{:04}.pt'.format(start_epoch))
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    margin_loss = MarginLoss(30)
    marginedge_loss = MarginLoss(1, 0.3)
    scheduler = optim.lr_scheduler.ExponentialLR(
        optimizer,
        gamma=cfg.TRAIN.LR_DECAY,
        last_epoch=cfg.TRAIN.START_EPOCH - 1)
    for epoch in range(start_epoch, num_epochs):
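        # ramp the matching score threshold up by 0.1 per epoch, capped at 0.5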
        score_thresh = min(epoch * 0.1, 0.5)
        print('Epoch {}/{}, score_thresh {}'.format(epoch, num_epochs - 1,
                                                     score_thresh))
        print('-' * 10)

        model.train()  # Set model to training mode

        print('lr = ' + ', '.join(
            ['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        epoch_loss = 0.0
        running_loss = 0.0
        running_since = time.time()
        iter_num = 0

        # Iterate over data.
        for inputs in dataloader['train']:
            data1, data2 = [_.cuda() for _ in inputs['images']]

            P1_gt, P2_gt = [_.cuda() for _ in inputs['Ps']]
            n1_gt, n2_gt = [_.cuda() for _ in inputs['ns']]

            weights = inputs['ws'].cuda()
            perm_mat = inputs['gt_perm_mat'].cuda()
            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward
                s_pred, d_pred, match_emb1, match_emb2, \
                    match_edgeemb1, match_edgeemb2, \
                    perm_mat, n1_gt, n2_gt = \
                    model(data1, data2, P1_gt, P2_gt, n1_gt, n2_gt,
                          perm_mat=perm_mat, score_thresh=score_thresh)

                multi_loss = []
                loss_lsm = criterion(s_pred, perm_mat, n1_gt, n2_gt, weights)

                loss_marg = margin_loss(match_emb1, match_emb2, perm_mat,
                                        n1_gt, n2_gt)
                loss_edgemarg = marginedge_loss(match_edgeemb1, match_edgeemb2,
                                                perm_mat, n1_gt, n2_gt)
                loss = 0.25 * (loss_marg + loss_edgemarg) + loss_lsm
                # backward + optimize
                loss.backward()
                optimizer.step()

                # tfboard writer
                loss_dict = {
                    'loss_{}'.format(i): l.item()
                    for i, l in enumerate(multi_loss)
                }
                loss_dict['loss'] = loss.item()
                tfboard_writer.add_scalars(
                    'loss', loss_dict,
                    epoch * cfg.TRAIN.EPOCH_ITERS + iter_num)
                # statistics
                running_loss += loss.item() * perm_mat.size(0)
                epoch_loss += loss.item() * perm_mat.size(0)

                if iter_num % cfg.STATISTIC_STEP == 0:
                    running_speed = cfg.STATISTIC_STEP * perm_mat.size(0) / (
                        time.time() - running_since)
                    print(
                        'Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f}'
                        .format(
                            epoch, iter_num, running_speed, running_loss /
                            cfg.STATISTIC_STEP / perm_mat.size(0)))
                    tfboard_writer.add_scalars(
                        'speed', {'speed': running_speed},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num)
                    running_loss = 0.0
                    running_since = time.time()

        epoch_loss = epoch_loss / dataset_size

        save_model(model,
                   str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(),
                   str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        print('Epoch {:<4} Loss: {:.4f}'.format(epoch, epoch_loss))
        print()

        # Eval in each epoch
        accs = eval_model(model, dataloader['test'], train_epoch=epoch)
        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model
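
Usage note: a minimal sketch of how this routine might be driven. GMNet, PermutationLoss and build_dataloader are hypothetical stand-ins for the project's own model, criterion and data pipeline; cfg.TRAIN.LR and cfg.TRAIN.NUM_EPOCHS are assumed config fields (only cfg.TRAIN.START_EPOCH appears above), while torch.optim.SGD and SummaryWriter are standard PyTorch.

import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

model = GMNet().cuda()                     # hypothetical matching network
criterion = PermutationLoss()              # hypothetical permutation loss
optimizer = optim.SGD(model.parameters(), lr=cfg.TRAIN.LR, momentum=0.9)
dataloader = {x: build_dataloader(cfg, x) for x in ('train', 'test')}
tfboard_writer = SummaryWriter(cfg.OUTPUT_PATH)

model = train_eval_model(model, criterion, optimizer, dataloader,
                         tfboard_writer,
                         num_epochs=cfg.TRAIN.NUM_EPOCHS,
                         resume=cfg.TRAIN.START_EPOCH != 0,
                         start_epoch=cfg.TRAIN.START_EPOCH)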
Example 2
def train_model(model,
                optimizer,
                dataloader,
                num_epochs=25,
                resume=False,
                start_epoch=0):
    print('Start training...')

    since = time.time()  # record the training start time
    dataset_size = len(dataloader['train'].dataset)

    # record the device the model parameters live on
    device = next(model.parameters()).device
    print('model on device: {}'.format(device))

    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'  # where model parameters are stored
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    if resume:  # resume training: continue optimizing from saved parameters
        assert start_epoch != 0
        model_path = str(checkpoint_path /
                         'params_{:04}.pt'.format(start_epoch))
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path)

        optim_path = str(checkpoint_path /
                         'optim_{:04}.pt'.format(start_epoch))
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    record_loss = []
    record_acc = []

    # training loop
    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        model.train()  # set model to training mode
        print('lr = ' + ', '.join(
            ['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        epoch_loss = 0.0
        running_loss = 0.0
        running_since = time.time()
        iter_num = 0

        # read a batch of training samples
        for data in dataloader['train']:
            input_A = data['input_A']
            D = data['D']
            Q = data['Q']

            iter_num = iter_num + 1
            optimizer.zero_grad()  # zero the parameter gradients

            with torch.set_grad_enabled(True):
                D_pred = model(input_A)  # forward: predict D from input A

                # reduce the squared error to a scalar so backward() works
                loss = ((D_pred - D) ** 2).mean()

                loss.backward()  # backpropagate
                optimizer.step()  # gradient-descent update

                # statistics
                running_loss += loss.item()
                epoch_loss += loss.item()
                record_loss.append(loss.item())

                if iter_num % cfg.STATISTIC_STEP == 0:
                    running_speed = cfg.STATISTIC_STEP / (time.time() -
                                                          running_since)
                    print(
                        'Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f}'
                        .format(epoch, iter_num, running_speed,
                                running_loss / cfg.STATISTIC_STEP))

                    running_loss = 0.0
                    running_since = time.time()
        epoch_loss = epoch_loss / dataset_size

        save_model(model,
                   str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(),
                   str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        print('Epoch {:<4} Loss: {:.4f}'.format(epoch, epoch_loss))
        print()
        # evaluate after each epoch
        accs, average_acc = eval_model(model, dataloader['test'])
        record_acc.append(average_acc.item())

    # plot training curves

    fig, axs = plt.subplots(1, 2)
    axs[0].plot(np.array(record_acc))
    axs[0].set_title('average acc')

    axs[1].plot(np.array(record_loss))
    axs[1].set_title('loss')

    plt.savefig('train.png')

    return model
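
A self-contained illustration of the update performed in the inner loop above, run on random tensors: nn.Linear and the shapes are placeholders, not the project's model. It also shows why the squared error is reduced with .mean() — backward() requires a scalar loss.

import torch
import torch.nn as nn

model = nn.Linear(8, 1)                    # toy stand-in for the real model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

input_A = torch.randn(16, 8)               # dummy batch of inputs A
D = torch.randn(16, 1)                     # dummy regression targets D

optimizer.zero_grad()
D_pred = model(input_A)
loss = ((D_pred - D) ** 2).mean()          # scalar loss, as required by backward()
loss.backward()
optimizer.step()
print('loss = {:.4f}'.format(loss.item()))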
Example 3
def train_eval_model(model,
                     criterion,
                     optimizer,
                     dataloader,
                     tfboard_writer,
                     num_epochs=25,
                     resume=False,
                     start_epoch=0):
    print('Start training...')

    since = time.time()
    dataset_size = len(dataloader['train'].dataset)
    displacement = Displacement()
    lap_solver = hungarian

    device = next(model.parameters()).device
    print('model on device: {}'.format(device))

    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    if resume:
        assert start_epoch != 0
        model_path = str(checkpoint_path / 'params_{:04}.pt'.format(start_epoch))
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path)

        optim_path = str(checkpoint_path / 'optim_{:04}.pt'.format(start_epoch))
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=cfg.TRAIN.LR_STEP,
                                               gamma=cfg.TRAIN.LR_DECAY,
                                               last_epoch=cfg.TRAIN.START_EPOCH - 1)

    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        model.train()  # Set model to training mode

        print('lr = ' + ', '.join(['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        epoch_loss = 0.0
        running_loss = 0.0
        running_since = time.time()
        iter_num = 0

        # Iterate over data.
        for inputs in dataloader['train']:
            if 'images' in inputs:
                data1, data2 = [_.cuda() for _ in inputs['images']]
                inp_type = 'img'
            elif 'features' in inputs:
                data1, data2 = [_.cuda() for _ in inputs['features']]
                inp_type = 'feat'
            else:
                raise ValueError('no valid data key (\'images\' or \'features\') found from dataloader!')
            P1_gt, P2_gt = [_.cuda() for _ in inputs['Ps']]
            n1_gt, n2_gt = [_.cuda() for _ in inputs['ns']]
            if 'es' in inputs:
                e1_gt, e2_gt = [_.cuda() for _ in inputs['es']]
                G1_gt, G2_gt = [_.cuda() for _ in inputs['Gs']]
                H1_gt, H2_gt = [_.cuda() for _ in inputs['Hs']]
                KG, KH = [_.cuda() for _ in inputs['Ks']]
            perm_mat = inputs['gt_perm_mat'].cuda()

            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward
                if 'es' in inputs:
                    s_pred, d_pred = \
                        model(data1, data2, P1_gt, P2_gt, G1_gt, G2_gt, H1_gt, H2_gt, n1_gt, n2_gt, KG, KH, inp_type)
                else:
                    s_pred, d_pred = \
                        model(data1, data2, P1_gt, P2_gt, n1_gt, n2_gt)

                multi_loss = []
                if cfg.TRAIN.LOSS_FUNC == 'offset':
                    d_gt, grad_mask = displacement(perm_mat, P1_gt, P2_gt, n1_gt)
                    loss = criterion(d_pred, d_gt, grad_mask)
                elif cfg.TRAIN.LOSS_FUNC == 'perm':
                    loss = criterion(s_pred, perm_mat, n1_gt, n2_gt)
                else:
                    raise ValueError('Unknown loss function {}'.format(cfg.TRAIN.LOSS_FUNC))

                # backward + optimize
                loss.backward()
                optimizer.step()

                if cfg.MODULE == 'NGM.hypermodel':
                    tfboard_writer.add_scalars(
                        'weight',
                        {'w2': model.module.weight2, 'w3': model.module.weight3},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                    )

                # training accuracy statistic
                acc, _, __ = matching_accuracy(lap_solver(s_pred, n1_gt, n2_gt), perm_mat, n1_gt)

                # tfboard writer
                loss_dict = {'loss_{}'.format(i): l.item() for i, l in enumerate(multi_loss)}
                loss_dict['loss'] = loss.item()
                tfboard_writer.add_scalars('loss', loss_dict, epoch * cfg.TRAIN.EPOCH_ITERS + iter_num)
                accdict = dict()
                accdict['matching accuracy'] = acc
                tfboard_writer.add_scalars(
                    'training accuracy',
                    accdict,
                    epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                )

                # statistics
                running_loss += loss.item() * perm_mat.size(0)
                epoch_loss += loss.item() * perm_mat.size(0)

                if iter_num % cfg.STATISTIC_STEP == 0:
                    running_speed = cfg.STATISTIC_STEP * perm_mat.size(0) / (time.time() - running_since)
                    print('Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f}'
                          .format(epoch, iter_num, running_speed, running_loss / cfg.STATISTIC_STEP / perm_mat.size(0)))
                    tfboard_writer.add_scalars(
                        'speed',
                        {'speed': running_speed},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                    )
                    running_loss = 0.0
                    running_since = time.time()

        epoch_loss = epoch_loss / dataset_size

        save_model(model, str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(), str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        print('Epoch {:<4} Loss: {:.4f}'.format(epoch, epoch_loss))
        print()

        # Eval in each epoch
        accs = eval_model(model, dataloader['test'])
        acc_dict = {"{}".format(cls): single_acc for cls, single_acc in zip(dataloader['train'].dataset.classes, accs)}
        acc_dict['average'] = torch.mean(accs)
        tfboard_writer.add_scalars(
            'Eval acc',
            acc_dict,
            (epoch + 1) * cfg.TRAIN.EPOCH_ITERS
        )

        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'
          .format(time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model
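
One detail worth noting in this variant: passing last_epoch=cfg.TRAIN.START_EPOCH - 1 to MultiStepLR keeps the milestone counter aligned with the true epoch index after a resume (the decayed learning rate itself comes back via optimizer.load_state_dict). A minimal sketch with illustrative milestones and gamma; the setdefault line covers PyTorch's requirement that each param group carry 'initial_lr' whenever last_epoch != -1:

import torch
import torch.optim as optim

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = optim.SGD(params, lr=1e-2)
# PyTorch expects 'initial_lr' in each param group when last_epoch != -1;
# a checkpointed optimizer restored with load_state_dict usually has it.
for group in optimizer.param_groups:
    group.setdefault('initial_lr', group['lr'])

start_epoch = 5                            # illustrative resume point
scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                           milestones=[3, 8],
                                           gamma=0.1,
                                           last_epoch=start_epoch - 1)
# scheduler.step() at the end of each epoch then continues the schedule
# from epoch `start_epoch` rather than restarting it from zero.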
Example 4
def train_eval_model(model,
                     permLoss,
                     optimizer,
                     dataloader,
                     num_epochs=25,
                     resume=False,
                     start_epoch=0,
                     viz=None,
                     savefiletime='time'):
    print('**************************************')
    print('Start training...')
    dataset_size = len(dataloader['train'].dataset)
    print('train datasize: {}'.format(dataset_size))

    since = time.time()
    lap_solver = hungarian
    optimal_acc = 0.0
    optimal_rot = np.inf
    device = next(model.parameters()).device

    print('model on device: {}'.format(device))

    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    if resume:
        assert start_epoch != 0
        model_path = str(checkpoint_path /
                         'params_{:04}.pt'.format(start_epoch))
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path)

        optim_path = str(checkpoint_path /
                         'optim_{:04}.pt'.format(start_epoch))
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=cfg.TRAIN.LR_STEP,
        gamma=cfg.TRAIN.LR_DECAY,
        last_epoch=cfg.TRAIN.START_EPOCH - 1)

    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        model.train()  # Set model to training mode

        print('lr = ' + ', '.join(
            ['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        iter_num = 0
        running_since = time.time()
        all_train_metrics_np = defaultdict(list)

        # Iterate over 3D data.
        for inputs in dataloader['train']:
            P1_gt, P2_gt = [_.cuda()
                            for _ in inputs['Ps']]  # keypoint coordinates
            n1_gt, n2_gt = [_.cuda() for _ in inputs['ns']]  # number of keypoints
            A1_gt, A2_gt = [_.cuda()
                            for _ in inputs['As']]  # adjacency (edge connectivity) matrices
            perm_mat = inputs['gt_perm_mat'].cuda()  # ground-truth permutation matrix
            T1_gt, T2_gt = [_.cuda() for _ in inputs['Ts']]  # rigid transforms (rotation + translation)
            Inlier_src_gt, Inlier_ref_gt = [_.cuda() for _ in inputs['Ins']]  # inlier masks

            batch_cur_size = perm_mat.size(0)
            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward
                s_pred, Inlier_src_pre, Inlier_ref_pre = model(
                    P1_gt, P2_gt, A1_gt, A2_gt, n1_gt, n2_gt)

                if cfg.DATASET.NOISE_TYPE == 'clean':
                    permloss = permLoss(s_pred, perm_mat, n1_gt, n2_gt)
                    loss = permloss
                else:
                    if cfg.PGM.USEINLIERRATE:
                        s_pred = Inlier_src_pre * s_pred * Inlier_ref_pre.transpose(
                            2, 1).contiguous()
                    permloss = permLoss(s_pred, perm_mat, n1_gt, n2_gt)
                    loss = permloss

                # backward + optimize
                loss.backward()
                optimizer.step()

                # training accuracy statistic
                s_perm_mat = lap_solver(s_pred, n1_gt, n2_gt, Inlier_src_pre,
                                        Inlier_ref_pre)
                match_metrics = matching_accuracy(s_perm_mat, perm_mat, n1_gt)
                perform_metrics = compute_metrics(s_perm_mat, P1_gt[:, :, :3],
                                                  P2_gt[:, :, :3],
                                                  T1_gt[:, :3, :3],
                                                  T1_gt[:, :3, 3])

                for k in match_metrics:
                    all_train_metrics_np[k].append(match_metrics[k])
                for k in perform_metrics:
                    all_train_metrics_np[k].append(perform_metrics[k])
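                # replicate the scalar loss once per sample so the per-sample
                # slicing below lines up (the 4 presumably matches the batch size)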
                all_train_metrics_np['loss'].append(np.repeat(loss.item(), 4))

                if iter_num % cfg.STATISTIC_STEP == 0:
                    running_speed = cfg.STATISTIC_STEP * batch_cur_size / (
                        time.time() - running_since)
                    print(
                        'Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f} GT-Acc:{:.4f} Pred-Acc:{:.4f}'
                        .format(
                            epoch, iter_num, running_speed,
                            np.mean(
                                np.concatenate(all_train_metrics_np['loss'])
                                [-cfg.STATISTIC_STEP * batch_cur_size:]),
                            np.mean(
                                np.concatenate(all_train_metrics_np['acc_gt'])
                                [-cfg.STATISTIC_STEP * batch_cur_size:]),
                            np.mean(
                                np.concatenate(
                                    all_train_metrics_np['acc_pred'])
                                [-cfg.STATISTIC_STEP * batch_cur_size:])))
                    running_since = time.time()

        all_train_metrics_np = {
            k: np.concatenate(all_train_metrics_np[k])
            for k in all_train_metrics_np
        }
        summary_metrics = summarize_metrics(all_train_metrics_np)
        print('Epoch {:<4} Mean-Loss: {:.4f} GT-Acc:{:.4f} Pred-Acc:{:.4f}'.
              format(epoch, summary_metrics['loss'], summary_metrics['acc_gt'],
                     summary_metrics['acc_pred']))
        print_metrics(summary_metrics)

        save_model(model,
                   str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(),
                   str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        # flip to True to dump the per-iteration training metrics to disk
        metric_is_save = False
        if metric_is_save:
            np.save(
                str(
                    Path(cfg.OUTPUT_PATH) /
                    ('train_log_' + savefiletime + '_metric')),
                all_train_metrics_np)

        if viz is not None:
            viz.update('train_loss', epoch, {'loss': summary_metrics['loss']})
            viz.update('train_acc', epoch, {'acc': summary_metrics['acc_gt']})
            viz.update(
                'train_metric', epoch, {
                    'r_mae': summary_metrics['r_mae'],
                    't_mae': summary_metrics['t_mae']
                })

        # Eval in each epoch
        val_metrics = eval_model(model, dataloader['val'])
        if viz is not None:
            viz.update('val_acc', epoch, {'acc': val_metrics['acc_gt']})
            viz.update('val_metric', epoch, {
                'r_mae': val_metrics['r_mae'],
                't_mae': val_metrics['t_mae']
            })
        if optimal_acc < val_metrics['acc_gt']:
            optimal_acc = val_metrics['acc_gt']
            print('Current best acc model is {}'.format(epoch + 1))
        if optimal_rot > val_metrics['r_mae']:
            optimal_rot = val_metrics['r_mae']
            print('Current best rotation model is {}'.format(epoch + 1))

        # Test in each epoch
        test_metrics = eval_model(model, dataloader['test'])
        if viz is not None:
            viz.update('test_acc', epoch, {'acc': test_metrics['acc_gt']})
            viz.update('test_metric', epoch, {
                'r_mae': test_metrics['r_mae'],
                't_mae': test_metrics['t_mae']
            })

        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model
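
The hungarian solver bound to lap_solver above is project code and is not shown here. A minimal batched stand-in built on SciPy's linear_sum_assignment is sketched below; the (batch, n1, n2) score layout and zero padding are assumptions, and the inlier arguments of the real solver are omitted.

import torch
from scipy.optimize import linear_sum_assignment

def hungarian_sketch(s_pred, n1_gt, n2_gt):
    """Binarize soft matching scores into 0/1 permutation matrices."""
    perm_mat = torch.zeros_like(s_pred)
    for b in range(s_pred.shape[0]):
        n1, n2 = int(n1_gt[b]), int(n2_gt[b])
        score = s_pred[b, :n1, :n2].detach().cpu().numpy()
        row, col = linear_sum_assignment(-score)  # maximize the total score
        perm_mat[b, torch.from_numpy(row), torch.from_numpy(col)] = 1.0
    return perm_mat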