Example #1
0
def main_stage1():
    """Stage-1 training: initialize the backbone, the two branches, and centroids.

    Builds a DFPNet, optionally resumes from ``args.stage1_resume``, trains for
    ``args.stage1_es`` epochs (skipped when ``args.evaluate``), optionally plots
    train/test features, then evaluates on the test set.

    Returns:
        dict: ``{"net": trained network, "distance": per-class distance results
        computed on the training set after the last epoch}``.
    """
    print("\nStart Stage-1 training ...\n")
    # for initializing backbone, two branches, and centroids.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim,
                 distance=args.distance, scaled=args.scaled)
    criterion = DFPLoss(alpha=args.alpha, beta=args.beta)
    optimizer = optim.SGD(net.parameters(), lr=args.stage1_lr, momentum=0.9, weight_decay=5e-4)

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    # Resume when requested and the checkpoint exists; otherwise fall through
    # to a fresh logger so `logger` is always bound (previously a missing
    # checkpoint file left `logger` undefined and crashed at logger.append).
    resumed = False
    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'), resume=True)
            resumed = True
        else:
            # BUGFIX: message previously printed args.resume (wrong attribute).
            print("=> no checkpoint found at '{}'".format(args.stage1_resume))
    if not resumed:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Softmax Loss', 'Within Loss', 'Between Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage1_es):
            adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=15)
            print('\nStage_1 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage1_train(net, trainloader, optimizer, criterion, device)
            save_model(net, epoch, os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
            logger.append([epoch + 1, train_out["train_loss"], train_out["cls_loss"], train_out["dis_loss_within"],
                           train_out["dis_loss_between"], train_out["accuracy"]])
            if args.plot:
                plot_feature(net, trainloader, device, args.plotfolder1, epoch=epoch,
                             plot_class_num=args.train_class_num, maximum=args.plot_max,
                             plot_quality=args.plot_quality, normalized=args.plot_normalized)
    if args.plot:
        # plot the test set (one extra class slot for the unknown/open class)
        plot_feature(net, testloader, device, args.plotfolder1, epoch="test",
                     plot_class_num=args.train_class_num + 1, maximum=args.plot_max,
                     plot_quality=args.plot_quality, normalized=args.plot_normalized)

    # calculating distances for last epoch
    distance_results = plot_distance(net, trainloader, device, args)

    logger.close()
    print("\nFinish Stage-1 training...\n")
    print("===> Evaluating ...")
    stage1_test(net, testloader, device)

    return {"net": net,
            "distance": distance_results
            }
Example #2
0
def main_stage1():
    """Stage-1 training: initialize the backbone, the two branches, and centroids.

    Builds a DFPNet (with similarity/decorrelation options), optionally resumes
    from ``args.stage1_resume``, trains for ``args.stage1_es`` epochs, computes
    per-class distance thresholds on the training set, and evaluates on the
    test set.

    Returns:
        dict: ``{"net": trained network, "distance": distance results (incl.
        thresholds) from the last epoch}``.
    """
    print("\nStart Stage-1 training ...\n")
    # for initializing backbone, two branches, and centroids.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch,
                 num_classes=args.train_class_num,
                 embed_dim=args.embed_dim,
                 distance=args.distance,
                 similarity=args.similarity,
                 scaled=args.scaled,
                 norm_centroid=args.norm_centroid,
                 decorrelation=args.decorrelation)

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    # Resume when requested and the checkpoint exists; otherwise fall through
    # to a fresh logger so `logger` is always bound (previously a missing
    # checkpoint file left `logger` undefined and crashed at logger.append).
    resumed = False
    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'),
                            resume=True)
            resumed = True
        else:
            # BUGFIX: message previously printed args.resume (wrong attribute).
            print("=> no checkpoint found at '{}'".format(args.stage1_resume))
    if not resumed:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance Loss',
            'Train Acc.'
        ])

    # after resume
    criterion = DFPLoss(alpha=args.alpha)
    optimizer = optim.SGD(net.parameters(),
                          lr=args.stage1_lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage1_es):
        adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=20)
        print('\nStage_1 Epoch: %d | Learning rate: %f ' %
              (epoch + 1, optimizer.param_groups[0]['lr']))
        train_out = stage1_train(net, trainloader, optimizer, criterion,
                                 device)
        save_model(net, epoch,
                   os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        logger.append([
            epoch + 1, train_out["train_loss"], train_out["loss_similarity"],
            train_out["loss_distance"], train_out["accuracy"]
        ])

    # calculating distances for last epoch
    distance_results = plot_distance(net, trainloader, device, args)

    logger.close()
    print("\nFinish Stage-1 training...\n")
    print("===> Evaluating ...")
    stage1_test(net, testloader, device)

    return {
        "net": net,
        "distance": distance_results,
    }
def main_stage2(stage1_dict):
    """Stage-2 training: fine-tune with distance thresholds from Stage-1.

    Takes the Stage-1 result dict, transfers its network and per-class distance
    thresholds (unless resuming from ``args.stage2_resume``), then trains for
    ``args.stage2_es`` epochs, refreshing the thresholds every 5 epochs.

    Args:
        stage1_dict: dict returned by ``main_stage1`` with keys ``"net"`` and
            ``"distance"`` (the latter containing ``"thresholds"``).

    Returns:
        the trained network.
    """
    print('==> Building stage2 model..')
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    net = DFPNet(backbone=args.arch,
                 num_classes=args.train_class_num,
                 embed_dim=args.embed_dim,
                 distance=args.distance,
                 similarity=args.similarity,
                 scaled=args.scaled,
                 norm_centroid=args.norm_centroid,
                 decorrelation=args.decorrelation)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    # Fresh training run: reuse the Stage-1 network and its thresholds.
    if not args.evaluate and not os.path.isfile(args.stage2_resume):
        net = stage1_dict['net']
        net = net.to(device)
        thresholds = stage1_dict['distance']['thresholds']
        net.module.set_threshold(thresholds.to(device))

    # Resume when requested and the checkpoint exists; otherwise fall through
    # to a fresh logger so `logger` is always bound (previously a missing
    # checkpoint file left `logger` undefined and crashed at logger.append).
    resumed = False
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            # Key name depends on whether the checkpoint was saved from a
            # DataParallel-wrapped model; was a bare `except:` before.
            try:
                thresholds = checkpoint['net']['thresholds']
            except KeyError:
                thresholds = checkpoint['net']['module.thresholds']
            net.module.set_threshold(thresholds.to(device))

            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
            resumed = True
        else:
            # BUGFIX: message previously printed args.resume (wrong attribute).
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
    if not resumed:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance in',
            'Distance out', 'Distance Center', 'Train Acc.'
        ])

    if args.evaluate:
        stage2_test(net, testloader, device)
        return net

    # after resume
    criterion = DFPLoss2(alpha=args.alpha, beta=args.beta, theta=args.theta)
    # BUGFIX: optimizer previously used args.stage1_lr (copy-paste); stage-2
    # schedule below adjusts from args.stage2_lr, so start from it too.
    optimizer = optim.SGD(net.parameters(),
                          lr=args.stage2_lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage2_es):
        print('\nStage_2 Epoch: %d   Learning rate: %f' %
              (epoch + 1, optimizer.param_groups[0]['lr']))
        # Here, I didn't set optimizers respectively, just for simplicity. Performance did not vary a lot.
        adjust_learning_rate(optimizer, epoch, args.stage2_lr, step=20)
        # Refresh the distance thresholds periodically from the training set.
        if epoch % 5 == 0:
            distance_results = plot_distance(net, trainloader, device, args)
            thresholds = distance_results['thresholds']
            net.module.set_threshold(thresholds.to(device))
        train_out = stage2_train(net, trainloader, optimizer, criterion,
                                 device)
        save_model(net, epoch,
                   os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
        stage2_test(net, testloader, device)

        logger.append([
            epoch + 1, train_out["train_loss"], train_out["loss_similarity"],
            train_out["distance_in"], train_out["distance_out"],
            train_out["distance_center"], train_out["accuracy"]
        ])

    print("\nFinish Stage-2 training...\n")

    logger.close()
    stage2_test(net, testloader, device)
    return net