def main_stage1():
    """Stage-1 training: initialize the backbone, the two branches, and centroids.

    Trains for ``args.stage1_es`` epochs (unless ``args.evaluate``), logs per-epoch
    losses/accuracy, optionally plots embeddings, then evaluates on the test set.

    Returns:
        dict with keys ``"net"`` (the trained model) and ``"distance"``
        (distance statistics computed on the training set after the last epoch).
    """
    # NOTE(review): a second `main_stage1` is defined later in this file and
    # shadows this definition at import time — confirm which version is intended.
    print(f"\nStart Stage-1 training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim,
                 distance=args.distance, scaled=args.scaled)
    criterion = DFPLoss(alpha=args.alpha, beta=args.beta)
    optimizer = optim.SGD(net.parameters(), lr=args.stage1_lr, momentum=0.9, weight_decay=5e-4)

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'), resume=True)
        else:
            # Fix: the message previously formatted `args.resume` (wrong attribute),
            # and `logger` was left undefined, crashing the later set_names/append/close.
            print("=> no checkpoint found at '{}'".format(args.stage1_resume))
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
            logger.set_names(['Epoch', 'Train Loss', 'Softmax Loss', 'Within Loss',
                              'Between Loss', 'Train Acc.'])
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Softmax Loss', 'Within Loss',
                          'Between Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage1_es):
            adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=15)
            print('\nStage_1 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage1_train(net, trainloader, optimizer, criterion, device)
            save_model(net, epoch, os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
            logger.append([epoch + 1, train_out["train_loss"], train_out["cls_loss"],
                           train_out["dis_loss_within"], train_out["dis_loss_between"],
                           train_out["accuracy"]])
            if args.plot:
                plot_feature(net, trainloader, device, args.plotfolder1, epoch=epoch,
                             plot_class_num=args.train_class_num, maximum=args.plot_max,
                             plot_quality=args.plot_quality, normalized=args.plot_normalized)

    if args.plot:
        # plot the test set; +1 class bucket presumably for the unknown/open class — TODO confirm
        plot_feature(net, testloader, device, args.plotfolder1,
                     epoch="test", plot_class_num=args.train_class_num + 1, maximum=args.plot_max,
                     plot_quality=args.plot_quality, normalized=args.plot_normalized)

    # calculating distances for last epoch
    distance_results = plot_distance(net, trainloader, device, args)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")

    print("===> Evaluating ...")
    stage1_test(net, testloader, device)
    return {"net": net, "distance": distance_results}
def main_stage1():
    """Stage-1 training (similarity/decorrelation variant): initialize the
    backbone, the two branches, and centroids.

    Trains for ``args.stage1_es`` epochs with ``DFPLoss``, logging the
    similarity loss, distance loss, and accuracy per epoch, then evaluates.

    Returns:
        dict with keys ``"net"`` (the trained model) and ``"distance"``
        (distance statistics computed on the training set after the last epoch).
    """
    print(f"\nStart Stage-1 training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim,
                 distance=args.distance, similarity=args.similarity, scaled=args.scaled,
                 norm_centroid=args.norm_centroid, decorrelation=args.decorrelation)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'), resume=True)
        else:
            # Fix: the message previously formatted `args.resume` (wrong attribute),
            # and `logger` was left undefined, crashing the later append/close.
            print("=> no checkpoint found at '{}'".format(args.stage1_resume))
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
            logger.set_names([
                'Epoch', 'Train Loss', 'Similarity Loss', 'Distance Loss', 'Train Acc.'
            ])
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance Loss', 'Train Acc.'
        ])

    # after resume
    criterion = DFPLoss(alpha=args.alpha)
    optimizer = optim.SGD(net.parameters(), lr=args.stage1_lr, momentum=0.9, weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage1_es):
        adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=20)
        print('\nStage_1 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
        train_out = stage1_train(net, trainloader, optimizer, criterion, device)
        save_model(net, epoch, os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        logger.append([
            epoch + 1, train_out["train_loss"], train_out["loss_similarity"],
            train_out["loss_distance"], train_out["accuracy"]
        ])

    # calculating distances for last epoch
    distance_results = plot_distance(net, trainloader, device, args)
    # print(f"the distance thresholds are\n {distance_results['thresholds']}\n")
    # gap_results = plot_gap(net, trainloader, device, args)
    # stat = get_gap_stat(net, trainloader, device, args)
    # estimator = CGD_estimator(gap_results)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")

    print("===> Evaluating ...")
    stage1_test(net, testloader, device)
    return {
        "net": net,
        "distance": distance_results,
        # "stat": stat
    }
def main_stage2(stage1_dict):
    """Stage-2 training: fine-tune the Stage-1 model with the open-set
    distance thresholds attached.

    Args:
        stage1_dict: dict returned by ``main_stage1`` — must contain ``"net"``
            and ``"distance"`` (with a ``'thresholds'`` tensor).

    Returns:
        The trained (or evaluated) network.
    """
    print('==> Building stage2 model..')
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim,
                 distance=args.distance, similarity=args.similarity, scaled=args.scaled,
                 norm_centroid=args.norm_centroid, decorrelation=args.decorrelation)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    # Fresh run: start from the Stage-1 weights and thresholds.
    if not args.evaluate and not os.path.isfile(args.stage2_resume):
        net = stage1_dict['net']
        net = net.to(device)
        thresholds = stage1_dict['distance']['thresholds']
        # stat = stage1_dict["stat"]
        net.module.set_threshold(thresholds.to(device))

    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            # Fix: was a bare `except:` that swallowed every error. Only the
            # missing-key case is expected (DataParallel prefixes keys with 'module.').
            try:
                thresholds = checkpoint['net']['thresholds']
            except KeyError:
                thresholds = checkpoint['net']['module.thresholds']
            net.module.set_threshold(thresholds.to(device))
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'), resume=True)
        else:
            # Fix: the message previously formatted `args.resume` (wrong attribute),
            # and `logger` was left undefined, crashing the later append/close.
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
            logger.set_names([
                'Epoch', 'Train Loss', 'Similarity Loss', 'Distance in', 'Distance out',
                'Distance Center', 'Train Acc.'
            ])
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance in', 'Distance out',
            'Distance Center', 'Train Acc.'
        ])

    if args.evaluate:
        stage2_test(net, testloader, device)
        return net

    # after resume
    criterion = DFPLoss2(alpha=args.alpha, beta=args.beta, theta=args.theta)
    # Fix: was initialized with args.stage1_lr; adjust_learning_rate below rebases
    # on args.stage2_lr every epoch, so stage2_lr is the intended base rate.
    optimizer = optim.SGD(net.parameters(), lr=args.stage2_lr, momentum=0.9, weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage2_es):
        print('\nStage_2 Epoch: %d Learning rate: %f' % (epoch + 1, optimizer.param_groups[0]['lr']))
        # Here, I didn't set optimizers respectively, just for simplicity. Performance did not vary a lot.
        adjust_learning_rate(optimizer, epoch, args.stage2_lr, step=20)
        if epoch % 5 == 0:
            # Periodically refresh the class-distance thresholds from the training set.
            distance_results = plot_distance(net, trainloader, device, args)
            thresholds = distance_results['thresholds']
            net.module.set_threshold(thresholds.to(device))
        train_out = stage2_train(net, trainloader, optimizer, criterion, device)
        save_model(net, epoch, os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
        stage2_test(net, testloader, device)
        # stat = get_gap_stat(net2, trainloader, device, args)
        logger.append([
            epoch + 1, train_out["train_loss"], train_out["loss_similarity"],
            train_out["distance_in"], train_out["distance_out"],
            train_out["distance_center"], train_out["accuracy"]
        ])

    print(f"\nFinish Stage-2 training...\n")
    logger.close()
    stage2_test(net, testloader, device)
    return net