def main_stage1():
    """Stage-1 training: fit the backbone with a plain dot-product classifier.

    Builds ``Network``, optionally resumes from ``args.stage1_resume``,
    trains for ``args.stage1_es`` epochs with SGD + cross-entropy while
    logging per-epoch stats, and returns the trained net.

    Returns:
        The trained network (possibly wrapped in ``DataParallel``).
    """
    print(f"\nStart Stage-1 training...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = Network(backbone=args.arch, embed_dim=512,
                  num_classes=args.train_class_num, use_fc=False,
                  attmodule=False, classifier='dotproduct',
                  backbone_fc=False, data_shape=4)
    # net = models.__dict__[args.arch](num_classes=args.train_class_num)  # CIFAR 100
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume and os.path.isfile(args.stage1_resume):
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.stage1_resume)
        net.load_state_dict(checkpoint['net'])
        # best_acc = checkpoint['acc']
        # print("BEST_ACCURACY: "+str(best_acc))
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'),
                        resume=True)
    else:
        if args.stage1_resume:
            # BUG FIX: the message previously interpolated args.resume.
            print("=> no checkpoint found at '{}'".format(args.stage1_resume))
        # BUG FIX: a logger is now always created, so the training loop
        # below cannot hit a NameError when the checkpoint file is missing.
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Learning Rate', 'Train Loss', 'Train Acc.'])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
                          weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage1_es):
        print('\nStage_1 Epoch: %d Learning rate: %f'
              % (epoch + 1, optimizer.param_groups[0]['lr']))
        adjust_learning_rate(optimizer, epoch, args.lr, step=10)
        train_loss, train_acc = stage1_train(net, trainloader, optimizer,
                                             criterion, device)
        save_model(net, None, epoch,
                   os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        logger.append([epoch + 1, optimizer.param_groups[0]['lr'],
                       train_loss, train_acc])

    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    return net
def main_stage1():
    """Stage-1 training for DFPNet: initialize backbone, the two branches,
    and the class centroids.

    Trains with ``DFPLoss`` for ``args.stage1_es`` epochs, optionally plots
    feature spaces, computes per-class distance statistics on the train set,
    and evaluates on the test set.

    Returns:
        dict with keys ``"net"`` (trained model) and ``"distance"``
        (output of ``plot_distance`` for the last epoch).
    """
    print(f"\nStart Stage-1 training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num,
                 embed_dim=args.embed_dim, distance=args.distance,
                 scaled=args.scaled)
    criterion = DFPLoss(alpha=args.alpha, beta=args.beta)
    optimizer = optim.SGD(net.parameters(), lr=args.stage1_lr,
                          momentum=0.9, weight_decay=5e-4)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume and os.path.isfile(args.stage1_resume):
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.stage1_resume)
        net.load_state_dict(checkpoint['net'])
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'),
                        resume=True)
    else:
        if args.stage1_resume:
            # BUG FIX: message previously interpolated args.resume.
            print("=> no checkpoint found at '{}'".format(args.stage1_resume))
        # BUG FIX: logger is always created so later logger.append/close
        # cannot raise NameError when the checkpoint file is missing.
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Softmax Loss',
                          'Within Loss', 'Between Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage1_es):
            adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=15)
            print('\nStage_1 Epoch: %d | Learning rate: %f '
                  % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage1_train(net, trainloader, optimizer, criterion,
                                     device)
            save_model(net, epoch,
                       os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
            logger.append([epoch + 1, train_out["train_loss"],
                           train_out["cls_loss"],
                           train_out["dis_loss_within"],
                           train_out["dis_loss_between"],
                           train_out["accuracy"]])
            if args.plot:
                plot_feature(net, trainloader, device, args.plotfolder1,
                             epoch=epoch,
                             plot_class_num=args.train_class_num,
                             maximum=args.plot_max,
                             plot_quality=args.plot_quality,
                             normalized=args.plot_normalized)

    if args.plot:
        # plot the test set (unknown class included, hence +1)
        plot_feature(net, testloader, device, args.plotfolder1,
                     epoch="test",
                     plot_class_num=args.train_class_num + 1,
                     maximum=args.plot_max,
                     plot_quality=args.plot_quality,
                     normalized=args.plot_normalized)

    # calculating distances for last epoch
    distance_results = plot_distance(net, trainloader, device, args)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    print("===> Evaluating ...")
    stage1_test(net, testloader, device)
    return {"net": net, "distance": distance_results}
def main_stage2(net, mid_energy):
    """Stage-2 fine-tuning of a stage-1-trained net.

    Args:
        net: the model produced by stage-1 training.
        mid_energy: unused here in the visible code; kept for interface
            compatibility with callers.
    """
    print("Starting stage-2 fine-tuning ...")
    # BUG FIX: start_epoch was only assigned inside the resume branch,
    # so a fresh run crashed with NameError at range(start_epoch, ...).
    start_epoch = 0

    if args.stage2_resume and os.path.isfile(args.stage2_resume):
        # BUG FIX: the original checked args.stage2_resume but then loaded
        # args.stage1_resume; the stage-2 checkpoint is now used consistently.
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.stage2_resume)
        net.load_state_dict(checkpoint['net'])
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                        resume=True)
    else:
        if args.stage2_resume:
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
        # BUG FIX: stage-2 now logs to its own file instead of clobbering
        # the stage-1 log, and a logger is always created.
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Train Acc.'])

    # after resume
    # NOTE(review): the original reused the stage-1 hyperparameters
    # (stage1_lr, stage1_es, stage1_train); kept as-is since dedicated
    # stage-2 args may not exist -- confirm against the argparse setup.
    criterion = DFPLoss(temperature=args.temperature)
    optimizer = torch.optim.SGD(net.parameters(), lr=args.stage1_lr,
                                momentum=0.9, weight_decay=5e-4)

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage1_es):
            adjust_learning_rate(optimizer, epoch, args.stage1_lr,
                                 factor=args.stage1_lr_factor,
                                 step=args.stage1_lr_step)
            # BUG FIX: progress label said "Stage_1" in stage-2 code.
            print('\nStage_2 Epoch: %d | Learning rate: %f '
                  % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage1_train(net, trainloader, optimizer, criterion,
                                     device)
            # BUG FIX: saved to stage_1_last_model.pth, overwriting the
            # stage-1 checkpoint; stage-2 now keeps its own file.
            save_model(net, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            logger.append([epoch + 1, train_out["train_loss"],
                           train_out["accuracy"]])
            if args.plot:
                plot_feature(net, args, trainloader, device, args.plotfolder,
                             epoch=epoch,
                             plot_class_num=args.train_class_num,
                             plot_quality=args.plot_quality)
                plot_feature(net, args, testloader, device, args.plotfolder,
                             epoch="test" + str(epoch),
                             plot_class_num=args.train_class_num + 1,
                             plot_quality=args.plot_quality, testmode=True)

    logger.close()
    print(f"\nFinish Stage-2 training...\n")
def main():
    """Train BuildNet and log per-epoch train stats plus test F1/threshold."""
    print(f"\nStart training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    print('==> Building model..')
    net = BuildNet(backbone=args.arch, num_classes=args.train_class_num,
                   embed_dim=args.embed_dim)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    optimizer = torch.optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
                                weight_decay=5e-4)

    if args.resume and os.path.isfile(args.resume):
        # Load checkpoint (model + optimizer state + epoch counter).
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.resume)
        net.load_state_dict(checkpoint['net'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
    else:
        if args.resume:
            print("=> no checkpoint found at '{}'".format(args.resume))
        # BUG FIX: logger is now always created, so logger.append/close
        # below cannot raise NameError when the checkpoint file is missing.
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Train Acc.',
                          "Test F1", 'threshold'])

    # NOTE(review): `criterion` is not defined in this function -- it is
    # presumably a module-level global; confirm it exists before train/test.
    if not args.evaluate:
        for epoch in range(start_epoch, args.es):
            adjust_learning_rate(optimizer, epoch, args.lr,
                                 factor=args.lr_factor, step=args.lr_step)
            print('\nEpoch: %d | Learning rate: %f '
                  % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = train(net, trainloader, optimizer, criterion, device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'last_model.pth'))
            test_out = test(net, testloader, criterion, device)
            logger.append([epoch + 1, train_out["train_loss"],
                           train_out["accuracy"], test_out["best_F1"],
                           test_out["best_thres"]])
        logger.close()
        print(f"\nFinish training...\n")
    else:
        print("===> Evaluating ...")
        test(net, testloader, criterion, device)
def main_stage1():
    """Stage-1 training for DFPNet with temperature-scaled DFPLoss.

    Trains, evaluates with both energy-based and softmax-based open-set
    classification, and returns the state dict plus the mixup-derived
    mid-known / mid-unknown statistics used by stage 2.

    Returns:
        dict with keys ``"net"`` (state_dict), ``"mid_known"``,
        ``"mid_unknown"``.
    """
    print(f"\nStart Stage-1 training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num,
                 embed_dim=args.embed_dim, p=args.p)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    criterion = DFPLoss(temperature=args.temperature)
    optimizer = torch.optim.SGD(net.parameters(), lr=args.stage1_lr,
                                momentum=0.9, weight_decay=5e-4)

    if args.stage1_resume and os.path.isfile(args.stage1_resume):
        # Load checkpoint (model + optimizer state + epoch counter).
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.stage1_resume)
        net.load_state_dict(checkpoint['net'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'),
                        resume=True)
    else:
        if args.stage1_resume:
            # BUG FIX: message previously interpolated args.resume.
            print("=> no checkpoint found at '{}'".format(args.stage1_resume))
        # BUG FIX: logger always created to avoid NameError downstream.
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage1_es):
            adjust_learning_rate(optimizer, epoch, args.stage1_lr,
                                 factor=args.stage1_lr_factor,
                                 step=args.stage1_lr_step)
            print('\nStage_1 Epoch: %d | Learning rate: %f '
                  % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage1_train(net, trainloader, optimizer, criterion,
                                     device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
            logger.append([epoch + 1, train_out["train_loss"],
                           train_out["accuracy"]])
            if args.plot:
                plot_feature(net, args, trainloader, device, args.plotfolder,
                             epoch=epoch,
                             plot_class_num=args.train_class_num,
                             plot_quality=args.plot_quality)
                plot_feature(net, args, testloader, device, args.plotfolder,
                             epoch="test" + str(epoch),
                             plot_class_num=args.train_class_num + 1,
                             plot_quality=args.plot_quality, testmode=True)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")

    print("===> Evaluating stage-1 ...")
    stage_test(net, testloader, device)
    mid_dict = stage_valmixup(net, trainloader, device)
    print("===> stage1 energy based classification")
    stage_evaluate(net, testloader, mid_dict["mid_unknown"].item(),
                   mid_dict["mid_known"].item(), feature="energy")
    print("===> stage1 softmax based classification")
    stage_evaluate(net, testloader, 0., 1., feature="normweight_fea2cen")
    return {
        "net": net.state_dict(),
        "mid_known": mid_dict["mid_known"],
        "mid_unknown": mid_dict["mid_unknown"],
    }
def main_stage1():
    """Stage-1 training for DFPNet (scaling variant) with Adam.

    Returns:
        dict with key ``"net"`` (the trained model).
    """
    print(f"\nStart Stage-1 training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num,
                 embed_dim=args.embed_dim)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume and os.path.isfile(args.stage1_resume):
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.stage1_resume)
        net.load_state_dict(checkpoint['net'])
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'),
                        resume=True)
    else:
        if args.stage1_resume:
            # BUG FIX: message previously interpolated args.resume.
            print("=> no checkpoint found at '{}'".format(args.stage1_resume))
        # BUG FIX: logger always created to avoid NameError downstream.
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Train Acc.'])

    # after resume
    criterion = DFPLoss(scaling=args.scaling)
    optimizer = optim.Adam(net.parameters(), lr=args.stage1_lr)

    for epoch in range(start_epoch, args.stage1_es):
        adjust_learning_rate(optimizer, epoch, args.stage1_lr,
                             factor=0.2, step=20)
        print('\nStage_1 Epoch: %d | Learning rate: %f '
              % (epoch + 1, optimizer.param_groups[0]['lr']))
        train_out = stage1_train(net, trainloader, optimizer, criterion,
                                 device)
        save_model(net, epoch,
                   os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        logger.append([epoch + 1, train_out["train_loss"],
                       train_out["accuracy"]])
        if args.plot:
            plot_feature(net, args, trainloader, device, args.plotfolder,
                         epoch=epoch,
                         plot_class_num=args.train_class_num,
                         plot_quality=args.plot_quality)
            plot_feature(net, args, testloader, device, args.plotfolder,
                         epoch="test" + str(epoch),
                         plot_class_num=args.train_class_num + 1,
                         plot_quality=args.plot_quality, testmode=True)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    print("===> Evaluating ...")
    stage1_test(net, testloader, device)
    return {"net": net}
def main_stage1():
    """Stage-1 training for DFPNet (cosine-weight variant) with a pure
    distance loss.

    Returns:
        The trained network (possibly wrapped in ``DataParallel``).
    """
    print(f"\nStart Stage-1 training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num,
                 embed_dim=args.embed_dim, distance=args.distance,
                 scaled=args.scaled, cosine_weight=args.cosine_weight)
    # embed_dim = net.feat_dim if not args.embed_dim else args.embed_dim
    # criterion_cls = nn.CrossEntropyLoss()
    criterion_dis = DFPLoss(beta=args.beta, sigma=args.sigma)
    optimizer = optim.SGD(net.parameters(), lr=args.stage1_lr,
                          momentum=0.9, weight_decay=5e-4)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume and os.path.isfile(args.stage1_resume):
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.stage1_resume)
        net.load_state_dict(checkpoint['net'])
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'),
                        resume=True)
    else:
        if args.stage1_resume:
            # BUG FIX: message previously interpolated args.resume.
            print("=> no checkpoint found at '{}'".format(args.stage1_resume))
        # BUG FIX: logger always created to avoid NameError downstream.
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Softmax Loss',
                          'Distance Loss', 'Within Loss', 'Between Loss',
                          'Cen2cen Loss', 'Train Acc.'])

    # Note: on resume this trains a further stage1_es epochs
    # (range start is start_epoch, end is start_epoch + stage1_es).
    for epoch in range(start_epoch, start_epoch + args.stage1_es):
        print('\nStage_1 Epoch: %d | Learning rate: %f '
              % (epoch + 1, optimizer.param_groups[0]['lr']))
        adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=15)
        train_out = stage1_train(net, trainloader, optimizer, criterion_dis,
                                 device)
        save_model(net, epoch,
                   os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        # Columns: ['Epoch', 'Train Loss', 'Softmax Loss', 'Distance Loss',
        #           'Within Loss', 'Between Loss', 'Cen2cen loss', 'Train Acc.']
        # Softmax loss is logged as 0.0 -- only the distance loss is used.
        logger.append([epoch + 1, train_out["train_loss"], 0.0,
                       train_out["dis_loss_total"],
                       train_out["dis_loss_within"],
                       train_out["dis_loss_between"],
                       train_out["dis_loss_cen2cen"],
                       train_out["accuracy"]])
        if args.plot:
            plot_feature(net, trainloader, device, args.plotfolder,
                         epoch=epoch,
                         plot_class_num=args.train_class_num,
                         maximum=args.plot_max,
                         plot_quality=args.plot_quality)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    return net
def main_stage2(net, mid_known, mid_unknown):
    """Stage-2 fine-tuning with an energy-based FinetuneLoss.

    Args:
        net: stage-1-trained model.
        mid_known: mid energy statistic for known samples (from stage 1).
        mid_unknown: mid energy statistic for unknown/mixup samples.
    """
    print("Starting stage-2 fine-tuning ...")
    start_epoch = 0
    criterion = FinetuneLoss(mid_known=mid_known, mid_unknown=mid_unknown,
                             gamma=args.gamma, temperature=args.temperature,
                             feature='energy')
    criterion = criterion.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=args.stage2_lr,
                                momentum=0.9, weight_decay=5e-4)

    if args.stage2_resume and os.path.isfile(args.stage2_resume):
        # Load checkpoint (model + optimizer state + epoch counter).
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.stage2_resume)
        net.load_state_dict(checkpoint['net'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                        resume=True)
    else:
        if args.stage2_resume:
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
        # BUG FIX: logger is now always created, so the training loop
        # cannot hit a NameError when the checkpoint file is missing.
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Class Loss', 'Energy Loss',
                          'Energy Known', 'Energy Unknown', 'Train Acc.',
                          "Test F1"])

    if not args.evaluate:
        best_F1_list = []
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer, epoch, args.stage2_lr,
                                 factor=args.stage2_lr_factor,
                                 step=args.stage2_lr_step)
            print('\nStage_2 Epoch: %d | Learning rate: %f '
                  % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net, trainloader, optimizer, criterion,
                                     device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            # test_out = test_with_hist(net, testloader, device, name=f"stage2_test{epoch}")
            test_out = test(net, testloader, device)
            # stage_valmixup(net, trainloader, device, name=f"stage2_mixup{epoch}")
            logger.append([epoch + 1, train_out["train_loss"],
                           train_out["loss_classification"],
                           train_out["loss_energy"],
                           train_out["loss_energy_known"],
                           train_out["loss_energy_unknown"],
                           train_out["accuracy"],
                           test_out["best_F1"]])
            best_F1_list.append(test_out["best_F1"])

        logger.close()
        print(f"\nFinish Stage-2 training...\n")
        # Summarize F1 over the final five epochs.
        last_five = np.array(best_F1_list[-5:])
        print(f"\nGamma:{args.gamma} | F1_mean: {last_five.mean()} "
              f"| F1_std: {last_five.std()}")
def main_stage1():
    """Stage-1 training for DFPNet (similarity/decorrelation variant).

    Trains with ``DFPLoss(alpha=...)``, computes per-class distance
    statistics on the train set for the last epoch, and evaluates.

    Returns:
        dict with keys ``"net"`` and ``"distance"``.
    """
    print(f"\nStart Stage-1 training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num,
                 embed_dim=args.embed_dim, distance=args.distance,
                 similarity=args.similarity, scaled=args.scaled,
                 norm_centroid=args.norm_centroid,
                 decorrelation=args.decorrelation)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume and os.path.isfile(args.stage1_resume):
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.stage1_resume)
        net.load_state_dict(checkpoint['net'])
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'),
                        resume=True)
    else:
        if args.stage1_resume:
            # BUG FIX: message previously interpolated args.resume.
            print("=> no checkpoint found at '{}'".format(args.stage1_resume))
        # BUG FIX: logger always created to avoid NameError downstream.
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Similarity Loss',
                          'Distance Loss', 'Train Acc.'])

    # after resume
    criterion = DFPLoss(alpha=args.alpha)
    optimizer = optim.SGD(net.parameters(), lr=args.stage1_lr,
                          momentum=0.9, weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage1_es):
        adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=20)
        print('\nStage_1 Epoch: %d | Learning rate: %f '
              % (epoch + 1, optimizer.param_groups[0]['lr']))
        train_out = stage1_train(net, trainloader, optimizer, criterion,
                                 device)
        save_model(net, epoch,
                   os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        logger.append([epoch + 1, train_out["train_loss"],
                       train_out["loss_similarity"],
                       train_out["loss_distance"],
                       train_out["accuracy"]])

    # calculating distances for last epoch
    distance_results = plot_distance(net, trainloader, device, args)
    # print(f"the distance thresholds are\n {distance_results['thresholds']}\n")
    # gap_results = plot_gap(net, trainloader, device, args)
    # stat = get_gap_stat(net, trainloader, device, args)
    # estimator = CGD_estimator(gap_results)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    print("===> Evaluating ...")
    stage1_test(net, testloader, device)
    return {
        "net": net,
        "distance": distance_results,
        # "stat": stat
    }
def main():
    """Train BuildNet and evaluate against a sweep of test-class counts.

    One logger per ``test_class_num`` in
    ``[args.train_class_num, args.test_class_num]``; after every training
    epoch the model is tested on a freshly built CIFAR10 open-set split for
    each count and the F1 variants are appended to that count's log.
    """
    print(f"\nStart training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    print('==> Building model..')
    net = BuildNet(backbone=args.arch, num_classes=args.train_class_num,
                   embed_dim=args.embed_dim)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    optimizer = torch.optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
                                weight_decay=5e-4)

    if args.resume and os.path.isfile(args.resume):
        # Load checkpoint and reopen the per-class-count loggers in
        # append mode.
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.resume)
        net.load_state_dict(checkpoint['net'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        loggerList = [
            Logger(os.path.join(args.checkpoint, f'log{i}.txt'), resume=True)
            for i in range(args.train_class_num, args.test_class_num + 1)
        ]
    else:
        if args.resume:
            print("=> no checkpoint found at '{}'".format(args.resume))
        # BUG FIX: loggerList is now always created, so the training loop
        # cannot hit a NameError when the checkpoint file is missing.
        loggerList = []
        for i in range(args.train_class_num, args.test_class_num + 1):
            logger = Logger(os.path.join(args.checkpoint, f'log{i}.txt'))
            logger.set_names(['Epoch', 'Train Loss', 'Train Acc.',
                              "Pos-F1", 'Norm-F1', 'Energy-F1'])
            loggerList.append(logger)

    # NOTE(review): `criterion` is not defined in this function -- it is
    # presumably a module-level global; confirm before train/test.
    if not args.evaluate:
        for epoch in range(start_epoch, args.es):
            adjust_learning_rate(optimizer, epoch, args.lr,
                                 factor=args.lr_factor, step=args.lr_step)
            print('\nEpoch: %d | Learning rate: %f '
                  % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = train(net, trainloader, optimizer, criterion, device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'last_model.pth'))
            for test_class_num in range(args.train_class_num,
                                        args.test_class_num + 1):
                testset = CIFAR10(
                    root='../../data', train=False, download=True,
                    transform=transform_test,
                    train_class_num=args.train_class_num,
                    test_class_num=test_class_num,
                    includes_all_train_class=args.includes_all_train_class)
                testloader = torch.utils.data.DataLoader(
                    testset, batch_size=args.bs, shuffle=False, num_workers=4)
                test_out = test(net, testloader, criterion, device)
                logger = loggerList[test_class_num - args.train_class_num]
                logger.append([epoch + 1, train_out["train_loss"],
                               train_out["accuracy"],
                               test_out["best_F1_possibility"],
                               test_out["best_F1_norm"],
                               test_out["best_F1_energy"]])

    # BUG FIX: previously only the last logger was closed; close them all.
    for logger in loggerList:
        logger.close()
    print(f"\nFinish training...\n")
def main_stage2(net, mid_known, mid_unknown):
    """Stage-2 fine-tuning with DFPNormLoss anchored at scaled mid energies.

    Args:
        net: stage-1-trained model.
        mid_known: mid energy statistic for known samples (scaled by 1.3).
        mid_unknown: mid energy statistic for unknown samples (scaled by 0.7).
    """
    print("Starting stage-2 fine-tuning ...")
    start_epoch = 0
    criterion = DFPNormLoss(mid_known=1.3 * mid_known,
                            mid_unknown=0.7 * mid_unknown,
                            alpha=args.alpha, temperature=args.temperature,
                            feature='energy')
    optimizer = torch.optim.SGD(net.parameters(), lr=args.stage2_lr,
                                momentum=0.9, weight_decay=5e-4)

    if args.stage2_resume and os.path.isfile(args.stage2_resume):
        # Load checkpoint (model + optimizer state + epoch counter).
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.stage2_resume)
        net.load_state_dict(checkpoint['net'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                        resume=True)
    else:
        if args.stage2_resume:
            # BUG FIX: message previously interpolated args.resume.
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
        # BUG FIX: logger always created to avoid NameError downstream.
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Class Loss', 'Energy Loss',
                          'Energy Known', 'Energy Unknown', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer, epoch, args.stage2_lr,
                                 factor=args.stage2_lr_factor,
                                 step=args.stage2_lr_step)
            print('\nStage_2 Epoch: %d | Learning rate: %f '
                  % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net, trainloader, optimizer, criterion,
                                     device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            logger.append([epoch + 1, train_out["train_loss"],
                           train_out["loss_classification"],
                           train_out["loss_energy"],
                           train_out["loss_energy_known"],
                           train_out["loss_energy_unknown"],
                           train_out["accuracy"]])
            if args.plot:
                plot_feature(net, args, trainloader, device, args.plotfolder,
                             epoch="stage2_" + str(epoch),
                             plot_class_num=args.train_class_num,
                             plot_quality=args.plot_quality)
                plot_feature(net, args, testloader, device, args.plotfolder,
                             epoch="stage2_test" + str(epoch),
                             plot_class_num=args.train_class_num + 1,
                             plot_quality=args.plot_quality, testmode=True)

    logger.close()
    print(f"\nFinish Stage-2 training...\n")
    print("===> Evaluating stage-2 ...")
    stage_test(net, testloader, device, name="stage2_test_doublebar")
    stage_valmixup(net, trainloader, device, name="stage2_mixup_result")
# NOTE(review): fragment of a larger RL evaluation loop -- the enclosing
# function/loop is outside this view; names such as cfg, writer, eval_i,
# eval_mean_reward, eval_total_profit, eval_acc_profit, actions, start_date,
# logger and agent are defined by the surrounding code.
if cfg.TENSORBOARD_SAVE:
    # Mirror the per-episode validation metrics to TensorBoard,
    # indexed by the evaluation-episode counter.
    writer.add_scalar('eval_mean_reward', eval_mean_reward, eval_i)
    writer.add_scalar('eval_profit', eval_total_profit, eval_i)
    writer.add_scalar('eval_acc_profit', eval_acc_profit, eval_i)
# print("Validation episode", eval_i, "ended. Mean reward =", eval_mean_reward, "| Total profit =", eval_total_profit, "(Start date =", start_date, ")")
# print("Validation episode {} ended. Mean reward = {} | Total profit = {}".format(eval_i, eval_mean_reward, eval_total_profit))
logger.print_out(
    "Validation episode {} ended. Mean reward = {} | Total profit = {} | Acc profit = {}"
    .format(eval_i, eval_mean_reward, eval_total_profit, eval_acc_profit))
# print("Actions: {} (Start date = {})".format(actions, start_date))
logger.print_out("Actions: {} (Start date = {})".format(
    actions, start_date))
# rounded_obs = [round(x,2) for x in agent.env.obs.flatten()]
# logger.print_out("obs: {}".format(rounded_obs))
# logger.print_out("attention_probs: {}".format(agent.env.attention_probs))
# logger.print_out("attention_obs: {}".format(agent.env.attention_obs))
logger.print_out("")  # blank line separating episodes in the log
# print()
eval_i += 1
# End eval
logger.close()
os.system('clear')  # NOTE(review): clears the terminal -- POSIX-only
agent.clear_memory()
def main_stage2(net1, centroids):
    """Stage-2 training with a meta-embedding classifier.

    Initializes a new ``Network`` (net2) from the stage-1 model, wires the
    centroids into the model and the DiscCentroidsLoss, then fine-tunes.

    Args:
        net1: stage-1-trained model used to initialize net2.
        centroids: class centroids computed from stage 1.

    Returns:
        The trained stage-2 network (possibly wrapped in ``DataParallel``).
    """
    print(f"\n===> Start Stage-2 training...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Ignore the classAwareSampler since we are not focusing on
    # the long-tailed problem.
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs,
                                              shuffle=True, num_workers=4)

    print('==> Building model..')
    net2 = Network(backbone=args.arch, embed_dim=512,
                   num_classes=args.train_class_num, use_fc=True,
                   attmodule=True, classifier='metaembedding',
                   backbone_fc=False, data_shape=4)
    net2 = net2.to(device)
    if not args.evaluate:
        init_stage2_model(net1, net2)

    criterion = nn.CrossEntropyLoss()
    fea_criterion = DiscCentroidsLoss(args.train_class_num,
                                      args.stage1_feature_dim)
    fea_criterion = fea_criterion.to(device)
    optimizer = optim.SGD(net2.parameters(), lr=args.lr, momentum=0.9,
                          weight_decay=5e-4)
    # The centroid loss gets a 10x smaller learning rate.
    optimizer_criterion = optim.SGD(fea_criterion.parameters(),
                                    lr=args.lr * 0.1, momentum=0.9,
                                    weight_decay=5e-4)

    # passing centroids data.
    if not args.evaluate:
        pass_centroids(net2, fea_criterion, init_centroids=centroids)

    if device == 'cuda':
        net2 = torch.nn.DataParallel(net2)
        cudnn.benchmark = True

    if args.stage2_resume and os.path.isfile(args.stage2_resume):
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.stage2_resume)
        net2.load_state_dict(checkpoint['net'])
        # best_acc = checkpoint['acc']
        # print("BEST_ACCURACY: "+str(best_acc))
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                        resume=True)
    else:
        if args.stage2_resume:
            # BUG FIX: message previously interpolated args.resume.
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
        # BUG FIX: logger always created to avoid NameError downstream.
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(['Epoch', 'Learning Rate', 'Train Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            print('\nStage_2 Epoch: %d Learning rate: %f'
                  % (epoch + 1, optimizer.param_groups[0]['lr']))
            # Here, I didn't set optimizers respectively, just for
            # simplicity. Performance did not vary a lot.
            adjust_learning_rate(optimizer, epoch, args.lr, step=20)
            train_loss, train_acc = stage2_train(net2, trainloader, optimizer,
                                                 optimizer_criterion,
                                                 criterion, fea_criterion,
                                                 device)
            save_model(net2, None, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            logger.append([epoch + 1, optimizer.param_groups[0]['lr'],
                           train_loss, train_acc])
            # Sync (without re-initializing) centroids after each epoch.
            pass_centroids(net2, fea_criterion, init_centroids=None)
            if epoch % 5 == 0:
                test(net2, testloader, device)

    print(f"\nFinish Stage-2 training...\n")
    logger.close()
    test(net2, testloader, device)
    return net2
def main():
    """End-to-end MNIST open-set training script entry point.

    Prepares checkpoint/plot folders, MNIST open-set splits, the network,
    and then trains for ``args.es`` epochs (unless ``args.evaluate``),
    plotting features and testing along the way.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    best_acc = 0  # best test accuracy (kept for checkpoint compatibility)
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # checkpoint folder
    args.checkpoint = './checkpoints/mnist/' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)
    # folder to save figures
    args.plotfolder = './checkpoints/mnist/' + args.arch + '/plotter'
    if not os.path.isdir(args.plotfolder):
        mkdir_p(args.plotfolder)

    # Data
    print('==> Preparing data..')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    trainset = MNIST(root='../../data', train=True, download=True,
                     transform=transform,
                     train_class_num=args.train_class_num,
                     test_class_num=args.test_class_num,
                     includes_all_train_class=args.includes_all_train_class)
    testset = MNIST(root='../../data', train=False, download=True,
                    transform=transform,
                    train_class_num=args.train_class_num,
                    test_class_num=args.test_class_num,
                    includes_all_train_class=args.includes_all_train_class)
    # data loader
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs,
                                              shuffle=True, num_workers=4)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.bs,
                                             shuffle=False, num_workers=4)

    # Model
    net = Network(backbone=args.arch, num_classes=args.train_class_num,
                  embed_dim=args.embed_dim)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.resume and os.path.isfile(args.resume):
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(args.resume)
        net.load_state_dict(checkpoint['net'])
        # best_acc = checkpoint['acc']
        # print("BEST_ACCURACY: "+str(best_acc))
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
    else:
        if args.resume:
            print("=> no checkpoint found at '{}'".format(args.resume))
        # BUG FIX: logger is now always created, so logger.append/close
        # below cannot raise NameError when the checkpoint file is missing.
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Learning Rate', 'Train Loss',
                          'Train Acc.', 'Test Loss', 'Test Acc.'])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
                          weight_decay=5e-4)

    # test(0, net, trainloader, testloader, criterion, device)
    epoch = 0
    if not args.evaluate:
        for epoch in range(start_epoch, args.es):
            print('\nEpoch: %d Learning rate: %f'
                  % (epoch + 1, optimizer.param_groups[0]['lr']))
            adjust_learning_rate(optimizer, epoch, args.lr, step=20)
            train_loss, train_acc = train(net, trainloader, optimizer,
                                          criterion, device)
            save_model(net, None, epoch,
                       os.path.join(args.checkpoint, 'last_model.pth'))
            # Test metrics are not computed during training; log zeros.
            test_loss, test_acc = 0, 0
            logger.append([epoch + 1, optimizer.param_groups[0]['lr'],
                           train_loss, train_acc, test_loss, test_acc])
            plot_feature(net, trainloader, device, args.plotfolder,
                         epoch=epoch,
                         plot_class_num=args.train_class_num,
                         maximum=args.plot_max,
                         plot_quality=args.plot_quality)
            test(epoch, net, trainloader, testloader, criterion, device)

    # Final evaluation and test-set feature plot (unknown class included).
    test(99999, net, trainloader, testloader, criterion, device)
    plot_feature(net, testloader, device, args.plotfolder, epoch="test",
                 plot_class_num=args.train_class_num + 1,
                 maximum=args.plot_max, plot_quality=args.plot_quality)
    logger.close()
def main_stage2(net, mid_known, mid_unknown):
    """Stage-2 fine-tuning with an energy-based loss.

    Fine-tunes `net` with FinetuneLoss (parameterized by the known/unknown
    energy midpoints from stage 1) and, after each epoch, evaluates against a
    family of CIFAR10 test splits — one per candidate number of test classes —
    logging a separate file per split.

    Relies on module-level globals: args, device, trainloader, transform_test,
    and the project helpers FinetuneLoss / Logger / CIFAR10 / stage2_train /
    save_model / adjust_learning_rate / test.
    """
    print("Starting stage-2 fine-tuning ...")
    start_epoch = 0
    criterion = FinetuneLoss(mid_known=mid_known, mid_unknown=mid_unknown,
                             gamma=args.gamma, temperature=args.temperature,
                             feature='energy')
    criterion = criterion.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=args.stage2_lr,
                                momentum=0.9, weight_decay=5e-4)
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            # One logger per evaluated test-class count.
            loggerList = []
            for i in range(args.train_class_num, args.test_class_num + 1):
                loggerList.append(
                    Logger(os.path.join(args.checkpoint,
                                        f'log{i}_stage2.txt'),
                           resume=True))
        else:
            # NOTE(review): `loggerList` stays undefined on this path; the
            # training loop below would raise NameError — confirm intent.
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
    else:
        loggerList = []
        for i in range(args.train_class_num, args.test_class_num + 1):
            logger = Logger(os.path.join(args.checkpoint,
                                         f'log{i}_stage2.txt'))
            logger.set_names(
                ['Epoch', 'Train Loss', 'Train Acc.', 'Energy-F1'])
            loggerList.append(logger)
    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer, epoch, args.stage2_lr,
                                 factor=args.stage2_lr_factor,
                                 step=args.stage2_lr_step)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net, trainloader, optimizer, criterion,
                                     device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint,
                                    'stage_2_last_model.pth'))
            # Evaluate against every open-set size from "all known" up to the
            # full test_class_num, each with its own test loader and logger.
            for test_class_num in range(args.train_class_num,
                                        args.test_class_num + 1):
                testset = CIFAR10(
                    root='../../data', train=False, download=True,
                    transform=transform_test,
                    train_class_num=args.train_class_num,
                    test_class_num=test_class_num,
                    includes_all_train_class=args.includes_all_train_class)
                testloader = torch.utils.data.DataLoader(
                    testset, batch_size=args.stage2_bs, shuffle=False,
                    num_workers=4)
                test_out = test(net, testloader, device)
                logger = loggerList[test_class_num - args.train_class_num]
                logger.append([
                    epoch + 1, train_out["train_loss"],
                    train_out["accuracy"], test_out["best_F1"]
                ])
    # NOTE(review): only the logger bound last in the loop above is closed
    # here, not every entry of loggerList — confirm whether that is intended.
    logger.close()
    print(f"\nFinish Stage-2 training...\n")
def main_stage2(stage1_dict):
    """Stage-2 training: fine-tune the DFP model with stage-1 thresholds.

    Builds (or restores from a checkpoint) a DFPNet, installs the per-class
    distance thresholds estimated during stage 1, trains with DFPLoss2, and
    evaluates after every epoch.  Returns the trained network.

    Args:
        stage1_dict: dict produced by stage 1; reads 'net' (the trained
            stage-1 model) and 'distance' -> 'thresholds'.

    Relies on module-level globals: args, device, trainloader, testloader,
    and the project helpers DFPNet / DFPLoss2 / Logger / stage2_train /
    stage2_test / save_model / adjust_learning_rate.
    """
    print('==> Building stage2 model..')
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num,
                 embed_dim=args.embed_dim, distance=args.distance,
                 similarity=args.similarity, scaled=args.scaled,
                 norm_centroid=args.norm_centroid,
                 decorrelation=args.decorrelation)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    # Fresh run (no resumable checkpoint file): adopt the stage-1 network
    # and install the thresholds it estimated.
    if not args.evaluate and not os.path.isfile(args.stage2_resume):
        net = stage1_dict['net']
        net = net.to(device)
        thresholds = stage1_dict['distance']['thresholds']
        # stat = stage1_dict["stat"]
        net.module.set_threshold(thresholds.to(device))

    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            # The threshold key depends on whether the checkpoint was saved
            # from a DataParallel-wrapped model ("module." prefix).
            try:
                thresholds = checkpoint['net']['thresholds']
            except KeyError:  # narrowed from a bare `except:`
                thresholds = checkpoint['net']['module.thresholds']
            net.module.set_threshold(thresholds.to(device))
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            # BUG FIX: message previously interpolated args.resume, which is
            # not the flag consulted by this branch.
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance in',
            'Distance out', 'Distance Center', 'Train Acc.'
        ])

    if args.evaluate:
        stage2_test(net, testloader, device)
        return net

    # after resume
    criterion = DFPLoss2(alpha=args.alpha, beta=args.beta, theta=args.theta)
    # BUG FIX: optimizer was seeded with args.stage1_lr in a stage-2 routine;
    # adjust_learning_rate below derives the LR from args.stage2_lr anyway,
    # so use the stage-2 rate consistently.
    optimizer = optim.SGD(net.parameters(), lr=args.stage2_lr, momentum=0.9,
                          weight_decay=5e-4)
    for epoch in range(start_epoch, args.stage2_es):
        print('\nStage_2 Epoch: %d Learning rate: %f' %
              (epoch + 1, optimizer.param_groups[0]['lr']))
        # Here, I didn't set optimizers respectively, just for simplicity.
        # Performance did not vary a lot.
        adjust_learning_rate(optimizer, epoch, args.stage2_lr, step=20)
        # if epoch %5 ==0:
        #     distance_results = plot_distance(net, trainloader, device, args)
        #     thresholds = distance_results['thresholds']
        #     net.module.set_threshold(thresholds.to(device))
        train_out = stage2_train(net, trainloader, optimizer, criterion,
                                 device)
        save_model(net, epoch,
                   os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
        stage2_test(net, testloader, device)
        # stat = get_gap_stat(net2, trainloader, device, args)
        logger.append([
            epoch + 1, train_out["train_loss"], train_out["loss_similarity"],
            train_out["distance_in"], train_out["distance_out"],
            train_out["distance_center"], train_out["accuracy"]
        ])
    print(f"\nFinish Stage-2 training...\n")
    logger.close()
    stage2_test(net, testloader, device)
    return net
def main():
    """CIFAR-100 baseline training entry point.

    Builds CIFAR-100 train/test splits with standard augmentation, trains a
    torchvision-style backbone with cross-entropy, and runs per-epoch
    evaluation via the project `test` helper.

    Relies on module-level globals: args, and the project helpers
    mkdir_p / CIFAR100 / Logger / train / test / save_model /
    adjust_learning_rate.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    best_acc = 0  # best test accuracy (NOTE(review): never updated below)
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # checkpoint
    args.checkpoint = './checkpoints/cifar/' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data
    print('==> Preparing data..')
    # Standard CIFAR augmentation: random crop with padding + horizontal flip
    # for training; normalization uses the usual CIFAR channel statistics.
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    trainset = CIFAR100(root='../../data', train=True, download=True,
                        transform=transform_train,
                        train_class_num=args.train_class_num,
                        test_class_num=args.test_class_num,
                        includes_all_train_class=args.includes_all_train_class)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs,
                                              shuffle=True, num_workers=4)
    testset = CIFAR100(root='../../data', train=False, download=True,
                       transform=transform_test,
                       train_class_num=args.train_class_num,
                       test_class_num=args.test_class_num,
                       includes_all_train_class=args.includes_all_train_class)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.bs,
                                             shuffle=False, num_workers=4)

    # Model
    print('==> Building model..')
    net = models.__dict__[args.arch](num_classes=args.train_class_num)  # CIFAR 100
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            resume=True)
        else:
            # NOTE(review): `logger` is left undefined on this path; the
            # later logger.append/close would raise NameError — confirm.
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Learning Rate', 'Train Loss','Train Acc.', 'Test Loss', 'Test Acc.'])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
                          weight_decay=5e-4)
    # test(0, net, trainloader, testloader, criterion, device)
    epoch=0
    if not args.evaluate:
        for epoch in range(start_epoch, start_epoch + args.es):
            print('\nEpoch: %d Learning rate: %f' %
                  (epoch+1, optimizer.param_groups[0]['lr']))
            adjust_learning_rate(optimizer, epoch, args.lr)
            train_loss, train_acc = train(net,trainloader,optimizer,criterion,device)
            save_model(net, None, epoch,
                       os.path.join(args.checkpoint,'last_model.pth'))
            # Per-epoch test metrics are not computed; zeros keep the log
            # columns aligned with set_names above.
            test_loss, test_acc = 0, 0
            #
            logger.append([epoch+1, optimizer.param_groups[0]['lr'],
                           train_loss, train_acc, test_loss, test_acc])
            test(epoch, net, trainloader, testloader, criterion, device)
    logger.close()
def main():
    """MNIST training entry point with softmax + center loss.

    Trains a `Network` under a joint objective: cross-entropy on the logits
    plus a CenterLoss on the backbone features, each with its own SGD
    optimizer.  Plots feature embeddings per epoch and after a final test.

    Relies on module-level globals: args, and the project helpers
    mkdir_p / MNIST / Network / CenterLoss / Logger / train / test /
    plot_feature / save_model.
    """
    args.checkpoint = './checkpoints/mnist/' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)
    # folder to save figures
    args.plotfolder = './checkpoints/mnist/' + args.arch + '/plotter'
    if not os.path.isdir(args.plotfolder):
        mkdir_p(args.plotfolder)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    print('==> Preparing data..')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    trainset = MNIST(root='../../data', train=True, download=True,
                     transform=transform,
                     train_class_num=args.train_class_num,
                     test_class_num=args.test_class_num,
                     includes_all_train_class=args.includes_all_train_class)
    testset = MNIST(root='../../data', train=False, download=True,
                    transform=transform,
                    train_class_num=args.train_class_num,
                    test_class_num=args.test_class_num,
                    includes_all_train_class=args.includes_all_train_class)
    # data loader
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs,
                                              shuffle=True, num_workers=4)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.bs,
                                             shuffle=False, num_workers=4)
    print('==> Building model..')
    net = Network(backbone=args.arch, num_classes=args.train_class_num,
                  embed_dim=args.embed_dim)
    # Feature dimension feeding the classifier; CenterLoss keeps one center
    # per class in this space.
    fea_dim = net.classifier.in_features
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True
    # NOTE(review): "softamx" is a typo for "softmax" kept as-is (local name).
    criterion_softamx = nn.CrossEntropyLoss()
    criterion_centerloss = CenterLoss(num_classes=args.train_class_num,
                                      feat_dim=fea_dim).to(device)
    # Separate optimizers: one for the network, one for the class centers
    # (CenterLoss holds learnable parameters).
    optimizer_softmax = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
                                  weight_decay=5e-4)
    optimizer_centerloss = torch.optim.SGD(criterion_centerloss.parameters(),
                                           lr=args.center_lr, momentum=0.9,
                                           weight_decay=5e-4)
    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            # The centers are state too — restore them alongside the net.
            criterion_centerloss.load_state_dict(checkpoint['centerloss'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            resume=True)
        else:
            # NOTE(review): `logger` is left undefined on this path; the
            # later logger.append/close would raise NameError — confirm.
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Total Loss','Softmax Loss', 'Center Loss', 'train Acc.'])
    if not args.evaluate:
        # Step LR schedule applies to the network optimizer only; the center
        # optimizer keeps its fixed args.center_lr.
        scheduler = lr_scheduler.StepLR(optimizer_softmax, step_size=20,
                                        gamma=0.1)
        for epoch in range(start_epoch, start_epoch + args.es):
            print('\nEpoch: %d Learning rate: %f' %
                  (epoch + 1, optimizer_softmax.param_groups[0]['lr']))
            train_loss, softmax_loss, center_loss, train_acc = train(
                net, trainloader, optimizer_softmax, optimizer_centerloss,
                criterion_softamx, criterion_centerloss, device)
            save_model(net, criterion_centerloss, epoch,
                       os.path.join(args.checkpoint, 'last_model.pth'))
            # plot the training data
            if args.plot:
                plot_feature(net, criterion_centerloss, trainloader, device,
                             args.plotfolder, epoch=epoch,
                             plot_class_num=args.train_class_num,
                             maximum=args.plot_max,
                             plot_quality=args.plot_quality)
            logger.append([epoch + 1, train_loss, softmax_loss, center_loss,
                           train_acc])
            scheduler.step()
    test(net, testloader, device)
    if args.plot:
        # +1 class accounts for the "unknown" bucket in the open-set test.
        plot_feature(net, criterion_centerloss, testloader, device,
                     args.plotfolder, epoch="test",
                     plot_class_num=args.train_class_num+1,
                     maximum=args.plot_max, plot_quality=args.plot_quality)
    logger.close()
def main_stage2(stage1_dict):
    """Stage-2 training: DFPNet with thresholds, amplifier and estimator.

    Builds a fresh DFPNet seeded from the stage-1 network, trains with
    DFPLoss2, optionally plotting features per epoch, and returns the
    trained network.

    Args:
        stage1_dict: dict from stage 1; reads 'net', 'estimator' and
            'distance' -> 'thresholds'.

    Relies on module-level globals: args, device, trainloader, testloader,
    and the project helpers DFPNet / DFPLoss2 / Logger / init_stage2_model /
    stage2_train / save_model / adjust_learning_rate / plot_feature.
    """
    net1 = stage1_dict['net']
    thresholds = stage1_dict['distance']['thresholds']
    estimator = stage1_dict['estimator']
    print(f"\n===> Start Stage-2 training...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    print('==> Building model..')
    net2 = DFPNet(backbone=args.arch, num_classes=args.train_class_num,
                  embed_dim=args.embed_dim, distance=args.distance,
                  similarity=args.similarity, scaled=args.scaled,
                  thresholds=thresholds, norm_centroid=args.norm_centroid,
                  amplifier=args.amplifier, estimator=estimator)
    net2 = net2.to(device)
    # BUG FIX: was os.path.isdir — the stage-2 resume target is a checkpoint
    # *file* (see the isfile check below and the sibling stage-2 routine),
    # so isdir was always False and init ran even when resuming.
    if not args.evaluate and not os.path.isfile(args.stage2_resume):
        # Seed stage-2 weights from the stage-1 network.
        init_stage2_model(net1, net2)
    if device == 'cuda':
        net2 = torch.nn.DataParallel(net2)
        cudnn.benchmark = True
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net2.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            # BUG FIX: message previously interpolated args.resume, which is
            # not the flag consulted by this branch.
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance in',
            'Distance out', 'Generate within', 'Generate 2origin',
            'Train Acc.'
        ])
    # after resume
    criterion = DFPLoss2(alpha=args.alpha, beta=args.beta, theta=args.theta)
    # BUG FIX: optimizer was seeded with args.stage1_lr in a stage-2 routine;
    # adjust_learning_rate below derives the LR from args.stage2_lr anyway,
    # so use the stage-2 rate consistently.
    optimizer = optim.SGD(net2.parameters(), lr=args.stage2_lr, momentum=0.9,
                          weight_decay=5e-4)
    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            print('\nStage_2 Epoch: %d Learning rate: %f' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            # Here, I didn't set optimizers respectively, just for
            # simplicity. Performance did not vary a lot.
            adjust_learning_rate(optimizer, epoch, args.stage2_lr, step=10)
            train_out = stage2_train(net2, trainloader, optimizer, criterion,
                                     device)
            save_model(net2, epoch,
                       os.path.join(args.checkpoint,
                                    'stage_2_last_model.pth'))
            logger.append([
                epoch + 1, train_out["train_loss"],
                train_out["loss_similarity"], train_out["distance_in"],
                train_out["distance_out"], train_out["generate_within"],
                train_out["generate_2orign"], train_out["accuracy"]
            ])
            if args.plot:
                plot_feature(net2, args, trainloader, device,
                             args.plotfolder2, epoch=epoch,
                             plot_class_num=args.train_class_num,
                             maximum=args.plot_max,
                             plot_quality=args.plot_quality,
                             norm_centroid=args.norm_centroid,
                             thresholds=thresholds)
                plot_feature(net2, args, testloader, device,
                             args.plotfolder2,
                             epoch="test_" + str(epoch),
                             plot_class_num=args.train_class_num + 1,
                             maximum=args.plot_max,
                             plot_quality=args.plot_quality,
                             norm_centroid=args.norm_centroid,
                             thresholds=thresholds, testmode=True)
    if args.plot:
        # plot the test set
        plot_feature(net2, args, testloader, device, args.plotfolder2,
                     epoch="test",
                     plot_class_num=args.train_class_num + 1,
                     maximum=args.plot_max,
                     plot_quality=args.plot_quality,
                     norm_centroid=args.norm_centroid,
                     thresholds=thresholds, testmode=True)
    print(f"\nFinish Stage-2 training...\n")
    logger.close()
    # test2(net2, testloader, device)
    return net2
def main():
    """ImageNet training entry point with NVIDIA DALI pipelines.

    Supports distributed (NCCL) training via apex DDP, optional fp16 with
    FP16_Optimizer, checkpoint resume, and per-epoch logging from rank 0.
    Validation is currently disabled (commented out); the log records zeros
    for the validation columns.

    Relies on module-level globals: args, best_prec1, and the project
    helpers mkdir_p / Network / network_to_half / DDP / FP16_Optimizer /
    Logger / HybridTrainPipe / DALIClassificationIterator / AverageMeter /
    train / adjust_learning_rate / save_checkpoint.
    """
    global best_prec1, args
    args.gpu = 0
    args.world_size = 1
    if args.distributed:
        # One process per GPU; local_rank selects the device on this node.
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
    args.total_batch_size = args.world_size * args.batch_size
    # Only rank 0 touches the filesystem for checkpoints/logs.
    if not os.path.isdir(args.checkpoint) and args.local_rank == 0:
        mkdir_p(args.checkpoint)
    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
    if args.static_loss_scale != 1.0:
        if not args.fp16:
            print(
                "Warning: if --fp16 is not used, static_loss_scale will be ignored."
            )
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = Network(backbone=args.arch,
                        num_classes=args.train_class_num)
    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        # shared param/delay all reduce turns off bucketing in DDP, for lower latency runs this can improve perf
        # for the older version of APEX please use shared_param, for newer one it is delay_allreduce
        model = DDP(model, delay_allreduce=True)
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    if args.fp16:
        # Wrap the optimizer for loss scaling (static or dynamic).
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.static_loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   verbose=False)
    # optionally resume from a checkpoint
    title = 'ImageNet-' + args.arch
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # Map storages straight onto this process's GPU.
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            if args.local_rank == 0:
                logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                                title=title, resume=True)
        else:
            # NOTE(review): on rank 0 with a missing resume file, `logger`
            # stays undefined and logger.append below would raise — confirm.
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        if args.local_rank == 0:
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            title=title)
            logger.set_names([
                'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.',
                'Valid Acc.', 'Valid Top5.'
            ])
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    if (args.arch == "inception_v3"):
        crop_size = 299
        val_size = 320  # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256
    # DALI GPU/CPU-accelerated training pipeline; each rank reads its shard.
    pipe = HybridTrainPipe(batch_size=args.batch_size,
                           num_threads=args.workers,
                           device_id=args.local_rank, data_dir=traindir,
                           crop=crop_size, dali_cpu=args.dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / args.world_size))
    # pipe = HybridValPipe(batch_size=args.batch_size, num_threads=args.workers, device_id=args.local_rank, data_dir=valdir, crop=crop_size, size=val_size)
    # pipe.build()
    # val_loader = DALIClassificationIterator(pipe, size=int(pipe.epoch_size("Reader") / args.world_size))
    # if args.evaluate:
    #     validate(val_loader, model, criterion)
    #     return
    total_time = AverageMeter()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        adjust_learning_rate(optimizer, epoch, args)
        if args.local_rank == 0:
            print('\nEpoch: [%d | %d] LR: %f' %
                  (epoch + 1, args.epochs,
                   optimizer.param_groups[0]['lr']))
        [train_loss, train_acc, avg_train_time] = train(train_loader, model,
                                                        criterion, optimizer,
                                                        epoch)
        total_time.update(avg_train_time)
        # evaluate on validation set
        # [test_loss, prec1, prec5] = validate(val_loader, model, criterion)
        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            # append logger file
            # logger.append([optimizer.param_groups[0]['lr'], train_loss, test_loss, train_acc, prec1, prec5])
            # Validation disabled: zeros fill the valid-loss/acc/top5 columns.
            logger.append([
                optimizer.param_groups[0]['lr'], train_loss, 0.0, train_acc,
                0.0, 0.0
            ])
            # is_best = prec1 > best_prec1
            is_best = False  # always False while validation is disabled
            # best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                checkpoint=args.checkpoint,
                filename="checkpoint.pth.tar")
        # if epoch == args.epochs - 1:
        #     print('##Top-1 {0}\n'
        #           '##Top-5 {1}\n'
        #           '##Perf  {2}'.format(prec1, prec5, args.total_batch_size / total_time.avg))
        # reset DALI iterators
        train_loader.reset()
        # val_loader.reset()
    if args.local_rank == 0:
        logger.close()
def main_stage2(stage1_dict):
    """Stage-2 training with the generalized distance loss (DFPLossGeneral).

    Builds a DFPNet seeded from the stage-1 network (with its estimated
    thresholds), trains with DFPLossGeneral, optionally plots features, then
    runs a final evaluation.  Returns the trained network.

    Args:
        stage1_dict: dict from stage 1; reads 'net' and
            'distance' -> 'thresholds'.

    Relies on module-level globals: args, device, trainloader, testloader,
    and the project helpers DFPNet / DFPLossGeneral / Logger /
    init_stage2_model / stage2_train / stage1_test / save_model /
    adjust_learning_rate / plot_feature.
    """
    net1 = stage1_dict["net"]
    thresholds = stage1_dict["distance"]["thresholds"]
    print(f"\n===> Start Stage-2 training...\n")
    start_epoch = 0
    print('==> Building model..')
    # net2 = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim,
    #               distance=args.distance, scaled=args.scaled, cosine_weight=args.cosine_weight,thresholds=thresholds)
    net2 = DFPNet(backbone=args.arch, num_classes=args.train_class_num,
                  embed_dim=args.embed_dim, distance=args.distance,
                  scaled=args.scaled, cosine_weight=args.cosine_weight,
                  thresholds=thresholds)
    net2 = net2.to(device)
    criterion_dis = DFPLossGeneral(beta=args.beta, sigma=args.sigma,
                                   gamma=args.gamma)
    optimizer = optim.SGD(net2.parameters(), lr=args.stage2_lr, momentum=0.9,
                          weight_decay=5e-4)
    if not args.evaluate:
        # Seed stage-2 weights from the stage-1 network.
        # NOTE(review): this also runs when resuming from a checkpoint; the
        # subsequent load_state_dict overwrites it — confirm intended.
        init_stage2_model(net1, net2)
    if device == 'cuda':
        net2 = torch.nn.DataParallel(net2)
        cudnn.benchmark = True
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            # BUG FIX: this loaded args.stage1_resume, i.e. the *stage-1*
            # checkpoint, despite having just verified args.stage2_resume.
            checkpoint = torch.load(args.stage2_resume)
            net2.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            # BUG FIX: message previously interpolated args.resume, which is
            # not the flag consulted by this branch.
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Within Loss',
                          'Between Loss', 'Within-Gen Loss',
                          'Between-Gen Loss', 'Random loss', 'Train Acc.'])
    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer, epoch, args.stage2_lr, step=20)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net2, trainloader, optimizer,
                                     criterion_dis, device)
            save_model(net2, epoch,
                       os.path.join(args.checkpoint,
                                    'stage_2_last_model.pth'))
            # ['Epoch', 'Train Loss', 'Softmax Loss', 'Distance Loss',
            #  'Within Loss', 'Between Loss','Cen2cen loss', 'Train Acc.']
            logger.append([epoch + 1, train_out["dis_loss_total"],
                           train_out["dis_loss_within"],
                           train_out["dis_loss_between"],
                           train_out["dis_loss_within_gen"],
                           train_out["dis_loss_between_gen"],
                           train_out["dis_loss_cen2cen"],
                           train_out["accuracy"]])
            if args.plot:
                plot_feature(net2, trainloader, device, args.plotfolder2,
                             epoch=epoch,
                             plot_class_num=args.train_class_num,
                             maximum=args.plot_max,
                             plot_quality=args.plot_quality)
    if args.plot:
        # plot the test set
        plot_feature(net2, testloader, device, args.plotfolder2,
                     epoch="test",
                     plot_class_num=args.train_class_num + 1,
                     maximum=args.plot_max,
                     plot_quality=args.plot_quality)
    # calculating distances for last epoch
    # distance_results = plot_distance(net2, trainloader, device, args)
    logger.close()
    print(f"\nFinish Stage-2 training...\n")
    print("===> Evaluating ...")
    stage1_test(net2, testloader, device)
    return net2
def main():
    """Vanilla-VAE training entry point.

    Trains a VanillaVAE with Adam + exponential LR decay, keeping both a
    rolling checkpoint and a best-by-train-loss checkpoint, then generates
    reconstruction and random-sample images for evaluation.

    Relies on module-level globals: args, device, trainloader, valloader,
    and the project helpers VanillaVAE / Logger / train / save_model /
    generate_images / sample_images.
    """
    start_epoch = 0
    best_loss = 9999999.99  # sentinel: any real loss is smaller
    # Model
    print('==> Building model..')
    net = VanillaVAE(in_channels=1, latent_dim=args.latent_dim)
    net = net.to(device)
    if device == 'cuda':
        # Considering the data scale and model, it is unnecessary to use
        # DistributedDataParallel which could speed up the training and
        # inference compared to DataParallel
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True
    optimizer = optim.Adam(net.parameters(), lr=args.lr,
                           weight_decay=args.wd)
    scheduler = optim.lr_scheduler.ExponentialLR(
        optimizer, gamma=args.scheduler_gamma)
    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            # NOTE(review): start_epoch and best_loss are NOT restored from
            # the checkpoint here, so a resumed run restarts the epoch count
            # at 0 — confirm whether that is intended.
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            resume=True)
            print('==> Resuming from checkpoint, loaded..')
        else:
            # NOTE(review): `logger` is left undefined on this path; the
            # later logger.append/close would raise NameError — confirm.
            print("==> No checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(
            ['Epoch', 'LR', 'Train Loss', 'Recons Loss', 'KLD Loss'])
    if not args.evaluate:
        # training
        print("==> start training..")
        for epoch in range(start_epoch, args.es):
            print('\nStage_1 Epoch: %d | Learning rate: %f ' %
                  (epoch + 1, scheduler.get_last_lr()[-1]))
            # train() returns {train_loss, recons_loss, kld_loss}
            train_out = train(net, trainloader, optimizer)  # {train_loss, recons_loss, kld_loss}
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'checkpoint.pth'))
            # Track the best model by training loss (no validation loss here).
            if train_out["train_loss"] < best_loss:
                save_model(net, optimizer, epoch,
                           os.path.join(args.checkpoint,
                                        'checkpoint_best.pth'),
                           loss=train_out["train_loss"])
                best_loss = train_out["train_loss"]
            logger.append([
                epoch + 1, scheduler.get_last_lr()[-1],
                train_out["train_loss"], train_out["recons_loss"],
                train_out["kld_loss"]
            ])
            scheduler.step()
    logger.close()
    print(f"\n==> Finish training..\n")
    print("===> start evaluating ...")
    generate_images(net, valloader, name="test_reconstruct")
    sample_images(net, name="test_randsample")