def main():
    global best_prec1, args

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    args.total_batch_size = args.world_size * args.batch_size

    if not os.path.isdir(args.checkpoint) and args.local_rank == 0:
        mkdir_p(args.checkpoint)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if args.static_loss_scale != 1.0:
        if not args.fp16:
            print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = Network(backbone=args.arch, num_classes=args.train_class_num)

    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        # shared param/delay all reduce turns off bucketing in DDP, for lower latency runs this can improve perf
        # for the older version of APEX please use shared_param, for newer one it is delay_allreduce
        model = DDP(model, delay_allreduce=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    if args.fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.static_loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   verbose=False)

    # optionally resume from a checkpoint
    title = 'ImageNet-' + args.arch
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume,
                                    map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        if args.local_rank == 0:
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
            logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss',
                              'Train Acc.', 'Valid Acc.', 'Valid Top5.'])

    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, args.val)
    crop_size = 224
    val_size = 256

    pipe = HybridTrainPipe(batch_size=args.batch_size, num_threads=args.workers,
                           device_id=args.local_rank, data_dir=traindir,
                           crop=crop_size, dali_cpu=args.dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    pipe = HybridValPipe(batch_size=args.batch_size, num_threads=args.workers,
                         device_id=args.local_rank, data_dir=valdir,
                         crop=crop_size, size=val_size)
    pipe.build()
    val_loader = DALIClassificationIterator(pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    validate(val_loader, train_loader, model)
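# The validate() helper called above is defined elsewhere in the repo. As a rough
# illustration of how a DALIClassificationIterator is typically consumed (each batch is
# a list of dicts carrying "data" and "label" tensors, the standard
# nvidia.dali.plugin.pytorch layout), a minimal evaluation loop could look like the
# sketch below. The function name and the top-1 computation are illustrative only.
import torch


def validate_sketch(val_loader, model):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for data in val_loader:
            images = data[0]["data"]
            targets = data[0]["label"].squeeze().cuda().long()
            outputs = model(images)
            # If the model returns (features, logits), keep only the logits.
            if isinstance(outputs, (tuple, list)):
                outputs = outputs[-1]
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    val_loader.reset()  # DALI iterators must be reset between epochs
    return 100.0 * correct / max(total, 1)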
def main():
    global best_prec1, args

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    args.total_batch_size = args.world_size * args.batch_size

    if not os.path.isdir(args.checkpoint) and args.local_rank == 0:
        mkdir_p(args.checkpoint)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if args.static_loss_scale != 1.0:
        if not args.fp16:
            print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = Network(backbone=args.arch, num_classes=args.train_class_num)

    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        # shared param/delay all reduce turns off bucketing in DDP, for lower latency runs this can improve perf
        # for the older version of APEX please use shared_param, for newer one it is delay_allreduce
        model = DDP(model, delay_allreduce=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    if args.fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.static_loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   verbose=False)

    # optionally resume from a checkpoint
    title = 'ImageNet-' + args.arch
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume,
                                    map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
            if args.local_rank == 0:
                logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        if args.local_rank == 0:
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
            logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss',
                              'Train Acc.', 'Valid Acc.', 'Valid Top5.'])

    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    if args.arch == "inception_v3":
        crop_size = 299
        val_size = 320  # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256

    pipe = HybridTrainPipe(batch_size=args.batch_size, num_threads=args.workers,
                           device_id=args.local_rank, data_dir=traindir,
                           crop=crop_size, dali_cpu=args.dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    # pipe = HybridValPipe(batch_size=args.batch_size, num_threads=args.workers,
    #                      device_id=args.local_rank, data_dir=valdir, crop=crop_size, size=val_size)
    # pipe.build()
    # val_loader = DALIClassificationIterator(pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    # if args.evaluate:
    #     validate(val_loader, model, criterion)
    #     return

    total_time = AverageMeter()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        adjust_learning_rate(optimizer, epoch, args)
        if args.local_rank == 0:
            print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, optimizer.param_groups[0]['lr']))
        [train_loss, train_acc, avg_train_time] = train(train_loader, model, criterion, optimizer, epoch)
        total_time.update(avg_train_time)

        # evaluate on validation set
        # [test_loss, prec1, prec5] = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            # append logger file
            # logger.append([optimizer.param_groups[0]['lr'], train_loss, test_loss, train_acc, prec1, prec5])
            logger.append([optimizer.param_groups[0]['lr'], train_loss, 0.0, train_acc, 0.0, 0.0])

            # is_best = prec1 > best_prec1
            is_best = False
            # best_prec1 = max(prec1, best_prec1)
            save_checkpoint({
                'epoch': epoch,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, is_best, checkpoint=args.checkpoint, filename="checkpoint.pth.tar")

            # if epoch == args.epochs - 1:
            #     print('##Top-1 {0}\n'
            #           '##Top-5 {1}\n'
            #           '##Perf {2}'.format(prec1, prec5, args.total_batch_size / total_time.avg))

        # reset DALI iterators
        train_loader.reset()
        # val_loader.reset()

    if args.local_rank == 0:
        logger.close()
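# adjust_learning_rate(optimizer, epoch, args) in the loop above is a repo helper defined
# elsewhere. The sketch below shows one plausible implementation, assuming the classic
# ImageNet step decay (divide args.lr by 10 every 30 epochs); the decay interval and
# factor are assumptions, not necessarily the repo's actual schedule.
def adjust_learning_rate_sketch(optimizer, epoch, args):
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr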
def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    best_acc = 0     # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # checkpoint
    args.checkpoint = './checkpoints/mnist/' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # folder to save figures
    args.plotfolder = './checkpoints/mnist/' + args.arch + '/plotter'
    if not os.path.isdir(args.plotfolder):
        mkdir_p(args.plotfolder)

    # Data
    print('==> Preparing data..')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    trainset = MNIST(root='../../data', train=True, download=True, transform=transform,
                     train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                     includes_all_train_class=args.includes_all_train_class)
    testset = MNIST(root='../../data', train=False, download=True, transform=transform,
                    train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                    includes_all_train_class=args.includes_all_train_class)

    # data loader
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.bs, shuffle=False, num_workers=4)

    # Model
    net = Network(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim)
    fea_dim = net.classifier.in_features
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: " + str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Learning Rate', 'Train Loss', 'Train Acc.', 'Test Loss', 'Test Acc.'])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)

    # test(0, net, trainloader, testloader, criterion, device)
    epoch = 0
    if not args.evaluate:
        for epoch in range(start_epoch, args.es):
            print('\nEpoch: %d  Learning rate: %f' % (epoch + 1, optimizer.param_groups[0]['lr']))
            adjust_learning_rate(optimizer, epoch, args.lr, step=20)
            train_loss, train_acc = train(net, trainloader, optimizer, criterion, device)
            save_model(net, None, epoch, os.path.join(args.checkpoint, 'last_model.pth'))
            test_loss, test_acc = 0, 0
            logger.append([epoch + 1, optimizer.param_groups[0]['lr'], train_loss, train_acc, test_loss, test_acc])
            plot_feature(net, trainloader, device, args.plotfolder, epoch=epoch,
                         plot_class_num=args.train_class_num, maximum=args.plot_max,
                         plot_quality=args.plot_quality)
            test(epoch, net, trainloader, testloader, criterion, device)

    test(99999, net, trainloader, testloader, criterion, device)
    plot_feature(net, testloader, device, args.plotfolder, epoch="test",
                 plot_class_num=args.train_class_num + 1, maximum=args.plot_max,
                 plot_quality=args.plot_quality)
    logger.close()
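# save_model(...) above is a repo helper defined elsewhere. A minimal sketch consistent
# with how checkpoints are read back in these scripts (keys 'net' and 'epoch', plus an
# optional 'centerloss' entry when the second argument is a loss module with learnable
# parameters); the exact saved keys are an assumption.
import torch


def save_model_sketch(net, extra_criterion, epoch, path):
    state = {
        'net': net.state_dict(),
        'epoch': epoch,
    }
    if extra_criterion is not None:
        # e.g. a CenterLoss module whose centers should be checkpointed as well
        state['centerloss'] = extra_criterion.state_dict()
    torch.save(state, path)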
def main():
    args.checkpoint = './checkpoints/mnist/' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # folder to save figures
    args.plotfolder = './checkpoints/mnist/' + args.arch + '/plotter'
    if not os.path.isdir(args.plotfolder):
        mkdir_p(args.plotfolder)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    print('==> Preparing data..')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    trainset = MNIST(root='../../data', train=True, download=True, transform=transform,
                     train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                     includes_all_train_class=args.includes_all_train_class)
    testset = MNIST(root='../../data', train=False, download=True, transform=transform,
                    train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                    includes_all_train_class=args.includes_all_train_class)

    # data loader
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.bs, shuffle=False, num_workers=4)

    print('==> Building model..')
    net = Network(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim)
    fea_dim = net.classifier.in_features
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    criterion_softmax = nn.CrossEntropyLoss()
    criterion_centerloss = CenterLoss(num_classes=args.train_class_num, feat_dim=fea_dim).to(device)
    optimizer_softmax = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    optimizer_centerloss = torch.optim.SGD(criterion_centerloss.parameters(), lr=args.center_lr,
                                           momentum=0.9, weight_decay=5e-4)

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            criterion_centerloss.load_state_dict(checkpoint['centerloss'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: " + str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Total Loss', 'Softmax Loss', 'Center Loss', 'Train Acc.'])

    if not args.evaluate:
        scheduler = lr_scheduler.StepLR(optimizer_softmax, step_size=20, gamma=0.1)
        for epoch in range(start_epoch, start_epoch + args.es):
            print('\nEpoch: %d  Learning rate: %f' % (epoch + 1, optimizer_softmax.param_groups[0]['lr']))
            train_loss, softmax_loss, center_loss, train_acc = train(net, trainloader, optimizer_softmax,
                                                                     optimizer_centerloss, criterion_softmax,
                                                                     criterion_centerloss, device)
            save_model(net, criterion_centerloss, epoch, os.path.join(args.checkpoint, 'last_model.pth'))
            # plot the training data
            if args.plot:
                plot_feature(net, criterion_centerloss, trainloader, device, args.plotfolder, epoch=epoch,
                             plot_class_num=args.train_class_num, maximum=args.plot_max,
                             plot_quality=args.plot_quality)
            logger.append([epoch + 1, train_loss, softmax_loss, center_loss, train_acc])
            scheduler.step()

    test(net, testloader, device)
    if args.plot:
        plot_feature(net, criterion_centerloss, testloader, device, args.plotfolder, epoch="test",
                     plot_class_num=args.train_class_num + 1, maximum=args.plot_max,
                     plot_quality=args.plot_quality)
    logger.close()
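# CenterLoss(num_classes, feat_dim) above is imported from elsewhere in the repo. For
# reference, a minimal sketch of the usual center-loss formulation (Wen et al., 2016):
# each class keeps a learnable center, and the loss is the mean squared distance from
# each feature to its class center. Details such as clamping or averaging may differ
# from the repo's actual implementation.
import torch
import torch.nn as nn


class CenterLossSketch(nn.Module):
    def __init__(self, num_classes, feat_dim):
        super().__init__()
        # one learnable center per known class
        self.centers = nn.Parameter(torch.randn(num_classes, feat_dim))

    def forward(self, features, labels):
        # features: (batch, feat_dim), labels: (batch,)
        centers_batch = self.centers[labels]  # center of each sample's class
        return 0.5 * ((features - centers_batch) ** 2).sum(dim=1).mean()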
def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    best_acc = 0     # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # checkpoint
    args.checkpoint = './checkpoints/cifar/' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    trainset = CIFAR100(root='../../data', train=True, download=True, transform=transform_train,
                        train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                        includes_all_train_class=args.includes_all_train_class)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4)
    testset = CIFAR100(root='../../data', train=False, download=True, transform=transform_test,
                       train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                       includes_all_train_class=args.includes_all_train_class)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.bs, shuffle=False, num_workers=4)

    # Model
    print('==> Building model..')
    net = Network(backbone=args.arch, num_classes=args.train_class_num)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: " + str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Learning Rate', 'Train Loss', 'Train Acc.', 'Test Loss', 'Test Acc.'])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)

    # test(0, net, trainloader, testloader, criterion, device)
    epoch = 0
    if not args.evaluate:
        for epoch in range(start_epoch, args.es):
            print('\nEpoch: %d  Learning rate: %f' % (epoch + 1, optimizer.param_groups[0]['lr']))
            adjust_learning_rate(optimizer, epoch, args.lr)
            train_loss, train_acc = train(net, trainloader, optimizer, criterion, device)
            save_model(net, None, epoch, os.path.join(args.checkpoint, 'last_model.pth'))
            test_loss, test_acc = 0, 0
            logger.append([epoch + 1, optimizer.param_groups[0]['lr'], train_loss, train_acc, test_loss, test_acc])
            # Skip testing in the first epoch: some classes may have no predicted samples yet,
            # which makes compute_train_score_and_mavs_and_dists fail.
            if epoch % 5 == 0 and epoch != 0:
                test(epoch, net, trainloader, testloader, criterion, device)

    test(epoch, net, trainloader, testloader, criterion, device)
    logger.close()
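# The train(...) helper used above is defined elsewhere in this repo. A minimal sketch of
# a per-epoch loop returning (average loss, accuracy in percent), assuming the network
# returns logits directly (the real Network may also return embeddings):
def train_sketch(net, trainloader, optimizer, criterion, device):
    net.train()
    total_loss, correct, total = 0.0, 0, 0
    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * targets.size(0)
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    return total_loss / total, 100.0 * correct / total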