def main():
    """Entry point for contrastive pre-training (InsDis / MoCo / SimCLR).

    Builds augmentations and datasets, the encoder (plus an EMA encoder for
    MoCo), the memory bank and criterion, the optimizer (optionally AMP/LARS),
    optionally resumes from a checkpoint, then runs the train / kNN-eval /
    checkpoint loop.
    """
    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    data_folder = os.path.join(args.data_folder, 'train')
    val_folder = os.path.join(args.data_folder, 'val')

    crop_padding = 32
    image_size = 224
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize = transforms.Normalize(mean=mean, std=std)

    if args.aug == 'NULL' and args.dataset == 'imagenet':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'CJ':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'simple' and args.dataset == 'imagenet':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            get_color_distortion(1.0),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'simple' and args.dataset == 'cifar':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(size=32),
            transforms.RandomHorizontalFlip(p=0.5),
            get_color_distortion(0.5),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
    else:
        # BUG FIX: `NotImplemented` is a constant, not an exception class;
        # raising it produces a TypeError.  Use NotImplementedError.
        raise NotImplementedError(
            'augmentation not supported: {}'.format(args.aug))

    # BUG FIX: test_transform was only defined on some augmentation branches,
    # so e.g. aug='NULL' crashed with a NameError when building the evaluation
    # dataset below.  Define it once per dataset instead (same transforms the
    # original used on the branches that did define it).
    if args.dataset == 'cifar':
        test_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
    else:
        # TODO: Currently follow CMC
        test_transform = transforms.Compose([
            transforms.Resize(image_size + crop_padding),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            normalize,
        ])

    # Get Datasets
    if args.dataset == "imagenet":
        train_dataset = ImageFolderInstance(data_folder,
                                            transform=train_transform,
                                            two_crop=args.moco)
        print(len(train_dataset))
        train_sampler = None
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=(train_sampler is None),
            num_workers=args.num_workers,
            pin_memory=True,
            sampler=train_sampler)

        # BUG FIX: ImageFolder's keyword is `transform=`, not `transforms=`;
        # the original raised a TypeError here.
        test_dataset = datasets.ImageFolder(val_folder,
                                            transform=test_transform)
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=256,
            shuffle=False,
            num_workers=args.num_workers,
            pin_memory=True)
    elif args.dataset == 'cifar':
        # cifar-10 dataset
        if args.contrastive_model == 'simclr':
            # SimCLR consumes two augmented views per image
            train_dataset = CIFAR10Instance_double(root='./data',
                                                   train=True,
                                                   download=True,
                                                   transform=train_transform,
                                                   double=True)
        else:
            train_dataset = CIFAR10Instance(root='./data',
                                            train=True,
                                            download=True,
                                            transform=train_transform)
        train_sampler = None
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=(train_sampler is None),
            num_workers=args.num_workers,
            pin_memory=True,
            sampler=train_sampler,
            drop_last=True)
        test_dataset = CIFAR10Instance(root='./data',
                                       train=False,
                                       download=True,
                                       transform=test_transform)
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=100,
            shuffle=False,
            num_workers=args.num_workers)

    # create model and optimizer
    n_data = len(train_dataset)

    if args.model == 'resnet50':
        model = InsResNet50()
        if args.contrastive_model == 'moco':
            model_ema = InsResNet50()
    elif args.model == 'resnet50x2':
        model = InsResNet50(width=2)
        if args.contrastive_model == 'moco':
            model_ema = InsResNet50(width=2)
    elif args.model == 'resnet50x4':
        model = InsResNet50(width=4)
        if args.contrastive_model == 'moco':
            model_ema = InsResNet50(width=4)
    elif args.model == 'resnet50_cifar':
        model = InsResNet50_cifar()
        if args.contrastive_model == 'moco':
            model_ema = InsResNet50_cifar()
    else:
        raise NotImplementedError('model not supported {}'.format(args.model))

    # copy weights from `model' to `model_ema'
    if args.contrastive_model == 'moco':
        moment_update(model, model_ema, 0)

    # set the contrast memory and criterion
    if args.contrastive_model == 'moco':
        contrast = MemoryMoCo(128, n_data, args.nce_k, args.nce_t,
                              args.softmax).cuda(args.gpu)
    elif args.contrastive_model == 'simclr':
        contrast = None  # SimCLR is purely batch-wise: no memory bank needed
    else:
        contrast = MemoryInsDis(128, n_data, args.nce_k, args.nce_t,
                                args.nce_m, args.softmax).cuda(args.gpu)

    if args.softmax:
        criterion = NCESoftmaxLoss()
    elif args.contrastive_model == 'simclr':
        criterion = BatchCriterion(1, args.nce_t, args.batch_size)
    else:
        criterion = NCECriterion(n_data)
    criterion = criterion.cuda(args.gpu)

    model = model.cuda()
    if args.contrastive_model == 'moco':
        model_ema = model_ema.cuda()

    # Exclude BN and bias from weight decay if requested; the decay is then
    # applied per-group by add_weight_decay, so zero it on the optimizer.
    weight_decay = args.weight_decay
    if weight_decay and args.filter_weight_decay:
        parameters = add_weight_decay(model, weight_decay,
                                      args.filter_weight_decay)
        weight_decay = 0.
    else:
        parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=weight_decay)
    cudnn.benchmark = True

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.opt_level)
        if args.contrastive_model == 'moco':
            # dummy optimizer: the EMA encoder is never stepped, but apex
            # needs one to wrap the model
            optimizer_ema = torch.optim.SGD(model_ema.parameters(),
                                            lr=0, momentum=0, weight_decay=0)
            model_ema, optimizer_ema = amp.initialize(
                model_ema, optimizer_ema, opt_level=args.opt_level)

    if args.LARS:
        optimizer = LARS(optimizer=optimizer, eps=1e-8, trust_coef=0.001)

    # optionally resume from a checkpoint
    args.start_epoch = 0
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if contrast:
                contrast.load_state_dict(checkpoint['contrast'])
            if args.contrastive_model == 'moco':
                model_ema.load_state_dict(checkpoint['model_ema'])
            if args.amp and checkpoint['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded successfully '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        print("==> training...")
        time1 = time.time()
        if args.contrastive_model == 'moco':
            loss, prob = train_moco(epoch, train_loader, model, model_ema,
                                    contrast, criterion, optimizer, args)
        elif args.contrastive_model == 'simclr':
            print("Train using simclr")
            loss, prob = train_simclr(epoch, train_loader, model, criterion,
                                      optimizer, args)
        else:
            print("Train using InsDis")
            loss, prob = train_ins(epoch, train_loader, model, contrast,
                                   criterion, optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        logger.log_value('ins_loss', loss, epoch)
        logger.log_value('ins_prob', prob, epoch)
        logger.log_value('learning_rate', optimizer.param_groups[0]['lr'],
                         epoch)

        # kNN evaluation every other epoch (only implemented for cifar)
        test_epoch = 2
        if epoch % test_epoch == 0:
            model.eval()
            if args.contrastive_model == 'moco':
                model_ema.eval()
            print('----------Evaluation---------')
            start = time.time()
            if args.dataset == 'cifar':
                acc = kNN(epoch, model, train_loader, test_loader, 200,
                          args.nce_t, n_data, low_dim=128, memory_bank=None)
                # BUG FIX: `acc` only exists on the cifar branch; logging it
                # unconditionally raised a NameError for other datasets, so
                # the reporting now lives inside the branch.
                print("Evaluation Time: '{}'s".format(time.time() - start))
                logger.log_value('Test accuracy', acc, epoch)
                print('[Epoch]: {}'.format(epoch))
                print('accuracy: {}%)'.format(acc))
            # NOTE(review): the encoder is left in eval() mode here; the
            # train_* routines presumably switch it back to train() — confirm.

        # saving the model: always refresh the rolling `current.pth`, plus a
        # numbered snapshot every `save_freq` epochs.
        print('==> Saving...')
        state = {
            'opt': args,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
        }
        if args.contrastive_model == 'moco':
            state['model_ema'] = model_ema.state_dict()
        if args.amp:
            state['amp'] = amp.state_dict()
        save_file = os.path.join(args.model_folder, 'current.pth')
        torch.save(state, save_file)
        if epoch % args.save_freq == 0:
            # FIX: the original wrote this identical epoch snapshot twice per
            # save_freq epoch; save it once.
            save_file = os.path.join(
                args.model_folder,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
        # help release GPU memory
        del state
        torch.cuda.empty_cache()
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker that trains a linear classifier on frozen features.

    Runs on one GPU (``gpu``); in multiprocessing-distributed mode each
    spawned process calls this with its local GPU index.  Trains, validates,
    tracks the best top-1 accuracy in the module-global ``best_acc1``, and
    checkpoints from rank 0 only.
    """
    global best_acc1
    best_acc1 = 0
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # set the model (backbone + linear classifier + loss)
    model, classifier, criterion = set_model(args, ngpus_per_node)

    # set optimizer — only the classifier's parameters are optimized
    optimizer = set_optimizer(args, classifier)

    cudnn.benchmark = True

    # optionally resume linear classifier
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # set the data loader
    train_loader, val_loader, train_sampler = get_train_val_loader(args)

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        if args.distributed:
            # reshuffle shards per epoch so each process sees new data order
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, model, classifier,
                                      criterion, optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        print("==> testing...")
        test_acc, test_loss = validate(val_loader, model, classifier,
                                       criterion, args)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model (rank-0 / non-distributed processes only)
        if test_acc > best_acc1:
            best_acc1 = test_acc
            if not args.multiprocessing_distributed or (
                    args.multiprocessing_distributed
                    and args.rank % ngpus_per_node == 0):
                state = {
                    'epoch': epoch,
                    'classifier': classifier.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }
                save_name = '{}_layer{}.pth'.format(args.model, args.layer)
                save_name = os.path.join(args.save_folder, save_name)
                print('saving model!')
                torch.save(state, save_name)

        # regular save every `save_freq` epochs (rank-0 only)
        if not args.multiprocessing_distributed or \
                (args.multiprocessing_distributed
                 and args.rank % ngpus_per_node == 0):
            if epoch % args.save_freq == 0:
                print('==> Saving...')
                state = {
                    'epoch': epoch,
                    'classifier': classifier.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }
                save_file = os.path.join(
                    args.save_folder,
                    'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
                torch.save(state, save_file)

        # tensorboard logger
        pass
def main():
    """Entry point for CMC-style training with L / ab cross-view criteria.

    Builds the loader, model, contrast memory and the two NCE criteria,
    optionally resumes (with apex AMP state), then trains and checkpoints
    every ``save_freq`` epochs.
    """
    # parse the args
    args = parse_option()

    # set the loader
    train_loader, n_data = get_train_loader(
        args)  # change to this if testing on cifar as baseline

    # set the model (encoder, contrast memory, and per-view criteria)
    model, contrast, criterion_ab, criterion_l = set_model(args, n_data)

    # set the optimizer
    optimizer = set_optimizer(args, model)

    # set mixed precision
    if args.amp:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # load on CPU first to avoid GPU-mismatch issues
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            if args.amp and checkpoint['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        l_loss, l_prob, ab_loss, ab_prob = train(epoch, train_loader, model,
                                                 contrast, criterion_l,
                                                 criterion_ab, optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger — one loss/prob pair per color-space view
        logger.log_value('l_loss', l_loss, epoch)
        logger.log_value('l_prob', l_prob, epoch)
        logger.log_value('ab_loss', ab_loss, epoch)
        logger.log_value('ab_prob', ab_prob, epoch)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'model': model.state_dict(),
                'contrast': contrast.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            if args.amp:
                state['amp'] = amp.state_dict()
            save_file = os.path.join(
                args.model_folder,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            # help release GPU memory
            del state
        torch.cuda.empty_cache()
def main():
    """Linear evaluation: train a classifier on top of a frozen backbone.

    Loads a pre-trained encoder from ``args.model_path``, freezes it in eval
    mode, and optimizes only the linear classifier.  Tracks the best top-1
    accuracy in the module-global ``best_acc1`` and checkpoints both the best
    and periodic models.
    """
    global best_acc1
    best_acc1 = 0

    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    train_folder = os.path.join(args.data_folder, 'train')
    val_folder = os.path.join(args.data_folder, 'val')

    image_size = 224
    crop_padding = 32
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize = transforms.Normalize(mean=mean, std=std)

    if args.aug == 'NULL':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'CJ':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        # BUG FIX: `NotImplemented` is a constant, not an exception class;
        # raising it produces a TypeError.  Use NotImplementedError.
        raise NotImplementedError(
            'augmentation not supported: {}'.format(args.aug))

    train_dataset = datasets.ImageFolder(train_folder, train_transform)
    val_dataset = datasets.ImageFolder(
        val_folder,
        transforms.Compose([
            transforms.Resize(image_size + crop_padding),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            normalize,
        ]))
    print(len(train_dataset))
    train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.num_workers,
        pin_memory=True,
        sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=True)

    # create model and optimizer
    if args.model == 'resnet50':
        model = InsResNet50()
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 1)
    elif args.model == 'resnet50x2':
        model = InsResNet50(width=2)
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 2)
    elif args.model == 'resnet50x4':
        model = InsResNet50(width=4)
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.model))

    print('==> loading pre-trained model')
    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['model'])
    print("==> loaded checkpoint '{}' (epoch {})".format(
        args.model_path, ckpt['epoch']))
    print('==> done')

    model = model.cuda()
    classifier = classifier.cuda()

    criterion = torch.nn.CrossEntropyLoss().cuda(args.gpu)

    # only the classifier's parameters are trained
    if not args.adam:
        optimizer = torch.optim.SGD(classifier.parameters(),
                                    lr=args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.Adam(classifier.parameters(),
                                     lr=args.learning_rate,
                                     betas=(args.beta1, args.beta2),
                                     weight_decay=args.weight_decay,
                                     eps=1e-8)

    # the backbone stays frozen in eval mode throughout linear evaluation
    model.eval()

    cudnn.benchmark = True

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_acc1 = checkpoint['best_acc1']
            # NOTE(review): assumes best_acc1 was checkpointed as a tensor
            # (has .cuda()) — confirm against the saving code
            best_acc1 = best_acc1.cuda()
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            if 'opt' in checkpoint.keys():
                # resume optimization hyper-parameters
                print('=> resume hyper parameters')
                if 'bn' in vars(checkpoint['opt']):
                    print('using bn: ', checkpoint['opt'].bn)
                if 'adam' in vars(checkpoint['opt']):
                    print('using adam: ', checkpoint['opt'].adam)
                if 'cosine' in vars(checkpoint['opt']):
                    print('using cosine: ', checkpoint['opt'].cosine)
                args.learning_rate = checkpoint['opt'].learning_rate
                args.lr_decay_rate = checkpoint['opt'].lr_decay_rate
                args.momentum = checkpoint['opt'].momentum
                args.weight_decay = checkpoint['opt'].weight_decay
                args.beta1 = checkpoint['opt'].beta1
                args.beta2 = checkpoint['opt'].beta2
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # set cosine annealing scheduler
    if args.cosine:
        eta_min = args.learning_rate * (args.lr_decay_rate**3) * 0.1
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.epochs, eta_min, -1)
        # dummy loop to catch up with current epoch after a resume
        for i in range(1, args.start_epoch):
            scheduler.step()

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        if args.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_acc5, train_loss = train(epoch, train_loader, model,
                                                  classifier, criterion,
                                                  optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_acc5', train_acc5, epoch)
        logger.log_value('train_loss', train_loss, epoch)
        logger.log_value('learning_rate', optimizer.param_groups[0]['lr'],
                         epoch)

        print("==> testing...")
        test_acc, test_acc5, test_loss = validate(val_loader, model,
                                                  classifier, criterion, args)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc5', test_acc5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        if test_acc > best_acc1:
            best_acc1 = test_acc
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            save_name = '{}_layer{}.pth'.format(args.model, args.layer)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving best model!')
            torch.save(state, save_name)

        # regular periodic save
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': test_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_name = 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving regular model!')
            torch.save(state, save_name)
def main_worker(gpu, ngpus_per_node, opt):
    """Per-GPU worker for knowledge distillation (student from teacher).

    Builds student/teacher models, probes their feature shapes with a dummy
    forward pass, instantiates the selected distillation criterion, then runs
    the distillation loop with optional DDP and DALI, tracking the best
    accuracy in the module-global ``best_acc``.
    """
    global best_acc, total_time
    opt.gpu = int(gpu)
    opt.gpu_id = int(gpu)

    if opt.gpu is not None:
        print("Use GPU: {} for training".format(opt.gpu))

    if opt.multiprocessing_distributed:
        # Only one node now.
        opt.rank = gpu
        dist_backend = 'nccl'
        dist.init_process_group(backend=dist_backend,
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.rank)
        # split global batch/workers across the processes on this node
        opt.batch_size = int(opt.batch_size / ngpus_per_node)
        opt.num_workers = int(
            (opt.num_workers + ngpus_per_node - 1) / ngpus_per_node)

    if opt.deterministic:
        torch.manual_seed(12345)
        cudnn.deterministic = True
        cudnn.benchmark = False
        numpy.random.seed(12345)

    class_num_map = {
        'cifar100': 100,
        'imagenet': 1000,
        'imagenette': 10,
    }
    if opt.dataset not in class_num_map:
        raise NotImplementedError(opt.dataset)
    n_cls = class_num_map[opt.dataset]

    # model
    model_t = load_teacher(opt.path_t, n_cls, opt.gpu, opt)
    module_args = {'num_classes': n_cls}
    model_s = model_dict[opt.model_s](**module_args)

    # dummy batch to probe per-layer feature shapes for shape-dependent
    # distillation criteria (hint/semckd/crd/...)
    if opt.dataset == 'cifar100':
        data = torch.randn(2, 3, 32, 32)
    else:
        # BUG FIX: `data` was only assigned for 'cifar100' and 'imagenet',
        # so 'imagenette' (accepted above) crashed with a NameError here.
        # All non-cifar datasets in class_num_map use 224x224 inputs.
        data = torch.randn(2, 3, 224, 224)

    model_t.eval()
    model_s.eval()
    feat_t, _ = model_t(data, is_feat=True)
    feat_s, _ = model_s(data, is_feat=True)

    module_list = nn.ModuleList([])
    module_list.append(model_s)
    trainable_list = nn.ModuleList([])
    trainable_list.append(model_s)

    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)

    if opt.distill == 'kd':
        criterion_kd = DistillKL(opt.kd_T)
    elif opt.distill == 'hint':
        criterion_kd = HintLoss()
        # regressor maps the student hint layer to the teacher's shape
        regress_s = ConvReg(feat_s[opt.hint_layer].shape,
                            feat_t[opt.hint_layer].shape)
        module_list.append(regress_s)
        trainable_list.append(regress_s)
    elif opt.distill == 'semckd':
        s_n = [f.shape[1] for f in feat_s[1:-1]]
        t_n = [f.shape[1] for f in feat_t[1:-1]]
        criterion_kd = SemCKDLoss()
        self_attention = SelfA(
            len(feat_s) - 2, len(feat_t) - 2, opt.batch_size, s_n, t_n)
        module_list.append(self_attention)
        trainable_list.append(self_attention)
    elif opt.distill == 'crd':
        opt.s_dim = feat_s[-1].shape[1]
        opt.t_dim = feat_t[-1].shape[1]
        opt.n_data = 50000
        criterion_kd = CRDLoss(opt)
        module_list.append(criterion_kd.embed_s)
        module_list.append(criterion_kd.embed_t)
        trainable_list.append(criterion_kd.embed_s)
        trainable_list.append(criterion_kd.embed_t)
    elif opt.distill == 'attention':
        criterion_kd = Attention()
    elif opt.distill == 'similarity':
        criterion_kd = Similarity()
    elif opt.distill == 'rkd':
        criterion_kd = RKDLoss()
    elif opt.distill == 'irg':
        criterion_kd = IRGLoss()
    elif opt.distill == 'pkt':
        criterion_kd = PKT()
    elif opt.distill == 'hkd':
        criterion_kd = HKDLoss(init_weight=opt.hkd_initial_weight,
                               decay=opt.hkd_decay)
    elif opt.distill == 'correlation':
        criterion_kd = Correlation()
        embed_s = LinearEmbed(feat_s[-1].shape[1], opt.feat_dim)
        embed_t = LinearEmbed(feat_t[-1].shape[1], opt.feat_dim)
        module_list.append(embed_s)
        module_list.append(embed_t)
        trainable_list.append(embed_s)
        trainable_list.append(embed_t)
    elif opt.distill == 'vid':
        s_n = [f.shape[1] for f in feat_s[1:-1]]
        t_n = [f.shape[1] for f in feat_t[1:-1]]
        criterion_kd = nn.ModuleList(
            [VIDLoss(s, t, t) for s, t in zip(s_n, t_n)])
        # add this as some parameters in VIDLoss need to be updated
        trainable_list.append(criterion_kd)
    else:
        raise NotImplementedError(opt.distill)

    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(
        criterion_div)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kd)  # other knowledge distillation loss

    module_list.append(model_t)

    if torch.cuda.is_available():
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if opt.multiprocessing_distributed:
            if opt.gpu is not None:
                torch.cuda.set_device(opt.gpu)
                module_list.cuda(opt.gpu)
                distributed_modules = []
                for module in module_list:
                    DDP = torch.nn.parallel.DistributedDataParallel
                    distributed_modules.append(
                        DDP(module, device_ids=[opt.gpu]))
                module_list = distributed_modules
                criterion_list.cuda(opt.gpu)
            else:
                print(
                    'multiprocessing_distributed must be with a specifiec gpu id'
                )
        else:
            criterion_list.cuda()
            module_list.cuda()
        if not opt.deterministic:
            cudnn.benchmark = True

    optimizer = optim.SGD(trainable_list.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    # dataloader
    if opt.dataset == 'cifar100':
        if opt.distill in ['crd']:
            # CRD needs per-sample contrastive indices from the loader
            train_loader, val_loader, n_data = get_cifar100_dataloaders_sample(
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                k=opt.nce_k,
                mode=opt.mode)
        else:
            train_loader, val_loader = get_cifar100_dataloaders(
                batch_size=opt.batch_size, num_workers=opt.num_workers)
    elif opt.dataset in imagenet_list:
        if opt.dali is None:
            train_loader, val_loader, train_sampler = get_imagenet_dataloader(
                dataset=opt.dataset,
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                multiprocessing_distributed=opt.multiprocessing_distributed)
        else:
            train_loader, val_loader = get_dali_data_loader(opt)
    else:
        raise NotImplementedError(opt.dataset)

    # only rank 0 (or the single process) logs to tensorboard
    if not opt.multiprocessing_distributed or opt.rank % ngpus_per_node == 0:
        logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    if not opt.skip_validation:
        # validate teacher accuracy
        teacher_acc, _, _ = validate(val_loader, model_t, criterion_cls, opt)

        if opt.dali is not None:
            val_loader.reset()

        if not opt.multiprocessing_distributed or opt.rank % ngpus_per_node == 0:
            print('teacher accuracy: ', teacher_acc)
    else:
        print('Skipping teacher validation.')

    # routine
    for epoch in range(1, opt.epochs + 1):
        torch.cuda.empty_cache()
        if opt.multiprocessing_distributed:
            if opt.dali is None:
                train_sampler.set_epoch(epoch)

        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_acc_top5, train_loss, data_time = train(
            epoch, train_loader, module_list, criterion_list, optimizer, opt)
        time2 = time.time()

        if opt.multiprocessing_distributed:
            # average the metrics across all workers
            metrics = torch.tensor(
                [train_acc, train_acc_top5, train_loss,
                 data_time]).cuda(opt.gpu, non_blocking=True)
            reduced = reduce_tensor(
                metrics, opt.world_size if 'world_size' in opt else 1)
            train_acc, train_acc_top5, train_loss, data_time = reduced.tolist()

        if not opt.multiprocessing_distributed or opt.rank % ngpus_per_node == 0:
            print(
                ' * Epoch {}, GPU {}, Acc@1 {:.3f}, Acc@5 {:.3f}, Time {:.2f}, Data {:.2f}'
                .format(epoch, opt.gpu, train_acc, train_acc_top5,
                        time2 - time1, data_time))
            logger.log_value('train_acc', train_acc, epoch)
            logger.log_value('train_loss', train_loss, epoch)

        print('GPU %d validating' % (opt.gpu))
        test_acc, test_acc_top5, test_loss = validate(val_loader, model_s,
                                                      criterion_cls, opt)

        if opt.dali is not None:
            train_loader.reset()
            val_loader.reset()

        if not opt.multiprocessing_distributed or opt.rank % ngpus_per_node == 0:
            print(' ** Acc@1 {:.3f}, Acc@5 {:.3f}'.format(
                test_acc, test_acc_top5))
            logger.log_value('test_acc', test_acc, epoch)
            logger.log_value('test_loss', test_loss, epoch)
            logger.log_value('test_acc_top5', test_acc_top5, epoch)

            # save the best model
            if test_acc > best_acc:
                best_acc = test_acc
                state = {
                    'epoch': epoch,
                    'model': model_s.state_dict(),
                    'best_acc': best_acc,
                }
                if opt.distill == 'semckd':
                    state['attention'] = trainable_list[-1].state_dict()
                save_file = os.path.join(opt.save_folder,
                                         '{}_best.pth'.format(opt.model_s))
                test_merics = {
                    'test_loss': test_loss,
                    'test_acc': test_acc,
                    'test_acc_top5': test_acc_top5,
                    'epoch': epoch
                }
                save_dict_to_json(
                    test_merics,
                    os.path.join(opt.save_folder, "test_best_metrics.json"))
                print('saving the best model!')
                torch.save(state, save_file)

    if not opt.multiprocessing_distributed or opt.rank % ngpus_per_node == 0:
        # This best accuracy is only for printing purpose.
        print('best accuracy:', best_acc)

        # save parameters
        save_state = {k: v for k, v in opt._get_kwargs()}
        # No. parameters(M)
        num_params = (sum(p.numel() for p in model_s.parameters()) / 1000000.0)
        save_state['Total params'] = num_params
        save_state['Total time'] = (time.time() - total_time) / 3600.0
        params_json_path = os.path.join(opt.save_folder, "parameters.json")
        save_dict_to_json(save_state, params_json_path)
def main():
    """Supervised pre-training entry point for few-shot benchmarks.

    Builds loaders for miniImageNet / tieredImageNet / CIFAR-FS / FC100
    (plus meta-split loaders), a backbone, and an optimizer, then runs the
    supervised training loop with periodic checkpointing and a `latest.pth`
    symlink pointing at the newest snapshot.
    """
    opt = parse_option()

    # dataloader
    train_partition = 'trainval' if opt.use_trainval else 'train'
    if opt.dataset == 'miniImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(ImageNet(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(ImageNet(args=opt,
                                         partition='val',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaImageNet(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaImageNet(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            n_cls = 64
    elif opt.dataset == 'tieredImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(TieredImageNet(args=opt,
                                                 partition=train_partition,
                                                 transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(TieredImageNet(args=opt,
                                               partition='train_phase_val',
                                               transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='test',
            train_transform=train_trans,
            test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='val',
            train_transform=train_trans,
            test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 448
        else:
            n_cls = 351
    elif opt.dataset == 'CIFAR-FS' or opt.dataset == 'FC100':
        train_trans, test_trans = transforms_options['D']
        train_loader = DataLoader(CIFAR100(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(CIFAR100(args=opt,
                                         partition='train',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaCIFAR100(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaCIFAR100(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            if opt.dataset == 'CIFAR-FS':
                n_cls = 64
            elif opt.dataset == 'FC100':
                n_cls = 60
            else:
                raise NotImplementedError('dataset not supported: {}'.format(
                    opt.dataset))
    else:
        raise NotImplementedError(opt.dataset)

    # model: either fresh, or re-loaded from the latest snapshot
    if not opt.load_latest:
        model = create_model(opt.model, n_cls, opt.dataset)
    else:
        latest_file = os.path.join(opt.save_folder, 'latest.pth')
        model = load_teacher(latest_file, n_cls, opt.dataset)

    # optimizer
    if opt.adam:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=opt.learning_rate,
                                     weight_decay=0.0005)
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=opt.learning_rate,
                              momentum=opt.momentum,
                              weight_decay=opt.weight_decay)

    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        if opt.n_gpu > 1:
            model = nn.DataParallel(model)
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)

    # routine: supervised pre-training
    for epoch in range(1, opt.epochs + 1):
        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, model, criterion,
                                      optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model,
                                                      criterion, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model.state_dict()
                if opt.n_gpu <= 1 else model.module.state_dict(),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

            latest_file = os.path.join(opt.save_folder, 'latest.pth')
            # BUG FIX: os.symlink raises FileExistsError when the link already
            # exists (i.e. on every save after the first).  Remove the stale
            # link first; lexists also catches a dangling symlink.
            if os.path.lexists(latest_file):
                os.remove(latest_file)
            os.symlink(save_file, latest_file)

    # save the last model
    state = {
        'opt': opt,
        'model': model.state_dict()
        if opt.n_gpu <= 1 else model.module.state_dict(),
    }
    save_file = os.path.join(opt.save_folder,
                             '{}_last.pth'.format(opt.model))
    torch.save(state, save_file)
def main():
    """Entry point for unsupervised contrastive pretraining (InsDis / MoCo).

    Builds ImageNet-style train data augmentation, the InsResNet50 encoder
    (plus an EMA copy when ``--moco``), the contrastive memory bank, then runs
    the epoch loop with periodic checkpointing.  All configuration comes from
    ``parse_option()``; requires CUDA (models are moved with ``.cuda()``).
    """
    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    data_folder = os.path.join(args.data_folder, 'train')

    image_size = 224
    # ImageNet channel statistics used for input normalization.
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize = transforms.Normalize(mean=mean, std=std)

    if args.aug == 'NULL':
        # Baseline augmentation: random crop + flip only.
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'CJ':
        # Color-jitter augmentation (InsDis/MoCo v1 recipe).
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        # NOTE(review): ``NotImplemented`` is a constant, not an exception
        # class — raising it produces a TypeError. Should be
        # ``NotImplementedError``.
        raise NotImplemented('augmentation not supported: {}'.format(args.aug))

    # two_crop makes the dataset return two augmented views per image,
    # which MoCo needs (query/key pair).
    train_dataset = ImageFolderInstance(data_folder, transform=train_transform, two_crop=args.moco)
    print(len(train_dataset))
    train_sampler = None
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.num_workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    # create model and optimizer
    n_data = len(train_dataset)

    if args.model == 'resnet50':
        model = InsResNet50()
        if args.moco:
            model_ema = InsResNet50()
    elif args.model == 'resnet50x2':
        model = InsResNet50(width=2)
        if args.moco:
            model_ema = InsResNet50(width=2)
    elif args.model == 'resnet50x4':
        model = InsResNet50(width=4)
        if args.moco:
            model_ema = InsResNet50(width=4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.model))

    # copy weights from `model' to `model_ema'
    # (momentum 0 => hard copy of the current weights).
    if args.moco:
        moment_update(model, model_ema, 0)

    # set the contrast memory and criterion
    if args.moco:
        contrast = MemoryMoCo(128, n_data, args.nce_k, args.nce_t, args.softmax).cuda(args.gpu)
    else:
        contrast = MemoryInsDis(128, n_data, args.nce_k, args.nce_t, args.nce_m, args.softmax).cuda(args.gpu)

    criterion = NCESoftmaxLoss() if args.softmax else NCECriterion(n_data)
    criterion = criterion.cuda(args.gpu)

    model = model.cuda()
    if args.moco:
        model_ema = model_ema.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True

    if args.amp:
        # NVIDIA apex mixed precision. The EMA model gets a dummy zero-lr
        # optimizer only because amp.initialize requires one.
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)
        if args.moco:
            optimizer_ema = torch.optim.SGD(model_ema.parameters(),
                                            lr=0,
                                            momentum=0,
                                            weight_decay=0)
            model_ema, optimizer_ema = amp.initialize(model_ema, optimizer_ema, opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # Load on CPU first to avoid spiking GPU memory.
            checkpoint = torch.load(args.resume, map_location='cpu')
            # checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            if args.moco:
                model_ema.load_state_dict(checkpoint['model_ema'])
            if args.amp and checkpoint['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded successfully '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        if args.moco:
            loss, prob = train_moco(epoch, train_loader, model, model_ema, contrast, criterion, optimizer, args)
        else:
            loss, prob = train_ins(epoch, train_loader, model, contrast, criterion, optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        logger.log_value('ins_loss', loss, epoch)
        logger.log_value('ins_prob', prob, epoch)
        logger.log_value('learning_rate', optimizer.param_groups[0]['lr'], epoch)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'model': model.state_dict(),
                'contrast': contrast.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            if args.moco:
                state['model_ema'] = model_ema.state_dict()
            if args.amp:
                state['amp'] = amp.state_dict()
            save_file = os.path.join(
                args.model_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            # help release GPU memory
            del state

        # saving the model
        # NOTE(review): this duplicates the block above — on save_freq epochs
        # the epoch checkpoint is written twice; `current.pth` is written
        # every epoch. The first save block appears redundant.
        print('==> Saving...')
        state = {
            'opt': args,
            'model': model.state_dict(),
            'contrast': contrast.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
        }
        if args.moco:
            state['model_ema'] = model_ema.state_dict()
        if args.amp:
            state['amp'] = amp.state_dict()
        save_file = os.path.join(args.model_folder, 'current.pth')
        torch.save(state, save_file)
        if epoch % args.save_freq == 0:
            save_file = os.path.join(
                args.model_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
        # help release GPU memory
        del state
        torch.cuda.empty_cache()
def __init__(self, log_dir):
    """Create a tensorboard summary writer that flushes events under *log_dir*."""
    # Earlier TensorFlow-based implementation, kept for reference:
    # self.writer = tf.summary.FileWriter(log_dir)
    writer = tb_logger.Logger(logdir=log_dir, flush_secs=2)
    self.logger = writer
def main():
    """Train and evaluate a linear classifier on top of a frozen encoder.

    Uses module-level helpers (``get_train_val_loader``, ``set_model``,
    ``set_optimizer``, ``train``, ``validate``) and tracks the best top-1
    accuracy in the module-global ``best_acc1``, saving both the best and
    periodic checkpoints.
    """
    global best_acc1
    best_acc1 = 0

    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    train_loader, val_loader, train_sampler = get_train_val_loader(args)

    # set the model
    model, classifier, criterion = set_model(args)

    # set optimizer
    optimizer = set_optimizer(args, classifier)

    cudnn.benchmark = True

    # optionally resume linear classifier
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # NOTE(review): this second resume block duplicates the one above (with a
    # CPU map_location and an unconditional ``.cuda()``) and resets
    # ``args.start_epoch`` — the checkpoint ends up loaded twice. One of the
    # two blocks is almost certainly leftover and should be removed.
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_acc1 = checkpoint['best_acc1']
            best_acc1 = best_acc1.cuda()
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_acc5, train_loss = train(epoch, train_loader, model, classifier, criterion, optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_acc5', train_acc5, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        print("==> testing...")
        test_acc, test_acc5, test_loss = validate(val_loader, model, classifier, criterion, args)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc5', test_acc5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        # NOTE(review): ``best_acc1`` may be a tensor after resume but a float
        # here — comparison works, but the saved type is inconsistent.
        if test_acc > best_acc1:
            best_acc1 = test_acc
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            save_name = '{}_layer{}.pth'.format(args.model, args.layer)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving best model!')
            torch.save(state, save_name)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': test_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_name = 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving regular model!')
            torch.save(state, save_name)

        # tensorboard logger
        pass
def train(gpu=None):
    """Train the attribute-ranking VGG model with a margin-ranking loss plus a
    KL regularizer between predicted and target per-attribute distributions.

    NOTE(review): this function uses Python 2 ``print`` statements and is not
    valid Python 3 — it is inconsistent with the rest of the file.

    Relies on module-level configuration (``tb_path``, ``settings``,
    ``resume_train``, ``fixed_std``, ``experiment_folder``, ``model_path``,
    path constants) rather than parameters.

    Args:
        gpu: CUDA device index, or None to stay on CPU.
    """
    # One tensorboard logger per metric family.
    logs = {
        'train': tensorboard_logger.Logger(tb_path + "/train"),
        'prs': tensorboard_logger.Logger(tb_path + "/prs"),
        'spr': tensorboard_logger.Logger(tb_path + "/sp"),
        'r2': tensorboard_logger.Logger(tb_path + "/r2"),
    }
    db = Dataset(training_path, testing_path, post_map_path, feature_path,
                 aux_path, attr_path, settings['min_images'])
    print 'Training Attributes:', db.attr_names
    model = neural_net(num_attributes=len(db.attr_inds), aux_size=len(db.aux_list))

    if resume_train is None:
        start_train = 0
    else:
        # Recover the highest epoch number from saved "*_<epoch>.dat" files.
        epochs_str = [el.split('_')[-1].split('.')[0]
                      for el in glob.glob('log/' + resume_train + "/*.dat")]
        if 'model' in epochs_str:
            # Skip the final "model.dat" artifact, which has no epoch suffix.
            epochs_str.remove('model')
        last_epoch = np.max([int(el) for el in epochs_str])
        # last_epoch = np.max([int(el.split('_')[-1][0]) for el in glob.glob('log/' + resume_train + "/*.dat")])
        resume_path = 'log/' + resume_train + "/vgg_model_ep_" + str(last_epoch) + ".dat"
        start_train = last_epoch + 1
        if gpu is not None:
            model.load_state_dict(torch.load(resume_path, map_location='cuda:' + str(gpu)))
        else:
            # NOTE(review): the lambda's first parameter is the storage, not a
            # GPU — returning it keeps tensors on CPU, but the naming
            # (``gpu``) is misleading; conventionally ``lambda storage, loc: storage``.
            model.load_state_dict(torch.load(resume_path, map_location=lambda gpu, loc: gpu))

    # Initializing PyTorch Dataloader
    dataloader = DataLoader(db, batch_size=settings['batch_size'], shuffle=True, num_workers=4)
    mr_loss = MarginRankingLoss(margin=0.3).to(gpu)
    optimizer = optim.Adadelta(model.parameters(), lr=settings['lr'], weight_decay=1e-5)
    model = model.to(gpu)

    step = 0
    for epoch in range(start_train, settings['num_epochs']):
        print 'Epoch', epoch
        pbar = tqdm(total=db.__len__())
        for i_batch, sample_batched in enumerate(dataloader):
            optimizer.zero_grad()
            # Two images per sample; aux labels are per-attribute targets.
            image_1 = sample_batched['image_1'].type(torch.FloatTensor)
            image_2 = sample_batched['image_2'].type(torch.FloatTensor)
            aux_1 = sample_batched['label_1'].type(torch.FloatTensor).to(gpu)
            aux_2 = sample_batched['label_2'].type(torch.FloatTensor).to(gpu)
            # Pairwise ordering target for the margin-ranking loss.
            gt = (aux_1 > aux_2).type(torch.FloatTensor)
            reg_loss_1 = torch.zeros(image_1.shape[0], dtype=torch.float32)
            reg_loss_2 = torch.zeros(image_1.shape[0], dtype=torch.float32)
            ranking_loss = torch.zeros(image_1.shape[0], dtype=torch.float32)
            if gpu is not None:
                image_1 = image_1.to(gpu)
                image_2 = image_2.to(gpu)
                aux_1 = aux_1.to(gpu)
                aux_2 = aux_2.to(gpu)
                gt = gt.to(gpu)
                reg_loss_1 = reg_loss_1.to(gpu)
                reg_loss_2 = reg_loss_2.to(gpu)
                ranking_loss = ranking_loss.to(gpu)
            out_1 = model(image_1)
            out_2 = model(image_2)
            for i in range(len(db.attr_inds)):
                # avg over attributes
                ranking_loss += mr_loss(out_1[i], out_2[i], gt[:, i])
            ranking_loss = ranking_loss / len(db.attr_inds)
            if fixed_std:
                # Fixed-width (0.1) Gaussian targets around the aux labels;
                # q is the empirical Gaussian of the model's per-attribute output.
                p = [torch.distributions.normal.Normal(aux_1[:, i], 0.1)
                     for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_1[i].mean(1).squeeze(), out_1[i].std(1).squeeze())
                     for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):
                    # avg over attributes
                    reg_loss_1 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_1 = reg_loss_1 / len(db.attr_inds)
                p = [torch.distributions.normal.Normal(aux_2[:, i], 0.1)
                     for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_2[i].mean(1).squeeze(), out_2[i].std(1).squeeze())
                     for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):
                    # avg over attributes
                    reg_loss_2 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_2 = reg_loss_2 / len(db.attr_inds)
            else:
                # Learned/stored per-source stds instead of a fixed 0.1.
                p = [torch.distributions.normal.Normal(aux_1[:, i], model.aux_stds[sample_batched['aux_1'], i])
                     for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_1[i].mean(1).squeeze(), out_1[i].std(1).squeeze())
                     for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):
                    # avg over attributes
                    reg_loss_1 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_1 = reg_loss_1 / len(db.attr_inds)
                p = [torch.distributions.normal.Normal(aux_2[:, i], model.aux_stds[sample_batched['aux_2'], i])
                     for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_2[i].mean(1).squeeze(), out_2[i].std(1).squeeze())
                     for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):
                    # avg over attributes
                    reg_loss_2 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_2 = reg_loss_2 / len(db.attr_inds)
            ranking_loss = ranking_loss.mean()  # avg over batch
            reg_loss = reg_loss_1.mean() + reg_loss_2.mean()  # avg over batch
            loss = reg_loss + ranking_loss
            step += 1
            logs['train'].log_value('loss', loss.item(), step)
            loss.backward()
            optimizer.step()
            _loss = loss.item()
            pbar.update(image_1.shape[0])
        pbar.close()
        if epoch % 50 == 0:
            # Periodic evaluation; switch to eval mode and back.
            model.eval()
            test(model, db, gpu, logs=logs, step=step)
            model.train()
        persist_model(model, experiment_folder + '/vgg_model_ep_' + str(epoch) + '.dat')
    # Performing final evaluation
    model.eval()
    test(model, db, gpu)
    persist_model(model, model_path)
    return
# Top-level linear-finetune loop on the PlacesRoom dataset.
# NOTE(review): `model`, `classifier`, `optimizer`, and `train` are defined
# earlier in the file (outside this fragment); this block cannot run alone.
train_data = PlacesRoom(train=True)
test_data = PlacesRoom(train=False)
train_dataloader = DataLoader(train_data, batch_size=256, num_workers=6, shuffle=True, drop_last=True)
# NOTE(review): shuffling the validation loader is unusual — presumably
# harmless for accuracy computation, but worth confirming.
val_dataloader = DataLoader(test_data, batch_size=256, num_workers=6, shuffle=True, drop_last=False)
criterion = nn.CrossEntropyLoss().cuda()
logger = tb_logger.Logger(logdir="linear_finetune", flush_secs=2)

# 60 epochs, fixed learning rate (the schedule below is disabled).
for epoch in range(60):
    # adjust_learning_rate(epoch, FLAGS, optimizer)
    print("==> training...")
    train_acc, train_acc5, train_loss = train(epoch, train_dataloader, model, classifier, criterion, optimizer)
    logger.log_value('train_acc', train_acc, epoch)
    logger.log_value('train_acc5', train_acc5, epoch)
    logger.log_value('train_loss', train_loss, epoch)
    print("==> testing...")
def main():
    """Train a push-pull ResNet/DenseNet classifier on CIFAR-10.

    Parses CLI args into the module-global ``args``, builds the data
    pipeline and model, optionally resumes, then runs a MultiStepLR-scheduled
    training loop with per-epoch validation and best-checkpoint saving.
    """
    global args, best_prec1, use_cuda
    args = parser.parse_args()

    # Data loading code
    # CIFAR statistics given in 0-255 scale, rescaled to [0, 1].
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])

    if args.augment:
        # Standard CIFAR augmentation: 4-pixel reflect pad + random 32x32 crop.
        transform_train = transforms.Compose([
            transforms.ToTensor(),
            transforms.Lambda(lambda x: F.pad(x.unsqueeze(0), (4, 4, 4, 4), mode='reflect').squeeze()),
            transforms.ToPILImage(),
            transforms.RandomCrop(32),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor()
        ])
    else:
        transform_train = transforms.Compose([transforms.ToTensor()])
    transform_test = transforms.Compose([transforms.ToTensor()])

    kwargs = {'num_workers': 0, 'pin_memory': True}
    assert (args.dataset == 'cifar10' or args.dataset == 'cifar100' or args.dataset == 'svhn')
    if args.dataset == 'cifar10':
        nclasses = 10
        dataset_train = NCIFAR10('./data', train=True, transform=transform_train, normalize_transform=normalize)
        dataset_test = NCIFAR10('./data', train=False, transform=transform_test, normalize_transform=normalize)
    else:
        raise RuntimeError('no other data set implementations available')
    # The string below is disabled code for cifar100/svhn support.
    '''
    elif args.dataset == 'cifar100':
        nclasses = 100
        dataset_train = NCIFAR100('./data', train=True, transform=transform_train,
                                  normalize_transform=normalize, download=True)
        dataset_test = NCIFAR100('./data', train=False, transform=transform_test,
                                 normalize_transform=normalize, download=True)
    elif args.dataset == 'svhn':
        nclasses = 10
        dataset_train = NSVHN('./data', split='train', transform=transform_train,
                              normalize_transform=normalize)
        dataset_test = NSVHN('./data', split='test', transform=transform_test,
                             normalize_transform=normalize)
    '''

    train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, **kwargs)
    val_loader = torch.utils.data.DataLoader(dataset_test, batch_size=args.batch_size, shuffle=False, **kwargs)

    # --------------------------------------------------------------------------------
    # create model
    if args.arch == 'resnet':
        experiment_dir = 'experiments/'
        output_dir = experiment_dir + 'resnet-cifar/'
        rnargs = {
            'use_pp1': args.pushpull,
            'pp_block1': args.pp_block1,
            'pp_all': args.pp_all,
            'train_alpha': args.train_alpha,
            'size_lpf': args.lpf_size
        }
        # NOTE(review): if args.layers matches none of these, `model` is never
        # bound and the code fails later with UnboundLocalError.
        if args.layers == 20:
            model = resnet20(**rnargs)
        elif args.layers == 32:
            model = resnet32(**rnargs)
        elif args.layers == 44:
            model = resnet44(**rnargs)
        elif args.layers == 56:
            model = resnet56(**rnargs)
    elif args.arch == 'densenet':
        experiment_dir = 'experiments/'
        output_dir = experiment_dir + 'densenet-cifar/'
        rnargs = {
            'use_pp1': args.pushpull,
            'pp_block1': args.pp_block1,
            'num_classes': nclasses,
            'small_inputs': True,
            'efficient': args.efficient,
            'compression': args.reduce,
            'drop_rate': args.droprate,
            # 'scale_pp': args.scale_pp,
            # 'alpha_pp': args.alpha_pp
        }
        if args.layers == 40:
            model = densenet40_12(**rnargs)
        elif args.layers == 100:
            if args.growth == 12:
                model = densenet100_12(**rnargs)
            elif args.growth == 24:
                model = densenet100_24(**rnargs)
    else:
        raise RuntimeError('chosen architecture not implemented (yet)...')

    logger = None
    if args.tensorboard:
        # Unique run directory: timestamp + short random suffix.
        ustr = datetime.datetime.now().strftime(
            "%y-%m-%d_%H-%M_") + uuid.uuid4().hex[:3]
        logger = tensorboard_logger.Logger(experiment_dir + "tensorboard/" + args.name + '/' + ustr)

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    # --------------------------------------------------------------------------------
    use_cuda = torch.cuda.is_available()
    # for training on multiple GPUs.
    # Use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use
    # model = torch.nn.DataParallel(model).cuda()
    if use_cuda:
        model = model.cuda()

    # optionally resume from a checkpoint
    epoch = None
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    if use_cuda:
        criterion = criterion.cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                nesterov=args.nesterov,
                                weight_decay=args.weight_decay)

    lr_milestones = json.loads(args.milestones)
    if args.extra_epochs > 0:
        lr_milestones = list(
            set(lr_milestones + json.loads(args.extra_milestones)))
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=lr_milestones,
                                                     gamma=0.1)
    # Fast-forward the schedule to the resumed epoch (None on a fresh run).
    # NOTE(review): passing an epoch to scheduler.step() is deprecated in
    # newer PyTorch; consider using the `last_epoch` constructor argument.
    scheduler.step(epoch)

    directory = output_dir + "%s/" % args.name
    if not os.path.exists(directory):
        os.makedirs(directory)

    for epoch in range(args.start_epoch, args.epochs + args.extra_epochs):
        fileout = open(output_dir + args.name + '/log.txt', "a+")
        # adjust_learning_rate(logger, optimizer, epoch + 1, args.epochs)
        scheduler.step()
        print('lr(', epoch, '): ', scheduler.get_lr())

        # train for one epoch
        train(logger, train_loader, model, criterion, optimizer, epoch, fileout)

        # evaluate on validation set
        prec1 = validate(logger, val_loader, model, criterion, epoch, fileout)
        fileout.close()

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
            }, is_best, output_dir)

    print('Best accuracy: ', best_prec1)
    fileout = open(output_dir + args.name + '/log.txt', "a+")
    fileout.write('Best accuracy: {}\n'.format(best_prec1))
    fileout.close()
def _setup_tensorboard_logger(self): if self.checkpoint_path: self._tensortboard_logger = tensorboard_logger.Logger( self.checkpoint_path, flush_secs=5) else: self._tensortboard_logger = None
def main():
    """Entry point for video TemporalDis (MoCo-style) self-supervised training.

    Hard-codes most video-specific options onto ``args``, builds the chosen
    3D backbone plus an EMA copy, the MoCo memory bank, and temporal
    positive/negative augmentations, then runs the training loop.
    """
    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # create model and optimizer
    # == dataset config==
    """
    CUDA_VISIBLE_DEVICES=0,1 python train_temporal_dis.py \
    --batch_size 16 --num_workers 8 --nce_k 3569 --softmax --moco
    """
    # args.dataset = 'hmdb51'
    # args.train_list = '../datasets/lists/hmdb51/hmdb51_rgb_train_split_1.txt'
    # args.val_list = '../datasets/lists/hmdb51/hmdb51_rgb_val_split_1.txt'
    """
    CUDA_VISIBLE_DEVICES=1 python train_temporal_dis.py \
    --batch_size 16 --num_workers 8 --nce_k 9536 --softmax --moco
    """
    # args.print_freq = 100
    # args.dataset = 'ucf101'
    # args.train_list = '../datasets/lists/ucf101/ucf101_rgb_train_split_1.txt'
    # args.val_list = '../datasets/lists/ucf101/ucf101_rgb_val_split_1.txt'
    # args.print_freq = 1000
    # args.dataset = 'kinetics'
    # args.train_list = '../datasets/lists/kinetics-400/ssd_kinetics_video_trainlist.txt'
    # args.val_list = '../datasets/lists/kinetics-400/ssd_kinetics_video_vallist.txt'

    # Hard-coded video sampling / model options (override any CLI values).
    args.dropout = 0.5
    args.clips = 1
    args.data_length = 16
    args.stride = 4
    args.spatial_size = 224
    args.root = ""
    args.mode = 'rgb'
    args.eval_indict = 'loss'
    args.pt_loss = 'TemporalDis'
    args.workers = 4
    # args.arch = 'i3d'  # 'r2p1d'

    num_class, data_length, image_tmpl = data_config(args)
    train_transforms, test_transforms, eval_transforms = augmentation_config(args)
    train_loader, val_loader, eval_loader, train_samples, val_samples, eval_samples = data_loader_init(
        args, data_length, image_tmpl, train_transforms, test_transforms, eval_transforms)
    # NOTE(review): this is the number of *batches*, not samples — the other
    # training scripts in this file pass len(dataset) as the memory-bank
    # size. Verify this is intentional.
    n_data = len(train_loader)

    if args.arch == 'i3d':
        model = I3D(num_classes=101, modality=args.mode, dropout_prob=args.dropout, with_classifier=False)
        model_ema = I3D(num_classes=101, modality=args.mode, dropout_prob=args.dropout, with_classifier=False)
    elif args.arch == 'r2p1d':
        model = R2Plus1DNet((1, 1, 1, 1), num_classes=num_class, with_classifier=False)
        model_ema = R2Plus1DNet((1, 1, 1, 1), num_classes=num_class, with_classifier=False)
    elif args.arch == 'r3d':
        from model.r3d import resnet18
        model = resnet18(num_classes=num_class, with_classifier=False)
        model_ema = resnet18(num_classes=num_class, with_classifier=False)
    else:
        # NOTE(review): the exception is constructed but never raised —
        # an unsupported arch falls through and fails later with an
        # UnboundLocalError on `model`. Should be
        # ``raise NotImplementedError(...)`` (message also has a typo).
        Exception("Not implemene error!")
    model = torch.nn.DataParallel(model)
    model_ema = torch.nn.DataParallel(model_ema)

    # random initialization
    model.apply(weights_init)
    model_ema.apply(weights_init)
    # copy weights from `model' to `model_ema'
    # (momentum 0 => hard copy of the current weights).
    moment_update(model, model_ema, 0)

    contrast = MemoryMoCo(128, n_data, args.nce_k, args.nce_t, args.softmax).cuda(args.gpu)
    # contrast2 = MemoryMoCo(128, n_data, args.nce_k, args.nce_t, args.softmax).cuda(args.gpu)
    criterion = NCESoftmaxLoss() if args.softmax else NCECriterion(n_data)
    criterion = criterion.cuda(args.gpu)
    cls_criterion = nn.CrossEntropyLoss().cuda()

    model = model.cuda()
    if args.moco:
        model_ema = model_ema.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True

    if args.amp:
        # NVIDIA apex mixed precision; the EMA model gets a dummy zero-lr
        # optimizer only because amp.initialize requires one.
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)
        if args.moco:
            optimizer_ema = torch.optim.SGD(model_ema.parameters(),
                                            lr=0,
                                            momentum=0,
                                            weight_decay=0)
            model_ema, optimizer_ema = amp.initialize(model_ema, optimizer_ema, opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            # checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            if args.moco:
                model_ema.load_state_dict(checkpoint['model_ema'])
            if args.amp and checkpoint['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded successfully '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)
    logger2 = tb_logger.Logger(logdir=args.tb_folder2, flush_secs=2)

    #==================================== our data augmentation method=================================
    pos_aug = GenPositive()
    neg_aug = GenNegative()

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        loss, prob = train_moco(epoch, train_loader, model, model_ema, contrast,
                                criterion, optimizer, args, pos_aug, neg_aug)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        saving(logger, loss, epoch, optimizer, args, model, contrast, prob, model_ema, 'TemporalDis')
def add_tensorboard_output(self, file_name):
    """Turn on tensorboard output, writing event files to *file_name*."""
    # Lazy import: the tensorboard_logger dependency is only required
    # when tensorboard output is actually requested.
    import tensorboard_logger as tbl
    self._use_tensorboard = True
    self.tensorboard_logger = tbl.Logger(file_name)
def main():
    """Distill a trained few-shot teacher into a student model.

    Builds the dataset-specific loaders (miniImageNet / tieredImageNet /
    CIFAR-FS / FC100), loads the teacher, creates the student and the
    distillation criteria, then runs the supervised-distillation loop with
    periodic checkpointing. The ordering of ``module_list`` and
    ``criterion_list`` entries is a contract with ``train()``.
    """
    # NOTE(review): ``best_acc`` is never read or updated below.
    best_acc = 0

    opt = parse_option()

    # tensorboard logger
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # dataloader
    train_partition = 'trainval' if opt.use_trainval else 'train'

    if opt.dataset == 'miniImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        if opt.distill in ['contrast']:
            # Contrastive distillation also needs negative-sample indices.
            train_set = ImageNet(args=opt, partition=train_partition, transform=train_trans,
                                 is_sample=True, k=opt.nce_k)
        else:
            train_set = ImageNet(args=opt, partition=train_partition, transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(ImageNet(args=opt, partition='val', transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaImageNet(args=opt, partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaImageNet(args=opt, partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            n_cls = 64
    elif opt.dataset == 'tieredImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        if opt.distill in ['contrast']:
            train_set = TieredImageNet(args=opt, partition=train_partition, transform=train_trans,
                                       is_sample=True, k=opt.nce_k)
        else:
            train_set = TieredImageNet(args=opt, partition=train_partition, transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(TieredImageNet(args=opt, partition='train_phase_val', transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaTieredImageNet(
            args=opt, partition='test',
            train_transform=train_trans,
            test_transform=test_trans),
            batch_size=opt.test_batch_size,
            shuffle=False,
            drop_last=False,
            num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaTieredImageNet(
            args=opt, partition='val',
            train_transform=train_trans,
            test_transform=test_trans),
            batch_size=opt.test_batch_size,
            shuffle=False,
            drop_last=False,
            num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 448
        else:
            n_cls = 351
    elif opt.dataset == 'CIFAR-FS' or opt.dataset == 'FC100':
        # Both CIFAR few-shot variants share transform set 'D'.
        train_trans, test_trans = transforms_options['D']
        if opt.distill in ['contrast']:
            train_set = CIFAR100(args=opt, partition=train_partition, transform=train_trans,
                                 is_sample=True, k=opt.nce_k)
        else:
            train_set = CIFAR100(args=opt, partition=train_partition, transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(CIFAR100(args=opt, partition='train', transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaCIFAR100(args=opt, partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaCIFAR100(args=opt, partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            if opt.dataset == 'CIFAR-FS':
                n_cls = 64
            elif opt.dataset == 'FC100':
                n_cls = 60
            else:
                raise NotImplementedError('dataset not supported: {}'.format(
                    opt.dataset))
    else:
        raise NotImplementedError(opt.dataset)

    # model
    model_t = load_teacher(opt.path_t, n_cls, opt.dataset)
    model_s = create_model(opt.model_s, n_cls, opt.dataset)

    # Dummy forward pass to discover feature dimensions for the embeddings.
    data = torch.randn(2, 3, 84, 84)
    model_t.eval()
    model_s.eval()
    feat_t, _ = model_t(data, is_feat=True)
    feat_s, _ = model_s(data, is_feat=True)

    module_list = nn.ModuleList([])
    module_list.append(model_s)
    trainable_list = nn.ModuleList([])
    trainable_list.append(model_s)

    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    if opt.distill == 'kd':
        criterion_kd = DistillKL(opt.kd_T)
    elif opt.distill == 'contrast':
        criterion_kd = NCELoss(opt, n_data)
        # Project student/teacher penultimate features to a shared dim.
        embed_s = Embed(feat_s[-1].shape[1], opt.feat_dim)
        embed_t = Embed(feat_t[-1].shape[1], opt.feat_dim)
        module_list.append(embed_s)
        module_list.append(embed_t)
        trainable_list.append(embed_s)
        trainable_list.append(embed_t)
    elif opt.distill == 'attention':
        criterion_kd = Attention()
    elif opt.distill == 'hint':
        criterion_kd = HintLoss()
    else:
        raise NotImplementedError(opt.distill)

    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(
        criterion_div)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kd)  # other knowledge distillation loss

    # optimizer
    optimizer = optim.SGD(trainable_list.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    # append teacher after optimizer to avoid weight_decay
    module_list.append(model_t)

    if torch.cuda.is_available():
        module_list.cuda()
        criterion_list.cuda()
        cudnn.benchmark = True

    # validate teacher accuracy
    teacher_acc, _, _ = validate(val_loader, model_t, criterion_cls, opt)
    print('teacher accuracy: ', teacher_acc)

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)

    # routine: supervised model distillation
    for epoch in range(1, opt.epochs + 1):

        # NOTE(review): stepping before train() is the legacy (pre-1.1)
        # PyTorch scheduler ordering; newer versions expect step() after
        # the optimizer updates.
        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, module_list, criterion_list, optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model_s, criterion_cls, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

    # save the last model
    state = {
        'opt': opt,
        'model': model_s.state_dict(),
    }
    save_file = os.path.join(opt.save_folder, '{}_last.pth'.format(opt.model_s))
    torch.save(state, save_file)
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point for distributed MoCo-style pretraining.

    Builds the MoCo model (for ``args.arch == 'resnet_ttt'`` also the
    ext/head/ssh branches sharing encoder_q parameters and a plain
    ``model_val`` copy used for SVM evaluation), wraps model and ssh in
    DistributedDataParallel, optionally resumes from a checkpoint, then
    either runs a one-shot SVM evaluation (``args.val``) or the training
    loop with periodic checkpointing and SVM-based best-model selection.

    Args:
        gpu: GPU index assigned to this process (becomes ``args.gpu``).
        ngpus_per_node: number of GPUs per node; used to derive the global
            rank and to split batch size / workers across processes.
        args: parsed command-line options.
    """
    global best_acc1  # stliu: best accuracy
    args.gpu = gpu
    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    # stliu: add resnet_ttt
    if args.arch == 'resnet_ttt':
        model = moco.builder.MoCo(ResNetCifar,
                                  args.moco_dim,
                                  args.moco_k,
                                  args.moco_m,
                                  args.moco_t,
                                  args.mlp,
                                  width=args.width,
                                  norm=args.norm)
        _, ext, head, ssh = build_model(
            args, model.encoder_q
        )  # stliu: ext, head and ssh share same paras as encoder_q
        # stliu: SVM with model_val on single GPU
        norm_layer = get_norm(args.norm)
        model_val = ResNetCifar(num_classes=args.moco_dim,
                                width=args.width,
                                norm_layer=norm_layer)
    else:
        # NOTE(review): this branch defines only `model`; `model_val`, `ssh`
        # and `head` are referenced unconditionally below and would raise
        # NameError for non-'resnet_ttt' archs — confirm only 'resnet_ttt'
        # is actually used with this script.
        model = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim,
                                  args.moco_k, args.moco_m, args.moco_t,
                                  args.mlp)
    # print(model)  # stliu: comment this
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model_val.cuda(args.gpu)  # stliu: for SVM
            ssh = ssh.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            # stliu: add broadcast_buffers=False to use normal BN
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[args.gpu],
                broadcast_buffers=False,
                find_unused_parameters=True)
            ssh = torch.nn.parallel.DistributedDataParallel(
                ssh,
                device_ids=[args.gpu],
                broadcast_buffers=False,
                find_unused_parameters=True)
            # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            # ssh = torch.nn.parallel.DistributedDataParallel(ssh, device_ids=[args.gpu])
        else:
            model.cuda()
            model_val.cuda()  # stliu: for SVM
            ssh = ssh.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(
                model, broadcast_buffers=False, find_unused_parameters=True)
            ssh = torch.nn.parallel.DistributedDataParallel(
                ssh, broadcast_buffers=False, find_unused_parameters=True)
            # model = torch.nn.parallel.DistributedDataParallel(model)
            # ssh = torch.nn.parallel.DistributedDataParallel(ssh)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        model_val = model_val.cuda(args.gpu)  # stliu: for SVM
        ssh = ssh.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    # Optimize both the DDP-wrapped encoder and the (non-wrapped) head.
    parameters = list(model.parameters()) + list(head.parameters())
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            head.load_state_dict(checkpoint['head'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # stliu: I design it as a function
    train_loader, train_sampler, memory_loader, test_loader, teset = get_loader(
        args)
    if args.val:
        # Evaluation-only path: strip the 'module.encoder_q.' DDP/MoCo prefix
        # (keeping the fc head out) so weights load into the plain model_val.
        state_dict = model.state_dict()
        for k in list(state_dict.keys()):
            if k.startswith('module.encoder_q'
                            ) and not k.startswith('module.encoder_q.fc'):
                state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                del state_dict[k]
        model_val.load_state_dict(state_dict, strict=False)
        flag_liblinear = '-s 2 -q -n ' + str(args.workers)
        if args.ttt:
            test_acc_svm = ttt_test(memory_loader, model, model_val,
                                    test_loader, flag_liblinear, args, ssh,
                                    teset, head)
        else:
            test_acc_svm = test(memory_loader, model, model_val, test_loader,
                                flag_liblinear, args, ssh)
        print('#### result ####\n' + args.val + ':', test_acc_svm,
              '\n################')
    else:
        # stliu: tensorboard
        logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)
        for epoch in range(args.start_epoch, args.epochs + 1):
            # stliu: to save the last one
            if args.distributed:
                train_sampler.set_epoch(epoch)
            adjust_learning_rate(optimizer, epoch, args)
            # train for one epoch
            loss = train(train_loader, model, criterion, optimizer, epoch,
                         args, ssh)
            # stliu: tensorboard logger
            logger.log_value('loss', loss, epoch)
            # Only the (global) master process saves checkpoints.
            if not args.multiprocessing_distributed or (
                    args.multiprocessing_distributed
                    and args.rank % ngpus_per_node == 0):
                if (epoch % args.save_freq == 0 and epoch != 0
                    ) or epoch == args.epochs:  # stliu: ignore the first model
                    print('==> Saving...')
                    save_checkpoint(
                        {
                            'epoch': epoch + 1,
                            'arch': args.arch,
                            'state_dict': model.state_dict(),
                            'head': head.state_dict(),
                            'optimizer': optimizer.state_dict(),
                        },
                        is_best=False,
                        filename=args.model_folder +
                        '/checkpoint_{:04d}.pth.tar'.format(epoch))
                # stliu: test with SVM
                if (epoch + 1) % args.svm_freq == 0:
                    # Same prefix-stripping as in the args.val branch above.
                    state_dict = model.state_dict()
                    for k in list(state_dict.keys()):
                        if k.startswith('module.encoder_q'
                                        ) and not k.startswith(
                                            'module.encoder_q.fc'):
                            state_dict[
                                k[len("module.encoder_q."):]] = state_dict[k]
                            del state_dict[k]
                    model_val.load_state_dict(state_dict, strict=False)
                    flag_liblinear = '-s 2 -q -n ' + str(args.workers)
                    test_acc_svm = test(memory_loader, model, model_val,
                                        test_loader, flag_liblinear, args, ssh)
                    # stliu: save the best model
                    is_best = test_acc_svm > best_acc1
                    best_acc1 = max(test_acc_svm, best_acc1)
                    if is_best:
                        print('==> Saving the Best...')
                        save_checkpoint(
                            {
                                'epoch': epoch + 1,
                                'arch': args.arch,
                                'state_dict': model.state_dict(),
                                'head': head.state_dict(),
                                'optimizer': optimizer.state_dict(),
                            },
                            is_best=True,
                            filename=args.model_folder + '/best.pth.tar')
        print('The Best SVM Accuracy:', best_acc1)
def main():
    """Train a student network by knowledge distillation on CIFAR-100.

    Loads a teacher checkpoint, builds the student, selects one of many
    distillation criteria via ``opt.distill`` (kd/hint/crd/attention/...),
    optionally embeds a USENIX-style watermark into the teacher and/or
    initializes the student from noisy teacher weights, then runs the
    training loop with tensorboard logging, best-model and periodic
    checkpointing.
    """
    best_acc = 0
    opt = parse_option()
    # tensorboard logger
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)
    # dataloader
    if opt.dataset == 'cifar100':
        if opt.distill in ['crd']:
            # CRD needs contrastive negative sampling in the loader.
            train_loader, val_loader, n_data = get_cifar100_dataloaders_sample(
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                k=opt.nce_k,
                mode=opt.mode)
        else:
            train_loader, val_loader, n_data = get_cifar100_dataloaders(
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                n_test=100,
                is_instance=True)
        n_cls = 100
    else:
        raise NotImplementedError(opt.dataset)
    # model
    model_t = load_teacher(opt.path_t, n_cls)
    model_s = model_dict[opt.model_s](num_classes=n_cls)
    # Dummy forward pass to discover intermediate feature shapes, which the
    # feature-based distillation criteria below need at construction time.
    data = torch.randn(2, 3, 32, 32)
    model_t.eval()
    model_s.eval()
    feat_t, _ = model_t(data, is_feat=True)
    feat_s, _ = model_s(data, is_feat=True)
    module_list = nn.ModuleList([])
    module_list.append(model_s)
    trainable_list = nn.ModuleList([])
    trainable_list.append(model_s)
    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    if opt.distill == 'kd':
        criterion_kd = DistillKL(opt.kd_T)
    elif opt.distill == 'hint':
        criterion_kd = HintLoss()
        regress_s = ConvReg(feat_s[opt.hint_layer].shape,
                            feat_t[opt.hint_layer].shape)
        module_list.append(regress_s)
        trainable_list.append(regress_s)
    elif opt.distill == 'crd':
        opt.s_dim = feat_s[-1].shape[1]
        opt.t_dim = feat_t[-1].shape[1]
        opt.n_data = n_data
        criterion_kd = CRDLoss(opt)
        module_list.append(criterion_kd.embed_s)
        module_list.append(criterion_kd.embed_t)
        trainable_list.append(criterion_kd.embed_s)
        trainable_list.append(criterion_kd.embed_t)
    elif opt.distill == 'attention':
        criterion_kd = Attention()
    elif opt.distill == 'nst':
        criterion_kd = NSTLoss()
    elif opt.distill == 'similarity':
        criterion_kd = Similarity()
    elif opt.distill == 'rkd':
        criterion_kd = RKDLoss()
    elif opt.distill == 'pkt':
        criterion_kd = PKT()
    elif opt.distill == 'kdsvd':
        criterion_kd = KDSVD()
    elif opt.distill == 'correlation':
        criterion_kd = Correlation()
        embed_s = LinearEmbed(feat_s[-1].shape[1], opt.feat_dim)
        embed_t = LinearEmbed(feat_t[-1].shape[1], opt.feat_dim)
        module_list.append(embed_s)
        module_list.append(embed_t)
        trainable_list.append(embed_s)
        trainable_list.append(embed_t)
    elif opt.distill == 'vid':
        s_n = [f.shape[1] for f in feat_s[1:-1]]
        t_n = [f.shape[1] for f in feat_t[1:-1]]
        criterion_kd = nn.ModuleList(
            [VIDLoss(s, t, t) for s, t in zip(s_n, t_n)]
        )  # add this as some parameters in VIDLoss need to be updated
        trainable_list.append(criterion_kd)
    elif opt.distill == 'abound':
        s_shapes = [f.shape for f in feat_s[1:-1]]
        t_shapes = [f.shape for f in feat_t[1:-1]]
        connector = Connector(s_shapes, t_shapes)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(connector)
        init_trainable_list.append(model_s.get_feat_modules())
        criterion_kd = ABLoss(len(feat_s[1:-1]))
        init(model_s, model_t, init_trainable_list, criterion_kd, train_loader,
             logger, opt)
        # classification
        module_list.append(connector)
    elif opt.distill == 'factor':
        s_shape = feat_s[-2].shape
        t_shape = feat_t[-2].shape
        paraphraser = Paraphraser(t_shape)
        translator = Translator(s_shape, t_shape)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(paraphraser)
        criterion_init = nn.MSELoss()
        init(model_s, model_t, init_trainable_list, criterion_init,
             train_loader, logger, opt)
        # classification
        criterion_kd = FactorTransfer()
        module_list.append(translator)
        module_list.append(paraphraser)
        trainable_list.append(translator)
    elif opt.distill == 'fsp':
        s_shapes = [s.shape for s in feat_s[:-1]]
        t_shapes = [t.shape for t in feat_t[:-1]]
        criterion_kd = FSP(s_shapes, t_shapes)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(model_s.get_feat_modules())
        init(model_s, model_t, init_trainable_list, criterion_kd, train_loader,
             logger, opt)
        # classification training
        pass
    else:
        raise NotImplementedError(opt.distill)
    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(
        criterion_div)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kd)  # other knowledge distillation loss
    # optimizer
    optimizer = optim.SGD(trainable_list.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)
    # append teacher after optimizer to avoid weight_decay
    module_list.append(model_t)
    if torch.cuda.is_available():
        module_list.cuda()
        criterion_list.cuda()
        cudnn.benchmark = True
    # embed the watermark into teacher model
    wm_loader = None
    if opt.watermark == "usenix":
        # paper: https://www.usenix.org/system/files/conference/usenixsecurity18/sec18-adi.pdf
        # Define an optimizer for the teacher
        trainable_list_t = nn.ModuleList([model_t])
        optimizer_t = optim.SGD(trainable_list_t.parameters(),
                                lr=opt.learning_rate,
                                momentum=opt.momentum,
                                weight_decay=opt.weight_decay)
        wm_loader = get_usenixwm_dataloader()
        print("## Train data + Watermark Val Acc")
        teacher_acc, _, _ = validate(val_loader, model_t,
                                     nn.CrossEntropyLoss(), opt)
        print("==> embedding USENIX watermark...")
        max_epochs = 250  # Cutoff val
        # Fine-tune teacher on the watermark set until top-1 reaches 97%.
        for epoch in range(1, max_epochs + 1):
            set_learning_rate(2e-4, optimizer_t)
            top1, top5 = train_vanilla(epoch, wm_loader, model_t,
                                       nn.CrossEntropyLoss(), optimizer_t, opt)
            if top1 >= 97:
                break
        print("## Watermark val")
        teacher_acc, _, _ = validate(wm_loader, model_t, nn.CrossEntropyLoss(),
                                     opt)
        print("==> done")
    # validate teacher accuracy
    teacher_acc, _, _ = validate(val_loader, model_t, criterion_cls, opt)
    print('teacher accuracy: ', teacher_acc)
    # If teacher and student models match, copy over weights for initialization
    if (opt.init_strat == "noise") and (type(model_t) == type(model_s)):
        print("==> Copying teachers weights to student with a weight of {}".
              format(opt.init_inv_corr))
        model_s.load_state_dict(model_t.state_dict())
        # Perturb every student parameter with Gaussian noise scaled by
        # opt.init_inv_corr so the student starts near, not at, the teacher.
        with torch.no_grad():
            for param in model_s.parameters():
                if torch.cuda.is_available():
                    noise = (torch.randn(param.size()) *
                             opt.init_inv_corr).cuda()
                    param.add_(noise)
                else:
                    param.add_(torch.randn(param.size()) * opt.init_inv_corr)
        student_acc, _, _ = validate(val_loader, model_s, criterion_cls, opt)
        print('student accuracy: ', student_acc)
        print("==> done")
    # routine
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")
        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, module_list,
                                      criterion_list, optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))
        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)
        # NOTE(review): 'tect_acc_top5' is a typo for 'test_acc_top5'; it is
        # used consistently below so behavior is unaffected.
        test_acc, tect_acc_top5, test_loss = validate(val_loader, model_s,
                                                      criterion_cls, opt)
        if wm_loader is not None:
            print("==> wm retention")
            wm_top1, wm_top5, wm_loss = validate(wm_loader, model_s,
                                                 criterion_cls, opt)
            logger.log_value('wm_ret', wm_top1, epoch)
        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_loss', test_loss, epoch)
        logger.log_value('test_acc_top5', tect_acc_top5, epoch)
        # save the best model
        if test_acc > best_acc:
            best_acc = test_acc
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
                'best_acc': best_acc,
            }
            save_file = os.path.join(opt.save_folder,
                                     '{}_best.pth'.format(opt.model_s))
            print('saving the best model!')
            torch.save(state, save_file)
        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
                'accuracy': test_acc,
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
    # This best accuracy is only for printing purpose.
    # The results reported in the paper/README is from the last epoch.
    print('best accuracy:', best_acc)
    # save model
    state = {
        'opt': opt,
        'model': model_s.state_dict(),
    }
    save_file = os.path.join(opt.save_folder,
                             '{}_last.pth'.format(opt.model_s))
    torch.save(state, save_file)
def main():
    """Contrastive (L/ab) pretraining entry point.

    Parses options, builds the loader/model/optimizer via the project
    helpers, optionally resumes from a checkpoint, then trains for the
    configured number of epochs, logging losses/probabilities to
    tensorboard and saving a checkpoint every ``save_freq`` epochs.
    """
    # command-line options
    args = parse_option()

    # data, model and optimizer come from project helpers
    train_loader, n_data = get_train_loader(args)
    model, contrast, criterion_ab, criterion_l = set_model(args, n_data)
    optimizer = set_optimizer(args, model)

    # resume from a checkpoint when one is given and exists on disk
    args.start_epoch = 1
    if args.resume:
        if not os.path.isfile(args.resume):
            print("=> no checkpoint found at '{}'".format(args.resume))
        else:
            print("=> loading checkpoint '{}'".format(args.resume))
            ckpt = torch.load(args.resume)
            args.start_epoch = ckpt['epoch'] + 1
            model.load_state_dict(ckpt['model'])
            optimizer.load_state_dict(ckpt['optimizer'])
            contrast.load_state_dict(ckpt['contrast'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, ckpt['epoch']))

    # tensorboard logging
    tb = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # training loop
    for epoch in range(args.start_epoch, args.epochs + 1):
        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        t_start = time.time()
        l_loss, l_prob, ab_loss, ab_prob = train(epoch, train_loader, model,
                                                 contrast, criterion_l,
                                                 criterion_ab, optimizer, args)
        t_end = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, t_end - t_start))

        # per-epoch metrics
        tb.log_value('l_loss', l_loss, epoch)
        tb.log_value('l_prob', l_prob, epoch)
        tb.log_value('ab_loss', ab_loss, epoch)
        tb.log_value('ab_prob', ab_prob, epoch)

        # periodic checkpoint
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            snapshot = {
                'opt': args,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'contrast': contrast.state_dict(),
                'epoch': epoch,
            }
            ckpt_path = os.path.join(
                args.model_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(snapshot, ckpt_path)
def main():
    """Supervised CIFAR-100 training entry point.

    Builds the model (with ``grad_proj=True``), trains for the configured
    number of epochs with SGD + cross-entropy, logs metrics to tensorboard,
    and writes best / periodic / final checkpoints under
    ``opt.save_folder``.
    """
    best_acc = 0
    opt = parse_option()

    # data
    if opt.dataset != 'cifar100':
        raise NotImplementedError(opt.dataset)
    train_loader, val_loader = get_cifar100_dataloaders(
        batch_size=opt.batch_size, num_workers=opt.num_workers)
    n_cls = 100

    # model (gradient-projection variant; plain variant kept for reference)
    # model = model_dict[opt.model](num_classes=n_cls)
    dev = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = model_dict[opt.model](num_classes=n_cls,
                                  grad_proj=True,
                                  device=dev)

    # optimizer and loss
    optimizer = optim.SGD(model.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)
    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    # tensorboard
    tb = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # training loop
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        tic = time.time()
        acc_train, loss_train = train(epoch, train_loader, model, criterion,
                                      optimizer, opt)
        toc = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, toc - tic))

        tb.log_value('train_acc', acc_train, epoch)
        tb.log_value('train_loss', loss_train, epoch)

        acc_test, acc_test_top5, loss_test = validate(val_loader, model,
                                                      criterion, opt)
        tb.log_value('test_acc', acc_test, epoch)
        tb.log_value('test_acc_top5', acc_test_top5, epoch)
        tb.log_value('test_loss', loss_test, epoch)

        # best-so-far checkpoint
        if acc_test > best_acc:
            best_acc = acc_test
            print('saving the best model!')
            torch.save(
                {
                    'epoch': epoch,
                    'model': model.state_dict(),
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                },
                os.path.join(opt.save_folder,
                             '{}_best.pth'.format(opt.model)))

        # periodic checkpoint
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            torch.save(
                {
                    'epoch': epoch,
                    'model': model.state_dict(),
                    'accuracy': acc_test,
                    'optimizer': optimizer.state_dict(),
                },
                os.path.join(opt.save_folder,
                             'ckpt_epoch_{epoch}.pth'.format(epoch=epoch)))

    # This best accuracy is only for printing purpose.
    # The results reported in the paper/README is from the last epoch.
    print('best accuracy:', best_acc)

    # final checkpoint
    torch.save(
        {
            'opt': opt,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, os.path.join(opt.save_folder, '{}_last.pth'.format(opt.model)))
def main():
    """Self-distillation ("Ours") training on CIFAR-100 with soft labels.

    Trains with plain cross-entropy for the first ``opt.eta`` epochs
    (``train1``), then switches to the distillation criteria (``train2``).
    A 50000x100 logit buffer collects per-sample predictions; every
    ``opt.eta`` epochs the dataloaders are rebuilt so these logits serve as
    refreshed soft labels. Logs to tensorboard and writes best / periodic /
    final checkpoints.

    Raises:
        NotImplementedError: if ``opt.dataset`` is not ``'cifar100'``.
    """
    best_acc = 0
    opt = parse_option()
    print('method: %s' % 'Ours')
    print('model: %s' % opt.model)
    print('architecture: %s' % opt.arch)
    print('weight_decay %f' % opt.weight_decay)
    print('batch_size: %d' % opt.batch_size)
    print('r %f' % opt.gamma)
    #print('a %f' %opt.alpha)
    print('e %f' % opt.eta)
    print('KD_T %f' % opt.kd_T)
    # dataloader
    if opt.dataset == 'cifar100':
        n_cls = 100
    else:
        # Fix: previously any other dataset left n_cls undefined and crashed
        # with a NameError at model construction; fail explicitly instead,
        # consistent with the other entry points in this file.
        raise NotImplementedError(opt.dataset)
    # model
    model = model_dict[opt.model](num_classes=n_cls)
    # optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)
    criterion = nn.CrossEntropyLoss()
    criterion_cls = nn.CrossEntropyLoss()
    criterion_soft = Softmax_T(opt.kd_T)
    criterion_kl = KL(opt.kd_T)
    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(
        criterion_soft)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kl)  # other knowledge distillation loss
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
        criterion_list.cuda()
        cudnn.benchmark = True
    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)
    # routine
    #targets=get_traintarget()
    # One logit row per CIFAR-100 training sample; doubles as soft labels.
    train_logits = torch.zeros((50000, 100))
    train_loader, val_loader = get_cifar100_dataloaders(
        batch_size=opt.batch_size,
        num_workers=opt.num_workers,
        is_instance=False,
        is_shuffle=True,
        is_soft=True,
        train_softlabels=train_logits)
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")
        time1 = time.time()
        # Warm-up with plain CE until epoch eta, then self-distillation.
        if epoch <= opt.eta:
            train_acc, train_loss, train_logits = train1(
                epoch, train_loader, model, criterion, optimizer, opt,
                train_logits)
        else:
            train_acc, train_loss, train_logits = train2(
                epoch, train_loader, model, criterion_list, optimizer, opt,
                train_logits)
        train_logits = train_logits.detach().cpu()
        # Refresh the soft labels every eta epochs by rebuilding the loaders.
        if epoch >= opt.eta and epoch % opt.eta == 0:
            print('label update')
            train_loader, val_loader = get_cifar100_dataloaders(
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                is_instance=False,
                is_shuffle=True,
                is_soft=True,
                train_softlabels=train_logits)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))
        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)
        test_acc, test_acc_top5, test_loss = validate(val_loader, model,
                                                      criterion, opt)
        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_loss', test_loss, epoch)
        # save the best model
        if test_acc > best_acc:
            best_acc = test_acc
            state = {
                'epoch': epoch,
                'model': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_file = os.path.join(opt.save_folder,
                                     '{}_best.pth'.format(opt.model))
            print('saving the best model!')
            torch.save(state, save_file)
        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model.state_dict(),
                'accuracy': test_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
    # This best accuracy is only for printing purpose.
    # The results reported in the paper/README is from the last epoch.
    print('best accuracy:', best_acc)
    # save model
    state = {
        'opt': opt,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    save_file = os.path.join(opt.save_folder, '{}_last.pth'.format(opt.model))
    torch.save(state, save_file)
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point for semi-supervised contrastive training.

    Creates a width-scaled ResNet-50 model, wraps it in
    DistributedDataParallel, optionally resumes from a checkpoint, builds
    weak/strong augmented loaders for labeled and unlabeled data, then
    trains for ``args.epochs`` epochs with per-epoch validation and
    checkpointing, finishing with an EMA-model evaluation (epoch ``-1``).

    Args:
        gpu: GPU index assigned to this process (becomes ``args.gpu``).
        ngpus_per_node: GPUs per node; used for global rank and for
            splitting batch sizes / workers across processes.
        args: parsed command-line options.
    """
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    # suppress printing on non-master processes
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass
    if args.dist_url == "env://" and args.rank == -1:
        args.rank = int(os.environ["RANK"])
    if args.multiprocessing_distributed:
        # For multiprocessing distributed training, rank needs to be the
        # global rank among all the processes
        args.rank = args.rank * ngpus_per_node + gpu
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    if args.arch == 'resnet50':
        model = Model(resnet50, args, width=1)
    elif args.arch == 'resnet50x2':
        model = Model(resnet50, args, width=2)
    elif args.arch == 'resnet50x4':
        model = Model(resnet50, args, width=4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.arch))
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model.cuda(args.gpu)
        # When using a single GPU per process and per
        # DistributedDataParallel, we need to divide the batch size
        # ourselves based on the total number of GPUs we have
        args.batch_size = int(args.batch_size / ngpus_per_node)
        args.batch_size_u = int(args.batch_size_u / ngpus_per_node)
        args.workers = int(
            (args.workers + ngpus_per_node - 1) / ngpus_per_node)
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])  #find_unused_parameters=True
    else:
        model.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        model = torch.nn.parallel.DistributedDataParallel(model)
    # define loss function (criterion) and optimizer
    criteria_x = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=True)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    print("=> preparing dataset")
    # Data loading code
    # Strong augmentation: color jitter + grayscale on top of crop/flip.
    transform_strong = transforms.Compose([
        transforms.RandomResizedCrop(96, scale=(0.2, 1.)),
        transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)],
                               p=0.8),
        transforms.RandomGrayscale(p=0.2),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    # Weak augmentation: crop/flip only.
    transform_weak = transforms.Compose([
        transforms.RandomResizedCrop(96, scale=(0.2, 1.)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    transform_eval = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    # One weak + two strong views per unlabeled image.
    three_crops_transform = loader.ThreeCropsTransform(transform_weak,
                                                       transform_strong,
                                                       transform_strong)
    unlabeled_dataset = CustomDataset(args.data,
                                      'unlabeled',
                                      transform=three_crops_transform)
    labeled_dataset = CustomDataset(args.data,
                                    'train',
                                    transform=transform_weak)
    #labeled_sampler = torch.utils.data.distributed.DistributedSampler(labeled_dataset)
    labeled_loader = torch.utils.data.DataLoader(labeled_dataset,
                                                 batch_size=args.batch_size,
                                                 shuffle=True,
                                                 num_workers=args.workers,
                                                 pin_memory=True)
    #unlabeled_sampler = torch.utils.data.distributed.DistributedSampler(unlabeled_dataset)
    unlabeled_loader = torch.utils.data.DataLoader(
        unlabeled_dataset,
        batch_size=int(args.batch_size_u),
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True)
    # NOTE(review): validation data path is hard-coded to '/content/dataset'
    # (Colab-style) rather than args.data — confirm this is intended.
    val_loader = torch.utils.data.DataLoader(CustomDataset(
        '/content/dataset', 'val', transform_eval),
                                             batch_size=64,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)
    # create loggers (master process only)
    if args.gpu == 0:
        tb_logger = tensorboard_logger.Logger(logdir=os.path.join(
            args.exp_dir, 'tensorboard'),
                                              flush_secs=2)
        logger = setup_default_logging(args)
        logger.info(dict(args._get_kwargs()))
    else:
        tb_logger = None
        logger = None
    for epoch in range(args.start_epoch, args.epochs):
        if epoch == 0:
            args.m = 0.99  # larger update in first epoch
        else:
            args.m = args.moco_m
        # args.lr=0.01
        adjust_learning_rate(optimizer, epoch, args)
        train(labeled_loader, unlabeled_loader, model, criteria_x, optimizer,
              epoch, args, logger, tb_logger)
        # evaluate on validation set
        acc1 = validate(val_loader, model, args, logger, tb_logger, epoch)
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'args': args,
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                filename='{}/checkpoint_{:04d}.pt'.format(args.exp_dir, epoch))
    # evaluate ema model
    acc1 = validate(val_loader, model, args, logger, tb_logger, -1)
def TrainModels(learners,
                train_loader,
                val_loader,
                train_settings,
                out_dir,
                cuda_device_id=0,
                batch_use_prob=1.0,
                print_log=True,
                log_dir=''):
    """Train an ensemble of learners with per-example loss re-weighting.

    Each epoch: every learner trains on each batch with probability
    ``batch_use_prob`` (bagging-style), weighting the per-example loss by
    the learner's ``weighter`` and registering the observed losses back
    into it. After training, the full validation set is scored for every
    learner; per-learner best checkpoints (and, at the end, last
    checkpoints) are written via ``io_helpers.ModelFileName``.

    Args:
        learners: objects with ``.net``, ``.optimizer``, ``.weighter`` and
            ``.lr_scheduler`` attributes.
        train_loader / val_loader: batches whose last element holds the
            example indices; the rest are inputs then labels.
        train_settings: provides ``.epochs`` and the ``.loss`` callable
            returning a per-example loss tensor.
        out_dir: directory for BEST/LAST model files.
        cuda_device_id: GPU to move nets back to after CPU-side saving.
        batch_use_prob: probability a learner trains on a given batch.
        print_log: print per-epoch metrics to stdout.
        log_dir: when non-empty, also log to tensorboard.

    Returns:
        List of per-epoch metric dicts (train/val loss, duration, ex/sec).
    """
    logger = None
    if log_dir != '':
        logger = tensorboard_logger.Logger(log_dir, flush_secs=5)
    train_log = []
    min_validation_losses = [float('inf') for _ in learners]
    min_validation_loss = float('inf')
    num_inputs = len(learners[0].net.InputNames())
    num_labels = len(learners[0].net.LabelNames())
    for epoch in range(train_settings.epochs):
        running_losses = [0.0 for _ in learners]
        train_examples_per_net = [0 for _ in learners]
        for learner in learners:
            learner.optimizer.zero_grad()
        epoch_start_time = time.time()
        for training_batch in train_loader:
            input_vars, label_vars = DataBatchToVariables(
                training_batch[:-1], num_inputs, num_labels, cuda_device_id)
            example_indices = torch.squeeze(training_batch[-1], dim=1)
            for net_idx, learner in enumerate(learners):
                # Bagging: each learner sees each batch with this probability.
                if random.uniform(0.0, 1.0) < batch_use_prob:
                    # Read the weights.
                    example_weights = MakeWeightsVariable(
                        learner.weighter, example_indices, cuda_device_id)
                    # forward + backward + optimize
                    outputs = learner.net(input_vars)
                    loss_value_per_example = train_settings.loss(
                        outputs, label_vars)
                    # TODO track unweighted losses separately.
                    loss_value = torch.mean(
                        torch.mul(loss_value_per_example, example_weights))
                    loss_value.backward()
                    learner.optimizer.step()
                    learner.optimizer.zero_grad()
                    # Update weights using loss_value_per_example.
                    learner.weighter.RegisterLosses(
                        example_indices,
                        loss_value_per_example.cpu().data.numpy())
                    # Accumulate statistics
                    batch_size = input_vars[0].size()[0]
                    train_examples_per_net[net_idx] += batch_size
                    # NOTE(review): `loss_value.data[0]` is legacy
                    # (pre-0.4) PyTorch; on modern PyTorch indexing a
                    # 0-dim tensor errors and `.item()` is the
                    # replacement — confirm the targeted torch version.
                    running_losses[net_idx] += loss_value.data[0] * batch_size
        epoch_end_time = time.time()
        epoch_duration = epoch_end_time - epoch_start_time
        examples_per_sec = sum(train_examples_per_net) / epoch_duration
        # NOTE(review): avg_losses is computed but never used below
        # (validation_avg_losses drives scheduling/checkpointing).
        avg_losses = AverageLosses(running_losses, train_examples_per_net)
        avg_loss = sum(running_losses) / sum(train_examples_per_net)
        validation_total_losses = [0.0 for _ in learners]
        validation_examples = [0 for _ in learners]
        # Advance the weighters and switch nets to eval mode for validation.
        for learner in learners:
            learner.weighter.Step()
            learner.net.eval()
        for val_batch in val_loader:
            input_vars, label_vars = DataBatchToVariables(
                val_batch[:-1], num_inputs, num_labels, cuda_device_id)
            for net_idx, learner in enumerate(learners):
                outputs = learner.net(input_vars)
                loss_value_per_example = train_settings.loss(
                    outputs, label_vars)
                loss_value = torch.mean(loss_value_per_example)
                batch_size = input_vars[0].size()[0]
                validation_examples[net_idx] += batch_size
                validation_total_losses[
                    net_idx] += loss_value.data[0] * batch_size
        for learner in learners:
            learner.net.train()
        validation_avg_losses = AverageLosses(validation_total_losses,
                                              validation_examples)
        validation_avg_loss = (sum(validation_total_losses) /
                               sum(validation_examples))
        epoch_metrics = {
            TRAIN_LOSS: avg_loss,
            VAL_LOSS: validation_avg_loss,
            EPOCH_DURATION_SEC: epoch_duration,
            EXAMPLES_PER_SEC: examples_per_sec
        }
        train_log.append(epoch_metrics)
        val_improved_marker = ''
        if validation_avg_loss < min_validation_loss:
            val_improved_marker = ' ***'
            min_validation_loss = validation_avg_loss
        elif (validation_avg_loss * 0.9) < min_validation_loss:
            # Highlight epochs with validation loss almost as good as the current
            # best loss.
            val_improved_marker = ' *'
        for net_idx, learner in enumerate(learners):
            if learner.lr_scheduler is not None:
                learner.lr_scheduler.step(validation_avg_losses[net_idx])
            # Per-learner best checkpoint; move to CPU so the saved state
            # is device-independent, then back to the training GPU.
            if validation_avg_losses[net_idx] < min_validation_losses[net_idx]:
                learner.net.cpu()
                torch.save(
                    learner.net.state_dict(),
                    io_helpers.ModelFileName(out_dir, net_idx,
                                             io_helpers.BEST))
                learner.net.cuda(cuda_device_id)
                min_validation_losses[net_idx] = validation_avg_losses[net_idx]
        # Maybe print metrics to screen.
        if print_log:
            print('Epoch %d; %s%s' %
                  (epoch, TrainLogEventToString(epoch_metrics),
                   val_improved_marker))
        # Maybe log metrics to tensorboard.
        if log_dir != '':
            logger.log_value('train_loss', avg_loss, epoch)
            logger.log_value('val_loss', validation_avg_loss, epoch)
    # Save the final (last-epoch) weights for every learner.
    for net_idx, learner in enumerate(learners):
        learner.net.cpu()
        torch.save(learner.net.state_dict(),
                   io_helpers.ModelFileName(out_dir, net_idx, io_helpers.LAST))
        learner.net.cuda(cuda_device_id)
    return train_log
def main_worker(gpu, ngpus_per_node, args):
    """Per-process training entry point (one process per GPU).

    Sets up the (optionally distributed) environment, builds a MoPro model,
    optionally resumes from a checkpoint, then runs the train/checkpoint loop.

    Args:
        gpu: local GPU index for this process (or None for CPU/default device).
        ngpus_per_node: number of GPUs on this node; used to derive the global
            rank and to split batch size / workers across processes.
        args: parsed command-line namespace; this function mutates
            args.gpu, args.rank, args.batch_size, args.workers, args.start_epoch.
    """
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        # NOTE: *args here intentionally shadows the outer `args` namespace;
        # the replacement swallows all print output on non-master ranks.
        def print_pass(*args):
            pass
        builtins.print = print_pass
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            # Rank supplied by the launcher (e.g. torchrun) via environment.
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    if args.arch == 'resnet50':
        model = MoPro(resnet50, args, width=1)
    elif args.arch == 'resnet50x2':
        model = MoPro(resnet50, args, width=2)
    elif args.arch == 'resnet50x4':
        model = MoPro(resnet50, args, width=4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.arch))
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            # Best-effort resume: a missing file is reported but not fatal.
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Data loading code
    loader = dataloader.webvision_dataloader(batch_size=args.batch_size,num_workers=args.workers,\
        root_dir=args.data,num_class=args.num_class,distributed=args.distributed)
    train_loader, test_loader, _ = loader.run()
    # Only the master GPU writes tensorboard logs; other ranks pass None.
    if args.gpu == 0:
        logger = tb_logger.Logger(logdir=os.path.join(args.exp_dir,
                                                      'tensorboard'),
                                  flush_secs=2)
    else:
        logger = None
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle the distributed sampler each epoch for a new partition.
            loader.train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        train(train_loader, model, criterion, optimizer, epoch, args, logger)
        # Save only from the (per-node) master rank to avoid duplicate files.
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                is_best=False,
                filename='{}/checkpoint_{:04d}.pth.tar'.format(
                    args.exp_dir, epoch))
        # NOTE(review): evaluation appears to run every epoch here — placement
        # inside the loop reconstructed from collapsed source; confirm intent.
        test(model, test_loader, args, epoch, logger)
model = model.cuda() ''' vocab, vec = torchwordemb.load_word2vec_bin("../dataset/GoogleNews-vectors-negative300.bin") text_field.vocab.set_vectors(vocab, vec, EMBEDDING_DIM) ''' text_field.vocab.load_vectors('glove.6B.300d') model.word_embeddings.weight.data = text_field.vocab.vectors.cuda() model.word_embeddings.weight.requires_grad = False loss_function = nn.NLLLoss() update_parameter = filter(lambda p: p.requires_grad, model.parameters()) optimizer = optim.Adam(update_parameter, lr=1e-3) os.system('rm -rf ' + RESULT_PATH) train_logger = tensorboard_logger.Logger(RESULT_PATH + "/summaries/train/") dev_logger = tensorboard_logger.Logger(RESULT_PATH + '/summaries/dev/') no_up = 0 start_time = time.time() for i in range(EPOCH): print('epoch: %d start!' % i) train_epoch(model, train_iter, loss_function, optimizer, text_field, label_field, i, train_logger) print('now best dev acc:', best_dev_acc) dev_acc = evaluate(model, dev_iter, loss_function, i, dev_logger, 'dev') if dev_acc > best_dev_acc: best_dev_acc = dev_acc os.system('rm ' + RESULT_PATH + '/best_models/mr_best_model_minibatch_acc_*.model') os.system('mkdir ' + RESULT_PATH + '/best_models/')
def main():
    """Knowledge-distillation training routine.

    Builds a frozen teacher and a trainable student, selects the distillation
    criterion requested by ``opt.distill``, then runs the train/validate loop,
    checkpointing the best and the last student model.

    Side effects: writes tensorboard logs to ``opt.tb_folder`` and checkpoint
    files under ``opt.save_folder``.
    """
    best_acc = 0
    opt = parse_option()
    # tensorboard logger
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)
    # dataloader
    if opt.dataset == 'cifar100':
        if opt.distill in ['crd']:
            # CRD needs contrast-sample indices, hence the special loader.
            train_loader, val_loader, n_data = get_cifar100_dataloaders_sample(
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                k=opt.nce_k,
                mode=opt.mode)
        else:
            train_loader, val_loader, n_data = get_cifar100_dataloaders(
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                is_instance=True)
        n_cls = 100
    else:
        raise NotImplementedError(opt.dataset)
    # model
    arch_t = opt.model_t
    arch_s = opt.model_s
    model_t = load_teacher(opt.path_t, n_cls)
    model_s = model_dict[opt.model_s](num_classes=n_cls)
    # Probe both networks with a dummy batch to discover intermediate feature
    # shapes, which several distillation losses need at construction time.
    data = torch.randn(2, 3, 32, 32)
    model_t.eval()
    model_s.eval()
    feat_t, _ = model_t(data, is_feat=True)
    feat_s, _ = model_s(data, is_feat=True)
    # print(arch_t)
    # for ft in feat_t:
    #     print(ft.shape)
    # print(arch_s)
    # for fs in feat_s:
    #     print(fs.shape)
    # return
    # module_list: everything that must be moved to GPU / switched train-eval.
    # trainable_list: the subset whose parameters the optimizer updates.
    module_list = nn.ModuleList([])
    module_list.append(model_s)
    trainable_list = nn.ModuleList([])
    trainable_list.append(model_s)
    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    if opt.distill == 'kd':
        criterion_kd = DistillKL(opt.kd_T)
    elif opt.distill == 'hint':
        criterion_kd = HintLoss()
        regress_s = ConvReg(feat_s[opt.hint_layer].shape,
                            feat_t[opt.hint_layer].shape)
        module_list.append(regress_s)
        trainable_list.append(regress_s)
    elif opt.distill == 'crd':
        opt.s_dim = feat_s[-1].shape[1]
        opt.t_dim = feat_t[-1].shape[1]
        opt.n_data = n_data
        criterion_kd = CRDLoss(opt)
        module_list.append(criterion_kd.embed_s)
        module_list.append(criterion_kd.embed_t)
        trainable_list.append(criterion_kd.embed_s)
        trainable_list.append(criterion_kd.embed_t)
    elif opt.distill == 'attention':
        criterion_kd = Attention()
    elif opt.distill == 'nst':
        criterion_kd = NSTLoss()
    elif opt.distill == 'similarity':
        criterion_kd = Similarity()
    elif opt.distill == 'rkd':
        criterion_kd = RKDLoss()
    elif opt.distill == 'irg':
        criterion_kd = IRGLoss()
    elif opt.distill == 'hkd':
        criterion_kd = HKDLoss(init_weight=opt.hkd_initial_weight,
                               decay=opt.hkd_decay)
    elif opt.distill == 'pkt':
        criterion_kd = PKT()
    elif opt.distill == 'kdsvd':
        criterion_kd = KDSVD()
    elif opt.distill == 'correlation':
        criterion_kd = Correlation()
        embed_s = LinearEmbed(feat_s[-1].shape[1], opt.feat_dim)
        embed_t = LinearEmbed(feat_t[-1].shape[1], opt.feat_dim)
        module_list.append(embed_s)
        module_list.append(embed_t)
        trainable_list.append(embed_s)
        trainable_list.append(embed_t)
    elif opt.distill == 'vid':
        s_n = [f.shape[1] for f in feat_s[1:-1]]
        t_n = [f.shape[1] for f in feat_t[1:-1]]
        criterion_kd = nn.ModuleList(
            [VIDLoss(s, t, t) for s, t in zip(s_n, t_n)])
        # add this as some parameters in VIDLoss need to be updated
        trainable_list.append(criterion_kd)
    elif opt.distill == 'abound':
        s_shapes = [f.shape for f in feat_s[1:-1]]
        t_shapes = [f.shape for f in feat_t[1:-1]]
        connector = Connector(s_shapes, t_shapes)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(connector)
        init_trainable_list.append(model_s.get_feat_modules())
        criterion_kd = ABLoss(len(feat_s[1:-1]))
        init(model_s, model_t, init_trainable_list, criterion_kd, train_loader,
             logger, opt)
        # classification
        module_list.append(connector)
    elif opt.distill == 'factor':
        s_shape = feat_s[-2].shape
        t_shape = feat_t[-2].shape
        paraphraser = Paraphraser(t_shape)
        translator = Translator(s_shape, t_shape)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(paraphraser)
        criterion_init = nn.MSELoss()
        init(model_s, model_t, init_trainable_list, criterion_init,
             train_loader, logger, opt)
        # classification
        criterion_kd = FactorTransfer()
        module_list.append(translator)
        module_list.append(paraphraser)
        trainable_list.append(translator)
    elif opt.distill == 'fsp':
        s_shapes = [s.shape for s in feat_s[:-1]]
        t_shapes = [t.shape for t in feat_t[:-1]]
        criterion_kd = FSP(s_shapes, t_shapes)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(model_s.get_feat_modules())
        init(model_s, model_t, init_trainable_list, criterion_kd, train_loader,
             logger, opt)
        # classification training
        pass
    elif opt.distill == 'l2tww':
        pairs = []
        for pair in opt.pairs.split(','):
            pairs.append((int(pair.split('-')[0]), int(pair.split('-')[1])))
        criterion_kd = L2TWW(opt.model_t, opt.model_s, pairs)
        wnet = WeightNetwork(opt.model_t, pairs)
        lwnet = LossWeightNetwork(opt.model_t, pairs, opt.loss_weight_type,
                                  opt.loss_weight_init)
        trainable_list.append(wnet)
        trainable_list.append(lwnet)
        module_list.append(wnet)
        module_list.append(lwnet)
    else:
        raise NotImplementedError(opt.distill)
    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(
        criterion_div)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kd)  # other knowledge distillation loss
    # optimizer
    optimizer = optim.SGD(trainable_list.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)
    # append teacher after optimizer to avoid weight_decay
    module_list.append(model_t)
    if torch.cuda.is_available():
        module_list.cuda()
        criterion_list.cuda()
        cudnn.benchmark = True
    # validate teacher accuracy
    teacher_acc, _, _ = validate(val_loader, model_t, criterion_cls, opt)
    print('teacher accuracy: ', teacher_acc)
    # routine
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")
        time1 = time.time()
        # train_loss, train_acc = 0, 0
        train_acc, train_loss = train(epoch, train_loader, module_list,
                                      criterion_list, optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))
        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)
        # Fixed local-variable typo: was `tect_acc_top5`.
        test_acc, test_acc_top5, test_loss = validate(val_loader, model_s,
                                                      criterion_cls, opt)
        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_loss', test_loss, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        # save the best model
        if test_acc > best_acc:
            best_acc = test_acc
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
                'best_acc': best_acc,
            }
            save_file = os.path.join(opt.save_folder,
                                     '{}_best.pth'.format(opt.model_s))
            print('saving the best model!')
            torch.save(state, save_file)
        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
                'accuracy': test_acc,
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
    # This best accuracy is only for printing purpose.
    # The results reported in the paper/README is from the last epoch.
    print('best accuracy:', best_acc)
    # save model
    state = {
        'opt': opt,
        'model': model_s.state_dict(),
    }
    save_file = os.path.join(opt.save_folder,
                             '{}_last.pth'.format(opt.model_s))
    torch.save(state, save_file)
def main():
    """Contrastive video-representation pretraining (UCF101/HMDB51).

    Seeds all RNGs, builds the dataset/loader for the requested dataset,
    constructs the two-view contrastive model and optimizer, optionally
    resumes from a checkpoint, then trains for ``args.epochs`` epochs with a
    MultiStepLR schedule, checkpointing every ``args.save_freq`` epochs.

    Raises:
        RuntimeError: if CUDA is not available (GPU-only script).
    """
    if not torch.cuda.is_available():
        # BUGFIX: `raise 'Only support GPU mode'` raised a str, which is a
        # TypeError in Python 3; raise a proper exception instance instead.
        raise RuntimeError('Only support GPU mode')
    # parse the args
    args = parse_option()
    print(vars(args))
    # Seed every RNG source for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    print('[Warning] The training modalities are RGB and [{}]'.format(
        args.modality))
    # Data
    train_transforms = transforms.Compose([
        transforms.Resize((128, 171)),  # smaller edge to 128
        transforms.RandomCrop(112),
        transforms.ToTensor()
    ])
    if args.dataset == 'ucf101':
        trainset = UCF101Dataset('./data/ucf101/',
                                 transforms_=train_transforms)
    else:
        trainset = HMDB51Dataset('./data/hmdb51/',
                                 transforms_=train_transforms)
    train_loader = DataLoader(trainset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              pin_memory=True,
                              drop_last=True)
    # Dataset size is needed to size the contrastive memory bank.
    n_data = len(trainset)
    # set the model
    model, contrast, criterion_1, criterion_2 = set_model(args, n_data)
    # set the optimizer
    optimizer = set_optimizer(args, model)
    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # Load on CPU first to avoid GPU OOM during deserialization.
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[45, 90, 125, 160],
                                         gamma=0.2)
    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        time1 = time.time()
        view1_loss, view1_prob, view2_loss, view2_prob = train(
            epoch, train_loader, model, contrast, criterion_1, criterion_2,
            optimizer, args)
        time2 = time.time()
        print('\nepoch {}, total time {:.2f}'.format(epoch, time2 - time1))
        # tensorboard logger
        logger.log_value('view1_loss', view1_loss, epoch)
        logger.log_value('view1_prob', view1_prob, epoch)
        logger.log_value('view2_loss', view2_loss, epoch)
        logger.log_value('view2_prob', view2_prob, epoch)
        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'model': model.state_dict(),
                'contrast': contrast.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            save_file = os.path.join(
                args.model_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            # help release GPU memory
            del state
            torch.cuda.empty_cache()
        scheduler.step()
    print(args.model_name)
def main(): global args global tensorboard_logger use_cuda = torch.cuda.is_available() initialize_environment(random_seed=cfg.RNG_SEED, use_cuda=use_cuda) assert 0 <= args.feat_id <= 3 assert 0 <= args.corpora_id <= 6 feat_name = feat_dict[args.feat_id] corpora_name = data_dict[args.corpora_id] datadir = os.path.join('data', corpora_name) nepoch = args.nepoch step = args.step_epoch dropout = args.dropout n_layers = cfg.N_LAYERS input_dim = input_feat_size_dict[args.feat_id] hidden_dims = cfg.HIDDEN_DIMS if args.feat_id == 0 or args.feat_id == 1: feat_func_dict = {'ln': ln, 'n': norm} elif args.feat_id == 2: feat_func_dict = {'ln': ln, 'n': norm, 'i': lambda x: x} elif args.feat_id == 3: feat_func_dict = {'i': lambda x: x} for feat_func_name, feat_func in feat_func_dict.items(): print(corpora_name, feat_name, feat_func_name) outputdir = os.path.join('data', corpora_name, feat_name + '_' + feat_func_name) # logging information loggin_dir = os.path.join(outputdir, 'runs', 'pretraining') if not os.path.exists(loggin_dir): os.makedirs(loggin_dir) tensorboard_logger = TF_LOGGER.Logger( os.path.join(loggin_dir, '%s' % (args.id))) # tensorboard_logger.configure(os.path.join(loggin_dir, '%s' % (args.id))) trainset = EncodedTextDataset(root=datadir, train=True, feat_name=feat_name, feat_func=feat_func) testset = EncodedTextDataset(root=datadir, train=False, feat_name=feat_name, feat_func=feat_func) kwargs = {'num_workers': 0, 'pin_memory': True} if use_cuda else {} trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batchsize, shuffle=True, **kwargs) testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=True, **kwargs) pretrain( outputdir, { 'nlayers': n_layers, 'dropout': dropout, 'reluslope': 0.0, 'nepoch': nepoch, 'lrate': [args.lr], 'wdecay': [0.0], 'step': step, 'input_dim': input_dim, 'hidden_dims': hidden_dims }, use_cuda, trainloader, testloader)
def initialize_tensorboard(self): outputdir = get_output_dir(self.root_dir) loggin_dir = os.path.join(outputdir, 'runs', 'clustering') if not os.path.exists(loggin_dir): os.makedirs(loggin_dir) self.logger_tensorboard = tensorboard_logger.Logger(os.path.join(loggin_dir, '{}'.format(self.id)))
def main_worker(gpu, ngpus_per_node, args):
    """Linear-evaluation worker: train only the final fc on frozen features.

    Loads a self-supervised pre-trained encoder (encoder_q weights), freezes
    every layer except ``fc``, and trains/validates the linear classifier on
    an ImageNet-style folder dataset. One process per GPU when distributed.

    Args:
        gpu: local GPU index for this process (or None).
        ngpus_per_node: GPUs on this node; used for global rank and for
            splitting batch size / workers.
        args: parsed namespace; mutated fields include args.gpu, args.rank,
            args.batch_size, args.workers, args.start_epoch.
    """
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        # NOTE: *args here intentionally shadows the outer namespace; all
        # print output on non-master ranks is discarded.
        def print_pass(*args):
            pass
        builtins.print = print_pass
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()
    # freeze all layers but the last fc
    for name, param in model.named_parameters():
        if name not in ['fc.weight', 'fc.bias']:
            param.requires_grad = False
    # init the fc layer
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()
    # Only the master GPU writes tensorboard logs; other ranks get None.
    if args.gpu == 0:
        logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)
    else:
        logger = None
    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")
            # rename pre-trained keys
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q') and not k.startswith(
                        'module.encoder_q.fc'):
                    # remove prefix
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]
            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            # Only the (re-initialized) fc layer may be missing from the
            # checkpoint; anything else means the rename above went wrong.
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}
            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    # optimize only the linear classifier
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    assert len(parameters) == 2  # fc.weight, fc.bias
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    if args.evaluate:
        # Evaluation-only mode: validate once and exit.
        validate(val_loader, model, criterion, args)
        return
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle the distributed sampler's partition each epoch.
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, logger, epoch)
        # Save only from the (per-node) master rank.
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            })
            if epoch == args.start_epoch:
                # Verify frozen backbone weights did not drift from the
                # pre-trained checkpoint after the first epoch.
                sanity_check(model.state_dict(), args.pretrained)