def main():
    """Linear-evaluation loop: fine-tune classifier and backbone jointly,
    validate each epoch, and report the best validation accuracy."""
    opt = parse_option()

    # data, model/criterion, optimizer (both classifier AND backbone are optimized)
    train_loader, val_loader = set_loader(opt)
    model, classifier, criterion = set_model(opt)
    optimizer = set_optimizer(opt, [classifier, model])

    best_acc = 0
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # one timed training pass
        epoch_start = time.time()
        loss, acc = train(train_loader, model, classifier, criterion,
                          optimizer, epoch, opt)
        print('Train epoch {}, total time {:.2f}, accuracy:{:.2f}'.format(
            epoch, time.time() - epoch_start, acc))

        # one validation pass; remember the best accuracy seen so far
        loss, val_acc = validate(val_loader, model, classifier, criterion, opt)
        best_acc = max(best_acc, val_acc)

    print('best accuracy: {:.2f}'.format(best_acc))
def main():
    """Supervised training routine: train, validate, log to tensorboard,
    checkpoint every `opt.save_freq` epochs, save the final model, and
    report the best validation accuracy.
    """
    best_acc = 0
    opt = parse_option()

    # build data loader
    train_loader, val_loader = set_loader(opt)

    # build model and criterion
    model, criterion = set_model(opt)

    # build optimizer
    optimizer = set_optimizer(opt, model)

    # tensorboard
    writer = SummaryWriter(log_dir=opt.tb_folder, flush_secs=2)

    # training routine
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # train for one epoch
        time1 = time.time()
        loss, train_acc = train(train_loader, model, criterion, optimizer,
                                epoch, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        writer.add_scalar('train_loss', loss, global_step=epoch)
        writer.add_scalar('train_acc', train_acc, global_step=epoch)
        writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'],
                          global_step=epoch)

        # evaluation
        loss, val_acc = validate(val_loader, model, criterion, opt)
        writer.add_scalar('val_loss', loss, global_step=epoch)
        writer.add_scalar('val_acc', val_acc, global_step=epoch)

        if val_acc > best_acc:
            best_acc = val_acc

        # periodic checkpoint
        if epoch % opt.save_freq == 0:
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model, optimizer, opt, epoch, save_file)

    # save the last model
    save_file = os.path.join(opt.save_folder, 'last.pth')
    save_model(model, optimizer, opt, opt.epochs, save_file)

    # BUGFIX: the writer was never closed, so pending events could be lost;
    # close() flushes them to disk.
    writer.close()

    print('best accuracy: {:.2f}'.format(best_acc))
def re_train():
    """Retrain the sentiment model, snapshotting it every `sf` epochs."""
    sa = SentimentAnalysis()
    train_loader, val_loader = sa.get_trainer()
    model, criterion, optimizer = sa.get_model()

    for epoch in range(1, sa.epochs + 1):
        # decay the learning rate on the configured schedule
        util.adjust_learning_rate(sa.lr, optimizer, epoch)
        train(train_loader, model, criterion, optimizer, epoch)

        # periodic snapshot of the (float-cast) model
        if epoch % sa.sf == 0:
            ckpt_path = os.path.join('./', 'rnn_{}.pkl'.format(epoch))
            joblib.dump(model.float(), ckpt_path, compress=2)
def main():
    """Linear-classifier training with top-1/top-5 metrics: optimize the
    classifier head only, log to tensorboard, and report the best
    validation scores via `logging`."""
    opt = parse_option()

    train_loader, val_loader = set_loader(opt)
    model, classifier, criterion = set_model(opt)
    optimizer = set_optimizer(opt, classifier)

    # tensorboard
    tb = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    best_acc = 0
    best_acc5 = 0
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # one timed training pass
        t0 = time.time()
        loss, acc, acc5 = train(train_loader, model, classifier, criterion,
                                optimizer, epoch, opt)
        logging.info(
            'Train epoch {}, total time {:.2f}, accuracy:{:.2f}'.format(
                epoch, time.time() - t0, acc))
        tb.log_value('classifier/train_loss', loss, epoch)
        tb.log_value('classifier/train_acc1', acc, epoch)
        tb.log_value('classifier/train_acc5', acc5, epoch)

        # one validation pass
        loss, val_acc, val_acc5 = validate(val_loader, model, classifier,
                                           criterion, opt)
        tb.log_value('classifier/val_loss', loss, epoch)
        tb.log_value('classifier/val_acc1', val_acc, epoch)
        tb.log_value('classifier/val_acc5', val_acc5, epoch)

        # track the best top-1 accuracy together with its matching top-5
        if val_acc > best_acc:
            best_acc, best_acc5 = val_acc, val_acc5

    logging.info('best accuracy: {:.2f}, accuracy5: {:.2f}'.format(
        best_acc, best_acc5))
def main():
    """Train the linear classifier head, checkpoint periodically, and print
    the best validation accuracy at the end."""
    opt = parse_option()

    train_loader, val_loader = set_loader(opt)
    model, classifier, criterion = set_model(opt)
    optimizer = set_optimizer(opt, classifier)

    # tensorboard
    tb = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    best_acc = 0
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # one timed training pass
        t0 = time.time()
        loss, acc = train(train_loader, model, classifier, criterion,
                          optimizer, epoch, opt)
        print('Train epoch {}, total time {:.2f}, accuracy:{:.2f}'.format(
            epoch, time.time() - t0, acc))

        # one validation pass
        loss, val_acc = validate(val_loader, model, classifier, criterion, opt)
        best_acc = max(best_acc, val_acc)

        # note: `loss` here is the validation loss returned just above
        tb.log_value('loss', loss, epoch)
        tb.log_value('learning_rate', optimizer.param_groups[0]['lr'], epoch)

        # periodic checkpoint (classifier included)
        if epoch % opt.save_freq == 0:
            ckpt_path = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model, optimizer, opt, epoch, ckpt_path, classifier)

    print('best accuracy: {:.2f}'.format(best_acc))
def main():
    """Train (or just evaluate) a linear classifier, then run adversarial
    evaluation with the best classifier at each epsilon in `opt.epsilons`.
    """
    import copy  # local import: needed to snapshot the best classifier

    best_acc = 0
    opt = parse_option()

    # build data loader
    train_loader, val_loader = set_loader(opt)

    # build model and criterion
    model, classifier, criterion = set_model(opt)
    # BUGFIX: `best_classifier = classifier` only aliased the live module, so
    # "best" always pointed at the *latest* trained weights. Keep an
    # independent deep copy instead.
    best_classifier = copy.deepcopy(classifier)

    # build optimizer
    optimizer = set_optimizer(opt, classifier)

    if opt.eval:
        # evaluation-only mode: no training loop
        loss, val_acc = validate(val_loader, model, classifier, criterion, opt)
    else:
        # training routine
        for epoch in range(1, opt.epochs + 1):
            adjust_learning_rate(opt, optimizer, epoch)

            # train for one epoch
            time1 = time.time()
            loss, acc = train(train_loader, model, classifier, criterion,
                              optimizer, epoch, opt)
            time2 = time.time()
            print('Train epoch {}, total time {:.2f}, accuracy:{:.2f}'.format(
                epoch, time2 - time1, acc))

            # eval for one epoch; snapshot the classifier when it improves
            loss, val_acc = validate(val_loader, model, classifier,
                                     criterion, opt)
            if val_acc > best_acc:
                best_acc = val_acc
                best_classifier = copy.deepcopy(classifier)

        print('best accuracy: {:.2f}'.format(best_acc))

    # adversarial evaluation of the best classifier at each perturbation size
    for epsilon in opt.epsilons:
        loss, acc, adv_acc = adveval(val_loader, model, best_classifier,
                                     criterion, opt, epsilon)
        print('adv accuracy at epsilon {:.2f}: {:.2f}'.format(
            epsilon, adv_acc))
def main():
    """Pre-training loop: train each epoch, log scalars, checkpoint
    periodically, and always save the final weights."""
    opt = parse_option()

    train_loader = set_loader(opt)
    model, criterion = set_model(opt)
    optimizer = set_optimizer(opt, model)

    # tensorboard
    tb = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # one timed epoch of training
        t0 = time.time()
        loss = train(train_loader, model, criterion, optimizer, epoch, opt)
        print('epoch {}, total time {:.2f}'.format(epoch, time.time() - t0))

        # scalar logging
        tb.log_value('loss', loss, epoch)
        tb.log_value('learning_rate', optimizer.param_groups[0]['lr'], epoch)

        # periodic checkpoint
        if epoch % opt.save_freq == 0:
            ckpt_path = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model, optimizer, opt, epoch, ckpt_path)

    # always persist the final weights
    last_path = os.path.join(opt.save_folder, 'last.pth')
    save_model(model, optimizer, opt, opt.epochs, last_path)
def main():
    """Distill one or more teacher models into a student, with wandb logging.

    Builds teacher(s) and student, a memory bank of L2-normalized features
    for contrastive distillation, trains for `opt.epochs`, periodically
    checkpoints the student, and finally generates a wandb report.
    """
    opt = parse_option()

    # experiment tracking
    wandb.init(project=opt.model_path.split("/")[-1], tags=opt.tags)
    wandb.config.update(opt)
    wandb.save('*.py')
    wandb.run.save()

    # dataloader
    train_loader, val_loader, meta_testloader, meta_valloader, n_cls, no_sample = get_dataloaders(opt)

    # teacher model(s): opt.path_t may hold a comma-separated list of paths
    model_t = []
    if "," in opt.path_t:
        for path in opt.path_t.split(","):
            model_t.append(load_teacher(path, opt.model_t, n_cls, opt.dataset,
                                        opt.trans, opt.memfeature_size))
    else:
        model_t.append(load_teacher(opt.path_t, opt.model_t, n_cls,
                                    opt.dataset, opt.trans,
                                    opt.memfeature_size))

    # student model
    model_s = create_model(opt.model_s, n_cls, opt.dataset,
                           n_trans=opt.trans, embd_sz=opt.memfeature_size)
    if torch.cuda.device_count() > 1:
        print("second gpu count:", torch.cuda.device_count())
        model_s = nn.DataParallel(model_s)
    if opt.pretrained_path != "":
        model_s.load_state_dict(torch.load(opt.pretrained_path)['model'])
    wandb.watch(model_s)

    # losses: classification, KD divergence, contrastive KD
    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    criterion_kd = DistillKL(opt.kd_T)

    optimizer = optim.SGD(model_s.parameters(), lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    if torch.cuda.is_available():
        for m in model_t:
            m.cuda()
        model_s.cuda()
        criterion_cls = criterion_cls.cuda()
        criterion_div = criterion_div.cuda()
        criterion_kd = criterion_kd.cuda()
        cudnn.benchmark = True

    # memory bank of L2-normalized random features, one row per sample
    # NOTE(review): .cuda() here is unconditional, so this path requires a GPU.
    MemBank = np.random.randn(no_sample, opt.memfeature_size)
    MemBank = torch.tensor(MemBank, dtype=torch.float).cuda()
    MemBankNorm = torch.norm(MemBank, dim=1, keepdim=True)
    MemBank = MemBank / (MemBankNorm + 1e-6)

    # BUGFIX: the loop below called `scheduler.step()` when opt.cosine was
    # set, but no scheduler was ever created, crashing with NameError.
    scheduler = None
    if opt.cosine:
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, opt.epochs)

    meta_test_acc = 0
    meta_test_std = 0

    # routine: supervised model distillation
    for epoch in range(1, opt.epochs + 1):
        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)

        print("==> training...")
        time1 = time.time()
        train_acc, train_loss, MemBank = train(epoch, train_loader, model_s,
                                               model_t, criterion_cls,
                                               criterion_div, criterion_kd,
                                               optimizer, opt, MemBank)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # regular validation / meta-validation are currently disabled;
        # the zero placeholders keep the wandb payload shape stable
        val_acc = 0
        val_loss = 0
        meta_val_acc = 0
        meta_val_std = 0

        # evaluate (meta-test disabled; placeholder keeps the log format)
        start = time.time()
        meta_test_acc, meta_test_std = 0, 0  # meta_test(model_s, meta_testloader, use_logit=False)
        test_time = time.time() - start
        print('Meta Test Acc: {:.4f}, Meta Test std: {:.4f}, Time: {:.1f}'.format(
            meta_test_acc, meta_test_std, test_time))

        # regular saving (also on the very last epoch)
        if epoch % opt.save_freq == 0 or epoch == opt.epochs:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
            }
            save_file = os.path.join(opt.save_folder,
                                     'model_' + str(wandb.run.name) + '.pth')
            torch.save(state, save_file)
            # wandb saving
            torch.save(state, os.path.join(wandb.run.dir, "model.pth"))

        wandb.log({'epoch': epoch,
                   'Train Acc': train_acc,
                   'Train Loss': train_loss,
                   'Val Acc': val_acc,
                   'Val Loss': val_loss,
                   'Meta Test Acc': meta_test_acc,
                   'Meta Test std': meta_test_std,
                   'Meta Val Acc': meta_val_acc,
                   'Meta Val std': meta_val_std})

    # final report
    print("GENERATING FINAL REPORT")
    generate_final_report(model_s, opt, wandb)

    # remove output.log so it is not uploaded with the run
    output_log_file = os.path.join(wandb.run.dir, "output.log")
    if os.path.isfile(output_log_file):
        os.remove(output_log_file)
    else:
        ## Show an error ##
        print("Error: %s file not found" % output_log_file)
def main():
    """Linear evaluation of a pre-trained InsResNet backbone.

    Freezes the backbone, trains a `LinearClassifierResNet` head on top of
    layer `args.layer`, and supports resuming, a cosine LR schedule, and
    tensorboard logging. Uses the module-level `best_acc1` to track the best
    top-1 test accuracy.
    """
    global best_acc1
    best_acc1 = 0
    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    train_folder = os.path.join(args.data_folder, 'train')
    val_folder = os.path.join(args.data_folder, 'val')

    image_size = 224
    crop_padding = 32
    # ImageNet channel statistics
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize = transforms.Normalize(mean=mean, std=std)

    # 'NULL' = crop+flip only; 'CJ' adds random grayscale and color jitter
    if args.aug == 'NULL':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'CJ':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        # NOTE(review): NotImplemented is a constant, not an exception class;
        # raising it actually produces a TypeError. Likely meant
        # NotImplementedError.
        raise NotImplemented('augmentation not supported: {}'.format(args.aug))

    train_dataset = datasets.ImageFolder(train_folder, train_transform)
    # validation: resize + center crop, no augmentation
    val_dataset = datasets.ImageFolder(
        val_folder,
        transforms.Compose([
            transforms.Resize(image_size + crop_padding),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            normalize,
        ]))
    print(len(train_dataset))
    train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.num_workers,
                                               pin_memory=True,
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers,
                                             pin_memory=True)

    # create model and optimizer; classifier width must match backbone width
    if args.model == 'resnet50':
        model = InsResNet50()
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 1)
    elif args.model == 'resnet50x2':
        model = InsResNet50(width=2)
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 2)
    elif args.model == 'resnet50x4':
        model = InsResNet50(width=4)
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.model))

    # load the frozen pre-trained backbone weights
    print('==> loading pre-trained model')
    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['model'])
    print("==> loaded checkpoint '{}' (epoch {})".format(
        args.model_path, ckpt['epoch']))
    print('==> done')

    model = model.cuda()
    classifier = classifier.cuda()

    criterion = torch.nn.CrossEntropyLoss().cuda(args.gpu)

    # only the classifier's parameters are optimized; the backbone stays fixed
    if not args.adam:
        optimizer = torch.optim.SGD(classifier.parameters(),
                                    lr=args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.Adam(classifier.parameters(),
                                     lr=args.learning_rate,
                                     betas=(args.beta1, args.beta2),
                                     weight_decay=args.weight_decay,
                                     eps=1e-8)

    # backbone kept in eval mode (frozen batch-norm statistics)
    model.eval()
    cudnn.benchmark = True

    # set mixed precision training
    # if args.amp:
    #     model = amp.initialize(model, opt_level=args.opt_level)
    #     classifier, optimizer = amp.initialize(classifier, optimizer, opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            # checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_acc1 = checkpoint['best_acc1']
            # presumably saved as a tensor; move it back onto the GPU
            best_acc1 = best_acc1.cuda()
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            if 'opt' in checkpoint.keys():
                # resume optimization hyper-parameters
                print('=> resume hyper parameters')
                if 'bn' in vars(checkpoint['opt']):
                    print('using bn: ', checkpoint['opt'].bn)
                if 'adam' in vars(checkpoint['opt']):
                    print('using adam: ', checkpoint['opt'].adam)
                if 'cosine' in vars(checkpoint['opt']):
                    print('using cosine: ', checkpoint['opt'].cosine)
                args.learning_rate = checkpoint['opt'].learning_rate
                # args.lr_decay_epochs = checkpoint['opt'].lr_decay_epochs
                args.lr_decay_rate = checkpoint['opt'].lr_decay_rate
                args.momentum = checkpoint['opt'].momentum
                args.weight_decay = checkpoint['opt'].weight_decay
                args.beta1 = checkpoint['opt'].beta1
                args.beta2 = checkpoint['opt'].beta2
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # set cosine annealing scheduler
    if args.cosine:
        # last_epoch = args.start_epoch - 2
        # eta_min = args.learning_rate * (args.lr_decay_rate ** 3) * 0.1
        # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min, last_epoch)
        eta_min = args.learning_rate * (args.lr_decay_rate**3) * 0.1
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.epochs, eta_min, -1)

        # dummy loop to catch up with the current epoch when resuming
        for i in range(1, args.start_epoch):
            scheduler.step()

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        if args.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_acc5, train_loss = train(epoch, train_loader, model,
                                                  classifier, criterion,
                                                  optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_acc5', train_acc5, epoch)
        logger.log_value('train_loss', train_loss, epoch)
        logger.log_value('learning_rate', optimizer.param_groups[0]['lr'], epoch)

        print("==> testing...")
        test_acc, test_acc5, test_loss = validate(val_loader, model,
                                                  classifier, criterion, args)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc5', test_acc5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        if test_acc > best_acc1:
            best_acc1 = test_acc
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            save_name = '{}_layer{}.pth'.format(args.model, args.layer)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving best model!')
            torch.save(state, save_name)

        # save model periodically
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': test_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_name = 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving regular model!')
            torch.save(state, save_name)

        # tensorboard logger
        pass
def main():
    """Unsupervised pre-training entry point (InsDis, or MoCo when args.moco).

    Builds the augmented ImageFolder loader, the backbone (plus an EMA copy
    for MoCo), the contrastive memory, optional apex-amp mixed precision,
    checkpoint resuming, and the epoch loop with tensorboard logging and
    periodic checkpointing.
    """
    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    data_folder = os.path.join(args.data_folder, 'train')

    image_size = 224
    # ImageNet channel statistics
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize = transforms.Normalize(mean=mean, std=std)

    # 'NULL' = crop+flip only; 'CJ' adds random grayscale and color jitter
    if args.aug == 'NULL':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'CJ':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        # NOTE(review): NotImplemented is a constant, not an exception class;
        # raising it produces a TypeError. Likely meant NotImplementedError.
        raise NotImplemented('augmentation not supported: {}'.format(args.aug))

    # two_crop=True yields the query/key pair of crops needed by MoCo
    train_dataset = ImageFolderInstance(data_folder,
                                        transform=train_transform,
                                        two_crop=args.moco)
    print(len(train_dataset))
    train_sampler = None
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.num_workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    # create model and optimizer
    n_data = len(train_dataset)

    if args.model == 'resnet50':
        model = InsResNet50()
        if args.moco:
            model_ema = InsResNet50()
    elif args.model == 'resnet50x2':
        model = InsResNet50(width=2)
        if args.moco:
            model_ema = InsResNet50(width=2)
    elif args.model == 'resnet50x4':
        model = InsResNet50(width=4)
        if args.moco:
            model_ema = InsResNet50(width=4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.model))

    # copy weights from `model' to `model_ema'
    if args.moco:
        moment_update(model, model_ema, 0)

    # set the contrast memory and criterion
    if args.moco:
        contrast = MemoryMoCo(128, n_data, args.nce_k, args.nce_t,
                              args.softmax).cuda(args.gpu)
    else:
        contrast = MemoryInsDis(128, n_data, args.nce_k, args.nce_t,
                                args.nce_m, args.softmax).cuda(args.gpu)

    criterion = NCESoftmaxLoss() if args.softmax else NCECriterion(n_data)
    criterion = criterion.cuda(args.gpu)

    model = model.cuda()
    if args.moco:
        model_ema = model_ema.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    cudnn.benchmark = True

    # apex-amp mixed precision; the EMA model gets a zero-LR dummy optimizer
    # because amp.initialize expects an optimizer argument
    if args.amp:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.opt_level)
        if args.moco:
            optimizer_ema = torch.optim.SGD(model_ema.parameters(),
                                            lr=0,
                                            momentum=0,
                                            weight_decay=0)
            model_ema, optimizer_ema = amp.initialize(model_ema, optimizer_ema,
                                                      opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            # checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            if args.moco:
                model_ema.load_state_dict(checkpoint['model_ema'])
            if args.amp and checkpoint['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded successfully '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        if args.moco:
            loss, prob = train_moco(epoch, train_loader, model, model_ema,
                                    contrast, criterion, optimizer, args)
        else:
            loss, prob = train_ins(epoch, train_loader, model, contrast,
                                   criterion, optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        logger.log_value('ins_loss', loss, epoch)
        logger.log_value('ins_prob', prob, epoch)
        logger.log_value('learning_rate', optimizer.param_groups[0]['lr'],
                         epoch)

        # save model
        # NOTE(review): on save_freq epochs this writes
        # ckpt_epoch_{epoch}.pth here AND again below (with a freshly built
        # state dict) — the same file is saved twice per qualifying epoch.
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'model': model.state_dict(),
                'contrast': contrast.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            if args.moco:
                state['model_ema'] = model_ema.state_dict()
            if args.amp:
                state['amp'] = amp.state_dict()
            save_file = os.path.join(
                args.model_folder,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            # help release GPU memory
            del state

        # saving the model (always refresh `current.pth`)
        print('==> Saving...')
        state = {
            'opt': args,
            'model': model.state_dict(),
            'contrast': contrast.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
        }
        if args.moco:
            state['model_ema'] = model_ema.state_dict()
        if args.amp:
            state['amp'] = amp.state_dict()
        save_file = os.path.join(args.model_folder, 'current.pth')
        torch.save(state, save_file)
        if epoch % args.save_freq == 0:
            save_file = os.path.join(
                args.model_folder,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
        # help release GPU memory
        del state
        torch.cuda.empty_cache()
# save whole model (including stylebank) torch.save(model.state_dict(), args.MODEL_WEIGHT_PATH) # save seperate part with open(args.GLOBAL_STEP_PATH, 'w') as f: f.write(str(global_step)) torch.save(model.encoder_net.state_dict(), args.ENCODER_WEIGHT_PATH) torch.save(model.decoder_net.state_dict(), args.DECODER_WEIGHT_PATH) for i in range(len(style_dataset)): torch.save(model.style_bank[i].state_dict(), args.BANK_WEIGHT_PATH.format(i)) if global_step % args.ADJUST_LR_ITER == 0: lr_step = global_step / args.ADJUST_LR_ITER util.adjust_learning_rate(optimizer, lr_step) new_lr = util.adjust_learning_rate(optimizer_ae, lr_step) print("learning rate decay:", new_lr) # In[13]: """ Testing """ #for i, data in enumerate(content_dataloader, 0): # data = data[0].to(device) # batch_size = data.shape[0] ## data = data[0].repeat(batch_size, 1, 1, 1) # for j in range(batch_size): # util.showimg(data[j].cpu()) #
print(model) print(optimizer) print(criterion) if args['cuda']: torch.backends.cudnn.enabled = True cudnn.benchmark = True model.cuda() criterion = criterion.cuda() # training and testing start_time = time.time() # 创建文件写控制器,将之后的数值以protocol buffer格式写入到logs文件夹中,空的logs文件夹将被自动创建。 writer = SummaryWriter() for epoch in range(1, args['epochs'] + 1): util.adjust_learning_rate(args['learning_rate'], optimizer, epoch) if args['model'] == 'grn16': common.train_grn16(train_loader, model, criterion, optimizer, epoch, args['cuda'], args['clip'], args['print_freq']) common.test_grn16(val_loader, model, criterion, args['cuda'], args['print_freq']) elif args['model'] == 'keann': common.train_keann(train_loader, model, criterion, optimizer, epoch, args['cuda'], args['clip'], args['print_freq']) common.test_keann(val_loader, model, criterion, args['cuda'], args['print_freq'], args['pdtb_category']) elif args['model'] == 'keann_kg': common.train_keann_kg(train_loader, model, criterion, optimizer, epoch, args['cuda'], args['clip'], args['print_freq'], writer) common.test_keann_kg(val_loader, model, criterion, args['cuda'],
def train_net():
    """Fine-tune a keypoint heatmap model for clothing categories.

    For each selected category this warm-starts from a previous checkpoint,
    builds an augmented training pipeline, and optimizes a weighted MSE
    heatmap loss with Adam, printing progress every `config.display` iters
    and checkpointing every 1000 iters.
    """
    # per-category annotation files
    annList = [
        '../data/train/Annotations/blouse.csv',
        '../data/train/Annotations/dress.csv',
        '../data/train/Annotations/outwear.csv',
        '../data/train/Annotations/skirt.csv',
        '../data/train/Annotations/trousers.csv'
    ]
    # number of keypoints predicted per category
    classNumList = [13, 15, 14, 4, 7]
    # global keypoint indices belonging to each category (kept for reference)
    index_array = [[2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16],
                   [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 19, 20],
                   [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20],
                   [17, 18, 21, 22, 23, 24, 25]]
    paramsNameList = ['blouse', 'dress', 'outwear', 'skirt', 'trousers']
    # directories where checkpoints for each category are written
    modelSaveList = [
        '../saveparameter/blouse/',
        '../saveparameter/dress/',
        '../saveparameter/outwear/',
        '../saveparameter/skirt/',
        '../saveparameter/trousers/'
    ]
    # previous checkpoints to warm-start from
    paramsOldList = [
        '../saveparameter/blouse/3000res50.pth.tar',
        '../saveparameter/dress/15000new2.pth.tar',
        '../saveparameter/outwear/10000new2.pth.tar',
        '../saveparameter/skirt/5000new2.pth.tar',
        '/home/tanghm/Documents/YFF/project/saveparameter/trousers/15000new2.pth.tar'
    ]

    # only the first category (blouse) is currently trained
    for idx in range(0, 1):
        # announce which garment category is being trained
        print('train' + paramsNameList[idx])
        # total number of keypoints this garment needs
        numpoints = classNumList[idx]

        # build the model and warm-start from the previous checkpoint
        model = construct_model(numpoints)
        state_dict = torch.load(paramsOldList[idx])['state_dict']
        model.load_state_dict(state_dict)

        ann_path = annList[idx]      # label file path
        img_dir = '../data/train/'   # image root
        stride = 8
        cudnn.benchmark = True
        config = util.Config('./config.yml')

        # training data with random resize/rotate/crop augmentation
        train_loader = torch.utils.data.DataLoader(
            dataset_loader.dataset_loader(numpoints, img_dir, ann_path, stride,
                                          Mytransforms.Compose([
                                              Mytransforms.RandomResized(),
                                              Mytransforms.RandomRotate(40),
                                              Mytransforms.RandomCrop(384),
                                          ]),
                                          sigma=15),
            batch_size=config.batch_size, shuffle=True,
            num_workers=config.workers, pin_memory=True)

        # BUGFIX: criterion was only defined on the CUDA path, which raised
        # NameError on CPU-only machines; define it unconditionally.
        criterion = nn.MSELoss()
        if torch.cuda.is_available():
            criterion = criterion.cuda()

        # only optimize parameters that require gradients
        params = []
        for key, value in model.named_parameters():
            if value.requires_grad:
                params.append({'params': value, 'lr': config.base_lr})
        # optimizer = torch.optim.SGD(params, config.base_lr,
        #                             momentum=config.momentum,
        #                             weight_decay=config.weight_decay)
        optimizer = torch.optim.Adam(params,
                                     lr=config.base_lr,
                                     betas=(0.9, 0.99),
                                     weight_decay=config.weight_decay)

        # model.train()  # only for bn and dropout
        model.eval()

        iters = 0
        batch_time = util.AverageMeter()
        data_time = util.AverageMeter()
        losses = util.AverageMeter()
        losses_list = [util.AverageMeter() for i in range(12)]
        end = time.time()
        # for convenient to compare with origin code
        heat_weight = 48 * 48 * (classNumList[idx] + 1) / 2.0
        # heat_weight = 1

        while iters < config.max_iter:
            # `image` is the input batch, `heatmap` the target heatmaps
            for i, (image, heatmap) in enumerate(train_loader):
                learning_rate = util.adjust_learning_rate(
                    optimizer, iters, config.base_lr,
                    policy=config.lr_policy,
                    policy_parameter=config.policy_parameter)
                data_time.update(time.time() - end)

                if torch.cuda.is_available():
                    # BUGFIX: `async=True` is a SyntaxError on Python >= 3.7
                    # (async became a keyword); non_blocking is the modern
                    # replacement argument.
                    image = image.cuda(non_blocking=True)
                    heatmap = heatmap.cuda(non_blocking=True)
                input_var = torch.autograd.Variable(image)
                heatmap_var = torch.autograd.Variable(heatmap)

                # forward pass through the heatmap model
                heat = model(input_var)

                # weighted MSE heatmap loss
                loss1 = criterion(heat, heatmap_var) * heat_weight
                loss = loss1

                # BUGFIX: `loss.data[0]` was removed in PyTorch >= 0.5;
                # use .item() to extract the scalar value.
                losses.update(loss.item(), image.size(0))
                loss_list = [loss1]
                for cnt, l in enumerate(loss_list):
                    losses_list[cnt].update(l.item(), image.size(0))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_time.update(time.time() - end)
                end = time.time()
                iters += 1

                # periodic console report
                if iters % config.display == 0:
                    print(
                        'Train Iteration: {0}\t'
                        'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                        'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                        'Learning rate = {2}\n'
                        'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.
                        format(iters, config.display, learning_rate,
                               batch_time=batch_time,
                               data_time=data_time,
                               loss=losses))
                    for cnt in range(0, 1):
                        print(
                            'Loss{0}_1 = {loss1.val:.8f} (ave = {loss1.avg:.8f})'
                            .format(cnt + 1, loss1=losses_list[cnt]))
                    print(
                        time.strftime(
                            '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                            time.localtime()))
                    batch_time.reset()
                    data_time.reset()
                    losses.reset()
                    for cnt in range(12):
                        losses_list[cnt].reset()

                # periodic checkpoint plus append to the loss log file
                if iters % 1000 == 0:
                    torch.save(
                        {
                            'iter': iters,
                            'state_dict': model.state_dict(),
                        }, modelSaveList[idx] + str(iters) + 'res50.pth.tar')
                    with open('./logLoss2.txt', 'a') as f:
                        f.write(
                            'Train Iteration: {0}\t'
                            'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                            'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                            'Learning rate = {2}\n'
                            'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.
                            format(iters, config.display, learning_rate,
                                   batch_time=batch_time,
                                   data_time=data_time,
                                   loss=losses) + '\n')
                if iters == config.max_iter:
                    break
    return
highestScore = 0 tsid = 0 name_model = 'parser_model2.pt' path_save_model = os.path.join('gen', name_model) for epoch in range(1, args.epochs+1): for i, (word_tensor, ext_word_ids,char_ids,pos_tensor,xpos_tensor,head_targets,rel_targets,seq_lengths,perm_idx) in enumerate(train_loader): start = time.time() # switch to train mode model.train() ts = (((epoch -1) * train_loader.n_batches) + (i+1)) if (ts%5000 == 0): adjust_learning_rate(args.lr, optimizer,optimizer_sparse) if args.cuda: word_tensor = word_tensor.cuda() pos_tensor = pos_tensor.cuda() xpos_tensor = xpos_tensor.cuda() head_targets = head_targets.cuda() rel_targets = rel_targets.cuda() # compute output arc_logits,label_logits = model(word_tensor,ext_word_ids,char_ids,pos_tensor,xpos_tensor,seq_lengths) arc_logits = arc_logits[:,1:,:] label_logits = label_logits[:,1:,:,:] head_targets = head_targets.view(-1) rel_targets = rel_targets.view(-1) s_arc_scores, s_arc_indices = torch.max(arc_logits, 2)
def main():
    """Entry point: pretrain the two-stream contrast model end to end.

    Builds the loader/model/optimizer, optionally wraps them with apex amp,
    resumes from a checkpoint when requested, then runs the epoch loop with
    tensorboard logging and periodic checkpointing.
    """
    opt = parse_option()

    # data / model / optimizer
    train_loader, n_data = get_train_loader(opt)
    model, contrast, criterion_ab, criterion_l = set_model(opt, n_data)
    optimizer = set_optimizer(opt, model)

    # optional mixed-precision wrapping
    if opt.amp:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=opt.opt_level)

    # optionally resume from a checkpoint
    opt.start_epoch = 1
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            # load on CPU first so GPU memory is not spiked by the load
            ckpt = torch.load(opt.resume, map_location='cpu')
            opt.start_epoch = ckpt['epoch'] + 1
            model.load_state_dict(ckpt['model'])
            optimizer.load_state_dict(ckpt['optimizer'])
            contrast.load_state_dict(ckpt['contrast'])
            if opt.amp and ckpt['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(ckpt['amp'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(opt.resume, ckpt['epoch']))
            del ckpt
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # epoch loop
    for epoch in range(opt.start_epoch, opt.epochs + 1):
        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        tic = time.time()
        l_loss, l_prob, ab_loss, ab_prob = train(
            epoch, train_loader, model, contrast,
            criterion_l, criterion_ab, optimizer, opt)
        print('epoch {}, total time {:.2f}'.format(epoch, time.time() - tic))

        # scalar logging, one value per stream
        for tag, value in (('l_loss', l_loss), ('l_prob', l_prob),
                           ('ab_loss', ab_loss), ('ab_prob', ab_prob)):
            logger.log_value(tag, value, epoch)

        # periodic checkpointing
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            snapshot = {
                'opt': opt,
                'model': model.state_dict(),
                'contrast': contrast.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            if opt.amp:
                snapshot['amp'] = amp.state_dict()
            torch.save(snapshot,
                       os.path.join(opt.model_folder,
                                    'ckpt_epoch_{epoch}.pth'.format(epoch=epoch)))
            # help release GPU memory
            del snapshot
            torch.cuda.empty_cache()
def train(args):
    """Run the full training loop for the two-path (L / AB) network.

    Streams endless batches of content, texture, and color images, optimizes
    the two sub-paths jointly on a weighted loss, logs scalars to tensorboard,
    and periodically snapshots the network to disk.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Output directories and tensorboard writer.
    save_dir = Path(os.path.join(args.save_dir, args.name))
    save_dir.mkdir(exist_ok=True, parents=True)
    log_dir = Path(os.path.join(args.log_dir, args.name))
    log_dir.mkdir(exist_ok=True, parents=True)
    writer = SummaryWriter(log_dir=str(log_dir))

    def endless_batches(dataset):
        # Endless iterator over `dataset`; every next() yields one batch.
        loader = data.DataLoader(dataset,
                                 batch_size=args.batch_size,
                                 sampler=InfiniteSamplerWrapper(dataset),
                                 num_workers=args.n_threads)
        return iter(loader)

    # One infinite batch stream per image source.
    content_iter = endless_batches(TrainDataset(args.content_dir, args.img_size))
    texture_iter = endless_batches(TrainDataset(args.texture_dir, args.img_size,
                                                gray_only=True))
    color_iter = endless_batches(TrainDataset(args.color_dir, args.img_size))

    # Network in train mode on the selected device.
    network = Net(args)
    network.train()
    network.to(device)

    # One Adam optimizer per sub-path; they are always stepped together.
    optimizers = [
        torch.optim.Adam(network.L_path.parameters(), lr=args.lr),
        torch.optim.Adam(network.AB_path.parameters(), lr=args.lr),
    ]

    for it in tqdm(range(args.max_iter)):
        step = it + 1

        # S1: adjust lr and pull one batch from each stream
        adjust_learning_rate(optimizers, iteration_count=it, args=args)
        content_l, content_ab = [t.to(device) for t in next(content_iter)]
        texture_l = next(texture_iter).to(device)
        color_l, color_ab = [t.to(device) for t in next(color_iter)]

        # S2: forward pass
        l_pred, ab_pred = network(content_l, content_ab, texture_l, color_ab)

        # S3: weighted sum of content / texture / color losses
        loss_ct, loss_t = network.ct_t_loss(l_pred, content_l, texture_l)
        loss_cr = network.cr_loss(ap_pred := ab_pred, color_ab)
        loss = (args.content_weight * loss_ct
                + args.texture_weight * loss_t
                + args.color_weight * loss_cr)

        # S4: backward once, step both optimizers
        for optimizer in optimizers:
            optimizer.zero_grad()
        loss.backward()
        for optimizer in optimizers:
            optimizer.step()

        # S5: log scalars and periodically snapshot the whole network on CPU
        writer.add_scalar('loss_content', loss_ct.item(), step)
        writer.add_scalar('loss_texture', loss_t.item(), step)
        writer.add_scalar('loss_color', loss_cr.item(), step)
        if step % args.save_model_interval == 0 or step == args.max_iter:
            cpu_state = network.state_dict()
            for key in cpu_state.keys():
                cpu_state[key] = cpu_state[key].to(torch.device('cpu'))
            torch.save(cpu_state,
                       save_dir / 'network_iter_{:d}.pth.tar'.format(step))

    writer.close()
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker: train a linear classifier on a frozen backbone.

    Args:
        gpu: local GPU index for this process (or None for default device).
        ngpus_per_node: number of GPUs per node, used to derive the global
            rank and to pick the rank-0 process for checkpointing.
        args: parsed options namespace; mutated here (``gpu``, ``rank``,
            ``start_epoch``).
    """
    global best_acc1
    best_acc1 = 0
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # set the model (backbone + linear classifier + loss)
    model, classifier, criterion = set_model(args, ngpus_per_node)

    # set optimizer — note only the classifier's parameters are optimized;
    # the backbone stays frozen
    optimizer = set_optimizer(args, classifier)

    cudnn.benchmark = True

    # optionally resume linear classifier
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # set the data loader
    train_loader, val_loader, train_sampler = get_train_val_loader(args)

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine: one train + one validate pass per epoch
    for epoch in range(args.start_epoch, args.epochs + 1):
        if args.distributed:
            # re-seed the sampler so each epoch sees a different shard order
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, model, classifier,
                                      criterion, optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        print("==> testing...")
        test_acc, test_loss = validate(val_loader, model, classifier,
                                       criterion, args)
        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model — only the rank-0 process writes to disk,
        # but every process tracks best_acc1
        if test_acc > best_acc1:
            best_acc1 = test_acc
            if not args.multiprocessing_distributed or (
                    args.multiprocessing_distributed
                    and args.rank % ngpus_per_node == 0):
                state = {
                    'epoch': epoch,
                    'classifier': classifier.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }
                save_name = '{}_layer{}.pth'.format(args.model, args.layer)
                save_name = os.path.join(args.save_folder, save_name)
                print('saving model!')
                torch.save(state, save_name)

        # regular save (again rank-0 only), every save_freq epochs
        if not args.multiprocessing_distributed or \
                (args.multiprocessing_distributed
                 and args.rank % ngpus_per_node == 0):
            if epoch % args.save_freq == 0:
                print('==> Saving...')
                state = {
                    'epoch': epoch,
                    'classifier': classifier.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }
                save_file = os.path.join(
                    args.save_folder,
                    'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
                torch.save(state, save_file)

    # tensorboard logger
    # NOTE(review): trailing no-op kept as-is; the logger is never
    # explicitly closed here.
    pass
def main():
    """Train and evaluate a linear classifier on top of a frozen backbone.

    Builds loaders, model and optimizer, optionally resumes the classifier
    from a checkpoint, then runs the train/validate loop with tensorboard
    logging, best-model saving, and periodic checkpoints.
    """
    global best_acc1
    best_acc1 = 0

    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    train_loader, val_loader, train_sampler = get_train_val_loader(args)

    # set the model
    model, classifier, criterion = set_model(args)

    # set optimizer — only the linear classifier is optimized
    optimizer = set_optimizer(args, classifier)

    cudnn.benchmark = True

    # optionally resume linear classifier.
    # FIX: the original had two consecutive resume blocks; the second reset
    # args.start_epoch back to 1 (discarding the first block's work),
    # re-loaded the checkpoint a second time (the first load without
    # map_location, needlessly touching GPU memory), and called .cuda()
    # unconditionally. Merged into a single, guarded load.
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # load to CPU first so the load itself does not spike GPU memory
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_acc5, train_loss = train(epoch, train_loader, model,
                                                  classifier, criterion,
                                                  optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))
        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_acc5', train_acc5, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        print("==> testing...")
        test_acc, test_acc5, test_loss = validate(val_loader, model,
                                                  classifier, criterion, args)
        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc5', test_acc5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        if test_acc > best_acc1:
            best_acc1 = test_acc
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            save_name = '{}_layer{}.pth'.format(args.model, args.layer)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving best model!')
            torch.save(state, save_name)

        # regular periodic save
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': test_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_name = 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving regular model!')
            torch.save(state, save_name)
def train(current_gpu, args):
    """Full training routine for a 3-head grapheme classifier with CutMix.

    Trains a model-zoo backbone whose single logit vector is split into three
    target heads (grapheme / vowel / consonant), applying CutMix on half the
    batches, with optional apex mixed precision and distributed data
    parallelism. Validates and checkpoints once per epoch.

    Args:
        current_gpu: local GPU index of this process; 0 prints progress.
        args: options namespace (mutated by dist/image setup helpers).
    """
    best_acc1 = -1
    model_history = {}
    model_history = util.init_modelhistory(model_history)
    train_start = time.time()

    ## choose model from pytorch model_zoo
    model = util.torch_model(args.model_name, pretrained=True)
    loss_fn = nn.CrossEntropyLoss().cuda()

    ## distributed_setting
    model, args = dis_util.dist_setting(current_gpu, model, loss_fn, args)

    ## CuDNN library will benchmark several algorithms and pick that which it found to be fastest
    # (disabled when a fixed seed is requested, for reproducibility)
    cudnn.benchmark = False if args.seed else True

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    if args.apex:
        model, optimizer = dis_util.apex_init(model, optimizer, args)
        # args.collate_fn = partial(dis_util.fast_collate, memory_format=args.memory_format)

    # discover image lists, then build train/test loaders
    args = _get_images(args, data_type='train')
    train_loader, train_sampler = _get_train_data_loader(args, **args.kwargs)
    test_loader = _get_test_data_loader(args, **args.kwargs)

    logger.info("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)))

    logger.info("Processes {}/{} ({:.0f}%) of test data".format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)))

    for epoch in range(1, args.num_epochs + 1):
        ## per-epoch running meters
        batch_time = util.AverageMeter('Time', ':6.3f')
        data_time = util.AverageMeter('Data', ':6.3f')
        losses = util.AverageMeter('Loss', ':.4e')
        top1 = util.AverageMeter('Acc@1', ':6.2f')
        top5 = util.AverageMeter('Acc@5', ':6.2f')
        progress = util.ProgressMeter(
            len(train_loader), [batch_time, data_time, losses, top1, top5],
            prefix="Epoch: [{}]".format(epoch))

        trn_loss = []
        model.train()
        end = time.time()
        running_loss = 0.0

        ## Set epoch count for DistributedSampler
        if args.multigpus_distributed:
            train_sampler.set_epoch(epoch)

        # prefetcher overlaps host->device copies with compute
        prefetcher = util.data_prefetcher(train_loader)
        input, target = prefetcher.next()
        batch_idx = 0

        while input is not None:
            batch_idx += 1

            # optional CUDA profiler window starting at iteration args.prof
            if args.prof >= 0 and batch_idx == args.prof:
                print("Profiling begun at iteration {}".format(batch_idx))
                torch.cuda.cudart().cudaProfilerStart()

            if args.prof >= 0:
                torch.cuda.nvtx.range_push(
                    "Body of iteration {}".format(batch_idx))

            util.adjust_learning_rate(optimizer, epoch, batch_idx,
                                      len(train_loader), args)

            ##### DATA Processing #####
            # target columns: 0 = grapheme root, 1 = vowel, 2 = consonant
            targets_gra = target[:, 0]
            targets_vow = target[:, 1]
            targets_con = target[:, 2]

            # use the original batch unchanged with 50% probability
            if np.random.rand() < 0.5:
                logits = model(input)
                # logit layout: [0:168) grapheme, [168:179) vowel, [179:) consonant
                grapheme = logits[:, :168]
                vowel = logits[:, 168:179]
                cons = logits[:, 179:]
                loss1 = loss_fn(grapheme, targets_gra)
                loss2 = loss_fn(vowel, targets_vow)
                loss3 = loss_fn(cons, targets_con)
            else:
                # CutMix: paste a random box from a shuffled copy of the batch
                # and mix the three losses by the pasted-area ratio
                lam = np.random.beta(1.0, 1.0)
                rand_index = torch.randperm(input.size()[0])
                shuffled_targets_gra = targets_gra[rand_index]
                shuffled_targets_vow = targets_vow[rand_index]
                shuffled_targets_con = targets_con[rand_index]
                bbx1, bby1, bbx2, bby2 = _rand_bbox(input.size(), lam)
                input[:, :, bbx1:bbx2, bby1:bby2] = input[rand_index, :,
                                                          bbx1:bbx2, bby1:bby2]
                # adjust the lambda parameter to exactly match the pixel ratio
                lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) /
                           (input.size()[-1] * input.size()[-2]))
                logits = model(input)
                grapheme = logits[:, :168]
                vowel = logits[:, 168:179]
                cons = logits[:, 179:]
                loss1 = loss_fn(grapheme, targets_gra) * lam + loss_fn(
                    grapheme, shuffled_targets_gra) * (1. - lam)
                loss2 = loss_fn(vowel, targets_vow) * lam + loss_fn(
                    vowel, shuffled_targets_vow) * (1. - lam)
                loss3 = loss_fn(cons, targets_con) * lam + loss_fn(
                    cons, shuffled_targets_con) * (1. - lam)

            # grapheme head weighted double relative to vowel/consonant heads
            loss = 0.5 * loss1 + 0.25 * loss2 + 0.25 * loss3
            trn_loss.append(loss.item())
            running_loss += loss.item()

            #########################################################

            # compute gradient and do SGD step
            optimizer.zero_grad()
            if args.apex:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            # Printing vital information
            if (batch_idx + 1) % (args.log_interval) == 0:
                s = f'[Epoch {epoch} Batch {batch_idx+1}/{len(train_loader)}] ' \
                    f'loss: {running_loss / args.log_interval:.4f}'
                print(s)
                running_loss = 0

            # NOTE(review): 'True or ...' makes this branch run every
            # iteration — likely debug leftover; confirm before changing,
            # since the comment below says it should be rate-limited.
            if True or batch_idx % args.log_interval == 0:
                # Every log_interval iterations, check the loss, accuracy, and speed.
                # For best performance, it doesn't make sense to print these metrics every
                # iteration, since they incur an allreduce and some host<->device syncs.

                # Measure accuracy
                prec1, prec5 = util.accuracy(logits, target, topk=(1, 5))

                # Average loss and accuracy across processes for logging
                if args.multigpus_distributed:
                    reduced_loss = dis_util.reduce_tensor(loss.data, args)
                    prec1 = dis_util.reduce_tensor(prec1, args)
                    prec5 = dis_util.reduce_tensor(prec5, args)
                else:
                    reduced_loss = loss.data

                # to_python_float incurs a host<->device sync
                losses.update(to_python_float(reduced_loss), input.size(0))
                top1.update(to_python_float(prec1), input.size(0))
                top5.update(to_python_float(prec5), input.size(0))

                ## Waiting until finishing operations on GPU (Pytorch default: async)
                torch.cuda.synchronize()
                batch_time.update((time.time() - end) / args.log_interval)
                end = time.time()

                if current_gpu == 0:
                    print('Epoch: [{0}][{1}/{2}] '
                          'Time {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                          'Speed {3:.3f} ({4:.3f}) '
                          'Loss {loss.val:.10f} ({loss.avg:.4f}) '
                          'Prec@1 {top1.val:.3f} ({top1.avg:.3f}) '
                          'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                              epoch, batch_idx, len(train_loader),
                              args.world_size * args.batch_size /
                              batch_time.val,
                              args.world_size * args.batch_size /
                              batch_time.avg, batch_time=batch_time,
                              loss=losses, top1=top1, top5=top5))
                    model_history['epoch'].append(epoch)
                    model_history['batch_idx'].append(batch_idx)
                    model_history['batch_time'].append(batch_time.val)
                    model_history['losses'].append(losses.val)
                    model_history['top1'].append(top1.val)
                    model_history['top5'].append(top5.val)

            input, target = prefetcher.next()

        # end-of-epoch validation
        acc1 = validate(test_loader, model, loss_fn, epoch, model_history,
                        trn_loss, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # only the rank-0 process (or single-process runs) writes to disk
        if not args.multigpus_distributed or (args.multigpus_distributed and
                                              args.rank % args.num_gpus == 0):
            util.save_history(
                os.path.join(args.output_data_dir, 'model_history.p'),
                model_history)

            util.save_model(
                {
                    'epoch': epoch + 1,
                    'model_name': args.model_name,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                    # 'class_to_idx' : train_loader.dataset.class_to_idx,
                }, is_best, args.model_dir)
def main():
    """Distill a teacher into a student on a few-shot classification dataset.

    Builds dataset-specific loaders (miniImageNet / tieredImageNet /
    CIFAR-FS / FC100), instantiates teacher and student, picks the
    distillation criterion per ``opt.distill``, then runs the epoch loop with
    tensorboard logging, periodic checkpoints, and a final 'last' save.
    """
    best_acc = 0

    opt = parse_option()

    # tensorboard logger
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # dataloader — each dataset branch sets train/val/meta loaders, n_data
    # (needed by the contrastive criterion) and n_cls (classifier width)
    train_partition = 'trainval' if opt.use_trainval else 'train'
    if opt.dataset == 'miniImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        if opt.distill in ['contrast']:
            # is_sample=True makes the dataset also return negative-sample
            # indices for the contrastive loss
            train_set = ImageNet(args=opt, partition=train_partition,
                                 transform=train_trans, is_sample=True,
                                 k=opt.nce_k)
        else:
            train_set = ImageNet(args=opt, partition=train_partition,
                                 transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set, batch_size=opt.batch_size,
                                  shuffle=True, drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(ImageNet(args=opt, partition='val',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False, drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaImageNet(args=opt, partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False, drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaImageNet(args=opt, partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False, drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            n_cls = 64
    elif opt.dataset == 'tieredImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        if opt.distill in ['contrast']:
            train_set = TieredImageNet(args=opt, partition=train_partition,
                                       transform=train_trans, is_sample=True,
                                       k=opt.nce_k)
        else:
            train_set = TieredImageNet(args=opt, partition=train_partition,
                                       transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set, batch_size=opt.batch_size,
                                  shuffle=True, drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(TieredImageNet(args=opt,
                                               partition='train_phase_val',
                                               transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False, drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaTieredImageNet(
            args=opt, partition='test', train_transform=train_trans,
            test_transform=test_trans), batch_size=opt.test_batch_size,
            shuffle=False, drop_last=False, num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaTieredImageNet(
            args=opt, partition='val', train_transform=train_trans,
            test_transform=test_trans), batch_size=opt.test_batch_size,
            shuffle=False, drop_last=False, num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 448
        else:
            n_cls = 351
    elif opt.dataset == 'CIFAR-FS' or opt.dataset == 'FC100':
        train_trans, test_trans = transforms_options['D']
        if opt.distill in ['contrast']:
            train_set = CIFAR100(args=opt, partition=train_partition,
                                 transform=train_trans, is_sample=True,
                                 k=opt.nce_k)
        else:
            train_set = CIFAR100(args=opt, partition=train_partition,
                                 transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set, batch_size=opt.batch_size,
                                  shuffle=True, drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(CIFAR100(args=opt, partition='train',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False, drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaCIFAR100(args=opt, partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False, drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaCIFAR100(args=opt, partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False, drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            if opt.dataset == 'CIFAR-FS':
                n_cls = 64
            elif opt.dataset == 'FC100':
                n_cls = 60
            else:
                raise NotImplementedError('dataset not supported: {}'.format(
                    opt.dataset))
    else:
        raise NotImplementedError(opt.dataset)

    # model
    model_t = load_teacher(opt.path_t, n_cls, opt.dataset)
    model_s = create_model(opt.model_s, n_cls, opt.dataset)

    # dummy forward pass to discover per-layer feature shapes
    # (assumes 3x84x84 inputs — TODO confirm this matches all datasets here)
    data = torch.randn(2, 3, 84, 84)
    model_t.eval()
    model_s.eval()
    feat_t, _ = model_t(data, is_feat=True)
    feat_s, _ = model_s(data, is_feat=True)

    # module_list: everything that must be moved to GPU / switched to train
    # trainable_list: the subset whose parameters the optimizer updates
    module_list = nn.ModuleList([])
    module_list.append(model_s)
    trainable_list = nn.ModuleList([])
    trainable_list.append(model_s)

    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    if opt.distill == 'kd':
        criterion_kd = DistillKL(opt.kd_T)
    elif opt.distill == 'contrast':
        criterion_kd = NCELoss(opt, n_data)
        # projection heads embedding the last feature map of each network
        embed_s = Embed(feat_s[-1].shape[1], opt.feat_dim)
        embed_t = Embed(feat_t[-1].shape[1], opt.feat_dim)
        module_list.append(embed_s)
        module_list.append(embed_t)
        trainable_list.append(embed_s)
        trainable_list.append(embed_t)
    elif opt.distill == 'attention':
        criterion_kd = Attention()
    elif opt.distill == 'hint':
        criterion_kd = HintLoss()
    else:
        raise NotImplementedError(opt.distill)

    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(
        criterion_div)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kd)  # other knowledge distillation loss

    # optimizer
    optimizer = optim.SGD(trainable_list.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    # append teacher after optimizer to avoid weight_decay
    module_list.append(model_t)

    if torch.cuda.is_available():
        module_list.cuda()
        criterion_list.cuda()
        cudnn.benchmark = True

    # validate teacher accuracy
    teacher_acc, _, _ = validate(val_loader, model_t, criterion_cls, opt)
    print('teacher accuracy: ', teacher_acc)

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)

    # routine: supervised model distillation
    for epoch in range(1, opt.epochs + 1):

        if opt.cosine:
            # NOTE(review): scheduler.step() is called at the start of the
            # epoch, before any optimizer.step(); PyTorch >= 1.1 expects the
            # opposite order — confirm this is intentional before changing.
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, module_list,
                                      criterion_list, optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model_s,
                                                      criterion_cls, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

    # save the last model
    state = {
        'opt': opt,
        'model': model_s.state_dict(),
    }
    save_file = os.path.join(opt.save_folder,
                             '{}_last.pth'.format(opt.model_s))
    torch.save(state, save_file)
def main():
    """Entry point: train the ISD student against its momentum teacher."""
    opt = parse_option()
    os.makedirs(opt.checkpoint_path, exist_ok=True)

    if not opt.debug:
        # route every print() through the file logger and disable breakpoints
        os.environ['PYTHONBREAKPOINT'] = '0'
        logger = get_logger(logpath=os.path.join(opt.checkpoint_path, 'logs'),
                            filepath=os.path.abspath(__file__))

        def print_pass(*args):
            logger.info(*args)

        builtins.print = print_pass

    if opt.gpu is not None:
        print("Use GPU: {} for training".format(opt.gpu))
    print(opt)

    train_loader = get_train_loader(opt)

    # student/teacher wrapper with negative queue of size K
    isd = ISD(opt.arch, K=opt.queue_size, m=opt.momentum, T=opt.temp)
    isd.data_parallel()
    isd = isd.cuda()
    print(isd)

    criterion = KLD().cuda()

    # optimize only the parameters that require grad
    trainable = [p for p in isd.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable,
                                lr=opt.learning_rate,
                                momentum=opt.sgd_momentum,
                                weight_decay=opt.weight_decay)

    cudnn.benchmark = True

    # optionally pick up where a previous run left off
    opt.start_epoch = 1
    if opt.resume:
        print('==> resume from checkpoint: {}'.format(opt.resume))
        ckpt = torch.load(opt.resume)
        print('==> resume from epoch: {}'.format(ckpt['epoch']))
        isd.load_state_dict(ckpt['state_dict'], strict=True)
        optimizer.load_state_dict(ckpt['optimizer'])
        opt.start_epoch = ckpt['epoch'] + 1

    # epoch loop
    for epoch in range(opt.start_epoch, opt.epochs + 1):
        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        tic = time.time()
        loss = train_student(epoch, train_loader, isd, criterion,
                             optimizer, opt)
        print('epoch {}, total time {:.2f}'.format(epoch, time.time() - tic))

        # periodic checkpointing
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            snapshot = {
                'opt': opt,
                'state_dict': isd.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            torch.save(snapshot,
                       os.path.join(opt.checkpoint_path,
                                    'ckpt_epoch_{epoch}.pth'.format(epoch=epoch)))
            # help release GPU memory
            del snapshot
            torch.cuda.empty_cache()
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker for MoCo pretraining with a TTT self-supervised head.

    Either evaluates a checkpoint with a liblinear SVM (``args.val``) or runs
    the MoCo+SSH training loop with periodic checkpoints and SVM probes.

    Args:
        gpu: local GPU index for this process.
        ngpus_per_node: GPUs per node, used for rank math and batch splitting.
        args: options namespace; mutated (``gpu``, ``rank``, ``batch_size``,
            ``workers``).

    NOTE(review): ``model_val``, ``ext``, ``head`` and ``ssh`` are only
    defined in the ``args.arch == 'resnet_ttt'`` branch but are used
    unconditionally below — any other arch would raise NameError. Confirm
    that only 'resnet_ttt' is supported before relying on other arches.
    """
    global best_acc1  # stliu: best accuracy
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    # stliu: add resnet_ttt
    if args.arch == 'resnet_ttt':
        model = moco.builder.MoCo(ResNetCifar, args.moco_dim, args.moco_k,
                                  args.moco_m, args.moco_t, args.mlp,
                                  width=args.width, norm=args.norm)
        _, ext, head, ssh = build_model(
            args, model.encoder_q
        )  # stliu: ext, head and ssh share same paras as encoder_q
        # stliu: SVM with model_val on single GPU
        norm_layer = get_norm(args.norm)
        model_val = ResNetCifar(num_classes=args.moco_dim,
                                width=args.width,
                                norm_layer=norm_layer)
    else:
        model = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim,
                                  args.moco_k, args.moco_m, args.moco_t,
                                  args.mlp)
    # print(model)  # stliu: comment this

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model_val.cuda(args.gpu)  # stliu: for SVM
            ssh = ssh.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            # stliu: add broadcast_buffers=False to use normal BN
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu], broadcast_buffers=False,
                find_unused_parameters=True)
            ssh = torch.nn.parallel.DistributedDataParallel(
                ssh, device_ids=[args.gpu], broadcast_buffers=False,
                find_unused_parameters=True)
            # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            # ssh = torch.nn.parallel.DistributedDataParallel(ssh, device_ids=[args.gpu])
        else:
            model.cuda()
            model_val.cuda()  # stliu: for SVM
            ssh = ssh.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(
                model, broadcast_buffers=False, find_unused_parameters=True)
            ssh = torch.nn.parallel.DistributedDataParallel(
                ssh, broadcast_buffers=False, find_unused_parameters=True)
            # model = torch.nn.parallel.DistributedDataParallel(model)
            # ssh = torch.nn.parallel.DistributedDataParallel(ssh)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        model_val = model_val.cuda(args.gpu)  # stliu: for SVM
        ssh = ssh.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # jointly optimize the MoCo encoder and the self-supervised head
    parameters = list(model.parameters()) + list(head.parameters())
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            head.load_state_dict(checkpoint['head'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # stliu: I design it as a function
    train_loader, train_sampler, memory_loader, test_loader, teset = get_loader(
        args)

    if args.val:
        # Evaluation-only path: copy encoder_q weights (minus the fc head and
        # the 'module.encoder_q.' prefix) into model_val and probe with a
        # liblinear SVM.
        state_dict = model.state_dict()
        for k in list(state_dict.keys()):
            if k.startswith('module.encoder_q'
                            ) and not k.startswith('module.encoder_q.fc'):
                state_dict[k[len("module.encoder_q."):]] = state_dict[k]
            # delete renamed or unused key
            del state_dict[k]
        model_val.load_state_dict(state_dict, strict=False)
        flag_liblinear = '-s 2 -q -n ' + str(args.workers)
        if args.ttt:
            test_acc_svm = ttt_test(memory_loader, model, model_val,
                                    test_loader, flag_liblinear, args, ssh,
                                    teset, head)
        else:
            test_acc_svm = test(memory_loader, model, model_val, test_loader,
                                flag_liblinear, args, ssh)
        print('#### result ####\n' + args.val + ':', test_acc_svm,
              '\n################')
    else:
        # stliu: tensorboard
        logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)
        for epoch in range(args.start_epoch, args.epochs + 1):
            # stliu: to save the last one
            if args.distributed:
                train_sampler.set_epoch(epoch)
            adjust_learning_rate(optimizer, epoch, args)

            # train for one epoch
            loss = train(train_loader, model, criterion, optimizer, epoch,
                         args, ssh)
            # stliu: tensorboard logger
            logger.log_value('loss', loss, epoch)

            # rank-0-only checkpointing
            if not args.multiprocessing_distributed or (
                    args.multiprocessing_distributed
                    and args.rank % ngpus_per_node == 0):
                if (epoch % args.save_freq == 0 and epoch != 0
                    ) or epoch == args.epochs:  # stliu: ignore the first model
                    print('==> Saving...')
                    save_checkpoint(
                        {
                            'epoch': epoch + 1,
                            'arch': args.arch,
                            'state_dict': model.state_dict(),
                            'head': head.state_dict(),
                            'optimizer': optimizer.state_dict(),
                        },
                        is_best=False,
                        filename=args.model_folder +
                        '/checkpoint_{:04d}.pth.tar'.format(epoch))

            # stliu: test with SVM
            # NOTE(review): the nesting of this block relative to the rank-0
            # guard above could not be fully recovered from the flattened
            # source; as written it runs on every rank — verify against the
            # original layout.
            if (epoch + 1) % args.svm_freq == 0:
                state_dict = model.state_dict()
                for k in list(state_dict.keys()):
                    if k.startswith('module.encoder_q') and not k.startswith(
                            'module.encoder_q.fc'):
                        state_dict[
                            k[len("module.encoder_q."):]] = state_dict[k]
                    del state_dict[k]
                model_val.load_state_dict(state_dict, strict=False)
                flag_liblinear = '-s 2 -q -n ' + str(args.workers)
                test_acc_svm = test(memory_loader, model, model_val,
                                    test_loader, flag_liblinear, args, ssh)
                # stliu: save the best model
                is_best = test_acc_svm > best_acc1
                best_acc1 = max(test_acc_svm, best_acc1)
                if is_best:
                    print('==> Saving the Best...')
                    save_checkpoint(
                        {
                            'epoch': epoch + 1,
                            'arch': args.arch,
                            'state_dict': model.state_dict(),
                            'head': head.state_dict(),
                            'optimizer': optimizer.state_dict(),
                        },
                        is_best=True,
                        filename=args.model_folder + '/best.pth.tar')
                print('The Best SVM Accuracy:', best_acc1)
def main():
    """Train a conditional BEGAN text-to-image GAN on the Flowers dataset.

    Loads caption vectors and image dicts, builds generator/discriminator,
    optionally resumes from a checkpoint, then alternates discriminator and
    generator updates per batch using the BEGAN equilibrium term ``began_k``.
    After each epoch it evaluates on the val split, logs losses, saves sample
    images and (periodically) model checkpoints.

    NOTE(review): relies on a module-level ``args`` (argparse namespace) for
    ``args.resume`` — confirm it is defined at import time.  Written against
    pre-0.4 PyTorch (``Variable``, ``volatile=True``, ``.data[0]``).
    """
    # Make directories if they don't already exist
    util.make_directories()
    # Load model options
    model_options = constants.MAIN_MODEL_OPTIONS

    ########## DATA ##########
    if constants.PRINT_MODEL_STATUS: print("Loading data")
    # dataset_map: mapping used to split captions into train/val/test
    dataset_map = util.load_dataset_map()
    train_captions, val_captions, test_captions = util.load_text_vec(
        'Data', constants.VEC_OUTPUT_FILE_NAME, dataset_map)
    train_image_dict, val_image_dict, test_image_dict = util.get_images(
        'Data', constants.DIRECTORY_PATH, constants.FLOWERS_DICTS_PATH)

    ########## MODEL ##########
    generator = CondBeganGenerator(model_options)
    discriminator = CondBeganDiscriminator(model_options)

    # Put G and D on cuda if GPU available
    if torch.cuda.is_available():
        if constants.PRINT_MODEL_STATUS: print("CUDA is available")
        generator = generator.cuda()
        discriminator = discriminator.cuda()
        if constants.PRINT_MODEL_STATUS: print("Moved models to GPU")

    # Initialize weights
    generator.apply(util.weights_init)
    discriminator.apply(util.weights_init)

    ########## SAVED VARIABLES #########
    new_epoch = 0          # first epoch to run (overwritten on resume)
    began_k = 0            # BEGAN equilibrium coefficient k, kept in [0, 1]
    train_losses = {"generator": [], "discriminator": [], "converge": []}
    val_losses = {"generator": [], "discriminator": [], "converge": []}
    losses = {'train': train_losses, 'val': val_losses}

    ########## OPTIMIZER ##########
    g_optimizer = optim.Adam(generator.parameters(),
                             lr=constants.LR,
                             betas=constants.BETAS)
    # Changes the optimizer to SGD if declared in constants
    if constants.D_OPTIMIZER_SGD:
        d_optimizer = optim.SGD(discriminator.parameters(), lr=constants.LR)
    else:
        d_optimizer = optim.Adam(discriminator.parameters(),
                                 lr=constants.LR,
                                 betas=constants.BETAS)
    if constants.PRINT_MODEL_STATUS: print("Added optimizers")

    ########## RESUME OPTION ##########
    # Restores model/optimizer state, began_k and the loss history so
    # training continues from checkpoint['epoch'] + 1.
    if args.resume:
        print("Resuming from epoch " + args.resume)
        checkpoint = torch.load(constants.SAVE_PATH + 'weights/epoch' +
                                str(args.resume))
        new_epoch = checkpoint['epoch'] + 1
        generator.load_state_dict(checkpoint['g_dict'])
        discriminator.load_state_dict(checkpoint['d_dict'])
        began_k = checkpoint['began_k']
        g_optimizer.load_state_dict(checkpoint['g_optimizer'])
        d_optimizer.load_state_dict(checkpoint['d_optimizer'])
        losses = torch.load(constants.SAVE_PATH + 'losses')

    ########## VARIABLES ##########
    # Pre-allocated buffers, re-filled in place each batch via
    # resize_as_/copy_ (old-PyTorch idiom to avoid per-batch allocation).
    noise_vec = torch.FloatTensor(constants.BATCH_SIZE,
                                  model_options['z_dim'])
    text_vec = torch.FloatTensor(constants.BATCH_SIZE,
                                 model_options['caption_vec_len'])
    real_img = torch.FloatTensor(constants.BATCH_SIZE,
                                 model_options['image_channels'],
                                 constants.IMAGE_SIZE, constants.IMAGE_SIZE)
    real_caption = torch.FloatTensor(constants.BATCH_SIZE,
                                     model_options['caption_vec_len'])
    if constants.USE_CLS:
        # CLS: mismatched ("wrong") images for the caption-sensitivity term.
        wrong_img = torch.FloatTensor(constants.BATCH_SIZE,
                                      model_options['image_channels'],
                                      constants.IMAGE_SIZE,
                                      constants.IMAGE_SIZE)
        # NOTE(review): wrong_caption is allocated but never filled or used.
        wrong_caption = torch.FloatTensor(constants.BATCH_SIZE,
                                          model_options['caption_vec_len'])

    # Add cuda GPU option
    if torch.cuda.is_available():
        noise_vec = noise_vec.cuda()
        text_vec = text_vec.cuda()
        real_img = real_img.cuda()
        real_caption = real_caption.cuda()
        if constants.USE_CLS:
            wrong_img = wrong_img.cuda()

    ########## Training ##########
    num_iterations = 0
    for epoch in range(new_epoch, constants.NUM_EPOCHS):
        print("Epoch %d" % (epoch))
        st = time.time()
        for i, batch_iter in enumerate(
                util.grouper(train_captions.keys(), constants.BATCH_SIZE)):
            # grouper pads the last group with None; drop the padding.
            batch_keys = [x for x in batch_iter if x is not None]
            curr_batch_size = len(batch_keys)
            discriminator.train()
            generator.train()
            discriminator.zero_grad()  # Zero out gradient
            # Save computations for gradient calculations
            for p in discriminator.parameters():
                p.requires_grad = True  # Need this to be true to update generator as well

            ########## BATCH DATA #########
            noise_batch = torch.randn(curr_batch_size,
                                      model_options['z_dim'])
            text_vec_batch = torch.Tensor(
                util.get_text_description(train_captions, batch_keys))
            real_caption_batch = torch.Tensor(
                util.get_text_description(train_captions, batch_keys))
            real_img_batch = torch.Tensor(
                util.choose_real_image(train_image_dict, batch_keys))
            if constants.USE_CLS:
                wrong_img_batch = torch.Tensor(
                    util.choose_wrong_image(train_image_dict, batch_keys))
            if torch.cuda.is_available():
                noise_batch = noise_batch.cuda()
                text_vec_batch = text_vec_batch.cuda()
                real_caption_batch = real_caption_batch.cuda()
                real_img_batch = real_img_batch.cuda()
                if constants.USE_CLS:
                    wrong_img_batch = wrong_img_batch.cuda()

            # Fill in tensors with batch data
            # NOTE(review): real_caption is filled from text_vec_batch, not
            # real_caption_batch (the two hold identical data here).
            noise_vec.resize_as_(noise_batch).copy_(noise_batch)
            text_vec.resize_as_(text_vec_batch).copy_(text_vec_batch)
            real_caption.resize_as_(text_vec_batch).copy_(text_vec_batch)
            real_img.resize_as_(real_img_batch).copy_(real_img_batch)
            if constants.USE_CLS:
                wrong_img.resize_as_(wrong_img_batch).copy_(wrong_img_batch)

            ########## RUN THROUGH GAN ##########
            gen_image = generator.forward(Variable(text_vec),
                                          Variable(noise_vec))
            real_img_passed = discriminator.forward(Variable(real_img),
                                                    Variable(real_caption))
            # detach(): D update must not backprop into the generator.
            fake_img_passed = discriminator.forward(gen_image.detach(),
                                                    Variable(real_caption))
            if constants.USE_CLS:
                # NOTE(review): wrong_img_passed is computed but never used
                # below (d_wrong_loss compares fake_img_passed to wrong_img)
                # — confirm this is the intended CLS formulation.
                wrong_img_passed = discriminator.forward(
                    Variable(wrong_img), Variable(real_caption))

            ########## TRAIN DISCRIMINATOR ##########
            # D is a BEGAN autoencoder: each L(.) below is the L1
            # reconstruction error of its input.
            if constants.USE_REAL_LS:
                # Real loss sensitivity
                # L_D = L(y_r) - k * (L(y_f) + L(y_f, r))
                # L_G = L(y_f) + L(y_f, r)
                # k = k + lambda_k * (gamma * L(y_r) + L(y_f) + L(y_f, r))
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_fake_loss = torch.mean(
                    torch.abs(fake_img_passed - gen_image))
                d_real_sensitivity_loss = torch.mean(
                    torch.abs(fake_img_passed - Variable(real_img)))
                d_loss = d_real_loss - began_k * (
                    0.5 * d_fake_loss + 0.5 * d_real_sensitivity_loss)
                # Update began k value (clamped to [0, 1])
                balance = (model_options['began_gamma'] * d_real_loss -
                           0.5 * d_fake_loss -
                           0.5 * d_real_sensitivity_loss).data[0]
                began_k = min(
                    max(began_k + model_options['began_lambda_k'] * balance,
                        0), 1)
            elif constants.USE_CLS:
                # Cond BEGAN Discrminator Loss with CLS
                # L(y_w) is the caption loss sensitivity CLS (makes sure
                # that captions match the image)
                # L_D = L(y_r) + L(y_f, w) - k * L(y_f)
                # L_G = L(y_f)
                # k = k + lambda_k * (gamma * (L(y_r) + L(y_f, w)) - L(y_f))
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_wrong_loss = torch.mean(
                    torch.abs(fake_img_passed - Variable(wrong_img)))
                d_fake_loss = torch.mean(
                    torch.abs(fake_img_passed - gen_image))
                d_loss = 0.5 * d_real_loss + 0.5 * d_wrong_loss - began_k * d_fake_loss
                # Update began k value
                balance = (model_options['began_gamma'] *
                           (0.5 * d_real_loss + 0.5 * d_wrong_loss) -
                           d_fake_loss).data[0]
                began_k = min(
                    max(began_k + model_options['began_lambda_k'] * balance,
                        0), 1)
            # No CLS option
            else:
                # Cond BEGAN Discriminator Loss
                # L_D = L(y_r) - k * L(y_f)
                # k = k + lambda_k * (gamma * L(y_r) + L(y_f))
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_fake_loss = torch.mean(
                    torch.abs(fake_img_passed - gen_image))
                d_loss = d_real_loss - began_k * d_fake_loss
                # Update began k value
                balance = (model_options['began_gamma'] * d_real_loss -
                           d_fake_loss).data[0]
                began_k = min(
                    max(began_k + model_options['began_lambda_k'] * balance,
                        0), 1)

            d_loss.backward()
            d_optimizer.step()

            ########## TRAIN GENERATOR ##########
            generator.zero_grad()
            # Freeze D so the G update doesn't accumulate grads into it.
            for p in discriminator.parameters():
                p.requires_grad = False

            # Generate image again if you want to (fresh noise for G's step)
            if constants.REGEN_IMAGE:
                noise_batch = torch.randn(curr_batch_size,
                                          model_options['z_dim'])
                if torch.cuda.is_available():
                    noise_batch = noise_batch.cuda()
                noise_vec.resize_as_(noise_batch).copy_(noise_batch)
                gen_image = generator.forward(Variable(text_vec),
                                              Variable(noise_vec))

            # gen_image is NOT detached here: gradients flow back into G.
            new_fake_img_passed = discriminator.forward(
                gen_image, Variable(real_caption))

            # Generator Loss
            # L_G = L(y_f)
            g_loss = torch.mean(torch.abs(new_fake_img_passed - gen_image))
            if constants.USE_REAL_LS:
                g_loss += torch.mean(
                    torch.abs(new_fake_img_passed - Variable(real_img)))
            elif constants.USE_CLS:
                g_loss -= torch.mean(
                    torch.abs(new_fake_img_passed - Variable(wrong_img)))
            g_loss.backward()
            g_optimizer.step()

            # M = L(y_r) + |gamma * L(y_r) - L(y_f)|  (BEGAN convergence)
            convergence_val = d_real_loss + abs(balance)

            # learning rate decay (per iteration, not per epoch)
            g_optimizer = util.adjust_learning_rate(g_optimizer,
                                                    num_iterations)
            d_optimizer = util.adjust_learning_rate(d_optimizer,
                                                    num_iterations)

            if i % constants.LOSS_SAVE_IDX == 0:
                losses['train']['generator'].append(
                    (g_loss.data[0], epoch, i))
                losses['train']['discriminator'].append(
                    (d_loss.data[0], epoch, i))
                losses['train']['converge'].append(
                    (convergence_val.data[0], epoch, i))
            num_iterations += 1

        # Epoch summary (values come from the last training batch).
        print('Total number of iterations: ', num_iterations)
        print('Training G Loss: ', g_loss.data[0])
        print('Training D Loss: ', d_loss.data[0])
        print('Training Convergence: ', convergence_val.data[0])
        print('K value: ', began_k)
        epoch_time = time.time() - st
        print("Time: ", epoch_time)

        # One-off timing/config report at the configured epoch.
        if epoch == constants.REPORT_EPOCH:
            with open(constants.SAVE_PATH + 'report.txt', 'w') as f:
                f.write(constants.EXP_REPORT)
                f.write("Time per epoch: " + str(epoch_time))
                print("Saved report")

        ########## DEV SET #########
        # Calculate dev set loss
        # Volatile is true because we are running in inference mode (no
        # need to calculate gradients)
        generator.eval()
        discriminator.eval()
        for i, batch_iter in enumerate(
                util.grouper(val_captions.keys(), constants.BATCH_SIZE)):
            batch_keys = [x for x in batch_iter if x is not None]
            curr_batch_size = len(batch_keys)

            # Gather batch data (same buffering scheme as training).
            noise_batch = torch.randn(curr_batch_size,
                                      model_options['z_dim'])
            text_vec_batch = torch.Tensor(
                util.get_text_description(val_captions, batch_keys))
            real_caption_batch = torch.Tensor(
                util.get_text_description(val_captions, batch_keys))
            real_img_batch = torch.Tensor(
                util.choose_real_image(val_image_dict, batch_keys))
            if constants.USE_CLS:
                wrong_img_batch = torch.Tensor(
                    util.choose_wrong_image(val_image_dict, batch_keys))
            if torch.cuda.is_available():
                noise_batch = noise_batch.cuda()
                text_vec_batch = text_vec_batch.cuda()
                real_caption_batch = real_caption_batch.cuda()
                real_img_batch = real_img_batch.cuda()
                if constants.USE_CLS:
                    wrong_img_batch = wrong_img_batch.cuda()

            # Fill in tensors with batch data
            noise_vec.resize_as_(noise_batch).copy_(noise_batch)
            text_vec.resize_as_(text_vec_batch).copy_(text_vec_batch)
            real_caption.resize_as_(text_vec_batch).copy_(text_vec_batch)
            real_img.resize_as_(real_img_batch).copy_(real_img_batch)
            if constants.USE_CLS:
                wrong_img.resize_as_(wrong_img_batch).copy_(wrong_img_batch)

            # Run through generator
            gen_image = generator.forward(
                Variable(text_vec, volatile=True),
                Variable(noise_vec,
                         volatile=True))  # Returns tensor variable holding image

            # Run through discriminator
            real_img_passed = discriminator.forward(
                Variable(real_img, volatile=True),
                Variable(real_caption, volatile=True))
            fake_img_passed = discriminator.forward(
                gen_image.detach(), Variable(real_caption, volatile=True))
            if constants.USE_CLS:
                wrong_img_passed = discriminator.forward(
                    Variable(wrong_img, volatile=True),
                    Variable(real_caption, volatile=True))

            # Calculate D loss — mirrors the training branches; began_k is
            # NOT updated here (evaluation only).
            # D LOSS
            if constants.USE_REAL_LS:
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_fake_loss = torch.mean(
                    torch.abs(fake_img_passed - gen_image))
                d_real_sensitivity_loss = torch.mean(
                    torch.abs(fake_img_passed - Variable(real_img)))
                d_loss = d_real_loss - began_k * (
                    0.5 * d_fake_loss + 0.5 * d_real_sensitivity_loss)
                balance = (model_options['began_gamma'] * d_real_loss -
                           0.5 * d_fake_loss -
                           0.5 * d_real_sensitivity_loss).data[0]
            elif constants.USE_CLS:
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_wrong_loss = torch.mean(
                    torch.abs(fake_img_passed - Variable(wrong_img)))
                d_fake_loss = torch.mean(
                    torch.abs(fake_img_passed - gen_image))
                d_loss = 0.5 * d_real_loss + 0.5 * d_wrong_loss - began_k * d_fake_loss
                balance = (model_options['began_gamma'] *
                           (0.5 * d_real_loss + 0.5 * d_wrong_loss) -
                           d_fake_loss).data[0]
            # No CLS option
            else:
                d_real_loss = torch.mean(
                    torch.abs(real_img_passed - Variable(real_img)))
                d_fake_loss = torch.mean(
                    torch.abs(fake_img_passed - gen_image))
                d_loss = d_real_loss - began_k * d_fake_loss
                # Update began k value
                balance = (model_options['began_gamma'] * d_real_loss -
                           d_fake_loss).data[0]

            # Calculate G loss
            if constants.USE_REAL_LS:
                g_loss = 0.5 * torch.mean(
                    torch.abs(fake_img_passed - gen_image))
                g_loss += 0.5 * torch.mean(
                    torch.abs(fake_img_passed - Variable(real_img)))
            elif constants.USE_CLS:
                g_loss = torch.mean(torch.abs(fake_img_passed - gen_image))
                g_loss -= 0.5 * torch.mean(
                    torch.abs(fake_img_passed - Variable(wrong_img)))
            else:
                # L_G = L(y_f)
                g_loss = torch.mean(torch.abs(fake_img_passed - gen_image))

            # M = L(y_r) + |gamma * L(y_r) - L(y_f)|
            convergence_val = d_real_loss + abs(balance)

            if i % constants.LOSS_SAVE_IDX == 0:
                losses['val']['generator'].append((g_loss.data[0], epoch, i))
                losses['val']['discriminator'].append(
                    (d_loss.data[0], epoch, i))
                losses['val']['converge'].append(
                    (convergence_val.data[0], epoch, i))

        # Val summary (values come from the last val batch).
        print('Val G Loss: ', g_loss.data[0])
        print('Val D Loss: ', d_loss.data[0])
        print('Val Convergence: ', convergence_val.data[0])

        # Save losses
        torch.save(losses, constants.SAVE_PATH + 'losses')

        # Save images (first two generated samples and their D
        # reconstructions from the last val batch).
        vutils.save_image(gen_image[0].data.cpu(),
                          constants.SAVE_PATH + 'images/gen0_epoch' +
                          str(epoch) + '.png',
                          normalize=True)
        vutils.save_image(gen_image[1].data.cpu(),
                          constants.SAVE_PATH + 'images/gen1_epoch' +
                          str(epoch) + '.png',
                          normalize=True)
        vutils.save_image(fake_img_passed[0].data.cpu(),
                          constants.SAVE_PATH + 'images/gen_recon0_epoch' +
                          str(epoch) + '.png',
                          normalize=True)
        vutils.save_image(fake_img_passed[1].data.cpu(),
                          constants.SAVE_PATH + 'images/gen_recon1_epoch' +
                          str(epoch) + '.png',
                          normalize=True)
        # vutils.save_image(real_img_passed[0].data.cpu(),
        #                   constants.SAVE_PATH + 'images/real_recon0_epoch' + str(epoch) + '.png',
        #                   normalize=True)
        # vutils.save_image(real_img_passed[1].data.cpu(),
        #                   constants.SAVE_PATH + 'images/real_recon1_epoch' + str(epoch) + '.png',
        #                   normalize=True)

        # Save model
        # Precedence: (periodic and not epoch 0) OR final epoch.
        if epoch % constants.CHECKPOINT_FREQUENCY == 0 and epoch != 0 or epoch == constants.NUM_EPOCHS - 1:
            save_checkpoint = {
                'epoch': epoch,
                'g_dict': generator.state_dict(),
                'd_dict': discriminator.state_dict(),
                'g_optimizer': g_optimizer.state_dict(),
                'd_optimizer': d_optimizer.state_dict(),
                'began_k': began_k
            }
            torch.save(save_checkpoint,
                       constants.SAVE_PATH + 'weights/epoch' + str(epoch))
def main():
    """Supervised pre-training entry point for few-shot backbones.

    Builds the dataset-specific train/val/meta loaders, creates (or resumes
    via ``latest.pth``) the backbone, then runs cross-entropy training with
    per-epoch validation, tensorboard logging and periodic checkpointing.

    Fix vs. original: ``os.symlink`` raises ``FileExistsError`` when the
    link already exists, i.e. on every periodic save after the first; the
    stale link is now removed before re-linking.
    """
    opt = parse_option()

    # dataloader
    train_partition = 'trainval' if opt.use_trainval else 'train'
    if opt.dataset == 'miniImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(ImageNet(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(ImageNet(args=opt,
                                         partition='val',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaImageNet(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaImageNet(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        # 64 base classes, 80 when the val split is folded into training.
        if opt.use_trainval:
            n_cls = 80
        else:
            n_cls = 64
    elif opt.dataset == 'tieredImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(TieredImageNet(args=opt,
                                                 partition=train_partition,
                                                 transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(TieredImageNet(args=opt,
                                               partition='train_phase_val',
                                               transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='test',
            train_transform=train_trans,
            test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='val',
            train_transform=train_trans,
            test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 448
        else:
            n_cls = 351
    elif opt.dataset == 'CIFAR-FS' or opt.dataset == 'FC100':
        # CIFAR-derived benchmarks always use transform set 'D'.
        train_trans, test_trans = transforms_options['D']
        train_loader = DataLoader(CIFAR100(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        # NOTE(review): val_loader reads partition='train' — confirm this
        # mirrors the upstream protocol rather than a typo.
        val_loader = DataLoader(CIFAR100(args=opt,
                                         partition='train',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaCIFAR100(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaCIFAR100(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            if opt.dataset == 'CIFAR-FS':
                n_cls = 64
            elif opt.dataset == 'FC100':
                n_cls = 60
            else:
                raise NotImplementedError('dataset not supported: {}'.format(
                    opt.dataset))
    else:
        raise NotImplementedError(opt.dataset)

    # model: fresh backbone, or resume from the 'latest.pth' symlink.
    if not opt.load_latest:
        model = create_model(opt.model, n_cls, opt.dataset)
    else:
        latest_file = os.path.join(opt.save_folder, 'latest.pth')
        model = load_teacher(latest_file, n_cls, opt.dataset)

    # optimizer
    if opt.adam:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=opt.learning_rate,
                                     weight_decay=0.0005)
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=opt.learning_rate,
                              momentum=opt.momentum,
                              weight_decay=opt.weight_decay)

    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        if opt.n_gpu > 1:
            model = nn.DataParallel(model)
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)

    # routine: supervised pre-training
    for epoch in range(1, opt.epochs + 1):
        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, model, criterion,
                                      optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model,
                                                      criterion, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model.state_dict()
                if opt.n_gpu <= 1 else model.module.state_dict(),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            latest_file = os.path.join(opt.save_folder, 'latest.pth')
            # os.symlink fails with FileExistsError if the link already
            # exists (every save after the first) — drop the stale link.
            # lexists() also catches a dangling symlink.
            if os.path.lexists(latest_file):
                os.remove(latest_file)
            os.symlink(save_file, latest_file)

    # save the last model
    state = {
        'opt': opt,
        'model': model.state_dict()
        if opt.n_gpu <= 1 else model.module.state_dict(),
    }
    save_file = os.path.join(opt.save_folder,
                             '{}_last.pth'.format(opt.model))
    torch.save(state, save_file)
def main():
    """Supervised pre-training with optional label-smoothing / GCE loss and
    an optional orthogonal-projection auxiliary loss (OPL / POPL).

    Dumps the parsed options to ``<tb_folder>/config.json``, builds the
    dataset-specific loaders, trains for ``opt.epochs`` epochs with
    per-epoch validation, tensorboard logging and periodic checkpoints.
    """
    opt = parse_option()

    # Persist the full run configuration next to the tensorboard logs.
    with open(f"{opt.tb_folder}/config.json", "w") as fo:
        fo.write(json.dumps(vars(opt), indent=4))

    # dataloader
    train_partition = 'trainval' if opt.use_trainval else 'train'
    if opt.dataset == 'miniImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(ImageNet(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(ImageNet(args=opt,
                                         partition='val',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaImageNet(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaImageNet(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        # 64 base classes; 80 when val is folded into training.
        if opt.use_trainval:
            n_cls = 80
        else:
            n_cls = 64
    elif opt.dataset == 'tieredImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(TieredImageNet(args=opt,
                                                 partition=train_partition,
                                                 transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(TieredImageNet(args=opt,
                                               partition='train_phase_val',
                                               transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='test',
            train_transform=train_trans,
            test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='val',
            train_transform=train_trans,
            test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 448
        else:
            n_cls = 351
    elif opt.dataset == 'CIFAR-FS' or opt.dataset == 'FC100':
        # CIFAR-derived benchmarks always use transform set 'D'.
        train_trans, test_trans = transforms_options['D']
        train_loader = DataLoader(CIFAR100(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        # NOTE(review): val_loader reads partition='train' — confirm this
        # mirrors the upstream protocol rather than a typo.
        val_loader = DataLoader(CIFAR100(args=opt,
                                         partition='train',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaCIFAR100(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaCIFAR100(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            if opt.dataset == 'CIFAR-FS':
                n_cls = 64
            elif opt.dataset == 'FC100':
                n_cls = 60
            else:
                raise NotImplementedError('dataset not supported: {}'.format(
                    opt.dataset))
    elif opt.dataset == "imagenet":
        # Full ImageNet: plain folder datasets, no meta loaders.
        train_trans, test_trans = transforms_options["A"]
        train_dataset = ImagenetFolder(root=os.path.join(
            opt.data_root, "train"),
                                       transform=train_trans)
        val_dataset = ImagenetFolder(root=os.path.join(opt.data_root, "val"),
                                     transform=test_trans)
        train_loader = DataLoader(train_dataset,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(val_dataset,
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        n_cls = 1000
    else:
        raise NotImplementedError(opt.dataset)

    # model (optionally with the self-supervised rotation head, SRL)
    model = create_model(opt.model, n_cls, opt.dataset, use_srl=opt.srl)

    # optimizer
    if opt.adam:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=opt.learning_rate,
                                     weight_decay=0.0005)
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=opt.learning_rate,
                              momentum=opt.momentum,
                              weight_decay=opt.weight_decay)

    # Primary classification loss: label smoothing, guided complement
    # entropy, or plain cross-entropy.
    if opt.label_smoothing:
        criterion = LabelSmoothing(smoothing=opt.smoothing_ratio)
    elif opt.gce:
        criterion = GuidedComplementEntropy(alpha=opt.gce_alpha,
                                            classes=n_cls)
    else:
        criterion = nn.CrossEntropyLoss()

    # Optional orthogonal-projection auxiliary loss (None = disabled).
    if opt.opl:
        auxiliary_loss = OrthogonalProjectionLoss(use_attention=True)
    elif opt.popl:
        auxiliary_loss = PerpetualOrthogonalProjectionLoss(feat_dim=640)
    else:
        auxiliary_loss = None

    if torch.cuda.is_available():
        if opt.n_gpu > 1:
            model = nn.DataParallel(model)
        model = model.cuda()
        criterion = criterion.cuda()
        if auxiliary_loss is not None:
            auxiliary_loss = auxiliary_loss.cuda()
        cudnn.benchmark = True

    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)
    else:
        scheduler = None

    # routine: supervised pre-training
    for epoch in range(1, opt.epochs + 1):
        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        # With an auxiliary loss, train() additionally returns the
        # cross-entropy and OPL components (unpacked via a list target).
        if auxiliary_loss is not None:
            train_acc, train_loss, [train_cel, train_opl
                                    ] = train(epoch=epoch,
                                              train_loader=train_loader,
                                              model=model,
                                              criterion=criterion,
                                              optimizer=optimizer,
                                              opt=opt,
                                              auxiliary=auxiliary_loss)
        else:
            train_acc, train_loss = train(epoch=epoch,
                                          train_loader=train_loader,
                                          model=model,
                                          criterion=criterion,
                                          optimizer=optimizer,
                                          opt=opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('accuracy/train_acc', train_acc, epoch)
        logger.log_value('train_losses/loss', train_loss, epoch)
        if auxiliary_loss is not None:
            logger.log_value('train_losses/cel', train_cel, epoch)
            logger.log_value('train_losses/opl', train_opl, epoch)
        else:
            # Without OPL the total loss IS the cross-entropy loss.
            logger.log_value('train_losses/cel', train_loss, epoch)

        if auxiliary_loss is not None:
            test_acc, test_acc_top5, test_loss, [test_cel, test_opl] = \
                validate(val_loader, model, criterion, opt, auxiliary=auxiliary_loss)
        else:
            test_acc, test_acc_top5, test_loss = validate(
                val_loader, model, criterion, opt)

        logger.log_value('accuracy/test_acc', test_acc, epoch)
        logger.log_value('accuracy/test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_losses/loss', test_loss, epoch)
        if auxiliary_loss is not None:
            logger.log_value('test_losses/cel', test_cel, epoch)
            logger.log_value('test_losses/opl', test_opl, epoch)
        else:
            logger.log_value('test_losses/cel', test_loss, epoch)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model.state_dict()
                if opt.n_gpu <= 1 else model.module.state_dict(),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

    # save the last model
    state = {
        'opt': opt,
        'model': model.state_dict()
        if opt.n_gpu <= 1 else model.module.state_dict(),
    }
    save_file = os.path.join(opt.save_folder,
                             '{}_last.pth'.format(opt.model))
    torch.save(state, save_file)
def train_net(model, args):
    """Train a keypoint-heatmap model on the FashionAI dataset by MSE
    regression against target heatmaps.

    Runs until ``config.max_iter`` iterations, decaying the learning rate
    per iteration, printing running averages every ``config.display`` steps
    and checkpointing every 5000 iterations to ``<iters>.pth.tar``.

    Fixes vs. original:
      * ``.cuda(async=True)`` -> ``.cuda(non_blocking=True)`` — ``async``
        is a reserved keyword since Python 3.7 (SyntaxError).
      * ``loss.data[0]`` -> ``loss.item()`` — indexing a 0-dim tensor is an
        error in PyTorch >= 0.5.
      * ``requires_grad != False`` -> plain truthiness test.
      * removed the unused function-local matplotlib import (only referenced
        by commented-out debug plotting).

    Args:
        model: network mapping an image batch to heatmaps; trained in place.
        args: unused here (kept for interface compatibility with callers).
    """
    ann_path = '../FashionAI/data/train/Annotations/trainminusval.csv'
    img_dir = '../FashionAI/data/train/'
    stride = 8  # network output stride (input 384 -> 48x48 heatmaps)
    cudnn.benchmark = True
    config = util.Config('./config.yml')

    train_loader = torch.utils.data.DataLoader(dataset_loader.dataset_loader(
        img_dir, ann_path, stride,
        Mytransforms.Compose([
            Mytransforms.RandomResized(),
            Mytransforms.RandomRotate(40),
            Mytransforms.RandomCrop(384),
        ]),
        sigma=15),
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)

    criterion = nn.MSELoss().cuda()

    # One param group per trainable parameter, all at the base LR.
    params = []
    for key, value in model.named_parameters():
        if value.requires_grad:
            params.append({'params': value, 'lr': config.base_lr})
    optimizer = torch.optim.SGD(params,
                                config.base_lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)

    # model.train()  # only for bn and dropout
    # NOTE(review): training with model.eval() freezes BN statistics and
    # disables dropout — deliberate in the original; confirm intended.
    model.eval()

    iters = 0
    batch_time = util.AverageMeter()
    data_time = util.AverageMeter()
    losses = util.AverageMeter()
    losses_list = [util.AverageMeter() for i in range(12)]
    end = time.time()

    # Scale the per-pixel MSE up to roughly a per-heatmap sum, matching the
    # original (Caffe) implementation's loss magnitude.
    heat_weight = 48 * 48 * 25 / 2.0  # for convenient to compare with origin code
    # heat_weight = 1

    while iters < config.max_iter:
        for i, (images, heatmap) in enumerate(train_loader):
            learning_rate = util.adjust_learning_rate(
                optimizer, iters, config.base_lr,
                policy=config.lr_policy,
                policy_parameter=config.policy_parameter)
            data_time.update(time.time() - end)

            images = images.cuda(non_blocking=True)
            heatmap = heatmap.cuda(non_blocking=True)
            input_var = torch.autograd.Variable(images)
            heatmap_var = torch.autograd.Variable(heatmap)

            heat = model(input_var)

            loss1 = criterion(heat, heatmap_var) * heat_weight
            # loss2 = criterion(heat4, heatmap_var) * heat_weight
            # loss3 = criterion(heat5, heatmap_var) * heat_weight
            # loss4 = criterion(heat6, heatmap_var) * heat_weight
            # loss5 = criterion(heat, heatmap_var)
            # loss6 = criterion(heat, heatmap_var)

            loss = loss1  # + loss2 + loss3# + loss4# + loss5 + loss6
            losses.update(loss.item(), images.size(0))
            loss_list = [loss1]  #, loss2, loss3]# , loss4 ]# , loss5 , loss6]
            for cnt, l in enumerate(loss_list):
                losses_list[cnt].update(l.item(), images.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()

            iters += 1
            if iters % config.display == 0:
                print(
                    'Train Iteration: {0}\t'
                    'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                    'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                    'Learning rate = {2}\n'
                    'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.format(
                        iters,
                        config.display,
                        learning_rate,
                        batch_time=batch_time,
                        data_time=data_time,
                        loss=losses))
                for cnt in range(0, 1):
                    print(
                        'Loss{0}_1 = {loss1.val:.8f} (ave = {loss1.avg:.8f})'.
                        format(cnt + 1, loss1=losses_list[cnt]))
                print(
                    time.strftime(
                        '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                        time.localtime()))
                # Reset running averages for the next display window.
                batch_time.reset()
                data_time.reset()
                losses.reset()
                for cnt in range(12):
                    losses_list[cnt].reset()

            if iters % 5000 == 0:
                torch.save(
                    {
                        'iter': iters,
                        'state_dict': model.state_dict(),
                    },
                    str(iters) + '.pth.tar')

            if iters == config.max_iter:
                break
    return
epoch_loss += c_loss optimizer.step() if ind % 100 == 0: print("iter [%d] CLoss: %.4f" % (ind, c_loss)) if ind > args.max_iter: break print("Epoch [%d] Loss: %.4f" % (epoch + 1, epoch_loss)) log_value('loss', epoch_loss, epoch) log_value('lr', args.lr, epoch) if args.adjust_lr: args.lr = adjust_learning_rate(optimizer, args.lr, args.weight_decay, epoch, args.epochs) if args.net == "fcn" or args.net == "psp": checkpoint_fn = os.path.join( args.pth_dir, "%s-%s-res%s-%s.pth.tar" % (args.savename, args.net, args.res, epoch + 1)) else: checkpoint_fn = os.path.join( args.pth_dir, "%s-%s-%s.pth.tar" % (args.savename, args.net, epoch + 1)) args.start_epoch = epoch + 1 save_dic = { 'args': args, 'epoch': epoch + 1, 'g1_state_dict': model_g1.state_dict(),
def main():
    """Supervised pre-training entry point.

    Builds data loaders, model, optimizer and an L2-normalized feature
    memory bank, trains for ``opt.epochs`` epochs, checkpoints periodically,
    and logs metrics to Weights & Biases.
    """
    opt = parse_option()

    # wandb bookkeeping; the project is named after the last component of model_path.
    wandb.init(project=opt.model_path.split("/")[-1], tags=opt.tags)
    wandb.config.update(opt)
    wandb.save('*.py')
    wandb.run.save()

    train_loader, val_loader, meta_testloader, meta_valloader, n_cls, no_sample = get_dataloaders(
        opt)

    # model
    model = create_model(opt.model,
                         n_cls,
                         opt.dataset,
                         n_trans=opt.trans,
                         embd_sz=opt.memfeature_size)
    wandb.watch(model)

    # optimizer
    if opt.adam:
        print("Adam")
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=opt.learning_rate,
                                     weight_decay=0.0005)
    else:
        print("SGD")
        optimizer = optim.SGD(model.parameters(),
                              lr=opt.learning_rate,
                              momentum=opt.momentum,
                              weight_decay=opt.weight_decay)

    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        if opt.n_gpu > 1:
            model = nn.DataParallel(model)
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)

    # Memory bank: one random row per training sample, L2-normalized.
    MemBank = np.random.randn(no_sample, opt.memfeature_size)
    MemBank = torch.tensor(MemBank, dtype=torch.float)
    # BUGFIX: the original called .cuda() unconditionally even though the
    # model/criterion moves above are guarded — that crashes on CPU-only
    # hosts. Apply the same guard here.
    if torch.cuda.is_available():
        MemBank = MemBank.cuda()
    MemBankNorm = torch.norm(MemBank, dim=1, keepdim=True)
    MemBank = MemBank / (MemBankNorm + 1e-6)  # epsilon avoids divide-by-zero

    # routine: supervised pre-training
    for epoch in range(1, opt.epochs + 1):

        # NOTE(review): scheduler.step() is called before train(); with
        # last_epoch=-1 this skips the initial LR. Kept as-is to preserve the
        # original training schedule.
        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss, MemBank = train(epoch, train_loader, model,
                                               criterion, optimizer, opt,
                                               MemBank)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # Validation / meta-eval are disabled (stubbed to zeros); the original
        # calls are kept in comments for reference.
        val_acc, val_acc_top5, val_loss = 0, 0, 0  #validate(val_loader, model, criterion, opt)

        #validate
        start = time.time()
        meta_val_acc, meta_val_std = 0, 0  #meta_test(model, meta_valloader)
        test_time = time.time() - start
        print(
            'Meta Val Acc : {:.4f}, Meta Val std: {:.4f}, Time: {:.1f}'.format(
                meta_val_acc, meta_val_std, test_time))

        #evaluate
        start = time.time()
        meta_test_acc, meta_test_std = 0, 0  #meta_test(model, meta_testloader)
        test_time = time.time() - start
        print('Meta Test Acc: {:.4f}, Meta Test std: {:.4f}, Time: {:.1f}'.
              format(meta_test_acc, meta_test_std, test_time))

        # regular saving
        if epoch % opt.save_freq == 0 or epoch == opt.epochs:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'optimizer': optimizer.state_dict(),
                'model': model.state_dict(),
            }
            save_file = os.path.join(opt.save_folder,
                                     'model_' + str(wandb.run.name) + '.pth')
            torch.save(state, save_file)

            #wandb saving
            torch.save(state, os.path.join(wandb.run.dir, "model.pth"))

        wandb.log({
            'epoch': epoch,
            'Train Acc': train_acc,
            'Train Loss': train_loss,
            'Val Acc': val_acc,
            'Val Loss': val_loss,
            'Meta Test Acc': meta_test_acc,
            'Meta Test std': meta_test_std,
            'Meta Val Acc': meta_val_acc,
            'Meta Val std': meta_val_std
        })

    #final report
    print("GENERATING FINAL REPORT")
    generate_final_report(model, opt, wandb)

    #remove output.txt log file
    output_log_file = os.path.join(wandb.run.dir, "output.log")
    if os.path.isfile(output_log_file):
        os.remove(output_log_file)
    else:
        ## Show an error ##
        print("Error: %s file not found" % output_log_file)
def main():
    """CompReSS distillation entry point.

    Probes teacher/student feature dimensions with a dummy batch, builds the
    CompReSS criterion, then trains the student to mimic the teacher,
    checkpointing every ``args.save_freq`` epochs.
    """
    args = parse_option()
    os.makedirs(args.checkpoint_path, exist_ok=True)

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    train_loader = get_train_loader(args)

    teacher = get_teacher_model(args)
    student = get_student_model(args)

    # Calculate feature dimension of student and teacher by forwarding a
    # dummy batch. IMPROVEMENT: run the probe under no_grad() — only the
    # output shapes are used, so building autograd graphs wastes memory.
    teacher.eval()
    student.eval()
    with torch.no_grad():
        tmp_input = torch.randn(2, 3, 224, 224)
        feat_t = teacher.forward(tmp_input, 0)
        feat_s = student(tmp_input)
    student_feats_dim = feat_s.shape[-1]
    teacher_feats_dim = feat_t.shape[-1]

    compress = CompReSS(teacher_feats_dim, student_feats_dim,
                        args.compress_memory_size, args.compress_t)

    student = torch.nn.DataParallel(student).cuda()
    # NOTE(review): nn.Module has no .gpu() method; this relies on a custom
    # method of the wrapper returned by get_teacher_model — confirm it exists
    # (a plain module would need .cuda() here).
    teacher.gpu()

    optimizer = torch.optim.SGD(student.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True
    args.start_epoch = 1

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        loss = train_student(epoch, train_loader, teacher, student, compress,
                             optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # saving the model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'model': student.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }

            save_file = os.path.join(
                args.checkpoint_path,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

            # help release GPU memory
            del state
            torch.cuda.empty_cache()
def train_net(model, args):
    """Train a 6-stage heatmap regression model (CPM-style) on FashionAI.

    Runs until ``config.max_iter`` iterations, summing the MSE loss of all
    six stage outputs (each scaled by ``heat_weight``), printing meter stats
    every ``config.display`` iterations and saving a checkpoint every 5000.
    ``args`` is accepted for interface compatibility but unused here.
    """
    ann_path = '../FashionAI/data/train/Annotations/trainminusval.csv'
    img_dir = '../FashionAI/data/train/'
    stride = 8
    cudnn.benchmark = True
    config = util.Config('./config.yml')

    train_loader = torch.utils.data.DataLoader(dataset_loader.dataset_loader(
        img_dir, ann_path, stride, transforms.ToTensor()),
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)

    criterion = nn.MSELoss().cuda()
    params, multiple = get_parameters(model, config, False)
    optimizer = torch.optim.SGD(params,
                                config.base_lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)

    model.train()

    iters = 0
    batch_time = util.AverageMeter()
    data_time = util.AverageMeter()
    losses = util.AverageMeter()
    losses_list = [util.AverageMeter() for i in range(12)]
    end = time.time()

    heat_weight = 48 * 48 * 25 / 2.0  # for convenient to compare with origin code
    # heat_weight = 1

    while iters < config.max_iter:
        for i, (input, heatmap) in enumerate(train_loader):
            learning_rate = util.adjust_learning_rate(
                optimizer, iters, config.base_lr,
                policy=config.lr_policy,
                policy_parameter=config.policy_parameter,
                multiple=multiple)
            data_time.update(time.time() - end)

            # BUGFIX: `async=True` is a SyntaxError on Python >= 3.7 (`async`
            # became a keyword); PyTorch >= 0.4 spells it `non_blocking`.
            input = input.cuda(non_blocking=True)
            heatmap = heatmap.cuda(non_blocking=True)

            # Variable is a deprecated no-op wrapper on PyTorch >= 0.4;
            # kept for compatibility with the rest of the codebase.
            input_var = torch.autograd.Variable(input)
            heatmap_var = torch.autograd.Variable(heatmap)

            # One predicted heatmap per refinement stage.
            heat1, heat2, heat3, heat4, heat5, heat6 = model(input_var)

            loss1 = criterion(heat1, heatmap_var) * heat_weight
            loss2 = criterion(heat2, heatmap_var) * heat_weight
            loss3 = criterion(heat3, heatmap_var) * heat_weight
            loss4 = criterion(heat4, heatmap_var) * heat_weight
            loss5 = criterion(heat5, heatmap_var) * heat_weight
            loss6 = criterion(heat6, heatmap_var) * heat_weight

            loss = loss1 + loss2 + loss3 + loss4 + loss5 + loss6

            # BUGFIX: `loss.data[0]` indexes a 0-dim tensor and raises on
            # PyTorch >= 0.5; `.item()` is the supported scalar accessor.
            losses.update(loss.item(), input.size(0))
            loss_list = [loss1, loss2, loss3, loss4, loss5, loss6]
            for cnt, l in enumerate(loss_list):
                losses_list[cnt].update(l.item(), input.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()

            iters += 1
            if iters % config.display == 0:
                # BUGFIX: data_time spec was `{data_time.avg:3f}` (width 3,
                # full precision) — clearly meant `.3f` like batch_time.
                print(
                    'Train Iteration: {0}\t'
                    'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                    'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:.3f})\n'
                    'Learning rate = {2}\n'
                    'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.format(
                        iters, config.display, learning_rate,
                        batch_time=batch_time,
                        data_time=data_time,
                        loss=losses))
                for cnt in range(0, 6):
                    print(
                        'Loss{0}_1 = {loss1.val:.8f} (ave = {loss1.avg:.8f})'.
                        format(cnt + 1, loss1=losses_list[cnt]))
                print(
                    time.strftime(
                        '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                        time.localtime()))

                batch_time.reset()
                data_time.reset()
                losses.reset()
                # 12 meters are allocated but only 6 used; resetting all is harmless.
                for cnt in range(12):
                    losses_list[cnt].reset()

            if iters % 5000 == 0:
                torch.save({
                    'iter': iters,
                    'state_dict': model.state_dict(),
                }, str(iters) + '.pth.tar')

            if iters == config.max_iter:
                break
    return