        optimizer_decoder.step()

        # Update the discriminator only when train_dis is set (keeps D from
        # overpowering the VAE branch)
        if train_dis:
            NetD.zero_grad()
            loss_discriminator.backward()
            optimizer_discriminator.step()

        print('[%d/%d][%d/%d] loss_discriminator: %.4f loss_decoder: %.4f '
              'loss_encoder: %.4f D_x: %.4f D_G_z1: %.4f D_G_z2: %.4f'
              % (epoch, opt.niter, i, len(dataloader),
                 loss_discriminator.item(), loss_decoder.item(),
                 loss_encoder.item(), D_x, D_G_z1, D_G_z2))

    # End of epoch: reconstruct a fixed batch to track visual progress
    mu, logvar = NetE(fixed_batch)
    sample = Sampler([mu, logvar], device)
    rec_real = NetG(sample)
    vutils.save_image(rec_real,
                      '%s/rec_real_epoch_%03d.png' % (opt.outf, epoch),
                      normalize=True)

    if epoch % 10 == 0:
        torch.save(NetE.state_dict(), '%s/NetE_epoch_%d.pth' % (opt.outf, epoch))
        torch.save(NetG.state_dict(), '%s/NetG_epoch_%d.pth' % (opt.outf, epoch))
        torch.save(NetD.state_dict(), '%s/NetD_epoch_%d.pth' % (opt.outf, epoch))

    # Generate images from prior noise z ~ N(0, 1); torch.randn already samples
    # the standard normal, so no extra normal_() call is needed
    noise = torch.randn(batch_size, nz, 1, 1, device=device)
    rec_noise = NetG(noise)
    vutils.save_image(rec_noise, '%s/rec_noise.png' % opt.outf, normalize=True)
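# The fragment above uses loss_discriminator, D_x, D_G_z1 and D_G_z2 without
# showing how they are computed. Below is a minimal sketch in the usual
# DCGAN style, assuming NetD ends in a sigmoid (one probability per image)
# and a BCE criterion; the original code's exact formulation may differ.
criterion = nn.BCELoss()
real_label, fake_label = 1.0, 0.0

output_real = NetD(input).view(-1)
D_x = output_real.mean().item()      # D's mean score on real images
loss_D_real = criterion(output_real, torch.full_like(output_real, real_label))

output_fake = NetD(rec_real.detach()).view(-1)
D_G_z1 = output_fake.mean().item()   # score on reconstructions, before the D update
loss_D_fake = criterion(output_fake, torch.full_like(output_fake, fake_label))

loss_discriminator = loss_D_real + loss_D_fake
# D_G_z2 is the same score re-evaluated during the generator/decoder update
D_G_z2 = NetD(rec_real).view(-1).mean().item()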
        rec_real = NetG(sample)

        # Per-sample reconstruction error and KL divergence of q(z|x) from N(0, I)
        errDec_MSE = torch.sum(0.5 * (input - rec_real) ** 2, 1)
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

        # The decoder only minimizes reconstruction; the encoder also pays the
        # KL term (KLD is already a scalar, so no extra sum is needed)
        loss_decoder = torch.sum(errDec_MSE)
        loss_encoder = torch.sum(errDec_MSE) + KLD

        NetE.zero_grad()
        loss_encoder.backward(retain_graph=True)  # keep the graph for the decoder pass
        optimizer_encoder.step()

        NetG.zero_grad()
        loss_decoder.backward()  # last backward through this graph
        optimizer_decoder.step()

        print('[%d/%d][%d/%d] loss_decoder: %.4f loss_encoder: %.4f'
              % (epoch, opt.niter, i, len(dataloader),
                 loss_decoder.item(), loss_encoder.item()))

    # End of epoch: reconstruct a fixed batch to track visual progress
    mu, logvar = NetE(fixed_batch)
    sample = Sampler([mu, logvar], device)
    rec_real = NetG(sample)
    vutils.save_image(rec_real,
                      '%s/rec_real_epoch_%03d.png' % (opt.outf, epoch),
                      normalize=True)

    if epoch % 10 == 0:
        torch.save(NetE.state_dict(), '%s/NetE_epoch_%d.pth' % (opt.outf, epoch))
        torch.save(NetG.state_dict(), '%s/NetG_epoch_%d.pth' % (opt.outf, epoch))
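# Both training loops call Sampler([mu, logvar], device), whose definition is
# not shown. A minimal sketch of the standard reparameterization trick
# (z = mu + sigma * eps, eps ~ N(0, I)); the original helper may differ in detail.
def Sampler(params, device):
    mu, logvar = params
    std = torch.exp(0.5 * logvar)              # logvar = log(sigma^2)
    eps = torch.randn(std.size(), device=device)
    return mu + eps * std                      # differentiable sample from q(z|x)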
def main(args):
    # Create model directory for saving trained models
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing: augmentation plus the normalization expected by the
    # pretrained ResNet
    transform = transforms.Compose([
        transforms.RandomCrop(args.im_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Configure the network
    encoder = Encoder(args.embed_size).to(device)
    decoder = Decoder(args.embed_size, args.hidden_size, len(vocab),
                      args.num_layers).to(device)

    # Loss and optimizer; only the decoder and the encoder's new linear/BN
    # layers are optimized, so the pretrained CNN backbone stays frozen
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) \
             + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Mini-batch
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.num_epochs, i, total_step, loss.item()))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, 'decoder.ckpt'))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, 'encoder.ckpt'))
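# The targets line above is the subtle part: pack_padded_sequence flattens the
# padded caption batch into one tensor of valid (non-pad) tokens, ordered by
# timestep, so it lines up with the decoder's packed output for the loss.
# A small self-contained illustration with hypothetical toy values:
import torch
from torch.nn.utils.rnn import pack_padded_sequence

captions = torch.tensor([[1, 2, 3, 4],
                         [5, 6, 7, 0]])  # second caption padded with 0
lengths = [4, 3]                         # must be sorted in decreasing order
packed = pack_padded_sequence(captions, lengths, batch_first=True)
print(packed.data)          # tensor([1, 5, 2, 6, 3, 7, 4]) (pad token dropped)
print(packed.batch_sizes)   # tensor([2, 2, 2, 1])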
def train(args):
    # Preprocess the data: build the vocabulary and caption data files
    preprocess(args['cap_path'], args['vocab_path'], args['data_path'])

    if not os.path.exists(args['model_path']):
        os.mkdir(args['model_path'])

    # Image preprocessing and augmentation
    transform = transforms.Compose([
        transforms.Resize((args['resize'], args['resize'])),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)
    with open(args['data_path'], 'rb') as f:
        Data = pickle.load(f)

    data_loader = get_loader(args['train_img_path'], Data, vocab, transform,
                             args['batch_size'], shuffle=True,
                             num_workers=args['num_workers'])

    encoder = Encoder(args['embed_size'], args['pooling_kernel']).cuda()
    decoder = Decoder(args['embed_size'], args['hidden_size'], len(vocab),
                      args['num_layers']).cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) \
             + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args['learning_rate'])

    total_step = len(data_loader)
    for epoch in range(args['num_epochs']):
        for i, (images, captions, lengths) in enumerate(data_loader):
            images = images.cuda()
            captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Log training info
            if i % args['log_step'] == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args['num_epochs'], i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save model checkpoints every save_step iterations
            if (i + 1) % args['save_step'] == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args['model_path'],
                                        'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args['model_path'],
                                        'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

        # Also save the models at the end of every epoch
        torch.save(decoder.state_dict(),
                   os.path.join(args['model_path'],
                                'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
        torch.save(encoder.state_dict(),
                   os.path.join(args['model_path'],
                                'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
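# train() expects a plain dict of hyperparameters. A hypothetical invocation;
# every path and value below is illustrative, not taken from the original project:
if __name__ == '__main__':
    args = {
        'cap_path': 'data/captions.json',
        'vocab_path': 'data/vocab.pkl',
        'data_path': 'data/data.pkl',
        'model_path': 'models/',
        'train_img_path': 'data/train_images/',
        'resize': 224,
        'batch_size': 128,
        'num_workers': 2,
        'embed_size': 256,
        'pooling_kernel': 2,
        'hidden_size': 512,
        'num_layers': 1,
        'learning_rate': 1e-3,
        'num_epochs': 5,
        'log_step': 10,
        'save_step': 1000,
    }
    train(args)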