def build_models(self):
    """Build the image and text encoders and report the epoch to resume from.

    When ``cfg.text_encoder_path`` is non-empty, both encoders are restored
    from disk: the text encoder from that path, and the image encoder from
    the same path with ``'text_encoder'`` replaced by ``'image_encoder'``.
    A checkpoint may be either a bare state dict or a dict holding the state
    dict under the ``'model'`` key. Every parameter of both encoders is left
    trainable, and both encoders are moved to the GPU when ``cfg.CUDA`` is set.

    Returns:
        list: ``[text_encoder, image_encoder, epoch]``. ``epoch`` is 0 when
        nothing was restored; otherwise it is the integer sliced out of the
        checkpoint path, plus one.
    """

    def _restore(model, checkpoint_file, banner):
        # Announce the file, read it onto the CPU, then unwrap the optional
        # {'model': state_dict} container before loading.
        print(banner, checkpoint_file)
        payload = torch.load(checkpoint_file, map_location='cpu')
        weights = payload['model'] if 'model' in payload.keys() else payload
        model.load_state_dict(weights)

    def _enable_grad(model):
        # Make sure every parameter takes part in back-propagation.
        for param in model.parameters():
            param.requires_grad = True

    # ------------------------- image encoder -------------------------
    image_encoder = ImageEncoder(output_channels=cfg.hidden_dim)
    if cfg.text_encoder_path != '':
        img_encoder_path = cfg.text_encoder_path.replace(
            'text_encoder', 'image_encoder')
        _restore(image_encoder, img_encoder_path, 'Load image encoder from:')
    _enable_grad(image_encoder)

    # -------------------------- text encoder -------------------------
    epoch = 0
    text_encoder = TextEncoder(bert_config=self.bert_config)
    if cfg.text_encoder_path != '':
        # NOTE(review): `istart` / `iend` are not defined anywhere in this
        # method; unless they exist at module level this line raises
        # NameError. Confirm where the slice bounds are meant to come from.
        epoch = int(cfg.text_encoder_path[istart:iend]) + 1
        text_encoder_path = cfg.text_encoder_path
        _restore(text_encoder, text_encoder_path, 'Load text encoder from:')
    _enable_grad(text_encoder)

    # ---------------------------- device -----------------------------
    if cfg.CUDA:
        text_encoder = text_encoder.cuda()
        image_encoder = image_encoder.cuda()

    return [text_encoder, image_encoder, epoch]
def main(args):
    """Train the VQA model: image encoder, BERT question encoder and answer decoder.

    The three networks are optimised jointly with Adam on a cross-entropy
    loss over the answer classes. After every epoch the weights are saved
    under ``args.model_path`` and a one-line summary is written to
    ``result/result_<epoch>.txt``. Training stops early once the average
    training loss has failed to reach a new minimum for five consecutive
    epochs.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``model_path``, ``image_size``, ``vocab_path``,
            ``train_image_dir``, ``train_vqa_path``, ``ix_to_ans_file``,
            ``train_description_file``, ``batch_size``, ``num_workers``,
            ``img_feature_size``, ``pretrained_epoch``, ``learning_rate``,
            ``weight_decay``, ``num_epochs`` and ``log_step``.
    """
    # Create the checkpoint directory; exist_ok avoids a check-then-create race.
    os.makedirs(args.model_path, exist_ok=True)

    # Image preprocessing: random crop/flip augmentation, then per-channel
    # normalisation.
    train_transform = transforms.Compose([
        transforms.RandomCrop(args.image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only point vocab_path at trusted files.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader (training split only; no validation loader is built).
    train_data_loader = get_loader(args.train_image_dir,
                                   args.train_vqa_path,
                                   args.ix_to_ans_file,
                                   args.train_description_file,
                                   vocab,
                                   train_transform,
                                   args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)

    question_emb_size = 1024
    no_ans = 1000  # number of answer classes handed to the decoder

    image_encoder = ImageEncoder(args.img_feature_size)
    question_encoder = BertEncoder(question_emb_size)
    vqa_decoder = VQA_Model(args.img_feature_size, question_emb_size, no_ans)

    # Optionally resume from the checkpoints of a previous epoch.
    pretrained_epoch = 0
    if args.pretrained_epoch > 0:
        pretrained_epoch = args.pretrained_epoch
        # NOTE(review): checkpoints are read from the hard-coded './models/'
        # directory although they are saved under args.model_path below;
        # confirm that both refer to the same location.
        for module, prefix in ((image_encoder, 'image_encoder'),
                               (question_encoder, 'question_encoder'),
                               (vqa_decoder, 'vqa_decoder')):
            checkpoint_file = ('./models/' + prefix + '-'
                               + str(pretrained_epoch) + '.pkl')
            # Load onto the CPU first so that resuming also works on a
            # machine without the device the checkpoint was saved from.
            module.load_state_dict(
                torch.load(checkpoint_file, map_location='cpu'))

    if torch.cuda.is_available():
        image_encoder.cuda()
        question_encoder.cuda()
        vqa_decoder.cuda()
        print("Cuda is enabled...")

    criterion = nn.CrossEntropyLoss()
    params = (list(image_encoder.parameters())
              + list(question_encoder.parameters())
              + list(vqa_decoder.parameters()))
    optimizer = torch.optim.Adam(params,
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    total_train_step = len(train_data_loader)
    min_avg_loss = float("inf")
    overfit_warn = 0

    # Epochs below pretrained_epoch were already trained, so start after them.
    for epoch in range(pretrained_epoch, args.num_epochs):
        image_encoder.train()
        question_encoder.train()
        vqa_decoder.train()

        running_loss = 0.0   # sum of per-sample losses over the epoch
        correct_total = 0    # number of correct predictions over the epoch
        samples_seen = 0     # number of samples actually processed

        for bi, (question_arr, image_vqa, target_answer, _answer_str) in \
                enumerate(train_data_loader):
            # The optimizer owns exactly the parameters of the three networks.
            optimizer.zero_grad()

            images = to_var(torch.stack(image_vqa))
            question_arr = to_var(torch.stack(question_arr))
            target_answer = to_var(torch.tensor(target_answer))

            image_emb = image_encoder(images)
            question_emb = question_encoder(question_arr)
            output = vqa_decoder(image_emb, question_emb)

            loss = criterion(output, target_answer)

            _, prediction = torch.max(output, 1)
            # Use the real batch size: the final batch of an epoch may hold
            # fewer than args.batch_size samples.
            batch_n = prediction.size(0)
            no_correct_prediction = prediction.eq(target_answer).sum().item()
            accuracy = no_correct_prediction * 100 / batch_n

            # CrossEntropyLoss returns the batch mean by default, so weight
            # it by the batch size to accumulate a per-sample total.
            loss_value = loss.item()
            running_loss += loss_value * batch_n
            correct_total += no_correct_prediction
            samples_seen += batch_n

            loss.backward()
            optimizer.step()

            # Print log info
            if bi % args.log_step == 0:
                print('Epoch [%d/%d], Train Step [%d/%d], Loss: %.4f, Acc: %.4f'
                      % (epoch + 1, args.num_epochs, bi, total_train_step,
                         loss_value, accuracy))

        # Per-sample averages; max(..., 1) guards against an empty loader.
        denominator = max(samples_seen, 1)
        avg_loss = running_loss / denominator
        avg_acc = correct_total / denominator
        print('Epoch [%d/%d], Average Train Loss: %.4f, Average Train acc: %.4f'
              % (epoch + 1, args.num_epochs, avg_loss, avg_acc))

        # Save the models
        torch.save(image_encoder.state_dict(),
                   os.path.join(args.model_path,
                                'image_encoder-%d.pkl' % (epoch + 1)))
        torch.save(question_encoder.state_dict(),
                   os.path.join(args.model_path,
                                'question_encoder-%d.pkl' % (epoch + 1)))
        torch.save(vqa_decoder.state_dict(),
                   os.path.join(args.model_path,
                                'vqa_decoder-%d.pkl' % (epoch + 1)))

        # Count consecutive epochs whose average training loss is above the
        # best value seen so far; any epoch that matches or beats it resets
        # the counter.
        overfit_warn = overfit_warn + 1 if (min_avg_loss < avg_loss) else 0
        min_avg_loss = min(min_avg_loss, avg_loss)

        # Persist the epoch summary. The file name uses the 0-based epoch
        # index (unlike the 1-based checkpoint names above).
        os.makedirs("result", exist_ok=True)
        lossFileName = "result/result_" + str(epoch) + ".txt"
        with open(lossFileName, 'w') as test_fd:
            test_fd.write('Epoch: %s avg_loss: %s avg_acc: %s\n'
                          % (epoch, avg_loss, avg_acc))

        if overfit_warn >= 5:
            print("terminated as overfitted")
            break
# Dataset, its distributed sampler, and the batched loader built on top of them.
kdd_dataset = Dataset(use_bert=use_bert)
sampler = DistributedSampler(kdd_dataset)
loader = DataLoader(
    kdd_dataset,
    batch_size=150,
    sampler=sampler,
    num_workers=20,
    collate_fn=collate_fn,
)

nhead = 4

# Scoring model: built first, then moved to the GPU.
score_model = ScoreModel(
    kdd_dataset.unknown_token + 1,
    1024,
    1024,
    use_bert=use_bert,
)
score_model = score_model.cuda()

# Image encoder: start from the pretrained weights file, then move to the GPU.
image_encoder = ImageEncoder(input_dim=2048, output_dim=1024, nhead=nhead)
image_encoder.load_pretrained_weights(
    path='../user_data/image_encoder_large.pth')
image_encoder = image_encoder.cuda()

# A single optimizer drives the parameters exposed by both models.
optimizer = Adam(score_model.get_params() + image_encoder.get_params())

# Resume from a saved epoch. The text generator is not built here, so only
# the 'score', 'item' and 'optimizer' entries of the checkpoint are used.
# NOTE(review): the restore runs on local_rank 0 only; confirm that the other
# ranks get the restored model/optimizer state some other way.
if start_epoch > 0 and local_rank == 0:
    checkpoint_file = 'model-epoch{}.pth'.format(start_epoch)
    checkpoints = torch.load(os.path.join(checkpoints_dir, checkpoint_file))
    score_model.load_state_dict(checkpoints['score'])
    image_encoder.load_state_dict(checkpoints['item'])
    optimizer.load_state_dict(checkpoints['optimizer'])
    print("load checkpoints")
def load_network(self):
    """Build all networks, restore any configured checkpoints, and return them.

    The image generator, image discriminator and image encoder are
    initialised with ``weights_init``. The text encoder and text generator
    are built around ``self.txt_emb`` with an embedding size of 300, and the
    generator's vocabulary size is ``len(self.txt_dico.id2word)``. For each
    non-empty checkpoint path in ``cfg`` the matching network's state dict is
    loaded onto the CPU. When ``cfg.CUDA`` is set, all networks are then moved
    to the GPU.

    Returns:
        tuple: ``(image_encoder, image_generator, text_encoder,
        text_generator, disc_image, disc_latent)``.
    """
    image_generator = ImageGenerator()
    image_generator.apply(weights_init)

    disc_image = DiscriminatorImage()
    disc_image.apply(weights_init)

    emb_dim = 300
    text_encoder = TextEncoder(emb_dim, self.txt_emb, 1, dropout=0.0)

    attn_model = 'general'
    text_generator = TextGenerator(attn_model,
                                   emb_dim,
                                   len(self.txt_dico.id2word),
                                   self.txt_emb,
                                   n_layers=1,
                                   dropout=0.0)

    image_encoder = ImageEncoder()
    image_encoder.apply(weights_init)

    disc_latent = DiscriminatorLatent(emb_dim)

    # Checkpoint path -> network that receives it. The restore code used to
    # target `netG`, `netD`, `encoder` and `decoder`, none of which exist in
    # this function, so any configured path raised NameError.
    # NOTE(review): the pairing below is inferred from the config key names
    # (NET_G/NET_D -> image generator/discriminator, ENCODER/DECODER -> text
    # encoder/generator); please confirm. A wrong pairing fails loudly on
    # mismatched state-dict keys. No checkpoint key exists for disc_latent.
    checkpoint_targets = (
        (cfg.NET_G, image_generator),
        (cfg.NET_D, disc_image),
        (cfg.ENCODER, text_encoder),
        (cfg.DECODER, text_generator),
        (cfg.IMAGE_ENCODER, image_encoder),
    )
    for checkpoint_path, network in checkpoint_targets:
        if checkpoint_path != '':
            # Keep tensors on the CPU regardless of the device they were
            # saved from; the move to the GPU happens below.
            state_dict = torch.load(
                checkpoint_path,
                map_location=lambda storage, loc: storage)
            network.load_state_dict(state_dict)
            print('Load from: ', checkpoint_path)

    if cfg.CUDA:
        image_encoder.cuda()
        image_generator.cuda()
        text_encoder.cuda()
        text_generator.cuda()
        disc_image.cuda()
        disc_latent.cuda()

    return (image_encoder, image_generator, text_encoder, text_generator,
            disc_image, disc_latent)