def get_self_critical_reward(self, data_gts, gen_result):
    batch_size = len(gen_result)  # batch_size = sample_size * seq_per_img

    # Each generated caption becomes a single-sentence candidate.
    res = {}
    for i in range(batch_size):
        res[i] = [' '.join(gen_result[i])]

    batch_reward = 0

    # Ground-truth references, keyed to match the candidates.
    gts = {}
    for i in range(batch_size):
        gts[i] = data_gts[i]

    if self.cfg.TRAIN.RL.Cider_Reward_Weight > 0:
        cider, cider_scores = Cider().compute_score(gts, res)
        cider_scores = self.cfg.TRAIN.RL.Cider_Reward_Weight * torch.FloatTensor(cider_scores)
        batch_reward = cider
    else:
        # Keep this a tensor so the unsqueeze/repeat below stays valid.
        cider_scores = torch.zeros(batch_size)

    # Repeat each sample's score across the sequence dimension.
    rewards = cider_scores.unsqueeze(1).repeat(1, self.bi_max)
    return batch_reward, rewards
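# Illustrative usage (a sketch, not this repository's code): feeding the
# per-token rewards above into a REINFORCE-style loss. `log_probs` and `mask`
# are assumed tensors of shape (batch_size, self.bi_max) holding the sampled
# captions' token log-probabilities and padding mask; a baseline (here the
# batch-mean reward) is usually subtracted to reduce gradient variance.
def rl_loss_from_rewards(self, references, sampled_tokens, log_probs, mask):
    _, rewards = self.get_self_critical_reward(references, sampled_tokens)
    rewards = rewards.to(log_probs.device)
    advantage = rewards - rewards.mean()  # simple batch-mean baseline
    loss = -(log_probs * advantage * mask)
    return loss.sum() / mask.sum()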
        model.load_state_dict(training_checkpoint[key])
    # Checkpoint names end in "best" or the iteration number (e.g. "_20000.pth").
    if _A.start_from_checkpoint.split("_")[-1][:-4] == 'best':
        start_iteration = 1
    else:
        start_iteration = int(_A.start_from_checkpoint.split("_")[-1][:-4]) + 1
else:
    start_iteration = 1

# Initialize the lr for start iteration
lr_scheduler.step(start_iteration - 1)

# Construct a cider evaluation object
cider_train = Cider(PTBTokenizer.tokenize(
    train_dataset._captions_reader._ref_caps_full_sentences))

# --------------------------------------------------------------------------------------------
# TRAINING LOOP
# --------------------------------------------------------------------------------------------
model.eval()  # eval mode: dropout stays disabled while sampling captions
model._is_val = False
running_reward = 0.0
reward_counter = 1
for iteration in tqdm(range(start_iteration, _C.OPTIM.NUM_ITERATIONS + 1)):

    # keys: {"image_id", "image_features", "caption_tokens"}
    batch = next(train_dataloader)
    batch_size = batch["image_features"].shape[0]
    optimizer.zero_grad()
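    # --- Illustrative sketch of the rest of this loop body; not the original
    # code. `model.sample`, `model.greedy`, and `caption_reward` are assumed,
    # hypothetical helpers: sample() returns captions plus their summed token
    # log-probabilities, greedy() returns the greedy baseline captions, and
    # caption_reward() scores captions against the batch references with
    # cider_train, returning one CIDEr value per image as a tensor. ---
    sample_caps, sample_log_probs = model.sample(batch["image_features"])
    with torch.no_grad():
        greedy_caps = model.greedy(batch["image_features"])
    sample_reward = caption_reward(cider_train, sample_caps, batch)
    greedy_reward = caption_reward(cider_train, greedy_caps, batch)
    # Self-critical REINFORCE: the greedy decode acts as the baseline.
    loss = -(sample_log_probs * (sample_reward - greedy_reward)).mean()
    loss.backward()
    optimizer.step()
    running_reward += sample_reward.mean().item()
    reward_counter += 1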
if not os.path.isfile('vocab_%s.pkl' % args.exp_name):
    print("Building vocabulary")
    text_field.build_vocab(train_dataset, val_dataset, min_freq=5)
    pickle.dump(text_field.vocab, open('vocab_%s.pkl' % args.exp_name, 'wb'))
else:
    text_field.vocab = pickle.load(open('vocab_%s.pkl' % args.exp_name, 'rb'))

# Model and dataloaders
encoder = MemoryAugmentedEncoder(3, 0, attention_module=ScaledDotProductAttentionMemory,
                                 attention_module_kwargs={'m': args.m})
decoder = MeshedDecoder(len(text_field.vocab), 54, 3, text_field.vocab.stoi['<pad>'])
model = Transformer(text_field.vocab.stoi['<bos>'], encoder, decoder).to(device)

dict_dataset_train = train_dataset.image_dictionary({'image': image_field, 'text': RawField()})
ref_caps_train = list(train_dataset.text)
cider_train = Cider(PTBTokenizer.tokenize(ref_caps_train))
dict_dataset_val = val_dataset.image_dictionary({'image': image_field, 'text': RawField()})
dict_dataset_test = test_dataset.image_dictionary({'image': image_field, 'text': RawField()})


def lambda_lr(s):
    # Transformer warmup schedule from "Attention Is All You Need".
    warm_up = args.warmup
    s += 1
    return (model.d_model ** -.5) * min(s ** -.5, s * warm_up ** -1.5)


# Initial conditions
optim = Adam(model.parameters(), lr=1, betas=(0.9, 0.98))
scheduler = LambdaLR(optim, lambda_lr)
loss_fn = NLLLoss(ignore_index=text_field.vocab.stoi['<pad>'])
use_rl = False
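# Sketch of how `cider_train` is typically consumed once `use_rl` flips to
# True (modeled on common SCST implementations; `scst_loss` itself is not
# part of this snippet). compute_score returns (corpus score, per-sample
# scores); `caps_gt`/`caps_gen` are reference and generated captions, and
# `log_probs` the per-token log-probabilities of `beam_size` sampled
# candidates per image.
import numpy as np
import torch

def scst_loss(cider_train, caps_gt, caps_gen, log_probs, batch_size, beam_size, device):
    reward = cider_train.compute_score(caps_gt, caps_gen)[1].astype(np.float32)
    reward = torch.from_numpy(reward).to(device).view(batch_size, beam_size)
    # Mean reward over the candidates serves as the self-critical baseline.
    reward_baseline = torch.mean(reward, dim=-1, keepdim=True)
    loss = -torch.mean(log_probs, dim=-1) * (reward - reward_baseline)
    return loss.mean()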
encoder = MemoryAugmentedEncoder(3, 0,
                                 attention_module=ScaledDotProductAttentionMemory,
                                 attention_module_kwargs={'m': args.m})
decoder = MeshedDecoder(len(text_field.vocab), 108, 3, text_field.vocab.stoi['<pad>'])
model = Transformer(text_field.vocab.stoi['<start>'], encoder, decoder).to(device)

dict_dataset_train = train_dataset.image_dictionary({
    'image': image_field,
    'text': RawField()
})
ref_caps_train = list(train_dataset.text)
cider_train = Cider(my_tokenize(ref_caps_train))
dict_dataset_val = val_dataset.image_dictionary({
    'image': image_field,
    'text': RawField()
})
dict_dataset_test = test_dataset.image_dictionary({
    'image': image_field,
    'text': RawField()
})


def lambda_lr(s):
    warm_up = args.warmup
    s += 1
    return (model.d_model**-.5) * min(s**-.5, s * warm_up**-1.5)


# Initial conditions
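# `my_tokenize` replaces PTBTokenizer.tokenize above but is not defined in
# this snippet. A minimal sketch of a drop-in substitute, assuming the same
# interface (dict or list of raw captions in, {index: [tokenized strings]}
# out); the real implementation may differ.
import re

def my_tokenize(corpus):
    def clean(sentence):
        # Lowercase, strip punctuation, and collapse whitespace.
        return ' '.join(re.sub(r'[^\w\s]', '', sentence.lower()).split())
    if isinstance(corpus, dict):
        return {k: [clean(s) for s in v] for k, v in corpus.items()}
    return {i: [clean(s)] for i, s in enumerate(corpus)}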
# Pipeline for text
text_field = TextField(init_token='<bos>', eos_token='<eos>', lower=True, tokenize='spacy',
                       remove_punctuation=True, nopoints=False)

# Create the dataset
dataset = COCO(image_field, text_field, 'coco/images/', args.annotation_folder,
               args.annotation_folder)
_, _, test_dataset = dataset.splits
text_field.vocab = pickle.load(open('vocab.pkl', 'rb'))
ref_caps_test = list(test_dataset.text)
cider_test = Cider(PTBTokenizer.tokenize(ref_caps_test))

# Model and dataloaders
Transformer, TransformerEncoder, TransformerDecoderLayer, ScaledDotProductAttention = \
    model_factory(args)
encoder = TransformerEncoder(3, 0, attention_module=ScaledDotProductAttention,
                             d_in=args.dim_feats, d_k=args.d_k, d_v=args.d_v, h=args.head)
decoder = TransformerDecoderLayer(len(text_field.vocab), 54, 3, text_field.vocab.stoi['<pad>'],