def sup_loss_on_batch(self, batch, eval):
    """Compute the supervised loss (action loss + auxiliary losses) for a batch.

    Every batch element is a sequence that is sliced to its valid length,
    registered with ``self.tensor_store`` and run through the model on its
    own; ``self.reset()`` is called before each sequence.

    Args:
        batch: Dict of batched inputs and labels, or None to skip the batch.
            Keys read here: "images", "instr", "instr_len", "states",
            "actions", "lm_pos_fpv", "lm_pos_map", "lm_indices", "goal_loc",
            "md" and, while ``TEMPLATES`` is enabled, "lm_mentioned_tplt" and
            "side_mentioned_tplt".
        eval: Truthy to log under "<model_name>/eval", otherwise under
            "<model_name>/train". (The name shadows the builtin; it is kept
            so existing callers keep working.)

    Returns:
        The action loss summed over all sequences and divided by the total
        number of timesteps, plus the combined auxiliary loss. For a None
        batch, the freshly allocated one-element loss tensor is returned.

    Raises:
        RuntimeError: If the "lm_mentioned_select" auxiliary input is
            required while ``TEMPLATES`` is enabled, because the
            per-timestep "lm_mentioned" labels are not loaded in that mode.
    """
    self.prof.tick("out")

    empty_loss = Variable(
        empty_float_tensor([1], self.is_cuda, self.cuda_device))
    if batch is None:
        print("Skipping None Batch")
        return empty_loss

    images = self.maybe_cuda(batch["images"])
    instructions = self.maybe_cuda(batch["instr"])
    instr_lengths = batch["instr_len"]
    states = self.maybe_cuda(batch["states"])
    actions = self.maybe_cuda(batch["actions"])

    # Auxiliary labels
    lm_pos_fpv = batch["lm_pos_fpv"]
    lm_pos_map = batch["lm_pos_map"]
    lm_indices = batch["lm_indices"]
    goal_pos_map = batch["goal_loc"]

    # TODO: Get rid of this. We will have lm_mentioned booleans and
    # lm_mentioned_idx integers and that's it.
    TEMPLATES = True
    if TEMPLATES:
        lm_mentioned_tplt = batch["lm_mentioned_tplt"]
        side_mentioned_tplt = batch["side_mentioned_tplt"]
    else:
        lang_lm_mentioned = batch["lang_lm_mentioned"]
        lm_mentioned = batch["lm_mentioned"]

    # This is the first-timestep metadata
    metadata = batch["md"]
    batch_size = images.size(0)

    map_size = self.params["global_map_size"]
    world_size_m = self.params["world_size_m"]
    world_size_px = self.params["world_size_px"]

    def to_px(pos_m):
        # Convert a position tensor from meters to map pixels.
        return torch.from_numpy(
            transformations.pos_m_to_px(pos_m.numpy(), map_size,
                                        world_size_m, world_size_px))

    def to_device(items):
        # Move every non-None entry of a per-timestep list to the device.
        return [self.cuda_var(s) if s is not None else None for s in items]

    count = 0
    action_loss_total = None

    # Loop thru batch
    for b in range(batch_size):
        self.reset()
        self.prof.tick("out")

        b_seq_len = len_until_nones(metadata[b])

        # TODO: Generalize this
        # Slice the data according to the sequence length
        b_images = images[b][:b_seq_len]
        b_instructions = instructions[b][:b_seq_len]
        b_instr_len = instr_lengths[b][:b_seq_len]
        b_states = states[b][:b_seq_len]
        b_actions = actions[b][:b_seq_len]
        b_lm_pos_fpv = lm_pos_fpv[b][:b_seq_len]
        b_lm_pos_map = lm_pos_map[b][:b_seq_len]
        b_lm_indices = lm_indices[b][:b_seq_len]
        b_goal_pos = goal_pos_map[b][:b_seq_len]
        # Stays None in TEMPLATES mode, where these labels are not loaded.
        b_lm_mentioned = None
        if not TEMPLATES:
            b_lang_lm_mentioned = lang_lm_mentioned[b][:b_seq_len]
            b_lm_mentioned = lm_mentioned[b][:b_seq_len]

        # Convert landmark and goal positions from meters to pixels
        b_lm_pos_map = [
            to_px(p) if p is not None else None for p in b_lm_pos_map
        ]
        b_goal_pos = to_px(b_goal_pos)

        b_lm_pos_map = [
            self.cuda_var(s.long()) if s is not None else None
            for s in b_lm_pos_map
        ]
        b_lm_pos_fpv = [
            self.cuda_var((s / RESNET_FACTOR).long())
            if s is not None else None
            for s in b_lm_pos_fpv
        ]
        b_lm_indices = to_device(b_lm_indices)
        b_goal_pos = self.cuda_var(b_goal_pos)
        if not TEMPLATES:
            b_lang_lm_mentioned = self.cuda_var(b_lang_lm_mentioned)
            b_lm_mentioned = to_device(b_lm_mentioned)

        # TODO: Figure out how to keep these properly. Perhaps as a whole batch is best
        # TODO: Introduce a key-value store (encapsulate instead of inherit)
        self.tensor_store.keep_inputs("lm_pos_fpv", b_lm_pos_fpv)
        self.tensor_store.keep_inputs("lm_pos_map", b_lm_pos_map)
        self.tensor_store.keep_inputs("lm_indices", b_lm_indices)
        self.tensor_store.keep_inputs("goal_pos_map", b_goal_pos)
        if not TEMPLATES:
            self.tensor_store.keep_inputs("lang_lm_mentioned",
                                          b_lang_lm_mentioned)
            self.tensor_store.keep_inputs("lm_mentioned", b_lm_mentioned)

        # TODO: Abstract all of these if-elses in a modular way once we
        # know which ones are necessary
        if TEMPLATES:
            b_lm_mentioned_tplt = lm_mentioned_tplt[b][:b_seq_len]
            b_side_mentioned_tplt = side_mentioned_tplt[b][:b_seq_len]
            b_side_mentioned_tplt = self.cuda_var(b_side_mentioned_tplt)
            b_lm_mentioned_tplt = self.cuda_var(b_lm_mentioned_tplt)
            self.tensor_store.keep_inputs("lm_mentioned_tplt",
                                          b_lm_mentioned_tplt)
            self.tensor_store.keep_inputs("side_mentioned_tplt",
                                          b_side_mentioned_tplt)

        # Every timestep is both observed and planned on.
        b_obs_mask = [True] * b_seq_len
        b_plan_mask = [True] * b_seq_len
        b_pos_enc = None

        # ------------------------------------------------------------------
        # Optional Auxiliary Inputs
        # ------------------------------------------------------------------
        if self.aux_losses.input_required("lm_pos_map_select"):
            b_lm_pos_map_select = [
                lm_pos for lm_pos, keep in zip(b_lm_pos_map, b_plan_mask)
                if keep
            ]
            self.tensor_store.keep_inputs("lm_pos_map_select",
                                          b_lm_pos_map_select)
        if self.aux_losses.input_required("lm_indices_select"):
            b_lm_indices_select = [
                lm_idx for lm_idx, keep in zip(b_lm_indices, b_plan_mask)
                if keep
            ]
            self.tensor_store.keep_inputs("lm_indices_select",
                                          b_lm_indices_select)
        if self.aux_losses.input_required("lm_mentioned_select"):
            if b_lm_mentioned is None:
                # Previously surfaced as a NameError on b_lm_mentioned.
                raise RuntimeError(
                    "lm_mentioned_select requires the lm_mentioned labels, "
                    "which are not loaded when TEMPLATES is enabled")
            b_lm_mentioned_select = [
                lm_m for lm_m, keep in zip(b_lm_mentioned, b_plan_mask)
                if keep
            ]
            self.tensor_store.keep_inputs("lm_mentioned_select",
                                          b_lm_mentioned_select)
        # ------------------------------------------------------------------

        self.prof.tick("inputs")

        # Keep the prediction in its own name: rebinding `actions` here would
        # clobber the batched ground-truth tensor indexed by `actions[b]`.
        pred_actions = self(b_images, b_states, b_instructions, b_instr_len,
                            has_obs=b_obs_mask, plan=b_plan_mask,
                            pos_enc=b_pos_enc)
        action_losses, _ = self.action_loss(b_actions, pred_actions,
                                            batchreduce=False)

        self.prof.tick("call")

        action_losses = self.action_loss.batch_reduce_loss(action_losses)
        action_loss = self.action_loss.reduce_loss(action_losses)

        # Accumulate across sequences so that the division by the total
        # timestep count below is meaningful for batch sizes above one.
        if action_loss_total is None:
            action_loss_total = action_loss
        else:
            action_loss_total = action_loss_total + action_loss
        count += b_seq_len

        self.prof.tick("loss")

    if action_loss_total is None:
        # Empty batch: fall back to the freshly allocated loss tensor.
        action_loss_total = empty_loss

    action_loss_avg = action_loss_total / (count + 1e-9)

    self.prof.tick("out")

    # Auxiliary losses are computed once, after the per-sequence loop.
    aux_losses = self.aux_losses.calculate_aux_loss(self.tensor_store,
                                                    reduce_average=True)
    aux_loss = self.aux_losses.combine_losses(aux_losses, self.aux_weights)

    prefix = self.model_name + ("/eval" if eval else "/train")

    self.writer.add_dict(prefix, get_current_meters(), self.get_iter())
    self.writer.add_dict(prefix, aux_losses, self.get_iter())
    self.writer.add_scalar(prefix + "/action_loss",
                           action_loss_avg.data.cpu().item(),
                           self.get_iter())
    # TODO: Log value here
    self.writer.add_scalar(prefix + "/goal_accuracy",
                           self.goal_acc_meter.get(), self.get_iter())

    self.prof.tick("auxiliaries")

    total_loss = action_loss_avg + aux_loss

    self.inc_iter()

    self.prof.tick("summaries")
    self.prof.loop()
    self.prof.print_stats(1)

    return total_loss
def sup_loss_on_batch(self, batch, eval=False, viz=False):
    """Compute the mask-prediction loss plus auxiliary landmark losses.

    The whole batch goes through the model in one forward pass. The main
    loss compares the predicted mask with the pooled trajectory labels; the
    module-level flags ``BCE``, ``NLL`` and ``CE`` select how both are
    prepared for ``self.mask_loss``. Optional per-sample landmark
    classification and grounding losses are added on top.

    Args:
        batch: Dict with keys "images", "instr", "instr_mask" and
            "traj_labels"; "lm_pos", "lm_indices" and "lm_mentioned" are
            read when the corresponding auxiliary loss is enabled.
        eval: If truthy the module is put in eval mode and summaries are
            written under "eval"; otherwise train mode and "train".
        viz: Unused; kept for interface compatibility.

    Returns:
        ``main_loss + 0.1 * class_loss + 0.001 * emb_loss + 0.1 * ground_loss``.
    """
    if eval:
        self.eval()
    else:
        self.train()

    images = cuda_var(batch["images"], self.is_cuda, self.cuda_device)
    instructions = cuda_var(batch["instr"], self.is_cuda, self.cuda_device)
    instruction_masks = cuda_var(batch["instr_mask"], self.is_cuda,
                                 self.cuda_device)
    label_masks = cuda_var(batch["traj_labels"], self.is_cuda,
                           self.cuda_device)

    batch_size = len(images)
    total_class_loss = Variable(
        empty_float_tensor([1], self.is_cuda, self.cuda_device),
        requires_grad=True)
    total_ground_loss = Variable(
        empty_float_tensor([1], self.is_cuda, self.cuda_device),
        requires_grad=True)
    # The batch is processed in a single forward pass, so the final
    # normalisation divides by one (plus epsilon).
    count = 1

    label_masks = self.label_pool(label_masks)
    mask_pred, features, emb_loss = self(images, instructions,
                                         instruction_masks)

    if BCE:
        mask_pred_flat = mask_pred.view(-1, 1)
        # Rescale the labels to [0, 1] before comparing them to predictions.
        label_masks_flat = label_masks - torch.min(label_masks)
        label_masks_flat = label_masks_flat / (
            torch.max(label_masks_flat) + 1e-9)
        label_masks_flat = label_masks_flat.view(-1, 1).clamp(0, 1)
        main_loss = self.mask_loss(mask_pred_flat, label_masks_flat)
    elif NLL:
        # Build a two-channel prediction from the softmax and its complement.
        mask_pred_1 = F.softmax(mask_pred, dim=1)
        mask_pred_2 = 1 - mask_pred_1
        mask_pred = torch.cat(
            (mask_pred_1.unsqueeze(1), mask_pred_2.unsqueeze(1)), dim=1)
        # .long() keeps the tensor on whatever device it already lives on.
        label_masks = label_masks.clamp(0, 1).long()
        main_loss = self.mask_loss(mask_pred, label_masks)
    elif CE:
        # Crossentropy2D internally applies logsoftmax to mask_pred, but
        # labels are already assumed to be a valid probability distribution,
        # so no softmax is applied to them.
        main_loss = self.mask_loss(mask_pred, label_masks)
        # So for nice plotting, we must manually do it
        mask_pred = self.spatialsoftmax(mask_pred)
    else:
        main_loss = self.mask_loss(mask_pred, label_masks)

    # sum emb loss if batch size > 1
    if isinstance(emb_loss, tuple):
        emb_loss = sum(emb_loss)

    # Extract the feature vectors corresponding to every landmark's location
    # in the map and apply a linear layer to classify which landmark it is.
    # The landmark positions have to be divided by the same factor as the
    # ResNet scaling factor.
    all_lm_pos = batch["lm_pos"]
    class_count = 0
    ground_count = 0
    for i in range(batch_size):
        if len(all_lm_pos[i]) == 0:
            continue
        if not (self.class_loss or self.ground_loss):
            continue

        landmark_pos = cuda_var(all_lm_pos[i], self.is_cuda,
                                self.cuda_device)
        landmark_coords = (landmark_pos / 8).long()

        if self.class_loss:
            class_count += 1
            landmark_indices = cuda_var(batch["lm_indices"][i], self.is_cuda,
                                        self.cuda_device)
            lm_features = self.gather2d(features[i:i + 1, 0:32],
                                        landmark_coords)
            lm_pred = self.aux_class_linear(lm_features)
            class_loss = self.aux_loss(lm_pred, landmark_indices)
            total_class_loss = total_class_loss + class_loss

        if self.ground_loss:
            ground_count += 1
            landmark_mentioned = cuda_var(batch["lm_mentioned"][i],
                                          self.is_cuda, self.cuda_device)
            g_features = self.gather2d(features[i:i + 1, 32:35],
                                       landmark_coords)
            lm_pred = self.aux_ground_linear(g_features)
            ground_loss = self.aux_loss(lm_pred, landmark_mentioned)
            total_ground_loss = total_ground_loss + ground_loss

    # Each auxiliary loss is averaged over the samples that contributed to
    # it. Sharing one counter that only the classification branch updates
    # would divide the grounding loss by ~0 when only grounding is enabled.
    total_class_loss = total_class_loss / (class_count + 1e-9)
    total_ground_loss = total_ground_loss / (ground_count + 1e-9)

    # Just visualization and debugging code
    if self.get_iter() % 50 == 0:
        presenter = Presenter()
        pred_viz_np = presenter.overlaid_image(images[0].data,
                                               mask_pred[0].data)
        labl_viz_np = presenter.overlaid_image(images[0].data,
                                               label_masks[0].data)
        comp = np.concatenate((pred_viz_np, labl_viz_np), axis=1)
        presenter.show_image(comp, "path_pred")

        # NOTE(review): assumed to belong to this periodic debug branch, and
        # to want the index of the last sample in the batch - confirm.
        if hasattr(self.sentence_embedding, "save_att_map"):
            self.sentence_embedding.save_att_map(self.get_iter(),
                                                 batch_size - 1)

    total_loss = (main_loss + 0.1 * total_class_loss + 0.001 * emb_loss +
                  0.1 * total_ground_loss)
    total_loss = total_loss / (count + 1e-9)

    self.write_summaires("eval" if eval else "train", self.get_iter(),
                         total_loss, main_loss, emb_loss, total_class_loss,
                         total_ground_loss)
    self.inc_iter()

    return total_loss
def deterministic_action(self, action_mean, action_std, stop_prob):
    """Assemble a 4-column action from the predicted mean and stop probability.

    Columns 0-2 are copied from ``action_mean`` and column 3 is set to
    ``stop_prob``. ``action_std`` is accepted but not used.
    """
    num_rows = action_mean.size(0)
    result = Variable(
        empty_float_tensor((num_rows, 4), self.is_cuda, self.cuda_device))
    result[:, :3] = action_mean[:, :3]
    result[:, 3] = stop_prob
    return result
def forward(self, word_ids, lengths=None):
    """Embed instructions with an LSTM followed by multi-head self-attention.

    Args:
        word_ids: Batch tensor of token ids, or a plain list of token ids
            for a single instruction (only when ``lengths`` is None), which
            is converted with ``sequence_list_to_tensor``.
        lengths: Per-instruction token counts; only the first ``lengths[i]``
            tokens of row ``i`` are fed to the LSTM.

    Returns:
        Tuple ``(sentence_embeddings, penal)``. ``sentence_embeddings`` has
        one row per instruction, of width
        ``lstm_size * factor * num_attn_heads``; rows of empty instructions
        are left as allocated. ``penal`` is the squared norm of the
        off-diagonal part of ``A @ A.T``, averaged over the batch.
    """
    # TODO: Get rid of this and abstract in another layer
    if isinstance(word_ids, list) and lengths is None:
        word_ids, lengths = sequence_list_to_tensor([word_ids])
        if self.is_cuda:
            word_ids = word_ids.cuda()
            lengths = lengths.cuda()

    word_embeddings = self.embedding(word_ids)
    batch_size = word_embeddings.size(0)

    embedding_width = self.lstm_size * self.factor * self.num_attn_heads
    sentence_embeddings = Variable(
        empty_float_tensor((batch_size, embedding_width), self.is_cuda,
                           self.cuda_device))
    state_shape = (self.lstm_layers * self.factor, 1, self.lstm_size)

    penal = 0
    # (index, length, attention) of the last non-empty instruction, kept
    # for the periodic heat-map dump below.
    last_sample = None

    for i in range(batch_size):
        length = int(lengths[i])
        if length == 0:
            print("Empty caption")
            continue

        # Add a singleton dimension at position 1 for the LSTM input.
        embeddings_i = word_embeddings[i, 0:length].unsqueeze(1)
        # Allocate the initial states on the same device as the output
        # buffer above.
        h0 = Variable(
            empty_float_tensor(state_shape, self.is_cuda, self.cuda_device))
        c0 = Variable(
            empty_float_tensor(state_shape, self.is_cuda, self.cuda_device))

        outputs, _ = self.lstm_txt(embeddings_i, (h0, c0))
        H = outputs.squeeze(dim=1)

        # self-attention
        s1 = self.W_s1(H)
        s2 = self.W_s2(torch.tanh(s1))
        A = F.softmax(s2.t(), dim=1)
        M = torch.mm(A, H)

        # Penalise overlap between attention heads: zero the diagonal of
        # A @ A.T and accumulate the squared norm of what remains.
        AAt = torch.mm(A, A.t())
        for j in range(self.num_attn_heads):
            AAt[j, j] = 0
        p = torch.norm(AAt, 2)
        penal = penal + p * p

        # Flatten all heads into a single embedding vector.
        sentence_embeddings[i] = M.view(-1)
        last_sample = (i, length, A)

    # Average the penalty once over the batch. Dividing inside the loop
    # would shrink the contribution of earlier samples repeatedly.
    if batch_size > 0:
        penal = penal / batch_size

    if (self.n_batch % 2000 == 0 and self.idx2word is not None
            and last_sample is not None):
        sample_idx, sample_len, attention = last_sample
        token_ids = word_ids[sample_idx][:sample_len].data.cpu().numpy()
        instr = [self.idx2word[str(tok)] for tok in token_ids]
        Att = attention.data.cpu().numpy()

        imgpath = get_self_attention_path(
        ) + "instruction_heatmap/intr_heatmap-{}-{}.png".format(
            self.n_epoch, self.n_batch)

        fig = plt.figure(figsize=(len(instr) / 6, 1.8))
        plt.pcolor(Att)
        plt.xticks(np.linspace(0.5, len(instr) - 0.5, len(instr)),
                   instr, rotation=90, fontsize=10)
        plt.gcf().subplots_adjust(bottom=0.5)
        plt.savefig(imgpath)
        # Release the figure so repeated dumps do not pile up open figures.
        plt.close(fig)

    self.n_batch += 1
    return sentence_embeddings, penal
def sup_loss_on_batch(self, batch, eval):
    """Compute the supervised action loss for a batch of sequences.

    Every batch element is sliced to its valid length and run through the
    model on its own; ``self.reset()`` is called before each sequence.

    Args:
        batch: Dict with keys "images", "instr", "instr_len", "actions" and
            "md", or None to skip the batch.
        eval: Truthy to log under "<model_name>/eval", otherwise under
            "<model_name>/train". (The name shadows the builtin; it is kept
            so existing callers keep working.)

    Returns:
        The action loss summed over all sequences and divided by the total
        number of timesteps. For a None batch, the freshly allocated
        one-element loss tensor is returned.
    """
    self.prof.tick("out")

    empty_loss = Variable(
        empty_float_tensor([1], self.is_cuda, self.cuda_device))
    if batch is None:
        print("Skipping None Batch")
        return empty_loss

    images = self.maybe_cuda(batch["images"])
    instructions = self.maybe_cuda(batch["instr"])
    instr_lengths = batch["instr_len"]
    actions = self.maybe_cuda(batch["actions"])
    metadata = batch["md"]

    batch_size = images.size(0)
    count = 0
    action_loss_total = None

    # Loop thru batch
    for b in range(batch_size):
        self.reset()
        self.prof.tick("out")

        b_seq_len = len_until_nones(metadata[b])

        # TODO: Generalize this
        # Slice the data according to the sequence length
        b_images = images[b][:b_seq_len]
        b_instructions = instructions[b][:b_seq_len]
        b_instr_len = instr_lengths[b][:b_seq_len]
        b_actions = actions[b][:b_seq_len]

        self.prof.tick("inputs")

        # Keep the prediction in its own name: rebinding `actions` here would
        # clobber the batched ground-truth tensor indexed by `actions[b]`.
        pred_actions = self(b_images, b_instructions, b_instr_len)
        action_losses, _ = self.action_loss(b_actions, pred_actions,
                                            batchreduce=False)

        self.prof.tick("call")

        action_losses = self.action_loss.batch_reduce_loss(action_losses)
        action_loss = self.action_loss.reduce_loss(action_losses)

        # Accumulate across sequences so that the division by the total
        # timestep count below is meaningful for batch sizes above one.
        if action_loss_total is None:
            action_loss_total = action_loss
        else:
            action_loss_total = action_loss_total + action_loss
        count += b_seq_len

        self.prof.tick("loss")

    if action_loss_total is None:
        # Empty batch: fall back to the freshly allocated loss tensor.
        action_loss_total = empty_loss

    action_loss_avg = action_loss_total / (count + 1e-9)

    self.prof.tick("out")

    prefix = self.model_name + ("/eval" if eval else "/train")
    self.writer.add_dict(prefix, get_current_meters(), self.get_iter())
    # .item() works for both 0-dim and one-element tensors, unlike `[0]`.
    self.writer.add_scalar(prefix + "/action_loss",
                           action_loss_avg.data.cpu().item(),
                           self.get_iter())

    total_loss = action_loss_avg

    self.inc_iter()
    self.prof.loop()
    self.prof.print_stats(1)

    return total_loss