def _make_history_context(self, batch, decay=0.5):
    ''' Embed the history of every batch item, for both vision and text.

    For each history index up to the longest history in the batch, the batch
    is deep-copied and every item is rewritten to that round's heading, path
    and instruction. The rewritten batch is encoded with
    `self.decoder(..., context=True)` (path side) and
    `self.encoder(..., context=True)` (instruction side). Items that have no
    round at that index get a one-viewpoint path, an EOS-only instruction and
    a path sequence length of 0.

    The per-round encodings are then merged into one vector per item:
      * decay < 0: by `self.decoder.context_lstm` / `text_context_lstm`;
      * otherwise: by an L1-normalised weighted sum with weights
        exp(-age * decay), where the last recorded round has age 0.
    When no item has any history and decay >= 0, zero tensors of shape
    (batch size, self.hidden_size) are returned.

    Returns a list of [vision embed, text embed].
    '''
    history_lengths = [len(item['history_heading']) for item in batch]
    max_history = max(history_lengths)
    vision_rounds = []
    text_rounds = []

    for round_idx in range(max_history):
        # The items are mutated below, so operate on private copies.
        round_batch = [copy.deepcopy(item) for item in batch]
        exhausted = []
        for pos, item in enumerate(round_batch):
            if round_idx < len(item['history_heading']):
                item['heading'] = item['history_heading'][round_idx]
                item['path'] = item['history_path'][round_idx]
                item['instr_encoding'] = \
                    item['history_instr_encoding'][round_idx]
                continue
            # This item has no such round: substitute a dummy entry and
            # remember its position so its length can be zeroed later.
            item['path'] = [item['path'][0]]
            item['instr_encoding'] = np.array([VOCAB_EOS_IDX])
            exhausted.append(pos)

        path_obs, path_actions, encoded_instructions = \
            self.env.gold_obs_actions_and_instructions(round_batch)

        # Vision side of this round.
        image_feats, action_embeds, _, seq_lengths = \
            batch_observations_and_actions(path_obs, path_actions,
                                           self.env.padding_feature,
                                           self.env.padding_action)
        seq_lengths[exhausted] = 0
        vision_rounds.append(
            self.decoder(image_feats, action_embeds, seq_lengths,
                         context=True))

        # Text side of this round.
        longest = max([len(ins) for ins in encoded_instructions])
        batched_ins, _, ins_lengths = batch_instructions_from_encoded(
            encoded_instructions, longest + 2, cut=False)
        text_rounds.append(
            self.encoder(batched_ins, ins_lengths, context=True))

    # Stack rounds along dim 1; an empty list is kept when there were none.
    context_list = torch.stack(vision_rounds, dim=1) if vision_rounds else []
    text_context_list = \
        torch.stack(text_rounds, dim=1) if text_rounds else []

    if decay < 0:
        # Negative decay selects the LSTM memory.
        context = self.decoder.context_lstm(context_list, history_lengths)
        text_context = self.decoder.text_context_lstm(text_context_list,
                                                      history_lengths)
        return [context, text_context]

    if len(context_list) == 0:
        # Exponential forgetting with nothing to merge: all-zero memory.
        context = try_cuda(
            torch.zeros(len(history_lengths), self.hidden_size))
        text_context = try_cuda(
            torch.zeros(len(history_lengths), self.hidden_size))
        return [context, text_context]

    # Exponential forgetting: row i holds exp(-age * decay) for the rounds
    # item i actually has (age 0 at its last round) and zeros elsewhere.
    exp_weight = np.zeros((len(history_lengths), max_history))
    for row, length in enumerate(history_lengths):
        exp_weight[row][:length] = [np.exp(-age * decay)
                                    for age in reversed(range(length))]
    exp_weight = F.normalize(
        try_cuda(torch.from_numpy(exp_weight)).float(),
        p=1, dim=1).unsqueeze(-1)
    context = (context_list * exp_weight).sum(dim=1)
    text_context = (text_context_list * exp_weight).sum(dim=1)
    return [context, text_context]
def make_speaker_models(args, vocab_size, env, tok):
    ''' Build the speaker encoder/decoder pair and wrap them in an agent.

    The array loaded from GLOVE_PATH is passed to the decoder as `glove`.
    Both modules go through `try_cuda` before being handed to
    `Seq2SeqSpeaker`, which is returned.
    '''
    glove_vectors = np.load(GLOVE_PATH)

    speaker_encoder = SpeakerEncoderLSTM(
        args.feature_size, args.hidden_size, args.dropout)
    speaker_decoder = SpeakerDecoderLSTM(
        vocab_size, args.wemb, VOCAB_PAD_IDX, args.hidden_size,
        args.dropout, glove=glove_vectors)

    speaker_encoder, speaker_decoder = [
        try_cuda(module) for module in (speaker_encoder, speaker_decoder)]

    return Seq2SeqSpeaker(env, "", args, speaker_encoder, speaker_decoder,
                          tok)
def __init__(self, args, paths, states_map, distances, state_embedding,
             loc_embeddings, adj_dict):
    ''' Store the navigation data and create the padding tensors.

    `adj_dict` is forwarded to `EnvBatch`; the remaining data arguments are
    kept on the instance unchanged. `args` supplies `action_embed_size` and
    `num_views`, which size the zero padding tensors.
    '''
    # Environment wrapper built from the adjacency dictionary.
    self.env = EnvBatch(adj_dict=adj_dict)

    # Fixed hyper-parameters.
    self.margin = 3.0
    self.shrink = 10  # shrink distance 10 times

    # Caller-provided data, stored as-is.
    self.paths, self.states_map, self.distances = \
        paths, states_map, distances
    self.state_embedding, self.loc_embeddings = \
        state_embedding, loc_embeddings

    # All-zero tensors used to pad actions / features.
    # NOTE(review): the feature padding is sized with action_embed_size,
    # not a separate feature size - confirm this is intended.
    embed_dim = args.action_embed_size
    self.padding_action = try_cuda(torch.zeros(embed_dim))
    self.padding_feature = try_cuda(torch.zeros(args.num_views, embed_dim))
def _action_variable(self, obs):
    ''' Get the available action embeddings for the agent to select.

    Returns a pair:
      * the per-observation 'action_embedding' entries padded to a common
        length with `pad_sequence(..., batch_first=True)`;
      * a byte mask of shape (len(obs), max number of adjacent locations)
        that is 1 at the padded positions (index >= that observation's
        number of adjacent locations) and 0 elsewhere.
    '''
    candidate_counts = [len(ob['adj_loc_list']) for ob in obs]

    # 1 marks a slot beyond the observation's real candidates.
    padding_mask = np.zeros((len(obs), max(candidate_counts)), np.float32)
    for row, count in enumerate(candidate_counts):
        padding_mask[row, count:] = 1

    padded_embeddings = pad_sequence(
        [ob['action_embedding'] for ob in obs], batch_first=True)
    byte_mask = try_cuda(torch.from_numpy(padding_mask).byte())
    return padded_embeddings, byte_mask
def make_follower_models(args, vocab_size, all_val_data, env):
    ''' Build the follower agent and its evaluator.

    The instruction encoder is an `EncoderLSTM` that receives the array
    loaded from GLOVE_PATH as `glove`. The decoder is a
    `CogroundDecoderLSTM` when `args.coground` is truthy and an
    `AttnDecoderLSTM` otherwise; the latter gets `lstm_mem=True` when
    `args.exp_forget` is negative. Optimizers come from `reset_optimizer`.

    Returns (agent, evaluator).
    '''
    glove_vectors = np.load(GLOVE_PATH)
    encoder = EncoderLSTM(vocab_size, args.wemb, args.hidden_size,
                          VOCAB_PAD_IDX, args.dropout, glove=glove_vectors)

    if not args.coground:
        decoder = AttnDecoderLSTM(args.action_embed_size, args.hidden_size,
                                  args.dropout, args.feature_size,
                                  history=args.history,
                                  lstm_mem=args.exp_forget < 0)
    else:
        decoder = CogroundDecoderLSTM(args.action_embed_size,
                                      args.hidden_size, args.dropout,
                                      args.feature_size, args.max_ins_len,
                                      history=args.history)

    encoder, decoder = try_cuda(encoder), try_cuda(decoder)
    optimizers = reset_optimizer(args, encoder, decoder)
    encoder_optimizer, decoder_optimizer = optimizers

    follower = Seq2SeqFollower(env, "", args, encoder, decoder,
                               encoder_optimizer, decoder_optimizer)
    return follower, FollowerEvaluation(env, all_val_data)
def batch_instructions_from_encoded(encoded_instructions, max_length,
                                    reverse=False, cut=True):
    ''' Pad a list of encoded instructions into one batch tensor.

    Each instruction has a trailing EOS removed (if present), is optionally
    reversed, gets a single EOS appended and is then truncated to
    `max_length` tokens (so an over-long instruction loses its EOS).
    Unused positions hold VOCAB_PAD_IDX.

    With `cut` set, the tensor and mask are trimmed to the longest
    instruction in the batch; otherwise they keep `max_length` columns.

    Returns (long token tensor, byte mask that is 1 at PAD positions,
    list of per-instruction lengths).
    '''
    padded = np.full((len(encoded_instructions), max_length), VOCAB_PAD_IDX)
    seq_lengths = []

    for row, tokens in enumerate(encoded_instructions):
        ends_with_eos = len(tokens) > 0 and tokens[-1] == VOCAB_EOS_IDX
        if ends_with_eos:
            tokens = tokens[:-1]
        if reverse:
            tokens = tokens[::-1]
        # Re-attach exactly one EOS, then clip to the allowed width.
        clipped = np.concatenate((tokens, [VOCAB_EOS_IDX]))[:max_length]
        padded[row, :len(clipped)] = clipped
        seq_lengths.append(len(clipped))

    seq_tensor = torch.from_numpy(padded)
    if cut:
        seq_tensor = seq_tensor[:, :max(seq_lengths)]
    pad_mask = seq_tensor == VOCAB_PAD_IDX

    return try_cuda(seq_tensor.long()), try_cuda(pad_mask.byte()), seq_lengths
def _build_action_embedding(self, adj_loc_list, feature):
    ''' Build one embedding row per entry of `adj_loc_list`.

    Each row is the `feature` row selected by the entry's 'absViewIndex'
    concatenated with a 128-wide angle encoding: four 32-wide bands holding
    sin/cos of 'rel_heading' followed by sin/cos of 'rel_elevation'.
    Row 0 is special-cased: both its feature part and its angle part are
    all zeros.
    '''
    view_indices = [adj['absViewIndex'] for adj in adj_loc_list]
    feature_adj = feature[view_indices]
    feature_adj[0] = 0

    angles = np.zeros((len(adj_loc_list), 128), np.float32)
    # Row 0 keeps its zero angle encoding, so start filling from row 1.
    for row, adj in enumerate(adj_loc_list[1:], start=1):
        heading = adj['rel_heading']
        elevation = adj['rel_elevation']
        angles[row, 0:32] = np.sin(heading)
        angles[row, 32:64] = np.cos(heading)
        angles[row, 64:96] = np.sin(elevation)
        angles[row, 96:] = np.cos(elevation)

    angle_embed = torch.from_numpy(angles).float()
    combined = torch.cat((feature_adj, angle_embed), dim=-1)
    return try_cuda(combined)
def batch_observations_and_actions(path_obs, path_actions, padding_feature,
                                   padding_action):
    ''' Stack per-path observations and chosen actions into padded batches.

    For every path, the first entry of each observation's 'feature' and the
    'action_embedding' row of the action taken at that step are collected,
    then right-padded with `padding_feature` / `padding_action` up to the
    longest path in the batch.

    Returns (image features, action embeddings, byte mask, seq_lengths),
    where the mask is 0 on real steps and 1 on padding, and `seq_lengths`
    is a numpy array of the per-path action counts.
    '''
    batch_size = len(path_obs)
    seq_lengths = np.array([len(actions) for actions in path_actions])
    max_path_length = seq_lengths.max()
    mask = np.ones((batch_size, max_path_length), np.uint8)

    feature_rows = []
    action_rows = []
    for row, observations in enumerate(path_obs):
        actions = path_actions[row]
        assert len(observations) == len(actions)
        num_steps = len(actions)
        num_pad = max_path_length - num_steps
        mask[row, :num_steps] = 0

        step_features = [ob['feature'][0] for ob in observations]
        step_actions = [ob['action_embedding'][actions[step]]
                        for step, ob in enumerate(observations)]
        step_features += [padding_feature] * num_pad
        step_actions += [padding_action] * num_pad

        feature_rows.append(torch.stack(step_features, dim=0))
        action_rows.append(torch.stack(step_actions, dim=0))

    batched_image_features = torch.stack(feature_rows, dim=0)
    batched_action_embeddings = torch.stack(action_rows, dim=0)
    mask = try_cuda(torch.from_numpy(mask).byte())
    return batched_image_features, batched_action_embeddings, mask, \
        seq_lengths
def _rollout(self, batch, feedback, reward_flag=False, history=False, exp_forget=0.5):
    ''' Roll the agent out on a batch and accumulate the training loss.

    Args:
        batch: list of items; each provides 'instr_encoding' (plus whatever
            `self.env.reset` and, with `history`, `_make_history_context`
            read from it).
        feedback: how the next action is chosen at each step - 'teacher'
            (the teacher action), 'argmax' (highest logit) or 'sample'
            (drawn from the softmax distribution).
        reward_flag: when true, the per-step negated action score is
            appended to each trajectory's 'scores' list.
        history: when true, a history context is computed and passed to the
            decoder; otherwise the decoder receives None.
        exp_forget: forwarded to `_make_history_context` as `decay`.

    Returns:
        (traj, loss / count_valid): `traj` is a list with one dict per
        item holding 'instr_id', 'scores', 'heading', 'trajectory',
        'trajectory_radians', 'reward' and a final 'score'; the loss is the
        sum of `self._criterion` over steps divided by the number of
        (item, step) pairs that were not yet done.

    Raises:
        ValueError: if the logits contain NaN or `feedback` is not one of
            the three supported options.
    '''
    batch_size = len(batch)
    # Embed history memory
    if history:
        history_context = self._make_history_context(batch, decay=exp_forget)
    else:
        history_context = None
    # Batch instructions
    seq, seq_mask, seq_lengths = \
        batch_instructions_from_encoded([b['instr_encoding'] for b in batch],
                                        self.max_instruction_length,
                                        reverse=self.reverse_instruction)
    # Reset environment
    done = np.zeros(batch_size, dtype=np.uint8)
    obs = self.env.reset(batch)
    # Record starting point
    loss = 0
    count_valid = 0
    traj = [{
        'instr_id': ob['instr_id'],
        'scores': [],
        'heading': ob['heading'],
        'trajectory': [ob['viewpoint']],
        'trajectory_radians': [(ob['heading'], ob['elevation'])],
        'reward': [],
    } for ob in obs]
    # Init text embed and action
    ctx, h_t, c_t = self.encoder(seq, seq_lengths)
    # The first "previous action" is the padding action, repeated per item.
    u_t_prev = self.env.padding_action.expand(batch_size, -1)
    # Do a sequence rollout and calculate the loss
    sequence_scores = try_cuda(torch.zeros(batch_size))
    for t in range(self.episode_len):
        f_t = self._feature_variables(obs)
        all_u_t, action_mask = self._action_variable(obs)
        h_t, c_t, alpha, logit, alpha_v = \
            self.decoder(u_t_prev, all_u_t, f_t, h_t, c_t, ctx,
                         ctx_mask=seq_mask, history_context=history_context)
        # Supervised training
        target = self._teacher_action(obs, done)
        # Positions flagged by the action mask can never be selected.
        logit[action_mask] = -float('inf')
        if torch.isnan(logit).sum():
            raise ValueError("Error! network produce nan result!")
        # Determine next model inputs
        if feedback == 'teacher':
            # Negative targets are clamped to action 0 so they can be used
            # as an index below.
            a_t = torch.clamp(target, min=0)
        elif feedback == 'argmax':
            _, a_t = logit.max(1)
            a_t = a_t.detach()
        elif feedback == 'sample':
            probs = F.softmax(logit, dim=1)
            m = D.Categorical(probs)
            a_t = m.sample()
        else:
            raise ValueError("Error! Invalid feedback option!")
        # Valid count (not done yet trajectories)
        count_valid += len(obs) - done.sum()
        # Update the previous action
        u_t_prev = all_u_t[np.arange(batch_size), a_t, :].detach()
        # Negated cross entropy, i.e. the log-probability of the chosen
        # action; zeroed for items that were already done.
        action_scores = -F.cross_entropy(
            logit, a_t.clone(), ignore_index=-1, reduction='none')
        action_scores[done] = 0
        sequence_scores += action_scores
        # Calculate loss
        loss += self._criterion(logit, target)
        # Make environment action
        # Items that are already done are forced to take action 0.
        a_t[done] = 0
        obs, next_done = self.env.step(obs, a_t.tolist())
        # Save trajectory output
        # `done` still holds the flags from before this step here, so the
        # step on which an item finishes is recorded as well.
        for i, ob in enumerate(obs):
            if not done[i]:
                if reward_flag:
                    traj[i]['scores'].append(-action_scores[i])
                traj[i]['trajectory'].append(ob['viewpoint'])
                traj[i]['trajectory_radians'].append(
                    (ob['heading'], ob['elevation']))
        # Early exit if all ended
        done = next_done
        if done.all():
            break
    # Final score: accumulated action score divided by the number of
    # recorded viewpoints (which includes the starting one).
    for i, ob in enumerate(obs):
        traj[i]['score'] = sequence_scores[i].item() / len(
            traj[i]['trajectory'])
    return traj, loss / count_valid
def _teacher_action(self, obs, ended):
    ''' Collect the teacher actions of a batch into one long tensor.

    Entry i is `obs[i]['teacher_action']`, or -1 when `ended[i]` is truthy.
    The tensor is passed through `try_cuda` before being returned.
    '''
    targets = torch.LongTensor(len(obs))
    for idx, ob in enumerate(obs):
        if ended[idx]:
            # Finished items get the -1 marker instead of a real action.
            targets[idx] = -1
        else:
            targets[idx] = ob['teacher_action']
    return try_cuda(targets)
def _build_feature_embedding(self, view_index, feature):
    ''' Append the location embedding of `view_index` to `feature`.

    The entry `self.loc_embeddings[view_index]` is concatenated after
    `feature` along the last dimension; the result goes through `try_cuda`.
    '''
    parts = (feature, self.loc_embeddings[view_index])
    return try_cuda(torch.cat(parts, dim=-1))
def _score_obs_actions_and_instructions(self, path_obs, path_actions,
                                        encoded_instructions, feedback,
                                        train):
    ''' Score (or generate) instructions for a batch of paths.

    The paths are encoded with `self.encoder`; `self.decoder` then predicts
    instruction words conditioned on that encoding.

    Args:
        path_obs: per-path lists of observations; the first observation of
            each path supplies 'instr_id'.
        path_actions: per-path lists of action indices, same lengths as
            `path_obs`.
        encoded_instructions: reference instructions, batched to
            `self.max_instruction_length` tokens.
        feedback: only used when `train` is false - 'teacher' feeds the
            reference word, 'argmax' the most likely word, 'sample' a word
            drawn from the softmax distribution.
        train: when true, the whole reference sequence is decoded in one
            call and only the cross-entropy loss is computed.

    Returns:
        (outputs, loss). `outputs` holds one dict per path with 'instr_id',
        'word_indices' and 'scores'. When `train` is false each dict also
        gets 'score' (summed word log-probability up to and including EOS,
        divided by the number of emitted words) and 'words'. In train mode
        the two lists stay empty and those keys are not set.

    Raises:
        ValueError: if the decoder produces NaN logits or `feedback` is not
            a supported option (step-wise mode only).
    '''
    batch_size = len(path_obs)
    instr_seq, _, _ = \
        batch_instructions_from_encoded(encoded_instructions,
                                        self.max_instruction_length,
                                        cut=False)
    batched_image_features, batched_action_embeddings, path_mask, seq_len = \
        batch_observations_and_actions(path_obs, path_actions,
                                       self.env.padding_feature,
                                       self.env.padding_action)

    ctx = self.encoder(batched_image_features, batched_action_embeddings,
                       seq_len)
    h_t = try_cuda(torch.zeros(batch_size, ctx.size(-1)))
    c_t = try_cuda(torch.zeros(batch_size, ctx.size(-1)))
    ended = np.array([False] * batch_size)

    outputs = [{
        'instr_id': path_obs[i][0]['instr_id'],
        'word_indices': [],
        'scores': []
    } for i in range(batch_size)]

    # Do a sequence rollout and calculate the loss
    loss = 0
    # Every sequence starts from the <BOS> token.
    w_t = try_cuda(
        torch.from_numpy(
            np.full((batch_size, 1), self.tok.vocab_bos_idx,
                    dtype='int64')).long())

    if train:
        # Teacher-forced decoding of the full sequence in a single call.
        w_t = torch.cat([w_t, instr_seq], dim=1)
        logits, _, _ = self.decoder(w_t, ctx, path_mask, h_t, c_t)
        logits = logits.permute(0, 2, 1).contiguous()
        loss = F.cross_entropy(
            input=logits[:, :, :-1],  # drop the last step to align with target
            target=instr_seq,  # reference words (input had <BOS> prepended)
            ignore_index=VOCAB_PAD_IDX)
        # No per-word outputs exist in this mode, so there is nothing to
        # summarise; return straight away.
        return outputs, loss

    sequence_scores = try_cuda(torch.zeros(batch_size))
    for t in range(self.max_instruction_length):
        logit, h_t, c_t = self.decoder(w_t.view(-1, 1), ctx, path_mask,
                                       h_t, c_t)
        logit = logit.squeeze(1)
        # PAD must never be predicted.
        logit[:, VOCAB_PAD_IDX] = -float('inf')
        target = instr_seq[:, t].contiguous()

        if torch.isnan(logit).sum():
            # Raise instead of print + exit(0): exiting with status 0 would
            # report success to the shell.
            raise ValueError("Error: network produce nan result!")

        # Determine next model inputs
        if feedback == 'teacher':
            w_t = target
        elif feedback == 'argmax':
            _, w_t = logit.max(1)
            w_t = w_t.detach()
        elif feedback == 'sample':
            probs = F.softmax(logit, dim=1)
            probs[:, VOCAB_PAD_IDX] = 0
            m = D.Categorical(probs)
            w_t = m.sample()
        else:
            raise ValueError('Invalid feedback option')

        log_probs = F.log_softmax(logit, dim=1)
        word_scores = -F.nll_loss(log_probs, w_t,
                                  ignore_index=VOCAB_PAD_IDX,
                                  reduction='none')
        # Sequences that already emitted EOS keep being decoded until the
        # whole batch is finished; their later words must not leak into
        # the sequence score, which is normalised by the words kept below.
        # `ended` still reflects the state before this step, so the EOS
        # word itself is counted.
        still_active = torch.from_numpy(
            (~ended).astype(np.uint8)).to(word_scores)
        word_scores = word_scores * still_active
        sequence_scores += word_scores
        loss += F.nll_loss(log_probs, target, ignore_index=VOCAB_PAD_IDX)

        for i in range(batch_size):
            if ended[i]:
                continue
            word_idx = w_t[i].item()
            outputs[i]['word_indices'].append(int(word_idx))
            outputs[i]['scores'].append(word_scores[i].item())
            if word_idx == VOCAB_EOS_IDX:
                ended[i] = True

        # Early exit if all ended
        if ended.all():
            break

    for i, item in enumerate(outputs):
        # max(..., 1) guards the division when no word was emitted at all
        # (e.g. a zero-length instruction budget).
        num_words = max(len(item['word_indices']), 1)
        item['score'] = float(sequence_scores[i].item()) / num_words
        item['words'] = self.tok.decode_sentence(item['word_indices'],
                                                 break_on_eos=True,
                                                 join=False)
    return outputs, loss