Example 1
 def _make_history_context(self, batch, decay=0.5):
   ''' Embed the history context for both vision and text; return a list [vision embed, text embed]. '''
   history_lengths = [len(b['history_heading']) for b in batch]
   max_history = max(history_lengths)
   context_list = []
   text_context_list = []
   for hist_count in range(max_history):
     new_batch = [copy.deepcopy(b) for b in batch]
     zero_list = []
     for i, b in enumerate(new_batch):
       if len(b['history_heading']) > hist_count:
         b['heading'] = b['history_heading'][hist_count]
         b['path'] = b['history_path'][hist_count]
         b['instr_encoding'] = b['history_instr_encoding'][hist_count]
       else:
         b['path'] = [b['path'][0]]
         b['instr_encoding'] = np.array([VOCAB_EOS_IDX])
         zero_list.append(i)
     path_obs, path_actions, encoded_instructions = \
       self.env.gold_obs_actions_and_instructions(new_batch)
     batched_image_features, batched_action_embeddings, _, seq_lengths = \
       batch_observations_and_actions(path_obs, path_actions,
                                      self.env.padding_feature,
                                      self.env.padding_action)
     seq_lengths[zero_list] = 0
     context = self.decoder(batched_image_features, batched_action_embeddings,
                            seq_lengths, context=True)
     context_list.append(context)
     max_len = max([len(ins) for ins in encoded_instructions])
     batched_ins, _, ins_lengths \
       = batch_instructions_from_encoded(encoded_instructions, max_len + 2,
                                         cut=False)
     text_context = self.encoder(batched_ins, ins_lengths, context=True)
     text_context_list.append(text_context)
   
   context_list = torch.stack(context_list, dim=1) if context_list else []
   text_context_list = \
     torch.stack(text_context_list, dim=1) if text_context_list else []
   if decay < 0:  # negative decay: use LSTM memory
     context = self.decoder.context_lstm(context_list, history_lengths)
     text_context = self.decoder.text_context_lstm(text_context_list,
                                                   history_lengths)
   else:  # non-negative decay: use exponential forgetting
     if len(context_list) > 0:
       exp_weight = np.zeros((len(history_lengths), max_history))
       for i, h in enumerate(history_lengths):
         exp_weight[i][:h] = [np.exp(-x * decay) for x in range(h)][::-1]
       exp_weight = F.normalize(try_cuda(
         torch.from_numpy(exp_weight)).float(), p=1, dim=1).unsqueeze(-1)
       context = (context_list * exp_weight).sum(dim=1)
       text_context = (text_context_list * exp_weight).sum(dim=1)
     else:
       context = try_cuda(torch.zeros(len(history_lengths), self.hidden_size))
       text_context = try_cuda(torch.zeros(len(history_lengths),
                                           self.hidden_size))
   return [context, text_context]
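
A minimal, self-contained sketch of the exponential-forgetting weights built above, assuming decay=0.5 and hypothetical history lengths [2, 3]; each row is L1-normalized so per-episode contexts are averaged with more weight on recent episodes:

import numpy as np
import torch
import torch.nn.functional as F

decay = 0.5
history_lengths = [2, 3]              # hypothetical per-example history sizes
max_history = max(history_lengths)

exp_weight = np.zeros((len(history_lengths), max_history))
for i, h in enumerate(history_lengths):
  # oldest episode gets exp(-(h-1)*decay), the most recent gets exp(0) = 1
  exp_weight[i][:h] = [np.exp(-x * decay) for x in range(h)][::-1]

exp_weight = F.normalize(torch.from_numpy(exp_weight).float(), p=1, dim=1)
print(exp_weight)
# tensor([[0.3775, 0.6225, 0.0000],
#         [0.1863, 0.3072, 0.5065]])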
Example 2
def make_speaker_models(args, vocab_size, env, tok):
  glove = np.load(GLOVE_PATH)
  encoder = SpeakerEncoderLSTM(args.feature_size, args.hidden_size,
                               args.dropout)
  decoder = SpeakerDecoderLSTM(vocab_size, args.wemb, VOCAB_PAD_IDX,
                               args.hidden_size, args.dropout, glove=glove)
  encoder = try_cuda(encoder)
  decoder = try_cuda(decoder)
  
  agent = Seq2SeqSpeaker(env, "", args, encoder, decoder, tok)
  return agent
Example 3
 def __init__(self, args, paths, states_map, distances, state_embedding,
              loc_embeddings, adj_dict):
     self.env = EnvBatch(adj_dict=adj_dict)
     self.margin = 3.0
     self.paths = paths
     self.states_map = states_map
     self.distances = distances
     self.state_embedding = state_embedding
     self.loc_embeddings = loc_embeddings
     self.padding_action = try_cuda(torch.zeros(args.action_embed_size))
     self.padding_feature = try_cuda(
         torch.zeros(args.num_views, args.action_embed_size))
      self.shrink = 10  # scale distances down by a factor of 10
Example 4
 def _action_variable(self, obs):
      ''' Get the available action embeddings for the agent to select from. '''
     max_num_a = max([len(ob['adj_loc_list']) for ob in obs])
     is_valid = np.zeros((len(obs), max_num_a), np.float32)
     action_embeddings = []
     for i, ob in enumerate(obs):
         is_valid[i, len(ob['adj_loc_list']):] = 1
         action_embeddings.append(ob['action_embedding'])
      # padded action embeddings plus a mask where 1 marks an invalid slot
     return pad_sequence(action_embeddings, batch_first=True), \
            try_cuda(torch.from_numpy(is_valid).byte())
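
A minimal sketch of the mask convention returned above, with hypothetical action counts; 1 marks a padded slot, which the follower later disables via logit[action_mask] = -inf:

import numpy as np
import torch

num_actions = [2, 3]                  # hypothetical len(ob['adj_loc_list'])
max_num_a = max(num_actions)

is_valid = np.zeros((len(num_actions), max_num_a), np.float32)
for i, n in enumerate(num_actions):
  is_valid[i, n:] = 1                 # 1 = padded, non-selectable slot

action_mask = torch.from_numpy(is_valid).bool()  # the repo keeps .byte() for older PyTorch
logit = torch.randn(len(num_actions), max_num_a)
logit[action_mask] = -float('inf')    # padded actions can never win argmax/sampling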
Example 5
def make_follower_models(args, vocab_size, all_val_data, env):
  glove = np.load(GLOVE_PATH)
  encoder = EncoderLSTM(vocab_size, args.wemb, args.hidden_size, VOCAB_PAD_IDX,
                        args.dropout, glove=glove)
  if args.coground:
    decoder = CogroundDecoderLSTM(args.action_embed_size, args.hidden_size,
                                  args.dropout, args.feature_size,
                                  args.max_ins_len, history=args.history)
  else:
    decoder = AttnDecoderLSTM(args.action_embed_size, args.hidden_size,
                              args.dropout, args.feature_size,
                              history=args.history,
                              lstm_mem=args.exp_forget < 0)
  
  encoder = try_cuda(encoder)
  decoder = try_cuda(decoder)
  encoder_optimizer, decoder_optimizer = \
    reset_optimizer(args, encoder, decoder)
  
  agent = Seq2SeqFollower(env, "", args, encoder, decoder, encoder_optimizer,
                          decoder_optimizer)
  evaluator = FollowerEvaluation(env, all_val_data)
  return agent, evaluator
Example 6
def batch_instructions_from_encoded(encoded_instructions, max_length,
                                    reverse=False, cut=True):
  num_instructions = len(encoded_instructions)
  seq_tensor = np.full((num_instructions, max_length), VOCAB_PAD_IDX)
  seq_lengths = []
  for i, inst in enumerate(encoded_instructions):
    if len(inst) > 0 and inst[-1] == VOCAB_EOS_IDX:
      inst = inst[:-1]
    if reverse:
      inst = inst[::-1]
    inst = np.concatenate((inst, [VOCAB_EOS_IDX]))
    inst = inst[:max_length]
    seq_tensor[i, :len(inst)] = inst
    seq_lengths.append(len(inst))
  
  seq_tensor = torch.from_numpy(seq_tensor)
  if cut:  # trim trailing all-padding columns
    seq_tensor = seq_tensor[:, :max(seq_lengths)]
  mask = (seq_tensor == VOCAB_PAD_IDX)
  
  return try_cuda(seq_tensor.long()), try_cuda(mask.byte()), seq_lengths
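
A standalone walk-through of the padding logic above (the vocab indices here are assumptions for illustration): any existing <EOS> is stripped, a fresh one is appended, and the result is truncated to max_length and padded:

import numpy as np

VOCAB_PAD_IDX, VOCAB_EOS_IDX = 0, 2   # assumed indices for illustration

encoded = [np.array([5, 6, 7, VOCAB_EOS_IDX]), np.array([8, 9])]
max_length = 5

seq = np.full((len(encoded), max_length), VOCAB_PAD_IDX)
lengths = []
for i, inst in enumerate(encoded):
  if len(inst) > 0 and inst[-1] == VOCAB_EOS_IDX:
    inst = inst[:-1]                  # strip an existing <EOS>
  inst = np.concatenate((inst, [VOCAB_EOS_IDX]))[:max_length]
  seq[i, :len(inst)] = inst
  lengths.append(len(inst))

print(seq)      # [[5 6 7 2 0]
                #  [8 9 2 0 0]]
print(lengths)  # [4, 3]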
Example 7
 def _build_action_embedding(self, adj_loc_list, feature):
   feature_adj = feature[[adj_dict['absViewIndex']
                          for adj_dict in adj_loc_list]]
   feature_adj[0] = 0  # row 0 is the stop action: zero its visual feature
   embedding = np.zeros((len(adj_loc_list), 128), np.float32)
   for a, adj_dict in enumerate(adj_loc_list):
     if a == 0:  # stop action keeps a zero angle embedding
       continue
     rel_heading = adj_dict['rel_heading']
     rel_elevation = adj_dict['rel_elevation']
     embedding[a][0:32] = np.sin(rel_heading)
     embedding[a][32:64] = np.cos(rel_heading)
     embedding[a][64:96] = np.sin(rel_elevation)
     embedding[a][96:] = np.cos(rel_elevation)
   angle_embed = torch.from_numpy(embedding).float()
   return try_cuda(torch.cat((feature_adj, angle_embed), dim=-1))
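
Design note: row 0 of adj_loc_list is the stop action, so both its visual feature and its angle rows are left at zero; every other row repeats sin/cos of the relative heading and elevation across four 32-dim blocks and is concatenated with the view feature taken at absViewIndex.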
Example 8
def batch_observations_and_actions(path_obs, path_actions, padding_feature,
                                   padding_action):
  batch_size = len(path_obs)
  seq_lengths = np.array([len(a) for a in path_actions])
  max_path_length = seq_lengths.max()
  mask = np.ones((batch_size, max_path_length), np.uint8)
  image_features = [[] for _ in range(batch_size)]
  action_embeddings = [[] for _ in range(batch_size)]
  for i in range(batch_size):
    assert len(path_obs[i]) == len(path_actions[i])
    mask[i, :len(path_actions[i])] = 0
    image_features[i] = [ob['feature'][0] for ob in path_obs[i]]
    action_embeddings[i] = [ob['action_embedding'][path_actions[i][j]]
                            for j, ob in enumerate(path_obs[i])]
    image_features[i].extend([padding_feature]
                             * (max_path_length - len(path_actions[i])))
    action_embeddings[i].extend([padding_action]
                                * (max_path_length - len(path_actions[i])))
    image_features[i] = torch.stack(image_features[i], dim=0)
    action_embeddings[i] = torch.stack(action_embeddings[i], dim=0)
  batched_image_features = torch.stack(image_features, dim=0)
  batched_action_embeddings = torch.stack(action_embeddings, dim=0)
  mask = try_cuda(torch.from_numpy(mask).byte())
  return batched_image_features, batched_action_embeddings, mask, seq_lengths
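
Note that this mask uses the opposite convention from _action_variable: here 0 marks a real timestep and 1 marks padding. A minimal sketch with hypothetical lengths:

import numpy as np
import torch

seq_lengths = np.array([3, 1])        # hypothetical len(path_actions[i])
max_path_length = seq_lengths.max()

mask = np.ones((len(seq_lengths), max_path_length), np.uint8)
for i, n in enumerate(seq_lengths):
  mask[i, :n] = 0                     # 0 = real step, 1 = padded step

print(mask)                           # [[0 0 0]
                                      #  [0 1 1]]
mask = torch.from_numpy(mask).byte()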
Example 9
    def _rollout(self,
                 batch,
                 feedback,
                 reward_flag=False,
                 history=False,
                 exp_forget=0.5):
        batch_size = len(batch)

        # Embed history memory
        if history:
            history_context = self._make_history_context(batch,
                                                         decay=exp_forget)
        else:
            history_context = None

        # Batch instructions
        seq, seq_mask, seq_lengths = \
          batch_instructions_from_encoded([b['instr_encoding'] for b in batch],
                                          self.max_instruction_length,
                                          reverse=self.reverse_instruction)

        # Reset environment
        done = np.zeros(batch_size, dtype=np.uint8)
        obs = self.env.reset(batch)

        # Record starting point
        loss = 0
        count_valid = 0
        traj = [{
            'instr_id': ob['instr_id'],
            'scores': [],
            'heading': ob['heading'],
            'trajectory': [ob['viewpoint']],
            'trajectory_radians': [(ob['heading'], ob['elevation'])],
            'reward': [],
        } for ob in obs]

        # Init text embed and action
        ctx, h_t, c_t = self.encoder(seq, seq_lengths)
        u_t_prev = self.env.padding_action.expand(batch_size, -1)

        # Do a sequence rollout and calculate the loss
        sequence_scores = try_cuda(torch.zeros(batch_size))
        for t in range(self.episode_len):
            f_t = self._feature_variables(obs)
            all_u_t, action_mask = self._action_variable(obs)
            h_t, c_t, alpha, logit, alpha_v = \
              self.decoder(u_t_prev, all_u_t, f_t, h_t, c_t, ctx,
                           ctx_mask=seq_mask, history_context=history_context)

            # Supervised training
            target = self._teacher_action(obs, done)

            logit[action_mask] = -float('inf')
            if torch.isnan(logit).sum():
                raise ValueError("Error! network produce nan result!")

            # Determine next model inputs
            if feedback == 'teacher':
                a_t = torch.clamp(target, min=0)
            elif feedback == 'argmax':
                _, a_t = logit.max(1)
                a_t = a_t.detach()
            elif feedback == 'sample':
                probs = F.softmax(logit, dim=1)
                m = D.Categorical(probs)
                a_t = m.sample()
            else:
                raise ValueError("Error! Invalid feedback option!")

            # Count steps of trajectories that are not yet done
            count_valid += len(obs) - done.sum()

            # Update the previous action
            u_t_prev = all_u_t[np.arange(batch_size), a_t, :].detach()
            action_scores = -F.cross_entropy(
                logit, a_t.clone(), ignore_index=-1, reduction='none')
            action_scores[done] = 0
            sequence_scores += action_scores

            # Calculate loss
            loss += self._criterion(logit, target)

            # Make environment action
            a_t[done] = 0
            obs, next_done = self.env.step(obs, a_t.tolist())

            # Save trajectory output
            for i, ob in enumerate(obs):
                if not done[i]:
                    if reward_flag:
                        traj[i]['scores'].append(-action_scores[i])
                    traj[i]['trajectory'].append(ob['viewpoint'])
                    traj[i]['trajectory_radians'].append(
                        (ob['heading'], ob['elevation']))

            # Early exit if all ended
            done = next_done
            if done.all():
                break

        for i, ob in enumerate(obs):
            traj[i]['score'] = sequence_scores[i].item() / len(
                traj[i]['trajectory'])
        return traj, loss / count_valid
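
A minimal sketch of the gather idiom all_u_t[np.arange(batch_size), a_t, :] used above to pull each example's chosen action embedding (hypothetical shapes):

import torch

batch_size, num_actions, embed_size = 2, 3, 4
all_u_t = torch.randn(batch_size, num_actions, embed_size)
a_t = torch.tensor([2, 0])            # chosen action index per example

# Row i selects all_u_t[i, a_t[i], :]; np.arange indexes the same way
u_t_prev = all_u_t[torch.arange(batch_size), a_t, :].detach()
assert u_t_prev.shape == (batch_size, embed_size)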
Example 10
 def _teacher_action(self, obs, ended):
      ''' Extract teacher actions into a tensor; -1 marks already-ended episodes. '''
     a = torch.LongTensor(len(obs))
     for i, ob in enumerate(obs):
         a[i] = ob['teacher_action'] if not ended[i] else -1
     return try_cuda(a)
Example 11
 def _build_feature_embedding(self, view_index, feature):
     angle_embed = self.loc_embeddings[view_index]
     return try_cuda(torch.cat((feature, angle_embed), dim=-1))
Example 12
    def _score_obs_actions_and_instructions(self, path_obs, path_actions,
                                            encoded_instructions, feedback,
                                            train):
        batch_size = len(path_obs)
        instr_seq, _, _ = \
          batch_instructions_from_encoded(encoded_instructions,
                                          self.max_instruction_length, cut=False)
        batched_image_features, batched_action_embeddings, path_mask, seq_len = \
          batch_observations_and_actions(path_obs, path_actions,
                                         self.env.padding_feature,
                                         self.env.padding_action)

        ctx = self.encoder(batched_image_features, batched_action_embeddings,
                           seq_len)
        h_t = try_cuda(torch.zeros(batch_size, ctx.size(-1)))
        c_t = try_cuda(torch.zeros(batch_size, ctx.size(-1)))
        ended = np.array([False] * batch_size)

        outputs = [{
            'instr_id': path_obs[i][0]['instr_id'],
            'word_indices': [],
            'scores': []
        } for i in range(batch_size)]

        # Do a sequence rollout and calculate the loss
        loss = 0
        w_t = try_cuda(
            torch.from_numpy(
                np.full((batch_size, 1), self.tok.vocab_bos_idx,
                        dtype='int64')).long())

        if train:
            w_t = torch.cat([w_t, instr_seq], dim=1)
            logits, _, _ = self.decoder(w_t, ctx, path_mask, h_t, c_t)
            logits = logits.permute(0, 2, 1).contiguous()
            loss = F.cross_entropy(
                input=logits[:, :, :-1],  # drop the last step so logit t predicts instr_seq[:, t]
                target=instr_seq,  # <BOS> was prepended to the input, not the target
                ignore_index=VOCAB_PAD_IDX)
        else:
            sequence_scores = try_cuda(torch.zeros(batch_size))
            for t in range(self.max_instruction_length):
                logit, h_t, c_t = self.decoder(w_t.view(-1, 1), ctx, path_mask,
                                               h_t, c_t)
                logit = logit.squeeze(1)

                logit[:, VOCAB_PAD_IDX] = -float('inf')
                target = instr_seq[:, t].contiguous()

                if torch.isnan(logit).sum():
                    raise ValueError("Error: network produced a NaN result!")

                # Determine next model inputs
                if feedback == 'teacher':
                    w_t = target
                elif feedback == 'argmax':
                    _, w_t = logit.max(1)
                    w_t = w_t.detach()
                elif feedback == 'sample':
                    probs = F.softmax(logit, dim=1)
                    probs[:, VOCAB_PAD_IDX] = 0
                    m = D.Categorical(probs)
                    w_t = m.sample()
                else:
                    raise ValueError('Invalid feedback option: %s' % feedback)

                log_probs = F.log_softmax(logit, dim=1)
                word_scores = -F.nll_loss(log_probs,
                                          w_t,
                                          ignore_index=VOCAB_PAD_IDX,
                                          reduction='none')
                sequence_scores += word_scores
                loss += F.nll_loss(log_probs,
                                   target,
                                   ignore_index=VOCAB_PAD_IDX)

                for i in range(batch_size):
                    word_idx = w_t[i].item()
                    if not ended[i]:
                        outputs[i]['word_indices'].append(int(word_idx))
                        outputs[i]['scores'].append(word_scores[i].item())
                    if word_idx == VOCAB_EOS_IDX:
                        ended[i] = True

                # Early exit if all ended
                if ended.all():
                    break

            for i, item in enumerate(outputs):
                item['score'] = float(sequence_scores[i].item()) / len(
                    item['word_indices'])
                item['words'] = self.tok.decode_sentence(item['word_indices'],
                                                         break_on_eos=True,
                                                         join=False)

        return outputs, loss
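
A minimal sketch of the teacher-forcing alignment in the train branch above (hypothetical sizes; the <BOS> index is an assumption): prepending <BOS> to the decoder input and dropping the last logit makes position t predict instr_seq[:, t]:

import torch
import torch.nn.functional as F

batch_size, vocab_size, length = 2, 10, 4
bos_idx = 1                           # assumed <BOS> index for illustration

instr_seq = torch.randint(2, vocab_size, (batch_size, length))   # targets
bos = torch.full((batch_size, 1), bos_idx, dtype=torch.long)
w_t = torch.cat([bos, instr_seq], dim=1)                         # decoder input

# One logit per input position, already permuted to (batch, vocab, time)
logits = torch.randn(batch_size, vocab_size, length + 1)

# Drop the final step; the logit at position t (which saw tokens <= t)
# is scored against instr_seq[:, t]
loss = F.cross_entropy(logits[:, :, :-1], instr_seq)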