Example 1
# Standard-library imports used by this snippet; `Evaluation` is assumed to be
# provided by the project's own evaluation module.
import json
from collections import defaultdict


def get_scores(output_file, split):
    output_ids = []

    eval = Evaluation([split], 'lstm')
    eval.scores = defaultdict(list)
    instr_ids = set(eval.instr_ids)
    with open(output_file) as f:
        for item in json.load(f):
            if item['instr_id'] in instr_ids:
                output_ids.append(item['instr_id'])
                instr_ids.remove(item['instr_id'])
                eval._score_item(item['instr_id'], item['trajectory'])

    return output_ids, eval.scores
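
As a minimal usage sketch (the file path, split name, and printed summary below are placeholders, not part of the original code), the helper can be pointed at a results JSON produced by an agent's test run:

# Hypothetical call: adjust the path and split to the actual experiment output.
scored_ids, scores = get_scores('results/val_seen_results.json', 'val_seen')
print('scored %d trajectories' % len(scored_ids))
# `scores` is the evaluator's defaultdict(list); which keys it holds depends on
# what Evaluation._score_item records in this project.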
Example 2
# Third-party and standard-library imports used by this agent. The project-specific
# pieces (BaseAgent, Evaluation, EncoderLSTM, EncoderHistory, A2CAgent and the
# padding_idx constant) are assumed to be importable from the surrounding codebase.
from collections import namedtuple

import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical


class ActorCriticAgent(BaseAgent):

    model_actions = ['left', 'right', 'up', 'down', 'forward', '<end>', '<start>', '<ignore>']
    env_actions = [
        (0,-1, 0), # left
        (0, 1, 0), # right
        (0, 0, 1), # up
        (0, 0,-1), # down
        (1, 0, 0), # forward
        (0, 0, 0), # <end>
        (0, 0, 0), # <start>
        (0, 0, 0)  # <ignore>
    ]

    SavedAction = namedtuple('SavedAction', ['log_prob', 'value', 'step'])
    eps = np.finfo(np.float32).eps.item()

    def __init__(self, env, vocab_size, results_path, batch_size, episode_len=20):
        super(ActorCriticAgent, self).__init__(env, results_path)

        #For evaluation
        self.ev = Evaluation(['train'])

        #For navigation
        self.episode_len = episode_len
        self.losses = []

        ''' Define instruction encoder '''
        word_embedding_size = 256
        hidden_size = 512
        bidirectional = False
        dropout_ratio = 0.5

        enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
        self.encoder = EncoderLSTM(vocab_size, word_embedding_size, enc_hidden_size, padding_idx, dropout_ratio, bidirectional=bidirectional).cuda()

        context_size = 1024
        self.hist_encoder = EncoderHistory(len(self.model_actions), 32, 2048, context_size).cuda()
        self.a2c_agent = A2CAgent(enc_hidden_size, context_size, len(self.model_actions) - 2).cuda()
        self.saved_actions = []

        params = list(self.encoder.parameters()) + list(self.hist_encoder.parameters()) + list(self.a2c_agent.parameters())
        self.optimizer = torch.optim.Adam(params, lr=0.001, weight_decay=1e-5)


    def _sort_batch(self, obs):
        seq_tensor = np.array([ob['instr_encoding'] for ob in obs])
        seq_lengths = np.argmax(seq_tensor == padding_idx, axis=1)
        seq_lengths[seq_lengths == 0] = seq_tensor.shape[1] # Full length

        seq_tensor = torch.from_numpy(seq_tensor)
        seq_lengths = torch.from_numpy(seq_lengths)

        # Sort sequences by lengths
        seq_lengths, perm_idx = seq_lengths.sort(0, True)
        sorted_tensor = seq_tensor[perm_idx]
        mask = (sorted_tensor == padding_idx)[:,:seq_lengths[0]]

        return Variable(sorted_tensor, requires_grad=False).long().cuda(), \
               mask.byte().cuda(), \
               list(seq_lengths), list(perm_idx)


    def _feature_variable(self, obs):
        feature_size = obs[0]['feature'].shape[0]
        features = np.empty((len(obs),feature_size), dtype=np.float32)
        for i,ob in enumerate(obs):
            features[i,:] = ob['feature']
        return Variable(torch.from_numpy(features), requires_grad=False).cuda()


    def _teacher_action(self, obs, ended):
        a = torch.LongTensor(len(obs))
        for i,ob in enumerate(obs):
            # Supervised teacher only moves one axis at a time
            ix,heading_chg,elevation_chg = ob['teacher']
            if heading_chg > 0:
                a[i] = self.model_actions.index('right')
            elif heading_chg < 0:
                a[i] = self.model_actions.index('left')
            elif elevation_chg > 0:
                a[i] = self.model_actions.index('up')
            elif elevation_chg < 0:
                a[i] = self.model_actions.index('down')
            elif ix > 0:
                a[i] = self.model_actions.index('forward')
            elif ended[i]:
                a[i] = self.model_actions.index('<ignore>')
            else:
                a[i] = self.model_actions.index('<end>')
        return Variable(a, requires_grad=False).cuda()


    def rollout(self, guide_prob):
        #For navigation
        obs = np.array(self.env.reset())
        batch_size = len(obs)

        seq, seq_mask, seq_lengths, perm_idx = self._sort_batch(obs)
        perm_obs = obs[perm_idx]

        traj = [{
            'instr_id': ob['instr_id'],
            'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])]
        } for ob in perm_obs]

        ctx,h_t,c_t = self.encoder(seq, seq_lengths)

        a_t = Variable(torch.ones(batch_size).long() * self.model_actions.index('<start>'), requires_grad=False).cuda()

        ended = np.array([False] * len(obs))
        env_action = [None] * batch_size

        h_n, c_n = self.hist_encoder.init_hidden(batch_size)

        for t in range(self.episode_len):
            f_t = self._feature_variable(perm_obs)

            # Encode the action/feature history, then get policy logits and the critic value.
            enc_data, h_n, c_n = self.hist_encoder(a_t, f_t, h_n, c_n)
            action_prob, critic_value = self.a2c_agent(ctx, seq_lengths, enc_data)

            # With probability guide_prob, follow the teacher demonstration instead of the policy.
            # Note: only index 0 is consulted here (and below), so the agent effectively
            # assumes a batch size of one.
            guided = np.random.choice(2, batch_size, p=[1.0 - guide_prob, guide_prob])

            demo = self._teacher_action(perm_obs, ended)

            if guided[0] == 1:
                a_t = demo
            else:
                # Mask the 'forward' logit when there is nowhere to move forward to.
                if len(perm_obs[0]['navigableLocations']) <= 1:
                    action_prob[0, self.model_actions.index('forward')] = -float('inf')

                action_prob = F.softmax(action_prob, dim=1)

                # Sample an action and record (log-prob, value, step) for the A2C update.
                m = Categorical(action_prob)
                a_t = m.sample()
                if not ended[0]:
                    self.saved_actions.append(self.SavedAction(m.log_prob(a_t), critic_value, t))

            # Map the chosen model actions back to environment actions in the original batch order.
            for i, (idx, ob) in enumerate(zip(perm_idx, perm_obs)):
                action_idx = a_t[i].item()
                if action_idx == self.model_actions.index('<end>'):
                    ended[i] = True
                env_action[idx] = self.env_actions[action_idx]

            obs = np.array(self.env.step(env_action))
            perm_obs = obs[perm_idx]

            for i,ob in enumerate(perm_obs):
                if not ended[i]:
                    traj[i]['path'].append((ob['viewpoint'], ob['heading'], ob['elevation']))

            if ended.all():
                break

        return traj


    def clear_saved_actions(self):
        del self.saved_actions[:]


    def test(self, guide_prob):
        self.encoder.eval()
        self.hist_encoder.eval()
        self.a2c_agent.eval()

        self.env.reset_epoch()
        self.losses = []
        self.results = {}
        # We rely on env showing the entire batch before repeating anything
        #print 'Testing %s' % self.__class__.__name__
        looped = False
        while True:
            for traj in self.rollout(guide_prob):
                if traj['instr_id'] in self.results:
                    looped = True
                else:
                    self.results[traj['instr_id']] = traj['path']
            if looped:
                break

        self.clear_saved_actions()


    def train(self, n_iters, guide_prob):
        self.encoder.train()
        self.hist_encoder.train()
        self.a2c_agent.train()

        policy_losses = []
        value_losses = []
        self.losses = []

        total_num = 0
        success_num = 0
        for iter in range(1, n_iters + 1):
            traj = self.rollout(guide_prob)
            for i, t in enumerate(traj):
                nav_error, oracle_error, trajectory_step, trajectory_length = self.ev._score_item(t['instr_id'], t['path'])
                # Binary reward: success if the final navigation error is below 3.0.
                reward = 1.0 if nav_error < 3.0 else 0.0

                total_num += 1.0
                success_num += reward

                # Advantage actor-critic losses: the terminal reward is discounted
                # back to each saved step with gamma = 0.99.
                for log_prob, value, step in self.saved_actions:
                    discounted_reward = pow(0.99, trajectory_step - step) * reward
                    advantage = discounted_reward - value.item()
                    policy_losses.append(-log_prob * advantage)
                    value_losses.append(F.smooth_l1_loss(value, Variable(torch.tensor([[discounted_reward]]).cuda(), requires_grad=False)))

            data_len = len(policy_losses)
            if data_len > 64:
                self.optimizer.zero_grad()
                value_loss = torch.stack(value_losses).sum()
                policy_loss = torch.stack(policy_losses).sum()
                loss = value_loss + policy_loss
                self.losses.append(value_loss.item() / data_len)
                #print('sub iter [%d/%d], Average Value Loss: %.4f' %(iter, n_iters, value_loss.item() / data_len))
                loss.backward()
                self.optimizer.step()
                self.clear_saved_actions()
                policy_losses = []
                value_losses = []

        data_len = len(policy_losses)
        if data_len > 0:
            self.optimizer.zero_grad()
            loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
            self.losses.append(loss.item() / data_len)
            loss.backward()
            self.optimizer.step()
            self.clear_saved_actions()

        print('guide prob: %.2f, train value loss: %.4f, success: %.2f' % (guide_prob, np.average(np.array(self.losses)), (success_num / total_num)))
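
A rough driver sketch (the environment, vocabulary size, results path, iteration counts and annealing schedule below are all assumptions for illustration, not part of the class above):

# Hypothetical setup: env, vocab_size and the results path come from the surrounding project.
agent = ActorCriticAgent(env, vocab_size, 'results/', batch_size=1)
guide_prob = 1.0
for epoch in range(50):
    agent.train(n_iters=100, guide_prob=guide_prob)
    # Anneal the teacher-guidance probability so the sampled policy gradually takes over.
    guide_prob = max(0.0, guide_prob - 0.02)
agent.test(guide_prob=0.0)  # populates agent.results with one path per instr_id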