Example #1
    def teacher_forcing(self, train=True, target=None):
        if train:
            self.bidaf.train()
        else:
            self.bidaf.eval()

        # Get input
        obs = self.env._get_obs()
        (img_feats, can_feats), feat_len = self.from_shortest_path(
        )  # Feature from the shortest path
        insts, inst_len = self.gt_words(obs)

        # Get Ground Truth Label
        if target is None:  # Label from the env
            target = np.array([ob['label'] for ob in obs], np.float32)
            target = torch.from_numpy(target).cuda()
        else:
            target = torch.FloatTensor([target] * self.env.batch_size).cuda()

        feat_mask = utils.length2mask(feat_len)
        inst_mask = utils.length2mask(inst_len, 80)
        logits = self.bidaf(img_feats, can_feats, feat_mask, insts, inst_mask)
        # print("TRUE:", torch.sigmoid(logits).mean())

        loss = self.bce_loss(input=logits, target=target)

        if train:
            return loss
        else:
            return loss.item()
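Every example on this page passes sequence lengths through a length2mask helper before masking logits or attention. Its implementation lives in each repository's utils module and is not shown here; the sketch below is a plausible reconstruction (the signature and the True-at-padding convention are assumptions) that matches how masked_fill_ consumes the mask in the later examples. Example #4 uses the opposite convention (True at valid positions), so check the repository you are reading.

import torch

def length2mask(lengths, max_len=None):
    """Hypothetical sketch: boolean mask that is True at PADDED positions (pos >= length)."""
    lengths = torch.as_tensor(lengths)
    if max_len is None:
        max_len = int(lengths.max().item())
    positions = torch.arange(max_len, device=lengths.device).unsqueeze(0)  # (1, max_len)
    return positions >= lengths.unsqueeze(1)                               # (batch, max_len)

# length2mask([3, 5], max_len=5)
# -> tensor([[False, False, False,  True,  True],
#            [False, False, False, False, False]])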
Example #2
    def infer_batch(self, batch=None, insts=None, train=False):
        """
        :param insts:  numpy array with [batch_size, length]. It should be PADDED
        :return: The prob numpy with [batch_size]
        """
        if train:
            self.bidaf.train()
        else:
            self.bidaf.eval()

        # Get Visual Input
        if batch is not None:
            self.env.reset(batch)
        obs = self.env._get_obs()
        (img_feats, can_feats), feat_len = self.from_shortest_path(
        )  # Feature from the shortest path

        # Get Language Input
        if insts is None:
            # Use the default inst in the dataset if the argument **insts** is not given
            insts, inst_len = self.gt_words(obs)
        else:
            # Move the numpy array to CUDA
            # LongTensor() also accepts an existing Tensor, so insts may already be a Tensor
            if type(insts) is list:
                max_length = max([len(inst) for inst in insts])
                insts = [
                    inst + ([self.tok.word_to_index['<PAD>']] *
                            (max_length - len(inst))) for inst in insts
                ]
                insts = np.array(insts)

            # print("G infer", self.tok.decode_sentence(insts[0]))
            inst_len = (insts != self.tok.word_to_index['<PAD>']).sum(1)
            # print("len", inst_len[0])
            insts = torch.LongTensor(insts).cuda()

        # Create Mask
        feat_mask = utils.length2mask(feat_len)
        inst_mask = utils.length2mask(inst_len, insts.size(1))

        # input --> logit --> probs --> cpu_probs
        logits = self.bidaf(img_feats, can_feats, feat_mask, insts, inst_mask)
        # print("FALSE:", torch.sigmoid(logits).mean())

        if train:
            target = torch.FloatTensor([0.] * self.env.batch_size).cuda()
            loss = self.bce_loss(input=logits, target=target)
            return loss
        else:
            probs = torch.sigmoid(logits)
            answer = probs.cpu().detach().numpy()
            return answer
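When insts arrives as a plain Python list, the branch above pads every instruction to the longest one with the <PAD> id and recovers the lengths by counting non-pad tokens. A standalone sketch of that pattern (the pad index value here is made up):

import numpy as np
import torch

PAD = 0  # assumed <PAD> index

def pad_and_length(insts):
    """Pad a list of variable-length token lists; return (LongTensor, lengths)."""
    max_len = max(len(inst) for inst in insts)
    padded = np.array([inst + [PAD] * (max_len - len(inst)) for inst in insts],
                      dtype=np.int64)
    lengths = (padded != PAD).sum(1)          # number of real tokens per row
    return torch.from_numpy(padded), lengths

# pad_and_length([[4, 8, 15], [16, 23]])  ->  tensor of shape (2, 3), lengths array([3, 2])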
Example #3
    def teacher_forcing(self, train=True, features=None, insts=None, for_listener=False):
        if train:
            self.encoder.train()
            self.decoder.train()
        else:
            self.encoder.eval()
            self.decoder.eval()

        # Get Image Input & Encode
        if features is not None:
            # It is used in calculating the speaker score in beam search
            assert insts is not None
            (img_feats, can_feats), lengths = features
            ctx = self.encoder(can_feats, img_feats, lengths)
            batch_size = len(lengths)
        else:
            obs = self.env._get_obs()
            batch_size = len(obs)
            (img_feats, can_feats), lengths = self.from_shortest_path()      # Image Feature (from the shortest path)
            ctx = self.encoder(can_feats, img_feats, lengths)
        h_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
        c_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
        ctx_mask = utils.length2mask(lengths)

        # Get Language Input
        if insts is None:
            insts = self.gt_words(obs)                                       # Language Feature

        # Decode
        logits, _, _ = self.decoder(insts, ctx, ctx_mask, h_t, c_t)

        # The softmax_loss expects the logit dimension at dim 1,
        # so permute the output (batch_size, length, logit) --> (batch_size, logit, length)
        logits = logits.permute(0, 2, 1).contiguous()
        loss = self.softmax_loss(
            input  = logits[:, :, :-1],         # -1 for aligning
            target = insts[:, 1:]               # "1:" to ignore the word <BOS>
        )

        if for_listener:
            return self.nonreduced_softmax_loss(
                input  = logits[:, :, :-1],         # -1 for aligning
                target = insts[:, 1:]               # "1:" to ignore the word <BOS>
            )

        if train:
            return loss
        else:
            # Evaluation
            _, predict = logits.max(dim=1)                                  # BATCH, LENGTH
            gt_mask = (insts != self.tok.word_to_index['<PAD>'])
            correct = (predict[:, :-1] == insts[:, 1:]) * gt_mask[:, 1:]    # Not pad and equal to gt
            correct, gt_mask = correct.type(torch.LongTensor), gt_mask.type(torch.LongTensor)
            word_accu = correct.sum().item() / gt_mask[:, 1:].sum().item()     # Exclude <BOS>
            sent_accu = (correct.sum(dim=1) == gt_mask[:, 1:].sum(dim=1)).sum().item() / batch_size  # Exclude <BOS>
            return loss.item(), word_accu, sent_accu
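The loss above aligns the decoder output at step t with the gold word at step t+1: the last logit is dropped, <BOS> is stripped from the targets, and the logits are permuted because the cross-entropy loss expects the class dimension at dim 1. A toy version of that alignment (vocabulary size and pad index are illustrative, not taken from the code above):

import torch
import torch.nn as nn

vocab_size, pad_id = 100, 0
batch, length = 2, 6
logits = torch.randn(batch, length, vocab_size)        # decoder output, one row per step
insts = torch.randint(1, vocab_size, (batch, length))  # <BOS> w1 w2 ... (no padding here)

loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id)
logits = logits.permute(0, 2, 1)                       # (batch, vocab, length) for the loss
loss = loss_fn(logits[:, :, :-1], insts[:, 1:])        # predict the *next* word at each step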
Example #4
    def loss(self, data):

        # forward rnn
        input, target, length = data
        rnn_output = self(input, length)

        # discard the pad
        mask = length2mask(length)
        rnn_output = rnn_output.masked_select(
            mask.unsqueeze(dim=2).expand_as(rnn_output)).view(-1, self.nhid)
        target = target.masked_select(mask)

        # forward decoder and calculate loss
        decoder_loss = self.decoder.forward_with_loss(rnn_output, target)

        return decoder_loss
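Here length2mask is assumed to return True at valid (non-padded) positions, since masked_select keeps exactly the elements the mask selects; that is the opposite polarity of the helper sketched under Example #1, so the convention depends on the repository. A toy reproduction of the flattening step:

import torch

batch, max_len, nhid = 2, 4, 8
rnn_output = torch.randn(batch, max_len, nhid)
target = torch.randint(0, 10, (batch, max_len))
lengths = torch.tensor([3, 2])

valid = torch.arange(max_len).unsqueeze(0) < lengths.unsqueeze(1)   # True at real tokens

flat_output = rnn_output.masked_select(
    valid.unsqueeze(2).expand_as(rnn_output)).view(-1, nhid)        # (n_valid, nhid)
flat_target = target.masked_select(valid)                           # (n_valid,)
assert flat_output.size(0) == flat_target.size(0) == int(lengths.sum())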
Example #5
 def forward(self, ctx, length=None):
     """
     Output the fixed-length memory
     :param ctx: (batch_size, max_len, input_dim)
     :param length: (batch_size)
     :return:
     """
     attn = self.f(ctx)  # (batch_size, max_len, mem_size)
     if length is not None:
         mask = utils.length2mask(length).unsqueeze(-1).expand(
             -1, -1, self.mem_size)
         # print(attn.size())
         # print(mask.size())
         attn.masked_fill_(mask, float('-inf'))
     attn = F.softmax(attn, 1,
                      _stacklevel=5)  # (batch_size, max_len, mem_size)
     attn = attn.transpose(1, 2)  # (batch_size, mem_size, max_len)
     memory = torch.bmm(attn, ctx)  # (batch_size, mem_size, rnn_dim)
     memory = self.drop(memory)
     return memory
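The module above compresses a variable-length context into a fixed number of memory slots: a linear layer scores every timestep for every slot, padded steps are filled with -inf so the softmax over time ignores them, and a batched matrix product mixes the context vectors. A self-contained sketch with the projection layer written out (layer names and sizes are assumptions, not the original module):

import torch
import torch.nn as nn
import torch.nn.functional as F

class FixedMemory(nn.Module):
    """Sketch: attention-pool a (batch, max_len, input_dim) context into mem_size slots."""

    def __init__(self, input_dim, mem_size, dropout=0.5):
        super().__init__()
        self.f = nn.Linear(input_dim, mem_size)
        self.mem_size = mem_size
        self.drop = nn.Dropout(dropout)

    def forward(self, ctx, length=None):
        attn = self.f(ctx)                                       # (batch, max_len, mem_size)
        if length is not None:
            lengths = torch.as_tensor(length, device=ctx.device)
            pad = torch.arange(ctx.size(1), device=ctx.device) >= lengths.unsqueeze(1)
            attn.masked_fill_(pad.unsqueeze(-1), float('-inf'))  # hide padded steps
        attn = F.softmax(attn, dim=1)                            # normalize over time
        memory = torch.bmm(attn.transpose(1, 2), ctx)            # (batch, mem_size, input_dim)
        return self.drop(memory)

# FixedMemory(16, 4)(torch.randn(2, 5, 16), length=[5, 3]).shape  ->  (2, 4, 16)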
Example #6
    def infer_batch_bunch(self,
                          sampling=False,
                          train=False,
                          featdropmask=None):
        """

        :param sampling: if False, use argmax; else sample from the softmax (multinomial)
        :param train: Whether in the train mode
        :return: if sampling: return insts (np, [batch, max_len]),
                                     log_probs (torch, requires_grad, [batch, max_len]),
                                     hiddens (torch, requires_grad, [batch, max_len, dim])
                      If not train, the log_probs and hiddens are detached.
                 if not sampling: returns insts (np, [batch, max_len])
        """
        if train:
            self.encoder.train()
            self.decoder.train()
        else:
            self.encoder.eval()
            self.decoder.eval()

        # Image Input for the Encoder
        obs = self.env._get_obs()
        batch_size = len(obs)
        viewpoints_list = [list() for _ in range(batch_size)]

        # Get feature
        (img_feats, can_feats), lengths = self.from_shortest_path(
            viewpoints=viewpoints_list
        )  # Image Feature (from the shortest path)

        # This code block is only used for the featdrop.
        if featdropmask is not None:
            img_feats[..., :-args.angle_feat_size] *= featdropmask
            can_feats[..., :-args.angle_feat_size] *= featdropmask

        # Encoder
        ctx = self.encoder(can_feats,
                           img_feats,
                           lengths,
                           already_dropfeat=(featdropmask is not None))
        ctx_mask = utils.length2mask(lengths)

        # Decoder
        words = []
        log_probs = []
        hidden_states = []
        entropies = []
        h_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
        c_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
        ended = np.zeros(len(obs), bool)  # np.bool was removed from recent NumPy
        word = np.ones(
            len(obs),
            np.int64) * self.tok.word_to_index['<BOS>']  # First word is <BOS>
        word = torch.from_numpy(word).view(-1, 1).cuda()
        for i in range(args.maxDecode):
            # Decode Step
            logits, h_t, c_t = self.decoder(
                word, ctx, ctx_mask, h_t,
                c_t)  # Decode, logits: (b, 1, vocab_size)

            # Select the word
            logits = logits.squeeze()  # logits: (b, vocab_size)
            logits[:, self.tok.word_to_index['<UNK>']] = -float(
                "inf")  # No <UNK> in infer
            if sampling:
                probs = F.softmax(logits, -1)
                m = torch.distributions.Categorical(probs)
                word = m.sample()
                log_prob = m.log_prob(word)
                if train:
                    log_probs.append(log_prob)
                    hidden_states.append(h_t.squeeze())
                    entropies.append(m.entropy())
                else:
                    log_probs.append(log_prob.detach())
                    hidden_states.append(h_t.squeeze().detach())
                    entropies.append(m.entropy().detach())
            else:
                values, word = logits.max(1)

            # Append the word
            cpu_word = word.cpu().numpy()
            cpu_word[ended] = self.tok.word_to_index['<PAD>']
            words.append(cpu_word)

            # Prepare the shape for next step
            word = word.view(-1, 1)

            # End?
            ended = np.logical_or(ended,
                                  cpu_word == self.tok.word_to_index['<EOS>'])
            if ended.all():
                break

        if train and sampling:
            return np.stack(words, 1), torch.stack(log_probs, 1), torch.stack(
                hidden_states, 1), torch.stack(entropies, 1)
        else:
            return np.stack(words, 1)  # [(b), (b), (b), ...] --> [b, l]
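Inside the decoding loop, the next word is chosen either greedily (argmax) or by sampling from the softmax; when sampling for RL, the per-step log-probability and entropy are kept for a later policy-gradient update. The core of one such step in isolation (shapes and the <UNK> index are illustrative):

import torch

def select_word(logits, sampling, unk_id=3):
    """One decoding step: (batch, vocab) logits -> word ids (+ log_prob, entropy if sampling)."""
    logits = logits.clone()
    logits[:, unk_id] = -float('inf')              # never emit <UNK> at inference time
    if sampling:
        dist = torch.distributions.Categorical(logits=logits)
        word = dist.sample()
        return word, dist.log_prob(word), dist.entropy()
    return logits.max(1)[1], None, None

# word, log_prob, entropy = select_word(torch.randn(4, 10), sampling=True)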
Example #7
    def _dijkstra(self):
        """
        The Dijkstra-style search.
        It was called beam search to be consistent with existing work,
        but it actually finds the exact K paths with the highest listener log-probability.
        :return:
        [{
            "scan": XXX,
            "instr_id": XXX,
            "instr_encoding": XXX,
            "dijk_path": [v1, v2, ..., vn]      (the path used to find all the candidates)
            "paths": [{
                    "trajectory": [viewpoint_id1, viewpoint_id2, ...],
                    "action": [act_1, act_2, ...],
                    "listener_scores": [log_prob_act1, log_prob_act2, ...],
                    "visual_feature": [(f1_step1, f2_step1, ...), (f1_step2, f2_step2, ...)]
            }]
        }]
        """
        def make_state_id(viewpoint, action):  # Make state id
            return "%s_%s" % (viewpoint, str(action))

        def decompose_state_id(state_id):  # Split a state id back into (viewpoint, action)
            viewpoint, action = state_id.split("_")
            action = int(action)
            return viewpoint, action

        # Get first obs
        obs = self.env._get_obs()

        # Prepare the state id
        batch_size = len(obs)
        results = [{
            "scan": ob['scan'],
            "instr_id": ob['instr_id'],
            "instr_encoding": ob["instr_encoding"],
            "dijk_path": [ob['viewpoint']],
            "paths": []
        } for ob in obs]

        # Encoder
        seq, seq_mask, seq_lengths, perm_idx = self._sort_batch(obs)
        recover_idx = np.zeros_like(perm_idx)
        for i, idx in enumerate(perm_idx):
            recover_idx[idx] = i
        ctx, h_t, c_t = self.encoder(seq, seq_lengths)
        ctx, h_t, c_t, ctx_mask = ctx[recover_idx], h_t[recover_idx], c_t[
            recover_idx], seq_mask[recover_idx]  # Recover the original order

        # Dijk Graph States:
        id2state = [{
            make_state_id(ob['viewpoint'], -95): {
                "next_viewpoint": ob['viewpoint'],
                "running_state": (h_t[i], h_t[i], c_t[i]),
                "location": (ob['viewpoint'], ob['heading'], ob['elevation']),
                "feature": None,
                "from_state_id": None,
                "score": 0,
                "scores": [],
                "actions": [],
            }
        } for i, ob in enumerate(obs)]  # -95 is the start point
        visited = [set() for _ in range(batch_size)]
        finished = [set() for _ in range(batch_size)]
        graphs = [utils.FloydGraph()
                  for _ in range(batch_size)]  # For the navigation path
        ended = np.array([False] * batch_size)

        # Dijk Algorithm
        for _ in range(300):
            # Get the un-visited state with the highest accumulated score (sum of log-probs) for each batch element.
            # If the batch element has already ended, take an arbitrary item from the dict (it always exists).
            smallest_idXstate = [
                max(((state_id, state)
                     for state_id, state in id2state[i].items()
                     if state_id not in visited[i]),
                    key=lambda item: item[1]['score'])
                if not ended[i] else next(iter(id2state[i].items()))
                for i in range(batch_size)
            ]

            # Set the visited and the end seqs
            for i, (state_id, state) in enumerate(smallest_idXstate):
                assert (ended[i]) or (state_id not in visited[i])
                if not ended[i]:
                    viewpoint, action = decompose_state_id(state_id)
                    visited[i].add(state_id)
                    if action == -1:
                        finished[i].add(state_id)
                        if len(finished[i]
                               ) >= args.candidates:  # Get enough candidates
                            ended[i] = True

            # Gather the running state in the batch
            h_ts, h1s, c_ts = zip(*(idXstate[1]['running_state']
                                    for idXstate in smallest_idXstate))
            h_t, h1, c_t = torch.stack(h_ts), torch.stack(h1s), torch.stack(
                c_ts)

            # Recover the env and gather the feature
            for i, (state_id, state) in enumerate(smallest_idXstate):
                next_viewpoint = state['next_viewpoint']
                scan = results[i]['scan']
                from_viewpoint, heading, elevation = state['location']
                self.env.env.sims[i].newEpisode(
                    scan, next_viewpoint, heading,
                    elevation)  # Heading, elevation is not used in panoramic
            obs = self.env._get_obs()

            # Update the Floyd graph.
            # It is only used to shorten the navigation length and will not affect the result.
            for i, ob in enumerate(obs):
                viewpoint = ob['viewpoint']
                if not graphs[i].visited(viewpoint):  # Update the Graph
                    for c in ob['candidate']:
                        next_viewpoint = c['viewpointId']
                        dis = self.env.distances[
                            ob['scan']][viewpoint][next_viewpoint]
                        graphs[i].add_edge(viewpoint, next_viewpoint, dis)
                    graphs[i].update(viewpoint)
                results[i]['dijk_path'].extend(graphs[i].path(
                    results[i]['dijk_path'][-1], viewpoint))

            input_a_t, f_t, candidate_feat, candidate_leng = self.get_input_feat(
                obs)

            # Run one decoding step
            h_t, c_t, alpha, logit, h1 = self.decoder(input_a_t, f_t,
                                                      candidate_feat, h_t, h1,
                                                      c_t, ctx, ctx_mask,
                                                      False)

            # Update the dijk graph's states with the newly visited viewpoint
            candidate_mask = utils.length2mask(candidate_leng)
            logit.masked_fill_(candidate_mask, -float('inf'))
            log_probs = F.log_softmax(logit, 1)  # Calculate the log_prob here
            _, max_act = log_probs.max(1)

            for i, ob in enumerate(obs):
                current_viewpoint = ob['viewpoint']
                candidate = ob['candidate']
                current_state_id, current_state = smallest_idXstate[i]
                old_viewpoint, from_action = decompose_state_id(
                    current_state_id)
                assert ob['viewpoint'] == current_state['next_viewpoint']
                if from_action == -1 or ended[
                        i]:  # If the action is <end> or the batch is ended, skip it
                    continue
                for j in range(len(ob['candidate']) +
                               1):  # +1 to include the <end> action
                    # score + log_prob[action]
                    modified_log_prob = log_probs[i][j].detach().cpu().item()
                    new_score = current_state['score'] + modified_log_prob
                    if j < len(candidate):  # A normal action
                        next_id = make_state_id(current_viewpoint, j)
                        next_viewpoint = candidate[j]['viewpointId']
                        trg_point = candidate[j]['pointId']
                        heading = (trg_point % 12) * math.pi / 6
                        elevation = (trg_point // 12 - 1) * math.pi / 6
                        location = (next_viewpoint, heading, elevation)
                    else:  # The end action
                        next_id = make_state_id(current_viewpoint,
                                                -1)  # action is -1
                        next_viewpoint = current_viewpoint  # next viewpoint is still here
                        location = (current_viewpoint, ob['heading'],
                                    ob['elevation'])

                    if next_id not in id2state[
                            i] or new_score > id2state[i][next_id]['score']:
                        id2state[i][next_id] = {
                            "next_viewpoint":
                            next_viewpoint,
                            "location":
                            location,
                            "running_state": (h_t[i], h1[i], c_t[i]),
                            "from_state_id":
                            current_state_id,
                            "feature": (f_t[i].detach().cpu(),
                                        candidate_feat[i][j].detach().cpu()),
                            "score":
                            new_score,
                            "scores":
                            current_state['scores'] + [modified_log_prob],
                            "actions":
                            current_state['actions'] + [len(candidate) + 1],
                        }

            # If no un-visited state remains after the update, mark this batch element as ended
            for i in range(batch_size):
                if len(visited[i]) == len(
                        id2state[i]):  # It's the last active state
                    ended[i] = True

            # End?
            if ended.all():
                break

        # Move back to the start point
        for i in range(batch_size):
            results[i]['dijk_path'].extend(graphs[i].path(
                results[i]['dijk_path'][-1], results[i]['dijk_path'][0]))
        """
            "paths": {
                "trajectory": [viewpoint_id1, viewpoint_id2, ..., ],
                "action": [act_1, act_2, ..., ],
                "listener_scores": [log_prob_act1, log_prob_act2, ..., ],
                "visual_feature": [(f1_step1, f2_step2, ...), (f1_step2, f2_step2, ...)
            }
        """
        # Gather the Path
        for i, result in enumerate(results):
            assert len(finished[i]) <= args.candidates
            for state_id in finished[i]:
                path_info = {
                    "trajectory": [],
                    "action": [],
                    "listener_scores": id2state[i][state_id]['scores'],
                    "listener_actions": id2state[i][state_id]['actions'],
                    "visual_feature": []
                }
                viewpoint, action = decompose_state_id(state_id)
                while action != -95:
                    state = id2state[i][state_id]
                    path_info['trajectory'].append(state['location'])
                    path_info['action'].append(action)
                    path_info['visual_feature'].append(state['feature'])
                    state_id = id2state[i][state_id]['from_state_id']
                    viewpoint, action = decompose_state_id(state_id)
                state = id2state[i][state_id]
                path_info['trajectory'].append(state['location'])
                for need_reverse_key in [
                        "trajectory", "action", "visual_feature"
                ]:
                    path_info[need_reverse_key] = path_info[
                        need_reverse_key][::-1]
                result['paths'].append(path_info)

        return results
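The search keeps one dictionary of candidate states per batch element and repeatedly expands the un-visited state with the best accumulated score, where a score is a sum of listener log-probabilities (so larger is better). That selection step, pulled out of the loop as a small helper (a sketch over plain dicts, not the original data structures):

def pick_best_unvisited(id2state_i, visited_i):
    """Return the (state_id, state) pair with the highest score among un-visited states."""
    return max(((sid, st) for sid, st in id2state_i.items() if sid not in visited_i),
               key=lambda item: item[1]['score'])

# states = {'v1_-95': {'score': 0.0}, 'v2_0': {'score': -1.3}, 'v3_1': {'score': -0.2}}
# pick_best_unvisited(states, visited_i={'v1_-95'})  ->  ('v3_1', {'score': -0.2})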
Example #8
    def rollout(self, train_ml=None, train_rl=True, reset=True, speaker=None):
        """
        :param train_ml:    The weight to train with maximum likelihood
        :param train_rl:    whether to use RL in training
        :param reset:       Reset the environment
        :param speaker:     Speaker used in back translation.
                            If the speaker is not None, use back translation.
                            O.w., normal training
        :return:
        """
        if self.feedback == 'teacher' or self.feedback == 'argmax':
            train_rl = False

        if reset:
            # Reset env
            obs = np.array(self.env.reset())
        else:
            obs = np.array(self.env._get_obs())

        batch_size = len(obs)

        if speaker is not None:  # Trigger the self_train mode!
            noise = self.decoder.drop_env(torch.ones(self.feature_size).cuda())
            batch = self.env.batch.copy()
            speaker.env = self.env
            insts = speaker.infer_batch(
                featdropmask=noise)  # Use the same drop mask in speaker

            # Create fake environments with the generated instruction
            boss = np.ones((batch_size, 1), np.int64) * self.tok.word_to_index[
                '<BOS>']  # First word is <BOS>
            insts = np.concatenate((boss, insts), 1)
            for i, (datum, inst) in enumerate(zip(batch, insts)):
                if inst[-1] != self.tok.word_to_index[
                        '<PAD>']:  # The inst is not ended!
                    inst[-1] = self.tok.word_to_index['<EOS>']
                datum.pop('instructions')
                datum.pop('instr_encoding')
                datum['instructions'] = self.tok.decode_sentence(inst)
                datum['instr_encoding'] = inst
            obs = np.array(self.env.reset(batch))

        # Reorder the language input for the encoder (do not ruin the original code)
        seq, seq_mask, seq_lengths, perm_idx = self._sort_batch(obs)
        perm_obs = obs[perm_idx]

        ctx, h_t, c_t = self.encoder(seq, seq_lengths)
        ctx_mask = seq_mask

        # Init the reward shaping
        last_dist = np.zeros(batch_size, np.float32)
        for i, ob in enumerate(
                perm_obs
        ):  # The init distance from the view point to the target
            last_dist[i] = ob['distance']

        # Record starting point
        traj = [{
            'instr_id': ob['instr_id'],
            'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])]
        } for ob in perm_obs]

        # For test result submission
        visited = [set() for _ in perm_obs]

        # Initialize the tracking state
        ended = np.array(
            [False] *
            batch_size)  # Indices match the permutation of the model, not the env

        # Init the logs
        rewards = []
        hidden_states = []
        policy_log_probs = []
        masks = []
        entropys = []
        ml_loss = 0.

        h1 = h_t
        for t in range(self.episode_len):

            input_a_t, f_t, candidate_feat, candidate_leng = self.get_input_feat(
                perm_obs)
            if speaker is not None:  # Apply the env drop mask to the feat
                candidate_feat[..., :-args.angle_feat_size] *= noise
                f_t[..., :-args.angle_feat_size] *= noise

            h_t, c_t, logit, h1 = self.decoder(input_a_t,
                                               f_t,
                                               candidate_feat,
                                               h_t,
                                               h1,
                                               c_t,
                                               ctx,
                                               ctx_mask,
                                               already_dropfeat=(speaker
                                                                 is not None))

            hidden_states.append(h_t)

            # Mask outputs where agent can't move forward
            # Here the logit is [b, max_candidate]
            candidate_mask = utils.length2mask(candidate_leng)
            if args.submit:  # Avoiding cyclic paths
                for ob_id, ob in enumerate(perm_obs):
                    visited[ob_id].add(ob['viewpoint'])
                    for c_id, c in enumerate(ob['candidate']):
                        if c['viewpointId'] in visited[ob_id]:
                            candidate_mask[ob_id][c_id] = 1
            logit.masked_fill_(candidate_mask.bool(), -float('inf'))

            # Supervised training
            target = self._teacher_action(perm_obs, ended)
            ml_loss += self.criterion(logit, target)

            # Determine next model inputs
            if self.feedback == 'teacher':
                a_t = target  # teacher forcing
            elif self.feedback == 'argmax':
                _, a_t = logit.max(1)  # student forcing - argmax
                a_t = a_t.detach()
                log_probs = F.log_softmax(logit,
                                          1)  # Calculate the log_prob here
                policy_log_probs.append(log_probs.gather(
                    1, a_t.unsqueeze(1)))  # Gather the log_prob for each batch
            elif self.feedback == 'sample':
                probs = F.softmax(logit, 1)  # sampling an action from model
                c = torch.distributions.Categorical(probs)
                self.logs['entropy'].append(
                    c.entropy().sum().item())  # For log
                entropys.append(c.entropy())  # For optimization
                a_t = c.sample().detach()
                policy_log_probs.append(c.log_prob(a_t))
            else:
                print(self.feedback)
                sys.exit('Invalid feedback option')

            # Prepare environment action
            # NOTE: Env action is in the perm_obs space
            cpu_a_t = a_t.cpu().numpy()
            for i, next_id in enumerate(cpu_a_t):
                if next_id == (
                        candidate_leng[i] - 1
                ) or next_id == args.ignoreid:  # The last action is <end>
                    cpu_a_t[i] = -1  # Change the <end> and ignore action to -1

            # Make action and get the new state
            self.make_equiv_action(cpu_a_t, perm_obs, perm_idx, traj)
            obs = np.array(self.env._get_obs())
            perm_obs = obs[perm_idx]  # Permute the obs to match the model's order

            # Calculate the mask and reward
            dist = np.zeros(batch_size, np.float32)
            reward = np.zeros(batch_size, np.float32)
            mask = np.ones(batch_size, np.float32)
            for i, ob in enumerate(perm_obs):
                dist[i] = ob['distance']
                if ended[
                        i]:  # If the action is already finished BEFORE THIS ACTION.
                    reward[i] = 0.
                    mask[i] = 0.
                else:  # Calculate the reward
                    action_idx = cpu_a_t[i]
                    if action_idx == -1:  # If the action now is end
                        if dist[i] < 3:  # Correct
                            reward[i] = 2.
                        else:  # Incorrect
                            reward[i] = -2.
                    else:  # The action is not end
                        reward[i] = -(dist[i] - last_dist[i]
                                      )  # Change of distance
                        if reward[i] > 0:  # Quantification
                            reward[i] = 1
                        elif reward[i] < 0:
                            reward[i] = -1
                        else:
                            raise NameError(
                                "The action doesn't change the move")
            rewards.append(reward)
            masks.append(mask)
            last_dist[:] = dist

            # Update the finished actions
            # -1 means ended or ignored (already ended)
            ended[:] = np.logical_or(ended, (cpu_a_t == -1))

            # Early exit if all ended
            if ended.all():
                break

        if train_rl:
            # Last action in A2C
            input_a_t, f_t, candidate_feat, candidate_leng = self.get_input_feat(
                perm_obs)
            if speaker is not None:
                candidate_feat[..., :-args.angle_feat_size] *= noise
                f_t[..., :-args.angle_feat_size] *= noise
            last_h_, _, _, _ = self.decoder(input_a_t, f_t, candidate_feat,
                                            h_t, h1, c_t, ctx, ctx_mask,
                                            speaker is not None)
            rl_loss = 0.

            # NOW, A2C!!!
            # Calculate the final discounted reward
            last_value__ = self.critic(last_h_).detach(
            )  # The value estimate of the last state; remove the grad for safety
            discount_reward = np.zeros(batch_size,
                                       np.float32)  # The initial reward is zero
            for i in range(batch_size):
                if not ended[
                        i]:  # If the action is not ended, use the value function as the last reward
                    discount_reward[i] = last_value__[i]

            length = len(rewards)
            total = 0
            for t in range(length - 1, -1, -1):
                discount_reward = discount_reward * args.gamma + rewards[
                    t]  # If it ended, the reward will be 0
                mask_ = Variable(torch.from_numpy(masks[t]),
                                 requires_grad=False).cuda()
                clip_reward = discount_reward.copy()
                r_ = Variable(torch.from_numpy(clip_reward),
                              requires_grad=False).cuda()
                v_ = self.critic(hidden_states[t])
                a_ = (r_ - v_).detach()

                # r_: The higher, the better. -ln(p(action)) * (discount_reward - value)
                rl_loss += (-policy_log_probs[t] * a_ * mask_).sum()
                rl_loss += (((r_ - v_)**2) * mask_).sum() * 0.5  # 1/2 L2 loss
                if self.feedback == 'sample':
                    rl_loss += (-0.01 * entropys[t] * mask_).sum()
                self.logs['critic_loss'].append(
                    (((r_ - v_)**2) * mask_).sum().item())

                total = total + np.sum(masks[t])
            self.logs['total'].append(total)

            # Normalize the loss function
            if args.normalize_loss == 'total':
                rl_loss /= total
            elif args.normalize_loss == 'batch':
                rl_loss /= batch_size
            else:
                assert args.normalize_loss == 'none'

            self.loss += rl_loss

        if train_ml is not None:
            self.loss += ml_loss * train_ml / batch_size

        if type(
                self.loss
        ) is int:  # For safety, it will be activated if no losses are added
            self.losses.append(0.)
        else:
            self.losses.append(self.loss.item() /
                               self.episode_len)  # This argument is useless.

        return traj
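The train_rl block walks the stored per-step rewards backwards, accumulating a discounted return that is bootstrapped from the critic for episodes that never finished, and combines a policy-gradient term with a 1/2 L2 critic term, both masked by which episodes were still running at that step. A compact sketch of that backward pass (function name, gamma and shapes are illustrative):

import numpy as np
import torch

def a2c_losses(rewards, masks, values, log_probs, last_value, gamma=0.9):
    """rewards/masks: lists of (batch,) np.float32 arrays; values/log_probs: lists of (batch,) tensors."""
    discount = last_value.astype(np.float32)          # bootstrap unfinished episodes from the critic
    policy_loss = critic_loss = 0.
    for t in range(len(rewards) - 1, -1, -1):
        discount = discount * gamma + rewards[t]      # reward is 0 after an episode ends
        r = torch.from_numpy(discount.copy())
        m = torch.from_numpy(masks[t])
        adv = (r - values[t]).detach()                # advantage, no grad through the critic
        policy_loss = policy_loss + (-log_probs[t] * adv * m).sum()
        critic_loss = critic_loss + (((r - values[t]) ** 2) * m).sum() * 0.5
    return policy_loss, critic_loss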
Example #9
    def rollout(self, train_ml=None, train_rl=True, reset=True):
        """
        :param train_ml:    The weight to train with maximum likelihood
        :param train_rl:    whether to use RL in training
        :param reset:       Reset the environment

        :return:
        """
        if self.feedback == 'teacher' or self.feedback == 'argmax':
            train_rl = False

        if reset:  # Reset env
            obs = np.array(self.env.reset())
        else:
            obs = np.array(self.env._get_obs())

        batch_size = len(obs)

        # Language input
        sentence, language_attention_mask, token_type_ids, \
            seq_lengths, perm_idx = self._sort_batch(obs)
        perm_obs = obs[perm_idx]
        ''' Language BERT '''
        language_inputs = {
            'mode': 'language',
            'sentence': sentence,
            'attention_mask': language_attention_mask,
            'lang_mask': language_attention_mask,
            'token_type_ids': token_type_ids
        }
        if args.vlnbert == 'oscar':
            language_features = self.vln_bert(**language_inputs)
        elif args.vlnbert == 'prevalent':
            h_t, language_features = self.vln_bert(**language_inputs)

        # Record starting point
        traj = [{
            'instr_id': ob['instr_id'],
            'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])],
        } for ob in perm_obs]

        # Init the reward shaping
        last_dist = np.zeros(batch_size, np.float32)
        last_ndtw = np.zeros(batch_size, np.float32)
        for i, ob in enumerate(
                perm_obs
        ):  # The init distance from the view point to the target
            last_dist[i] = ob['distance']
            path_act = [vp[0] for vp in traj[i]['path']]
            last_ndtw[i] = self.ndtw_criterion[ob['scan']](path_act,
                                                           ob['gt_path'],
                                                           metric='ndtw')

        # Initialize the tracking state
        ended = np.array(
            [False] *
            batch_size)  # Indices match the permutation of the model, not the env

        # Init the logs
        rewards = []
        hidden_states = []
        policy_log_probs = []
        masks = []
        entropys = []
        ml_loss = 0.

        for t in range(self.episode_len):

            input_a_t, candidate_feat, candidate_leng = self.get_input_feat(
                perm_obs)

            # the first [CLS] token, initialized by the language BERT, serves
            # as the agent's state passing through time steps
            if (t >= 1) or (args.vlnbert == 'prevalent'):
                language_features = torch.cat(
                    (h_t.unsqueeze(1), language_features[:, 1:, :]), dim=1)

            visual_temp_mask = (utils.length2mask(candidate_leng) == 0).long()
            visual_attention_mask = torch.cat(
                (language_attention_mask, visual_temp_mask), dim=-1)

            self.vln_bert.vln_bert.config.directions = max(candidate_leng)
            ''' Visual BERT '''
            visual_inputs = {
                'mode': 'visual',
                'sentence': language_features,
                'attention_mask': visual_attention_mask,
                'lang_mask': language_attention_mask,
                'vis_mask': visual_temp_mask,
                'token_type_ids': token_type_ids,
                'action_feats': input_a_t,
                # 'pano_feats':         f_t,
                'cand_feats': candidate_feat
            }
            h_t, logit = self.vln_bert(**visual_inputs)
            hidden_states.append(h_t)

            # Mask outputs where agent can't move forward
            # Here the logit is [b, max_candidate]
            candidate_mask = utils.length2mask(candidate_leng)
            logit.masked_fill_(candidate_mask, -float('inf'))

            # Supervised training
            target = self._teacher_action(perm_obs, ended)
            ml_loss += self.criterion(logit, target)

            # Determine next model inputs
            if self.feedback == 'teacher':
                a_t = target  # teacher forcing
            elif self.feedback == 'argmax':
                _, a_t = logit.max(1)  # student forcing - argmax
                a_t = a_t.detach()
                log_probs = F.log_softmax(logit,
                                          1)  # Calculate the log_prob here
                policy_log_probs.append(log_probs.gather(
                    1, a_t.unsqueeze(1)))  # Gather the log_prob for each batch
            elif self.feedback == 'sample':
                probs = F.softmax(logit, 1)  # sampling an action from model
                c = torch.distributions.Categorical(probs)
                self.logs['entropy'].append(
                    c.entropy().sum().item())  # For log
                entropys.append(c.entropy())  # For optimization
                a_t = c.sample().detach()
                policy_log_probs.append(c.log_prob(a_t))
            else:
                print(self.feedback)
                sys.exit('Invalid feedback option')
            # Prepare environment action
            # NOTE: Env action is in the perm_obs space
            cpu_a_t = a_t.cpu().numpy()
            for i, next_id in enumerate(cpu_a_t):
                if next_id == (candidate_leng[i] -
                               1) or next_id == args.ignoreid or ended[
                                   i]:  # The last action is <end>
                    cpu_a_t[i] = -1  # Change the <end> and ignore action to -1

            # Make action and get the new state
            self.make_equiv_action(cpu_a_t, perm_obs, perm_idx, traj)
            obs = np.array(self.env._get_obs())
            perm_obs = obs[perm_idx]  # Permute the obs to match the model's order

            if train_rl:
                # Calculate the mask and reward
                dist = np.zeros(batch_size, np.float32)
                ndtw_score = np.zeros(batch_size, np.float32)
                reward = np.zeros(batch_size, np.float32)
                mask = np.ones(batch_size, np.float32)
                for i, ob in enumerate(perm_obs):
                    dist[i] = ob['distance']
                    path_act = [vp[0] for vp in traj[i]['path']]
                    ndtw_score[i] = self.ndtw_criterion[ob['scan']](
                        path_act, ob['gt_path'], metric='ndtw')

                    if ended[i]:
                        reward[i] = 0.0
                        mask[i] = 0.0
                    else:
                        action_idx = cpu_a_t[i]
                        # Target reward
                        if action_idx == -1:  # If the action now is end
                            if dist[i] < 3.0:  # Correct
                                reward[i] = 2.0 + ndtw_score[i] * 2.0
                            else:  # Incorrect
                                reward[i] = -2.0
                        else:  # The action is not end
                            # Path fidelity rewards (distance & nDTW)
                            reward[i] = -(dist[i] - last_dist[i])
                            ndtw_reward = ndtw_score[i] - last_ndtw[i]
                            if reward[i] > 0.0:  # Quantification
                                reward[i] = 1.0 + ndtw_reward
                            elif reward[i] < 0.0:
                                reward[i] = -1.0 + ndtw_reward
                            else:
                                raise NameError(
                                    "The action doesn't change the move")
                            # Miss the target penalty
                            if (last_dist[i] <=
                                    1.0) and (dist[i] - last_dist[i] > 0.0):
                                reward[i] -= (1.0 - last_dist[i]) * 2.0
                rewards.append(reward)
                masks.append(mask)
                last_dist[:] = dist
                last_ndtw[:] = ndtw_score

            # Update the finished actions
            # -1 means ended or ignored (already ended)
            ended[:] = np.logical_or(ended, (cpu_a_t == -1))

            # Early exit if all ended
            if ended.all():
                break

        if train_rl:
            # Last action in A2C
            input_a_t, candidate_feat, candidate_leng = self.get_input_feat(
                perm_obs)

            language_features = torch.cat(
                (h_t.unsqueeze(1), language_features[:, 1:, :]), dim=1)

            visual_temp_mask = (utils.length2mask(candidate_leng) == 0).long()
            visual_attention_mask = torch.cat(
                (language_attention_mask, visual_temp_mask), dim=-1)

            self.vln_bert.vln_bert.config.directions = max(candidate_leng)
            ''' Visual BERT '''
            visual_inputs = {
                'mode': 'visual',
                'sentence': language_features,
                'attention_mask': visual_attention_mask,
                'lang_mask': language_attention_mask,
                'vis_mask': visual_temp_mask,
                'token_type_ids': token_type_ids,
                'action_feats': input_a_t,
                # 'pano_feats':         f_t,
                'cand_feats': candidate_feat
            }
            last_h_, _ = self.vln_bert(**visual_inputs)

            rl_loss = 0.

            # NOW, A2C!!!
            # Calculate the final discounted reward
            last_value__ = self.critic(last_h_).detach(
            )  # The value estimate of the last state; remove the grad for safety
            discount_reward = np.zeros(batch_size,
                                       np.float32)  # The initial reward is zero
            for i in range(batch_size):
                if not ended[
                        i]:  # If the action is not ended, use the value function as the last reward
                    discount_reward[i] = last_value__[i]

            length = len(rewards)
            total = 0
            for t in range(length - 1, -1, -1):
                discount_reward = discount_reward * args.gamma + rewards[
                    t]  # If it ended, the reward will be 0
                mask_ = Variable(torch.from_numpy(masks[t]),
                                 requires_grad=False).cuda()
                clip_reward = discount_reward.copy()
                r_ = Variable(torch.from_numpy(clip_reward),
                              requires_grad=False).cuda()
                v_ = self.critic(hidden_states[t])
                a_ = (r_ - v_).detach()

                rl_loss += (-policy_log_probs[t] * a_ * mask_).sum()
                rl_loss += (((r_ - v_)**2) * mask_).sum() * 0.5  # 1/2 L2 loss
                if self.feedback == 'sample':
                    rl_loss += (-0.01 * entropys[t] * mask_).sum()
                self.logs['critic_loss'].append(
                    (((r_ - v_)**2) * mask_).sum().item())

                total = total + np.sum(masks[t])
            self.logs['total'].append(total)

            # Normalize the loss function
            if args.normalize_loss == 'total':
                rl_loss /= total
            elif args.normalize_loss == 'batch':
                rl_loss /= batch_size
            else:
                assert args.normalize_loss == 'none'

            self.loss += rl_loss
            self.logs['RL_loss'].append(rl_loss.item())

        if train_ml is not None:
            self.loss += ml_loss * train_ml / batch_size
            self.logs['IL_loss'].append(
                (ml_loss * train_ml / batch_size).item())

        if type(
                self.loss
        ) is int:  # For safety, it will be activated if no losses are added
            self.losses.append(0.)
        else:
            self.losses.append(self.loss.item() /
                               self.episode_len)  # This argument is useless.

        return traj
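Relative to Example #8, the reward here is shaped with nDTW: stopping within 3 m earns a bonus proportional to the nDTW score, intermediate steps add the change in nDTW to the ±1 progress signal, and moving away from a goal that was already within 1 m is penalised. The branching written as a plain function (thresholds copied from the code above; the zero-progress case that the original raises on is kept as an explicit error):

def shaped_reward(stopped, dist, last_dist, ndtw, last_ndtw):
    """Sketch of the per-step reward shaping used in the rollout above."""
    if stopped:                                   # the agent chose the <end> action
        return 2.0 + ndtw * 2.0 if dist < 3.0 else -2.0
    delta = -(dist - last_dist)                   # progress toward the goal
    if delta == 0.0:
        raise NameError("The action doesn't change the move")
    reward = (1.0 if delta > 0.0 else -1.0) + (ndtw - last_ndtw)
    if last_dist <= 1.0 and dist - last_dist > 0.0:
        reward -= (1.0 - last_dist) * 2.0         # missed-the-target penalty
    return reward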
Example #10
    def rl_train(self,
                 reward_func,
                 iters,
                 ml_weight=0.,
                 policy_weight=1.,
                 baseline_weight=.5,
                 entropy_weight=0.,
                 self_critical=False,
                 ml_env=None):
        """
        :param reward_func: A function takes the [(path, inst)] list as input, returns the reward for each inst
        :param iters:       Number of training iterations
        :param ml_weight:   weight for maximum likelihood
        :param policy_weight:   weight for policy loss
        :param baseline_weight: weight for critic loss (baseline loss)
        :param entropy_weight:  weight for the entropy
        :param self_critical: Use the self_critical baseline
        :param ml_env:        Specific env for ml (in case that the train_env is aug_env)
        :return:
        """
        from collections import defaultdict
        log_dict = defaultdict(lambda: 0)
        for i in (range(iters)):
            joint_loss = 0.
            self.encoder_optimizer.zero_grad()
            self.decoder_optimizer.zero_grad()

            # Reset Env
            if args.same_in_batch:
                self.env.reset(tile_one=True)
            else:
                self.env.reset()
            rl_batch = self.env.batch

            # RL training
            insts, log_probs, hiddens, entropies = self.infer_batch(
                sampling=True, train=True)  # Sample a batch

            # Get the Reward ( and the length, mask)
            path_ids = [ob['path_id']
                        for ob in self.env._get_obs()]  # Gather the path ids
            pathXinst = [(path_id, self.tok.shrink(inst))
                         for path_id, inst in zip(path_ids, insts)]
            reward = reward_func(
                rl_batch,
                pathXinst)  # The reward func will evaluate the instruction
            reward = torch.FloatTensor(reward).cuda()
            length = np.argmax(
                np.array(insts) == self.tok.word_to_index['<EOS>'],
                1) + 1  # Get length (pos of EOS) + 1
            length[length == 1] = insts.shape[
                1]  # If there is no EOS, change the length to max length.
            mask = 1. - utils.length2mask(length).float()

            # Get the baseline
            if args.normalize_reward:
                baseline = reward.mean()
            else:
                if self_critical:
                    self.env.reset(rl_batch)
                    insts = self.infer_batch(sampling=False,
                                             train=False)  # Argmax Decoding
                    pathXinst = [(path_id, self.tok.shrink(inst))
                                 for path_id, inst in zip(path_ids, insts)]
                    baseline = reward_func(
                        rl_batch, pathXinst
                    )  # The reward func will evaluate the instruction
                    baseline = torch.FloatTensor(baseline).cuda().unsqueeze(1)
                else:
                    baseline_hiddens = hiddens if args.grad_baseline else hiddens.detach(
                    )
                    baseline = self.decoder.baseline_projection(
                        baseline_hiddens).squeeze()

            # print("Reward Mean %0.4f, std %0.4f" % (reward.mean().detach().cpu().item(), reward.std().detach().cpu().item()))
            # print("Baseline Mean %0.4f, std %0.4f" % (baseline.mean().detach().cpu().item(), baseline.std().detach().cpu().item()))
            # print("Avg abs(Reward - Baseline): %0.4f" % (torch.abs(reward - baseline).mean().detach().cpu().item()))

            # Calculating the Loss
            reward = reward.unsqueeze(1)  # (batch_size,) --> (batch_size, 1)

            if args.normalize_reward:  # Standardize the reward (then scale by 0.2)
                advantage = (reward - baseline) / reward.std() * 0.2
            else:
                advantage = reward - baseline

            policy_loss = (advantage.detach() * (-log_probs) * mask).sum(
            ) / self.env.batch_size  # Normalized by the batch_size
            baseline_loss = (advantage**2 * mask).sum() / self.env.batch_size
            avg_entropy = (entropies * mask).sum() / self.env.batch_size

            # Add the Loss to the joint_loss
            if baseline_weight != 0.:  # To support the pretrain phase
                joint_loss += baseline_loss * baseline_weight

            if policy_weight != 0.:  # To support the finetune phase
                joint_loss += policy_loss * policy_weight

            if entropy_weight != 0.:  # The negative entropy is added to encourage exploration
                joint_loss += -avg_entropy * entropy_weight

            # ML env preparation
            if ml_env is not None:  # Get the env from ml_env
                old_env = self.env
                self.env = ml_env
                self.env.reset()
            else:  # else reset the same env as RL
                self.env.reset(batch=rl_batch)

            # ML Training
            assert ml_weight != 0  # Because I always log the ml_weight, it should always exist!
            if ml_weight != 0.:
                ml_loss = self.teacher_forcing(train=True)
                joint_loss += ml_loss * ml_weight
            else:
                ml_loss = 0.

            if ml_env is not None:
                self.env = old_env

            # print("Reward Mean %0.4f, std %0.4f" % (reward.mean().detach().cpu().item(), reward.std().detach().cpu().item()))
            # print("Baseline Mean %0.4f, std %0.4f" % (baseline.mean().detach().cpu().item(), baseline.std().detach().cpu().item()))
            # print("Avg abs(Reward - Baseline): %0.4f" % (torch.abs(reward - baseline).mean().detach().cpu().item()))

            # Log:
            for name, loss in (('baseline_loss',
                                baseline_loss.detach().item()),
                               ('policy_loss', policy_loss.detach().item()),
                               ('ml_loss', ml_loss.detach().item()),
                               ('baseline', baseline.mean().detach().item()),
                               ('reward', reward.mean().detach().item()),
                               ('baseline_std',
                                baseline.std().detach().item()),
                               ('reward_std', reward.std().detach().item()),
                               ('reward_diff',
                                torch.abs(reward -
                                          baseline).mean().detach().item()),
                               ('entropy', avg_entropy.item())):
                log_dict[name] += loss

            # Backward
            joint_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), 40.)
            torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), 40.)
            self.encoder_optimizer.step()
            self.decoder_optimizer.step()
        return log_dict
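The length of each sampled instruction is recovered from the position of its first <EOS> token, falling back to the full width when no <EOS> was produced, and then turned into a mask so padded steps do not contribute to the policy loss. That length trick in isolation (the <EOS> index is an assumption):

import numpy as np

EOS = 2  # assumed <EOS> index

def inst_lengths(insts):
    """insts: int array of shape (batch, max_len). Length = position of the first <EOS> + 1."""
    length = np.argmax(insts == EOS, axis=1) + 1    # argmax gives 0 when no <EOS> is present...
    length[length == 1] = insts.shape[1]            # ...so treat that as the full length
    return length

# inst_lengths(np.array([[5, 7, 2, 0], [5, 7, 9, 8]]))  ->  array([3, 4])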
Example #11
    def beam_infer_batch(self, beam_size=5, seq_num=20, candidates=20):
        """

        :param beam_size:  The width of the beam search
        :param seq_num:    The maximum number of returned sequences
        :param candidates: The maximum number of candidate sequences
        :return: [[seq 1, seq 2, ... (seq_num in total)] (for batch 1),
                  [seq 1, seq 2, ... (seq_num in total)] (for batch 2),
                  ...,
                  [seq 1, seq 2, ... (seq_num in total)] (for batch n)]
        """
        # Eval Model
        self.encoder.eval()
        self.decoder.eval()

        # Input for the Encoder
        obs = self.env._get_obs()
        batch_size = len(obs)
        (img_feats, can_feats), lengths = self.from_shortest_path(
        )  # Feature from the shortest path

        # Encoder
        ctx = self.encoder(can_feats, img_feats, lengths)  # Encode
        ctx_mask = utils.length2mask(lengths)

        # Init of the decoder
        results = []
        h_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
        c_t = torch.zeros(1, batch_size, args.rnn_dim).cuda()
        ended = np.zeros(len(obs), np.int64)  # np.int was removed from recent NumPy
        word = np.ones(
            len(obs),
            np.int64) * self.tok.word_to_index['<BOS>']  # First word is <BOS>
        word = torch.from_numpy(word).view(-1, 1).cuda()

        # Beam Search Initialization
        bs_now = 1
        pre_scores = torch.zeros((batch_size, bs_now))
        vocab_size = self.tok.vocab_size()
        for t in range(args.maxDecode):
            logits, h_t, c_t = self.decoder(
                word, ctx, ctx_mask, h_t,
                c_t)  # Decode, logits: (b, 1, vocab_size)
            logits = logits.view(batch_size, bs_now,
                                 -1)  # logits: (b, beam_size, vocab_size)

            log_prob = F.log_softmax(
                logits, dim=2).cpu()  # logit --> log_softmax--> log_prob
            scores = pre_scores.unsqueeze(
                -1) + log_prob  # scores: (batch, beam, vocab_size)

            # select top beam_size words. save it
            scores, word = scores.view(batch_size, -1).topk(beam_size, dim=1)
            beam = word // vocab_size  # beam index: (batch, beam), e.g. [[0,1,1], [0,1,2]]
            word = word % vocab_size

            # Log the result
            for i in range(batch_size):
                if ended[i] >= candidates:  # if the maximum seq exceeded, don't add it
                    word[i] = self.tok.word_to_index['<PAD>']
            results.append({
                "beam": beam,
                "word": word,
                "scores": scores.detach().clone()
            })  # Save it before change the scores

            # For next step
            beam = beam + torch.arange(batch_size, dtype=torch.int64).view(
                -1, 1) * bs_now  #  [[0,1,1], [3,4,5], ..

            def gather_beam(state):  # State: (batch * beam, rnn_dim)
                return state[:, beam.view(-1)]

            h_t, c_t = (gather_beam(state) for state in (h_t, c_t))
            pre_scores = scores
            bs_now = beam_size
            assert bs_now == beam.size(1)

            # Handle the end_beams by setting the pre_scores to a very small value
            for i in range(word.size(0)):
                flag = True
                for j in range(word.size(1)):
                    if word[i][j] == self.tok.word_to_index['<EOS>']:
                        pre_scores[i][j] = -float(
                            'inf'
                        )  # Set the score to -inf (so it will not appear in next step)
                        ended[i] += 1  # One more <end> seq for batch i
                    else:
                        flag = False
                if flag:  # If all ended, set it to maximum
                    ended[i] = candidates
                #assert not flag         # If all the beams want to end, just stop here.

            # At last, change the input
            word = word.view(-1, 1).cuda()

            # Should it stop now?
            if (ended >= candidates).all():
                break

        seqs = self.get_all_ends(results, batch_size)
        results = []
        for i in range(batch_size):
            # sorted_seq = sorted(seqs[i], key=lambda x: x['score'] / len(x['inst']), reverse=True)
            # sorted_seq = sorted(seqs[i], key=lambda x: x['score'] - 0.5 * abs(29 - len(x['inst'])), reverse=True)
            sorted_seq = sorted(seqs[i],
                                key=lambda x: x['score'],
                                reverse=True)
            # print(sorted_seq)
            results.append([list(seq['inst']) for seq in sorted_seq[:seq_num]])

        # print()

        # for seq in results[0]:
        #     print(self.tok.decode_sentence(seq))

        return results  # [[inst_1, inst_2, ..., inst_{seq_num}], ... ]
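The nested list above pairs each batch item with its top seq_num word-index sequences. A minimal usage sketch of consuming that return value (hypothetical names: decode_beams stands for whichever method wraps the code above; tok.decode_sentence mirrors the tokenizer helper already used in these examples):

# Hypothetical usage sketch -- `decode_beams` is a placeholder for the method
# shown above, and `tok` for the same tokenizer it uses.
beams_per_item = decode_beams(beam_size=3, seq_num=3)
for i, beams in enumerate(beams_per_item):
    for rank, inst in enumerate(beams):
        print("item %d, rank %d: %s" % (i, rank, tok.decode_sentence(inst)))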
Example No. 12
    def rollout(self):
        obs = np.array(self.env.reset())
        batch_size = len(obs)

        # Reorder the language input for the encoder
        seq, seq_mask, seq_lengths, perm_idx = self._sort_batch(obs)
        perm_obs = obs[perm_idx]

        # Record starting point
        traj = [{
            'inst_idx': ob['inst_idx'],
            'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])]
        } for ob in perm_obs]

        # Forward through encoder, giving initial hidden state and memory cell for decoder
        Last_QA_tensor = np.array([ob['Last_QA_enc'] for ob in perm_obs])
        Last_QA_lengths = np.argmax(Last_QA_tensor == padding_idx, axis=1)
        Last_QA_lengths[Last_QA_lengths == 0] = Last_QA_tensor.shape[
            1]  # argmax returns 0 when there is no <PAD>, i.e. the sequence uses its full length
        Last_QA_lengths = torch.from_numpy(Last_QA_lengths)
        Last_QA_lengths = Last_QA_lengths.long().cuda()
        Last_QA_tensor = torch.from_numpy(Last_QA_tensor)
        Last_QA = Variable(Last_QA_tensor, requires_grad=False).long().cuda()

        H = []
        H_l = []
        for i, ob in enumerate(perm_obs):
            H.append([])
            H_l.append([])
            for j in range(15):
                H[i].append(ob['hist_enc'][j])
                h = np.array([ob['hist_enc'][j]])
                h_l = int(np.argmax(h == padding_idx, axis=1)[0])  # position of the first <PAD>
                if h_l == 0:  # no usable prefix found; clamp the length to 1
                    H_l[i].append(1)
                else:
                    H_l[i].append(h_l)
        hist_tensor = np.array(H)
        hist_tensor = torch.from_numpy(hist_tensor)
        hist = Variable(hist_tensor, requires_grad=False).long().cuda()
        hist_lengths = np.array(H_l)
        hist_lengths = torch.from_numpy(hist_lengths)
        hist_lengths = hist_lengths.long().cuda()

        tar_tensor = np.array([ob['tar_enc'] for ob in perm_obs])
        tar_lengths = np.ones(batch_size)
        tar_lengths = torch.from_numpy(tar_lengths)
        tar_lengths = tar_lengths.long().cuda()
        tar_tensor = torch.from_numpy(tar_tensor)
        tar = Variable(tar_tensor, requires_grad=False).long().cuda()

        ctx, h_t, c_t = self.encoder(seq, seq_lengths, Last_QA,
                                     Last_QA_lengths, hist, hist_lengths, tar,
                                     tar_lengths)  # Initial action

        # Last_QA_mask = utils.length2mask(Last_QA_lengths.cpu())
        a_t = Variable(torch.ones(batch_size).long() *
                       self.model_actions.index('<start>'),
                       requires_grad=False).cuda()
        ended = np.array(
            [False] *
            batch_size)  # Indices match permutation of the model, not env

        visited = [set() for _ in perm_obs]

        # Do a sequence rollout and calculate the loss
        self.loss = 0
        env_action = [None] * batch_size
        h1 = h_t
        for t in range(self.episode_len):
            input_a_t, f_t, candidate_feat, candidate_leng = self.get_input_feat(
                perm_obs)
            candidate_mask = utils.length2mask(candidate_leng)
            f_t = self._feature_variable(perm_obs)  # Image features from obs
            h_t, c_t, logit, h1 = self.decoder(input_a_t, f_t, candidate_feat,
                                               h_t, h1, c_t, ctx, None)

            if 'test' in self.env.splits:
                for ob_id, ob in enumerate(perm_obs):
                    visited[ob_id].add(ob['viewpoint'])
                    for c_id, c in enumerate(ob['candidate']):
                        if c['viewpointId'] in visited[ob_id]:
                            candidate_mask[ob_id][c_id] = 1

            logit.masked_fill_(candidate_mask, -float('inf'))

            # Supervised training
            if 'test' not in self.env.splits:
                target = self._teacher_action(perm_obs, ended)
                # self.loss += self.criterion(logit, target)
                tmp_loss = self.criterion(logit, target)
                if not math.isinf(tmp_loss):
                    self.loss += tmp_loss

            # Determine next model inputs
            if self.feedback == 'teacher':
                a_t = target  # teacher forcing
            elif self.feedback == 'argmax':
                _, a_t = logit.max(1)  # student forcing - argmax
                a_t = a_t.detach()
            elif self.feedback == 'sample':
                probs = F.softmax(logit, dim=1)
                m = D.Categorical(probs)
                a_t = m.sample()  # sampling an action from model
            else:
                sys.exit('Invalid feedback option')

            cpu_a_t = a_t.cpu().numpy()
            for i, next_id in enumerate(cpu_a_t):
                if next_id == (
                        candidate_leng[i] - 1
                ) or next_id == args.ignoreid:  # The last action is <end>
                    cpu_a_t[i] = -1

            self.make_equiv_action(cpu_a_t, perm_obs, perm_idx, traj)
            obs = np.array(self.env._get_obs())
            perm_obs = obs[perm_idx]

            ended[:] = np.logical_or(ended, (cpu_a_t == -1))
            # Early exit if all ended
            if ended.all():
                break
        if 'test' not in self.env.splits:
            self.losses.append(self.loss.item() / self.episode_len)
        return traj
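utils.length2mask is used throughout these examples but never shown; below is a minimal sketch of the behaviour the calls above rely on (an assumption, not the original helper): positions at or beyond each sequence length come out True, i.e. masked.

import torch

def length2mask(lengths, size=None):
    # lengths: iterable of ints; returns a (batch, size) bool mask where
    # True marks padded positions (index >= length).
    lengths = torch.as_tensor(lengths)
    size = int(lengths.max().item()) if size is None else size
    positions = torch.arange(size).unsqueeze(0)      # (1, size)
    return positions >= lengths.unsqueeze(1)         # (batch, size)

# length2mask([2, 3], 4) ->
# [[False, False, True,  True],
#  [False, False, False, True]]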
Example No. 13
    def from_shortest_path(self, viewpoints=None, get_first_feat=False):
        """
        :param viewpoints: [[], [], ....(batch_size)]. Only used for viewpoint dropout
        :param get_first_feat: whether to also return the first feature
        :return:
        """
        obs = self.env._get_obs()
        ended = np.array(
            [False] *
            len(obs))  # Indices match permutation of the model, not env
        length = np.zeros(len(obs), np.int64)
        img_feats = []
        can_feats = []
        teacher_actions = []
        teacher_actions_1h = []
        candidate_feats = []
        candidate_masks = []
        first_feat = np.zeros((len(obs), self.obs_dim), np.float32)
        for i, ob in enumerate(obs):
            first_feat[i, -args.angle_feat_size:] = utils.angle_feature(
                ob['heading'], ob['elevation'])
        first_feat = torch.from_numpy(first_feat).cuda()
        while not ended.all():
            if viewpoints is not None:
                for i, ob in enumerate(obs):
                    viewpoints[i].append(ob['viewpoint'])
            teacher_action = self._teacher_action(obs, ended)
            teacher_action = teacher_action.cpu().numpy()
            # TODO: why last teacher action not -1
            teacher_actions.append(teacher_action.copy())
            candidate_length = [len(ob['candidate']) + 1
                                for ob in obs]  # +1 is for the end
            candidate_feat = np.zeros(
                (len(obs), max(candidate_length), self.obs_dim))
            # NOTE: The candidate_feat at len(ob['candidate']) is the feature for the END, which is zero in my implementation
            for i, ob in enumerate(obs):
                for j, c in enumerate(ob['candidate']):
                    candidate_feat[i, j, :] = c['feature']
            candidate_feats.append(torch.Tensor(candidate_feat).cuda())
            candidate_masks.append(utils.length2mask(candidate_length))
            img_feats.append(self._feature_variable(obs))
            for i, act in enumerate(teacher_action):
                if act < 0 or act == len(
                        obs[i]['candidate']):  # Ignore or Stop
                    teacher_action[i] = -1  # Stop Action
            can_feats.append(self._candidate_variable(obs, teacher_action))
            self.make_equiv_action(teacher_action, obs)
            length += (1 - ended)
            ended[:] = np.logical_or(ended, (teacher_action == -1))
            obs = self.env._get_obs()
            # TODO: heading random ?
            # TODO: policy decoder behavior clone
            # TODO: state decoder mse
            # TODO: state decoder weight = 0 ?

        assert len(teacher_actions) == len(candidate_feats) == len(
            candidate_masks)
        _max = max(feat.shape[1] for feat in candidate_feats)  # widest candidate set over all steps
        shape_list = np.array(candidate_feats[0].shape)
        shape_list[1] = 1
        feat_pad_vec = torch.zeros(tuple(shape_list)).cuda()
        shape_list = np.array(candidate_masks[0].shape)
        shape_list[1] = 1
        mask_pad_vec = torch.ones(tuple(shape_list)).bool().cuda()
        for i in range(len(candidate_feats)):
            diff = _max - candidate_feats[i].shape[1]
            diff2 = _max - candidate_masks[i].shape[1]
            assert diff == diff2
            if diff > 0:
                candidate_feats[i] = torch.cat(
                    [candidate_feats[i],
                     feat_pad_vec.repeat(1, diff, 1)],
                    dim=1)
                candidate_masks[i] = torch.cat(
                    [candidate_masks[i],
                     mask_pad_vec.repeat(1, diff)], dim=1)
            # convert teacher actions to one-hot vectors
            teacher_actions_1h.append(
                torch.nn.functional.one_hot(torch.LongTensor(
                    teacher_actions[i]),
                                            num_classes=_max).cuda())

        img_feats = torch.stack(
            img_feats, 1).contiguous()  # batch_size, max_len, 36, 2052
        can_feats = torch.stack(can_feats,
                                1).contiguous()  # batch_size, max_len, 2052
        teacher_actions_1h = torch.stack(teacher_actions_1h, 1).contiguous()
        candidate_feats = torch.stack(candidate_feats, 1).contiguous()
        candidate_masks = torch.stack(candidate_masks, 1).contiguous()
        if get_first_feat:
            return (img_feats, can_feats, first_feat), length
        else:
            return (img_feats, can_feats, teacher_actions_1h, candidate_feats,
                    candidate_masks), length
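The per-step candidate tensors above are padded to a common width before being stacked along the time dimension, and teacher actions are turned into one-hot rows of that width. A small self-contained sketch of the same idea on toy shapes (the sizes are made up for illustration):

import torch
import torch.nn.functional as F

# Two decoding steps with different candidate counts (3 and 5); pad both to width 5.
step_feats = [torch.zeros(2, 3, 8), torch.zeros(2, 5, 8)]         # (batch, n_cand, dim)
step_masks = [torch.zeros(2, 3).bool(), torch.zeros(2, 5).bool()]
width = max(f.shape[1] for f in step_feats)

for i, (f, m) in enumerate(zip(step_feats, step_masks)):
    diff = width - f.shape[1]
    if diff > 0:
        step_feats[i] = torch.cat([f, f.new_zeros(f.shape[0], diff, f.shape[2])], dim=1)
        step_masks[i] = torch.cat([m, m.new_ones(m.shape[0], diff)], dim=1)   # padded slots are masked

feats = torch.stack(step_feats, 1)    # (batch, steps, width, dim)
masks = torch.stack(step_masks, 1)    # (batch, steps, width)

# Teacher actions become one-hot rows of the same width, as in the code above.
actions = torch.tensor([[1, 4], [2, 0]])             # (batch, steps)
actions_1h = F.one_hot(actions, num_classes=width)   # (batch, steps, width)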
Example No. 14
    def rollout(self, train_ml=None, train_rl=True, reset=True, speaker=None):
        """
        :param train_ml:    The weight for the maximum-likelihood loss
        :param train_rl:    whether to use RL in training
        :param reset:       whether to reset the environment
        :param speaker:     Speaker used in back translation.
                            If the speaker is not None, use back translation.
                            Otherwise, train normally.
        :return:
        """
        if self.feedback == 'teacher' or self.feedback == 'argmax':
            train_rl = False

        if reset:
            # Reset env
            obs = np.array(self.env.reset())
        else:
            obs = np.array(self.env._get_obs())

        batch_size = len(obs)

        if speaker is not None:         # Trigger the self_train mode!
            noise = self.decoder.drop_env(torch.ones(self.feature_size).cuda())
            batch = self.env.batch.copy()
            speaker.env = self.env
            insts = speaker.infer_batch(featdropmask=noise)     # Use the same drop mask in speaker

            # Create fake environments with the generated instruction
            boss = np.ones((batch_size, 1), np.int64) * self.tok.word_to_index['<BOS>']  # First word is <BOS>
            insts = np.concatenate((boss, insts), 1)
            for i, (datum, inst) in enumerate(zip(batch, insts)):
                if inst[-1] != self.tok.word_to_index['<PAD>']: # The inst is not ended!
                    inst[-1] = self.tok.word_to_index['<EOS>']
                datum.pop('instructions')
                datum.pop('instr_encoding')
                datum['instructions'] = self.tok.decode_sentence(inst)
                datum['instr_encoding'] = inst
            obs = np.array(self.env.reset(batch))

        # Reorder the language input for the encoder (do not ruin the original code)
        seq, seq_mask, seq_lengths, perm_idx = self._sort_batch(obs)
        perm_obs = obs[perm_idx]

        ctx, h_t, c_t = self.encoder(seq, seq_lengths)
        ctx_mask = seq_mask

        # Init the reward shaping
        last_dist = np.zeros(batch_size, np.float32)
        for i, ob in enumerate(perm_obs):   # The init distance from the view point to the target
            last_dist[i] = ob['distance']

        # Record starting point
        traj = [{
            'instr_id': ob['instr_id'],
            'path': [(ob['viewpoint'], ob['heading'], ob['elevation'])]
        } for ob in perm_obs]

        # For test result submission
        visited = [set() for _ in perm_obs]

        # Initialize the tracking state
        ended = np.array([False] * batch_size)  # Indices match permutation of the model, not env

        # Init the logs
        rewards = []
        hidden_states = []
        policy_log_probs = []
        masks = []
        entropys = []
        ml_loss = 0.

        h1 = h_t
        for t in range(self.episode_len):

            input_a_t, f_t, candidate_feat, candidate_leng = self.get_input_feat(perm_obs)
            if speaker is not None:       # Apply the env drop mask to the feat
                candidate_feat[..., :-args.angle_feat_size] *= noise
                f_t[..., :-args.angle_feat_size] *= noise

            h_t, c_t, logit, h1 = self.decoder(input_a_t, f_t, candidate_feat,
                                               h_t, h1, c_t,
                                               ctx, ctx_mask,
                                               already_dropfeat=(speaker is not None))

            hidden_states.append(h_t)

            # Mask outputs where agent can't move forward
            # Here the logit is [b, max_candidate]
            candidate_mask = utils.length2mask(candidate_leng)
            if args.submit:     # Avoid cyclic paths
                for ob_id, ob in enumerate(perm_obs):
                    visited[ob_id].add(ob['viewpoint'])
                    for c_id, c in enumerate(ob['candidate']):
                        if c['viewpointId'] in visited[ob_id]:
                            candidate_mask[ob_id][c_id] = 1
            logit.masked_fill_(candidate_mask, -float('inf'))  # positions where the mask is True (not a candidate) become -inf
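Filling masked positions with -inf before the softmax guarantees that invalid candidates receive exactly zero probability, so neither argmax nor sampling can pick them. A small self-contained illustration of that pattern:

import torch
import torch.nn.functional as F

logit = torch.tensor([[2.0, 1.0, 0.5, 0.0]])
mask = torch.tensor([[False, False, True, True]])   # True = not a reachable candidate
logit = logit.masked_fill(mask, -float('inf'))
probs = F.softmax(logit, dim=1)
# probs ~= [[0.731, 0.269, 0.000, 0.000]] -- masked candidates can never be chosen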
Example No. 15
    def beam_search(self, beam_size=3, train=False):
        if train:
            self.encoder.train()
            self.decoder.train()
        else:
            self.encoder.eval()
            self.decoder.eval()

        # Image Input for the Encoder
        obs = self.env._get_obs()
        batch_size = len(obs)
        viewpoints_list = [list() for _ in range(batch_size)]

        # Get feature
        (img_feats, can_feats), lengths = self.from_shortest_path(
            viewpoints=viewpoints_list
        )  # Image Feature (from the shortest path)

        # Encoder
        ctx = self.encoder(can_feats,
                           img_feats,
                           lengths,
                           already_dropfeat=True)
        ctx_mask = utils.length2mask(lengths)

        h_t = torch.zeros(1, batch_size, self.args.rnn_dim).cuda()
        c_t = torch.zeros(1, batch_size, self.args.rnn_dim).cuda()

        completed = []
        for _ in range(batch_size):
            completed.append([])

        beams = [[
            InferenceState(prev_inference_state=None,
                           flat_index=i,
                           last_word=self.tok.word_to_index['<BOS>'],
                           word_count=0,
                           score=0.0)
        ] for i in range(batch_size)]

        for t in range(self.args.maxDecode):
            flat_indices = []
            beam_indices = []
            w_t_list = []
            for beam_index, beam in enumerate(beams):
                for inf_state in beam:
                    beam_indices.append(beam_index)
                    flat_indices.append(inf_state.flat_index)
                    w_t_list.append(inf_state.last_word)

            # w_t = try_cuda(Variable(torch.LongTensor(w_t_list), requires_grad=False))
            w_t = torch.from_numpy(np.array(w_t_list)).long().cuda()
            # if len(w_t.shape) == 1:
            #     w_t = w_t.unsqueeze(0)

            logit, h_t, c_t = self.decoder(w_t.view(-1, 1), ctx[beam_indices],
                                           ctx_mask[beam_indices],
                                           h_t[:, flat_indices],
                                           c_t[:, flat_indices])

            logit = logit.squeeze(1)

            logit[:, self.tok.word_to_index['<UNK>']] = -float(
                "inf")  # Never generate <UNK> at inference time

            # h_t,c_t,alpha,logit = self.decoder(w_t.view(-1, 1), h_t[flat_indices], c_t[flat_indices], ctx[beam_indices], path_mask[beam_indices])

            log_probs = F.log_softmax(logit, dim=1).data  # num x dim
            _, word_indices = logit.data.topk(min(beam_size,
                                                  logit.size()[1]),
                                              dim=1)  # num x beam_size
            word_scores = log_probs.gather(1, word_indices)
            assert word_scores.size() == word_indices.size()

            start_index = 0
            new_beams = []
            all_successors = []
            for beam_index, beam in enumerate(beams):
                successors = []
                end_index = start_index + len(beam)
                if beam:
                    for inf_index, (inf_state, word_score_row, word_index_row) in \
                        enumerate(zip(beam, word_scores[start_index:end_index], word_indices[start_index:end_index])):
                        for word_score, word_index in zip(
                                word_score_row, word_index_row):
                            flat_index = start_index + inf_index
                            successors.append(
                                InferenceState(
                                    prev_inference_state=inf_state,
                                    flat_index=flat_index,
                                    last_word=word_index.item(),
                                    word_count=inf_state.word_count + 1,
                                    score=inf_state.score + word_score.item()))
                start_index = end_index
                successors = sorted(successors,
                                    key=lambda t: t.score,
                                    reverse=True)[:beam_size]
                all_successors.append(successors)

            for beam_index, successors in enumerate(all_successors):
                new_beam = []
                for successor in successors:
                    if successor.last_word == self.tok.word_to_index[
                            '<EOS>'] or t == self.args.maxDecode - 1:
                        completed[beam_index].append(successor)
                    else:
                        new_beam.append(successor)
                if len(completed[beam_index]) >= beam_size:
                    new_beam = []
                new_beams.append(new_beam)

            beams = new_beams

            if not any(beam for beam in beams):
                break

        words_batch = {}
        # max_len = 0
        for i in range(batch_size):
            path_id = obs[i]['path_id']
            if path_id not in words_batch:
                words_batch[path_id] = []
            this_completed = completed[i]
            this_completed = sorted(this_completed,
                                    key=lambda t: t.score,
                                    reverse=True)[:beam_size]
            for inf_state in this_completed:
                word_indices = backchain_inference_states(inf_state)
                words_batch[path_id].append(word_indices)
                # max_len = max(max_len,len(word_indices))

        # res = np.ones([batch_size, max_len]).astype(np.int32) * self.tok.word_to_index['<PAD>']
        # for i,words in enumerate(words_batch):
        #     for j,w in enumerate(words):
        #         res[i,j] = w

        return words_batch
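backchain_inference_states is called above but not shown in these examples; below is a minimal sketch of what the call appears to assume: walk the prev_inference_state links back to the start and return the word indices in decoding order (whether the leading <BOS> is kept is a guess).

def backchain_inference_states(last_state):
    # Follow the InferenceState chain backwards, then reverse it so the
    # word indices come out in the order they were generated.
    words = []
    state = last_state
    while state is not None:
        words.append(state.last_word)
        state = state.prev_inference_state
    words.reverse()
    return words[1:]   # drop the initial <BOS> (assumption)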
Example No. 16
    def teacher_forcing(self,
                        train=True,
                        features=None,
                        insts=None,
                        for_listener=False,
                        perm_idx=None,
                        creator=None):
        if train:
            self.encoder.train()
            self.decoder.train()
        else:
            self.encoder.eval()
            self.decoder.eval()

        # Get Image Input & Encode
        if features is not None:
            # Used when calculating the speaker score in beam search
            # assert insts is not None
            obs = np.array(self.env._get_obs())
            if perm_idx is not None:
                obs = obs[perm_idx]
            (img_feats, can_feats), lengths = features
            ctx = self.encoder(can_feats,
                               img_feats,
                               lengths,
                               already_dropfeat=True)
            batch_size = len(lengths)
        else:
            obs = self.env._get_obs()
            batch_size = len(obs)
            if creator is not None:
                (img_feats,
                 can_feats), lengths, weights_reg = self.from_shortest_path(
                     creator=creator)  # Image Feature (from the shortest path)
            else:
                (img_feats, can_feats), lengths = self.from_shortest_path()

            ctx = self.encoder(can_feats, img_feats, lengths)

        h_t = torch.zeros(1, batch_size, self.args.rnn_dim).cuda()
        c_t = torch.zeros(1, batch_size, self.args.rnn_dim).cuda()
        ctx_mask = utils.length2mask(lengths)

        # Get Language Input
        if insts is None:
            insts = self.gt_words(obs)  # Language Feature

        # Decode
        logits, _, _ = self.decoder(insts, ctx, ctx_mask, h_t, c_t)

        # The softmax loss expects the class dimension at dim 1,
        # so permute the output (batch_size, length, logit) --> (batch_size, logit, length)
        logits = logits.permute(0, 2, 1).contiguous()
        loss = self.softmax_loss(
            input=logits[:, :, :-1],  # -1 for aligning
            target=insts[:, 1:]  # "1:" to ignore the word <BOS>
        )

        if check(loss):
            print('lengths', lengths)
            print('loss is nan', loss)
            # print('logits', logits)
            for i, t in enumerate(insts):
                l = self.softmax_loss(input=logits[i, :, :-1].unsqueeze(0),
                                      target=t[1:].unsqueeze(0))
                if check(l):
                    print('case', i)
                    print('inst', t[1:])
                    print('ctx', check(ctx[i]))

                    print('length', lengths[i])

                    # for j,label in enumerate(t[1:]):
                    #     label = label.item()
                    #     if label != self.tok.word_to_index['<PAD>']:
                    #         print('pos %d, word %s, logit'%(j, self.tok.index_to_word[label]),logits[i,j])

            assert False

        if for_listener:
            inst_mask = insts[:, 1:] != self.tok.word_to_index['<PAD>']
            return self.nonreduced_softmax_loss(
                input=logits[:, :, :-1],  # -1 for aligning
                target=insts[:, 1:]  # "1:" to ignore the word <BOS>
            ), inst_mask

        if train:
            if creator is not None:
                return loss, weights_reg
            return loss
        else:
            # Evaluation
            _, predict = logits.max(dim=1)  # BATCH, LENGTH
            gt_mask = (insts != self.tok.word_to_index['<PAD>'])
            correct = (predict[:, :-1] == insts[:, 1:]
                       ) * gt_mask[:, 1:]  # Not pad and equal to gt
            correct, gt_mask = correct.type(torch.LongTensor), gt_mask.type(
                torch.LongTensor)
            word_accu = correct.sum().item() / gt_mask[:, 1:].sum().item(
            )  # Exclude <BOS>
            sent_accu = (correct.sum(dim=1) == gt_mask[:, 1:].sum(
                dim=1)).sum().item() / batch_size  # Exclude <BOS>
            return loss.item(), word_accu, sent_accu
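The permute and the off-by-one slicing above pair the prediction made after reading token t with ground-truth token t+1, so <BOS> is never a target. A toy sketch of the same shapes, assuming softmax_loss is a CrossEntropyLoss-style criterion that ignores <PAD> (the sizes and the <BOS> index below are made up):

import torch
import torch.nn as nn

batch, length, vocab, pad = 2, 6, 10, 0
logits = torch.randn(batch, length, vocab).permute(0, 2, 1)   # (batch, vocab, length)
insts = torch.randint(1, vocab, (batch, length))
insts[:, 0] = 3                                               # pretend index 3 is <BOS>

criterion = nn.CrossEntropyLoss(ignore_index=pad)
# The prediction at step t (fed token t) is scored against token t+1.
loss = criterion(logits[:, :, :-1], insts[:, 1:])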