Example 1
def make_scorer(args):

    bidirectional = args.bidirectional
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    if args.useObjLabelOrVis == 'none':
        feature_size, action_embedding_size = 2048 + 128, 2048 + 128
    elif args.useObjLabelOrVis == 'vis':
        feature_size, action_embedding_size = 2048 + 128 + args.objVisFeatDim, 2048 + 128 + args.objVisFeatDim
    elif args.useObjLabelOrVis == 'label':
        feature_size, action_embedding_size = 2048 + 128 + args.objLanFeatDim, 2048 + 128 + args.objLanFeatDim
    elif args.useObjLabelOrVis == 'both':
        feature_size = 2048 + 128 + args.objVisFeatDim + args.objLanFeatDim
        action_embedding_size = 2048 + args.objVisFeatDim + args.objLanFeatDim + 128

    traj_encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size,
                           feature_size,
                           enc_hidden_size,
                           dropout_ratio,
                           bidirectional=args.bidirectional))
    scorer_module = try_cuda(DotScorer(enc_hidden_size, enc_hidden_size))
    scorer = Scorer(scorer_module, traj_encoder)
    if args.load_scorer != '':
        scorer.load(args.load_scorer)
        print(colorize('load scorer traj ' + args.load_scorer))
    elif args.load_traj_encoder != '':
        scorer.load_traj_encoder(args.load_traj_encoder)
        print(colorize('load traj encoder ' + args.load_traj_encoder))
    return scorer
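All of the examples on this page rely on a try_cuda helper defined elsewhere in the repository. As a rough sketch (the actual definition in the source may differ), such a helper moves a tensor or module to the GPU when one is available and otherwise returns it unchanged:

import torch

def try_cuda(pytorch_obj):
    # Move a tensor or module to the GPU if CUDA is available;
    # otherwise return the object unchanged.
    if torch.cuda.is_available():
        return pytorch_obj.cuda()
    return pytorch_obj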
Example 2
def batch_observations_and_actions(path_obs, path_actions,
                                   encoded_instructions):
    seq_lengths = np.array([len(a) for a in path_actions])
    max_path_length = seq_lengths.max()

    # DO NOT permute the sequence, since here we are doing manual LSTM unrolling in encoder
    # perm_indices = np.argsort(-seq_lengths)
    perm_indices = np.arange(len(path_obs))
    #path_obs, path_actions, encoded_instructions, seq_lengths = zip(*sorted(zip(path_obs, path_actions, encoded_instructions, seq_lengths), key=lambda p: p[-1], reverse=True))
    # path_obs = [path_obs[i] for i in perm_indices]
    # path_actions = [path_actions[i] for i in perm_indices]
    # if encoded_instructions:
    #     encoded_instructions = [encoded_instructions[i] for i in perm_indices]
    # seq_lengths = [seq_lengths[i] for i in perm_indices]

    batch_size = len(path_obs)
    assert batch_size == len(path_actions)

    mask = np.ones((batch_size, max_path_length), np.uint8)
    action_embedding_dim = path_obs[0][0]['action_embedding'].shape[-1]
    batched_action_embeddings = [
        np.zeros((batch_size, action_embedding_dim), np.float32)
        for _ in range(max_path_length)
    ]
    feature_list = path_obs[0][0]['feature']
    assert len(feature_list) == 1
    image_feature_shape = feature_list[0].shape
    batched_image_features = [
        np.zeros((batch_size, ) + image_feature_shape, np.float32)
        for _ in range(max_path_length)
    ]
    for i, (obs, actions) in enumerate(zip(path_obs, path_actions)):
        # don't include the last state, which is the state reached after the stop action
        if len(obs) == len(actions) + 1:
            obs = obs[:-1]
        assert len(obs) == len(actions)
        mask[i, :len(actions)] = 0
        for t, (ob, a) in enumerate(zip(obs, actions)):
            batched_image_features[t][i] = ob['feature'][0]
            batched_action_embeddings[t][i] = ob['action_embedding'][a]
    batched_action_embeddings = [
        try_cuda(Variable(torch.from_numpy(act), requires_grad=False))
        for act in batched_action_embeddings
    ]
    batched_image_features = [
        try_cuda(Variable(torch.from_numpy(feat), requires_grad=False))
        for feat in batched_image_features
    ]
    mask = try_cuda(torch.from_numpy(mask))

    start_obs = [obs[0] for obs in path_obs]

    return start_obs, \
           batched_image_features, \
           batched_action_embeddings, \
           mask, \
           list(seq_lengths), \
           encoded_instructions, \
           list(perm_indices)
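The function above pads variable-length trajectories into time-major batches: one [batch_size, dim] array per timestep, plus a 0/1 mask in which 1 marks padding. A minimal, self-contained illustration of that padding pattern (dimensions and lengths are made up for illustration, independent of the repository code):

import numpy as np

action_dim, lengths = 4, [3, 1, 2]
batch_size, max_len = len(lengths), max(lengths)
mask = np.ones((batch_size, max_len), np.uint8)
steps = [np.zeros((batch_size, action_dim), np.float32) for _ in range(max_len)]
for i, n in enumerate(lengths):
    mask[i, :n] = 0  # 0 marks real timesteps, 1 marks padding
    for t in range(n):
        steps[t][i] = np.random.randn(action_dim)  # stand-in for a real action embedding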
Example 3
 def prepare_proposals(self, batch_h, batch_c, batch_obs, batch_acs):
     ''' for each action proposal, prepare its h,c
     input: existing traj h,c; observation; actions
     output: proposed (h,c) * [batch_size, max_proposal_size]
     '''
     batch_size, ac_size, _ = batch_acs.size()
     hidden_size = self.encoder.hidden_size
     proposal_h = try_cuda(torch.zeros(batch_size, ac_size, hidden_size))
     proposal_c = try_cuda(torch.zeros(batch_size, ac_size, hidden_size))
     for i in range(batch_size):
         h = batch_h[i].expand(ac_size, -1)
         c = batch_c[i].expand(ac_size, -1)
         obs = batch_obs[i].expand(ac_size, -1, -1)
         proposal_h[i], proposal_c[i] = self.encoder._forward_one_step(
             h, c, batch_acs[i], obs)
     return proposal_h.detach(), proposal_c.detach()
Example 4
 def combine_logit(self, scorer_logit, follower_logit):
     #import pdb;pdb.set_trace()
     if self.gamma == 0.0:
         return scorer_logit
     if self.gamma == 1.0:
         return follower_logit
     g, h = self.gamma, 1 - self.gamma
     prob = h * self.sm(scorer_logit) + g * self.sm(follower_logit)
     return try_cuda(torch.log(prob))
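combine_logit interpolates the scorer and follower distributions in probability space, with weight gamma on the follower, and returns the log of the mixture. A small self-contained sanity check of that computation (gamma and the logits are made up for illustration):

import torch
import torch.nn as nn

gamma = 0.3
sm = nn.Softmax(dim=1)
scorer_logit = torch.randn(2, 5)
follower_logit = torch.randn(2, 5)
prob = (1 - gamma) * sm(scorer_logit) + gamma * sm(follower_logit)
combined = torch.log(prob)
# A convex combination of two distributions is still a distribution.
assert torch.allclose(prob.sum(dim=1), torch.ones(2))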
Example 5
 def _feature_variable(self, obs, beamed=False):
     ''' Extract precomputed features into variable. '''
     features = [ob['feature'] for ob in (flatten(obs) if beamed else obs)]
     assert all(
         len(f) == 1 for f in features
     )  # currently only supports one image featurizer (without attention)
     features = np.stack(features)
     return try_cuda(
         Variable(torch.from_numpy(features), requires_grad=False))
Example 6
def make_speaker(args):
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    vocab = read_vocab(TRAIN_VOCAB)
    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size,
                           feature_size,
                           enc_hidden_size,
                           dropout_ratio,
                           bidirectional=bidirectional))
    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab),
                           word_embedding_size,
                           hidden_size,
                           dropout_ratio,
                           glove=glove))
    agent = Seq2SeqSpeaker(None, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)
    return agent
Example 7
def make_follower(args, vocab):
    enc_hidden_size = hidden_size // 2 if args.bidirectional else hidden_size
    glove_path = osp.join(file_path, 'data', 'train_glove.npy')  # not used
    glove = np.load(glove_path) if args.use_glove else None

    if args.useObjLabelOrVis == 'none':
        feature_size, action_embedding_size = 2048 + 128, 2048 + 128
    elif args.useObjLabelOrVis == 'vis':
        feature_size, action_embedding_size = 2048 + 128 + args.objVisFeatDim, 2048 + 128 + args.objVisFeatDim
    elif args.useObjLabelOrVis == 'label':
        feature_size, action_embedding_size = 2048 + 128 + args.objLanFeatDim, 2048 + 128 + args.objLanFeatDim
    elif args.useObjLabelOrVis == 'both':
        feature_size = 2048 + 128 + args.objVisFeatDim + args.objLanFeatDim
        action_embedding_size = 2048 + args.objVisFeatDim + args.objLanFeatDim + 128

    Encoder = TransformerEncoder if args.transformer else EncoderLSTM
    Decoder = CogroundDecoderLSTM if args.coground else AttnDecoderLSTM
    word_embedding_size = 256 if args.coground else 300
    encoder = try_cuda(
        Encoder(len(vocab),
                word_embedding_size,
                enc_hidden_size,
                vocab_pad_idx,
                dropout_ratio,
                bidirectional=args.bidirectional,
                glove=glove))
    decoder = try_cuda(
        Decoder(action_embedding_size,
                hidden_size,
                dropout_ratio,
                feature_size=feature_size,
                num_head=args.num_head))
    prog_monitor = try_cuda(ProgressMonitor(
        action_embedding_size, hidden_size)) if args.prog_monitor else None
    bt_button = try_cuda(BacktrackButton()) if args.bt_button else None
    dev_monitor = try_cuda(DeviationMonitor(
        action_embedding_size, hidden_size)) if args.dev_monitor else None

    agent = Seq2SeqAgent(None,
                         "",
                         encoder,
                         decoder,
                         max_episode_len,
                         max_instruction_length=MAX_INPUT_LENGTH,
                         attn_only_verb=args.attn_only_verb)
    agent.prog_monitor = prog_monitor
    agent.dev_monitor = dev_monitor  # not used
    agent.bt_button = bt_button  # not used
    agent.soft_align = args.soft_align  # not used

    if args.useObjLabelOrVis != 'none':
        if args.useDect:
            print('Using detection-based pointer')
            agent.pointer = DectPointer(args)
        else:
            print('Using gt-based pointer')
            agent.pointer = Pointer(args)
        agent.useObjLabelOrVis = args.useObjLabelOrVis
        agent.objTopK = args.objTopK
        agent.objVisFeatDim = args.objVisFeatDim
        agent.objLanFeatDim = args.objLanFeatDim
        agent.ObjEachViewVisFeatPath = osp.join(root_path, 'img_features',
                                                args.ObjEachViewVisFeatDir)
        agent.ObjEachViewLanFeatPath = osp.join(root_path, 'img_features',
                                                args.ObjEachViewLanFeatDir)

        agent.ObjEachViewVisFeat = {}
        agent.ObjEachViewLanFeat = {}
        dict_glove = np.load(args.labelGlovePath)  # for object label encoding
        if args.useObjLabelOrVis in ['label', 'both']:
            agent.objLabelEncoder = try_cuda(
                EncoderLSTMGlove(dict_glove.shape[0],
                                 300,
                                 int(enc_hidden_size / 2),
                                 vocab_pad_idx,
                                 dropout_ratio,
                                 bidirectional=True,
                                 glove=dict_glove))
        else:
            agent.objLabelEncoder = None
    else:
        agent.pointer = None

    if args.scorer:  # not used
        agent.scorer = make_scorer(args)

    if args.load_follower != '':
        scorer_exists = osp.isfile(args.load_follower + '_scorer_enc')
        agent.load(args.load_follower,
                   load_scorer=(args.load_scorer == '' and scorer_exists))
        print(colorize('load follower ' + args.load_follower))

    return agent
Example 8
 def __init__(self, scorer, encoder):
     self.scorer = scorer
     self.text_encoder = encoder
     self.traj_encoder = None
     self.sm = try_cuda(nn.Softmax(dim=1))
     self.gamma = 0.0  # how much follower_logit to consider
Example 9
 def transform(lst, wrap_with_var=True):
     features = np.stack(lst)
     x = torch.from_numpy(features)
     if wrap_with_var:
         x = Variable(x, requires_grad=False)
     return try_cuda(x)
Example 10
 def batch_features(self, feature_list):
     features = np.stack(feature_list)
     return try_cuda(
         Variable(torch.from_numpy(features), requires_grad=False))
Example 11
    def beam_search(self, beam_size, path_obs, path_actions):

        # TODO: here
        assert len(path_obs) == len(path_actions)

        start_obs, batched_image_features, batched_action_embeddings, path_mask, \
            path_lengths, _, perm_indices = \
            batch_observations_and_actions(path_obs, path_actions, None)
        batch_size = len(start_obs)
        assert len(perm_indices) == batch_size

        ctx, h_t, c_t = self.encoder(batched_action_embeddings,
                                     batched_image_features)

        completed = []
        for _ in range(batch_size):
            completed.append([])

        beams = [[
            InferenceState(prev_inference_state=None,
                           flat_index=i,
                           last_word=vocab_bos_idx,
                           word_count=0,
                           score=0.0,
                           last_alpha=None)
        ] for i in range(batch_size)]

        for t in range(self.instruction_len):
            flat_indices = []
            beam_indices = []
            w_t_list = []
            for beam_index, beam in enumerate(beams):
                for inf_state in beam:
                    beam_indices.append(beam_index)
                    flat_indices.append(inf_state.flat_index)
                    w_t_list.append(inf_state.last_word)
            w_t = try_cuda(
                Variable(torch.LongTensor(w_t_list), requires_grad=False))
            if len(w_t.shape) == 1:
                w_t = w_t.unsqueeze(0)

            h_t, c_t, alpha, logit = self.decoder(w_t.view(-1, 1),
                                                  h_t[flat_indices],
                                                  c_t[flat_indices],
                                                  ctx[beam_indices],
                                                  path_mask[beam_indices])

            log_probs = F.log_softmax(logit, dim=1).data
            _, word_indices = logit.data.topk(min(beam_size,
                                                  logit.size()[1]),
                                              dim=1)
            word_scores = log_probs.gather(1, word_indices)
            assert word_scores.size() == word_indices.size()

            start_index = 0
            new_beams = []
            all_successors = []
            for beam_index, beam in enumerate(beams):
                successors = []
                end_index = start_index + len(beam)
                if beam:
                    for inf_index, (inf_state, word_score_row, word_index_row) in \
                        enumerate(zip(beam, word_scores[start_index:end_index], word_indices[start_index:end_index])):
                        for word_score, word_index in zip(
                                word_score_row, word_index_row):
                            flat_index = start_index + inf_index
                            successors.append(
                                InferenceState(
                                    prev_inference_state=inf_state,
                                    flat_index=flat_index,
                                    last_word=word_index,
                                    word_count=inf_state.word_count + 1,
                                    score=inf_state.score + word_score,
                                    last_alpha=alpha[flat_index].data))
                start_index = end_index
                successors = sorted(successors,
                                    key=lambda t: t.score,
                                    reverse=True)[:beam_size]
                all_successors.append(successors)

            for beam_index, successors in enumerate(all_successors):
                new_beam = []
                for successor in successors:
                    if successor.last_word == vocab_eos_idx or t == self.instruction_len - 1:
                        completed[beam_index].append(successor)
                    else:
                        new_beam.append(successor)
                if len(completed[beam_index]) >= beam_size:
                    new_beam = []
                new_beams.append(new_beam)

            beams = new_beams

            if not any(beam for beam in beams):
                break

        outputs = []
        for _ in range(batch_size):
            outputs.append([])

        for perm_index, src_index in enumerate(perm_indices):
            this_outputs = outputs[src_index]
            assert len(this_outputs) == 0

            this_completed = completed[perm_index]
            instr_id = start_obs[perm_index]['instr_id']
            for inf_state in sorted(this_completed,
                                    key=lambda t: t.score,
                                    reverse=True)[:beam_size]:
                word_indices, scores, attentions = backchain_inference_states(
                    inf_state)
                this_outputs.append({
                    'instr_id': instr_id,
                    'word_indices': word_indices,
                    'score': inf_state.score,
                    'scores': scores,
                    'words': self.env.tokenizer.decode_sentence(
                        word_indices, break_on_eos=True, join=False),
                    'attentions': attentions,
                })
        return outputs
Example 12
    def _score_obs_actions_and_instructions(self, path_obs, path_actions,
                                            encoded_instructions, feedback):
        assert len(path_obs) == len(path_actions)
        assert len(path_obs) == len(encoded_instructions)
        start_obs, batched_image_features, batched_action_embeddings, path_mask, \
            path_lengths, encoded_instructions, perm_indices = \
            batch_observations_and_actions(
                path_obs, path_actions, encoded_instructions)

        instr_seq, _, _ = batch_instructions_from_encoded(
            encoded_instructions, self.instruction_len)

        batch_size = len(start_obs)

        ctx, h_t, c_t = self.encoder(batched_action_embeddings,
                                     batched_image_features)

        w_t = try_cuda(
            Variable(torch.from_numpy(
                np.full((batch_size, ), vocab_bos_idx, dtype='int64')).long(),
                     requires_grad=False))
        ended = np.array([False] * batch_size)

        assert len(perm_indices) == batch_size
        outputs = [None] * batch_size
        for perm_index, src_index in enumerate(perm_indices):
            outputs[src_index] = {
                'instr_id': start_obs[perm_index]['instr_id'],
                'word_indices': [],
                'scores': [],
                #'actions': ' '.join(FOLLOWER_MODEL_ACTIONS[ac] for ac in path_actions[src_index]),
            }
        assert all(outputs)

        # for i in range(batch_size):
        #     assert outputs[i]['instr_id'] != '1008_0', "found example at index {}".format(i)

        # Do a sequence rollout and calculate the loss
        loss = 0
        sequence_scores = try_cuda(torch.zeros(batch_size))
        for t in range(self.instruction_len):
            h_t, c_t, alpha, logit = self.decoder(w_t.view(-1, 1), h_t, c_t,
                                                  ctx, path_mask)
            # Supervised training

            # BOS are not part of the encoded sequences
            target = instr_seq[:, t].contiguous()

            # Determine next model inputs
            if feedback == 'teacher':
                w_t = target
            elif feedback == 'argmax':
                _, w_t = logit.max(1)  # student forcing - argmax
                w_t = w_t.detach()
            elif feedback == 'sample':
                probs = F.softmax(logit, dim=1)  # sampling an action from model
                m = D.Categorical(probs)
                w_t = m.sample()
                #w_t = probs.multinomial(1).detach().squeeze(-1)
            else:
                sys.exit('Invalid feedback option')

            log_probs = F.log_softmax(logit, dim=1)
            word_scores = -F.nll_loss(
                log_probs, w_t, ignore_index=vocab_pad_idx, reduction='none')
            sequence_scores += word_scores.data
            loss += F.nll_loss(log_probs,
                               target,
                               ignore_index=vocab_pad_idx,
                               reduction='mean')

            for perm_index, src_index in enumerate(perm_indices):
                word_idx = w_t[perm_index].item()
                if not ended[perm_index]:
                    outputs[src_index]['word_indices'].append(int(word_idx))
                    outputs[src_index]['score'] = float(
                        sequence_scores[perm_index])
                    outputs[src_index]['scores'].append(
                        word_scores[perm_index].data.tolist())
                if word_idx == vocab_eos_idx:
                    ended[perm_index] = True

            # print("t: %s\tstate: %s\taction: %s\tscore: %s" % (t, world_states[0], a_t.data[0], sequence_scores[0]))

            # Early exit if all ended
            if ended.all():
                break

        for item in outputs:
            item['words'] = self.env.tokenizer.decode_sentence(
                item['word_indices'], break_on_eos=True, join=False)

        return outputs, loss