Example No. 1
 def init_state(self, batch_size):
     ''' Initialize to zero cell states and hidden states.'''
     h0 = Variable(torch.zeros(batch_size, self.hidden_size),
                   requires_grad=False)
     c0 = Variable(torch.zeros(batch_size, self.hidden_size),
                   requires_grad=False)
     return try_cuda(h0), try_cuda(c0)
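
Every example on this page wraps tensors and modules in try_cuda, which is not shown in the snippets. A minimal sketch of the usual pattern (behavior inferred from how it is called here, not taken from the repository):

import torch

def try_cuda(x):
    # Move a tensor or nn.Module to the GPU when one is available,
    # otherwise return it unchanged (assumed behavior).
    return x.cuda() if torch.cuda.is_available() else x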
Example No. 2
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        batch_size=BATCH_SIZE):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size//2 if args.bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=args.bidirectional, glove=glove))
    decoder = try_cuda(AttnDecoderLSTM(
        action_embedding_size, hidden_size, dropout_ratio,
        feature_size=feature_size))
    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok),
                eval.Evaluation([split]))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
Example No. 3
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
Example No. 4
    def _batch_observations_and_actions(self, path_obs, path_actions,
                                        encoded_instructions):
        seq_lengths = np.array([len(a) for a in path_actions])
        max_path_length = seq_lengths.max()

        # DO NOT permute the sequence, since here we are doing manual LSTM unrolling in encoder
        # perm_indices = np.argsort(-seq_lengths)
        perm_indices = np.arange(len(path_obs))
        #path_obs, path_actions, encoded_instructions, seq_lengths = zip(*sorted(zip(path_obs, path_actions, encoded_instructions, seq_lengths), key=lambda p: p[-1], reverse=True))
        # path_obs = [path_obs[i] for i in perm_indices]
        # path_actions = [path_actions[i] for i in perm_indices]
        # if encoded_instructions:
        #     encoded_instructions = [encoded_instructions[i] for i in perm_indices]
        # seq_lengths = [seq_lengths[i] for i in perm_indices]

        batch_size = len(path_obs)
        assert batch_size == len(path_actions)

        mask = np.ones((batch_size, max_path_length), np.uint8)
        action_embedding_dim = path_obs[0][0]['action_embedding'].shape[-1]
        batched_action_embeddings = [
            np.zeros((batch_size, action_embedding_dim), np.float32)
            for _ in range(max_path_length)
        ]
        feature_list = path_obs[0][0]['feature']
        assert len(feature_list) == 1
        image_feature_shape = feature_list[0].shape
        batched_image_features = [
            np.zeros((batch_size, ) + image_feature_shape, np.float32)
            for _ in range(max_path_length)
        ]
        for i, (obs, actions) in enumerate(zip(path_obs, path_actions)):
            # don't include the last state, which should result after the stop action
            assert len(obs) == len(actions) + 1
            obs = obs[:-1]
            mask[i, :len(actions)] = 0
            for t, (ob, a) in enumerate(zip(obs, actions)):
                assert a >= 0
                batched_image_features[t][i] = ob['feature'][0]
                batched_action_embeddings[t][i] = ob['action_embedding'][a]
        batched_action_embeddings = [
            try_cuda(Variable(torch.from_numpy(act), requires_grad=False))
            for act in batched_action_embeddings
        ]
        batched_image_features = [
            try_cuda(Variable(torch.from_numpy(feat), requires_grad=False))
            for feat in batched_image_features
        ]
        mask = try_cuda(torch.from_numpy(mask))

        start_obs = [obs[0] for obs in path_obs]

        return start_obs, \
               batched_image_features, \
               batched_action_embeddings, \
               mask, \
               list(seq_lengths), \
               encoded_instructions, \
               list(perm_indices)
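
The method above lays trajectories out time-major: one (batch, dim) array per timestep plus a uint8 mask marking padded steps. A self-contained sketch of that padding scheme on dummy data (shapes invented for illustration):

import numpy as np
import torch

action_dim = 4
paths = [np.random.rand(3, action_dim), np.random.rand(5, action_dim)]  # two trajectories
batch_size, max_len = len(paths), max(len(p) for p in paths)

mask = np.ones((batch_size, max_len), np.uint8)  # 1 marks padding
steps = [np.zeros((batch_size, action_dim), np.float32) for _ in range(max_len)]
for i, p in enumerate(paths):
    mask[i, :len(p)] = 0  # real timesteps
    for t, a in enumerate(p):
        steps[t][i] = a  # time-major: steps[t] holds timestep t for every trajectory

steps = [torch.from_numpy(s) for s in steps]
mask = torch.from_numpy(mask)
print(len(steps), steps[0].shape, mask)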
Example No. 5
 def init_state(self, batch_size):
     ''' Initialize to zero cell states and hidden states.'''
     h0 = torch.zeros(batch_size,
                      self.hidden_size * self.num_layers *
                      self.num_directions,
                      requires_grad=False)
     c0 = torch.zeros(batch_size,
                      self.hidden_size * self.num_layers *
                      self.num_directions,
                      requires_grad=False)
     return try_cuda(h0), try_cuda(c0)
Example No. 6
def make_speaker(args):
    enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    vocab = read_vocab(TRAIN_VOCAB)
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))
    agent = Seq2SeqSpeaker(
        None, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)
    return agent
Example No. 7
def make_scorer(args):
    bidirectional = args.bidirectional
    enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    feature_size = FEATURE_SIZE
    traj_encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=args.bidirectional))
    scorer_module = try_cuda(DotScorer(enc_hidden_size, enc_hidden_size))
    scorer = Scorer(scorer_module, traj_encoder)
    if args.load_scorer != '':
        scorer.load(args.load_scorer)
        print(colorize('load scorer traj ' + args.load_scorer))
    elif args.load_traj_encoder != '':
        scorer.load_traj_encoder(args.load_traj_encoder)
        print(colorize('load traj encoder ' + args.load_traj_encoder))
    return scorer
Example No. 8
 def _feature_variable(self, obs, beamed=False):
   ''' Extract precomputed features into variable. '''
   features = [ob['feature'] for ob in (flatten(obs) if beamed else obs)]
   # currently only support one image featurizer (without attention)
   assert all(len(f) == 1 for f in features)
   features = np.stack(features)
   return try_cuda(Variable(torch.from_numpy(features), requires_grad=False))
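
flatten is not defined in the snippet; with beamed=True the observations arrive as a list of beams, so one level of nesting has to be removed first. A sketch under that assumption:

import itertools

def flatten(list_of_lists):
    # Concatenate one level of nesting: [[a, b], [c]] -> [a, b, c] (assumed helper).
    return list(itertools.chain.from_iterable(list_of_lists))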
Example No. 9
def make_image_attention_layers(args, image_features_list, hidden_size):
    image_attention_size = args.image_attention_size or hidden_size
    attention_mechs = []
    for featurizer in image_features_list:
        if isinstance(featurizer, ConvolutionalImageFeatures):
            if args.image_attention_type == 'feedforward':
                attention_mechs.append(
                    FeedforwardImageAttention(
                        hidden_size,
                        image_attention_size,
                        image_feature_size=featurizer.feature_dim))
            elif args.image_attention_type == 'multiplicative':
                attention_mechs.append(
                    MultiplicativeImageAttention(
                        hidden_size,
                        image_attention_size,
                        image_feature_size=featurizer.feature_dim))
        elif isinstance(featurizer, BottomUpImageFeatures):
            attention_mechs.append(
                BottomUpImageAttention(
                    hidden_size, args.bottom_up_detection_embedding_size,
                    args.bottom_up_detection_embedding_size,
                    image_attention_size, featurizer.num_objects,
                    featurizer.num_attributes, featurizer.feature_dim))
        else:
            attention_mechs.append(None)
    attention_mechs = [
        try_cuda(mech) if mech else mech for mech in attention_mechs
    ]
    return attention_mechs
Example No. 10
 def prepare_proposals(self, batch_h, batch_c, batch_obs, batch_acs):
     ''' for each action proposal, prepare its h,c
     input: existing traj h,c; observation; actions
     output: proposed (h,c) * [batch_size, max_proposal_size]
     '''
     batch_size, ac_size, _ = batch_acs.size()
     hidden_size = self.encoder.hidden_size
     proposal_h = try_cuda(torch.zeros(batch_size, ac_size, hidden_size))
     proposal_c = try_cuda(torch.zeros(batch_size, ac_size, hidden_size))
     for i in range(batch_size):
         h = batch_h[i].expand(ac_size, -1)
         c = batch_c[i].expand(ac_size, -1)
         obs = batch_obs[i].expand(ac_size, -1, -1)
         proposal_h[i], proposal_c[i] = self.encoder._forward_one_step(
             h, c, batch_acs[i], obs)
     return proposal_h.detach(), proposal_c.detach()
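
expand repeats a hidden-state row across the proposal dimension without copying memory, and the final detach keeps the proposals out of the backward graph. A small demonstration of both behaviors:

import torch

h = torch.randn(1, 8, requires_grad=True)
tiled = h.expand(6, -1)  # a view: six rows share h's storage, nothing is copied
print(tiled.shape, tiled.data_ptr() == h.data_ptr())

kept = (tiled * 2).sum()
frozen = (tiled * 2).detach()  # detached result carries no gradient history
print(kept.requires_grad, frozen.requires_grad)  # True False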
Example No. 11
 def __init__(self,
              embedding_size,
              hidden_size,
              dropout_ratio,
              feature_size=2048 + 128,
              image_attention_layers=None,
              visual_hidden_size=1024,
              num_head=8):
     super(CogroundDecoderLSTM, self).__init__()
     self.embedding_size = embedding_size
     self.feature_size = feature_size
     self.hidden_size = hidden_size
     self.u_begin = try_cuda(
         Variable(torch.zeros(embedding_size), requires_grad=False))
     self.drop = nn.Dropout(p=dropout_ratio)
     # For now the text attention output size is hidden_size
     self.lstm = nn.LSTMCell(2 * embedding_size + hidden_size, hidden_size)
     self.text_attention_layer = WhSoftDotAttention(hidden_size,
                                                    hidden_size)
     self.positional_encoding = PositionalEncoding(hidden_size, dropout=0)
     self.visual_attention_layer = WhSoftDotAttention(
         hidden_size, visual_hidden_size)
     self.visual_mlp = nn.Sequential(
         nn.BatchNorm1d(feature_size),
         nn.Linear(feature_size, visual_hidden_size),
         nn.BatchNorm1d(visual_hidden_size), nn.Dropout(dropout_ratio),
         nn.ReLU())
     self.action_attention_layer = WhSoftDotAttention(
         hidden_size + hidden_size, visual_hidden_size)
     #self.action_attention_layer = VisualSoftDotAttention(hidden_size+hidden_size, visual_hidden_size)
     self.sm = nn.Softmax(dim=1)
Example No. 12
 def combine_logit(self, scorer_logit, follower_logit):
     #import pdb;pdb.set_trace()
     if self.gamma == 0.0:
         return scorer_logit
     if self.gamma == 1.0:
         return follower_logit
     g, h = self.gamma, 1 - self.gamma
     prob = h * self.sm(scorer_logit) + g * self.sm(follower_logit)
     return try_cuda(torch.log(prob))
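
combine_logit interpolates the two models in probability space and then takes a log so the result can be used like a logit downstream. A self-contained numeric check of that mixture (gamma weights the follower, per the comment in Example No. 23):

import torch
import torch.nn as nn

sm = nn.Softmax(dim=1)
gamma = 0.3
scorer_logit, follower_logit = torch.randn(2, 5), torch.randn(2, 5)

prob = (1 - gamma) * sm(scorer_logit) + gamma * sm(follower_logit)
mixed = torch.log(prob)
print(torch.exp(mixed).sum(dim=1))  # each row still sums to 1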
Example No. 13
def main(args):
    if args.job == 'test':
        args.use_test_set = True
        args.use_pretraining = False

    # Train a goal button
    #if args.job == 'train' and args.scorer is False:
    #    print(colorize('we need a scorer'))
    #    args.scorer = True

    if args.use_pretraining:
        agent, train_env, val_envs, pretrain_env = setup_agent_envs(args)
    else:
        agent, train_env, val_envs = setup_agent_envs(args)

    agent.search = True
    agent.search_logit = args.logit
    agent.search_mean = args.mean
    agent.search_early_stop = args.early_stop
    agent.episode_len = args.max_episode_len
    agent.gamma = args.gamma
    agent.revisit = args.revisit

    if args.load_reranker != '':
        agent.reranker = try_cuda(SimpleCandReranker(28))
        agent.reranker.load_state_dict(torch.load(args.load_reranker))
    agent.inject_stop = args.inject_stop
    agent.K = args.K


    # Load speaker
    if args.load_speaker != '':
        speaker = make_speaker(args)
        speaker.load(args.load_speaker)
        agent.speaker = speaker

    if args.job == 'search':
        agent.episode_len = args.max_episode_len
        agent.gamma = args.gamma
        print('gamma', args.gamma, 'ep_len', args.ep_len)
        run_search(args, agent, train_env, val_envs)
    elif args.job == 'sweep':
        for gamma in [float(g)/100 for g in range(0,101,5)]:
            for ep_len in [40]:
                agent.episode_len = ep_len
                agent.gamma = gamma
                print('gamma', gamma, 'ep_len', ep_len)
        #eval_gamma(args, agent, train_env, val_envs)
    elif args.job == 'cache':
        cache(args, agent, train_env, val_envs)
    elif args.job == 'train':
        train_val(args, agent, train_env, val_envs)
    elif args.job == 'test':
        test(args, agent, val_envs)
    else:
        print("no job specified")
Example No. 14
 def forward(self, h_t_minus_1, c_t, v_ground, text_attn):
     h_input = torch.cat((h_t_minus_1, v_ground), 1)
     h_t_pm = torch.sigmoid(self.linear_h(h_input)) * torch.tanh(c_t)
     batch_size, seq_len = text_attn.size()
     if seq_len < self.text_len:
         pads = try_cuda(torch.zeros(batch_size, self.text_len - seq_len))
         pm_input = torch.cat((text_attn, pads, h_t_pm), 1)
     else:
         pm_input = torch.cat((text_attn, h_t_pm), 1)
     pm_output = torch.tanh(self.linear_pm(pm_input).squeeze(-1))
     return pm_output
Example No. 15
def make_speaker(args, action_embedding_size=-1, feature_size=-1):
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    wordvec = np.load(args.wordvec_path)

    vocab = read_vocab(TRAIN_VOCAB, args.language)
    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size,
                           feature_size,
                           enc_hidden_size,
                           dropout_ratio,
                           bidirectional=bidirectional))
    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab),
                           word_embedding_size,
                           hidden_size,
                           dropout_ratio,
                           wordvec=wordvec,
                           wordvec_finetune=args.wordvec_finetune))
    agent = Seq2SeqSpeaker(None, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)
    return agent
Example No. 16
def make_speaker(args, action_embedding_size=-1, feature_size=-1):
    enc_hidden_size = args.hidden_size // 2 if args.bidirectional else args.hidden_size
    wordvec = np.load(args.wordvec_path)

    vocab = read_vocab(TRAIN_VOCAB, args.language)
    word_embedding_size = get_word_embedding_size(args)
    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size,
                           feature_size,
                           enc_hidden_size,
                           args.dropout_ratio,
                           bidirectional=args.bidirectional))
    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab),
                           word_embedding_size,
                           args.hidden_size,
                           args.dropout_ratio,
                           wordvec=wordvec,
                           wordvec_finetune=args.wordvec_finetune))
    agent = Seq2SeqSpeaker(None, "", encoder, decoder, args.max_input_length)
    return agent
Example No. 17
def make_follower(args, vocab):
    enc_hidden_size = hidden_size//2 if args.bidirectional else hidden_size
    glove = np.load(glove_path) if args.use_glove else None
    feature_size = FEATURE_SIZE
    Encoder = TransformerEncoder if args.transformer else EncoderLSTM
    Decoder = CogroundDecoderLSTM if args.coground else AttnDecoderLSTM
    word_embedding_size = 256 if args.coground else 300
    encoder = try_cuda(Encoder(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=args.bidirectional, glove=glove))
    decoder = try_cuda(Decoder(
        action_embedding_size, hidden_size, dropout_ratio,
        feature_size=feature_size, num_head=args.num_head))
    prog_monitor = try_cuda(ProgressMonitor(action_embedding_size,
                            hidden_size)) if args.prog_monitor else None
    bt_button = try_cuda(BacktrackButton()) if args.bt_button else None
    dev_monitor = try_cuda(DeviationMonitor(action_embedding_size,
                            hidden_size)) if args.dev_monitor else None

    agent = Seq2SeqAgent(
        None, "", encoder, decoder, max_episode_len,
        max_instruction_length=MAX_INPUT_LENGTH,
        attn_only_verb=args.attn_only_verb)
    agent.prog_monitor = prog_monitor
    agent.dev_monitor = dev_monitor
    agent.bt_button = bt_button
    agent.soft_align = args.soft_align

    if args.scorer:
        agent.scorer = make_scorer(args)

    if args.load_follower != '':
        scorer_exists = os.path.isfile(args.load_follower + '_scorer_enc')
        agent.load(args.load_follower, load_scorer=(args.load_scorer == '' and scorer_exists))
        print(colorize('load follower '+ args.load_follower))

    return agent
Example No. 18
 def __init__(self, embedding_size, hidden_size, dropout_ratio,
              feature_size=2048+128, image_attention_layers=None):
     super(AttnDecoderLSTM, self).__init__()
     self.embedding_size = embedding_size
     self.feature_size = feature_size
     self.hidden_size = hidden_size
     # self.embedding = nn.Embedding(input_action_size, embedding_size)
     self.u_begin = try_cuda(Variable(
         torch.zeros(embedding_size), requires_grad=False))
     self.drop = nn.Dropout(p=dropout_ratio)
     self.lstm = nn.LSTMCell(embedding_size+feature_size, hidden_size)
     self.visual_attention_layer = VisualSoftDotAttention(
         hidden_size, feature_size)
     self.text_attention_layer = SoftDotAttention(hidden_size)
     self.decoder2action = EltwiseProdScoring(hidden_size, embedding_size)
Example No. 19
def batch_instructions_from_encoded(encoded_instructions,
                                    max_length,
                                    reverse=False,
                                    sort=False):
    # encoded_instructions: list of lists of token indices (should not be padded, or contain BOS or EOS tokens)
    #seq_tensor = np.array(encoded_instructions)
    # make sure pad does not start any sentence
    num_instructions = len(encoded_instructions)
    seq_tensor = np.full((num_instructions, max_length), vocab_pad_idx)
    seq_lengths = []
    for i, inst in enumerate(encoded_instructions):

        #if len(inst) > 0:
        #    assert inst[-1] != vocab_eos_idx
        if reverse:
            inst = inst[::-1]
        inst = np.concatenate((inst.cpu(), [vocab_eos_idx]))
        inst = inst[:max_length]
        seq_tensor[i, :len(inst)] = inst
        seq_lengths.append(len(inst))

    seq_tensor = torch.from_numpy(seq_tensor)
    if sort:
        seq_lengths, perm_idx = torch.from_numpy(np.array(seq_lengths)).sort(
            0, True)
        seq_lengths = list(seq_lengths)
        seq_tensor = seq_tensor[perm_idx]

    mask = (seq_tensor == vocab_pad_idx)[:, :max(seq_lengths)]

    ret_tp = try_cuda(Variable(seq_tensor, requires_grad=False).long()), \
             try_cuda(mask.bool()), \
             seq_lengths
    if sort:
        ret_tp = ret_tp + (list(perm_idx), )
    return ret_tp
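
A stripped-down version of the same pad-to-max-length, append-EOS, build-mask logic, run on dummy token lists (pad and EOS indices are placeholders for this sketch):

import numpy as np
import torch

pad_idx, eos_idx, max_length = 0, 2, 6
encoded = [[5, 7, 9], [4, 8, 8, 6, 3]]  # unpadded token-id lists

seq = np.full((len(encoded), max_length), pad_idx)
lengths = []
for i, inst in enumerate(encoded):
    inst = (inst + [eos_idx])[:max_length]  # append EOS, then truncate
    seq[i, :len(inst)] = inst
    lengths.append(len(inst))

seq = torch.from_numpy(seq)
mask = (seq == pad_idx)[:, :max(lengths)]  # True where padded
print(seq, mask, lengths)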
Example No. 20
    def generate(self, path_obs, path_actions, encoded_instructions=[0]):

        start_obs, batched_image_features, batched_action_embeddings, path_mask, \
        path_lengths, encoded_instructions, perm_indices = \
        self._batch_observations_and_actions(
            path_obs, path_actions, encoded_instructions)

        batch_size = 1
        ctx, h_t, c_t = self.encoder(batched_action_embeddings,
                                     batched_image_features)
        w_t = try_cuda(
            Variable(torch.from_numpy(
                np.full((batch_size, ), vocab_bos_idx, dtype='int64')).long(),
                     requires_grad=False))

        ended = np.array([False] * batch_size)
        word_indices = []
        #print(w_t.size(),h_t.size(),c_t.size(),ctx.size(),path_mask.size())
        for t in range(self.instruction_len):
            h_t, c_t, alpha, logit = self.decoder(w_t, h_t, c_t, ctx,
                                                  path_mask)

            #_,w_t = logit.max(1)        # student forcing - argmax
            #w_t = w_t.detach()
            probs = F.softmax(logit, dim=1)  # sampling an action from model
            m = D.Categorical(probs)
            w_t = m.sample()

            word_idx = w_t[0].item()
            word_indices.append(word_idx)
            if word_idx == vocab_eos_idx:
                ended[0] = True
            if ended.all():
                break

        decoded_words = self.tokenizer.decode_sentence(word_indices,
                                                       break_on_eos=True,
                                                       join=False)

        return decoded_words
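
The loop above samples each word from the softmax distribution instead of taking the argmax. A minimal demonstration of that sampling step:

import torch
import torch.nn.functional as F
import torch.distributions as D

logit = torch.randn(1, 10)  # one example, 10-word vocabulary
probs = F.softmax(logit, dim=1)
w_t = D.Categorical(probs).sample()  # stochastic choice, shape (1,)
w_argmax = logit.max(1)[1]  # the deterministic (student-forcing) alternative
print(w_t.item(), w_argmax.item())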
Example No. 21
 def batch_features(self, feature_list):
     features = np.stack(feature_list)
     return try_cuda(Variable(torch.from_numpy(features), requires_grad=False))
Example No. 22
from vocab import SUBTRAIN_VOCAB, TRAIN_VOCAB, TRAINVAL_VOCAB

MAX_INPUT_LENGTH = 80
feature_size = 2048+128
max_episode_len = 10
word_embedding_size = 300
glove_path = 'tasks/R2R/data/train_glove.npy'
action_embedding_size = 2048+128
hidden_size = 512
dropout_ratio = 0.5
vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab)
glove = np.load(glove_path)

encoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, hidden_size, vocab_pad_idx,
        dropout_ratio, glove=glove))
decoder = try_cuda(AttnDecoderLSTM(
    action_embedding_size, hidden_size, dropout_ratio,
    feature_size=feature_size))

agent = Seq2SeqAgent(
        None, "", encoder, decoder, max_episode_len,
        max_instruction_length=MAX_INPUT_LENGTH)
            
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent.load('tasks/R2R/snapshots/release/follower_final_release', map_location=device)
Example No. 23
 def __init__(self, scorer, encoder):
     self.scorer = scorer
     self.text_encoder = encoder
     self.traj_encoder = None
     self.sm = try_cuda(nn.Softmax(dim=1))
     self.gamma = 0.0  # how much follower_logit to consider
Example No. 24
 def init_state(self, batch_size):
     h0 = Variable(torch.zeros(batch_size, self.hidden_size), requires_grad=False)
     c0 = Variable(torch.zeros(batch_size, self.hidden_size), requires_grad=False)
     return try_cuda(h0), try_cuda(c0)
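
Variable has been a no-op wrapper since PyTorch 0.4, so the same initializer can be written directly against the tensor API. A sketch assuming the same self.hidden_size attribute and the try_cuda helper:

import torch

def init_state(self, batch_size):
    '''Initialize hidden and cell states to zero, without the Variable wrapper.'''
    h0 = torch.zeros(batch_size, self.hidden_size, requires_grad=False)
    c0 = torch.zeros(batch_size, self.hidden_size, requires_grad=False)
    return try_cuda(h0), try_cuda(c0)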
Example No. 25
  def _score_obs_actions_and_instructions(self, path_obs, path_actions, encoded_instructions, feedback):
    assert len(path_obs) == len(path_actions)
    assert len(path_obs) == len(encoded_instructions)
    start_obs, batched_image_features, batched_action_embeddings, path_mask, \
        path_lengths, encoded_instructions, perm_indices = \
        batch_observations_and_actions(
            path_obs, path_actions, encoded_instructions)

    instr_seq, _, _ = batch_instructions_from_encoded(
        encoded_instructions, self.instruction_len)

    batch_size = len(start_obs)

    ctx, h_t, c_t = self.encoder(
        batched_action_embeddings, batched_image_features)

    w_t = try_cuda(Variable(torch.from_numpy(np.full((batch_size,), vocab_bos_idx, dtype='int64')).long(),
                            requires_grad=False))
    ended = np.array([False] * batch_size)

    assert len(perm_indices) == batch_size
    outputs = [None] * batch_size
    for perm_index, src_index in enumerate(perm_indices):
      outputs[src_index] = {
          'instr_id': start_obs[perm_index]['instr_id'],
          'word_indices': [],
          'scores': [],
          # 'actions': ' '.join(FOLLOWER_MODEL_ACTIONS[ac] for ac in path_actions[src_index]),
      }
    assert all(outputs)

    # for i in range(batch_size):
    #     assert outputs[i]['instr_id'] != '1008_0', 'found example at index {}'.format(i)

    # Do a sequence rollout and calculate the loss
    loss = 0
    sequence_scores = try_cuda(torch.zeros(batch_size))
    for t in range(self.instruction_len):
      h_t, c_t, alpha, logit = self.decoder(
          w_t.view(-1, 1), h_t, c_t, ctx, path_mask)
      # Supervised training

      # BOS are not part of the encoded sequences
      target = instr_seq[:, t].contiguous()

      # Determine next model inputs
      if feedback == 'teacher':
        w_t = target
      elif feedback == 'argmax':
        _, w_t = logit.max(1)        # student forcing - argmax
        w_t = w_t.detach()
      elif feedback == 'sample':
        probs = F.softmax(logit, dim=1)    # sampling an action from model
        m = D.Categorical(probs)
        w_t = m.sample()
        #w_t = probs.multinomial(1).detach().squeeze(-1)
      else:
        sys.exit('Invalid feedback option')

      log_probs = F.log_softmax(logit, dim=1)
      word_scores = -F.nll_loss(log_probs, w_t,
                                ignore_index=vocab_pad_idx, reduction='none')
      sequence_scores += word_scores.data
      loss += F.nll_loss(log_probs, target,
                         ignore_index=vocab_pad_idx, reduction='mean')

      for perm_index, src_index in enumerate(perm_indices):
        word_idx = w_t[perm_index].item()
        if not ended[perm_index]:
          outputs[src_index]['word_indices'].append(int(word_idx))
          outputs[src_index]['score'] = float(sequence_scores[perm_index])
          outputs[src_index]['scores'].append(
              word_scores[perm_index].data.tolist())
        if word_idx == vocab_eos_idx:
          ended[perm_index] = True

      # print('t: %s\tstate: %s\taction: %s\tscore: %s' % (t, world_states[0], a_t.data[0], sequence_scores[0]))

      # Early exit if all ended
      if ended.all():
        break

    for item in outputs:
      item['words'] = self.env.tokenizer.decode_sentence(
          item['word_indices'], break_on_eos=True, join=False)
      # pdb.set_trace()
    return outputs, loss
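
word_scores uses -F.nll_loss(..., reduction='none') as a per-token log-probability lookup, which is the same as gathering the chosen indices out of log_probs. A quick check of that identity:

import torch
import torch.nn.functional as F

log_probs = F.log_softmax(torch.randn(3, 7), dim=1)
w_t = torch.tensor([2, 5, 0])

scores_a = -F.nll_loss(log_probs, w_t, reduction='none')
scores_b = log_probs.gather(1, w_t.view(-1, 1)).squeeze(1)
print(torch.allclose(scores_a, scores_b))  # True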
Example No. 26
    def _score_obs_actions_and_instructions(self,
                                            path_obs,
                                            path_actions,
                                            encoded_instructions,
                                            feedback,
                                            lamda=0.95):
        assert len(path_obs) == len(path_actions)
        assert len(path_obs) == len(encoded_instructions)
        start_obs, batched_image_features, batched_action_embeddings, path_mask, \
            path_lengths, encoded_instructions, perm_indices = \
            self._batch_observations_and_actions(
                path_obs, path_actions, encoded_instructions)

        instr_seq, _, _ = batch_instructions_from_encoded(
            encoded_instructions, self.instruction_len)

        batch_size = len(start_obs)

        ctx, h_t, c_t = self.encoder(batched_action_embeddings,
                                     batched_image_features)

        w_t = try_cuda(
            Variable(torch.from_numpy(
                np.full((batch_size, ), vocab_bos_idx, dtype='int64')).long(),
                     requires_grad=False))
        ended = np.array([False] * batch_size)

        assert len(perm_indices) == batch_size
        outputs = [None] * batch_size
        for perm_index, src_index in enumerate(perm_indices):
            outputs[src_index] = {
                'instr_id': start_obs[perm_index]['instr_id'],
                'word_indices': [],
                'scores': [],
                #'actions': ' '.join(FOLLOWER_MODEL_ACTIONS[ac] for ac in path_actions[src_index]),
            }
        assert all(outputs)

        # for i in range(batch_size):
        #     assert outputs[i]['instr_id'] != '1008_0', "found example at index {}".format(i)

        # Do a sequence rollout and calculate the loss
        loss = 0
        sequence_scores = try_cuda(torch.zeros(batch_size))
        output_soft = []
        instr_pred = []

        for t in range(self.instruction_len):
            h_t, c_t, alpha, logit = self.decoder(w_t.view(-1, 1), h_t, c_t,
                                                  ctx, path_mask)
            # Supervised training

            # BOS are not part of the encoded sequences
            target = instr_seq[:, t].contiguous()
            probs = F.softmax(logit, dim=1)
            # Determine next model inputs
            if feedback == 'teacher':
                w_t = target
            elif feedback == 'argmax':
                _, w_t = logit.max(1)  # student forcing - argmax
                w_t = w_t.detach()
            elif feedback == 'sample':
                #probs = F.softmax(logit)    # sampling an action from model
                m = D.Categorical(probs)
                w_t = m.sample()
                #w_t = probs.multinomial(1).detach().squeeze(-1)
            else:
                sys.exit('Invalid feedback option')

            log_probs = F.log_softmax(logit, dim=1)
            output_soft.append(probs.unsqueeze(0))
            instr_pred.append(w_t.unsqueeze(0))
            word_scores = -F.nll_loss(
                log_probs, w_t, ignore_index=vocab_pad_idx, reduction='none')
            sequence_scores += word_scores.data
            loss += F.nll_loss(log_probs,
                               target,
                               ignore_index=vocab_pad_idx,
                               reduction='mean')

            for perm_index, src_index in enumerate(perm_indices):
                word_idx = w_t[perm_index].item()
                if not ended[perm_index]:
                    outputs[src_index]['word_indices'].append(int(word_idx))
                    outputs[src_index]['score'] = float(
                        sequence_scores[perm_index])
                    outputs[src_index]['scores'].append(
                        word_scores[perm_index].data.tolist())
                if word_idx == vocab_eos_idx:
                    ended[perm_index] = True

            # print("t: %s\tstate: %s\taction: %s\tscore: %s" % (t, world_states[0], a_t.data[0], sequence_scores[0]))

            # Early exit if all ended
            if ended.all():
                break

        output_soft = torch.cat(output_soft, 0)
        output_soft = output_soft.transpose(0, 1)
        instr_pred = torch.cat(instr_pred, 0)
        instr_pred = instr_pred.transpose(0, 1).int().tolist()
        instr_seq = instr_seq.int().tolist()

        def unpad(ls):
            length = len(ls)
            output = [None] * length
            for i in range(len(ls)):
                try:
                    idx = ls[i].index(vocab_eos_idx) + 1
                except ValueError:
                    idx = len(ls[i])

                output[i] = ls[i][:idx]
            return output

        instr_pred = unpad(instr_pred)
        instr_seq = unpad(instr_seq)

        #print(instr_seq[0],instr_pred[0], BLEU([instr_seq[0]], instr_pred[0],weights=(1/3,1/3,1/3)))
        bleus = []
        lossRL = 0

        if self.stat != 'test':
            # =============================================================================
            #             #####################################################################################
            #             ##########################bleu reward###############################################
            #             #print(output_soft,output_soft.size(),instr_pred.size(),instr_seq.size())
            #             #print(sum(bleus)/len(bleus))
            #             for batch_idx in range(batch_size):
            #                 #print(batch_idx)
            #                 pred_i = instr_pred[batch_idx]
            #                 seq_i = instr_seq[batch_idx]
            #                 #print(seq_i, pred_i)
            #                 bleus.append(BLEU([seq_i],pred_i))
            #                 for i in range(len(pred_i)):
            #
            #                     G = 0
            #                     for j in range(len(pred_i)-i,len(pred_i)+1):
            #                         if j > 1:
            #                             G = G + BLEU([seq_i],pred_i[:j]) - BLEU([seq_i],pred_i[:j-1])
            #                         else:
            #                             G = G + BLEU([seq_i],pred_i[:j])
            #
            #                     lossRL += - G * torch.log(output_soft[batch_idx][len(pred_i)-i-1][pred_i[len(pred_i)-i-1]])
            #
            # =============================================================================
            #######################################################################################################
            ###########################Bertscore reward############################################################
            #vocab = read_vocab(TRAIN_VOCAB)
            #tok = Tokenizer(vocab=vocab)

            lossRL2 = 0

            def get_instr_list(ls):
                ls_ls = []
                for i in range(len(ls)):
                    ls_ls.append([
                        self.tok.decode_sentence(ls[:i + 1],
                                                 break_on_eos=True,
                                                 join=True)
                    ])

                return ls_ls

            def get_bscore(ls, ref):
                ls_ls = get_instr_list(ls)
                bscore_ls = []
                for cand in ls_ls:
                    _, _, F1 = self.scorer.score(cand, [ref])
                    bscore_ls.append(F1)
                return bscore_ls

            for batch_idx in range(batch_size):
                #print(batch_idx)
                pred_i = instr_pred[batch_idx]
                #pred_i = [tok.decode_sentence(pred_i,break_on_eos=True,join=True)]

                seq_i = instr_seq[batch_idx]
                seq_i = [
                    self.tok.decode_sentence(seq_i,
                                             break_on_eos=True,
                                             join=True)
                ]
                bscore_ls = get_bscore(pred_i, seq_i)

                bleus.append(bscore_ls[-1])

                for i in range(len(pred_i)):
                    G = 0
                    for j in range(len(pred_i) - i - 1, len(pred_i)):
                        if j > 0:
                            t = j - (len(pred_i) - i - 1)
                            G += (bscore_ls[j] - bscore_ls[j - 1]) * np.power(
                                lamda, t)
                        else:
                            G += bscore_ls[j]
                    lossRL2 += -G.cuda() * torch.log(
                        output_soft[batch_idx][len(pred_i) - i -
                                               1][pred_i[len(pred_i) - i - 1]])

            #####################distance as reward##################################
            #follower will be loaded in advance

            for batch_idx in range(batch_size):

                #print('{}/{}'.format(batch_idx,batch_size))
                pred_i = instr_pred[batch_idx]
                if pred_i[-1] == 2:
                    pred_i_full = pred_i[:-1][::-1] + [2]
                else:
                    pred_i_full = pred_i[::-1]
                #pred_i = torch.tensor(pred_i,device = torch.device('cuda'))
                location_end = path_obs[batch_idx][-1][
                    'viewpoint']  # end point of the traj
                location_start = path_obs[batch_idx][0][
                    'viewpoint']  # start point of the traj
                ob_1 = start_obs[batch_idx]
                scanId = ob_1['scan']
                viewpoint = ob_1['viewpoint']
                elevation = ob_1['elevation']
                heading = ob_1['heading']

                #print(dist_i)

                traj = self.agent.generate(
                    self.sim,
                    torch.tensor(pred_i_full, device=torch.device('cuda')),
                    scanId, viewpoint, heading, elevation)
                end_pose_pred = traj['trajectory'][-1][0]
                dist_i = self.env.distances[scanId][end_pose_pred][
                    location_end]  # distance towards goal
                length_i = self.env.distances[scanId][location_start][
                    location_end]  # total length of the traj

                bonus = 3 if dist_i < 3 else 0
                bleus.append(dist_i)

                for i in range(len(pred_i)):

                    if i == 0:
                        G = -(dist_i - self.env.distances[scanId][viewpoint]
                              [location_end]) / length_i + bonus
                    else:

                        pred_i_i = pred_i[:
                                          i]  # form the input sequence (reverse them)
                        if pred_i_i[-1] == 2:
                            pred_i_i = pred_i_i[:-1][::-1] + [2]
                        else:
                            pred_i_i = pred_i_i[::-1]
                        pred_i_i = torch.tensor(pred_i_i,
                                                device=torch.device('cuda'))

                        traj_j = self.agent.generate(self.sim, pred_i_i,
                                                     scanId, viewpoint,
                                                     heading, elevation)
                        end_pose_j = traj_j['trajectory'][-1][0]
                        G = -(dist_i - self.env.distances[scanId][end_pose_j]
                              [location_end]) / length_i + bonus

                    lossRL += -G * torch.log(
                        output_soft[batch_idx][i][pred_i[i]])

# =============================================================================
#             #######################################################################################################
#             ####################### compat score as reward ########################################################
#             def get_instr_list(ls):
#                 ls_ls=[]
#                 for i in range(len(ls)):
#                     ls_ls.append([ls[:i+1]])
#                 return ls_ls
#             def get_score_ls(path_obs, path_actions, pred_i):
#                 ls_ls = get_instr_list(pred_i)
#                 score_ls = []
#                 for cand in ls_ls:
#                      score = self.compat.predict([path_obs],[path_actions],cand)
#                      score_ls.append(score)
#                 return score_ls
#
#             for batch_idx in range(batch_size):
#                 path_action = path_actions[batch_idx]
#                 path_ob = path_obs[batch_idx]
#
#                 encoded_instructions_init, _ = self.tok.encode_sentence('')
#                 start_score = self.compat.predict([path_ob],[path_action],torch.tensor([encoded_instructions_init], device = 'cpu'))
#                 #print('{}/{}'.format(batch_idx,batch_size))
#                 pred_i = instr_pred[batch_idx]
#                 if pred_i[-1] == 2:
#                     pred_i = pred_i[:-1][::-1] + [2]
#                 else: pred_i.reverse()
#                 pred_i = torch.tensor(pred_i,device = torch.device('cuda'))
#
#                 score_ls = get_score_ls(path_ob, path_action, pred_i)
#                 bleus.append(score_ls[-1].detach())
#                 for i in range(len(pred_i)):
#                       G = 0
#                       for j in range(len(pred_i)-i-1,len(pred_i)):
#                           if j > 0:
#                               t = j - (len(pred_i)-i-1)
#                               G += (score_ls[j]-score_ls[j-1])*np.power(lamda,t)
#                           else:
#                               G += score_ls[j] - start_score
#
#                       lossRL += - G * torch.log(output_soft[batch_idx][i][pred_i[i]])
#
#
#
# =============================================================================
            npy = np.load('VLN_training.npy')
            bleu_avg = sum(bleus) / len(bleus)
            print(bleu_avg, pred_i)
            npy = np.append(npy, bleu_avg)
            #np.save('BLEU_training.npy',npy)
            with open('VLN_training.npy', 'wb') as f:
                np.save(f, npy)
        #print(lossRL, loss)

        loss = 1 * lossRL + 1 * lossRL2
        #loss = lossRL
        for item in outputs:
            item['words'] = self.env.tokenizer.decode_sentence(
                item['word_indices'], break_on_eos=True, join=False)

        return outputs, loss
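
The reward-shaping loops above turn a list of per-prefix scores into a discounted return for each generated position. A self-contained version of just that computation (prefix scores invented for the sketch, lamda as in the method signature):

import numpy as np

lamda = 0.95
prefix_scores = [0.10, 0.25, 0.30, 0.55]  # score of each successively longer prefix
n = len(prefix_scores)

returns = []
for i in range(n):
    G = 0.0
    for j in range(n - i - 1, n):  # walk forward from this position to the end
        if j > 0:
            t = j - (n - i - 1)
            G += (prefix_scores[j] - prefix_scores[j - 1]) * np.power(lamda, t)  # discounted marginal gain
        else:
            G += prefix_scores[j]
    returns.append(G)
print(returns)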
Example No. 27
weight_decay = 0.0005
#weight_decay = 0.0001
FEATURE_SIZE = 2048+128
n_iters = 5000
log_every = 100
save_every = 100

vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab)
glove = np.load(glove_path)

enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
feature_size = FEATURE_SIZE

visEncoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))    
lanEncoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=False, glove=glove))
dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))

agent = compatModel(None, "", visEncoder, lanEncoder, dotSim)
#agent.load('tasks/R2R/snapshots/release/speaker_final_release', map_location = 'cpu')
agent.load('tasks/R2R/compat/trained_1/compat_sample_imagenet_mean_pooled_train_iter_1000', map_location = 'cpu')
if __name__ == "__main__":
    traj = {'scan':'5q7pvUzZiYa', 'path':["7dc12a67ddfc4a4a849ce620db5b777b", "0e84cf4dec784bc28b78a80bee35c550", "a77784b955454209857d745976a1676d", "67971a17c26f4e2ca117b4fca73507fe", "8db06d3a0dd44508b3c078d60126ce19", "43ac37dfa1db4a13a8a9df4e454eb016", "4bd82c990a6548a994daa97c8f52db06", "6d11ca4d41e04bb1a725c2223c36b2aa", "29fb3c58b29348558d36a9f9440a1379", "c23f26401359426982d11ca494ee739b", "397403366d784caf804d741f32fd68b9", "3c6a35e15ada4b649990d6568cce8bd9", "55e4436f528c4bf09e4550079c572f7b", "69fad7dd177847dbabf69e8fb7c00ddf", "c629c7f1cf6f47a78c45a8ae9ff82247", "21fca0d6192940e580587fe317440f56", "4b85d61dd3a94e8a812affe78f3a322d", "3c025b8e3d2040969cd00dd0e9f29b09"][:2], 'heading':0.0,'elevation_init':0.0}
    encoded_instructions, _ = tok.encode_sentence('')
    encoded_instructions = torch.tensor([encoded_instructions], device = 'cpu')
    rdv_test = rdv(traj)
    
Example No. 28
def make_env_and_models(args,
                        train_vocab_path,
                        train_splits,
                        test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list,
                         batch_size=batch_size,
                         splits=train_splits,
                         tokenizer=tok)

    train_env.data.extend(hardNeg_train)  # extend train data and shuffle
    random.shuffle(train_env.data)

    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE

    # =============================================================================
    #     visEncoder = try_cuda(CompatVisEncoderLSTM(
    #         action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
    #         bidirectional=bidirectional))
    # =============================================================================
    visEncoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size,
                           feature_size,
                           enc_hidden_size,
                           dropout_ratio,
                           bidirectional=bidirectional))
    # =============================================================================
    #     lanEncoder = try_cuda(CompatLanEncoderLSTM(
    #         len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
    #         dropout_ratio, bidirectional=True, glove=glove))
    # =============================================================================
    lanEncoder = try_cuda(
        EncoderLSTM(len(vocab),
                    word_embedding_size,
                    enc_hidden_size,
                    vocab_pad_idx,
                    dropout_ratio,
                    bidirectional=False,
                    glove=glove))
    dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))
    #visEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/speaker_final_release_enc'))
    #lanEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/follower_final_release_enc'))

    test_envs = {
        split: (R2RBatch(image_features_list,
                         batch_size=batch_size,
                         splits=[split],
                         tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits
    }

    #test_envs['val_seen'][0].data.extend(hardNeg_val_seen)
    test_envs['val_unseen'][0].data.extend(hardNeg_val_unseen)
    test_envs['val_unseen'][0].data = test_envs['val_unseen'][0].data[
        3000:4000]
    return train_env, test_envs, visEncoder, lanEncoder, dotSim
Example No. 29
  def beam_search(self, beam_size, path_obs, path_actions):

    # TODO: here
    assert len(path_obs) == len(path_actions)

    start_obs, batched_image_features, batched_action_embeddings, path_mask, \
        path_lengths, _, perm_indices = \
        batch_observations_and_actions(path_obs, path_actions, None)
    batch_size = len(start_obs)
    assert len(perm_indices) == batch_size

    ctx, h_t, c_t = self.encoder(
        batched_action_embeddings, batched_image_features)

    completed = []
    for _ in range(batch_size):
      completed.append([])

    beams = [
        [InferenceState(prev_inference_state=None,
                        flat_index=i,
                        last_word=vocab_bos_idx,
                        word_count=0,
                        score=0.0,
                        last_alpha=None)]
        for i in range(batch_size)
    ]

    for t in range(self.instruction_len):
      flat_indices = []
      beam_indices = []
      w_t_list = []
      for beam_index, beam in enumerate(beams):
        for inf_state in beam:
          beam_indices.append(beam_index)
          flat_indices.append(inf_state.flat_index)
          w_t_list.append(inf_state.last_word)
      w_t = try_cuda(Variable(torch.LongTensor(w_t_list), requires_grad=False))
      if len(w_t.shape) == 1:
        w_t = w_t.unsqueeze(0)

      h_t, c_t, alpha, logit = self.decoder(
          w_t.view(-1, 1), h_t[flat_indices], c_t[flat_indices], ctx[beam_indices], path_mask[beam_indices])

      log_probs = F.log_softmax(logit, dim=1).data
      _, word_indices = logit.data.topk(min(beam_size, logit.size()[1]), dim=1)
      word_scores = log_probs.gather(1, word_indices)
      assert word_scores.size() == word_indices.size()

      start_index = 0
      new_beams = []
      all_successors = []
      for beam_index, beam in enumerate(beams):
        successors = []
        end_index = start_index + len(beam)
        if beam:
          for inf_index, (inf_state, word_score_row, word_index_row) in \
                  enumerate(zip(beam, word_scores[start_index:end_index], word_indices[start_index:end_index])):
            for word_score, word_index in zip(word_score_row, word_index_row):
              flat_index = start_index + inf_index
              successors.append(
                  InferenceState(
                      prev_inference_state=inf_state,
                      flat_index=flat_index,
                      last_word=word_index,
                      word_count=inf_state.word_count + 1,
                      score=inf_state.score + word_score,
                      last_alpha=alpha[flat_index].data)
              )
        start_index = end_index
        successors = sorted(successors, key=lambda t: t.score, reverse=True)[
            :beam_size]
        all_successors.append(successors)

      for beam_index, successors in enumerate(all_successors):
        new_beam = []
        for successor in successors:
          if successor.last_word == vocab_eos_idx or t == self.instruction_len - 1:
            completed[beam_index].append(successor)
          else:
            new_beam.append(successor)
        if len(completed[beam_index]) >= beam_size:
          new_beam = []
        new_beams.append(new_beam)

      beams = new_beams

      if not any(beam for beam in beams):
        break

    outputs = []
    for _ in range(batch_size):
      outputs.append([])

    for perm_index, src_index in enumerate(perm_indices):
      this_outputs = outputs[src_index]
      assert len(this_outputs) == 0

      this_completed = completed[perm_index]
      instr_id = start_obs[perm_index]['instr_id']
      for inf_state in sorted(this_completed, key=lambda t: t.score, reverse=True)[:beam_size]:
        word_indices, scores, attentions = backchain_inference_states(
            inf_state)
        this_outputs.append({
            'instr_id': instr_id,
            'word_indices': word_indices,
            'score': inf_state.score,
            'scores': scores,
            'words': self.env.tokenizer.decode_sentence(word_indices, break_on_eos=True, join=False),
            'attentions': attentions,
        })
    return outputs
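
backchain_inference_states is not shown in this snippet; given the InferenceState fields used above, it presumably follows the prev_inference_state links back to the start of the beam and reverses the collected path. A sketch under that assumption:

def backchain_inference_states(final_state):
    # Walk prev_inference_state pointers back to the root, then reverse
    # (assumed behavior; field names taken from the beam search above).
    word_indices, scores, attentions = [], [], []
    state = final_state
    while state.prev_inference_state is not None:
        word_indices.append(state.last_word)
        scores.append(state.score)
        attentions.append(state.last_alpha)
        state = state.prev_inference_state
    return word_indices[::-1], scores[::-1], attentions[::-1]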
Example No. 30
 def transform(lst, wrap_with_var=True):
     features = np.stack(lst)
     x = torch.from_numpy(features)
     if wrap_with_var:
         x = Variable(x, requires_grad=False)
     return try_cuda(x)