def init_state(self, batch_size):
    ''' Initialize to zero cell states and hidden states. '''
    h0 = Variable(torch.zeros(batch_size, self.hidden_size), requires_grad=False)
    c0 = Variable(torch.zeros(batch_size, self.hidden_size), requires_grad=False)
    return try_cuda(h0), try_cuda(c0)
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        batch_size=BATCH_SIZE):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size // 2 if args.bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=args.bidirectional, glove=glove))
    decoder = try_cuda(AttnDecoderLSTM(
        action_embedding_size, hidden_size, dropout_ratio,
        feature_size=feature_size))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok),
                eval.Evaluation([split]))
        for split in test_splits}
    return train_env, test_envs, encoder, decoder
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits}
    return train_env, test_envs, encoder, decoder
def _batch_observations_and_actions(self, path_obs, path_actions,
                                    encoded_instructions):
    seq_lengths = np.array([len(a) for a in path_actions])
    max_path_length = seq_lengths.max()

    # Do NOT permute the sequences: the encoder unrolls the LSTM manually,
    # so sorting by length is unnecessary here.
    perm_indices = np.arange(len(path_obs))

    batch_size = len(path_obs)
    assert batch_size == len(path_actions)

    mask = np.ones((batch_size, max_path_length), np.uint8)
    action_embedding_dim = path_obs[0][0]['action_embedding'].shape[-1]
    batched_action_embeddings = [
        np.zeros((batch_size, action_embedding_dim), np.float32)
        for _ in range(max_path_length)]
    feature_list = path_obs[0][0]['feature']
    assert len(feature_list) == 1
    image_feature_shape = feature_list[0].shape
    batched_image_features = [
        np.zeros((batch_size,) + image_feature_shape, np.float32)
        for _ in range(max_path_length)]
    for i, (obs, actions) in enumerate(zip(path_obs, path_actions)):
        # Drop the last observation, which is the state reached after the stop action.
        assert len(obs) == len(actions) + 1
        obs = obs[:-1]
        mask[i, :len(actions)] = 0
        for t, (ob, a) in enumerate(zip(obs, actions)):
            assert a >= 0
            batched_image_features[t][i] = ob['feature'][0]
            batched_action_embeddings[t][i] = ob['action_embedding'][a]
    batched_action_embeddings = [
        try_cuda(Variable(torch.from_numpy(act), requires_grad=False))
        for act in batched_action_embeddings]
    batched_image_features = [
        try_cuda(Variable(torch.from_numpy(feat), requires_grad=False))
        for feat in batched_image_features]
    mask = try_cuda(torch.from_numpy(mask))

    start_obs = [obs[0] for obs in path_obs]

    return start_obs, \
           batched_image_features, \
           batched_action_embeddings, \
           mask, \
           list(seq_lengths), \
           encoded_instructions, \
           list(perm_indices)
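# Hypothetical input sketch (not from the repo) showing the structure
# _batch_observations_and_actions expects: each path is a list of per-step
# observation dicts with one more observation than actions (the final obs
# follows the stop action), a single image feature per step, and an
# action-embedding matrix indexed by the chosen action. Shapes are assumed.
example_step = {
    'feature': [np.zeros((2048 + 128,), np.float32)],
    'action_embedding': np.zeros((6, 2048 + 128), np.float32),
}
example_path_obs = [[dict(example_step) for _ in range(4)]]  # 3 actions + final obs
example_path_actions = [[0, 1, 2]]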
def init_state(self, batch_size):
    ''' Initialize to zero cell states and hidden states. '''
    # The hidden state is kept flattened across layers and directions.
    h0 = torch.zeros(
        batch_size, self.hidden_size * self.num_layers * self.num_directions,
        requires_grad=False)
    c0 = torch.zeros(
        batch_size, self.hidden_size * self.num_layers * self.num_directions,
        requires_grad=False)
    return try_cuda(h0), try_cuda(c0)
def make_speaker(args):
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    vocab = read_vocab(TRAIN_VOCAB)
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))
    agent = Seq2SeqSpeaker(
        None, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)
    return agent
def make_scorer(args):
    bidirectional = args.bidirectional
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    feature_size = FEATURE_SIZE
    traj_encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=args.bidirectional))
    scorer_module = try_cuda(DotScorer(enc_hidden_size, enc_hidden_size))
    scorer = Scorer(scorer_module, traj_encoder)
    # Compare strings with != rather than "is not": identity comparison
    # against a string literal is unreliable and warns on Python 3.8+.
    if args.load_scorer != '':
        scorer.load(args.load_scorer)
        print(colorize('load scorer traj ' + args.load_scorer))
    elif args.load_traj_encoder != '':
        scorer.load_traj_encoder(args.load_traj_encoder)
        print(colorize('load traj encoder ' + args.load_traj_encoder))
    return scorer
def _feature_variable(self, obs, beamed=False):
    ''' Extract precomputed features into variable. '''
    features = [ob['feature'] for ob in (flatten(obs) if beamed else obs)]
    # Currently only one image featurizer (without attention) is supported.
    assert all(len(f) == 1 for f in features)
    features = np.stack(features)
    return try_cuda(Variable(torch.from_numpy(features), requires_grad=False))
def make_image_attention_layers(args, image_features_list, hidden_size):
    image_attention_size = args.image_attention_size or hidden_size
    attention_mechs = []
    for featurizer in image_features_list:
        if isinstance(featurizer, ConvolutionalImageFeatures):
            # Match each attention type to its corresponding module
            # (the two branches were previously swapped).
            if args.image_attention_type == 'feedforward':
                attention_mechs.append(
                    FeedforwardImageAttention(
                        hidden_size, image_attention_size,
                        image_feature_size=featurizer.feature_dim))
            elif args.image_attention_type == 'multiplicative':
                attention_mechs.append(
                    MultiplicativeImageAttention(
                        hidden_size, image_attention_size,
                        image_feature_size=featurizer.feature_dim))
        elif isinstance(featurizer, BottomUpImageFeatures):
            attention_mechs.append(
                BottomUpImageAttention(
                    hidden_size,
                    args.bottom_up_detection_embedding_size,
                    args.bottom_up_detection_embedding_size,
                    image_attention_size,
                    featurizer.num_objects,
                    featurizer.num_attributes,
                    featurizer.feature_dim))
        else:
            attention_mechs.append(None)
    attention_mechs = [
        try_cuda(mech) if mech else mech for mech in attention_mechs]
    return attention_mechs
def prepare_proposals(self, batch_h, batch_c, batch_obs, batch_acs):
    ''' For each action proposal, prepare its (h, c) input.
    input: existing trajectory h, c; observations; actions
    output: proposed (h, c) * [batch_size, max_proposal_size]
    '''
    batch_size, ac_size, _ = batch_acs.size()
    hidden_size = self.encoder.hidden_size
    proposal_h = try_cuda(torch.zeros(batch_size, ac_size, hidden_size))
    proposal_c = try_cuda(torch.zeros(batch_size, ac_size, hidden_size))
    for i in range(batch_size):
        h = batch_h[i].expand(ac_size, -1)
        c = batch_c[i].expand(ac_size, -1)
        obs = batch_obs[i].expand(ac_size, -1, -1)
        proposal_h[i], proposal_c[i] = self.encoder._forward_one_step(
            h, c, batch_acs[i], obs)
    return proposal_h.detach(), proposal_c.detach()
def __init__(self, embedding_size, hidden_size, dropout_ratio,
             feature_size=2048 + 128, image_attention_layers=None,
             visual_hidden_size=1024, num_head=8):
    super(CogroundDecoderLSTM, self).__init__()
    self.embedding_size = embedding_size
    self.feature_size = feature_size
    self.hidden_size = hidden_size
    self.u_begin = try_cuda(
        Variable(torch.zeros(embedding_size), requires_grad=False))
    self.drop = nn.Dropout(p=dropout_ratio)
    # For now the text attention output size is hidden_size
    self.lstm = nn.LSTMCell(2 * embedding_size + hidden_size, hidden_size)
    self.text_attention_layer = WhSoftDotAttention(hidden_size, hidden_size)
    self.positional_encoding = PositionalEncoding(hidden_size, dropout=0)
    self.visual_attention_layer = WhSoftDotAttention(
        hidden_size, visual_hidden_size)
    self.visual_mlp = nn.Sequential(
        nn.BatchNorm1d(feature_size),
        nn.Linear(feature_size, visual_hidden_size),
        nn.BatchNorm1d(visual_hidden_size),
        nn.Dropout(dropout_ratio),
        nn.ReLU())
    self.action_attention_layer = WhSoftDotAttention(
        hidden_size + hidden_size, visual_hidden_size)
    self.sm = nn.Softmax(dim=1)
def combine_logit(self, scorer_logit, follower_logit):
    if self.gamma == 0.0:
        return scorer_logit
    if self.gamma == 1.0:
        return follower_logit
    # gamma controls how much of the follower logit to mix in.
    g, h = self.gamma, 1 - self.gamma
    prob = h * self.sm(scorer_logit) + g * self.sm(follower_logit)
    return try_cuda(torch.log(prob))
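# Minimal standalone sketch (hypothetical tensors, not part of the class) of
# what combine_logit computes: the two distributions are interpolated linearly
# in probability space, then converted back to log space so downstream code can
# keep treating the result as a logit. The argmax of the result equals the
# argmax of the mixed probabilities.
import torch
import torch.nn as nn

sm = nn.Softmax(dim=1)
scorer_logit = torch.tensor([[2.0, 0.5, -1.0]])
follower_logit = torch.tensor([[0.1, 1.5, 0.3]])
gamma = 0.3  # weight on the follower distribution
mixed = (1 - gamma) * sm(scorer_logit) + gamma * sm(follower_logit)
log_mixed = torch.log(mixed)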
def main(args):
    if args.job == 'test':
        args.use_test_set = True
        args.use_pretraining = False

    # Train a goal button
    # if args.job == 'train' and args.scorer is False:
    #     print(colorize('we need a scorer'))
    #     args.scorer = True

    if args.use_pretraining:
        agent, train_env, val_envs, pretrain_env = setup_agent_envs(args)
    else:
        agent, train_env, val_envs = setup_agent_envs(args)

    agent.search = True
    agent.search_logit = args.logit
    agent.search_mean = args.mean
    agent.search_early_stop = args.early_stop
    agent.episode_len = args.max_episode_len
    agent.gamma = args.gamma
    agent.revisit = args.revisit

    if args.load_reranker != '':
        agent.reranker = try_cuda(SimpleCandReranker(28))
        agent.reranker.load_state_dict(torch.load(args.load_reranker))
    agent.inject_stop = args.inject_stop
    agent.K = args.K

    # Load speaker
    if args.load_speaker != '':
        speaker = make_speaker(args)
        speaker.load(args.load_speaker)
        agent.speaker = speaker

    if args.job == 'search':
        agent.episode_len = args.max_episode_len
        agent.gamma = args.gamma
        print('gamma', args.gamma, 'ep_len', args.max_episode_len)
        run_search(args, agent, train_env, val_envs)
    elif args.job == 'sweep':
        for gamma in [float(g) / 100 for g in range(0, 101, 5)]:
            for ep_len in [40]:
                agent.episode_len = ep_len
                agent.gamma = gamma
                print('gamma', gamma, 'ep_len', ep_len)
                # eval_gamma(args, agent, train_env, val_envs)
    elif args.job == 'cache':
        cache(args, agent, train_env, val_envs)
    elif args.job == 'train':
        train_val(args, agent, train_env, val_envs)
    elif args.job == 'test':
        test(args, agent, val_envs)
    else:
        print("no job specified")
def forward(self, h_t_minus_1, c_t, v_ground, text_attn):
    h_input = torch.cat((h_t_minus_1, v_ground), 1)
    h_t_pm = torch.sigmoid(self.linear_h(h_input)) * torch.tanh(c_t)
    batch_size, seq_len = text_attn.size()
    if seq_len < self.text_len:
        # Pad the text attention so the progress-monitor input has a fixed width.
        pads = try_cuda(torch.zeros(batch_size, self.text_len - seq_len))
        pm_input = torch.cat((text_attn, pads, h_t_pm), 1)
    else:
        pm_input = torch.cat((text_attn, h_t_pm), 1)
    pm_output = torch.tanh(self.linear_pm(pm_input).squeeze(-1))
    return pm_output
def make_speaker(args, action_embedding_size=-1, feature_size=-1):
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    wordvec = np.load(args.wordvec_path)
    vocab = read_vocab(TRAIN_VOCAB, args.language)
    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size, feature_size,
                           enc_hidden_size, dropout_ratio,
                           bidirectional=bidirectional))
    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab), word_embedding_size, hidden_size,
                           dropout_ratio, wordvec=wordvec,
                           wordvec_finetune=args.wordvec_finetune))
    agent = Seq2SeqSpeaker(None, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)
    return agent
def make_speaker(args, action_embedding_size=-1, feature_size=-1):
    enc_hidden_size = args.hidden_size // 2 if args.bidirectional else args.hidden_size
    wordvec = np.load(args.wordvec_path)
    vocab = read_vocab(TRAIN_VOCAB, args.language)
    word_embedding_size = get_word_embedding_size(args)
    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size, feature_size,
                           enc_hidden_size, args.dropout_ratio,
                           bidirectional=args.bidirectional))
    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab), word_embedding_size, args.hidden_size,
                           args.dropout_ratio, wordvec=wordvec,
                           wordvec_finetune=args.wordvec_finetune))
    agent = Seq2SeqSpeaker(None, "", encoder, decoder, args.max_input_length)
    return agent
def make_follower(args, vocab):
    enc_hidden_size = hidden_size // 2 if args.bidirectional else hidden_size
    glove = np.load(glove_path) if args.use_glove else None
    feature_size = FEATURE_SIZE
    Encoder = TransformerEncoder if args.transformer else EncoderLSTM
    Decoder = CogroundDecoderLSTM if args.coground else AttnDecoderLSTM
    word_embedding_size = 256 if args.coground else 300
    encoder = try_cuda(Encoder(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=args.bidirectional, glove=glove))
    decoder = try_cuda(Decoder(
        action_embedding_size, hidden_size, dropout_ratio,
        feature_size=feature_size, num_head=args.num_head))
    prog_monitor = try_cuda(ProgressMonitor(
        action_embedding_size, hidden_size)) if args.prog_monitor else None
    bt_button = try_cuda(BacktrackButton()) if args.bt_button else None
    dev_monitor = try_cuda(DeviationMonitor(
        action_embedding_size, hidden_size)) if args.dev_monitor else None

    agent = Seq2SeqAgent(
        None, "", encoder, decoder, max_episode_len,
        max_instruction_length=MAX_INPUT_LENGTH,
        attn_only_verb=args.attn_only_verb)
    agent.prog_monitor = prog_monitor
    agent.dev_monitor = dev_monitor
    agent.bt_button = bt_button
    agent.soft_align = args.soft_align

    if args.scorer:
        agent.scorer = make_scorer(args)

    # Compare strings with != / ==, not identity.
    if args.load_follower != '':
        scorer_exists = os.path.isfile(args.load_follower + '_scorer_enc')
        agent.load(args.load_follower,
                   load_scorer=(args.load_scorer == '' and scorer_exists))
        print(colorize('load follower ' + args.load_follower))

    return agent
def __init__(self, embedding_size, hidden_size, dropout_ratio,
             feature_size=2048 + 128, image_attention_layers=None):
    super(AttnDecoderLSTM, self).__init__()
    self.embedding_size = embedding_size
    self.feature_size = feature_size
    self.hidden_size = hidden_size
    # self.embedding = nn.Embedding(input_action_size, embedding_size)
    self.u_begin = try_cuda(Variable(
        torch.zeros(embedding_size), requires_grad=False))
    self.drop = nn.Dropout(p=dropout_ratio)
    self.lstm = nn.LSTMCell(embedding_size + feature_size, hidden_size)
    self.visual_attention_layer = VisualSoftDotAttention(
        hidden_size, feature_size)
    self.text_attention_layer = SoftDotAttention(hidden_size)
    self.decoder2action = EltwiseProdScoring(hidden_size, embedding_size)
def batch_instructions_from_encoded(encoded_instructions, max_length,
                                    reverse=False, sort=False):
    # encoded_instructions: list of token-index sequences (not padded, and
    # containing neither BOS nor EOS tokens)
    num_instructions = len(encoded_instructions)
    seq_tensor = np.full((num_instructions, max_length), vocab_pad_idx)
    seq_lengths = []
    for i, inst in enumerate(encoded_instructions):
        if reverse:
            inst = inst[::-1]
        # Append EOS, then truncate to max_length.
        inst = np.concatenate((inst.cpu(), [vocab_eos_idx]))
        inst = inst[:max_length]
        seq_tensor[i, :len(inst)] = inst
        seq_lengths.append(len(inst))

    seq_tensor = torch.from_numpy(seq_tensor)
    if sort:
        seq_lengths, perm_idx = torch.from_numpy(
            np.array(seq_lengths)).sort(0, True)
        seq_lengths = list(seq_lengths)
        seq_tensor = seq_tensor[perm_idx]

    mask = (seq_tensor == vocab_pad_idx)[:, :max(seq_lengths)]

    ret_tp = try_cuda(Variable(seq_tensor, requires_grad=False).long()), \
             try_cuda(mask.bool()), \
             seq_lengths
    if sort:
        ret_tp = ret_tp + (list(perm_idx),)
    return ret_tp
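# Minimal usage sketch (hypothetical token indices; relies on the module-level
# vocab_pad_idx / vocab_eos_idx and try_cuda used above). Each instruction is a
# tensor of word indices without BOS/EOS; the helper pads to max_length,
# appends EOS, and returns the padded tensor, the pad mask, and the lengths.
encoded = [torch.tensor([4, 17, 9]), torch.tensor([12, 5])]
seq, mask, lengths = batch_instructions_from_encoded(encoded, max_length=10)
# seq: (2, 10) LongTensor; mask: True where padding, truncated to max length;
# lengths: [4, 3] (each original length plus the appended EOS)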
def generate(self, path_obs, path_actions, encoded_instructions=[0]):
    start_obs, batched_image_features, batched_action_embeddings, path_mask, \
        path_lengths, encoded_instructions, perm_indices = \
        self._batch_observations_and_actions(
            path_obs, path_actions, encoded_instructions)

    batch_size = 1
    ctx, h_t, c_t = self.encoder(batched_action_embeddings,
                                 batched_image_features)

    w_t = try_cuda(
        Variable(torch.from_numpy(
            np.full((batch_size,), vocab_bos_idx, dtype='int64')).long(),
            requires_grad=False))
    ended = np.array([False] * batch_size)

    word_indices = []
    for t in range(self.instruction_len):
        h_t, c_t, alpha, logit = self.decoder(w_t, h_t, c_t, ctx, path_mask)
        # Sample the next word from the model distribution
        # (student forcing with argmax is the alternative).
        probs = F.softmax(logit, dim=1)
        m = D.Categorical(probs)
        w_t = m.sample()

        word_idx = w_t[0].item()
        word_indices.append(word_idx)
        # Stop decoding once EOS has been emitted.
        if word_idx == vocab_eos_idx:
            ended[:] = True
        if ended.all():
            break

    decoded_words = self.tokenizer.decode_sentence(
        word_indices, break_on_eos=True, join=False)
    return decoded_words
def batch_features(self, feature_list):
    features = np.stack(feature_list)
    return try_cuda(Variable(torch.from_numpy(features), requires_grad=False))
from vocab import SUBTRAIN_VOCAB, TRAIN_VOCAB, TRAINVAL_VOCAB

MAX_INPUT_LENGTH = 80
feature_size = 2048 + 128
max_episode_len = 10
word_embedding_size = 300
glove_path = 'tasks/R2R/data/train_glove.npy'
action_embedding_size = 2048 + 128
hidden_size = 512
dropout_ratio = 0.5

vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab)
glove = np.load(glove_path)
encoder = try_cuda(EncoderLSTM(
    len(vocab), word_embedding_size, hidden_size, vocab_pad_idx,
    dropout_ratio, glove=glove))
decoder = try_cuda(AttnDecoderLSTM(
    action_embedding_size, hidden_size, dropout_ratio,
    feature_size=feature_size))
agent = Seq2SeqAgent(
    None, "", encoder, decoder, max_episode_len,
    max_instruction_length=MAX_INPUT_LENGTH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent.load('tasks/R2R/snapshots/release/follower_final_release',
           map_location=device)
def __init__(self, scorer, encoder):
    # Take the scorer and text encoder as constructor arguments instead of
    # relying on names from the enclosing module scope.
    self.scorer = scorer
    self.text_encoder = encoder
    self.traj_encoder = None
    self.sm = try_cuda(nn.Softmax(dim=1))
    self.gamma = 0.0  # how much follower_logit to consider
def init_state(self, batch_size):
    h0 = Variable(torch.zeros(batch_size, self.hidden_size), requires_grad=False)
    c0 = Variable(torch.zeros(batch_size, self.hidden_size), requires_grad=False)
    return try_cuda(h0), try_cuda(c0)
def _score_obs_actions_and_instructions(self, path_obs, path_actions,
                                        encoded_instructions, feedback):
    assert len(path_obs) == len(path_actions)
    assert len(path_obs) == len(encoded_instructions)
    start_obs, batched_image_features, batched_action_embeddings, path_mask, \
        path_lengths, encoded_instructions, perm_indices = \
        batch_observations_and_actions(
            path_obs, path_actions, encoded_instructions)

    instr_seq, _, _ = batch_instructions_from_encoded(
        encoded_instructions, self.instruction_len)

    batch_size = len(start_obs)

    ctx, h_t, c_t = self.encoder(
        batched_action_embeddings, batched_image_features)

    w_t = try_cuda(Variable(torch.from_numpy(
        np.full((batch_size,), vocab_bos_idx, dtype='int64')).long(),
        requires_grad=False))
    ended = np.array([False] * batch_size)

    assert len(perm_indices) == batch_size
    outputs = [None] * batch_size
    for perm_index, src_index in enumerate(perm_indices):
        outputs[src_index] = {
            'instr_id': start_obs[perm_index]['instr_id'],
            'word_indices': [],
            'scores': [],
        }
    assert all(outputs)

    # Do a sequence rollout and calculate the loss
    loss = 0
    sequence_scores = try_cuda(torch.zeros(batch_size))
    for t in range(self.instruction_len):
        h_t, c_t, alpha, logit = self.decoder(
            w_t.view(-1, 1), h_t, c_t, ctx, path_mask)
        # Supervised training; BOS is not part of the encoded sequences
        target = instr_seq[:, t].contiguous()

        # Determine next model inputs
        if feedback == 'teacher':
            w_t = target
        elif feedback == 'argmax':
            _, w_t = logit.max(1)        # student forcing - argmax
            w_t = w_t.detach()
        elif feedback == 'sample':
            probs = F.softmax(logit, dim=1)
            # sampling a word from the model distribution
            m = D.Categorical(probs)
            w_t = m.sample()
        else:
            sys.exit('Invalid feedback option')

        log_probs = F.log_softmax(logit, dim=1)
        word_scores = -F.nll_loss(log_probs, w_t, ignore_index=vocab_pad_idx,
                                  reduction='none')
        sequence_scores += word_scores.data
        loss += F.nll_loss(log_probs, target, ignore_index=vocab_pad_idx,
                           reduction='mean')

        for perm_index, src_index in enumerate(perm_indices):
            word_idx = w_t[perm_index].item()
            if not ended[perm_index]:
                outputs[src_index]['word_indices'].append(int(word_idx))
                outputs[src_index]['score'] = float(sequence_scores[perm_index])
                outputs[src_index]['scores'].append(
                    word_scores[perm_index].data.tolist())
            if word_idx == vocab_eos_idx:
                ended[perm_index] = True

        # Early exit if all ended
        if ended.all():
            break

    for item in outputs:
        item['words'] = self.env.tokenizer.decode_sentence(
            item['word_indices'], break_on_eos=True, join=False)

    return outputs, loss
def _score_obs_actions_and_instructions(self, path_obs, path_actions,
                                        encoded_instructions, feedback,
                                        lamda=0.95):
    assert len(path_obs) == len(path_actions)
    assert len(path_obs) == len(encoded_instructions)
    start_obs, batched_image_features, batched_action_embeddings, path_mask, \
        path_lengths, encoded_instructions, perm_indices = \
        self._batch_observations_and_actions(
            path_obs, path_actions, encoded_instructions)

    instr_seq, _, _ = batch_instructions_from_encoded(
        encoded_instructions, self.instruction_len)

    batch_size = len(start_obs)

    ctx, h_t, c_t = self.encoder(batched_action_embeddings,
                                 batched_image_features)

    w_t = try_cuda(
        Variable(torch.from_numpy(
            np.full((batch_size,), vocab_bos_idx, dtype='int64')).long(),
            requires_grad=False))
    ended = np.array([False] * batch_size)

    assert len(perm_indices) == batch_size
    outputs = [None] * batch_size
    for perm_index, src_index in enumerate(perm_indices):
        outputs[src_index] = {
            'instr_id': start_obs[perm_index]['instr_id'],
            'word_indices': [],
            'scores': [],
        }
    assert all(outputs)

    # Do a sequence rollout and calculate the loss
    loss = 0
    sequence_scores = try_cuda(torch.zeros(batch_size))
    output_soft = []
    instr_pred = []
    for t in range(self.instruction_len):
        h_t, c_t, alpha, logit = self.decoder(w_t.view(-1, 1), h_t, c_t, ctx,
                                              path_mask)
        # Supervised training; BOS is not part of the encoded sequences
        target = instr_seq[:, t].contiguous()
        probs = F.softmax(logit, dim=1)

        # Determine next model inputs
        if feedback == 'teacher':
            w_t = target
        elif feedback == 'argmax':
            _, w_t = logit.max(1)        # student forcing - argmax
            w_t = w_t.detach()
        elif feedback == 'sample':
            # sampling a word from the model distribution
            m = D.Categorical(probs)
            w_t = m.sample()
        else:
            sys.exit('Invalid feedback option')

        log_probs = F.log_softmax(logit, dim=1)
        output_soft.append(probs.unsqueeze(0))
        instr_pred.append(w_t.unsqueeze(0))
        word_scores = -F.nll_loss(
            log_probs, w_t, ignore_index=vocab_pad_idx, reduction='none')
        sequence_scores += word_scores.data
        loss += F.nll_loss(log_probs, target, ignore_index=vocab_pad_idx,
                           reduction='mean')

        for perm_index, src_index in enumerate(perm_indices):
            word_idx = w_t[perm_index].item()
            if not ended[perm_index]:
                outputs[src_index]['word_indices'].append(int(word_idx))
                outputs[src_index]['score'] = float(
                    sequence_scores[perm_index])
                outputs[src_index]['scores'].append(
                    word_scores[perm_index].data.tolist())
            if word_idx == vocab_eos_idx:
                ended[perm_index] = True

        # Early exit if all ended
        if ended.all():
            break

    output_soft = torch.cat(output_soft, 0).transpose(0, 1)
    instr_pred = torch.cat(instr_pred, 0).transpose(0, 1).int().tolist()
    instr_seq = instr_seq.int().tolist()

    def unpad(ls):
        # Truncate each sequence just after its EOS token (if any).
        output = [None] * len(ls)
        for i in range(len(ls)):
            try:
                idx = ls[i].index(vocab_eos_idx) + 1
            except ValueError:
                idx = len(ls[i])
            output[i] = ls[i][:idx]
        return output

    instr_pred = unpad(instr_pred)
    instr_seq = unpad(instr_seq)

    bleus = []
    lossRL = 0
    if self.stat != 'test':
        #################################################################
        # BERTScore reward
        #################################################################
        lossRL2 = 0

        def get_instr_list(ls):
            ls_ls = []
            for i in range(len(ls)):
                ls_ls.append([
                    self.tok.decode_sentence(ls[:i + 1], break_on_eos=True,
                                             join=True)])
            return ls_ls

        def get_bscore(ls, ref):
            ls_ls = get_instr_list(ls)
            bscore_ls = []
            for cand in ls_ls:
                _, _, F1 = self.scorer.score(cand, [ref])
                bscore_ls.append(F1)
            return bscore_ls

        for batch_idx in range(batch_size):
            pred_i = instr_pred[batch_idx]
            seq_i = instr_seq[batch_idx]
            seq_i = [self.tok.decode_sentence(seq_i, break_on_eos=True,
                                              join=True)]
            bscore_ls = get_bscore(pred_i, seq_i)
            bleus.append(bscore_ls[-1])
            for i in range(len(pred_i)):
                G = 0
                for j in range(len(pred_i) - i - 1, len(pred_i)):
                    if j > 0:
                        t = j - (len(pred_i) - i - 1)
                        G += (bscore_ls[j] - bscore_ls[j - 1]) * np.power(
                            lamda, t)
                    else:
                        G += bscore_ls[j]
                lossRL2 += -G.cuda() * torch.log(
                    output_soft[batch_idx][len(pred_i) - i - 1]
                               [pred_i[len(pred_i) - i - 1]])

        #################################################################
        # Distance-to-goal reward (the follower is loaded in advance)
        #################################################################
        for batch_idx in range(batch_size):
            pred_i = instr_pred[batch_idx]
            if pred_i[-1] == 2:
                pred_i_full = pred_i[:-1][::-1] + [2]
            else:
                pred_i_full = pred_i[::-1]
            location_end = path_obs[batch_idx][-1]['viewpoint']    # end of the trajectory
            location_start = path_obs[batch_idx][0]['viewpoint']   # start of the trajectory
            ob_1 = start_obs[batch_idx]
            scanId = ob_1['scan']
            viewpoint = ob_1['viewpoint']
            elevation = ob_1['elevation']
            heading = ob_1['heading']
            traj = self.agent.generate(
                self.sim,
                torch.tensor(pred_i_full, device=torch.device('cuda')),
                scanId, viewpoint, heading, elevation)
            end_pose_pred = traj['trajectory'][-1][0]
            # distance from the follower's end pose to the goal
            dist_i = self.env.distances[scanId][end_pose_pred][location_end]
            # total length of the trajectory
            length_i = self.env.distances[scanId][location_start][location_end]
            bonus = 3 if dist_i < 3 else 0
            bleus.append(dist_i)
            for i in range(len(pred_i)):
                if i == 0:
                    G = -(dist_i - self.env.distances[scanId][viewpoint]
                          [location_end]) / length_i + bonus
                else:
                    # form the input sequence (reverse it)
                    pred_i_i = pred_i[:i]
                    if pred_i_i[-1] == 2:
                        pred_i_i = pred_i_i[:-1][::-1] + [2]
                    else:
                        pred_i_i = pred_i_i[::-1]
                    pred_i_i = torch.tensor(pred_i_i,
                                            device=torch.device('cuda'))
                    traj_j = self.agent.generate(self.sim, pred_i_i, scanId,
                                                 viewpoint, heading, elevation)
                    end_pose_j = traj_j['trajectory'][-1][0]
                    G = -(dist_i - self.env.distances[scanId][end_pose_j]
                          [location_end]) / length_i + bonus
                lossRL += -G * torch.log(output_soft[batch_idx][i][pred_i[i]])

        npy = np.load('VLN_training.npy')
        bleu_avg = sum(bleus) / len(bleus)
        print(bleu_avg, pred_i)
        npy = np.append(npy, bleu_avg)
        with open('VLN_training.npy', 'wb') as f:
            np.save(f, npy)

        loss = 1 * lossRL + 1 * lossRL2

    for item in outputs:
        item['words'] = self.env.tokenizer.decode_sentence(
            item['word_indices'], break_on_eos=True, join=False)

    return outputs, loss
weight_decay = 0.0005
FEATURE_SIZE = 2048 + 128
n_iters = 5000
log_every = 100
save_every = 100

vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab)
glove = np.load(glove_path)
enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
feature_size = FEATURE_SIZE
visEncoder = try_cuda(SpeakerEncoderLSTM(
    action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
    bidirectional=bidirectional))
lanEncoder = try_cuda(EncoderLSTM(
    len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
    dropout_ratio, bidirectional=False, glove=glove))
dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size))
agent = compatModel(None, "", visEncoder, lanEncoder, dotSim)
agent.load('tasks/R2R/compat/trained_1/compat_sample_imagenet_mean_pooled_train_iter_1000',
           map_location='cpu')

if __name__ == "__main__":
    traj = {
        'scan': '5q7pvUzZiYa',
        'path': ["7dc12a67ddfc4a4a849ce620db5b777b",
                 "0e84cf4dec784bc28b78a80bee35c550",
                 "a77784b955454209857d745976a1676d",
                 "67971a17c26f4e2ca117b4fca73507fe",
                 "8db06d3a0dd44508b3c078d60126ce19",
                 "43ac37dfa1db4a13a8a9df4e454eb016",
                 "4bd82c990a6548a994daa97c8f52db06",
                 "6d11ca4d41e04bb1a725c2223c36b2aa",
                 "29fb3c58b29348558d36a9f9440a1379",
                 "c23f26401359426982d11ca494ee739b",
                 "397403366d784caf804d741f32fd68b9",
                 "3c6a35e15ada4b649990d6568cce8bd9",
                 "55e4436f528c4bf09e4550079c572f7b",
                 "69fad7dd177847dbabf69e8fb7c00ddf",
                 "c629c7f1cf6f47a78c45a8ae9ff82247",
                 "21fca0d6192940e580587fe317440f56",
                 "4b85d61dd3a94e8a812affe78f3a322d",
                 "3c025b8e3d2040969cd00dd0e9f29b09"][:2],
        'heading': 0.0,
        'elevation_init': 0.0}
    encoded_instructions, _ = tok.encode_sentence('')
    encoded_instructions = torch.tensor([encoded_instructions], device='cpu')
    rdv_test = rdv(traj)
def make_env_and_models(args, train_vocab_path, train_splits, test_splits, test_instruction_limit=None): setup() image_features_list = ImageFeatures.from_args(args) vocab = read_vocab(train_vocab_path) tok = Tokenizer(vocab=vocab) train_env = R2RBatch(image_features_list, batch_size=batch_size, splits=train_splits, tokenizer=tok) train_env.data.extend(hardNeg_train) # extend train data and shuffle random.shuffle(train_env.data) enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size glove = np.load(glove_path) feature_size = FEATURE_SIZE # ============================================================================= # visEncoder = try_cuda(CompatVisEncoderLSTM( # action_embedding_size, feature_size, enc_hidden_size, dropout_ratio, # bidirectional=bidirectional)) # ============================================================================= visEncoder = try_cuda( SpeakerEncoderLSTM(action_embedding_size, feature_size, enc_hidden_size, dropout_ratio, bidirectional=bidirectional)) # ============================================================================= # lanEncoder = try_cuda(CompatLanEncoderLSTM( # len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx, # dropout_ratio, bidirectional=True, glove=glove)) # ============================================================================= lanEncoder = try_cuda( EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx, dropout_ratio, bidirectional=False, glove=glove)) dotSim = try_cuda(dotSimilarity(batch_size, enc_hidden_size)) #visEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/speaker_final_release_enc')) #lanEncoder.load_state_dict(torch.load('tasks/R2R/snapshots/release/follower_final_release_enc')) test_envs = { split: (R2RBatch(image_features_list, batch_size=batch_size, splits=[split], tokenizer=tok, instruction_limit=test_instruction_limit), eval_speaker.SpeakerEvaluation( [split], instructions_per_path=test_instruction_limit)) for split in test_splits } #test_envs['val_seen'][0].data.extend(hardNeg_val_seen) test_envs['val_unseen'][0].data.extend(hardNeg_val_unseen) test_envs['val_unseen'][0].data = test_envs['val_unseen'][0].data[ 3000:4000] return train_env, test_envs, visEncoder, lanEncoder, dotSim
def beam_search(self, beam_size, path_obs, path_actions):
    assert len(path_obs) == len(path_actions)

    start_obs, batched_image_features, batched_action_embeddings, path_mask, \
        path_lengths, _, perm_indices = \
        batch_observations_and_actions(path_obs, path_actions, None)
    batch_size = len(start_obs)
    assert len(perm_indices) == batch_size

    ctx, h_t, c_t = self.encoder(
        batched_action_embeddings, batched_image_features)

    completed = []
    for _ in range(batch_size):
        completed.append([])

    beams = [
        [InferenceState(prev_inference_state=None,
                        flat_index=i,
                        last_word=vocab_bos_idx,
                        word_count=0,
                        score=0.0,
                        last_alpha=None)]
        for i in range(batch_size)]

    for t in range(self.instruction_len):
        flat_indices = []
        beam_indices = []
        w_t_list = []
        for beam_index, beam in enumerate(beams):
            for inf_state in beam:
                beam_indices.append(beam_index)
                flat_indices.append(inf_state.flat_index)
                w_t_list.append(inf_state.last_word)
        w_t = try_cuda(Variable(torch.LongTensor(w_t_list),
                                requires_grad=False))
        if len(w_t.shape) == 1:
            w_t = w_t.unsqueeze(0)

        h_t, c_t, alpha, logit = self.decoder(
            w_t.view(-1, 1), h_t[flat_indices], c_t[flat_indices],
            ctx[beam_indices], path_mask[beam_indices])

        log_probs = F.log_softmax(logit, dim=1).data
        _, word_indices = logit.data.topk(min(beam_size, logit.size()[1]),
                                          dim=1)
        word_scores = log_probs.gather(1, word_indices)
        assert word_scores.size() == word_indices.size()

        start_index = 0
        new_beams = []
        all_successors = []
        for beam_index, beam in enumerate(beams):
            successors = []
            end_index = start_index + len(beam)
            if beam:
                for inf_index, (inf_state, word_score_row, word_index_row) in \
                        enumerate(zip(beam,
                                      word_scores[start_index:end_index],
                                      word_indices[start_index:end_index])):
                    for word_score, word_index in zip(word_score_row,
                                                      word_index_row):
                        flat_index = start_index + inf_index
                        successors.append(
                            InferenceState(
                                prev_inference_state=inf_state,
                                flat_index=flat_index,
                                last_word=word_index,
                                word_count=inf_state.word_count + 1,
                                score=inf_state.score + word_score,
                                last_alpha=alpha[flat_index].data))
            start_index = end_index
            successors = sorted(successors, key=lambda t: t.score,
                                reverse=True)[:beam_size]
            all_successors.append(successors)

        for beam_index, successors in enumerate(all_successors):
            new_beam = []
            for successor in successors:
                if successor.last_word == vocab_eos_idx or \
                        t == self.instruction_len - 1:
                    completed[beam_index].append(successor)
                else:
                    new_beam.append(successor)
            if len(completed[beam_index]) >= beam_size:
                new_beam = []
            new_beams.append(new_beam)

        beams = new_beams

        if not any(beam for beam in beams):
            break

    outputs = []
    for _ in range(batch_size):
        outputs.append([])

    for perm_index, src_index in enumerate(perm_indices):
        this_outputs = outputs[src_index]
        assert len(this_outputs) == 0
        this_completed = completed[perm_index]
        instr_id = start_obs[perm_index]['instr_id']
        for inf_state in sorted(this_completed, key=lambda t: t.score,
                                reverse=True)[:beam_size]:
            word_indices, scores, attentions = backchain_inference_states(
                inf_state)
            this_outputs.append({
                'instr_id': instr_id,
                'word_indices': word_indices,
                'score': inf_state.score,
                'scores': scores,
                'words': self.env.tokenizer.decode_sentence(
                    word_indices, break_on_eos=True, join=False),
                'attentions': attentions,
            })
    return outputs
def transform(lst, wrap_with_var=True):
    features = np.stack(lst)
    x = torch.from_numpy(features)
    if wrap_with_var:
        x = Variable(x, requires_grad=False)
    return try_cuda(x)
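# Minimal usage sketch for transform (hypothetical inputs and shapes): stack a
# list of per-step feature arrays into one tensor, optionally wrapped in a
# Variable, and move it to the GPU if one is available.
import numpy as np

feats = [np.random.rand(36, 2048 + 128).astype(np.float32) for _ in range(3)]
batched = transform(feats)                    # shape (3, 36, 2176)
raw = transform(feats, wrap_with_var=False)   # plain tensor, no Variable wrapper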