def make_scorer(args):
    bidirectional = args.bidirectional
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    if args.useObjLabelOrVis == 'none':
        feature_size, action_embedding_size = 2048 + 128, 2048 + 128
    elif args.useObjLabelOrVis == 'vis':
        feature_size, action_embedding_size = \
            2048 + 128 + args.objVisFeatDim, 2048 + 128 + args.objVisFeatDim
    elif args.useObjLabelOrVis == 'label':
        feature_size, action_embedding_size = \
            2048 + 128 + args.objLanFeatDim, 2048 + 128 + args.objLanFeatDim
    elif args.useObjLabelOrVis == 'both':
        feature_size = 2048 + 128 + args.objVisFeatDim + args.objLanFeatDim
        action_embedding_size = 2048 + args.objVisFeatDim + args.objLanFeatDim + 128
    traj_encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size, feature_size, enc_hidden_size,
                           dropout_ratio, bidirectional=args.bidirectional))
    scorer_module = try_cuda(DotScorer(enc_hidden_size, enc_hidden_size))
    scorer = Scorer(scorer_module, traj_encoder)
    if args.load_scorer != '':
        scorer.load(args.load_scorer)
        print(colorize('load scorer traj ' + args.load_scorer))
    elif args.load_traj_encoder != '':
        scorer.load_traj_encoder(args.load_traj_encoder)
        print(colorize('load traj encoder ' + args.load_traj_encoder))
    return scorer
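
# Hedged sketch of the dimension bookkeeping in make_scorer / make_follower: the
# constant 2048 + 128 is the precomputed image feature plus what appears to be a
# 128-d orientation encoding (an assumption); the object feature dims passed in
# here are made-up example values, not project defaults.
def _demo_feature_dims(objVisFeatDim=2048, objLanFeatDim=300):
    base = 2048 + 128
    return {
        'none': base,
        'vis': base + objVisFeatDim,
        'label': base + objLanFeatDim,
        'both': base + objVisFeatDim + objLanFeatDim,
    }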
def batch_observations_and_actions(path_obs, path_actions,
                                   encoded_instructions):
    seq_lengths = np.array([len(a) for a in path_actions])
    max_path_length = seq_lengths.max()

    # DO NOT permute the sequence, since here we are doing manual LSTM
    # unrolling in the encoder.
    # perm_indices = np.argsort(-seq_lengths)
    perm_indices = np.arange(len(path_obs))
    # path_obs, path_actions, encoded_instructions, seq_lengths = zip(*sorted(
    #     zip(path_obs, path_actions, encoded_instructions, seq_lengths),
    #     key=lambda p: p[-1], reverse=True))
    # path_obs = [path_obs[i] for i in perm_indices]
    # path_actions = [path_actions[i] for i in perm_indices]
    # if encoded_instructions:
    #     encoded_instructions = [encoded_instructions[i] for i in perm_indices]
    # seq_lengths = [seq_lengths[i] for i in perm_indices]

    batch_size = len(path_obs)
    assert batch_size == len(path_actions)

    mask = np.ones((batch_size, max_path_length), np.uint8)
    action_embedding_dim = path_obs[0][0]['action_embedding'].shape[-1]
    batched_action_embeddings = [
        np.zeros((batch_size, action_embedding_dim), np.float32)
        for _ in range(max_path_length)
    ]
    feature_list = path_obs[0][0]['feature']
    assert len(feature_list) == 1
    image_feature_shape = feature_list[0].shape
    batched_image_features = [
        np.zeros((batch_size, ) + image_feature_shape, np.float32)
        for _ in range(max_path_length)
    ]
    for i, (obs, actions) in enumerate(zip(path_obs, path_actions)):
        # don't include the last state, which should result after the stop action
        if len(obs) == len(actions) + 1:
            obs = obs[:-1]
        assert len(obs) == len(actions)
        mask[i, :len(actions)] = 0
        for t, (ob, a) in enumerate(zip(obs, actions)):
            batched_image_features[t][i] = ob['feature'][0]
            batched_action_embeddings[t][i] = ob['action_embedding'][a]
    batched_action_embeddings = [
        try_cuda(Variable(torch.from_numpy(act), requires_grad=False))
        for act in batched_action_embeddings
    ]
    batched_image_features = [
        try_cuda(Variable(torch.from_numpy(feat), requires_grad=False))
        for feat in batched_image_features
    ]
    mask = try_cuda(torch.from_numpy(mask))

    start_obs = [obs[0] for obs in path_obs]

    return start_obs, \
           batched_image_features, \
           batched_action_embeddings, \
           mask, \
           list(seq_lengths), \
           encoded_instructions, \
           list(perm_indices)
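
# Illustrative sketch (hypothetical lengths) of the padding layout built above:
# the batched features and action embeddings are time-major (one
# [batch_size, dim] array per timestep), and mask == 1 marks padded steps,
# matching the "mask[i, :len(actions)] = 0" convention. Uses the module's
# existing numpy import.
def _demo_mask_layout():
    seq_lengths = np.array([3, 1])
    max_path_length = seq_lengths.max()
    mask = np.ones((len(seq_lengths), max_path_length), np.uint8)
    for i, n in enumerate(seq_lengths):
        mask[i, :n] = 0
    return mask  # array([[0, 0, 0], [0, 1, 1]], dtype=uint8)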
def prepare_proposals(self, batch_h, batch_c, batch_obs, batch_acs):
    '''
    For each action proposal, prepare its (h, c).
    input: existing traj (h, c); observation; actions
    output: proposed (h, c) * [batch_size, max_proposal_size]
    '''
    batch_size, ac_size, _ = batch_acs.size()
    hidden_size = self.encoder.hidden_size
    proposal_h = try_cuda(torch.zeros(batch_size, ac_size, hidden_size))
    proposal_c = try_cuda(torch.zeros(batch_size, ac_size, hidden_size))
    for i in range(batch_size):
        h = batch_h[i].expand(ac_size, -1)
        c = batch_c[i].expand(ac_size, -1)
        obs = batch_obs[i].expand(ac_size, -1, -1)
        proposal_h[i], proposal_c[i] = self.encoder._forward_one_step(
            h, c, batch_acs[i], obs)
    return proposal_h.detach(), proposal_c.detach()
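
# Minimal sketch (made-up sizes) of the expand() pattern in prepare_proposals:
# a single trajectory state is broadcast across ac_size candidate actions
# without copying memory. Uses the module's existing torch import.
def _demo_proposal_expand(ac_size=4, hidden_size=8):
    h = torch.zeros(hidden_size)            # one trajectory's hidden state
    h_per_proposal = h.expand(ac_size, -1)  # view of shape [ac_size, hidden_size]
    return h_per_proposal.shape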
def combine_logit(self, scorer_logit, follower_logit):
    if self.gamma == 0.0:
        return scorer_logit
    if self.gamma == 1.0:
        return follower_logit
    g, h = self.gamma, 1 - self.gamma
    prob = h * self.sm(scorer_logit) + g * self.sm(follower_logit)
    return try_cuda(torch.log(prob))
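
# Illustrative sketch (made-up logits, hypothetical gamma) of the mixture that
# combine_logit computes: the two logit vectors are turned into probabilities,
# mixed with weights (1 - gamma) and gamma, and returned in log space.
def _demo_combine_logit(gamma=0.25):
    sm = nn.Softmax(dim=1)
    scorer_logit = torch.tensor([[2.0, 0.5, -1.0]])
    follower_logit = torch.tensor([[0.1, 1.5, 0.3]])
    prob = (1 - gamma) * sm(scorer_logit) + gamma * sm(follower_logit)
    return torch.log(prob)  # shape [1, 3], log of the mixed distribution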
def _feature_variable(self, obs, beamed=False):
    ''' Extract precomputed features into variable. '''
    features = [ob['feature'] for ob in (flatten(obs) if beamed else obs)]
    # currently only support one image featurizer (without attention)
    assert all(len(f) == 1 for f in features)
    features = np.stack(features)
    return try_cuda(
        Variable(torch.from_numpy(features), requires_grad=False))
def make_speaker(args):
    # NOTE: unlike make_follower, this reads hidden_size, bidirectional,
    # action_embedding_size, word_embedding_size, glove_path and dropout_ratio
    # from module-level constants; args is unused here.
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    vocab = read_vocab(TRAIN_VOCAB)
    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size, feature_size,
                           enc_hidden_size, dropout_ratio,
                           bidirectional=bidirectional))
    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab), word_embedding_size, hidden_size,
                           dropout_ratio, glove=glove))
    agent = Seq2SeqSpeaker(None, "", encoder, decoder, MAX_INSTRUCTION_LENGTH)
    return agent
def make_follower(args, vocab):
    enc_hidden_size = hidden_size // 2 if args.bidirectional else hidden_size
    glove_path = osp.join(file_path, 'data', 'train_glove.npy')  # not used
    glove = np.load(glove_path) if args.use_glove else None
    if args.useObjLabelOrVis == 'none':
        feature_size, action_embedding_size = 2048 + 128, 2048 + 128
    elif args.useObjLabelOrVis == 'vis':
        feature_size, action_embedding_size = \
            2048 + 128 + args.objVisFeatDim, 2048 + 128 + args.objVisFeatDim
    elif args.useObjLabelOrVis == 'label':
        feature_size, action_embedding_size = \
            2048 + 128 + args.objLanFeatDim, 2048 + 128 + args.objLanFeatDim
    elif args.useObjLabelOrVis == 'both':
        feature_size = 2048 + 128 + args.objVisFeatDim + args.objLanFeatDim
        action_embedding_size = 2048 + args.objVisFeatDim + args.objLanFeatDim + 128

    Encoder = TransformerEncoder if args.transformer else EncoderLSTM
    Decoder = CogroundDecoderLSTM if args.coground else AttnDecoderLSTM
    word_embedding_size = 256 if args.coground else 300
    encoder = try_cuda(
        Encoder(len(vocab), word_embedding_size, enc_hidden_size,
                vocab_pad_idx, dropout_ratio,
                bidirectional=args.bidirectional, glove=glove))
    decoder = try_cuda(
        Decoder(action_embedding_size, hidden_size, dropout_ratio,
                feature_size=feature_size, num_head=args.num_head))
    prog_monitor = try_cuda(ProgressMonitor(
        action_embedding_size, hidden_size)) if args.prog_monitor else None
    bt_button = try_cuda(BacktrackButton()) if args.bt_button else None
    dev_monitor = try_cuda(DeviationMonitor(
        action_embedding_size, hidden_size)) if args.dev_monitor else None

    agent = Seq2SeqAgent(None, "", encoder, decoder, max_episode_len,
                         max_instruction_length=MAX_INPUT_LENGTH,
                         attn_only_verb=args.attn_only_verb)
    agent.prog_monitor = prog_monitor
    agent.dev_monitor = dev_monitor  # not used
    agent.bt_button = bt_button  # not used
    agent.soft_align = args.soft_align  # not used

    if args.useObjLabelOrVis != 'none':
        if args.useDect:
            print('Using detection-based pointer')
            agent.pointer = DectPointer(args)
        else:
            print('Using gt-based pointer')
            agent.pointer = Pointer(args)
        agent.useObjLabelOrVis = args.useObjLabelOrVis
        agent.objTopK = args.objTopK
        agent.objVisFeatDim = args.objVisFeatDim
        agent.objLanFeatDim = args.objLanFeatDim
        agent.ObjEachViewVisFeatPath = osp.join(root_path, 'img_features',
                                                args.ObjEachViewVisFeatDir)
        agent.ObjEachViewLanFeatPath = osp.join(root_path, 'img_features',
                                                args.ObjEachViewLanFeatDir)
        agent.ObjEachViewVisFeat = {}
        agent.ObjEachViewLanFeat = {}
        dict_glove = np.load(args.labelGlovePath)  # for object label encoding
        if args.useObjLabelOrVis in ['label', 'both']:
            agent.objLabelEncoder = try_cuda(
                EncoderLSTMGlove(dict_glove.shape[0], 300,
                                 int(enc_hidden_size / 2), vocab_pad_idx,
                                 dropout_ratio, bidirectional=True,
                                 glove=dict_glove))
        else:
            agent.objLabelEncoder = None
    else:
        agent.pointer = None

    if args.scorer:  # not used
        agent.scorer = make_scorer(args)

    if args.load_follower != '':
        scorer_exists = osp.isfile(args.load_follower + '_scorer_enc')
        agent.load(args.load_follower,
                   load_scorer=(args.load_scorer == '' and scorer_exists))
        print(colorize('load follower ' + args.load_follower))

    return agent
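
# Hedged usage sketch: a minimal argparse-style namespace covering the fields
# that make_follower and make_scorer read above. All values here are
# illustrative defaults, not the project's real configuration; the real
# training entry point presumably builds args via argparse.
def _example_follower_args():
    from argparse import Namespace
    return Namespace(
        bidirectional=True,
        use_glove=False,
        useObjLabelOrVis='none',
        objVisFeatDim=2048,
        objLanFeatDim=300,
        objTopK=8,
        useDect=False,
        ObjEachViewVisFeatDir='obj_vis_feats',
        ObjEachViewLanFeatDir='obj_lan_feats',
        labelGlovePath='data/label_glove.npy',
        transformer=False,
        coground=False,
        num_head=1,
        prog_monitor=True,
        dev_monitor=False,
        bt_button=False,
        attn_only_verb=False,
        soft_align=False,
        scorer=False,
        load_follower='',
        load_scorer='',
        load_traj_encoder='',
    )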
def __init__(self, scorer, encoder):
    self.scorer = scorer
    self.text_encoder = encoder
    self.traj_encoder = None
    self.sm = try_cuda(nn.Softmax(dim=1))
    self.gamma = 0.0  # how much follower_logit to consider
def transform(lst, wrap_with_var=True):
    features = np.stack(lst)
    x = torch.from_numpy(features)
    if wrap_with_var:
        x = Variable(x, requires_grad=False)
    return try_cuda(x)
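
# Usage sketch for transform (assumes it is module-level, as written above;
# the feature shapes are illustrative only).
def _demo_transform():
    feats = [np.random.rand(5).astype(np.float32) for _ in range(2)]
    batched = transform(feats)                   # Variable of shape [2, 5]
    raw = transform(feats, wrap_with_var=False)  # plain tensor, no Variable wrapper
    return batched, raw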
def batch_features(self, feature_list):
    features = np.stack(feature_list)
    return try_cuda(
        Variable(torch.from_numpy(features), requires_grad=False))
def beam_search(self, beam_size, path_obs, path_actions):
    # TODO: here
    assert len(path_obs) == len(path_actions)

    start_obs, batched_image_features, batched_action_embeddings, path_mask, \
        path_lengths, _, perm_indices = \
        batch_observations_and_actions(path_obs, path_actions, None)
    batch_size = len(start_obs)
    assert len(perm_indices) == batch_size

    ctx, h_t, c_t = self.encoder(batched_action_embeddings,
                                 batched_image_features)

    completed = []
    for _ in range(batch_size):
        completed.append([])

    beams = [[
        InferenceState(prev_inference_state=None,
                       flat_index=i,
                       last_word=vocab_bos_idx,
                       word_count=0,
                       score=0.0,
                       last_alpha=None)
    ] for i in range(batch_size)]

    for t in range(self.instruction_len):
        flat_indices = []
        beam_indices = []
        w_t_list = []
        for beam_index, beam in enumerate(beams):
            for inf_state in beam:
                beam_indices.append(beam_index)
                flat_indices.append(inf_state.flat_index)
                w_t_list.append(inf_state.last_word)
        w_t = try_cuda(
            Variable(torch.LongTensor(w_t_list), requires_grad=False))
        if len(w_t.shape) == 1:
            w_t = w_t.unsqueeze(0)

        h_t, c_t, alpha, logit = self.decoder(w_t.view(-1, 1),
                                              h_t[flat_indices],
                                              c_t[flat_indices],
                                              ctx[beam_indices],
                                              path_mask[beam_indices])

        log_probs = F.log_softmax(logit, dim=1).data
        _, word_indices = logit.data.topk(min(beam_size, logit.size()[1]),
                                          dim=1)
        word_scores = log_probs.gather(1, word_indices)
        assert word_scores.size() == word_indices.size()

        start_index = 0
        new_beams = []
        all_successors = []
        for beam_index, beam in enumerate(beams):
            successors = []
            end_index = start_index + len(beam)
            if beam:
                for inf_index, (inf_state, word_score_row, word_index_row) in \
                        enumerate(zip(beam,
                                      word_scores[start_index:end_index],
                                      word_indices[start_index:end_index])):
                    for word_score, word_index in zip(word_score_row,
                                                      word_index_row):
                        flat_index = start_index + inf_index
                        successors.append(
                            InferenceState(
                                prev_inference_state=inf_state,
                                flat_index=flat_index,
                                last_word=word_index,
                                word_count=inf_state.word_count + 1,
                                score=inf_state.score + word_score,
                                last_alpha=alpha[flat_index].data))
            start_index = end_index
            successors = sorted(successors,
                                key=lambda t: t.score,
                                reverse=True)[:beam_size]
            all_successors.append(successors)

        for beam_index, successors in enumerate(all_successors):
            new_beam = []
            for successor in successors:
                if successor.last_word == vocab_eos_idx or \
                        t == self.instruction_len - 1:
                    completed[beam_index].append(successor)
                else:
                    new_beam.append(successor)
            if len(completed[beam_index]) >= beam_size:
                new_beam = []
            new_beams.append(new_beam)

        beams = new_beams

        if not any(beam for beam in beams):
            break

    outputs = []
    for _ in range(batch_size):
        outputs.append([])

    for perm_index, src_index in enumerate(perm_indices):
        this_outputs = outputs[src_index]
        assert len(this_outputs) == 0
        this_completed = completed[perm_index]
        instr_id = start_obs[perm_index]['instr_id']
        for inf_state in sorted(this_completed,
                                key=lambda t: t.score,
                                reverse=True)[:beam_size]:
            word_indices, scores, attentions = backchain_inference_states(
                inf_state)
            this_outputs.append({
                'instr_id': instr_id,
                'word_indices': word_indices,
                'score': inf_state.score,
                'scores': scores,
                'words': self.env.tokenizer.decode_sentence(
                    word_indices, break_on_eos=True, join=False),
                'attentions': attentions,
            })
    return outputs
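
# Toy sketch (made-up scores) of the per-beam pruning rule used in beam_search:
# successor hypotheses are ranked by accumulated log-probability and only the
# top beam_size survive to the next timestep.
def _demo_prune_successors(beam_size=2):
    successors = [('go', -0.1), ('walk', -0.4), ('turn', -1.3)]
    return sorted(successors, key=lambda s: s[1], reverse=True)[:beam_size]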
def _score_obs_actions_and_instructions(self, path_obs, path_actions,
                                        encoded_instructions, feedback):
    assert len(path_obs) == len(path_actions)
    assert len(path_obs) == len(encoded_instructions)

    start_obs, batched_image_features, batched_action_embeddings, path_mask, \
        path_lengths, encoded_instructions, perm_indices = \
        batch_observations_and_actions(
            path_obs, path_actions, encoded_instructions)

    instr_seq, _, _ = batch_instructions_from_encoded(
        encoded_instructions, self.instruction_len)
    batch_size = len(start_obs)

    ctx, h_t, c_t = self.encoder(batched_action_embeddings,
                                 batched_image_features)
    w_t = try_cuda(
        Variable(torch.from_numpy(
            np.full((batch_size, ), vocab_bos_idx, dtype='int64')).long(),
                 requires_grad=False))
    ended = np.array([False] * batch_size)

    assert len(perm_indices) == batch_size
    outputs = [None] * batch_size
    for perm_index, src_index in enumerate(perm_indices):
        outputs[src_index] = {
            'instr_id': start_obs[perm_index]['instr_id'],
            'word_indices': [],
            'scores': [],
            # 'actions': ' '.join(FOLLOWER_MODEL_ACTIONS[ac]
            #                     for ac in path_actions[src_index]),
        }
    assert all(outputs)

    # Do a sequence rollout and calculate the loss
    loss = 0
    sequence_scores = try_cuda(torch.zeros(batch_size))
    for t in range(self.instruction_len):
        h_t, c_t, alpha, logit = self.decoder(w_t.view(-1, 1), h_t, c_t, ctx,
                                              path_mask)
        # Supervised training
        # BOS are not part of the encoded sequences
        target = instr_seq[:, t].contiguous()

        # Determine next model inputs
        if feedback == 'teacher':
            w_t = target
        elif feedback == 'argmax':
            _, w_t = logit.max(1)  # student forcing - argmax
            w_t = w_t.detach()
        elif feedback == 'sample':
            probs = F.softmax(logit, dim=1)  # sampling an action from model
            m = D.Categorical(probs)
            w_t = m.sample()
            # w_t = probs.multinomial(1).detach().squeeze(-1)
        else:
            sys.exit('Invalid feedback option')

        log_probs = F.log_softmax(logit, dim=1)
        word_scores = -F.nll_loss(log_probs, w_t,
                                  ignore_index=vocab_pad_idx,
                                  reduction='none')
        sequence_scores += word_scores.data
        loss += F.nll_loss(log_probs, target,
                           ignore_index=vocab_pad_idx,
                           reduction='mean')

        for perm_index, src_index in enumerate(perm_indices):
            word_idx = w_t[perm_index].item()
            if not ended[perm_index]:
                outputs[src_index]['word_indices'].append(int(word_idx))
                outputs[src_index]['score'] = float(
                    sequence_scores[perm_index])
                outputs[src_index]['scores'].append(
                    word_scores[perm_index].data.tolist())
            if word_idx == vocab_eos_idx:
                ended[perm_index] = True

        # Early exit if all ended
        if ended.all():
            break

    for item in outputs:
        item['words'] = self.env.tokenizer.decode_sentence(
            item['word_indices'], break_on_eos=True, join=False)

    return outputs, loss
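
# Small sketch (made-up logits, hypothetical pad index) of the per-word scoring
# above: -nll_loss with reduction='none' returns the log-probability of each
# chosen word, which is what gets accumulated into sequence_scores every step.
def _demo_word_scores(pad_idx=0):
    logit = torch.tensor([[0.2, 1.0, -0.5], [2.0, 0.1, 0.3]])
    chosen = torch.tensor([1, 2])
    log_probs = F.log_softmax(logit, dim=1)
    return -F.nll_loss(log_probs, chosen, ignore_index=pad_idx, reduction='none')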