def update(self):
    """
    Update the neural model in the agent. In this example we follow the
    DQN update algorithm with a replay memory.
    """
    if len(self.replay_memory) < self.replay_batch_size:
        return None
    transitions = self.replay_memory.sample(self.replay_batch_size)
    batch = Transition(*zip(*transitions))

    observation_id_list = pad_sequences(
        batch.observation_id_list,
        maxlen=max_len(batch.observation_id_list)).astype('int32')
    input_observation = to_pt(observation_id_list, self.use_cuda)
    next_observation_id_list = pad_sequences(
        batch.next_observation_id_list,
        maxlen=max_len(batch.next_observation_id_list)).astype('int32')
    next_input_observation = to_pt(next_observation_id_list, self.use_cuda)
    chosen_indices = list(list(zip(*batch.word_indices)))
    chosen_indices = [torch.stack(item, 0) for item in chosen_indices]  # list of batch x 1

    word_ranks = self.infer_word_ranks(
        input_observation)  # list of batch x vocab, len=5 (one per potential output word)
    word_qvalues = [
        w_rank.gather(1, idx).squeeze(-1)
        for w_rank, idx in zip(word_ranks, chosen_indices)
    ]  # list of batch
    q_value = torch.mean(torch.stack(word_qvalues, -1), -1)  # batch

    next_word_ranks = self.infer_word_ranks(
        next_input_observation)  # batch x n_verb, batch x n_noun, batch x n_second_noun
    next_word_masks = list(list(zip(*batch.next_word_masks)))
    next_word_masks = [np.stack(item, 0) for item in next_word_masks]
    next_word_qvalues, _ = _choose_maxQ_command(next_word_ranks,
                                                next_word_masks,
                                                self.use_cuda)
    next_q_value = torch.mean(torch.stack(next_word_qvalues, -1), -1)  # batch
    next_q_value = next_q_value.detach()

    rewards = torch.stack(batch.reward)  # batch
    not_done = 1.0 - np.array(batch.done, dtype='float32')  # batch
    not_done = to_pt(not_done, self.use_cuda, type='float')
    rewards = rewards + not_done * next_q_value * self.discount_gamma  # batch

    mask = torch.stack(batch.mask)  # batch
    loss = F.smooth_l1_loss(q_value * mask, rewards * mask)
    return loss
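# --- Illustrative sketch (not part of the original code) ---
# update() above assumes a `Transition` record and a replay memory exposing
# `sample()` and `__len__()`. Their real definitions live elsewhere in the
# project; the minimal versions below are assumptions that only match how
# they are used in update() (field names taken from the attribute accesses).
import random
from collections import namedtuple

Transition = namedtuple('Transition', (
    'observation_id_list', 'word_indices', 'reward', 'mask', 'done',
    'next_observation_id_list', 'next_word_masks'))


class ReplayMemory:
    """Fixed-size ring buffer of transitions with uniform sampling."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)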
def get_game_step_info(self, obs: List[str], infos: Dict[str, List[Any]]):
    """
    Get all the available information and concatenate it into a tensor
    for a neural model. We use post padding here; all information is
    tokenized here.

    Arguments:
        obs: Previous command's feedback for each game.
        infos: Additional information for each game.
    """
    inventory_token_list = [preproc(item, tokenizer=self.nlp) for item in infos["inventory"]]
    inventory_id_list = [_words_to_ids(tokens, self.word2id) for tokens in inventory_token_list]
    feedback_token_list = [preproc(item, str_type='feedback', tokenizer=self.nlp) for item in obs]
    feedback_id_list = [_words_to_ids(tokens, self.word2id) for tokens in feedback_token_list]
    quest_token_list = [preproc(item, tokenizer=self.nlp) for item in infos["extra.recipe"]]
    quest_id_list = [_words_to_ids(tokens, self.word2id) for tokens in quest_token_list]
    prev_action_token_list = [preproc(item, tokenizer=self.nlp) for item in self.prev_actions]
    prev_action_id_list = [_words_to_ids(tokens, self.word2id) for tokens in prev_action_token_list]
    description_token_list = [preproc(item, tokenizer=self.nlp) for item in infos["description"]]
    for i, d in enumerate(description_token_list):
        if len(d) == 0:
            description_token_list[i] = ["end"]  # if the description is empty, insert the word "end"
    description_id_list = [_words_to_ids(tokens, self.word2id) for tokens in description_token_list]
    description_id_list = [_d + _i + _q + _f + _pa
                           for (_d, _i, _q, _f, _pa) in zip(description_id_list,
                                                            inventory_id_list,
                                                            quest_id_list,
                                                            feedback_id_list,
                                                            prev_action_id_list)]

    input_description = pad_sequences(
        description_id_list,
        maxlen=max_len(description_id_list)).astype('int32')
    input_description = to_pt(input_description, self.use_cuda)
    return input_description, description_id_list
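# --- Illustrative sketch (not part of the original code) ---
# get_game_step_info() relies on `pad_sequences` and `max_len` helpers for
# the post padding mentioned in the docstring. Their real implementations
# are defined elsewhere in the project; the versions below are assumptions
# that only mirror how they are called in this file (maxlen/dtype keywords,
# numpy output, padding placed after the data).
import numpy as np


def max_len(list_of_lists):
    """Length of the longest inner list."""
    return max(len(x) for x in list_of_lists)


def pad_sequences(sequences, maxlen=None, dtype='int32', value=0):
    """Right-pad (post-pad) a list of lists into a 2D numpy array."""
    if maxlen is None:
        maxlen = max_len(sequences)
    result = np.full((len(sequences), maxlen), value, dtype=dtype)
    for i, seq in enumerate(sequences):
        trunc = seq[:maxlen]             # truncate sequences longer than maxlen
        result[i, :len(trunc)] = trunc   # post padding: data first, padding after
    return result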
def get_qa_loss(self):
    """
    Compute the question-answering loss on a batch sampled from the QA
    replay memory.
    """
    if len(self.qa_replay_memory) < self.replay_batch_size:
        return None
    transitions = self.qa_replay_memory.sample(self.replay_batch_size)
    batch = qa_Transition(*zip(*transitions))

    answer_distribution, obs_mask = self.answer_question(
        batch.observation_list, batch.quest_list,
        use_model="online")  # answer_distribution is batch x time x 2
    answer_distribution = masked_softmax(answer_distribution,
                                         obs_mask.unsqueeze(-1), axis=1)

    answer_strings = [item[0] for item in batch.answer_strings]
    groundtruth_answer_positions = get_answer_position(
        batch.observation_list, answer_strings)  # list: batch x 2
    groundtruth = pad_sequences(groundtruth_answer_positions).astype('int32')
    groundtruth = to_pt(groundtruth, self.use_cuda)  # batch x 2
    batch_loss = NegativeLogLoss(
        answer_distribution * obs_mask.unsqueeze(-1), groundtruth)
    return torch.mean(batch_loss)
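# --- Illustrative sketch (not part of the original code) ---
# get_qa_loss() assumes a `NegativeLogLoss` helper that scores the predicted
# pointer distributions against the ground-truth token positions. The exact
# implementation is defined elsewhere; the version below is an assumption
# consistent with the shapes commented above: a (batch x time x 2)
# distribution and a (batch x 2) tensor of gold start/end indices.
import torch


def NegativeLogLoss(answer_distribution, groundtruth, eps=1e-8):
    """-log p at the gold start/end positions, summed over the two pointers.

    answer_distribution: float tensor, batch x time x 2 (softmax over time).
    groundtruth:         long tensor,  batch x 2 (start and end indices).
    Returns a tensor of shape (batch,).
    """
    idx = groundtruth.unsqueeze(1)                           # batch x 1 x 2
    picked = answer_distribution.gather(1, idx).squeeze(1)   # batch x 2
    return -torch.log(picked + eps).sum(dim=-1)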
def get_agent_inputs(self, string_list):
    """Convert a batch of strings into padded word-id and char-id tensors."""
    sentence_token_list = [item.split() for item in string_list]
    sentence_id_list = [
        _words_to_ids(tokens, self.word2id) for tokens in sentence_token_list
    ]
    input_sentence_char = list_of_token_list_to_char_input(
        sentence_token_list, self.char2id)
    input_sentence = pad_sequences(
        sentence_id_list, maxlen=max_len(sentence_id_list)).astype('int32')
    input_sentence = to_pt(input_sentence, self.use_cuda)
    input_sentence_char = to_pt(input_sentence_char, self.use_cuda)
    return input_sentence, input_sentence_char, sentence_id_list
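# --- Illustrative sketch (not part of the original code) ---
# get_agent_inputs() assumes word- and character-level vocabulary lookups.
# The helpers below are assumptions matching how they are used above:
# unknown tokens fall back to an "<unk>" entry, and the character input is
# a (batch x word x char) id array padded with zeros.
import numpy as np


def _words_to_ids(words, word2id):
    """Map a token list to vocabulary ids, falling back to <unk>."""
    return [word2id.get(w, word2id.get("<unk>", 0)) for w in words]


def list_of_token_list_to_char_input(token_lists, char2id):
    """Build a batch x max_words x max_chars array of character ids."""
    max_words = max(len(tokens) for tokens in token_lists)
    max_chars = max((len(w) for tokens in token_lists for w in tokens), default=1)
    result = np.zeros((len(token_lists), max_words, max_chars), dtype='int32')
    for i, tokens in enumerate(token_lists):
        for j, word in enumerate(tokens):
            for k, ch in enumerate(word):
                result[i, j, k] = char2id.get(ch, char2id.get("<unk>", 0))
    return result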
def get_game_step_info(self, obs: List[str], infos: Dict[str, List[Any]]):
    """
    Get all the available information and concatenate it into a tensor
    for a neural model. We use post padding here; all information is
    tokenized here.

    Arguments:
        obs: Previous command's feedback for each game.
        infos: Additional information for each game.
    """
    word2id = self.vocab.word2id
    inventory_id_list = get_token_ids_for_items(infos["inventory"],
                                                word2id, tokenizer=self.nlp)
    feedback_id_list = get_token_ids_for_items(obs, word2id,
                                               tokenizer=self.nlp)
    quest_id_list = get_token_ids_for_items(infos["extra.recipe"],
                                            word2id, tokenizer=self.nlp)
    prev_action_id_list = get_token_ids_for_items(self.prev_actions,
                                                  word2id, tokenizer=self.nlp)
    description_id_list = get_token_ids_for_items(infos["description"],
                                                  word2id, tokenizer=self.nlp,
                                                  subst_if_empty=['end'])
    description_id_list = [
        _d + _i + _q + _f + _pa
        for (_d, _i, _q, _f, _pa) in zip(description_id_list,
                                         inventory_id_list,
                                         quest_id_list,
                                         feedback_id_list,
                                         prev_action_id_list)
    ]

    input_description = pad_sequences(
        description_id_list,
        maxlen=max_len(description_id_list)).astype('int32')
    input_description = to_pt(input_description, self.use_cuda)
    return input_description, description_id_list
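# --- Illustrative sketch (not part of the original code) ---
# This variant of get_game_step_info() folds the per-item tokenization from
# the earlier version into a `get_token_ids_for_items` helper. A sketch of
# that helper, assumed to mirror the preproc/_words_to_ids steps shown above
# (including the substitute used when a description comes back empty):
def get_token_ids_for_items(items, word2id, tokenizer=None, subst_if_empty=None):
    """Tokenize each item and map its tokens to vocabulary ids."""
    token_lists = [preproc(item, tokenizer=tokenizer) for item in items]
    if subst_if_empty is not None:
        token_lists = [tokens if len(tokens) > 0 else subst_if_empty
                       for tokens in token_lists]
    return [_words_to_ids(tokens, word2id) for tokens in token_lists]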
def get_sufficient_info_reward_attribute(reward_helper_info):
    asked_entities = reward_helper_info["_entities"]
    asked_attributes = reward_helper_info["_attributes"]
    init_game_facts = reward_helper_info["init_game_facts"]
    full_facts = reward_helper_info["full_facts"]
    answers = reward_helper_info["answers"]
    game_facts_per_step = reward_helper_info["game_facts_per_step"]  # batch x game step+1
    commands_per_step = reward_helper_info["commands_per_step"]  # batch x game step+1
    game_finishing_mask = reward_helper_info["game_finishing_mask"]  # game step x batch size

    rewards = []
    coverage_rewards = []
    seen_entity_reward = []
    for i in range(len(asked_entities)):  # iterate over the batch
        reward = check_reasoning_path_reward_sequence(asked_entities[i],
                                                      asked_attributes[i],
                                                      game_facts_per_step[i],
                                                      commands_per_step[i],
                                                      bool(int(answers[i])))
        rewards.append(reward)
        # add coverage
        end_facts = set()  # world discovered so far = union of the observed game facts over all steps
        for t in range(len(game_facts_per_step[i])):
            end_facts = end_facts | set(game_facts_per_step[i][t])
        coverage = exploration_coverage(full_facts[i], end_facts, init_game_facts[i])
        coverage_rewards.append(coverage)
        seen_entities = set(name for f in end_facts for name in f.names)
        seen_entity_reward.append(1.0 if asked_entities[i] in seen_entities else 0.0)

    res = pad_sequences(rewards, dtype="float32")  # batch x game step
    res = res * game_finishing_mask.T
    coverage_rewards = np.array(coverage_rewards)
    seen_entity_reward = np.array(seen_entity_reward)
    res = res + game_finishing_mask.T * np.expand_dims(
        coverage_rewards + seen_entity_reward, axis=-1) * 0.1
    return res  # batch x game step
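# --- Illustrative sketch (not part of the original code) ---
# The coverage bonus above relies on an `exploration_coverage` helper whose
# definition lives elsewhere in the project. The sketch below is an
# assumption consistent with how it is called here: how much of the
# initially-unknown world (full facts minus initial facts) the agent has
# uncovered by the end of the episode.
def exploration_coverage(full_facts, end_facts, init_facts):
    """Fraction of initially-unknown facts that have been discovered."""
    undiscovered = set(full_facts) - set(init_facts)
    if len(undiscovered) == 0:
        return 0.0
    discovered = (set(end_facts) - set(init_facts)) & undiscovered
    return float(len(discovered)) / float(len(undiscovered))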
def answer_question(self, input_observation, input_observation_char,
                    observation_id_list, input_quest, input_quest_char,
                    use_model="online"):
    # first pad answerer_input, and get the mask
    model = self.online_net if use_model == "online" else self.target_net
    batch_size = len(observation_id_list)
    max_length = input_observation.size(1)
    mask = compute_mask(input_observation)  # batch x obs_len

    # noun mask for location question
    if self.question_type in ["location"]:
        location_mask = []
        for i in range(batch_size):
            m = [1 for item in observation_id_list[i]]
            location_mask.append(m)
        location_mask = pad_sequences(location_mask, maxlen=max_length, dtype="float32")
        location_mask = to_pt(location_mask, enable_cuda=self.use_cuda, type='float')
        assert mask.size() == location_mask.size()
        mask = mask * location_mask

    match_representation_sequence = self.get_match_representations(
        input_observation, input_observation_char,
        input_quest, input_quest_char, use_model=use_model)
    pred = model.answer_question(match_representation_sequence, mask)  # batch x vocab or batch x 2

    # attention sum:
    # sometimes a certain word appears multiple times in the observation,
    # thus we need to merge them together before doing further computations
    # ------- but
    # if the answer type is not pointing, we just use a pre-defined mapping
    # that maps 0/1 to their positions in vocab
    if self.answer_type == "2 way":
        observation_id_list = []
        max_length = 2
        for i in range(batch_size):
            observation_id_list.append([self.word2id["0"], self.word2id["1"]])

    observation = to_pt(
        pad_sequences(observation_id_list, maxlen=max_length).astype('int32'),
        self.use_cuda)
    vocab_distribution = np.zeros((batch_size, len(self.word_vocab)))  # batch x vocab
    vocab_distribution = to_pt(vocab_distribution, self.use_cuda, type='float')
    vocab_distribution = vocab_distribution.scatter_add_(1, observation, pred)  # batch x vocab
    non_zero_words = []
    for i in range(batch_size):
        non_zero_words.append(list(set(observation_id_list[i])))
    vocab_mask = torch.ne(vocab_distribution, 0).float()
    return vocab_distribution, non_zero_words, vocab_mask
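# --- Illustrative sketch (not part of the original code) ---
# A small, standalone example of the "attention sum" step above: when the
# same word id appears more than once in the observation, scatter_add_
# accumulates its pointer probabilities into a single vocabulary slot.
# The vocabulary size and values are made up purely for illustration.
import torch

vocab_size = 6
observation = torch.tensor([[2, 4, 2, 5]])    # word id 2 appears twice
pred = torch.tensor([[0.1, 0.3, 0.2, 0.4]])   # pointer probs over positions
vocab_distribution = torch.zeros(1, vocab_size)
vocab_distribution.scatter_add_(1, observation, pred)
# vocab_distribution -> [[0.0, 0.0, 0.3, 0.0, 0.3, 0.4]]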