def update(self):
    """
    Compute the DQN training loss on one minibatch drawn from replay memory.

    The method samples `self.replay_batch_size` transitions, evaluates the
    Q-value of the words that were actually chosen in each transition,
    builds the one-step bootstrap target
    `reward + not_done * discount_gamma * next_q_value`, and returns the
    smooth-L1 loss between the masked prediction and the masked target.

    The target branch is evaluated under `torch.no_grad()`: it is never
    differentiated, so recording an autograd graph for it only costs
    memory and time.

    Returns:
        None if the replay memory holds fewer than `self.replay_batch_size`
        transitions; otherwise the loss returned by `F.smooth_l1_loss`.
    """
    if len(self.replay_memory) < self.replay_batch_size:
        return None

    transitions = self.replay_memory.sample(self.replay_batch_size)
    # Turn a list of Transition records into one Transition of per-field tuples.
    batch = Transition(*zip(*transitions))

    observation_id_list = pad_sequences(
        batch.observation_id_list,
        maxlen=max_len(batch.observation_id_list)).astype('int32')
    input_observation = to_pt(observation_id_list, self.use_cuda)

    next_observation_id_list = pad_sequences(
        batch.next_observation_id_list,
        maxlen=max_len(batch.next_observation_id_list)).astype('int32')
    next_input_observation = to_pt(next_observation_id_list, self.use_cuda)

    # Regroup the chosen word indices from per-transition to per-word-slot,
    # then stack each slot over the batch (original author's note: each
    # entry is batch x 1).
    chosen_indices = [
        torch.stack(item, 0) for item in zip(*batch.word_indices)
    ]

    # Original author's note: one (batch x vocab) tensor per output word slot.
    word_ranks = self.infer_word_ranks(input_observation)
    # Pick, for every slot, the score of the word that was actually chosen.
    word_qvalues = [
        w_rank.gather(1, idx).squeeze(-1)
        for w_rank, idx in zip(word_ranks, chosen_indices)
    ]
    # Q-value of a command is the mean over its word slots.
    q_value = torch.mean(torch.stack(word_qvalues, -1), -1)

    # Bootstrap target: no gradient flows through it, so skip graph recording.
    with torch.no_grad():
        next_word_ranks = self.infer_word_ranks(next_input_observation)
        next_word_masks = [
            np.stack(item, 0) for item in zip(*batch.next_word_masks)
        ]
        next_word_qvalues, _ = _choose_maxQ_command(
            next_word_ranks, next_word_masks, self.use_cuda)
        next_q_value = torch.mean(torch.stack(next_word_qvalues, -1), -1)
    # Kept as a defensive guarantee that the target carries no graph.
    next_q_value = next_q_value.detach()

    rewards = torch.stack(batch.reward)
    # 1.0 for transitions that did not end the episode, 0.0 otherwise, so
    # terminal transitions get no bootstrap term.
    not_done = 1.0 - np.array(batch.done, dtype='float32')
    not_done = to_pt(not_done, self.use_cuda, type='float')

    rewards = rewards + not_done * next_q_value * self.discount_gamma

    # Multiply both sides by the stored per-transition mask so entries with
    # mask 0 contribute zero error to the loss.
    mask = torch.stack(batch.mask)
    loss = F.smooth_l1_loss(q_value * mask, rewards * mask)
    return loss
def get_game_step_info(self, obs: List[str], infos: Dict[str, List[Any]]):
    """
    Build the model input for the current step of every game in the batch.

    Each text source (inventory, command feedback, recipe, previous action
    and room description) is tokenized with `preproc` and mapped to ids
    with `_words_to_ids`. For each game the id sequences are concatenated
    in the order description, inventory, recipe, feedback, previous action,
    then the batch is padded with `pad_sequences` and converted by `to_pt`.

    Arguments:
        obs: Previous command's feedback for each game.
        infos: Additional information for each game; the keys
            "inventory", "extra.recipe" and "description" are read.

    Returns:
        A pair `(input_description, description_id_list)`: the padded
        int32 batch after `to_pt`, and the unpadded concatenated id lists.
    """
    tokenizer = self.nlp
    vocab = self.word2id

    inventory_tokens = [preproc(text, tokenizer=tokenizer) for text in infos["inventory"]]
    inventory_ids = [_words_to_ids(toks, vocab) for toks in inventory_tokens]

    feedback_tokens = [
        preproc(text, str_type='feedback', tokenizer=tokenizer) for text in obs
    ]
    feedback_ids = [_words_to_ids(toks, vocab) for toks in feedback_tokens]

    quest_tokens = [preproc(text, tokenizer=tokenizer) for text in infos["extra.recipe"]]
    quest_ids = [_words_to_ids(toks, vocab) for toks in quest_tokens]

    prev_action_tokens = [preproc(text, tokenizer=tokenizer) for text in self.prev_actions]
    prev_action_ids = [_words_to_ids(toks, vocab) for toks in prev_action_tokens]

    description_tokens = [preproc(text, tokenizer=tokenizer) for text in infos["description"]]
    # An empty description is replaced by the single word "end".
    description_tokens = [
        ["end"] if len(toks) == 0 else toks for toks in description_tokens
    ]
    description_ids = [_words_to_ids(toks, vocab) for toks in description_tokens]

    # One flat id sequence per game, all sources joined end to end.
    combined_ids = []
    for desc, inv, quest, feedback, prev_action in zip(
            description_ids, inventory_ids, quest_ids, feedback_ids, prev_action_ids):
        combined_ids.append(desc + inv + quest + feedback + prev_action)

    padded = pad_sequences(combined_ids, maxlen=max_len(combined_ids))
    input_description = to_pt(padded.astype('int32'), self.use_cuda)
    return input_description, combined_ids
def get_agent_inputs(self, string_list):
    """
    Convert a batch of strings into word-level and char-level model inputs.

    Each string is split on whitespace. The tokens are mapped to word ids
    with `_words_to_ids` and to a char-level input with
    `list_of_token_list_to_char_input`.

    Arguments:
        string_list: Iterable of strings, one per batch entry.

    Returns:
        A triple `(input_sentence, input_sentence_char, sentence_id_list)`:
        the padded int32 word-id batch after `to_pt`, the char-level input
        after `to_pt`, and the unpadded word-id lists.
    """
    tokens_per_sentence = [text.split() for text in string_list]

    sentence_id_list = []
    for tokens in tokens_per_sentence:
        sentence_id_list.append(_words_to_ids(tokens, self.word2id))

    char_input = list_of_token_list_to_char_input(tokens_per_sentence, self.char2id)

    padded_ids = pad_sequences(sentence_id_list, maxlen=max_len(sentence_id_list))
    word_input = padded_ids.astype('int32')

    input_sentence = to_pt(word_input, self.use_cuda)
    input_sentence_char = to_pt(char_input, self.use_cuda)
    return input_sentence, input_sentence_char, sentence_id_list
def get_game_step_info(self, obs: List[str], infos: Dict[str, List[Any]]):
    """
    Build the model input for the current step of every game in the batch.

    Every text source is converted to token ids by
    `get_token_ids_for_items` using `self.vocab.word2id` and `self.nlp`.
    For each game the id sequences are concatenated in the order
    description, inventory, recipe, feedback, previous action, then the
    batch is padded with `pad_sequences` and converted by `to_pt`.

    Arguments:
        obs: Previous command's feedback for each game.
        infos: Additional information for each game; the keys
            "inventory", "extra.recipe" and "description" are read.

    Returns:
        A pair `(input_description, description_id_list)`: the padded
        int32 batch after `to_pt`, and the unpadded concatenated id lists.
    """
    vocab_map = self.vocab.word2id

    def encode(items, **extra):
        # Shared call shape for every text source.
        return get_token_ids_for_items(items, vocab_map, tokenizer=self.nlp, **extra)

    inventory_ids = encode(infos["inventory"])
    feedback_ids = encode(obs)
    quest_ids = encode(infos["extra.recipe"])
    prev_action_ids = encode(self.prev_actions)
    # ['end'] is passed as the substitute for empty descriptions.
    description_ids = encode(infos["description"], subst_if_empty=['end'])

    # One flat id sequence per game, all sources joined end to end.
    combined_ids = []
    for desc, inv, quest, feedback, prev_action in zip(
            description_ids, inventory_ids, quest_ids, feedback_ids, prev_action_ids):
        combined_ids.append(desc + inv + quest + feedback + prev_action)

    padded = pad_sequences(combined_ids, maxlen=max_len(combined_ids))
    input_description = to_pt(padded.astype('int32'), self.use_cuda)
    return input_description, combined_ids