def record_training_sample(self, state, agent_action, reward, next_state, episode_over):
    # Samples of the master agent.
    shaping = self.reward_shaping(state, next_state)
    alpha = self.parameter.get("weight_for_reward_shaping")
    # Reward shaping only for non-terminal states.
    if episode_over is not True:
        reward = reward + alpha * shaping
    if self.parameter.get("state_reduced"):
        state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set)  # sequence representation.
        next_state_rep = reduced_state_to_representation_last(state=next_state, slot_set=self.slot_set)
    else:
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
        next_state_rep = state_to_representation_last(state=next_state, action_set=self.action_set, slot_set=self.slot_set,
                                                      disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
    if self.parameter.get("value_as_reward") is True:
        # Use the max Q-value of the selected lower agent as the master's reward.
        q_values = self.id2lowerAgent[self.current_lower_agent_id].get_q_values(state)
        master_reward = np.max(q_values, axis=1)[0]
    else:
        master_reward = reward
    self.experience_replay_pool.append((state_rep, self.master_action_index, master_reward, next_state_rep, episode_over))
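# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): reward_shaping() is not
# shown in this section. A common choice consistent with how it is used above
# is potential-based shaping, F(s, s') = gamma * phi(s') - phi(s). The
# potential phi below, counting the symptoms collected so far, is an
# assumption for illustration only.
# ---------------------------------------------------------------------------
def reward_shaping_sketch(state, next_state, gamma=0.95):
    def phi(dialogue_state):
        # Hypothetical state layout: symptoms gathered so far are stored in
        # dialogue_state["current_slots"]["inform_slots"].
        return len(dialogue_state["current_slots"]["inform_slots"])
    return gamma * phi(next_state) - phi(state)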
def record_training_sample(self, state, agent_action, reward, next_state, episode_over, **kwargs):
    symptom_dist_as_input = self.parameter.get("symptom_dist_as_input")
    agent_id = self.parameter.get("agent_id")
    if self.parameter.get("state_reduced"):
        state = reduced_state_to_representation_last(state=state, slot_set=self.slot_set)  # sequence representation.
        next_state = reduced_state_to_representation_last(state=next_state, slot_set=self.slot_set)
    else:
        state = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                             disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
        next_state = state_to_representation_last(state=next_state, action_set=self.action_set, slot_set=self.slot_set,
                                                  disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
    if symptom_dist_as_input is True and agent_id.lower() == 'agenthrl':
        symptom_dist = kwargs.get('symptom_dist')
        state = np.concatenate((state, symptom_dist), axis=0)
        next_state = np.concatenate((next_state, symptom_dist), axis=0)
    self.experience_replay_pool.append((state, agent_action, reward, next_state, episode_over))
def record_prioritized_training_sample(self, state, agent_action, reward, next_state, episode_over, TD_error, **kwargs):
    shaping = self.reward_shaping(state, next_state)
    alpha = self.parameter.get("weight_for_reward_shaping")
    # Reward shaping only for non-terminal states.
    if episode_over is not True:
        reward = reward + alpha * shaping
    if self.parameter.get("state_reduced"):
        state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set)  # sequence representation.
        next_state_rep = reduced_state_to_representation_last(state=next_state, slot_set=self.slot_set)
    else:
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
        next_state_rep = state_to_representation_last(state=next_state, action_set=self.action_set, slot_set=self.slot_set,
                                                      disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
    self.experience_replay_pool.add(state_rep, self.master_action_index, reward, next_state_rep, episode_over, TD_error)
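# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the pool above exposes
# add(..., TD_error) instead of append(), suggesting proportional prioritized
# replay. The real buffer is not shown; this minimal list-based version (no
# sum-tree) only illustrates the sampling rule p_i ~ (|TD_error| + eps)^alpha.
# ---------------------------------------------------------------------------
import numpy as np

class SimplePrioritizedReplaySketch(object):
    def __init__(self, capacity=10000, alpha=0.6, eps=1e-5):
        self.capacity, self.alpha, self.eps = capacity, alpha, eps
        self.pool, self.priorities = [], []

    def add(self, state, action, reward, next_state, episode_over, td_error):
        if len(self.pool) >= self.capacity:
            # Drop the oldest transition together with its priority.
            self.pool.pop(0)
            self.priorities.pop(0)
        self.pool.append((state, action, reward, next_state, episode_over))
        self.priorities.append((abs(td_error) + self.eps) ** self.alpha)

    def sample(self, batch_size):
        probs = np.array(self.priorities) / np.sum(self.priorities)
        indices = np.random.choice(len(self.pool), size=batch_size, p=probs)
        return [self.pool[i] for i in indices]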
def record_training_sample(self, state, agent_action, reward, next_state, episode_over):
    # Samples of the lower agent.
    self.id2lowerAgent[self.current_lower_agent_id].record_training_sample(
        state, agent_action, reward, next_state, episode_over,
        symptom_dist=self.disease_to_symptom_dist[self.id2disease[self.current_lower_agent_id]])
    # Samples of the master agent.
    state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                             disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
    next_state_rep = state_to_representation_last(state=next_state, action_set=self.action_set, slot_set=self.slot_set,
                                                  disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
    if self.parameter.get("value_as_reward") is True:
        # Use the max Q-value of the selected lower agent as the master's reward.
        q_values = self.id2lowerAgent[self.current_lower_agent_id].get_q_values(state)
        master_reward = np.max(q_values, axis=1)[0]
    else:
        master_reward = reward
    self.experience_replay_pool.append((state_rep, self.current_lower_agent_id, master_reward,
                                        next_state_rep, episode_over, self.behave_prob))
def next2(self, state, turn, greedy_strategy, **kwargs):
    """
    Taking an action when the action space is changing; selects an action that is not "inform disease".
    :param state: a vector, the representation of the current dialogue state.
    :param turn: int, the time step of the current dialogue session.
    :return: a tuple consisting of the selected agent action and the action index.
    """
    self.agent_action["turn"] = turn
    symptom_dist = kwargs.get('symptom_dist')
    if self.parameter.get("state_reduced") and self.parameter.get("use_all_labels") is False:
        state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set,
                                                         parameter=self.parameter)  # sequence representation.
    else:
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
    # Lower agent of HRL with four lower agents.
    if self.symptom_dist_as_input is True and self.agent_id.lower() == 'agenthrl':
        state_rep = np.concatenate((state_rep, symptom_dist), axis=0)
    # HRL with goal (not the joint-training one).
    if "disease" in self.slot_set.keys():
        slot_num = len(self.slot_set) - 1
    else:
        slot_num = len(self.slot_set)
    goal = kwargs.get('goal')
    if self.agent_id.lower() in ['agentwithgoal', 'agentwithgoal2']:
        state_rep = np.concatenate((state_rep, goal), axis=0)
    if greedy_strategy is True:
        greedy = random.random()
        if greedy < self.parameter.get("epsilon"):
            action_index = random.randint(0, len(self.action_space) - 1)
        else:
            action_index = self.dqn.predict_slot(Xs=[state_rep], slot_num=slot_num)[1]
    # Evaluating mode.
    else:
        action_index = self.dqn.predict_slot(Xs=[state_rep], slot_num=slot_num)[1]
    if self.parameter.get("prioritized_replay"):
        Ys = self.dqn.predict(Xs=[state_rep])[0]
        self.current_action_value = Ys.detach().cpu().numpy()[0][action_index]
    # The last 10 actions in the current action space are "inform disease"; all preceding ones are "request slot".
    agent_action = self.action_space[action_index]
    agent_action["turn"] = turn
    agent_action["speaker"] = "agent"
    agent_action["action_index"] = action_index
    assert len(list(agent_action["request_slots"].keys())) == 1
    return agent_action, action_index
def __master_next__(self, state, last_master_action, greedy_strategy):
    # disease_symptom is not used in state_rep.
    epsilon = self.parameter.get("epsilon")
    if self.parameter.get("state_reduced"):
        state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set)  # sequence representation.
    else:
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom,
                                                 max_turn=self.parameter["max_turn"])  # sequence representation.
    # One-hot encoding of the last master action, appended to the state representation.
    last_action_rep = np.zeros(self.output_size)
    if last_master_action is not None:
        last_action_rep[last_master_action] = 1
    state_rep = np.concatenate((state_rep, last_action_rep), axis=0)
    # Master agent takes an action, i.e., selects a goal.
    if greedy_strategy is True:
        greedy = random.random()
        if greedy < epsilon:
            master_action_index = random.randint(0, self.output_size - 1)
        else:
            master_action_index = self.dqn.predict(Xs=[state_rep])[1]
    # Evaluating mode.
    else:
        master_action_index = self.dqn.predict(Xs=[state_rep])[1]
    return master_action_index
def record_training_sample(self, state, agent_action, reward, next_state, episode_over, **kwargs):
    shaping = self.reward_shaping(state, next_state)
    if self.parameter.get("agent_id").lower() in ["agenthrljoint", "agenthrljoint2"]:
        alpha = 0.0
    else:
        alpha = self.parameter.get("weight_for_reward_shaping")
    # Reward shaping only for non-terminal states.
    if episode_over is not True:
        reward = reward + alpha * shaping
    if self.parameter.get("state_reduced"):
        state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set,
                                                         parameter=self.parameter)  # sequence representation.
        next_state_rep = reduced_state_to_representation_last(state=next_state, slot_set=self.slot_set,
                                                              parameter=self.parameter)
    else:
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
        next_state_rep = state_to_representation_last(state=next_state, action_set=self.action_set, slot_set=self.slot_set,
                                                      disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
    self.experience_replay_pool.append((state_rep, agent_action, reward, next_state_rep, episode_over))
    self.action_visitation_count.setdefault(agent_action, 0)
    self.action_visitation_count[agent_action] += 1
def next_state_values_DDQN(self, next_state):
    # Double DQN: the online network selects the action, the target network evaluates it.
    if self.parameter.get("state_reduced"):
        state_rep = reduced_state_to_representation_last(state=next_state, slot_set=self.slot_set)  # sequence representation.
    else:
        state_rep = state_to_representation_last(state=next_state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
    action_index = self.master.predict(Xs=[state_rep])[1]
    Ys = self.master.predict_target(Xs=[state_rep])
    next_action_value = Ys.detach().cpu().numpy()[0][action_index]
    return next_action_value
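# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): next_state_values_DDQN
# implements the Double DQN evaluation -- the online network picks
# argmax_a Q(s', a) and the target network scores it. The bootstrap target
# built from that value would look as follows; gamma here is illustrative,
# the repository configures its discount factor elsewhere.
# ---------------------------------------------------------------------------
def ddqn_target_sketch(reward, next_action_value, episode_over, gamma=0.95):
    done = 1.0 if episode_over else 0.0
    return reward + gamma * (1.0 - done) * next_action_value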
def next(self, state, turn, greedy_strategy, **kwargs):
    """
    Taking an action based on different methods, e.g., DQN-based AgentDQN, rule-based AgentRule.
    Detailed code is implemented in the sub-classes of this class.
    :param state: a vector, the representation of the current dialogue state.
    :param turn: int, the time step of the current dialogue session.
    :return: a tuple consisting of the selected agent action and the action index.
    """
    # disease_symptom is not used in state_rep.
    epsilon = self.parameter.get("epsilon")
    state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                             disease_symptom=self.disease_symptom,
                                             max_turn=self.parameter["max_turn"])  # sequence representation.
    # Master agent takes an action.
    if greedy_strategy is True:
        greedy = random.random()
        if greedy < epsilon:
            action_index = random.randint(0, len(self.id2lowerAgent) - 1)
        else:
            action_index = self.dqn.predict(Xs=[state_rep])[1]
    # Evaluating mode.
    else:
        action_index = self.dqn.predict(Xs=[state_rep])[1]
    self.behave_prob = 1 - epsilon + epsilon / (len(self.id2lowerAgent) - 1)
    self.current_lower_agent_id = action_index
    # The selected lower agent takes an action.
    symptom_dist = self.disease_to_symptom_dist[self.id2disease[self.current_lower_agent_id]]
    agent_action, action_index = self.id2lowerAgent[self.current_lower_agent_id].next(
        state, turn, greedy_strategy, symptom_dist=symptom_dist)
    return agent_action, action_index
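# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): behave_prob records the
# probability of the chosen action under the epsilon-greedy behavior policy,
# which is useful for off-policy corrections. For reference, the textbook
# probabilities over N actions are 1 - epsilon + epsilon / N for the greedy
# action and epsilon / N otherwise (the code above divides by N - 1 instead).
# ---------------------------------------------------------------------------
import numpy as np

def epsilon_greedy_probs_sketch(q_values, epsilon):
    n = len(q_values)
    probs = np.full(n, epsilon / n)
    probs[int(np.argmax(q_values))] += 1.0 - epsilon
    return probs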
def get_q_values(self, state, **kwargs):
    if self.parameter.get("state_reduced"):
        state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set,
                                                         parameter=self.parameter)  # sequence representation.
    else:
        slot_num = len(self.slot_set)
        self.slot_set['disease'] = slot_num
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
    # Goal concatenation for the lower agent of HRL with goal (not the joint-training one) is disabled here.
    Q_values, max_index = self.dqn.predict(Xs=[state_rep])
    return Q_values.cpu().detach().numpy()
def next(self, state, turn, greedy_strategy, **kwargs):
    """
    Taking an action based on different methods, e.g., DQN-based AgentDQN, rule-based AgentRule.
    Detailed code is implemented in the sub-classes of this class.
    :param state: a vector, the representation of the current dialogue state.
    :param turn: int, the time step of the current dialogue session.
    :return: the agent action, the master action index and the lower action index.
    """
    # disease_symptom is not used in state_rep.
    epsilon = self.parameter.get("epsilon")
    if self.parameter.get("state_reduced"):
        state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set,
                                                         parameter=self.parameter)  # sequence representation.
    else:
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom,
                                                 max_turn=self.parameter["max_turn"])  # sequence representation.
    # Master agent takes an action.
    if self.parameter.get("initial_symptom") and state["turn"] > 0:
        pass
    else:
        if greedy_strategy is True:
            greedy = random.random()
            if greedy < epsilon:
                # randint is inclusive on both ends, so index len(self.id2lowerAgent)
                # (the disease classifier) can also be sampled here.
                self.master_action_index = random.randint(0, len(self.id2lowerAgent))
            else:
                self.master_action_index = self.master.predict(Xs=[state_rep])[1]
        # Evaluating mode.
        else:
            self.master_action_index = self.master.predict(Xs=[state_rep])[1]
        self.behave_prob = 1 - epsilon + epsilon / (len(self.id2lowerAgent) - 1)
    if self.parameter.get("prioritized_replay"):
        Ys = self.master.predict(Xs=[state_rep])[0]
        self.current_action_value = Ys.detach().cpu().numpy()[0][self.master_action_index]
    # The selected lower agent takes an action.
    # Slots that do not belong to the slot set are removed automatically in state_to_representation_last.
    if self.parameter.get("disease_as_action"):
        self.current_lower_agent_id = self.master_action_space[self.master_action_index]
        agent_action, lower_action_index = self.id2lowerAgent[str(self.current_lower_agent_id)].next(
            state, turn, greedy_strategy=greedy_strategy)
    else:
        if self.master_action_index > (len(self.id2lowerAgent) - 1):
            # The master hands over to the disease classifier instead of a lower agent.
            agent_action = {'action': 'inform', 'inform_slots': {"disease": 'UNK'}, 'request_slots': {},
                            "explicit_inform_slots": {}, "implicit_inform_slots": {}}
            agent_action["turn"] = turn
            agent_action["inform_slots"] = {"disease": None}
            agent_action["speaker"] = 'agent'
            agent_action["action_index"] = None
            lower_action_index = -1
        else:
            self.current_lower_agent_id = self.master_action_space[self.master_action_index]
            agent_action, lower_action_index = self.id2lowerAgent[str(self.current_lower_agent_id)].next(
                state, turn, greedy_strategy=greedy_strategy)
            assert len(list(agent_action["request_slots"].keys())) == 1
    return agent_action, self.master_action_index, lower_action_index
def record_training_sample(self, state, agent_action, reward, next_state, episode_over, lower_reward, master_action_index):
    # Samples of the master agent.
    shaping = self.reward_shaping(state, next_state)
    alpha = self.parameter.get("weight_for_reward_shaping")
    # Disabled alternative: set lower_reward from the repeated-action penalty
    # (reward_for_repeated_action), otherwise max(0, shaping * alpha).
    # Reward shaping only for non-terminal states.
    if episode_over is not True:
        reward = reward + alpha * shaping
    # Samples of the lower agent; agent_action == -1 means the disease classifier acted.
    if int(agent_action) >= 0:
        self.id2lowerAgent[self.current_lower_agent_id].record_training_sample(
            state, agent_action, lower_reward, next_state, episode_over)
    if self.parameter.get("state_reduced"):
        state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set,
                                                         parameter=self.parameter)  # sequence representation.
        next_state_rep = reduced_state_to_representation_last(state=next_state, slot_set=self.slot_set,
                                                              parameter=self.parameter)
        master_state_rep = reduced_state_to_representation_last(state=self.master_state, slot_set=self.slot_set,
                                                                parameter=self.parameter)
    else:
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
        next_state_rep = state_to_representation_last(state=next_state, action_set=self.action_set, slot_set=self.slot_set,
                                                      disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
        master_state_rep = state_to_representation_last(state=self.master_state, action_set=self.action_set,
                                                        slot_set=self.slot_set, disease_symptom=self.disease_symptom,
                                                        max_turn=self.parameter["max_turn"])
    if self.parameter.get("value_as_reward") is True:
        q_values = self.id2lowerAgent[self.current_lower_agent_id].get_q_values(state)
        self.master_reward = np.max(q_values, axis=1)[0]
    else:
        # Accumulate the extrinsic reward over the whole subtask.
        self.master_reward += reward
    # Push a master transition only when the subtask ends, the disease classifier acted, or the episode is over.
    if self.subtask_terminal or int(agent_action) == -1 or episode_over is True:
        # Scale moderately negative accumulated rewards.
        if -60 < self.master_reward <= 0:
            self.master_reward = self.master_reward / 4
        if self.master_action_index > (len(self.id2lowerAgent) - 1):
            subtask_turn = 1
        else:
            subtask_turn = 5 if self.subtask_turn == 0 else self.subtask_turn
        self.experience_replay_pool.append((master_state_rep, master_action_index, self.master_reward,
                                            next_state_rep, episode_over, subtask_turn))
        self.master_reward = 0
def next(self, state, turn, greedy_strategy, **kwargs):
    """
    Taking an action based on different methods, e.g., DQN-based AgentDQN, rule-based AgentRule.
    Detailed code is implemented in the sub-classes of this class.
    :param state: a vector, the representation of the current dialogue state.
    :param turn: int, the time step of the current dialogue session.
    :return: the agent action, the master action index and the lower action index.
    """
    # Represent the master state as a vector first.
    if self.parameter.get("state_reduced"):
        self.slot_set.pop("disease", None)
        state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set,
                                                         parameter=self.parameter)  # sequence representation.
    else:
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom,
                                                 max_turn=self.parameter["max_turn"])  # sequence representation.
    # The master agent takes an action only when the current subtask is terminal.
    if self.subtask_terminal is True:
        self.master_state = copy.deepcopy(state)
        self.__master_next(state_rep=state_rep, greedy_strategy=greedy_strategy)
        self.subtask_terminal = False
        self.subtask_turn = 0
    # The selected lower agent takes an action.
    # Slots that do not belong to the slot set are removed automatically in state_to_representation_last.
    if self.parameter.get("disease_as_action"):
        self.current_lower_agent_id = self.master_action_space[self.master_action_index]
        agent_action, lower_action_index = self.id2lowerAgent[str(self.current_lower_agent_id)].next(
            state, self.subtask_turn, greedy_strategy=greedy_strategy)
    else:
        if self.master_action_index > (len(self.id2lowerAgent) - 1):
            # The disease classifier is activated.
            agent_action = {'action': 'inform', 'inform_slots': {"disease": 'UNK'}, 'request_slots': {},
                            "explicit_inform_slots": {}, "implicit_inform_slots": {}}
            agent_action["turn"] = turn
            agent_action["inform_slots"] = {"disease": None}
            agent_action["speaker"] = 'agent'
            agent_action["action_index"] = None
            lower_action_index = -1
            self.subtask_terminal = True
        else:
            self.subtask_turn += 1
            self.current_lower_agent_id = self.master_action_space[self.master_action_index]
            agent_action, lower_action_index = self.id2lowerAgent[str(self.current_lower_agent_id)].next(
                state, self.subtask_turn, greedy_strategy=greedy_strategy)
            if self.subtask_turn >= self.subtask_max_turn:
                self.subtask_terminal = True
                self.subtask_turn = 0
            else:
                assert len(list(agent_action["request_slots"].keys())) == 1
    return agent_action, self.master_action_index, lower_action_index
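# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the subtask bookkeeping
# above follows an options-style loop -- the master acts only between
# subtasks, and a lower agent keeps control until subtask_max_turn is reached
# or the disease classifier is invoked. `master`, `lower_agents` and `env`
# below are hypothetical stand-ins for the real components.
# ---------------------------------------------------------------------------
def run_episode_sketch(master, lower_agents, env, subtask_max_turn=5):
    state = env.reset()
    subtask_terminal, subtask_turn, option = True, 0, None
    while not env.done:
        if subtask_terminal:
            # The master selects a goal (a lower agent) only between subtasks.
            option = master.select(state)
            subtask_terminal, subtask_turn = False, 0
        action = lower_agents[option].act(state, subtask_turn)
        subtask_turn += 1
        if subtask_turn >= subtask_max_turn:
            # Hand control back to the master.
            subtask_terminal, subtask_turn = True, 0
        state = env.step(action)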
def next(self, state, turn, greedy_strategy, **kwargs):
    """
    Taking an action based on different methods, e.g., DQN-based AgentDQN, rule-based AgentRule.
    Detailed code is implemented in the sub-classes of this class.
    :param state: a vector, the representation of the current dialogue state.
    :param turn: int, the time step of the current dialogue session.
    :return: a tuple consisting of the selected agent action and the action index.
    """
    self.agent_action["turn"] = turn
    symptom_dist = kwargs.get('symptom_dist')
    if self.parameter.get("state_reduced"):
        if self.parameter.get("agent_id").lower() in ["agenthrljoint", "agenthrljoint2"] \
                or self.parameter.get("use_all_labels") is False:
            state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set,
                                                             parameter=self.parameter)  # sequence representation.
        else:
            state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                     disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
    else:
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom, max_turn=self.parameter["max_turn"])
    # Lower agent of HRL with four lower agents.
    if self.symptom_dist_as_input is True and self.agent_id.lower() == 'agenthrl':
        state_rep = np.concatenate((state_rep, symptom_dist), axis=0)
    # HRL with goal (not the joint-training one).
    goal = kwargs.get('goal')
    if self.agent_id.lower() in ['agentwithgoal', 'agentwithgoal2']:
        state_rep = np.concatenate((state_rep, goal), axis=0)
    if greedy_strategy is True:
        greedy = random.random()
        if greedy < self.parameter.get("epsilon"):
            action_index = random.randint(0, len(self.action_space) - 1)
        else:
            action_index = self.dqn.predict(Xs=[state_rep])[1]
    # Evaluating mode.
    else:
        action_index = self.dqn.predict(Xs=[state_rep])[1]
    if self.parameter.get("prioritized_replay"):
        Ys = self.dqn.predict(Xs=[state_rep])[0]
        self.current_action_value = Ys.detach().cpu().numpy()[0][action_index]
    agent_action = copy.deepcopy(self.action_space[action_index])
    agent_action["turn"] = turn
    agent_action["speaker"] = "agent"
    agent_action["action_index"] = action_index
    return agent_action, action_index
def next(self, state, turn, greedy_strategy, **kwargs):
    """
    Taking an action based on different methods, e.g., DQN-based AgentDQN, rule-based AgentRule.
    Detailed code is implemented in the sub-classes of this class.
    :param state: a vector, the representation of the current dialogue state.
    :param turn: int, the time step of the current dialogue session.
    :return: the agent action and the action index (or indices).
    """
    # disease_symptom is not used in state_rep.
    # The master decides one step ahead of the lower agents: if it can inform the disease,
    # no lower agent needs to be activated; otherwise the decision is delegated to a lower agent.
    min_epsilon = self.parameter.get("epsilon")
    index = kwargs.get("index")
    # Linearly anneal epsilon from 0.3 down to min_epsilon over the first 1000 episodes.
    if index < 1000:
        epsilon = 0.3 - (0.3 - min_epsilon) * index / 1000
    else:
        epsilon = min_epsilon
    if self.parameter.get("state_reduced"):
        state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set)  # sequence representation.
    else:
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom,
                                                 max_turn=self.parameter["max_turn"])  # sequence representation.
    # Master agent takes an action.
    if self.parameter.get("initial_symptom") and state["turn"] > 0:
        pass
    else:
        if greedy_strategy is True:
            greedy = random.random()
            if greedy < epsilon:
                self.master_action_index = random.randint(0, len(self.id2lowerAgent) - 1)
            else:
                self.master_action_index = self.master.predict(Xs=[state_rep])[1]
        # Evaluating mode.
        else:
            self.master_action_index = self.master.predict(Xs=[state_rep])[1]
        self.behave_prob = 1 - epsilon + epsilon / (len(self.id2lowerAgent) - 1)
    if self.master_action_index > (len(self.id2lowerAgent) - 1):
        # The master informs the disease directly instead of activating a lower agent.
        self.action["turn"] = turn
        self.action["inform_slots"] = {"disease": self.id2disease[self.master_action_index - len(self.id2lowerAgent)]}
        self.action["speaker"] = 'agent'
        self.action["action_index"] = None
        return self.action, self.master_action_index
    self.current_lower_agent_id = self.master_action_space[self.master_action_index]
    if self.parameter.get("prioritized_replay"):
        Ys = self.master.predict(Xs=[state_rep])[0]
        self.current_action_value = Ys.detach().cpu().numpy()[0][self.master_action_index]
    # The selected lower agent takes an action.
    # Slots that do not belong to the slot set are removed automatically in state_to_representation_last.
    agent_action, lower_action_index = self.id2lowerAgent[str(self.current_lower_agent_id)].next(
        state, turn, greedy_strategy=False)
    return agent_action, self.master_action_index, lower_action_index
def record_training_sample(self, state, agent_action, reward, next_state, episode_over):
    """
    The samples of both the lower agent and the master agent are saved here directly,
    without calling another function.
    """
    # Samples of the internal critic.
    self.states_of_one_session.append(state)
    if episode_over is True:
        if reward == self.parameter.get('reward_for_success'):
            # The current session is successful.
            for one_state in self.states_of_one_session:
                # Positive samples.
                self.internal_critic.record_training_positive_sample(one_state, self.master_action_index)
                # Negative samples.
                for index in range(self.output_size):
                    if index != self.master_action_index:
                        self.internal_critic.record_training_negative_sample(one_state, index)
        elif reward == self.parameter.get('reward_for_fail') and state['turn'] <= self.parameter.get('max_turn') - 2:
            # The current session failed.
            for one_state in self.states_of_one_session:
                self.internal_critic.record_training_negative_sample(one_state, self.master_action_index)
    # Reward shaping for the master (currently disabled: shaping is fixed to 0).
    alpha = self.parameter.get("weight_for_reward_shaping")
    shaping = 0
    # Reward shaping only for non-terminal states.
    if episode_over is not True:
        reward = reward + alpha * shaping
    # State to vector.
    if self.parameter.get("state_reduced"):
        state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set)  # sequence representation.
        next_state_rep = reduced_state_to_representation_last(state=next_state, slot_set=self.slot_set)
        master_state_rep = reduced_state_to_representation_last(state=self.master_state, slot_set=self.slot_set)
    else:
        state_rep = state_to_representation_last(state=state, action_set=self.action_set, slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom, max_turn=self.parameter['max_turn'])
        next_state_rep = state_to_representation_last(state=next_state, action_set=self.action_set, slot_set=self.slot_set,
                                                      disease_symptom=self.disease_symptom, max_turn=self.parameter['max_turn'])
        master_state_rep = state_to_representation_last(state=self.master_state, action_set=self.action_set,
                                                        slot_set=self.slot_set, disease_symptom=self.disease_symptom,
                                                        max_turn=self.parameter['max_turn'])
    # Samples of the master agent.
    sub_task_terminal, intrinsic_reward, _ = self.intrinsic_critic(next_state, self.master_action_index,
                                                                   disease_tag=self.disease_tag)
    self.master_reward += reward
    if self.sub_task_terminal is True and sub_task_terminal is True:
        last_master_action_rep = np.zeros(self.output_size)
        current_master_action_rep = np.zeros(self.output_size)
        # Add all actions the master has already selected to the state representation.
        for last_master_action_index in self.master_previous_actions:
            if last_master_action_index is not None:
                last_master_action_rep[last_master_action_index] = 1
                current_master_action_rep[last_master_action_index] = 1
        if self.master_action_index is not None:
            current_master_action_rep[self.master_action_index] = 1
        master_state_rep = np.concatenate((master_state_rep, last_master_action_rep), axis=0)
        next_master_state_rep = np.concatenate((next_state_rep, current_master_action_rep), axis=0)
        # Disabled alternative: penalize a repeated master action with -max_turn / 2 instead of master_reward.
        self.experience_replay_pool.append((master_state_rep, self.master_action_index, self.master_reward,
                                            next_master_state_rep, episode_over))
    # Samples of the lower agent.
    if agent_action is not None:
        # The session is not over; otherwise agent_action is not one of the lower agent's actions.
        goal = np.zeros(len(self.disease_symptom))
        goal[self.master_action_index] = 1
        state_rep = np.concatenate((state_rep, goal), axis=0)
        next_state_rep = np.concatenate((next_state_rep, goal), axis=0)
        # Reward shaping for the lower agent on the intrinsic reward.
        shaping = self.reward_shaping(state, next_state)
        intrinsic_reward += alpha * shaping
        # Disabled alternative: penalize a repeated worker action with -0.5 instead of intrinsic_reward.
        self.lower_agent.experience_replay_pool.append((state_rep, agent_action, intrinsic_reward,
                                                        next_state_rep, sub_task_terminal, self.master_action_index))
        # Visitation count.
        self.lower_agent.action_visitation_count.setdefault(agent_action, 0)
        self.lower_agent.action_visitation_count[agent_action] += 1
        # When the pool reaches its fixed capacity, also remove the count of the transition about to be dropped.
        self.visitation_count[self.master_action_index, agent_action] += 1
        if len(self.lower_agent.experience_replay_pool) == self.lower_agent.experience_replay_pool.maxlen:
            _, pre_agent_action, _, _, _, pre_master_action = self.lower_agent.experience_replay_pool.popleft()
            self.visitation_count[pre_master_action, pre_agent_action] -= 1
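# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the eviction logic
# above keeps visitation_count consistent with the bounded replay deque. One
# way to maintain that invariant is to evict (and decrement) before appending,
# so a full deque never silently drops an uncounted transition. Shapes and
# capacities below are illustrative.
# ---------------------------------------------------------------------------
from collections import deque
import numpy as np

def append_with_count_sketch(pool, counts, transition):
    # transition = (state, agent_action, reward, next_state, done, master_action)
    if len(pool) == pool.maxlen:
        _, old_agent_action, _, _, _, old_master_action = pool.popleft()
        counts[old_master_action, old_agent_action] -= 1
    pool.append(transition)
    counts[transition[5], transition[1]] += 1

# Usage: a pool bounded at 10000 transitions and a 4 x 50 count matrix.
pool = deque(maxlen=10000)
counts = np.zeros((4, 50), dtype=int)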