Example #1
    def record_training_sample(self, state, agent_action, reward, next_state, episode_over):
        # samples of master agent.
        shaping = self.reward_shaping(state, next_state)
        alpha = self.parameter.get("weight_for_reward_shaping")

        # Reward shaping only when non-terminal state.
        if episode_over is not True:
            reward = reward + alpha * shaping
        if self.parameter.get("state_reduced"):
            state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set) # sequence representation.
            next_state_rep = reduced_state_to_representation_last(state=next_state, slot_set=self.slot_set)
        else:
            state_rep = state_to_representation_last(state=state,
                                                 action_set=self.action_set,
                                                 slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom,
                                                 max_turn=self.parameter["max_turn"])
            next_state_rep = state_to_representation_last(state=next_state,
                                                      action_set=self.action_set,
                                                      slot_set=self.slot_set,
                                                      disease_symptom=self.disease_symptom,
                                                      max_turn=self.parameter["max_turn"])
        #print("state", [idx for idx,x in enumerate(state_rep) if x==1], agent_action)
        ##print("nexts", [idx for idx,x in enumerate(next_state_rep) if x==1], reward)
        #print(self.master_action_index, reward)
        if self.parameter.get("value_as_reward") is True:
            q_values = self.id2lowerAgent[self.current_lower_agent_id].get_q_values(state)
            # q_values has shape (1, n_actions); use the best lower-agent value as the master reward.
            master_reward = np.max(q_values, axis=1)[0]
        else:
            master_reward = reward
        self.experience_replay_pool.append((state_rep, self.master_action_index, master_reward, next_state_rep, episode_over))
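The tuples appended to experience_replay_pool above are later sampled for DQN updates. A minimal, self-contained sketch of that consumption step, assuming a plain uniform-replay deque and a hypothetical q_target_fn for the target network's batched forward pass (neither is defined in these snippets):

import random
from collections import deque

import numpy as np

# Hypothetical pool with the same tuple layout used above:
# (state_rep, action_index, reward, next_state_rep, episode_over)
replay_pool = deque(maxlen=10000)

def sample_dqn_batch(pool, q_target_fn, batch_size=32, gamma=0.95):
    """Uniformly sample transitions and build one-step DQN regression targets.

    q_target_fn is assumed to map a (batch, state_dim) array to a
    (batch, n_actions) array of target-network Q-values.
    """
    batch = random.sample(list(pool), min(batch_size, len(pool)))
    states = np.array([t[0] for t in batch])
    actions = np.array([t[1] for t in batch])
    rewards = np.array([t[2] for t in batch], dtype=float)
    next_states = np.array([t[3] for t in batch])
    dones = np.array([t[4] for t in batch], dtype=float)
    # r + gamma * max_a' Q_target(s', a'); no bootstrap on terminal transitions.
    targets = rewards + gamma * (1.0 - dones) * q_target_fn(next_states).max(axis=1)
    return states, actions, targets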
Example #2
 def record_training_sample(self, state, agent_action, reward, next_state,
                            episode_over, **kwargs):
     symptom_dist_as_input = self.parameter.get("symptom_dist_as_input")
     agent_id = self.parameter.get("agent_id")
     if self.parameter.get("state_reduced"):
         state = reduced_state_to_representation_last(
             state=state,
             slot_set=self.slot_set)  # sequence representation.
         next_state = reduced_state_to_representation_last(
             state=next_state, slot_set=self.slot_set)
     else:
         state = state_to_representation_last(
             state=state,
             action_set=self.action_set,
             slot_set=self.slot_set,
             disease_symptom=self.disease_symptom,
             max_turn=self.parameter["max_turn"])
         next_state = state_to_representation_last(
             state=next_state,
             action_set=self.action_set,
             slot_set=self.slot_set,
             disease_symptom=self.disease_symptom,
             max_turn=self.parameter["max_turn"])
     if symptom_dist_as_input is True and agent_id.lower() == 'agenthrl':
         symptom_dist = kwargs.get('symptom_dist')
         state = np.concatenate((state, symptom_dist), axis=0)
         next_state = np.concatenate((next_state, symptom_dist), axis=0)
     self.experience_replay_pool.append(
         (state, agent_action, reward, next_state, episode_over))
Example #3
    def record_prioritized_training_sample(self, state, agent_action, reward,
                                           next_state, episode_over, TD_error,
                                           **kwargs):
        shaping = self.reward_shaping(state, next_state)
        alpha = self.parameter.get("weight_for_reward_shaping")
        # if True:
        # print('shaping', shaping)
        # Reward shaping only when non-terminal state.
        if episode_over is True:
            pass
        else:
            reward = reward + alpha * shaping

        if self.parameter.get("state_reduced"):
            state_rep = reduced_state_to_representation_last(
                state=state,
                slot_set=self.slot_set)  # sequence representation.
            next_state_rep = reduced_state_to_representation_last(
                state=next_state, slot_set=self.slot_set)
        else:
            state_rep = state_to_representation_last(
                state=state,
                action_set=self.action_set,
                slot_set=self.slot_set,
                disease_symptom=self.disease_symptom,
                max_turn=self.parameter["max_turn"])
            next_state_rep = state_to_representation_last(
                state=next_state,
                action_set=self.action_set,
                slot_set=self.slot_set,
                disease_symptom=self.disease_symptom,
                max_turn=self.parameter["max_turn"])
        self.experience_replay_pool.add(state_rep, self.master_action_index,
                                        reward, next_state_rep, episode_over,
                                        TD_error)
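reward_shaping(state, next_state) is called throughout these snippets but its body is not shown. Purely for orientation, a common potential-based form looks like the sketch below; the choice of potential (counting filled inform slots) is an assumption, not the repository's actual implementation.

def potential_based_shaping(state, next_state, gamma=1.0):
    """Illustrative shaping term: gamma * phi(s') - phi(s).

    phi(s) is assumed here to be the number of slots already present in
    state["current_slots"]["inform_slots"]; this is a stand-in sketch only.
    """
    def phi(s):
        return len(s["current_slots"]["inform_slots"])

    return gamma * phi(next_state) - phi(state)

With alpha = parameter["weight_for_reward_shaping"], the non-terminal branch above then corresponds to reward + alpha * potential_based_shaping(state, next_state).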
    def next2(self, state, turn, greedy_strategy, **kwargs):
        """
        Taking action when the action space is changing and select the action which is not inform disease.
        :param state: a vector, the representation of current dialogue state.
        :param turn: int, the time step of current dialogue session.
        :return: the agent action, a tuple consists of the selected agent action and action index.
        """
        self.agent_action["turn"] = turn
        symptom_dist = kwargs.get('symptom_dist')
        if self.parameter.get("state_reduced") and self.parameter.get(
                "use_all_labels") == False:
            state_rep = reduced_state_to_representation_last(
                state=state, slot_set=self.slot_set,
                parameter=self.parameter)  # sequence representation.
        else:
            state_rep = state_to_representation_last(
                state=state,
                action_set=self.action_set,
                slot_set=self.slot_set,
                disease_symptom=self.disease_symptom,
                max_turn=self.parameter["max_turn"])

        # Lower agent of HRL with four lower agents.
        if self.symptom_dist_as_input is True and self.agent_id.lower(
        ) == 'agenthrl':
            state_rep = np.concatenate((state_rep, symptom_dist), axis=0)

        # HRL with goal (not joint training one.)
        if "disease" in self.slot_set.keys():
            slot_num = len(self.slot_set) - 1
        else:
            slot_num = len(self.slot_set)

        goal = kwargs.get('goal')
        if self.agent_id.lower() in ['agentwithgoal', 'agentwithgoal2']:
            state_rep = np.concatenate((state_rep, goal), axis=0)

        if greedy_strategy is True:
            greedy = random.random()
            if greedy < self.parameter.get("epsilon"):
                action_index = random.randint(0, len(self.action_space) - 1)
            else:
                action_index = self.dqn.predict_slot(Xs=[state_rep],
                                                     slot_num=slot_num)[1]
        # Evaluating mode.
        else:
            action_index = self.dqn.predict_slot(Xs=[state_rep],
                                                 slot_num=slot_num)[1]
        if self.parameter.get("prioritized_replay"):
            Ys = self.dqn.predict(Xs=[state_rep])[0]
            self.current_action_value = Ys.detach().cpu().numpy(
            )[0][action_index]

        # The last 10 actions in the current action space are inform-disease actions; the preceding ones are request-slot actions.
        agent_action = self.action_space[action_index]
        agent_action["turn"] = turn
        agent_action["speaker"] = "agent"
        agent_action["action_index"] = action_index
        assert len(list(agent_action["request_slots"].keys())) == 1
        return agent_action, action_index
 def __master_next__(self, state, last_master_action, greedy_strategy):
     # disease_symptom is not used in state_rep.
     epsilon = self.parameter.get("epsilon")
     if self.parameter.get("state_reduced"):
         state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set) # sequence representation.
         #next_state_rep = reduced_state_to_representation_last(state=next_state, slot_set=self.slot_set)
     else:
         state_rep = state_to_representation_last(state=state,
                                              action_set=self.action_set,
                                              slot_set=self.slot_set,
                                              disease_symptom=self.disease_symptom,
                                              max_turn=self.parameter["max_turn"])  # sequence representation.
     last_action_rep = np.zeros(self.output_size)
     if last_master_action is not None:
         last_action_rep[last_master_action] = 1
     state_rep = np.concatenate((state_rep, last_action_rep), axis=0)
     # Master agent takes an action, i.e., selects a goal.
     if greedy_strategy is True:
         greedy = random.random()
         if greedy < epsilon:
             master_action_index = random.randint(0, self.output_size - 1)
         else:
             master_action_index = self.dqn.predict(Xs=[state_rep])[1]
     # Evaluating mode.
     else:
         master_action_index = self.dqn.predict(Xs=[state_rep])[1]
     return master_action_index
 def record_training_sample(self, state, agent_action, reward, next_state,
                            episode_over, **kwargs):
     shaping = self.reward_shaping(state, next_state)
     if self.parameter.get("agent_id").lower() in [
             "agenthrljoint", "agenthrljoint2"
     ]:
         alpha = 0.0
     else:
         alpha = self.parameter.get("weight_for_reward_shaping")
     # if True:
     # print('shaping', shaping)
     # Reward shaping only when non-terminal state.
     if episode_over is True:
         pass
     else:
         reward = reward + alpha * shaping
     if self.parameter.get("state_reduced"):
         state_rep = reduced_state_to_representation_last(
             state=state, slot_set=self.slot_set,
             parameter=self.parameter)  # sequence representation.
         next_state_rep = reduced_state_to_representation_last(
             state=next_state,
             slot_set=self.slot_set,
             parameter=self.parameter)
     else:
         state_rep = state_to_representation_last(
             state=state,
             action_set=self.action_set,
             slot_set=self.slot_set,
             disease_symptom=self.disease_symptom,
             max_turn=self.parameter["max_turn"])
         next_state_rep = state_to_representation_last(
             state=next_state,
             action_set=self.action_set,
             slot_set=self.slot_set,
             disease_symptom=self.disease_symptom,
             max_turn=self.parameter["max_turn"])
     self.experience_replay_pool.append(
         (state_rep, agent_action, reward, next_state_rep, episode_over))
     self.action_visitation_count.setdefault(agent_action, 0)
     self.action_visitation_count[agent_action] += 1
Example #7
 def next_state_values_DDQN(self, next_state):
     if self.parameter.get("state_reduced"):
         state_rep = reduced_state_to_representation_last(state=next_state,
                                              slot_set=self.slot_set) # sequence representation.
     else:
         state_rep = state_to_representation_last(state=next_state,
                                              action_set=self.action_set,
                                              slot_set=self.slot_set,
                                              disease_symptom=self.disease_symptom,
                                              max_turn=self.parameter["max_turn"])
     action_index = self.master.predict(Xs=[state_rep])[1]
     Ys = self.master.predict_target(Xs=[state_rep])
     next_action_value = Ys.detach().cpu().numpy()[0][action_index]
     return next_action_value
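next_state_values_DDQN uses the online network (predict) to pick the action and the target network (predict_target) to evaluate it, which is the Double DQN decoupling. A minimal sketch of how the returned value would typically enter the one-step target; gamma and the surrounding names are assumptions, not code from the repository.

def ddqn_target(reward, episode_over, next_action_value, gamma=0.95):
    # No bootstrapping on terminal transitions.
    if episode_over:
        return reward
    return reward + gamma * next_action_value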
 def get_q_values(self, state, **kwargs):
     if self.parameter.get("state_reduced"):
         #slot_num = len(self.slot_set)
         #self.slot_set['disease'] = slot_num
         state_rep = reduced_state_to_representation_last(
             state=state, slot_set=self.slot_set,
             parameter=self.parameter)  # sequence representation.
     else:
         slot_num = len(self.slot_set)
         self.slot_set['disease'] = slot_num
         state_rep = state_to_representation_last(
             state=state,
             action_set=self.action_set,
             slot_set=self.slot_set,
             disease_symptom=self.disease_symptom,
             max_turn=self.parameter["max_turn"])
     # Lower agent of HRL with goal (not the one with joint training).
     #goal = kwargs.get('goal')
     #if self.agent_id.lower() == 'agentwithgoal' or self.agent_id.lower=='agentwithgoal2':
     #    state_rep = np.concatenate((state_rep, goal), axis=0)
     #print(len(state_rep))
     Q_values, max_index = self.dqn.predict(Xs=[state_rep])
     return Q_values.cpu().detach().numpy()
    def next(self, state, turn, greedy_strategy, **kwargs):
        """
        Taking action based on different methods, e.g., DQN-based AgentDQN, rule-based AgentRule.
        Detail codes will be implemented in different sub-class of this class.
        :param state: a vector, the representation of current dialogue state.
        :param turn: int, the time step of current dialogue session.
        :return: the agent action, a tuple consists of the selected agent action and action index.
        """
        # disease_symptom is not used in state_rep.
        epsilon = self.parameter.get("epsilon")
        #print(state["turn"])
        if self.parameter.get("state_reduced"):
            state_rep = reduced_state_to_representation_last(
                state=state, slot_set=self.slot_set,
                parameter=self.parameter)  # sequence representation.
        else:
            state_rep = state_to_representation_last(
                state=state,
                action_set=self.action_set,
                slot_set=self.slot_set,
                disease_symptom=self.disease_symptom,
                max_turn=self.parameter["max_turn"]
            )  # sequence representation.
        #print(len(state_rep))
        # Master agent takes an action.
        if self.parameter.get("initial_symptom") and state["turn"] > 0:
            pass
        else:
            #print("####")
            if greedy_strategy == True:
                greedy = random.random()
                if greedy < epsilon:
                    self.master_action_index = random.randint(
                        0, len(self.id2lowerAgent))
                    #print(self.master_action_index)
                    #master_action_index = random.sample(list(self.id2lowerAgent.keys()),1)[0]
                else:
                    self.master_action_index = self.master.predict(
                        Xs=[state_rep])[1]
            # Evaluating mode.
            else:
                self.master_action_index = self.master.predict(
                    Xs=[state_rep])[1]
            self.behave_prob = 1 - epsilon + epsilon / (
                len(self.id2lowerAgent) - 1)
            #print(master_action_index)

            if self.parameter.get("prioritized_replay"):
                # print('2')
                Ys = self.master.predict(Xs=[state_rep])[0]
                self.current_action_value = Ys.detach().cpu().numpy()[0][
                    self.master_action_index]
        # print(self.master_action_index)  # index 9 can still show up here

        # Lower agent takes an action.
        #symptom_dist = self.disease_to_symptom_dist[self.id2disease[self.current_lower_agent_id]]
        # In the state_to_representation_last step, slots that are not in the slot set are removed automatically.
        if self.parameter.get("disease_as_action"):
            self.current_lower_agent_id = self.master_action_space[
                self.master_action_index]
            agent_action, lower_action_index = self.id2lowerAgent[str(
                self.current_lower_agent_id)].next(
                    state, turn, greedy_strategy=greedy_strategy)

        else:
            if self.master_action_index > (len(self.id2lowerAgent) - 1):
                agent_action = {
                    'action': 'inform',
                    'inform_slots': {
                        "disease": 'UNK'
                    },
                    'request_slots': {},
                    "explicit_inform_slots": {},
                    "implicit_inform_slots": {}
                }
                agent_action["turn"] = turn
                agent_action["inform_slots"] = {"disease": None}
                agent_action["speaker"] = 'agent'
                agent_action["action_index"] = None
                lower_action_index = -1
            else:
                self.current_lower_agent_id = self.master_action_space[
                    self.master_action_index]
                #print(self.current_lower_agent_id)
                agent_action, lower_action_index = self.id2lowerAgent[str(
                    self.current_lower_agent_id)].next(
                        state, turn, greedy_strategy=greedy_strategy)
                assert len(list(agent_action["request_slots"].keys())) == 1
            #print(self.current_lower_agent_id, lower_action_index)
            #print(agent_action)
        return agent_action, self.master_action_index, lower_action_index
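A hypothetical driver showing how this hierarchical next() is typically consumed; agent and state are assumed to come from the surrounding dialogue system and are not defined in these snippets.

def agent_turn(agent, state, turn, training=True):
    # master_idx selects a lower agent (or the disease classifier);
    # lower_idx is -1 when the master informs the disease directly.
    agent_action, master_idx, lower_idx = agent.next(
        state, turn, greedy_strategy=training)
    return agent_action, master_idx, lower_idx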
Example #10
    def record_training_sample(self, state, agent_action, reward, next_state,
                               episode_over, lower_reward,
                               master_action_index):
        # samples of master agent.
        # print(state)
        #print(reward)

        shaping = self.reward_shaping(state, next_state)
        alpha = self.parameter.get("weight_for_reward_shaping")
        '''
        if reward == self.parameter.get("reward_for_repeated_action"):
            lower_reward = reward
            # reward = reward * 2
        else:
            lower_reward = max(0, shaping * alpha)
            # lower_reward = shaping * alpha
        '''
        # Reward shaping only when non-terminal state.
        if episode_over is not True:
            reward = reward + alpha * shaping

        # samples of lower agent.
        #print('#', lower_reward)
        if int(agent_action) >= 0:
            #q_value = self.pretrained_lowerAgent[self.master_action_space[self.master_action_index]].get_q_values(state)
            #print(q_value)
            #print('# ', lower_reward)
            self.id2lowerAgent[
                self.current_lower_agent_id].record_training_sample(
                    state, agent_action, lower_reward, next_state,
                    episode_over)

        if self.parameter.get("state_reduced"):
            state_rep = reduced_state_to_representation_last(
                state=state, slot_set=self.slot_set,
                parameter=self.parameter)  # sequence representation.
            next_state_rep = reduced_state_to_representation_last(
                state=next_state,
                slot_set=self.slot_set,
                parameter=self.parameter)
            master_state_rep = reduced_state_to_representation_last(
                state=self.master_state,
                slot_set=self.slot_set,
                parameter=self.parameter)
        else:
            state_rep = state_to_representation_last(
                state=state,
                action_set=self.action_set,
                slot_set=self.slot_set,
                disease_symptom=self.disease_symptom,
                max_turn=self.parameter["max_turn"])
            next_state_rep = state_to_representation_last(
                state=next_state,
                action_set=self.action_set,
                slot_set=self.slot_set,
                disease_symptom=self.disease_symptom,
                max_turn=self.parameter["max_turn"])
            master_state_rep = state_to_representation_last(
                state=self.master_state,
                action_set=self.action_set,
                slot_set=self.slot_set,
                disease_symptom=self.disease_symptom,
                max_turn=self.parameter["max_turn"])
        # print("state", [idx for idx,x in enumerate(state_rep) if x==1], agent_action)
        # print("nexts", [idx for idx,x in enumerate(next_state_rep) if x==1], reward)
        if self.parameter.get("value_as_reward") is True:
            q_values = self.id2lowerAgent[
                self.current_lower_agent_id].get_q_values(state)
            # q_values has shape (1, n_actions); use the best lower-agent value as the master reward.
            self.master_reward = np.max(q_values, axis=1)[0]
        else:
            self.master_reward += reward

        if self.subtask_terminal or int(
                agent_action) == -1 or episode_over == True:
            #if self.master_reward > -40:
            #    self.master_reward = max(-1, self.master_reward)
            if self.master_reward > -60 and self.master_reward <= 0:
                self.master_reward = self.master_reward / 4
            #print(self.master_state["current_slots"]["inform_slots"])
            #print(next_state["current_slots"]["inform_slots"])
            #print("***", self.master_reward)

            #print(state['turn'], next_state['turn'])
            if self.master_action_index > (len(self.id2lowerAgent) - 1):
                subtask_turn = 1
            else:
                if self.subtask_turn == 0:
                    subtask_turn = 5
                else:
                    subtask_turn = self.subtask_turn
            #print(subtask_turn)
            self.experience_replay_pool.append(
                (master_state_rep, master_action_index, self.master_reward,
                 next_state_rep, episode_over, subtask_turn))
            self.master_reward = 0
Example #11
    def next(self, state, turn, greedy_strategy, **kwargs):
        """
        Taking action based on different methods, e.g., DQN-based AgentDQN, rule-based AgentRule.
        Detail codes will be implemented in different sub-class of this class.
        :param state: a vector, the representation of current dialogue state.
        :param turn: int, the time step of current dialogue session.
        :return: the agent action, a tuple consists of the selected agent action and action index.
        """
        # Represent the master state as a vector first.

        # print(state["turn"])
        if self.parameter.get("state_reduced"):
            # Drop the 'disease' slot if it is present before building the representation.
            self.slot_set.pop("disease", None)
            state_rep = reduced_state_to_representation_last(
                state=state, slot_set=self.slot_set,
                parameter=self.parameter)  # sequence representation.
        else:
            state_rep = state_to_representation_last(
                state=state,
                action_set=self.action_set,
                slot_set=self.slot_set,
                disease_symptom=self.disease_symptom,
                max_turn=self.parameter["max_turn"]
            )  # sequence representation.
        # print(len(state_rep))
        # Only when the subtask is terminal, master agent takes an action.
        if self.subtask_terminal == True:
            self.master_state = copy.deepcopy(state)
            #print(len(state_rep))
            self.__master_next(state_rep=state_rep,
                               greedy_strategy=greedy_strategy)
            self.subtask_terminal = False
            self.subtask_turn = 0

        # The selected lower agent takes an action.
        # symptom_dist = self.disease_to_symptom_dist[self.id2disease[self.current_lower_agent_id]]
        # In the state_to_representation_last step, slots that are not in the slot set are removed automatically.

        if self.parameter.get("disease_as_action"):
            self.current_lower_agent_id = self.master_action_space[
                self.master_action_index]
            agent_action, lower_action_index = self.id2lowerAgent[str(
                self.current_lower_agent_id)].next(
                    state, self.subtask_turn, greedy_strategy=greedy_strategy)
        else:
            if self.master_action_index > (
                    len(self.id2lowerAgent) -
                    1):  # The disease classifier is activated.
                agent_action = {
                    'action': 'inform',
                    'inform_slots': {
                        "disease": 'UNK'
                    },
                    'request_slots': {},
                    "explicit_inform_slots": {},
                    "implicit_inform_slots": {}
                }
                agent_action["turn"] = turn
                agent_action["inform_slots"] = {"disease": None}
                agent_action["speaker"] = 'agent'
                agent_action["action_index"] = None
                lower_action_index = -1
                self.subtask_terminal = True
                #print("********************************************************************")
            else:

                #print("**",self.master_action_index)
                self.subtask_turn += 1
                self.current_lower_agent_id = self.master_action_space[
                    self.master_action_index]
                # print(self.current_lower_agent_id)
                agent_action, lower_action_index = self.id2lowerAgent[str(
                    self.current_lower_agent_id)].next(
                        state,
                        self.subtask_turn,
                        greedy_strategy=greedy_strategy)
                #if agent_action['action'] == "return" or self.subtask_turn >= self.subtask_max_turn:
                if self.subtask_turn >= self.subtask_max_turn:
                    self.subtask_terminal = True
                    self.subtask_turn = 0
                    #if agent_action['action'] == 'return':
                    #    print(agent_action['action'])
                else:
                    assert len(list(agent_action["request_slots"].keys())) == 1
            # print(self.current_lower_agent_id, lower_action_index)
            #print(self.subtask_turn, lower_action_index, self.master_action_index)
        return agent_action, self.master_action_index, lower_action_index
    def next(self, state, turn, greedy_strategy, **kwargs):
        """
        Taking action based on different methods, e.g., DQN-based AgentDQN, rule-based AgentRule.
        Detail codes will be implemented in different sub-class of this class.
        :param state: a vector, the representation of current dialogue state.
        :param turn: int, the time step of current dialogue session.
        :return: the agent action, a tuple consists of the selected agent action and action index.
        """
        self.agent_action["turn"] = turn
        #state['turn'] = turn
        #print(state['turn'])
        #print(self.slot_set)
        symptom_dist = kwargs.get('symptom_dist')
        if self.parameter.get("state_reduced"):
            if self.parameter.get("agent_id").lower() in [
                    "agenthrljoint", "agenthrljoint2"
            ] or self.parameter.get("use_all_labels") == False:
                state_rep = reduced_state_to_representation_last(
                    state=state,
                    slot_set=self.slot_set,
                    parameter=self.parameter)  # sequence representation.
            else:
                state_rep = state_to_representation_last(
                    state=state,
                    action_set=self.action_set,
                    slot_set=self.slot_set,
                    disease_symptom=self.disease_symptom,
                    max_turn=self.parameter["max_turn"])
        else:
            state_rep = state_to_representation_last(
                state=state,
                action_set=self.action_set,
                slot_set=self.slot_set,
                disease_symptom=self.disease_symptom,
                max_turn=self.parameter["max_turn"])

        # Lower agent of HRL with four lower agents.
        if self.symptom_dist_as_input is True and self.agent_id.lower(
        ) == 'agenthrl':
            state_rep = np.concatenate((state_rep, symptom_dist), axis=0)
        #print('1')
        # HRL with goal (not joint training one.)
        goal = kwargs.get('goal')
        if self.agent_id.lower() in ['agentwithgoal', 'agentwithgoal2']:
            state_rep = np.concatenate((state_rep, goal), axis=0)

        if greedy_strategy is True:
            greedy = random.random()
            if greedy < self.parameter.get("epsilon"):
                action_index = random.randint(0, len(self.action_space) - 1)
            else:
                action_index = self.dqn.predict(Xs=[state_rep])[1]
        # Evaluating mode.
        else:
            action_index = self.dqn.predict(Xs=[state_rep])[1]
        #print(self.parameter.get("prioritized_replay"))
        if self.parameter.get("prioritized_replay"):
            #print('2')
            Ys = self.dqn.predict(Xs=[state_rep])[0]
            self.current_action_value = Ys.detach().cpu().numpy(
            )[0][action_index]

        agent_action = copy.deepcopy(self.action_space[action_index])
        agent_action["turn"] = turn
        agent_action["speaker"] = "agent"
        agent_action["action_index"] = action_index
        return agent_action, action_index
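When prioritized_replay is enabled, this method caches the Q-value of the chosen action in self.current_action_value. The exact TD-error computation is not shown in these snippets; a minimal sketch under assumed names (gamma, next_value, e.g. from next_state_values_DDQN) would be:

def td_error(reward, episode_over, current_action_value, next_value, gamma=0.95):
    # |target - Q(s, a)| is the usual priority for prioritized replay.
    target = reward if episode_over else reward + gamma * next_value
    return abs(target - current_action_value)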
Example #13
    def next(self, state, turn, greedy_strategy, **kwargs):
        """
        Taking action based on different methods, e.g., DQN-based AgentDQN, rule-based AgentRule.
        Detail codes will be implemented in different sub-class of this class.
        :param state: a vector, the representation of current dialogue state.
        :param turn: int, the time step of current dialogue session.
        :return: the agent action, a tuple consists of the selected agent action and action index.
        """
        # disease_symptom are not used in state_rep.
        #在这里master先于lower agent一步进行判断,如果master可以inform疾病就不需要激活lower agent,反之再交由lower agent决策
        min_epsilon = self.parameter.get("epsilon")
        index = kwargs.get("index")
        if index < 1000:
            epsilon = 0.3 - (0.3 - min_epsilon) * index / 1000
        else:
            epsilon = min_epsilon
        #print(index, epsilon)
        #print(state["turn"])
        if self.parameter.get("state_reduced"):
            state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set) # sequence representation.
        else:
            state_rep = state_to_representation_last(state=state,
                                                 action_set=self.action_set,
                                                 slot_set=self.slot_set,
                                                 disease_symptom=self.disease_symptom,
                                                 max_turn=self.parameter["max_turn"]) # sequence representation.
        #print(len(state_rep))
        # Master agent takes an action.
        if self.parameter.get("initial_symptom") and state["turn"]>0:
            pass
        else:
            #print("####")
            if greedy_strategy == True:
                greedy = random.random()
                if greedy < epsilon:
                    self.master_action_index = random.randint(0, len(self.id2lowerAgent) - 1)
                    #master_action_index = random.sample(list(self.id2lowerAgent.keys()),1)[0]
                else:
                    self.master_action_index = self.master.predict(Xs=[state_rep])[1]
            # Evaluating mode.
            else:
                self.master_action_index = self.master.predict(Xs=[state_rep])[1]
            self.behave_prob = 1 - epsilon + epsilon / (len(self.id2lowerAgent) - 1)

            if self.master_action_index > (len(self.id2lowerAgent) - 1):
                self.action["turn"] = turn
                self.action["inform_slots"] = {"disease": self.id2disease[self.master_action_index - len(self.id2lowerAgent)]}
                self.action["speaker"] = 'agent'
                self.action["action_index"] = None
                return self.action, self.master_action_index
            #print(master_action_index)
            self.current_lower_agent_id = self.master_action_space[self.master_action_index]
            if self.parameter.get("prioritized_replay"):
                # print('2')
                Ys = self.master.predict(Xs=[state_rep])[0]
                self.current_action_value = Ys.detach().cpu().numpy()[0][self.master_action_index]

        # Lower agent takes an action.
        #symptom_dist = self.disease_to_symptom_dist[self.id2disease[self.current_lower_agent_id]]
        # In the state_to_representation_last step, slots that are not in the slot set are removed automatically.
        agent_action, lower_action_index = self.id2lowerAgent[str(self.current_lower_agent_id)].next(state, turn, greedy_strategy=False)
        #assert len(agent_action["request_slots"])>0 and len(agent_action["inform_slots"])==0
        return agent_action, self.master_action_index, lower_action_index
    def record_training_sample(self, state, agent_action, reward, next_state, episode_over):
        """
        Samples of both the lower agent and the master agent are stored directly here, without calling another function.
        """
        # samples of internal critic.
        self.states_of_one_session.append(state)
        if episode_over is True:
            # current session is successful.
            if reward == self.parameter.get('reward_for_success'):
                for one_state in self.states_of_one_session:
                    # positive samples.
                    self.internal_critic.record_training_positive_sample(one_state, self.master_action_index)
                    # negative samples.
                    for index in range(self.output_size):
                        if index != self.master_action_index:
                            self.internal_critic.record_training_negative_sample(one_state, index)
            # current session is failed.
            elif reward == self.parameter.get('reward_for_fail') and state['turn'] <= self.parameter.get('max_turn') - 2:
                for one_state in self.states_of_one_session:
                    self.internal_critic.record_training_negative_sample(one_state, self.master_action_index)

        # reward shaping
        alpha = self.parameter.get("weight_for_reward_shaping")
        # if episode_over is True: shaping = self.reward_shaping(agent_action, self.master_action_index)
        # else: shaping = 0
        shaping = 0
        # Reward shaping only when non-terminal state.
        if episode_over is True:
            pass
        else:
            reward = reward + alpha * shaping

        # state to vec.
        if self.parameter.get("state_reduced"):
            state_rep = reduced_state_to_representation_last(state=state, slot_set=self.slot_set) # sequence representation.
            next_state_rep = reduced_state_to_representation_last(state=next_state, slot_set=self.slot_set)
            master_state_rep = reduced_state_to_representation_last(state=self.master_state, slot_set=self.slot_set)
        else:
            state_rep = state_to_representation_last(state=state, action_set=self.action_set,
                                                     slot_set=self.slot_set, disease_symptom=self.disease_symptom,
                                                     max_turn=self.parameter['max_turn'])
            next_state_rep = state_to_representation_last(state=next_state, action_set=self.action_set,
                                                          slot_set=self.slot_set, disease_symptom=self.disease_symptom,
                                                          max_turn=self.parameter['max_turn'])
            master_state_rep = state_to_representation_last(state=self.master_state, action_set=self.action_set,
                                                            slot_set=self.slot_set, disease_symptom=self.disease_symptom,
                                                            max_turn=self.parameter['max_turn'])


        # samples of master agent.
        sub_task_terminal, intrinsic_reward, _ = self.intrinsic_critic(next_state, self.master_action_index,disease_tag=self.disease_tag)
        self.master_reward += reward
        if self.sub_task_terminal is True and sub_task_terminal is True:
            last_master_action_rep = np.zeros(self.output_size)
            current_master_action_rep = np.zeros(self.output_size)
            # Add all actions the master has already selected into the state representation.
            for last_master_action_index in self.master_previous_actions:
                if last_master_action_index is not None:
                    last_master_action_rep[last_master_action_index] = 1
                    current_master_action_rep[last_master_action_index] = 1
            if self.master_action_index is not None: current_master_action_rep[self.master_action_index] = 1
            master_state_rep = np.concatenate((master_state_rep, last_master_action_rep), axis=0)
            next_master_state_rep = np.concatenate((next_state_rep, current_master_action_rep), axis=0)

            self.experience_replay_pool.append((master_state_rep, self.master_action_index, self.master_reward, next_master_state_rep, episode_over))

            # # master repeated action.
            # if self.master_action_index in self.master_previous_actions:
            #     temp_reward = - self.parameter.get("max_turn") / 2
            #     self.experience_replay_pool.append( (master_state_rep, self.master_action_index, temp_reward, next_master_state_rep, episode_over))
            # else:
            #     self.experience_replay_pool.append((master_state_rep, self.master_action_index, self.master_reward, next_master_state_rep, episode_over))

        # samples of lower agent.
        if agent_action is not None: # session is not over. Otherwise the agent_action is not one of the lower agent's actions.
            goal = np.zeros(len(self.disease_symptom))
            goal[self.master_action_index] = 1
            state_rep = np.concatenate((state_rep, goal), axis=0)
            next_state_rep = np.concatenate((next_state_rep, goal), axis=0)
            # reward shaping for lower agent on intrinsic reward.

            shaping = self.reward_shaping(state, next_state)
            intrinsic_reward += alpha * shaping
            self.lower_agent.experience_replay_pool.append((state_rep, agent_action, intrinsic_reward, next_state_rep, sub_task_terminal, self.master_action_index))
            # visitation count.
            self.lower_agent.action_visitation_count.setdefault(agent_action, 0)
            self.lower_agent.action_visitation_count[agent_action] += 1

            # # repeated action
            # if agent_action in self.worker_previous_actions:
            #     temp_reward = -0.5
            #     self.lower_agent.experience_replay_pool.append((state_rep, agent_action, temp_reward, next_state_rep, sub_task_terminal, self.master_action_index))
            # else:
            #     self.lower_agent.experience_replay_pool.append((state_rep, agent_action, intrinsic_reward, next_state_rep, sub_task_terminal, self.master_action_index))

            # When the pool reaches its fixed length, also remove the visitation count of the transition that is about to be dropped.
            self.visitation_count[self.master_action_index, agent_action] += 1
            if len(self.lower_agent.experience_replay_pool) == self.lower_agent.experience_replay_pool.maxlen:
                _, pre_agent_action, _, _, _, pre_master_action = self.lower_agent.experience_replay_pool.popleft()
                self.visitation_count[pre_master_action, pre_agent_action] -= 1
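The last few lines keep visitation_count consistent with the bounded replay deque. A self-contained sketch of the same bookkeeping pattern; the sizes and names below are illustrative, not taken from the repository.

import numpy as np
from collections import deque

# Illustrative sizes; the real ones come from the master/lower action spaces.
N_MASTER_ACTIONS, N_LOWER_ACTIONS = 5, 40
visitation_count = np.zeros((N_MASTER_ACTIONS, N_LOWER_ACTIONS), dtype=int)
replay_pool = deque(maxlen=5000)

def append_with_count(pool, counts, transition):
    """Append a lower-agent transition and keep the visitation counts in sync.

    transition follows the 6-tuple layout used above:
    (state_rep, agent_action, reward, next_state_rep, sub_task_terminal, master_action_index).
    """
    _, agent_action, _, _, _, master_action = transition
    if len(pool) == pool.maxlen:  # evict the oldest transition before it is silently dropped
        _, old_action, _, _, _, old_master = pool.popleft()
        counts[old_master, old_action] -= 1
    pool.append(transition)
    counts[master_action, agent_action] += 1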