import os
import time

import numpy as np

# Environment, ReplayMemory, NeuralNetwork and the reply lists (greetings, place,
# answers, greetings_answer, place_answer) are assumed to be defined in the
# accompanying project modules.


class Agent:
    def __init__(self, training):
        # Create the environment
        self.environment = Environment()
        # Training or testing
        self.training = training
        # Set the initial training epsilon
        self.epsilon = 0.10
        # Get the number of actions, used for storing memories, Q-values etc.
        total_actions = self.environment.total_actions()
        if self.training:
            # Training: set a learning rate
            self.learning_rate = 1e-2
            # Training: set up the replay memory
            self.replay_memory = ReplayMemory(size=1000, num_actions=total_actions)
        else:
            # Testing: these are not needed
            self.learning_rate = None
            self.replay_memory = None
        # Create the neural network
        self.neural_network = NeuralNetwork(num_actions=total_actions,
                                            replay_memory=self.replay_memory)
        # Stores the total reward of each episode
        self.rewards = []

    def get_action(self, q_values, curr_question):
        """
        Description: Use epsilon-greedy selection to choose an action.
        Parameters:
            q_values: Q-values for the current state
            curr_question: index of the current step (question) in the conversation
        Return:
            action: the selected reply
        """
        # When a random reply is selected, restrict it to the column of replies
        # that matches the current question. This speeds up training a little by
        # letting the agent find the correct answers more easily.
        if curr_question == 0:
            low = 0
            high = len(greetings)
        elif curr_question == 1:
            low = len(greetings)
            high = len(greetings) + len(place)
        else:
            low = len(greetings) + len(place)
            high = len(greetings) + len(place) + len(answers)
        # self.epsilon = probability of selecting a random action
        if np.random.random() < self.epsilon:
            # Random sentence reply in the correct column
            action = np.random.randint(low=low, high=high)
        else:
            # Select the highest Q-value
            action = np.argmax(q_values)
        return action

    def get_testing_action(self, q_values):
        # During testing, always select the action with the maximum Q-value
        action = np.argmax(q_values)
        return action

    def run(self, num_episodes=1000000):
        """
        Description: Run the agent in either training or testing mode.
        Parameters:
            num_episodes: the number of episodes to run in training mode
        """
        # TRAINING
        if self.training:
            # Reset at the end of each episode
            end_episode = True
            # Counter for the states processed so far
            count_states = 0
            # Counter for the episodes processed so far
            count_episodes = 0
            # Counter for the current step of the conversation (question)
            conversation_step = 0
            while count_episodes <= num_episodes:
                if end_episode:
                    # Generate a new conversation for the new episode
                    conversation = self.environment.create_conversation()
                    # The number of questions for this episode
                    num_questions = len(conversation)
                    # Reset the conversation step
                    conversation_step = 0
                    # Increment the episode counter at the start of each new conversation
                    count_episodes += 1
                    # Reset the episode reward
                    reward_episode = 0.0
                    if count_episodes > num_episodes:
                        self.neural_network.save(count_states)
                if conversation_step == 0:
                    # First step in the conversation: no previous question
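                    # Note (added): get_state is assumed to build the state from
                    # word vectors of the current and previous questions; with no
                    # previous question the previous slot is presumably a zero
                    # vector, matching the blank previous_question used in the
                    # testing branch below.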
                    state = self.environment.get_state(curr_question=conversation[conversation_step])
                else:
                    # Pass in the previous question as well
                    prev_question_idx = conversation_step - 1
                    prev_question = conversation[prev_question_idx]
                    state = self.environment.get_state(curr_question=conversation[conversation_step],
                                                       prev_question=prev_question)
                # Estimate the Q-values for this state
                q_values = self.neural_network.get_q_values(states=[state])[0]
                # Determine the action
                action = self.get_action(q_values=q_values, curr_question=conversation_step)
                # Use the action to take a step / reply
                reward, end_episode = self.environment.step(curr_question=conversation_step, action=action)
                # Move on to the next conversation step
                conversation_step += 1
                # Add to the reward for this episode
                reward_episode += reward
                # Increment the state counter used for the control parameters
                count_states += 1
                # Add this memory to the replay memory
                self.replay_memory.add_memory(state=state, q_values=q_values, action=action,
                                              reward=reward, end_episode=end_episode)
                if self.replay_memory.is_full():
                    # The replay memory is full: update all Q-values in a backwards sweep
                    self.replay_memory.update_q_values()
                    # Improve the policy with random batches from the replay memory
                    self.neural_network.optimize(learning_rate=self.learning_rate,
                                                 current_state=count_states)
                    # Reset the replay memory
                    self.replay_memory.reset_size()
                # Add the reward of the episode to the rewards array
                if end_episode:
                    self.rewards.append(reward_episode)
                # Mean reward over the last 30 episodes
                if len(self.rewards) == 0:
                    # No previous rewards
                    reward_mean = 0.0
                else:
                    reward_mean = np.mean(self.rewards[-30:])
                if end_episode:
                    # Print statistics
                    statistics = ("{0:4}:{1}\tReward: {2:.1f}\tMean Reward (last 30): {3:.1f}"
                                  "\tQ-min: {4:5.7f}\tQ-max: {5:5.7f}")
                    print(statistics.format(count_episodes, count_states, reward_episode,
                                            reward_mean, np.min(q_values), np.max(q_values)))
        # TESTING
        else:
            # Clear the cmd window ('cls' is Windows-only; use 'clear' on Unix)
            clear = lambda: os.system('cls')
            clear()
            # Load the checkpoint generated by training
            self.neural_network.load()
            # Blank previous question so its word vector is all zeros
            previous_question = ""
            # Current question counter
            curr_question = 0
            while True:
                user_input = input("Me: ").lower()
                try:
                    # Get the state for this input
                    if previous_question == "":
                        # First question
                        state = self.environment.get_state(curr_question=user_input)
                    else:
                        state = self.environment.get_state(curr_question=user_input,
                                                           prev_question=previous_question)
                    print("STATE:", state)
                    # Feed this question through the neural network
                    q_values = self.neural_network.get_q_values(states=[state])[0]
                    # Store the previous question
                    previous_question = user_input
                    # Possible actions of the agent (replies)
                    possible_actions = greetings_answer + place_answer + answers
                    print("Q VALUES:", q_values)
                    # Select an action based on the Q-values
                    action = self.get_testing_action(q_values=q_values)
                    print("Chatbot: ", possible_actions[action])
                    if curr_question < 2:
                        curr_question += 1
                    else:
                        print("*****END OF CONVERSATION. RESTARTING...*****")
                        # Reset the conversation
                        curr_question = 0
                        previous_question = ""
                except Exception:
                    # Avoid a bare except so Ctrl-C still exits cleanly
                    print("Sorry, I don't understand you.")
                if user_input == "bye":
                    print("Chatbot signing off in 5...")
                    time.sleep(5)
                    break
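# --- Usage sketch (added) ---
# A minimal, assumed entry point: how the original project selects the mode
# (a flag, a prompt, or editing this call) is not shown above, but the
# `training` argument itself is as defined in __init__.
if __name__ == "__main__":
    agent = Agent(training=True)  # set training=False to chat with a trained model
    agent.run(num_episodes=1000000)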
import os
from time import sleep

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# DuelingQFunc and the prioritized ReplayMemory are assumed to be defined in the
# accompanying project modules.


class Learner:
    def __init__(self, path, model_path, target_model_path):
        self.path = path
        self.model_path = model_path
        self.target_model_path = target_model_path
        self.lr = 1e-3
        self.gamma = 0.95
        self.epsilon = 0.3
        self.batch_size = 32
        self.N_STEP = 3
        self.qf = DuelingQFunc()
        self.target_qf = DuelingQFunc()
        # state_dict() holds the model's learned parameters; copy them so the
        # target network starts identical to the online network
        self.target_qf.load_state_dict(self.qf.state_dict())
        self.optimizer = optim.Adam(self.qf.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()
        self.memory = ReplayMemory()
        self.total_step = 0

    def run(self):
        while True:
            read_step = 0
            # Every 100 learner steps, poll for a transition file written by the actors
            while self.total_step % 100 == 0:
                read_step += 1
                if os.path.isfile(self.path):
                    try:
                        trans_memory = torch.load(self.path)
                        os.remove(self.path)
                        self.memory.add_memory(trans_memory)
                        break
                    except Exception:
                        # The file may still be mid-write; back off and retry
                        sleep(np.random.random() * 2 + 2)
                elif read_step > 25:
                    break
            # Train the model once enough transitions have been stored
            if self.memory.get_memory_size() > 100:
                batch, indices, probability_distribution = self.memory.sample()
                # Q-value of the action taken in each sampled transition
                q_value = self.qf(batch['obs']).gather(1, batch['actions'])
                # Importance-sampling weights that cancel the bias introduced by PER
                weights = torch.tensor(np.power(probability_distribution, -1) / self.batch_size,
                                       dtype=torch.float)
                # Process the whole batch at once
                with torch.no_grad():
                    # Double DQN: the online Q-network picks the greedy next action...
                    max_next_q_value_index = self.qf(batch['next_obs']).max(dim=1, keepdim=True)[1]
                    # ...and the target Q-network evaluates that action
                    next_q_value = self.target_qf(batch['next_obs']).gather(1, max_next_q_value_index)
                    # Derive the TD target
                    target_q_value = batch['rewards'] + self.gamma * next_q_value * (1 - batch['terminates'])
                # Compute the loss: squared TD error scaled by the IS weights
                loss = torch.mean(weights * (0.5 * (q_value - target_q_value) ** 2))
                # Reset the gradients to zero
                self.optimizer.zero_grad()
                # Backpropagate the error
                loss.backward()
                # Apply the gradient update
                self.optimizer.step()
                with torch.no_grad():
                    # Recompute the TD errors with the updated network to refresh priorities
                    q_value = self.qf(batch['obs']).gather(1, batch['actions'])
                    max_next_q_value_index = self.qf(batch['next_obs']).max(dim=1, keepdim=True)[1]
                    next_q_value = self.target_qf(batch['next_obs']).gather(1, max_next_q_value_index)
                    target_q_value = batch['rewards'] + self.gamma * next_q_value * (1 - batch['terminates'])
                    priorities = (abs(target_q_value - q_value)).numpy().squeeze()
                self.memory.update_priority(indices, priorities)
                if self.total_step % 50 == 0:
                    # Checkpoint both networks, then update the target network
                    torch.save(self.qf.state_dict(), self.model_path)
                    torch.save(self.target_qf.state_dict(), self.target_model_path)
                    self.target_qf.load_state_dict(self.qf.state_dict())
            self.total_step += 1
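# --- Usage sketch (added) ---
# A minimal sketch of how this Learner is presumably launched; all three file
# names are hypothetical. `path` is the file through which actor processes hand
# over transition batches (the Learner polls it, loads it, then deletes it),
# while the two model paths receive checkpoints every 50 learner steps for the
# actors to reload.
if __name__ == "__main__":
    learner = Learner(path="transitions.pt",
                      model_path="model.pt",
                      target_model_path="target_model.pt")
    learner.run()  # Loops forever: ingest transitions, optimize, refresh priorities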