import os
import time

import numpy as np

# NOTE: Environment, NeuralNetwork, ReplayMemory and the reply lists
# (greetings, place, answers, greetings_answer, place_answer) are assumed to
# be imported from the project's own modules.


class Agent:

	def __init__(self, training):

		# Create the environment
		self.environment = Environment()

		# Training or testing
		self.training = training

		# Set the initial training epsilon
		self.epsilon = 0.10

		# Number of possible actions, needed for the replay memory and the network's Q-value outputs
		total_actions = self.environment.total_actions()

		# Training or testing
		if self.training:
			# Training: set the learning rate
			self.learning_rate = 1e-2

			# Training: Set up the replay memory
			self.replay_memory = ReplayMemory(size=1000, num_actions=total_actions)

		else:
			# Testing: These are not needed
			self.learning_rate = None
			self.replay_memory = None

		# Create the neural network
		self.neural_network = NeuralNetwork(num_actions=total_actions, replay_memory=self.replay_memory)

		# This stores the rewards for each episode
		self.rewards = []


	def get_action(self, q_values, curr_question):
		"""
		Description:
			Use epsilon-greedy selection to choose a reply
		Parameters:
			q_values: Q-values for the current state
			curr_question: index of the current question in the conversation
		Return:
			action: the index of the selected reply
		"""

		# When a random reply is selected, restrict it to the column of replies that
		# matches the current question. This speeds up training slightly by letting
		# the agent stumble onto correct answers more easily.
		if curr_question == 0:
			low = 0
			high = len(greetings)
		elif curr_question == 1:
			low = len(greetings)
			high = len(greetings) + len(place)
		else:
			low = len(greetings) + len(place)
			high = len(greetings) + len(place) + len(answers)

		# self.epsilon = probability of selecting a random action
		if np.random.random() < self.epsilon:
			# Random sentence reply in the correct column
			action = np.random.randint(low=low, high=high)
		else:
			# Select highest Q-value
			action = np.argmax(q_values)

		return action

	def get_testing_action(self, q_values):

		# During testing, always select the maximum Q-value
		action = np.argmax(q_values)

		return action

	def run(self, num_episodes=1000000):
		"""
		Description:
			Run the agent in either training or testing mode
		Parameters:
			num_episodes: The number of episodes the agent will run for in training mode
		"""

		if self.training:

			# Ensure a new conversation is generated on the first loop iteration
			end_episode = True

			# Counter for the states processed so far
			count_states = 0

			# Counter for the episodes processed so far
			count_episodes = 0

			# Counter for which step of the conversation (question)
			conversation_step = 0

			while count_episodes <= num_episodes:
				if end_episode:
					# Generate new conversation for the new episode
					conversation = self.environment.create_conversation()

					# The number of questions for this episode
					num_questions = len(conversation)

					# Reset conversation step
					conversation_step = 0				

					# Starting a new conversation, so increment the episode counter
					count_episodes += 1

					# Reset episode reward
					reward_episode = 0.0

					if count_episodes > num_episodes:
						# Training is finished; save the final network checkpoint
						self.neural_network.save(count_states)

				if conversation_step == 0:
					# First step in the conversation, so there is no previous question
					state = self.environment.get_state(curr_question=conversation[conversation_step])
				else:
					# Pass in the previous question as well
					prev_question_idx = conversation_step - 1
					prev_question = conversation[prev_question_idx]
					state = self.environment.get_state(curr_question=conversation[conversation_step], prev_question=prev_question)

				# Estimate Q-Values for this state
				q_values = self.neural_network.get_q_values(states=[state])[0]

				# Determine the action
				action = self.get_action(q_values=q_values, curr_question = conversation_step)
				
				# Use action to take a step / reply
				reward, end_episode = self.environment.step(curr_question = conversation_step, action=action)

				# Increment to the next conversation step
				conversation_step += 1

				# Add to the reward for this episode
				reward_episode += reward

				# Increment the state counter (passed on for optimisation and checkpointing)
				count_states += 1

				# Add this transition to the replay memory
				self.replay_memory.add_memory(state=state, q_values=q_values, action=action, reward=reward, end_episode=end_episode)

				if self.replay_memory.is_full():
					# If the replay memory is full, update all the Q-values in a backwards sweep
					self.replay_memory.update_q_values()

					# Improve the policy with random batches from the replay memory
					self.neural_network.optimize(learning_rate=self.learning_rate, current_state=count_states)

					# Reset the replay memory
					self.replay_memory.reset_size()

				# Add the reward of the episode to the rewards array 
				if end_episode:
					self.rewards.append(reward_episode)

				# Reward from previous episodes (mean of last 30)
				if len(self.rewards) == 0:
					# No previous rewards
					reward_mean = 0.0
				else:
					# Get the mean of the last 30
					reward_mean = np.mean(self.rewards[-30:])

				if end_episode:
					# Print statistics
					statistics = "{0:4}:{1}\tReward: {2:.1f}\tMean Reward (last 30): {3:.1f}\tQ-min: {4:5.7f}\tQ-max: {5:5.7f}"
					print(statistics.format(count_episodes, count_states, reward_episode, reward_mean, np.min(q_values), np.max(q_values)))


		# TESTING
		else:
			# Clear the terminal window ('cls' on Windows, 'clear' elsewhere) and start the chatbot
			os.system('cls' if os.name == 'nt' else 'clear')

			# Load the conversation checkpoint generated by training
			self.neural_network.load()

			# Start with an empty previous question so the environment returns a zero word vector
			previous_question = ""

			# Current question counter
			curr_question = 0

			while True:

				user_input = input("Me: ").lower()
				try:

					# Get the state for this input
					if previous_question == "":
						# First question of the conversation
						state = self.environment.get_state(curr_question=user_input)
					else:
						state = self.environment.get_state(curr_question=user_input, prev_question=previous_question)

					print("STATE:", state)

					# Input this question into the neural network
					q_values = self.neural_network.get_q_values(states=[state])[0]

					# Store previous question
					previous_question = user_input

					# Possible actions of agent (replies)
					possible_actions = greetings_answer + place_answer + answers

					print("Q VALUES:",q_values)

					# Select a reply based on the Q-values
					action = self.get_testing_action(q_values=q_values)

					print("Chatbot: ", possible_actions[action])
					if curr_question < 2:
						curr_question += 1
					else:
						print("*****END OF CONVERSATION. RESTARTING...*****")
						# Reset
						curr_question = 0
						previous_question = ""

				except Exception:
					print("Sorry, I don't understand you.")

				
				if user_input == "bye":
					print("Chatbot signing off in 5...")
					time.sleep(5)
					break
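
# A minimal usage sketch (an assumption, not necessarily the project's original
# entry point): train so that a checkpoint is saved, then reload it and chat.
if __name__ == '__main__':
	# Train for a reduced number of episodes (the default is 1,000,000)
	Agent(training=True).run(num_episodes=10000)
	# Chat with the trained network until the user types "bye"
	Agent(training=False).run()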
Example #2
import os
from time import sleep

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# NOTE: DuelingQFunc and ReplayMemory are assumed to come from the project's
# own modules.


class Learner:
    def __init__(self, path, model_path, target_model_path):
        self.path = path
        self.model_path = model_path
        self.target_model_path = target_model_path
        self.lr = 1e-3          # learning rate
        self.gamma = 0.95       # discount factor
        self.epsilon = 0.3      # exploration rate (not referenced in this snippet)
        self.batch_size = 32    # mini-batch size used for the importance-sampling weights
        self.N_STEP = 3         # n-step return length (not referenced in this snippet)
        self.qf = DuelingQFunc()
        self.target_qf = DuelingQFunc()
        # state_dict() returns the model's learnable parameters; copy them so the
        # target network starts from the same weights as the online network
        self.target_qf.load_state_dict(self.qf.state_dict())
        self.optimizer = optim.Adam(self.qf.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()
        self.memory = ReplayMemory()
        self.total_step = 0
        
    def run(self):
        while True:
            read_step = 0
            # Every 100 steps, try to load new transitions from the shared file at self.path
            while self.total_step % 100 == 0:
                read_step += 1
                if os.path.isfile(self.path):
                    try:
                        trans_memory = torch.load(self.path)
                        os.remove(self.path)
                        self.memory.add_memory(trans_memory)
                        break
                    except Exception:
                        # The file may still be being written; wait a moment and retry
                        sleep(np.random.random() * 2 + 2)
                elif read_step > 25:
                    break
            # Train the model once enough transitions have been stored
            if self.memory.get_memory_size() > 100:
                batch, indices, probability_distribution = self.memory.sample()
                # Q-value of the action actually taken in each sampled state
                q_value = self.qf(batch['obs']).gather(1, batch['actions'])
                # Importance-sampling weights that correct the bias introduced by
                # prioritized experience replay (PER)
                weights = torch.tensor(np.power(probability_distribution, -1) / self.batch_size, dtype=torch.float)
        
                # Process the whole batch at once
                with torch.no_grad():
                    # Index of the maximising action according to the online Q-network
                    max_next_q_value_index = self.qf(batch['next_obs']).max(dim=1, keepdim=True)[1]
                    # Value of that action according to the target Q-network (Double DQN)
                    next_q_value = self.target_qf(batch['next_obs']).gather(1, max_next_q_value_index)
                    # Compute the target value
                    target_q_value = batch['rewards'] + self.gamma * next_q_value * (1 - batch['terminates'])
                # Importance-sampling weighted TD loss
                loss = torch.mean(weights * (0.5 * (q_value - target_q_value) ** 2))
                # Reset the gradients to zero
                self.optimizer.zero_grad()
                # Backpropagate the loss
                loss.backward()
                # Apply the gradient update
                self.optimizer.step()
        
                with torch.no_grad():
                    # Recompute the TD errors with the updated network and use their
                    # absolute values as the new priorities
                    q_value = self.qf(batch['obs']).gather(1, batch['actions'])
                    max_next_q_value_index = self.qf(batch['next_obs']).max(dim=1, keepdim=True)[1]
                    next_q_value = self.target_qf(batch['next_obs']).gather(1, max_next_q_value_index)
                    target_q_value = batch['rewards'] + self.gamma * next_q_value * (1 - batch['terminates'])
                    priorities = (abs(target_q_value - q_value)).numpy().squeeze()
                    self.memory.update_priority(indices, priorities)
        
                if self.total_step % 50 == 0:
                    # Periodically checkpoint both networks and sync the target network
                    # with the online network
                    torch.save(self.qf.state_dict(), self.model_path)
                    torch.save(self.target_qf.state_dict(), self.target_model_path)
                    self.target_qf.load_state_dict(self.qf.state_dict())
                self.total_step += 1
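
# A minimal usage sketch (the file paths are placeholders, not the original
# project's): the learner polls `path` for transitions serialised with
# torch.save(), presumably by a separate actor process, and periodically
# checkpoints both networks to `model_path` and `target_model_path`.
if __name__ == '__main__':
    learner = Learner(path='transitions.pt',
                      model_path='model.pt',
                      target_model_path='target_model.pt')
    learner.run()  # Runs indefinitely; stop with Ctrl+C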