from random import random

import numpy as np
from gym import Env, spaces

# Agent and Approximator are defined in the accompanying project code
# (their import path is not shown in this excerpt).


class ApproxQAgent(Agent):
    '''Q-learning agent based on an approximate value function.
    '''
    def __init__(self, env: Env = None,
                 trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise Exception("agent should have an environment")
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1

        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        self.PQ = self.Q.clone()  # the network whose parameters are updated
        return

    def _decayed_epsilon(self, cur_episode: int,
                         min_epsilon: float,
                         max_epsilon: float,
                         target_episode: int) -> float:
        '''Return an epsilon that decays linearly from max_epsilon to min_epsilon.
        '''
        slope = (min_epsilon - max_epsilon) / target_episode
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):
        '''Produce an action from the value function (network) whose parameters are being updated.
        '''
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):
        '''Copy the policy network PQ (with its parameters) to the network that outputs target Q values.
        '''
        self.Q = self.PQ.clone()

    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs):
        # sample a random batch of transitions from memory
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        y_batch = self.Q(states_0)   # result as a numpy array

        # when is_done, Q_target == reward_1
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * \
            (~ is_done)
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target

        # loss is a torch tensor of size 1
        loss = self.PQ.fit(x=X_batch,
                           y=y_batch,
                           learning_rate=learning_rate,
                           epochs=epochs)
        mean_loss = loss.sum().item() / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma=0.99, learning_rate=1e-5, max_episodes=1000,
                 batch_size=64, min_epsilon=0.2, epsilon_factor=0.1,
                 epochs=1):
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env.reset()
            # self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state
                a0 = self.performPolicy(s0, epsilon)
                s1, r1, is_done, info, total_reward = self.act(a0)
                # self.env.render()
                step_in_episode += 1
                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, batch_size,
                                                    learning_rate, epochs)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".
                  format(self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
        return
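# A minimal driver sketch (not part of the original code). It assumes the class
# above and its Agent/Approximator dependencies are importable, and uses a
# classic-control Gym task with a Box observation space and Discrete actions;
# the environment name and hyperparameters are illustrative only.
import gym

env = gym.make("CartPole-v0")
agent = ApproxQAgent(env, trans_capacity=20000, hidden_dim=16)
agent.learning(gamma=0.99,
               learning_rate=1e-3,
               max_episodes=500,
               batch_size=64,
               min_epsilon=0.2,
               epsilon_factor=0.3,
               epochs=2)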
class ApproxQAgent(Agent):
    def __init__(self, env: Env = None,
                 trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise Exception("agent should have an environment")
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1

        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        self.PQ = self.Q.clone()
        return

    def _decayed_epsilon(self, cur_episode: int,
                         min_epsilon: float,
                         max_epsilon: float,
                         target_episode: int) -> float:
        slope = (min_epsilon - max_epsilon) / target_episode
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):
        self.Q = self.PQ.clone()

    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs,
                           r, s):  # r and s are accepted but not used
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        y_batch = self.Q(states_0)
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * \
            (~ is_done)
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target

        # loss is a torch tensor of size 1
        loss = self.PQ.fit(x=X_batch,
                           y=y_batch,
                           learning_rate=learning_rate,
                           epochs=epochs)
        mean_loss = loss.sum().item() / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma=0.99, learning_rate=1e-5, max_episodes=1000,
                 batch_size=64, min_epsilon=0.2, epsilon_factor=0.1,
                 epochs=1):
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor

        # CSV logs; only the header row is written to dqn.csv here
        dist_file = open('dqn.csv', 'w')
        dist_file.write("Episode" + "," + "Distance" + "\n")
        tot_dis = 0
        reward_file = open('reward.csv', 'w')
        reward_file.write("Steps in Episode" + "," + "reward" + "\n")

        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env._reset()
            self.env._render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state
                a0 = self.performPolicy(s0, epsilon)
                s1, r1, is_done, dis_info = self.env._step_b(a0)
                self.env._render()
                step_in_episode += 1
                tot_dis += r1
                print("Step in Episode :: ", step_in_episode)
                print("Distance of agent from goal :: ", dis_info)
                reward_file.write(str(step_in_episode) + "," + str(tot_dis) + "\n")
                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, batch_size,
                                                    learning_rate, epochs,
                                                    r1, s1)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".format(
                self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
            # print("Episode :: ", num_episode)
            # print("Distance of agent from goal :: ", dis_info)

        dist_file.close()
        reward_file.close()
        return
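# Illustration (not part of the original code): the linear epsilon decay used
# by _decayed_epsilon, reproduced as a standalone function so the schedule can
# be inspected. The parameter values below are examples only.
def decayed_epsilon(cur_episode, min_epsilon=0.2, max_epsilon=1.0,
                    target_episode=100):
    slope = (min_epsilon - max_epsilon) / target_episode
    return max(min_epsilon, slope * cur_episode + max_epsilon)

for episode in (0, 25, 50, 100, 500):
    print(episode, decayed_epsilon(episode))
# epsilon falls roughly linearly from max_epsilon at episode 0 down to
# min_epsilon at target_episode, and then stays at min_epsilon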
class ApproxQAgent(Agent):
    '''Q-learning agent based on an approximate value function.
    # Features
    1 value function approximation
    2 based on Experience Replay, which breaks the correlation between
      transitions within a single episode and so gives a better approximation
    3 DQN
    '''
    def __init__(self, env: Env = None,
                 trans_capacity=20000,
                 hidden_dim: int = 16):
        '''Set input_dim (w.r.t. observation_space) and output_dim (w.r.t. action_space),
        call super(...).__init__(...), then build
            self.Q = Approximator(...)
            self.PQ = self.Q.clone()   # PQ is the network whose parameters are updated
        # args
            env: environment of this agent
            trans_capacity: <int> max number of transitions in memory
            hidden_dim: <int> number of nodes in the hidden layer
        '''
        if env is None:
            raise Exception("agent should have an environment")
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1

        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            # e.g. observation_space >> Box(6,), .shape >> (6,)
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        self.hidden_dim = hidden_dim
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        self.PQ = self.Q.clone()  # the network whose parameters are updated
        return

    def _decayed_epsilon(self, cur_episode: int,
                         min_epsilon: float,
                         max_epsilon: float,
                         target_episode: int) -> float:
        '''Return an epsilon within a given range.
        # return
            epsilon <float>, decaying from max_epsilon (when cur_episode = 0)
            towards min_epsilon as cur_episode grows
        '''
        slope = (min_epsilon - max_epsilon) / target_episode
        intercept = max_epsilon
        # slope * cur_episode is negative
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):
        '''Produce an action from the value function (network) whose parameters are being updated.
        # args
            s: state s0 <6x1 ndarray>
            epsilon: None means greedy, otherwise epsilon-greedy
        # return
            an action a0 <int> w.r.t. PQ (policy evaluation) using decayed
            epsilon-greedy (policy improvement)
        '''
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        # could be omitted if there is only one policy
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):
        '''Copy the policy network PQ (with its parameters) to the network that outputs target Q values.
        '''
        self.Q = self.PQ.clone()

    def _learn_from_memory(self, gamma, batch_size, learning_rate, epochs):
        # sample transitions randomly from experience; returns a list of
        # batch_size Transition objects (s0, a0, reward, s1, is_done)
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])   # ndarray
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        # ndarray of rows [Q(s0)(a_0), Q(s0)(a_1), ...]: the Q values of all
        # actions in state s0
        # y_batch = self.Q(states_0)   # the main difference lies in the a0 dimension
        y_batch = self.PQ(states_0)    # only the Q(s, a, w) entry along the a0 dimension differs

        # matrix-wise calculation; when is_done, Q_target == reward_1
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * \
            (~ is_done)
        # Attention:
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target

        # loss is a torch tensor of size 1
        loss = self.PQ.fit(x=X_batch,
                           y=y_batch,
                           learning_rate=learning_rate,
                           epochs=epochs)
        mean_loss = loss.sum().item() / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma=0.99, learning_rate=1e-5, max_episodes=1000,
                 batch_size=64, min_epsilon=0.2, epsilon_factor=0.1,
                 epochs=1):
        '''Build up experience; once enough transitions have accumulated,
        start learning from that experience and compute the loss.
        # Arguments
            gamma = 0.99          # discount factor, in [0, 1]
            learning_rate = 1e-5  # learning rate used when learning from memory
            max_episodes = 1000   # maximum number of training episodes
            batch_size = 64
            min_epsilon = 0.2
            epsilon_factor = 0.1  # ratio of the episode index at which min_epsilon
                                  # starts being used to the maximum episode index;
                                  # the smaller this ratio, the more episodes use
                                  # min_epsilon
            epochs = 1            # number of training passes per batch
        '''
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:
            # for each episode until max_episodes, accumulate the loss
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env.reset()
            self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:   # for every transition
                s0 = self.state  # self.state is changed inside self.act(a0)
                # get an action w.r.t. PQ using decayed epsilon-greedy
                a0 = self.performPolicy(s0, epsilon)
                # inside self.act(a0): self.state = s1; act also stores the
                # transition in the experience (as part of the current episode)
                # and accumulates total_reward
                s1, r1, is_done, info, total_reward = self.act(a0)
                self.env.render()
                step_in_episode += 1
                if self.total_trans > batch_size:
                    loss += self._learn_from_memory(gamma, batch_size,
                                                    learning_rate, epochs)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".
                  format(self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
        return
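# Illustration (not part of the original code) of the two numpy idioms used in
# _learn_from_memory: the boolean mask (~ is_done), which drops the bootstrap
# term for terminal transitions, and the fancy indexing that overwrites only
# the entries of the actions actually taken. All values below are made up.
import numpy as np

gamma = 0.9
y_batch = np.zeros((4, 3))                      # stand-in for Q(states_0): 4 samples x 3 actions
actions_0 = np.array([2, 0, 1, 2])              # actions taken in the sampled transitions
reward_1 = np.array([1.0, 0.0, 1.0, 5.0])
max_q_s1 = np.array([0.5, 0.7, 0.2, 0.9])       # stand-in for np.max(Q(states_1), axis=1)
is_done = np.array([False, False, True, False])

# the terminal transition (index 2) keeps only its immediate reward
Q_target = reward_1 + gamma * max_q_s1 * (~ is_done)   # approx. [1.45, 0.63, 1.0, 5.81]

# only the (row, taken-action) entries are changed; in the variant above the
# remaining entries come from PQ itself, so they add nothing to the loss
y_batch[np.arange(len(y_batch)), actions_0] = Q_target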
class ApproxQAgent(Agent):
    '''Q-learning agent based on an approximate value function.
    '''
    def __init__(self, env: Env = None,
                 trans_capacity=20000,
                 hidden_dim: int = 16):
        if env is None:
            raise Exception("agent should have an environment")
        super(ApproxQAgent, self).__init__(env, trans_capacity)
        self.input_dim, self.output_dim = 1, 1

        # adapt to the different observation and action space types
        if isinstance(env.observation_space, spaces.Discrete):
            self.input_dim = 1
        elif isinstance(env.observation_space, spaces.Box):
            self.input_dim = env.observation_space.shape[0]

        if isinstance(env.action_space, spaces.Discrete):
            self.output_dim = env.action_space.n
        elif isinstance(env.action_space, spaces.Box):
            self.output_dim = env.action_space.shape[0]

        # print("{},{}".format(self.input_dim, self.output_dim))
        # number of neurons in the hidden layer
        self.hidden_dim = hidden_dim

        # The key lies in the next two statements, which declare two
        # approximate value functions.
        # Q computes the target values used in the loss; its parameters are
        # held fixed for a period of time.
        self.Q = Approximator(dim_input=self.input_dim,
                              dim_output=self.output_dim,
                              dim_hidden=self.hidden_dim)
        # PQ generates the policy; its parameters are updated frequently.
        self.PQ = self.Q.clone()
        return

    def _learning_from_memory(self, gamma, batch_size, learning_rate, epochs):
        # sample a random batch of transitions from memory
        trans_pieces = self.sample(batch_size)
        states_0 = np.vstack([x.s0 for x in trans_pieces])
        actions_0 = np.array([x.a0 for x in trans_pieces])
        reward_1 = np.array([x.reward for x in trans_pieces])
        is_done = np.array([x.is_done for x in trans_pieces])
        states_1 = np.vstack([x.s1 for x in trans_pieces])

        X_batch = states_0
        # this invokes the Approximator's __call__ method
        y_batch = self.Q(states_0)

        # operates on a whole batch, so this is matrix arithmetic;
        # np.max(..., axis=1) takes the maximum over actions for each next state;
        # is_done is a boolean array, so ~is_done flips it element-wise and the
        # target reduces to reward_1 for terminal transitions
        Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * (~ is_done)
        y_batch[np.arange(len(X_batch)), actions_0] = Q_target

        # loss is a torch tensor of size 1
        loss = self.PQ.fit(x=X_batch,
                           y=y_batch,
                           learning_rate=learning_rate,
                           epochs=epochs)
        mean_loss = loss.sum().item() / batch_size
        self._update_Q_net()
        return mean_loss

    def learning(self, gamma=0.99, learning_rate=1e-5, max_episodes=1000,
                 batch_size=64, min_epsilon=0.2, epsilon_factor=0.1,
                 epochs=1):
        '''learning() mainly builds up experience; once enough experience has
        accumulated, it also starts learning from that experience.
        '''
        total_steps, step_in_episode, num_episode = 0, 0, 0
        target_episode = max_episodes * epsilon_factor
        while num_episode < max_episodes:
            epsilon = self._decayed_epsilon(cur_episode=num_episode,
                                            min_epsilon=min_epsilon,
                                            max_epsilon=1,
                                            target_episode=target_episode)
            self.state = self.env.reset()
            self.env.render()
            step_in_episode = 0
            loss, mean_loss = 0.00, 0.00
            is_done = False
            while not is_done:
                s0 = self.state
                a0 = self.performPolicy(s0, epsilon)
                # act() wraps recording the Transition into the Experience
                s1, r1, is_done, info, total_reward = self.act(a0)
                # self.env.render()
                step_in_episode += 1
                # once there are enough transitions in memory, start learning
                # from experience
                if self.total_trans > batch_size:
                    loss += self._learning_from_memory(gamma, batch_size,
                                                       learning_rate, epochs)
            mean_loss = loss / step_in_episode
            print("{0} epsilon:{1:3.2f}, loss:{2:.3f}".
                  format(self.experience.last, epsilon, mean_loss))
            # print(self.experience)
            total_steps += step_in_episode
            num_episode += 1
        return

    def _decayed_epsilon(self, cur_episode: int,
                         min_epsilon: float,
                         max_epsilon: float,
                         target_episode: int) -> float:
        '''Return an epsilon that decays linearly from max_epsilon to min_epsilon.
        '''
        slope = (min_epsilon - max_epsilon) / target_episode
        intercept = max_epsilon
        return max(min_epsilon, slope * cur_episode + intercept)

    def _curPolicy(self, s, epsilon=None):
        '''Produce an action from the value function (network) whose parameters are being updated.
        '''
        Q_s = self.PQ(s)
        rand_value = random()
        if epsilon is not None and rand_value < epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(Q_s))

    def performPolicy(self, s, epsilon=None):
        return self._curPolicy(s, epsilon)

    def _update_Q_net(self):
        '''Copy the policy network PQ (with its parameters) to the network that outputs target Q values.
        '''
        self.Q = self.PQ.clone()
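# The agents above rely on a small interface from Approximator: __call__ on
# numpy arrays, fit(x, y, learning_rate, epochs) returning a loss tensor, and
# clone(). The project's actual Approximator is not shown in this section; the
# PyTorch sketch below is a hypothetical stand-in that satisfies that interface.
import copy

import numpy as np
import torch
import torch.nn as nn


class Approximator(nn.Module):
    def __init__(self, dim_input, dim_output, dim_hidden=16):
        super().__init__()
        # one hidden layer, matching the single hidden_dim parameter used above
        self.net = nn.Sequential(
            nn.Linear(dim_input, dim_hidden),
            nn.ReLU(),
            nn.Linear(dim_hidden, dim_output),
        )

    def forward(self, x):
        # accept numpy (or list) input and return numpy output, as the agents expect
        if not torch.is_tensor(x):
            with torch.no_grad():
                out = self.net(torch.as_tensor(np.asarray(x), dtype=torch.float32))
            return out.numpy()
        return self.net(x)

    def fit(self, x, y, learning_rate=1e-4, epochs=1):
        # simple supervised regression of the network outputs towards y
        x_t = torch.as_tensor(np.asarray(x), dtype=torch.float32)
        y_t = torch.as_tensor(np.asarray(y), dtype=torch.float32)
        optimizer = torch.optim.SGD(self.net.parameters(), lr=learning_rate)
        criterion = nn.MSELoss()
        loss = torch.zeros(1)
        for _ in range(epochs):
            optimizer.zero_grad()
            loss = criterion(self.net(x_t), y_t)
            loss.backward()
            optimizer.step()
        return loss.detach()

    def clone(self):
        # deep copy gives an independent network with identical parameters
        return copy.deepcopy(self)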