def __init__(self, number, num_actions, trainer, model_name):
    self.name = "worker_" + str(number)
    self.number = number
    self.model_name = model_name

    # Create the local copy of the network and the TensorFlow op to copy
    # global parameters to the local network.
    self.local_ac = ACNet(num_actions, self.name, trainer)
    # Op that pulls the current parameters from the global AC network.
    self.update_target_graph = self.update_target(global_scope_name, self.name)

    # The environment this worker interacts with.
    self.env = EnvLab(width=out_size_width, height=out_size_height, fps=out_fps, level=level)
def __init__(self, number, num_actions, trainer, model_name):
    self.name = "worker_" + str(number)
    self.number = number
    self.model_name = model_name

    # Create the local copy of the network and the TensorFlow op to copy
    # global parameters to the local network.
    self.local_ac = ACNet(num_actions, self.name, trainer)
    self.update_target_graph = self.update_target(global_scope_name, self.name)

    if (lab):
        self.env = EnvLab(80, 80, 60, "seekavoid_arena_01")
    else:
        self.env = EnvVizDoom(vizdoom_scenario)
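Because the worker can be given either wrapper, both are assumed to expose the same minimal interface. The sketch below is hypothetical: the method names are taken from how the wrappers are called in these snippets, but the base class itself and the exact signatures are assumptions, not code from the repository.

class EnvBase(object):
    """Assumed common interface of EnvLab and EnvVizDoom."""

    def Reset(self):
        # Start a new episode.
        raise NotImplementedError

    def Observation(self):
        # Return the current raw frame (e.g. an RGB array).
        raise NotImplementedError

    def Act(self, action, frame_repeat):
        # Repeat the action and return the accumulated reward.
        raise NotImplementedError

    def IsRunning(self):
        # True while the current episode has not terminated.
        raise NotImplementedError

    def NumActions(self):
        # Size of the discrete action space exposed to the agent.
        raise NotImplementedError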
        reward = env.Act(action, 1)
        reward_total += reward

        if (not env.IsRunning()):
            break

        state_raw = env.Observation()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", help="the GPU to use")
    args = parser.parse_args()
    if (args.gpu):
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    if (lab):
        env = EnvLab(80, 80, 60, "seekavoid_arena_01")
    else:
        env = EnvVizDoom(vizdoom_scenario)

    agent = Agent(env.NumActions())
    if (train):
        agent.Train()
    Test(agent)
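Setting CUDA_VISIBLE_DEVICES before TensorFlow initializes pins the whole process to a single device, so all worker threads share that GPU. For example (the script name here is purely illustrative), running `python agent_a3c.py --gpu 0` makes only GPU 0 visible to the process.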
class Worker(object):
    def __init__(self, number, num_actions, trainer, model_name):
        self.name = "worker_" + str(number)
        self.number = number
        self.model_name = model_name

        # Create the local copy of the network and the TensorFlow op to copy
        # global parameters to the local network.
        self.local_ac = ACNet(num_actions, self.name, trainer)
        # Op that pulls the current parameters from the global AC network.
        self.update_target_graph = self.update_target(global_scope_name, self.name)

        # The environment this worker interacts with.
        self.env = EnvLab(width=out_size_width, height=out_size_height, fps=out_fps, level=level)

    # Copies one set of variables to another.
    # Used to set worker network parameters to those of the global network.
    def update_target(self, from_scope, to_scope):
        # Collect the trainable variables of the global and local scopes.
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

        op_holder = []
        # The global and local networks create their variables in the same order, so the
        # two lists can be zipped and each global variable assigned to its local copy.
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder

    # Calculate discounted returns.
    def Discount(self, x, gamma):
        # Generic backward accumulation; whether it produces returns or advantages
        # depends on what the caller passes in.
        for idx in reversed(range(len(x) - 1)):
            x[idx] += x[idx + 1] * gamma
        return x

    def Start(self, session, saver, coord):
        worker_process = lambda: self.Process(session, saver, coord)
        thread = threading.Thread(target=worker_process)
        thread.start()

        global start_time
        start_time = time.time()
        return thread

    def Train(self, episode_buffer, sess, bootstrap_value):
        """
        Run one training update from a rollout.
        :param episode_buffer: experience cached by this local worker
        :param sess: TensorFlow session
        :param bootstrap_value: bootstrap value, i.e. the value estimate of the newest state
        :return:
        """
        # Unpack the rollout collected with the local AC network.
        episode_buffer = np.array(episode_buffer)
        states = episode_buffer[:, 0]
        actions = episode_buffer[:, 1]
        rewards = episode_buffer[:, 2]
        values = episode_buffer[:, 3]

        # Here we take the rewards and values from the episode_buffer, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation".
        # (A small numeric check of the two backward passes below follows after this class.)

        # Append the bootstrap value so the backward pass can seed the returns with it.
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        # This is the update of R from the pseudo-code: Discount runs backward over the
        # list, so x[t] ends up holding R_t = sum_i gamma^i * r_{t+i} (t plays the role of
        # t_start in the paper). The trailing element is only the seed, so it is dropped.
        discounted_rewards = self.Discount(rewards_plus, gamma)[:-1]

        # Advantage definition: A(a_t, s_t) = Q(a_t, s_t) - V(s_t).
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        # One-step TD error: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
        advantages = rewards + gamma * value_plus[1:] - value_plus[:-1]
        # Backward-view accumulation of the TD errors into the advantage estimates.
        advantages = self.Discount(advantages, gamma)

        # Update the global network using gradients from loss.
        # Generate network statistics to periodically save.
        self.local_ac.Train(sess, discounted_rewards, states, actions, advantages)

    def Process(self, sess, saver, coord):
        global step, train_scores, start_time, lock

        print("Starting worker " + str(self.number))

        while (not coord.should_stop()):
            # Pull the latest parameters from the global AC network.
            sess.run(self.update_target_graph)

            episode_buffer = []
            episode_reward = 0

            # Reset the environment for a new episode.
            self.env.Reset()
            s = self.env.Observation()
            # Preprocess converts the RGBD observation returned by the environment
            # (dropping channels when channels == 1) and rescales it to the network input size.
            s = Preprocess(s)
            self.local_ac.ResetLstm()

            while (self.env.IsRunning()):
                # Take an action using probabilities from policy network output.
                a, v = self.local_ac.GetAction(sess, s)
                r = self.env.Act(a, frame_repeat)
                finished = not self.env.IsRunning()
                if (not finished):
                    s1 = self.env.Observation()
                    s1 = Preprocess(s1)
                else:
                    s1 = None

                episode_buffer.append([s, a, r, v])

                episode_reward += r
                s = s1

                lock.acquire()
                step += 1
                if (step % save_each == 0):
                    model_name_curr = self.model_name + "_{:04}".format(int(step / save_each))
                    print("\nSaving the network weights to:", model_name_curr, file=sys.stderr)
                    saver.save(sess, model_name_curr)

                    PrintStat(time.time() - start_time, step, step_num, train_scores)

                    train_scores = []

                if (step == step_num):
                    coord.request_stop()
                lock.release()

                # If the episode hasn't ended, but the experience buffer is full, then we
                # make an update step using that experience rollout.
                if (len(episode_buffer) == t_max or (finished and len(episode_buffer) > 0)):
                    # Since we don't know what the true final return is,
                    # we "bootstrap" from our current value estimation.
                    if (not finished):
                        v1 = self.local_ac.GetValue(sess, s)
                        self.Train(episode_buffer, sess, v1)
                        episode_buffer = []
                        # Re-sync the local network with the freshly updated global AC parameters.
                        sess.run(self.update_target_graph)
                    else:
                        self.Train(episode_buffer, sess, 0.0)

            lock.acquire()
            train_scores.append(episode_reward)
            lock.release()
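As referenced in Train above, here is a quick standalone check of what the two backward passes produce, assuming gamma = 0.99 and a made-up three-step rollout (all numbers are purely illustrative):

import numpy as np

gamma = 0.99

def discount(x, gamma):
    # Same backward accumulation as Worker.Discount.
    for idx in reversed(range(len(x) - 1)):
        x[idx] += x[idx + 1] * gamma
    return x

rewards = np.array([1.0, 0.0, 1.0])
values = np.array([0.9, 0.7, 0.8])
bootstrap_value = 0.5  # value estimate of the state the rollout stopped in

# Discounted returns R_t = r_t + gamma * R_{t+1}, seeded with the bootstrap value.
rewards_plus = np.append(rewards, bootstrap_value)
print(discount(rewards_plus.copy(), gamma)[:-1])   # [2.4652495  1.48005  1.495]

# One-step TD errors delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), then the same
# backward pass turns them into the accumulated advantages passed to the network.
value_plus = np.append(values, bootstrap_value)
deltas = rewards + gamma * value_plus[1:] - value_plus[:-1]
print(discount(deltas, gamma))

Process also relies on module-level shared state (lock, step, train_scores, start_time) that is not shown in these snippets. A minimal set of definitions consistent with how those names are used above (the initial values are assumptions):

import threading

lock = threading.Lock()   # serializes updates to the shared counters across worker threads
step = 0                  # global step counter, incremented by every worker
train_scores = []         # episode rewards gathered since the last PrintStat report
start_time = 0.0          # wall-clock start, set in Worker.Start()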
        if (not env.IsRunning()):
            break

        state_raw = env.Observation()

    # Release display and video resources.
    if test_write_video:
        out_video.release()
    cv2.destroyAllWindows()


if __name__ == '__main__':
    # Action spec of seekavoid_arena_01:
    """
    [{'max': 512, 'min': -512, 'name': 'LOOK_LEFT_RIGHT_PIXELS_PER_FRAME'},
     {'max': 512, 'min': -512, 'name': 'LOOK_DOWN_UP_PIXELS_PER_FRAME'},
     {'max': 1, 'min': -1, 'name': 'STRAFE_LEFT_RIGHT'},
     {'max': 1, 'min': -1, 'name': 'MOVE_BACK_FORWARD'},
     {'max': 1, 'min': 0, 'name': 'FIRE'},
     {'max': 1, 'min': 0, 'name': 'JUMP'},
     {'max': 1, 'min': 0, 'name': 'CROUCH'}]
    """
    if (lab):
        env = EnvLab(width=out_size_width, height=out_size_height, fps=out_fps, level=level)

    agent = Agent()
    Test(agent)
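DeepMind Lab expects a 7-component integer action vector matching the spec above, while the agent works with a small discrete action set, so the EnvLab wrapper has to map one onto the other. A hypothetical mapping for three discrete actions (the concrete action list and magnitudes are assumptions, not the repository's actual table):

import numpy as np

ACTIONS = [
    np.array([-20, 0, 0, 0, 0, 0, 0], dtype=np.intc),  # look left
    np.array([ 20, 0, 0, 0, 0, 0, 0], dtype=np.intc),  # look right
    np.array([  0, 0, 0, 1, 0, 0, 0], dtype=np.intc),  # move forward
]

def map_action(index):
    # Translate the agent's discrete choice into the Lab action vector.
    return ACTIONS[index]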
    state = Preprocess(state_raw)
    action = agent.Act(state)

    for _ in range(frame_repeat):
        if (test_display):
            cv2.imshow("frame-test", state_raw)
            cv2.waitKey(20)

        if (test_write_video):
            out_video.write(state_raw)

        reward = env.Act(action, 1)
        reward_total += reward

        if (not env.IsRunning()):
            break

        state_raw = env.Observation()


if __name__ == '__main__':
    if (lab):
        env = EnvLab(80, 80, 60, "seekavoid_arena_01")
    else:
        env = EnvVizDoom(vizdoom_scenario)

    agent = Agent()
    Test(agent)
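Preprocess is called on every raw observation before it reaches the network but is not shown in these snippets. A plausible sketch, assuming an RGB input frame, a single-channel network input, and an 80x80 target resolution (all of these are assumptions about the real helper, not its actual implementation):

import cv2
import numpy as np

def Preprocess(frame, width=80, height=80, channels=1):
    # Drop color if the network expects a single channel, then rescale to the
    # model's input resolution and normalize pixel values to [0, 1].
    if (channels == 1):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
    return frame.astype(np.float32) / 255.0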