def evaluate(
        self,
        obs_queue: deque,
        agent: Agent,
        num_episode: int = 3,
        render: bool = False,
) -> Tuple[float, List[GymImg]]:
    """evaluate uses the given agent to run the game for a few episodes and
    returns the average reward and the captured frames."""
    self.__env = self.__env_eval
    ep_rewards = []
    frames = []
    for _ in range(self.get_eval_lives() * num_episode):
        observations, ep_reward, _frames = self.reset(render=render)
        for obs in observations:
            obs_queue.append(obs)
        if render:
            frames.extend(_frames)

        done = False
        while not done:
            state = self.make_state(obs_queue).to(self.__device).float()
            action = agent.run(state)
            obs, reward, done = self.step(action)
            ep_reward += reward
            obs_queue.append(obs)
            if render:
                frames.append(self.get_frame())
        ep_rewards.append(ep_reward)

    self.__env = self.__env_train
    return np.sum(ep_rewards) / num_episode, frames
def evaluate(
        self,
        obs_queue: deque,
        agent: Agent,
        num_episode: int = 3,
        render: bool = False,
) -> Tuple[float, List[GymImg]]:
    # Runs the game with the given agent for get_eval_lives() lives per
    # episode over num_episode episodes, and returns the average reward and
    # the captured frames (no frames are actually captured unless render=True).
    """evaluate uses the given agent to run the game for a few episodes and
    returns the average reward and the captured frames."""
    self.__env = self.__env_eval
    ep_rewards = []
    frames = []
    for _ in range(self.get_eval_lives() * num_episode):
        observations, ep_reward, _frames = self.reset(render=render)  # initialize the evaluation environment
        for obs in observations:
            obs_queue.append(obs)
        if render:
            frames.extend(_frames)

        done = False
        while not done:  # start the evaluation run
            state = self.make_state(obs_queue).to(self.__device).float()
            action = agent.run(state)  # get the agent's next action
            obs, reward, done = self.step(action)  # get the next observation, the reward, and whether the life ended
            ep_reward += reward
            obs_queue.append(obs)
            if render:
                frames.append(self.get_frame())
        ep_rewards.append(ep_reward)  # accumulate per-life rewards

    self.__env = self.__env_train
    return np.sum(ep_rewards) / num_episode, frames  # return the average reward per episode
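# Note on the division above: each loop iteration plays out one *life*, not
# one full episode, so the per-life rewards are summed and divided by
# num_episode only. A tiny self-contained check of that accounting (the
# concrete numbers 5 and 3 are illustrative assumptions):
eval_lives, num_episode = 5, 3
per_life_rewards = [1.0] * (eval_lives * num_episode)  # one reward per life
avg = sum(per_life_rewards) / num_episode
assert avg == eval_lives * 1.0  # average reward of a full (5-life) episode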
MAX_STEPS = 5_000_000
EVALUATE_FREQ = 100_000

rand = random.Random()
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
os.mkdir(SAVE_PREFIX)  # directory for saving trained models

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)  # create an environment for running the Atari game
agent = Agent(  # create an agent
    env.get_action_dim(),  # number of actions in the game; there are three: left, right, and no-op
    device,      # device used for training
    GAMMA,
    new_seed(),
    EPS_START,   # initial value of epsilon
    EPS_END,     # minimum value of epsilon
    EPS_DECAY,   # rate at which epsilon decays
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)  # records the links between the agent's actions and their outcomes, used later to train the network

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS, ncols=50, leave=False,
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
if not os.path.exists(SAVE_PREFIX):
    os.mkdir(SAVE_PREFIX)

torch.manual_seed(new_seed())
# The number of threads here needs to be adjusted based on the number of CPU cores available.
torch.set_num_threads(4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)
agent = Agent(
    env.get_action_dim(),
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
    restore=restore,
    rlmodel=rlmodel,
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS, ncols=50, leave=False,
EVALUATE_FREQ = 100_00  # = 10_000

rand = random.Random()
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
os.mkdir(SAVE_PREFIX)  # create the directory for saving models

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
env = MyEnv(device)  # the environment
agent = Agent(  # the agent
    env.get_action_dim(),  # 3
    device,       # cuda
    GAMMA,        # 0.99
    new_seed(),
    EPS_START,    # 1
    EPS_END,      # 0.1
    EPS_DECAY,    # 1e6
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)  # initialize the replay buffer

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS, ncols=50, leave=False,
                   unit="b")  # progress-bar visualization
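# The epsilon parameters annotated above (EPS_START=1, EPS_END=0.1,
# EPS_DECAY=1e6) are typically consumed by a linear annealing schedule inside
# the Agent. A minimal sketch of that schedule, assuming linear decay (the
# project's Agent may implement it differently):
def epsilon_at(step: int,
               eps_start: float = 1.0,
               eps_end: float = 0.1,
               eps_decay: float = 1e6) -> float:
    """Exploration rate after `step` environment steps."""
    return max(eps_end, eps_start - (eps_start - eps_end) * step / eps_decay)

assert epsilon_at(0) == 1.0
assert abs(epsilon_at(500_000) - 0.55) < 1e-12
assert epsilon_at(2_000_000) == 0.1  # clamped at the floor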
MAX_STEPS = 50_000_000
EVALUATE_FREQ = 100_000

rand = random.Random()
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
os.mkdir(SAVE_PREFIX)

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)
agent = Agent(
    env.get_action_dim(),
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
)
# memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
memory = Experience({
    'size': MEM_SIZE,
    'batch_size': BATCH_SIZE,
    'learn_start': WARM_STEPS,
    'steps': MAX_STEPS,
    'device': device,
    'channels': STACK_SIZE + 1
})

#### Training ####
print('current lab:', ve, 'shade rate:', get_shade_time(action_queue))


if __name__ == "__main__":
    versions = vs0
    for version in versions:
        # set_trace()
        print(version)
        dueling = 'dueling' in version
        stable = 'stable' in version
        if stable:
            action_queue = []
        env = MyEnv(device)
        agent = Agent(env.get_action_dim(), device, GAMMA, new_seed(),
                      EPS_START, EPS_END, EPS_DECAY, dueling, pretrained,
                      stable * 0.1)
        if 'PER' in version:
            memory = PERMemory(STACK_SIZE + 1, MEM_SIZE, device)
            # memory = Memory_Buffer_PER(MEM_SIZE)
        else:
            memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
            # memory = Memory_Buffer_PER(MEM_SIZE)

        #### Training ####
        obs_queue: deque = deque(maxlen=5)
        done = True
        avg_reward_arr = []

        progressive = tqdm(range(MAX_STEPS),
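# The `dueling` flag above selects a dueling-DQN variant of the network. A
# minimal sketch of the standard dueling head, Q(s, a) = V(s) + A(s, a) -
# mean_a A(s, a); the project's actual architecture may differ:
import torch
import torch.nn as nn


class DuelingHead(nn.Module):
    def __init__(self, in_features: int, num_actions: int):
        super().__init__()
        self.value = nn.Linear(in_features, 1)                 # state value V(s)
        self.advantage = nn.Linear(in_features, num_actions)   # advantages A(s, a)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        v = self.value(x)
        a = self.advantage(x)
        # Subtracting the mean advantage keeps V and A identifiable.
        return v + a - a.mean(dim=1, keepdim=True)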
    if c == 0:
        return ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
    else:
        return PERMemory(STACK_SIZE + 1, MEM_SIZE, device)


if __name__ == "__main__":
    REWARD = []
    global memory
    c = 1
    while c >= 0:
        agent = Agent(
            env.get_action_dim(),
            device,
            GAMMA,
            new_seed(),
            EPS_START,
            EPS_END,
            EPS_DECAY,
            "./model_weights_b",
        )
        memory = choosememory(c)
        Reward = []
        for step in tqdm(range(MAX_STEPS), ncols=50, leave=False, unit="b"):
            if done:
                observations, _, _ = env.reset()
                for obs in observations:
                    obs_queue.append(obs)

            training = len(memory) > WARM_STEPS
            state = env.make_state(obs_queue).to(device).float()
            action, value_this = agent.run(state, training)  # the action chosen now and its Q-value
import os
from collections import deque

import torch

from utils_env import MyEnv
from utils_drl import Agent

# In[2]:

target = 78
model_name = f"model_{target:03d}"
model_path = f"./models/{model_name}"
device = torch.device("cpu")
env = MyEnv(device)
agent = Agent(env.get_action_dim(), device, 0.99, 0, 0, 0, 1, model_path,
              use_dueling=True, use_PR=True, use_DDQN=True)

# In[3]:

obs_queue = deque(maxlen=5)
avg_reward, frames = env.evaluate(obs_queue, agent, render=True)
print(f"Avg. Reward: {avg_reward:.1f}")

get_ipython().system('rm -r eval_*')
target_dir = f"eval_{target:03d}"
os.mkdir(target_dir)
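# The notebook presumably continues by dumping the captured frames into
# target_dir; a sketch under the assumption that each element of `frames` is
# a PIL-compatible image (GymImg) with a .save() method:
for i, frame in enumerate(frames):
    frame.save(os.path.join(target_dir, f"{i:06d}.png"))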
EVALUATE_FREQ = 100_000  # evaluation frequency: pause and evaluate every 100_000 steps

rand = random.Random()  # an independent random-number generator
rand.seed(GLOBAL_SEED)  # seeding makes the generated sequence reproducible
new_seed = lambda: rand.randint(0, 1000_000)  # draw an integer in [0, 1_000_000]
os.mkdir(SAVE_PREFIX)  # create the "./models" directory

torch.manual_seed(new_seed())  # seed torch's RNG with new_seed()
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # pick the device: GPU or CPU
device = torch.device("cpu")
env = MyEnv(device)
agent = Agent(  # initialize with the preset parameters
    env.get_action_dim(),  # returns 3; the actions are ["NOOP", "RIGHT", "LEFT"]
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)  # a circular queue; the arguments are channels, capacity, and device, with capacity MEM_SIZE = 100_000

#### Training ####
obs_queue: deque = deque(maxlen=5)  # create the observation queue
done = True

progressive = tqdm(
    range(MAX_STEPS),
    total=MAX_STEPS,  # expected number of iterations
    ncols=50,  # total width of the progress bar
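# A minimal sketch of what a channel-stacked circular buffer like
# ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device) might look like. The class
# name, field names, and 84x84 frame size are assumptions for illustration;
# the real implementation lives elsewhere in the project.
import torch


class SimpleReplayMemory:
    def __init__(self, channels: int, capacity: int, device: torch.device):
        self.capacity = capacity
        self.device = device
        # Each slot holds `channels` stacked 84x84 frames; state and next
        # state share frames, hence STACK_SIZE + 1 channels per slot.
        self.states = torch.zeros((capacity, channels, 84, 84), dtype=torch.uint8)
        self.actions = torch.zeros((capacity, 1), dtype=torch.long)
        self.rewards = torch.zeros((capacity, 1), dtype=torch.int8)
        self.dones = torch.zeros((capacity, 1), dtype=torch.bool)
        self.pos = 0
        self.size = 0

    def push(self, folded_state, action, reward, done):
        self.states[self.pos] = folded_state
        self.actions[self.pos, 0] = action
        self.rewards[self.pos, 0] = reward
        self.dones[self.pos, 0] = done
        self.pos = (self.pos + 1) % self.capacity  # circular overwrite
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size: int):
        idx = torch.randint(0, self.size, (batch_size,))
        state = self.states[idx, :-1].to(self.device).float()       # first STACK_SIZE frames
        state_next = self.states[idx, 1:].to(self.device).float()   # shifted by one frame
        return (state,
                self.actions[idx].to(self.device),
                self.rewards[idx].to(self.device).float(),
                state_next,
                self.dones[idx].to(self.device).float())

    def __len__(self):
        return self.size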
MAX_STEPS = 50_000_000
EVALUATE_FREQ = 100_000

rand = random.Random()
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
os.mkdir(SAVE_PREFIX)

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)
agent = Agent(
    env.get_action_dim(),
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS, ncols=50, leave=False,
                   unit="b")

for step in progressive:
    if done:
        observations, _, _ = env.reset()
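# The snippet above is cut off. A hedged sketch of how a DQN training loop of
# this shape typically continues; the constants POLICY_UPDATE and
# TARGET_UPDATE and the methods make_folded_state, learn, sync, and save are
# assumptions, not confirmed parts of this project:
for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS           # only learn after warm-up
    state = env.make_state(obs_queue).to(device).float()
    action = agent.run(state, training)           # epsilon-greedy action
    obs, reward, done = env.step(action)          # advance the emulator
    obs_queue.append(obs)
    memory.push(env.make_folded_state(obs_queue), action, reward, done)

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)           # one SGD step on a minibatch
    if step % TARGET_UPDATE == 0:
        agent.sync()                              # copy policy net -> target net
    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent)
        agent.save(os.path.join(SAVE_PREFIX, f"model_{step // EVALUATE_FREQ:03d}"))
        done = True                               # force a fresh episode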
MAX_STEPS = 10_000_0  # = 100_000
EVALUATE_FREQ = 100_0  # = 1_000

rand = random.Random()
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
# os.mkdir(SAVE_PREFIX)

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)
agent = Agent(
    env.get_action_dim(),
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
)

prioritized = True
if prioritized:
    memory = MemoryBufferPER(STACK_SIZE + 1, MEM_SIZE, device)
else:
    memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True
episode_reward = 0
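# A minimal sketch of the proportional prioritized sampling that a PER buffer
# such as MemoryBufferPER implements (Schaul et al., 2015). Real
# implementations use a sum-tree for O(log n) sampling; this linear version
# only illustrates the math, and the function name is an assumption.
import random


def per_sample(priorities, batch_size, alpha=0.6):
    """Sample indices with probability P(i) = p_i^alpha / sum_k p_k^alpha."""
    scaled = [p ** alpha for p in priorities]
    total = sum(scaled)
    probs = [s / total for s in scaled]
    return random.choices(range(len(priorities)), weights=probs, k=batch_size)


# Transitions with larger TD error are replayed more often:
print(per_sample([0.1, 0.1, 5.0, 0.1], batch_size=8))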