            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = HierarchicalDQNConfig()
    # train
    env, agent = env_agent_config(cfg, seed=1)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)
    agent.save(path=cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
    plot_rewards(rewards, ma_rewards, tag="train", algo=cfg.algo, path=cfg.result_path)
    # eval
    env, agent = env_agent_config(cfg, seed=10)
    agent.load(path=cfg.model_path)
    rewards, ma_rewards = eval(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards(rewards, ma_rewards, tag="eval", env=cfg.env, algo=cfg.algo, path=cfg.result_path)
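The `env_agent_config` helper used above is not shown in this snippet. A minimal sketch of what it might look like, assuming the CartPole-v0 environment and the `HierarchicalDQN` agent class used later in this chapter, and assuming the config stores the environment id in `cfg.env`:

import gym

def env_agent_config(cfg, seed=1):
    # build the environment and the hierarchical agent from one config object;
    # the exact attribute names are assumptions, not the repository's confirmed API
    env = gym.make(cfg.env)
    env.seed(seed)  # fix the environment's random seed for reproducibility
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = HierarchicalDQN(state_dim, action_dim, cfg)
    return env, agent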
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(
            i_episode + 1, cfg.train_eps, ep_reward, i_step, done))
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # compute the moving-average reward
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = DQNConfig()
    env = gym.make('CartPole-v0').unwrapped  # you can look up why gym envs are unwrapped; usually not needed here
    env.seed(1)  # set the env random seed
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = DQN(n_states, n_actions, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    agent.save(path=SAVED_MODEL_PATH)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards, ma_rewards, tag="train", algo=cfg.algo, path=RESULT_PATH)
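The `save_results` helper only has to persist the two reward lists. A minimal sketch, where the `.npy` filename pattern is an assumption rather than the repository's confirmed layout:

import os
import numpy as np

def save_results(rewards, ma_rewards, tag='train', path='./results/'):
    # store the per-episode rewards and their moving average as .npy files
    np.save(os.path.join(path, '{}_rewards.npy'.format(tag)), rewards)
    np.save(os.path.join(path, '{}_ma_rewards.npy'.format(tag)), ma_rewards)
    print('Results saved!')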
            ep_reward += reward
            one_ep_transition.append((state, action, reward))
            state = next_state
            if done:
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
        agent.update(one_ep_transition)
        if (i_episode + 1) % 10 == 0:
            print("Episode:{}/{}: Reward:{}".format(
                i_episode + 1, mc_cfg.n_episodes, ep_reward))
    return rewards, ma_rewards


if __name__ == "__main__":
    mc_cfg = MCConfig()
    env = RacetrackEnv()
    n_actions = 9
    agent = FisrtVisitMC(n_actions, mc_cfg)
    rewards, ma_rewards = mc_train(mc_cfg, env, agent)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards, ma_rewards, tag="train",
                 algo="On-Policy First-Visit MC Control", path=RESULT_PATH)
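`agent.update(one_ep_transition)` is where the first-visit Monte Carlo value update happens. A minimal sketch of the Q-value part of that update, assuming discrete, hashable states, a Q-table `self.Q`, visit counts `self.returns_count`, and a discount factor `self.gamma` (attribute names are assumptions):

def update(self, one_ep_transition):
    # one_ep_transition is [(state, action, reward), ...] for one full episode
    G = 0.0
    first_visit_returns = {}
    # iterate backwards; the last value written for each (state, action)
    # is the return from its *first* visit in the episode
    for state, action, reward in reversed(one_ep_transition):
        G = self.gamma * G + reward
        first_visit_returns[(state, action)] = G
    for (state, action), G in first_visit_returns.items():
        self.returns_count[(state, action)] += 1
        # incremental mean of the observed first-visit returns
        self.Q[state][action] += (G - self.Q[state][action]) / self.returns_count[(state, action)]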
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = TD3Config()
    env = gym.make(cfg.env)
    env.seed(cfg.seed)  # set seeds
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    td3 = TD3(state_dim, action_dim, max_action, cfg)
    cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/'
    td3.load(cfg.model_path)
    td3_rewards, td3_ma_rewards = eval(cfg.env, td3, cfg.seed)
    make_dir(cfg.result_path, cfg.model_path)
    save_results(td3_rewards, td3_ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards(
        {
            'td3_rewards': td3_rewards,
            'td3_ma_rewards': td3_ma_rewards,
        },
        tag="eval",
        env=cfg.env,
        algo=cfg.algo,
        path=cfg.result_path)
    # cfg.result_path = './TD3/results/HalfCheetah-v2/20210416-130341/'
    # agent.load(cfg.result_path)
    # eval(cfg.env, agent, cfg.seed)
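The `eval` routine called above just rolls out the loaded deterministic policy for a handful of episodes. A minimal sketch, assuming the TD3 agent exposes a `choose_action(state)` method that returns the noiseless action (the method name and episode count are assumptions):

import gym
import numpy as np

def eval(env_name, agent, seed, eval_episodes=10):
    # evaluate a trained policy on a freshly seeded copy of the environment
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)  # use a seed different from training
    rewards, ma_rewards = [], []
    for _ in range(eval_episodes):
        state, done, ep_reward = eval_env.reset(), False, 0.0
        while not done:
            action = agent.choose_action(np.array(state))  # deterministic action, no exploration noise
            state, reward, done, _ = eval_env.step(action)
            ep_reward += reward
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards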
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # detect GPU
        self.result_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/results/'  # path for saving results
        self.model_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/models/'  # path for saving models
        self.save = True  # whether to save figures


def env_agent_config(cfg, seed=1):
    env = gym.make(cfg.env_name)
    env.seed(seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = PPO(state_dim, action_dim, cfg)
    return env, agent


cfg = PPOConfig()
plot_cfg = PlotConfig()
# train
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create the folders for results and models
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
# eval
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path)
rewards, ma_rewards = eval(cfg, env, agent)
save_results(rewards, ma_rewards, tag='eval', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="eval")
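`make_dir` only needs to create the output folders if they do not exist yet. A minimal sketch of such a helper:

import os

def make_dir(*paths):
    # create every given directory (and its parents), ignoring ones that already exist
    for path in paths:
        os.makedirs(path, exist_ok=True)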
            # reward_pool.append(reward)
            state = next_state
            if done:
                print('Episode:', i_episode, 'Reward:', ep_reward)
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print("Complete Evaluating!")
    return rewards, ma_rewards


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env.seed(1)  # seed() fixes the start of the random number generator: the same seed yields the same random sequence on every run, and it only needs to be set once
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    cfg = PGConfig()
    agent = PolicyGradient(state_dim, cfg, device)
    # rewards, ma_rewards = eval(cfg, env, agent)
    rewards, ma_rewards = train(cfg, env, agent)
    agent.save_model(SAVED_MODEL_PATH)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards, ma_rewards, tag='train',
                 algo="Policy Gradient", path=RESULT_PATH)
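For context, the `PolicyGradient` agent trained here follows REINFORCE: after each episode it converts the stored rewards into discounted returns and increases the log-probability of actions that led to high returns. A minimal sketch of that update, assuming the agent buffers `self.log_probs` and `self.rewards` during the episode and holds `self.gamma` and an `self.optimizer` (names are assumptions):

import torch

def update(self):
    # compute the discounted return G_t for every step of the finished episode
    returns, G = [], 0.0
    for r in reversed(self.rewards):
        G = r + self.gamma * G
        returns.insert(0, G)
    returns = torch.tensor(returns)
    # normalizing the returns reduces the variance of the gradient estimate
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t
    loss = torch.stack([-log_prob * G for log_prob, G in zip(self.log_probs, returns)]).sum()
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    # clear the episode buffers for the next rollout
    self.log_probs, self.rewards = [], []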
            # the low-level replay buffer stores a transition at every environment step
            agent.memory.push(goal_state, action, intrinsic_reward,
                              np.concatenate([next_state, onehot_goal]), done)
            state = next_state
            agent.update()
        # the meta replay buffer stores one transition each time a high-level goal terminates
        agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
        print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(
            i_episode + 1, cfg.train_eps, ep_reward, agent.loss_numpy, agent.meta_loss_numpy))
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env.seed(1)
    cfg = HierarchicalDQNConfig()
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = HierarchicalDQN(state_dim, action_dim, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    save_results(rewards, ma_rewards, 'train', RESULT_PATH)
    plot_rewards(rewards, ma_rewards, 'train', RESULT_PATH)
    plot_losses(agent.losses, cfg.algo, RESULT_PATH)
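The goal handling that feeds `agent.memory.push` above is the core of hierarchical DQN: the meta-controller picks a goal, the goal is one-hot encoded and concatenated to the observation, and the low-level controller receives an intrinsic reward only while that goal is satisfied. A minimal sketch of those pieces (the helper name `to_onehot` and the goal predicate are illustrative assumptions, not the repository's exact API):

import numpy as np

def to_onehot(goal, goal_dim):
    # encode the discrete goal chosen by the meta-controller as a one-hot vector
    onehot = np.zeros(goal_dim, dtype=np.float32)
    onehot[goal] = 1.0
    return onehot

# the low-level controller acts on the state augmented with the current goal
onehot_goal = to_onehot(goal=2, goal_dim=4)               # hypothetical goal index
goal_state = np.concatenate([np.zeros(4), onehot_goal])   # state ++ one-hot goal
goal_reached = False                                      # placeholder goal predicate checked each step
intrinsic_reward = 1.0 if goal_reached else 0.0           # reward 1 only when the goal is reached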
            ep_reward += reward
            ep_steps += 1
            if done:
                break
        rewards.append(ep_reward)
        steps.append(ep_steps)
        if ma_rewards:
            # moving-average reward: a weighted combination of the previous average and this episode's reward
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
        print("Episode:{}/{}; reward:{}".format(i_episode + 1, cfg.train_eps, ep_reward))
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = QlearningConfig()
    env = gym.make("CliffWalking-v0")  # actions: 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(env)
    # env is wrapped by CliffWalkingWapper, so the underlying attributes are accessed via env.env
    action_dim = env.env.action_space.n
    agent = QLearning(action_dim, cfg)
    rewards, ma_rewards = train(cfg, env, agent, False)
    eval(cfg, env, agent, True)
    agent.save(path=SAVED_MODEL_PATH)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards, ma_rewards, tag='train',
                 algo='Q-Learning', path=RESULT_PATH)
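The `QLearning` agent updated during this loop implements the standard off-policy TD(0) rule Q(s,a) ← Q(s,a) + α[r + γ max_a' Q(s',a') − Q(s,a)]. A minimal, self-contained sketch of its core methods, assuming an ε-greedy behaviour policy and a defaultdict Q-table (the class and attribute names here are illustrative, not the repository's exact ones):

import numpy as np
from collections import defaultdict

class QLearningSketch:
    def __init__(self, action_dim, lr=0.1, gamma=0.9, epsilon=0.1):
        self.action_dim = action_dim
        self.lr = lr              # learning rate alpha
        self.gamma = gamma        # discount factor
        self.epsilon = epsilon    # exploration rate
        self.Q = defaultdict(lambda: np.zeros(action_dim))

    def choose_action(self, state):
        # epsilon-greedy behaviour policy: explore with probability epsilon
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.action_dim)
        return int(np.argmax(self.Q[state]))

    def update(self, state, action, reward, next_state, done):
        # off-policy TD target uses the greedy (max) action in the next state
        target = reward if done else reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state][action] += self.lr * (target - self.Q[state][action])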