def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
    env, state_dim, n_actions = env_init()
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
    agent = PolicyGradient(state_dim, device=device, lr=cfg.policy_lr)
    agent.load_model(saved_model_path + 'checkpoint.pth')
    rewards = []
    moving_average_rewards = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/eval/" + SEQUENCE
    writer = SummaryWriter(log_dir)  # TensorBoard summary writer
    for i_episode in range(cfg.eval_eps):
        state = env.reset()
        ep_reward = 0
        for _ in count():
            action = agent.choose_action(state)  # choose an action for the current state
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            state = next_state
            if done:
                print('Episode:', i_episode, ' Reward:', ep_reward)
                break
        rewards.append(ep_reward)
        if i_episode == 0:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(
                0.9 * moving_average_rewards[-1] + 0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode + 1)
    writer.close()
    print('Complete evaluating!')
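# eval() above and the train() variants below assume an env_init() helper that
# builds the gym environment and reports its dimensions. A minimal sketch of
# such a helper (choosing CartPole-v0 and the seeding are assumptions, not
# taken from the original code):
import gym

def env_init():
    env = gym.make('CartPole-v0')
    env.seed(1)  # keep runs reproducible
    env = env.unwrapped
    state_dim = env.observation_space.shape[0]  # size of the observation vector
    n_actions = env.action_space.n              # number of discrete actions
    return env, state_dim, n_actions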
def train(cfg):
    env, n_states, n_actions = env_init()
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
    agent = PolicyGradient(n_states, device=device, lr=cfg.policy_lr)
    '''The *_pool lists below store the transition sequences used for the gradient update'''
    state_pool = []  # states collected over each batch of episodes
    action_pool = []
    reward_pool = []
    for i_episode in range(cfg.train_eps):
        state = env.reset()
        ep_reward = 0
        for t in count():
            action = agent.choose_action(state)  # choose an action for the current state
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            if done:
                reward = 0
            state_pool.append(state)
            action_pool.append(float(action))
            reward_pool.append(reward)
            state = next_state
            if done:
                print('Episode:', i_episode, ' Reward:', ep_reward)
                break
        # if i_episode % cfg.batch_size == 0:
        if i_episode > 0 and i_episode % 5 == 0:  # update every 5 episodes
            agent.update(reward_pool, state_pool, action_pool)
            state_pool = []  # clear the collected transitions for the next batch
            action_pool = []
            reward_pool = []
def train(cfg):
    env, state_dim, n_actions = env_init()
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
    agent = PolicyGradient(state_dim, device=device, lr=cfg.policy_lr)
    '''The *_pool lists below store the transition sequences used for the gradient update'''
    state_pool = []  # states collected over each batch_size episodes
    action_pool = []
    reward_pool = []
    '''Store each episode's reward for plotting'''
    rewards = []
    moving_average_rewards = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)  # TensorBoard summary writer
    for i_episode in range(cfg.train_eps):
        state = env.reset()
        ep_reward = 0
        for _ in count():
            action = agent.choose_action(state)  # choose an action for the current state
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            if done:
                reward = 0
            state_pool.append(state)
            action_pool.append(float(action))
            reward_pool.append(reward)
            state = next_state
            if done:
                print('Episode:', i_episode, ' Reward:', ep_reward)
                break
        if i_episode > 0 and i_episode % cfg.batch_size == 0:
            agent.update(reward_pool, state_pool, action_pool)
            state_pool = []  # clear the collected transitions for the next batch
            action_pool = []
            reward_pool = []
        rewards.append(ep_reward)
        if i_episode == 0:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(
                0.9 * moving_average_rewards[-1] + 0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode + 1)
    writer.close()
    print('Complete training!')
    save_model(agent, model_path=SAVED_MODEL_PATH)
    '''Save the rewards and related results'''
    save_results(rewards, moving_average_rewards, tag='train',
                 result_path=RESULT_PATH)
    plot(rewards)
    plot(moving_average_rewards, ylabel='moving_average_rewards_train')
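# A sketch of what agent.update(reward_pool, state_pool, action_pool) typically
# does in a REINFORCE-style agent. The single-sigmoid-output Bernoulli policy
# matches the action_pool.append(float(action)) call above, but the method body
# and the attribute names self.policy_net / self.optimizer / self.gamma /
# self.device are assumptions; in the real code this lives on PolicyGradient.
import torch
from torch.distributions import Bernoulli

def update(self, reward_pool, state_pool, action_pool):
    # Walk the pooled rewards backwards to compute discounted returns.
    # reward == 0 marks an episode boundary (set in the training loop),
    # so the running return is reset there.
    running_add = 0.0
    returns = [0.0] * len(reward_pool)
    for i in reversed(range(len(reward_pool))):
        if reward_pool[i] == 0:
            running_add = 0.0
        else:
            running_add = running_add * self.gamma + reward_pool[i]
        returns[i] = running_add

    # Normalize the returns to reduce gradient variance.
    returns = torch.tensor(returns, device=self.device)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    # Maximize sum_t log pi(a_t | s_t) * G_t, i.e. minimize its negative.
    self.optimizer.zero_grad()
    for state, action, G in zip(state_pool, action_pool, returns):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        prob = self.policy_net(state)  # probability of action 1 (CartPole has two actions)
        dist = Bernoulli(prob)
        loss = -dist.log_prob(torch.tensor(action, device=self.device)) * G
        loss.backward()  # gradients accumulate across the batch
    self.optimizer.step()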
def main():
    RENDER = False
    MAX_EXPLORE = 2000
    # env = gym.make('MountainCar-v0')
    env = gym.make('CartPole-v0')
    env.seed(1)
    env = env.unwrapped
    print(f"action_space: {env.action_space}")
    print(f"action_space.n: {env.action_space.n}")
    print(f"observation_space: {env.observation_space}")
    print(f"observation_space.shape: {env.observation_space.shape}")
    print(f"observation_space.high: {env.observation_space.high}")
    print(f"observation_space.low: {env.observation_space.low}")
    # Example output for MountainCar-v0 (the commented-out environment):
    # action_space: Discrete(3)
    # action_space.n: 3
    # observation_space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)
    # observation_space.shape: (2,)
    # observation_space.high: [0.6  0.07]
    # observation_space.low: [-1.2 -0.07]
    agent = PolicyGradient(n_features=env.observation_space.shape[0],
                           n_actions=env.action_space.n,
                           lr=0.001,
                           reward_decay=0.995)
    episodes = 3000
    total_reward = []
    for episode in range(episodes):
        s = env.reset()
        # s : np.ndarray, shape = (n_features,)
        for i in range(MAX_EXPLORE):
            if RENDER:
                env.render()
            a = agent.choose_action(s)
            # a : scalar (discrete action index)
            s_, r, done, _ = env.step(a)
            # s_ : np.ndarray, shape = (n_features,)
            # r : float
            # done : bool
            agent.store_transition(s, a, r)
            if done or (i + 1) == MAX_EXPLORE:
                ep_rs_sum = sum(agent.ep_r)
                total_reward.append(ep_rs_sum)
                avg_reward = sum(total_reward) / len(total_reward)
                print(f"Episode: {episode + 1}")
                print(f"\treward: {ep_rs_sum}, done: {done}")
                print(f"\tavg reward: {avg_reward}")
                vt = agent.learn()
                if avg_reward > 200:
                    RENDER = True
                if episode == 30:
                    plt.plot(vt)
                    plt.xlabel('episode steps')
                    plt.ylabel('normalized state-action value')
                    plt.show()
                break
            s = s_
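# main() above relies on agent.store_transition() filling per-episode buffers
# (agent.ep_r is read directly via sum(agent.ep_r)) and on agent.learn()
# returning the normalized discounted returns that get plotted as vt.
# A minimal sketch of that buffer side; the names ep_s / ep_a follow the
# ep_r convention above, everything else is an assumption:
import numpy as np

class PolicyGradientBuffers:
    def __init__(self, reward_decay=0.995):
        self.gamma = reward_decay
        self.ep_s, self.ep_a, self.ep_r = [], [], []

    def store_transition(self, s, a, r):
        # Buffer one transition until the episode ends.
        self.ep_s.append(s)
        self.ep_a.append(a)
        self.ep_r.append(r)

    def _discount_and_norm_rewards(self):
        # Discounted return G_t for every step of the finished episode...
        vt = np.zeros(len(self.ep_r), dtype=np.float64)
        running_add = 0.0
        for t in reversed(range(len(self.ep_r))):
            running_add = running_add * self.gamma + self.ep_r[t]
            vt[t] = running_add
        # ...normalized to zero mean / unit variance to stabilize training.
        vt -= vt.mean()
        vt /= vt.std() + 1e-8
        return vt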
DISPLAY_REWARD_THRESHOLD = 400  # render the environment if the total episode reward exceeds this threshold
RENDER = False  # rendering wastes time

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible; vanilla policy gradient has high variance
env = env.unwrapped

# # Basic info about the environment
# print(env.action_space)
# print(env.observation_space)
# print(env.observation_space.high)
# print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,
)

running_reward = 0
for i_episode in range(3000):
    observation = env.reset()
    while True:
        if RENDER:
            env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        RL.store_reward(reward)