def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
    env, state_dim, n_actions = env_init()
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # 检测gpu
    agent = PolicyGradient(state_dim, device=device, lr=cfg.policy_lr)
    agent.load_model(saved_model_path + 'checkpoint.pth')
    rewards = []
    moving_average_rewards = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/eval/" + SEQUENCE
    writer = SummaryWriter(log_dir)  # TensorBoard writer
    for i_episode in range(cfg.eval_eps):
        state = env.reset()
        ep_reward = 0
        for _ in count():
            action = agent.choose_action(state)  # choose an action from the current state
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            state = next_state
            if done:
                print('Episode:', i_episode, ' Reward:', ep_reward)
                break
        rewards.append(ep_reward)
        if i_episode == 0:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] +
                                          0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode + 1)
    writer.close()
    print('Evaluation complete!')
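The helper env_init() used throughout these examples is not shown. Below is a minimal sketch of what it presumably does (build the Gym environment and return it together with the state and action dimensions, matching the unpacking used above); the environment name and seed are assumptions and may differ from the original repository.

import gym

def env_init(env_name='CartPole-v0', seed=1):
    """Hypothetical helper: create the environment and return its dimensions."""
    env = gym.make(env_name)
    env.seed(seed)                              # reproducible rollouts (old Gym API)
    state_dim = env.observation_space.shape[0]  # e.g. 4 for CartPole-v0
    n_actions = env.action_space.n              # e.g. 2 for CartPole-v0
    return env, state_dim, n_actions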
Example #2
def train(cfg):
    env, n_states, n_actions = env_init()
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # 检测gpu
    agent = PolicyGradient(n_states, device=device, lr=cfg.policy_lr)
    '''The *_pool lists below hold the transition sequences used for the policy-gradient update'''
    state_pool = []  # states collected over every batch of episodes
    action_pool = []
    reward_pool = []
    for i_episode in range(cfg.train_eps):
        state = env.reset()
        ep_reward = 0
        for t in count():
            action = agent.choose_action(state)  # choose an action from the current state
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            if done:
                reward = 0
            state_pool.append(state)
            action_pool.append(float(action))
            reward_pool.append(reward)
            state = next_state
            if done:
                print('Episode:', i_episode, ' Reward:', ep_reward)
                break
        # update every 5 episodes (hard-coded batch size; cfg.batch_size could be used instead)
        # if i_episode % cfg.batch_size == 0:
        if i_episode > 0 and i_episode % 5 == 0:
            agent.update(reward_pool, state_pool, action_pool)
            state_pool = []  # clear the pools after the update
            action_pool = []
            reward_pool = []
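agent.update() is defined elsewhere in the repository. The sketch below shows a standard REINFORCE update over the pooled transitions (discounted returns reset at episode boundaries, normalized, negative log-probability loss); it relies on the reward = 0 sentinel set on done in the loop above. The policy_net/optimizer names and the Bernoulli policy (two CartPole actions, actions stored as floats) are assumptions, not the repository's exact code.

import torch
from torch.distributions import Bernoulli

def reinforce_update(policy_net, optimizer, device,
                     reward_pool, state_pool, action_pool, gamma=0.99):
    """Hypothetical REINFORCE update over pooled transitions."""
    # Discounted returns, walking backwards; reward == 0 marks an episode
    # boundary because the training loop sets reward = 0 when done.
    running_add = 0.0
    returns = [0.0] * len(reward_pool)
    for i in reversed(range(len(reward_pool))):
        running_add = 0.0 if reward_pool[i] == 0 else running_add * gamma + reward_pool[i]
        returns[i] = running_add
    returns_t = torch.tensor(returns, dtype=torch.float32, device=device)
    # Normalize the returns to reduce the variance of the gradient estimate.
    returns_t = (returns_t - returns_t.mean()) / (returns_t.std() + 1e-8)

    optimizer.zero_grad()
    for state, action, g in zip(state_pool, action_pool, returns_t):
        state_t = torch.as_tensor(state, dtype=torch.float32, device=device)
        prob = policy_net(state_t)          # assumed: probability of taking action 1
        dist = Bernoulli(prob)
        # Policy-gradient loss: -log pi(a|s) * G_t; gradients accumulate over the batch.
        (-dist.log_prob(torch.tensor(action, device=device)) * g).backward()
    optimizer.step()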
Example #3
def train(cfg):
    env, state_dim, n_actions = env_init()
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # 检测gpu
    agent = PolicyGradient(state_dim, device=device, lr=cfg.policy_lr)
    '''The *_pool lists below hold the transition sequences used for the policy-gradient update'''
    state_pool = []  # states collected over every batch_size episodes
    action_pool = []
    reward_pool = []
    '''Store each episode's reward for plotting'''
    rewards = []
    moving_average_rewards = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)  # TensorBoard writer
    for i_episode in range(cfg.train_eps):
        state = env.reset()
        ep_reward = 0
        for _ in count():
            action = agent.choose_action(state)  # choose an action from the current state
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            if done:
                reward = 0
            state_pool.append(state)
            action_pool.append(float(action))
            reward_pool.append(reward)
            state = next_state
            if done:
                print('Episode:', i_episode, ' Reward:', ep_reward)
                break
        if i_episode > 0 and i_episode % cfg.batch_size == 0:
            agent.update(reward_pool, state_pool, action_pool)
            state_pool = []  # clear the pools after the update
            action_pool = []
            reward_pool = []
        rewards.append(ep_reward)
        if i_episode == 0:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] +
                                          0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode + 1)
    writer.close()
    print('Training complete!')
    save_model(agent, model_path=SAVED_MODEL_PATH)
    '''Save rewards and related results'''
    save_results(rewards,
                 moving_average_rewards,
                 tag='train',
                 result_path=RESULT_PATH)
    plot(rewards)
    plot(moving_average_rewards, ylabel='moving_average_rewards_train')
Example #4
def main():
    RENDER = False
    MAX_EXPLORE = 2000

    # env = gym.make('MountainCar-v0')
    env = gym.make('CartPole-v0')
    env.seed(1)
    env = env.unwrapped

    print(f"action_space: {env.action_space}")
    print(f"action_space.n: {env.action_space.n}")
    print(f"observation_space: {env.observation_space}")
    print(f"observation_space.shape: {env.observation_space.shape}")
    print(f"observation_space.high: {env.observation_space.high}")
    print(f"observation_space.low: {env.observation_space.low}")
    # Sample output below is from the commented-out MountainCar-v0 environment;
    # CartPole-v0 instead has Discrete(2) actions and a 4-dimensional observation space.
    # action_space: Discrete(3)
    # action_space.n: 3
    # observation_space:
    #     Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)
    # observation_space.shape: (2,)
    # observation_space.high: [0.6  0.07]
    # observation_space.low: [-1.2  -0.07]

    agent = PolicyGradient(n_features=env.observation_space.shape[0],
                           n_actions=env.action_space.n,
                           lr=0.001,
                           reward_decay=0.995)

    episodes = 3000
    total_reward = []
    for episode in range(episodes):
        s = env.reset()
        # s : ndarray of shape (obs_dim,) -- (4,) for CartPole-v0, (2,) for MountainCar-v0
        for i in range(MAX_EXPLORE):
            if RENDER:
                env.render()
            a = agent.choose_action(s)
            # a : scalar
            s_, r, done, _ = env.step(a)
            # s_ : ndarray of shape (obs_dim,)
            # r : float
            # done : bool
            agent.store_transition(s, a, r)

            if done or (i + 1) == MAX_EXPLORE:
                ep_rs_sum = sum(agent.ep_r)
                total_reward.append(ep_rs_sum)
                avg_reward = sum(total_reward) / len(total_reward)
                print(f"Episode: {episode+1}")
                print(f"\treward: {ep_rs_sum}, done: {done}")
                print(f"\tavg reward: {avg_reward}")
                vt = agent.learn()

                if avg_reward > 200:
                    RENDER = True

                if episode == 30:
                    plt.plot(vt)
                    plt.xlabel('episode steps')
                    plt.ylabel('normalized state-action value')
                    plt.show()
                break

            s = s_
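store_transition() and learn() belong to the PolicyGradient class, which is not shown in this snippet. The sketch below shows the discounted-and-normalized return computation that learn() presumably produces as vt (the quantity plotted at episode 30 above); the helper name is hypothetical, and gamma follows the reward_decay=0.995 passed to the agent.

import numpy as np

def discount_and_normalize_rewards(ep_r, gamma=0.995):
    """Hypothetical helper: normalized discounted returns, as plotted via vt."""
    vt = np.zeros(len(ep_r), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(ep_r))):
        running_add = running_add * gamma + ep_r[t]  # G_t = r_t + gamma * G_{t+1}
        vt[t] = running_add
    # Zero-mean, unit-variance normalization stabilizes the policy-gradient update.
    vt -= vt.mean()
    vt /= (vt.std() + 1e-8)
    return vt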
Example #5
DISPLAY_REWARD_THRESHOLD = 400  # render the environment if the total episode reward exceeds this threshold
RENDER = False  # rendering wastes time

env = gym.make('CartPole-v0')
env.seed(1)     # reproducible, vanilla Policy gradient has high variance
env = env.unwrapped

# Basic info about the environment:
# print(env.action_space)
# print(env.observation_space)
# print(env.observation_space.high)
# print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,
)


running_reward = 0
for i_episode in range(3000):
    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        RL.store_reward(reward)