def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
    print('Start evaluating!\n')
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # detect GPU
    env = gym.make('CartPole-v0').unwrapped  # see the gym docs for why envs are unwrapped; usually not needed here
    env.seed(1)  # set the env random seed
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = DQN(n_states=n_states,
                n_actions=n_actions,
                device="cpu",
                gamma=cfg.gamma,
                epsilon_start=cfg.epsilon_start,
                epsilon_end=cfg.epsilon_end,
                epsilon_decay=cfg.epsilon_decay,
                policy_lr=cfg.policy_lr,
                memory_capacity=cfg.memory_capacity,
                batch_size=cfg.batch_size)
    agent.load_model(saved_model_path + 'checkpoint.pth')
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/eval/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.eval_eps + 1):
        state = env.reset()  # reset the environment state
        ep_reward = 0
        for i_step in range(1, cfg.eval_steps + 1):
            action = agent.choose_action(state,
                                         train=False)  # choose an action for the current state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            state = next_state  # move to the next state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step, 'done: ', done)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # compute the exponentially smoothed (moving-average) reward
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] +
                                          0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    '''save rewards and related results'''
    save_results(rewards,
                 moving_average_rewards,
                 ep_steps,
                 tag='eval',
                 result_path=RESULT_PATH)
    print('Evaluation complete!')
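
The `agent.load_model` call above is assumed to restore the policy network from the saved `checkpoint.pth`; a minimal PyTorch sketch of such a method (the `policy_net` and `device` attribute names are assumptions, not shown in this snippet) could look like:

def load_model(self, path):
    # Load the saved state dict onto the agent's device and switch to eval mode
    state_dict = torch.load(path, map_location=self.device)
    self.policy_net.load_state_dict(state_dict)
    self.policy_net.eval()
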
Example 2
def test_dqn(env):
    agent = DQN(env, params)

    agent.load_model(sys.argv[1], sys.argv[2])  # load the trained model from the command-line arguments

    state = env.reset()  # Reset the environment to start from a fresh state
    state = np.reshape(state, (1, env.state_space))
    max_steps = 10000
    total_reward = 0

    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)

        state = np.reshape(next_state, (1, env.state_space))
        total_reward += reward
        time.sleep(0.1)  # small delay between steps (e.g. to make rendering watchable)
        if done:
            print(f'Score: {total_reward}, steps: {step}')
            break
    return
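
This snippet takes the model paths from `sys.argv`, so it is run with two command-line arguments (the exact file names depend on how `agent.load_model` saved the model). The `get_action` call is assumed to act greedily at test time; a minimal Keras-style sketch (the `model` attribute name is an assumption) would be:

def get_action(self, state):
    # Greedy evaluation: pick the action with the highest predicted Q-value
    q_values = self.model.predict(state, verbose=0)
    return np.argmax(q_values[0])
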
Example 3
def main():
    env = retro.make(game='Frogger-Genesis', use_restricted_actions=retro.Actions.DISCRETE)
    gamma = 0.99
    copy_step = 25
    num_actions = env.action_space.n
    num_states = len(env.observation_space.sample())
    hidden_units = [200, 200]
    max_experiences = 10000
    min_experiences = 100
    batch_size = 32
    lr = 1e-2
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/dqn/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)

    # For stable targets, train one network and copy its weights to TargetNet every copy_step steps
    TrainNet = DQN(num_actions=num_actions, gamma=gamma, max_experiences=max_experiences,
                   min_experiences=min_experiences, batch_size=batch_size, lr=lr,
                   hidden_units=hidden_units, num_states=num_states)
    TargetNet = DQN(num_actions=num_actions, gamma=gamma, max_experiences=max_experiences,
                    min_experiences=min_experiences, batch_size=batch_size, lr=lr,
                    hidden_units=hidden_units, num_states=num_states)

    # Loading check
    while True:
        if os.path.exists(save_dir):
            if input("\n\nWould you like to load the previous network weights? (y/n) ") == 'y':
                # load weights and copy to train net
                TargetNet.load_model(save_path)
                TrainNet.copy_weights(TargetNet)
                print("Loaded model weights...")
                break

            elif input("\nWould you like to delete the old checkpoints and start again? (y/n)") == 'y':
                shutil.rmtree(save_dir)
                print("Removed old checkpoint...")
                break
        else:
            break

    N = 50000
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.1

    # play N games
    for n in range(N):
        epsilon = max(min_epsilon, epsilon * decay)
        total_reward = play_game(env, TrainNet, TargetNet, epsilon, copy_step)
        total_rewards[n] = total_reward
        avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()

        with summary_writer.as_default():
            tf.summary.scalar("episode reward", total_reward, step=n)
            tf.summary.scalar("running avg reward(100)", avg_rewards, step=n)

        if n % 100 == 0:
            print("episode:", n, "episode reward:", total_reward,
                  "eps:", epsilon, "avg reward (last 100):", avg_rewards)

            # save the model weights
            TargetNet.save_model(save_path)

    print("avg reward for last 100 episodes:", avg_rewards)

    if create_video:
        make_video(env, TrainNet)

    env.close()
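
`TrainNet.copy_weights(TargetNet)` above synchronises the two networks so training resumes from the loaded weights, and `play_game` is expected to copy TrainNet into TargetNet every `copy_step` steps. A minimal TensorFlow sketch of such a `copy_weights` method (the `model` attribute name is an assumption) is:

def copy_weights(self, other):
    # Overwrite this network's variables with the other network's variables
    for own_var, other_var in zip(self.model.trainable_variables,
                                  other.model.trainable_variables):
        own_var.assign(other_var)
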