Code Example #1
File: Tennis.py  Project: Caruso33/19_udacity_drlnd
import numpy as np
import torch
from collections import deque

# Note: `env`, `brain_name`, `num_agents` and `model_dir` are assumed to be defined
# at module level in Tennis.py, and `Agent` is the project's DDPG agent class,
# imported elsewhere in the file (see the setup sketch after the function).


def execute_ddpg(ddpg_agent: Agent, num_episodes: int = 3000, max_episode_t: int = 2000, learn_each: int = 5,
                 consec_learn_iter: int = 10) -> list:
    """
    DDPG - Execution Algorithm Implementation
    :param ddpg_agent: agent in charge of controlling both the Actor and Critic neural networks' behaviour
    :param num_episodes: number of episodes the algorithm will train
    :param max_episode_t: maximum number of time steps to play at each episode
    :param learn_each: number of steps in a game before triggering the learning procedure
    :param consec_learn_iter: number of consecutive learning iterations
    :return: results obtained during the training procedure
    """
    # 1| Initialization
    global_score = []
    global_score_deque = deque(maxlen=100)

    # 2| Episode run
    for i_episode in range(1, num_episodes + 1):

        # 2.0| Initialization of episode
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        ddpg_agent.reset()

        # 2.1| Episode Run
        for t_step in range(max_episode_t):
            # 2.1.1| Agent decision and interaction
            actions = ddpg_agent.act(states)
            env_info = env.step(actions)[brain_name]

            # 2.1.2| Feedback on action
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # 2.1.3| Experience saving
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                ddpg_agent.memorize(state, action, reward, next_state, done)

            # 2.1.4| Update values
            scores += rewards
            states = next_states

            # 2.1.5| Agent learning
            if t_step % learn_each == 0:
                for _ in range(consec_learn_iter):
                    ddpg_agent.trigger_learning()

            # 2.1.6| Episode ending
            if np.any(dones):
                break

        # 2.2| Episode post-processing
        # 2.2.1| Scoring
        global_score_deque.append(np.max(scores))
        global_score.append(np.max(scores))

        if i_episode % 10 == 0:
            print('Episode {}\tTotal Average Score: {:.2f}\tMean: {:.2f}'.format(
                    i_episode, np.mean(global_score_deque), np.mean(scores)))

        if i_episode % 50 == 0:
            torch.save(ddpg_agent.actor_local.state_dict(),
                       model_dir + 'checkpoint__actor_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.actor_target.state_dict(),
                       model_dir + 'checkpoint__actor_target__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_local.state_dict(),
                       model_dir + 'checkpoint__critic_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_target.state_dict(),
                       model_dir + 'checkpoint__critic_target__episode_' + str(i_episode) + '.pth')

        if np.mean(global_score_deque) >= 0.5 and i_episode >= 100:
            print('\rEpisodes employed to complete the challenge: {}'.format(i_episode))

            torch.save(ddpg_agent.actor_local.state_dict(),
                       model_dir + 'checkpoint__actor_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.actor_target.state_dict(),
                       model_dir + 'checkpoint__actor_target__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_local.state_dict(),
                       model_dir + 'checkpoint__critic_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_target.state_dict(),
                       model_dir + 'checkpoint__critic_target__episode_' + str(i_episode) + '.pth')
            break

    return global_score
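
execute_ddpg relies on module-level names (env, brain_name, num_agents, model_dir) that are set up elsewhere in Tennis.py. Below is a minimal setup sketch, assuming the Unity Tennis environment from the Udacity DRLND project loaded via the unityagents package; the build path, model directory and the Agent constructor signature are illustrative assumptions, not taken from this file:

from unityagents import UnityEnvironment

env = UnityEnvironment(file_name='Tennis.app')   # path to the Tennis build (assumption)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)                # two competing tennis agents
state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size
model_dir = './models/'                          # checkpoint directory (assumption)

# The Agent constructor signature below is a guess for illustration only.
ddpg_agent = Agent(state_size=state_size, action_size=action_size, random_seed=0)
scores = execute_ddpg(ddpg_agent, num_episodes=3000)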
Code Example #2
def reset(self):
    """Reset each of the MADDPG sub-agents."""
    for agent in self.maddpg_agent:
        agent.reset()
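
For context, self.maddpg_agent is expected to be a collection of per-player Agent instances. A minimal sketch of a container class exposing that attribute (the class name and constructor arguments are illustrative assumptions, not taken from the source project):

class MultiAgentDDPG:
    """Hypothetical container holding one DDPG Agent per player."""

    def __init__(self, num_agents, state_size, action_size, random_seed=0):
        # one Agent per player; the Agent constructor signature is an assumption
        self.maddpg_agent = [Agent(state_size, action_size, random_seed)
                             for _ in range(num_agents)]

    def reset(self):
        # reset every sub-agent at the start of an episode
        for agent in self.maddpg_agent:
            agent.reset()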