Example #1
def ddpg_run(episodes=1000, seed=42):
    env = start_env()
    env_info = reset_env_info(env)
    
    state_size = get_state_size(env_info)
    action_size = get_action_size(env)
    
    print('Seed used:', seed)
    total_agents = get_total_agents(env_info)
    agent = DDPGAgent(total_agents, state_size, action_size, seed)
    
    scores = []
    scores_window = deque(maxlen=100)
    for episode in range(1, episodes+1):
        init_time = datetime.datetime.now()
        
        env_info = reset_env_info(env)
        score = np.zeros(total_agents)
        dones = np.zeros(total_agents)
        agent.reset()
        critic_losses = []
        actor_losses = []
        while not np.any(dones):
            states = env_info.vector_observations
            actions = agent.act(states, add_noise=True)
            env_info = env_step(env, actions)
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            critic_loss, actor_loss = agent.step(states, actions, rewards, next_states, dones)
            critic_losses.append(critic_loss)
            actor_losses.append(actor_loss)
            #print('\rActor Loss: {:.6f} - Critic Loss: {:.6f}'.format(actor_loss, critic_loss), end='')
                
            score += rewards

        scores_window.append(np.mean(score))
        scores.append(np.mean(score))
        print('Ep. {}/{} - Avg Global Score: {:.2f} - Avg Ep. Score: {:.2f} - Min Ep. Score: {:.2f} - Max Ep. Score: {:.2f} - Actor loss: {:.6f}, Critic loss: {:.6f} - time: {}'.format(
            episode, episodes, np.mean(scores_window), np.mean(score), np.min(score), np.max(score),
            np.mean(actor_losses), np.mean(critic_losses), datetime.datetime.now() - init_time))
            
        if np.mean(scores_window) >= 30.0 and episode >= 100:
            print('\nEnvironment solved (mean of 30.0 for 100 episodes) in {:d} episodes!\tAverage Score: {:.2f}'.format(episode, np.mean(scores_window)))
            
            torch.save(agent.actor_local.state_dict(), 'actor_local_checkpoint.pth')
            torch.save(agent.actor_target.state_dict(), 'actor_target_checkpoint.pth')
            
            torch.save(agent.critic_local.state_dict(), 'critic_local_checkpoint.pth')
            torch.save(agent.critic_target.state_dict(), 'critic_target_checkpoint.pth')
            break
    
    env.close()
    return scores
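
Example #1 relies on a few environment wrappers (start_env, reset_env_info, env_step, get_state_size, get_action_size, get_total_agents) that are not shown on this page. Below is a minimal sketch of what they could look like, assuming the unityagents API and the Reacher binary used in Example #2; the file path and helper names are only illustrative.

from unityagents import UnityEnvironment

def start_env():
    # Launch the Reacher environment; the binary path is an assumption.
    return UnityEnvironment(file_name='./Reacher.app')

def reset_env_info(env, train_mode=True):
    # Reset the environment and return the env_info for the default brain.
    brain_name = env.brain_names[0]
    return env.reset(train_mode=train_mode)[brain_name]

def env_step(env, actions):
    # Step all agents at once and return the resulting env_info.
    brain_name = env.brain_names[0]
    return env.step(actions)[brain_name]

def get_state_size(env_info):
    # Each row of vector_observations is one agent's state vector.
    return env_info.vector_observations.shape[1]

def get_action_size(env):
    # Action size comes from the default brain's action space.
    brain = env.brains[env.brain_names[0]]
    return brain.vector_action_space_size

def get_total_agents(env_info):
    # Number of agents controlled by the default brain.
    return len(env_info.agents)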
Example #2
def play():
    env = UnityEnvironment(file_name='./Reacher.app')

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space 
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))

    # create agent
    agent = DDPGAgent(state_size=state_size, action_size=action_size, seed=0)

    # load weights
    agent.policy_local.load_state_dict(torch.load('policy.pth'))

    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
    state = env_info.vector_observations[0]                # get the current state of the first agent
    score = 0                                              # initialize the score
    while True:
        action = agent.act(state, add_noise=False)         # select an action (no exploration noise)
        env_info = env.step(action)[brain_name]            # send the action to the environment
        next_state = env_info.vector_observations[0]       # get the next state
        reward = env_info.rewards[0]                       # get the reward
        done = env_info.local_done[0]                      # see if the episode has finished
        score += reward                                    # update the score
        state = next_state                                 # roll over the state to the next time step
        if done:                                           # exit loop if episode finished
            break
    print('Total score this episode: {}'.format(score))

    env.close()
def testAgent():
    print("Testing the Agent")
    agent = DDPGAgent(state_size=state_size,
                      action_size=action_size,
                      n_agents=n_agents,
                      seed=0,
                      pretrainedWeightsFile='checkpoint_actor.pth',
                      train=False)
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state
    score = np.zeros(n_agents)  # initialize the score
    while True:
        actions = agent.act(states)  # select an action
        env_info = env.step(actions)[brain_name]  # send the actions to the environment
        next_states = env_info.vector_observations  # get the next state
        rewards = env_info.rewards  # get the reward
        dones = env_info.local_done  # see if episode has finished
        score += np.array(rewards)  # update the score
        states = next_states  # roll over the state to next time step
        if np.any(dones):  # exit loop if episode finished
            break
    print("Score: {}".format(np.mean(score)))
    return score
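
Both examples assume a DDPGAgent with local/target actor and critic networks (the checkpoints saved in Example #1) and an act method that can toggle exploration noise. The agent itself is not shown here; the sketch below illustrates only the inference path of such an act method, assuming a PyTorch actor network, a noise process with a sample() method, and actions bounded in [-1, 1].

import numpy as np
import torch

def act(actor_local, states, noise=None):
    # Forward the states through the local actor in eval mode,
    # optionally add exploration noise, and clip to the assumed action range.
    states = torch.from_numpy(np.asarray(states, dtype=np.float32))
    actor_local.eval()
    with torch.no_grad():
        actions = actor_local(states).cpu().numpy()
    actor_local.train()
    if noise is not None:
        actions += noise.sample()
    return np.clip(actions, -1.0, 1.0)  # actions assumed to lie in [-1, 1]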