Example #1
    state = trajectory_collector.last_states
    is_random_run = [0, 1, 2]

    for is_random in is_random_run:
        print(f"Staring {'' if is_random else 'non' } random run...")
        total_rewards = []
        avg_episode_length = 0
        episode_lengths = []
        for i_run in range(NUM_RUNS):
            sum_reward = 0
            ep = 0
            while True:
                ep += 1
                if is_random == 1:
                    # Use the trained agent's policy.
                    actions = agent.act(state).cpu().numpy()
                elif is_random == 2:
                    # Random actions for the first three dimensions, fixed 0.5 for the last.
                    actions = np.r_[np.random.randn(3), [0.5]]
                else:
                    # Fully random actions in all four dimensions.
                    actions = np.random.randn(4)

                next_states, rewards, dones = trajectory_collector.next_observation(
                    actions)

                sum_reward += rewards.cpu().numpy().sum()

                state = next_states
                if np.any(dones.cpu().numpy()):
                    trajectory_collector.reset()
                    state = trajectory_collector.last_states
                    total_rewards.append(sum_reward)
                    episode_lengths.append(ep)
                    break
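
        # Hypothetical summary, not part of the original snippet: once all
        # NUM_RUNS runs for this mode have finished, report the collected statistics.
        print(f"Mode {is_random}: mean reward {np.mean(total_rewards):.2f}, "
              f"mean episode length {np.mean(episode_lengths):.1f}")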
Example #2
select_device(0)

print("GPU available: {}".format(torch.cuda.is_available()))
print("GPU tensor test: {}".format(torch.rand(3, 3).cuda()))

agent = PPOAgent(config)

random_seed()
config = agent.config

agent.actor_critic.load_state_dict(
    torch.load('../checkpoints/ppo_checkpoint.pth'))

score = 0  # cumulative score across the evaluation episodes

for i in range(3):
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    for j in range(2000):
        action = agent.act(state)
        env_info = env.step(action.cpu().detach().numpy())[brain_name]
        next_state = env_info.vector_observations  # get the next state
        reward = env_info.rewards[0]  # get the reward
        done = env_info.local_done[0]  # see if episode has finished
        state = next_state
        score += reward
        print('\rScore: {:.2f}'.format(score), end="")
        if done:
            break

env.close()
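
# The random_seed() call above refers to a helper that is not shown in this
# snippet; the sketch below is an illustrative seeding utility only, not the
# project's actual implementation.
import random

def random_seed(seed=0):
    # Seed Python, NumPy and PyTorch (CPU and all GPUs) for reproducibility.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
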
def experiment(hidden_size=64,
               lr=3e-4,
               num_steps=2048,
               mini_batch_size=32,
               ppo_epochs=10,
               threshold_reward=10,
               max_episodes=15,
               nrmlz_adv=True,
               gamma=0.99,
               tau=0.95,
               clip_gradients=True):
    '''
    Runs a PPO training experiment on the Unity Reacher environment.

    :param hidden_size: number of neurons in the hidden layers of the model
    :param lr: learning rate
    :param num_steps: maximum number of environment steps collected per episode
    :param mini_batch_size: mini-batch size for the PPO update
    :param ppo_epochs: number of optimization epochs per PPO update
    :param threshold_reward: average score at which the environment counts as solved
    :param max_episodes: maximum number of training episodes
    :param nrmlz_adv: True if advantages should be normalized before the PPO update
    :param gamma: discount factor for rewards
    :param tau: GAE parameter (lambda) for advantage estimation
    :param clip_gradients: True if gradients should be clipped after the PPO update
    :return: scores_window, test_rewards and moving_averages
    '''

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    scores_window = deque(maxlen=100)
    test_rewards = []
    moving_averages = []

    env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = PPOAgent(learning_rate=lr,
                     state_size=state_size,
                     action_size=action_size,
                     hidden_size=hidden_size,
                     num_agents=num_agents,
                     random_seed=0,
                     ppo_epochs=ppo_epochs,
                     mini_batch_size=mini_batch_size,
                     normalize_advantages=nrmlz_adv,
                     clip_gradients=clip_gradients,
                     gamma=gamma,
                     tau=tau,
                     device=device)

    #    while episode < max_episodes and not early_stop:
    for episode in tqdm(range(max_episodes)):
        log_probs = []
        values = []
        states_list = []
        actions_list = []
        rewards = []
        masks = []
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        for duration in range(num_steps):

            state = torch.FloatTensor(state).to(device)
            action, value, log_prob = agent.act(state)
            env_info = env.step(action.cpu().data.numpy())[
                brain_name]  # send all actions to the environment

            next_state = env_info.vector_observations  # get next state (for each agent)
            reward = env_info.rewards  # get reward (for each agent)
            dones = np.array(env_info.local_done)  # see if episode finished

            log_probs.append(log_prob)
            values.append(value)
            reward_t = torch.FloatTensor(reward).unsqueeze(1).to(device)
            masks_t = torch.FloatTensor(1 - dones).to(device)  # keep masks on the same device as the rewards
            rewards.append(reward_t)
            masks.append(masks_t)
            states_list.append(state)
            actions_list.append(action)

            state = next_state

            if np.any(dones):
                break

        next_state = torch.FloatTensor(state).to(device)
        _, next_value, _ = agent.act(next_state)
        agent.step(states=states_list,
                   actions=actions_list,
                   values=values,
                   log_probs=log_probs,
                   rewards=rewards,
                   masks=masks,
                   next_value=next_value)

        test_mean_reward = test_agent(env, brain_name, agent, device)
        test_rewards.append(test_mean_reward)
        scores_window.append(test_mean_reward)
        moving_averages.append(np.mean(scores_window))
        print('Episode {}, Total score this episode: {:.2f}, Last {} average: {:.2f}'.
              format(episode, test_mean_reward, min(episode + 1, 100),
                     np.mean(scores_window)))
        if np.mean(scores_window) > threshold_reward:
            agent.save_model(
                f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}_r{threshold_reward}_e{episode}_adv{nrmlz_adv}_{test_mean_reward}.pth"
            )
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(episode, np.mean(scores_window)))
            break

    env.close()
    return scores_window, test_rewards, moving_averages
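
A hypothetical driver call for the function above; the hyperparameter values are illustrative only, not the settings used to solve the environment.

scores_window, test_rewards, moving_averages = experiment(
    hidden_size=64, lr=3e-4, threshold_reward=30, max_episodes=150)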
Example #4
                          num_processes=1,
                          obs_shape=obs_shape)
step = 0
episode = 0
ppo_update = 0
total_reward = 0
done = True

while True:  # episode loop
    if done:
        env_info = env.reset(train_mode=train_mode,
                             config=config)[default_brain]
    obs = env_info.observations[0]
    obs = img_to_tensor(obs)
    while True:  # step loop within the current episode
        action, action_log_prob, value = agent.act(obs)
        #action_cpu = action.data.numpy()
        action_cuda = action.data.cpu().numpy()
        #print(action_cuda)

        env_info = env.step(action_cuda)[default_brain]
        done = env_info.local_done[0]
        reward = torch.cuda.FloatTensor([env_info.rewards[0]])
        total_reward += env_info.rewards[0]

        mask = 0 if env_info.local_done[0] else 1
        mask = torch.cuda.FloatTensor([mask])
        rollouts.insert(step, obs.data, action.data, action_log_prob.data,
                        value.data, reward, mask)
        step += 1
        obs = env_info.observations[0]
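
The img_to_tensor helper used above is not shown on this page. The sketch below is illustrative only: it assumes obs is a NumPy array of visual observations with shape (N, H, W, C) and values in [0, 1].

def img_to_tensor(obs):
    # Convert the observation batch to a float32 tensor in NCHW order and move
    # it to the GPU, matching the torch.cuda.FloatTensor usage in the loop above.
    t = torch.from_numpy(np.asarray(obs, dtype=np.float32))
    t = t.permute(0, 3, 1, 2).contiguous()
    return t.cuda() if torch.cuda.is_available() else t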