Example #1
                next_state, reward, done, _ = envs.step(action.cpu().numpy())

                log_prob = dist.log_prob(action)
                agent.entropy += dist.entropy().mean()

                agent.append_log_prob(log_prob)
                agent.append_value(value)
                agent.append_reward(reward)
                agent.append_done(done)

                agent.append_state(state)
                agent.append_action(action)

                state = next_state
                idx += 1

                # every 1000 steps: evaluate over 100 test episodes, keep the best
                # model, and flag early stopping once the threshold score is reached
                if idx % 1000 == 0:
                    score = np.mean([agent.test_env(env) for _ in range(100)])
                    print(idx, score)
                    scores.append(score)
                    if score > best_avg_score:
                        best_avg_score = score
                        agent.save_model('../models/ppo/model.pt')
                        print('Saved best model')
                    if score > args.threshold_score:
                        early_stop = True

            agent.train(next_state)

        plot_and_save_scores(scores, args.max_frames / 1000, args)
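Example #1 calls plot_and_save_scores but does not define it. Below is a minimal sketch of such a helper, assuming matplotlib and that one evaluation score is recorded per 1000 frames; the output path is a placeholder.

import matplotlib.pyplot as plt
import numpy as np


def plot_and_save_scores(scores, num_points, args):
    # scores holds one evaluation result per 1000 frames, so the x-axis is
    # frames in thousands; num_points (= args.max_frames / 1000) bounds the axis.
    # args is accepted only to match the call site; it is unused in this sketch.
    xs = np.arange(1, len(scores) + 1)
    plt.figure()
    plt.plot(xs, scores)
    plt.xlim(0, num_points)
    plt.xlabel('Frames (x1000)')
    plt.ylabel('Average score over 100 test episodes')
    plt.title('PPO training progress')
    plt.savefig('../models/ppo/scores.png')  # placeholder path, next to the saved model above
    plt.close()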
Example #2
def paralle_train(args):
    logger = SummaryWriter(log_dir='results/{}_{}_{}'.format(
        args.env, args.seed,
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    env = gym.make(args.env)
    env_params = get_env_params(env, args)
    env.close()

    agent = PPOAgent(args, env_params)
    workers, parent_conns, children_conns = workers_initialize(args)

    obs = np.zeros(shape=[args.num_worker, 4, 84, 84], dtype=np.float32)

    # initialize the observation normalizer with transitions from a random policy
    print('Start initializing obs normalizer...')
    next_obs_batch = []
    for step in range(args.initialize_episode * args.max_episode_step):
        actions = np.random.randint(0,
                                    env_params['a_dim'],
                                    size=(args.num_worker))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)
        for parent_conn in parent_conns:
            obs_, r, done, info = parent_conn.recv()
            next_obs_batch.append(obs_)

        if len(next_obs_batch) % (10 * args.num_worker) == 0:
            next_obs_batch = np.stack(next_obs_batch)
            agent.normalizer_obs.update(next_obs_batch)
            next_obs_batch = []
    print('Finished initializing obs normalizer...')

    log_reward_ex = 0
    log_reward_in = 0
    log_step = 0
    log_episode = 0
    for i_epoch in range(args.max_epoch):
        epoch_obs, epoch_action, epoch_ri, epoch_re, epoch_mask, epoch_next_obs, epoch_logprob = [], [], [], [], [], [], []
        for i_step in range(args.rollout_len):
            actions, log_probs = agent.choose_action(obs)

            for action, parent_conn in zip(actions, parent_conns):
                parent_conn.send(action)

            batch_re, batch_mask, batch_next_obs = [], [], []
            for parent_conn in parent_conns:
                obs_, r_e, done, info = parent_conn.recv()

                batch_next_obs.append(obs_)
                batch_re.append(r_e)
                batch_mask.append(0 if done else 1)

            batch_next_obs = np.stack(batch_next_obs)
            batch_re = np.stack(batch_re)
            batch_mask = np.stack(batch_mask)
            # intrinsic exploration reward from the RND module (its loss is
            # logged below as Loss/loss_RND)
            batch_ri = agent.compute_intrinsic_reward(batch_next_obs.copy())

            # logging: accumulate per-step rewards for one tracked worker and
            # write per-episode returns to TensorBoard
            log_reward_ex += batch_re[args.log_env_idx]
            log_reward_in += batch_ri[args.log_env_idx]
            log_step += 1
            if batch_mask[args.log_env_idx] == 0:
                log_episode += 1
                logger.add_scalar('Indicator/Reward_ex', log_reward_ex,
                                  log_episode)
                logger.add_scalar('Indicator/Reward_in', log_reward_in,
                                  log_episode)
                log_reward_ex = 0
                log_reward_in = 0

            epoch_obs.append(obs)
            epoch_action.append(actions)
            epoch_next_obs.append(batch_next_obs)
            epoch_ri.append(batch_ri)
            epoch_re.append(batch_re)
            epoch_mask.append(batch_mask)
            epoch_logprob.append(log_probs)

            obs = batch_next_obs[:, :, :, :]

        epoch_obs = np.stack(epoch_obs)
        epoch_action = np.stack(epoch_action)
        epoch_ri = np.stack(epoch_ri)
        epoch_re = np.stack(epoch_re)
        epoch_mask = np.stack(epoch_mask)
        epoch_next_obs = np.stack(epoch_next_obs)
        epoch_logprob = np.stack(epoch_logprob)

        # reorder the rollout buffers from [rollout_len, num_worker, ...] to
        # [num_worker, rollout_len, ...] before the update
        epoch_obs = np.transpose(epoch_obs, axes=[1, 0, 2, 3, 4])
        epoch_action = np.transpose(epoch_action, axes=[1, 0])
        epoch_ri = np.transpose(epoch_ri, axes=[1, 0])
        epoch_re = np.transpose(epoch_re, axes=[1, 0])
        epoch_mask = np.transpose(epoch_mask, axes=[1, 0])
        epoch_next_obs = np.transpose(epoch_next_obs, axes=[1, 0, 2, 3, 4])
        epoch_logprob = np.transpose(epoch_logprob, axes=[1, 0])

        loss_rnd, loss_a, loss_c = agent.update(epoch_obs, epoch_action,
                                                epoch_ri, epoch_re, epoch_mask,
                                                epoch_next_obs, epoch_logprob)

        used_sample_num = args.rollout_len * args.num_worker * i_epoch
        logger.add_scalar('Loss/loss_RND', loss_rnd, used_sample_num)
        logger.add_scalar('Loss/loss_a', loss_a, used_sample_num)
        logger.add_scalar('Loss/loss_c', loss_c, used_sample_num)

        if i_epoch % args.save_model_interval == 0:
            agent.save_model(remark='{}'.format(i_epoch))
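paralle_train drives a set of worker processes over multiprocessing pipes: it sends one discrete action per worker and expects an (observation, reward, done, info) tuple back. Neither workers_initialize nor get_env_params is shown in the snippet; the following is a minimal sketch of what they could look like, assuming multiprocessing.Pipe, one gym environment per worker, and an env wrapper that produces the (4, 84, 84) stacked frames.

import multiprocessing as mp

import gym


def get_env_params(env, args):
    # Hypothetical helper: only the discrete action count ('a_dim') is used above.
    return {'a_dim': env.action_space.n}


def worker_process(child_conn, env_name, max_episode_step):
    # Hypothetical worker loop matching the pipe protocol used by paralle_train:
    # receive one discrete action, step the environment, and send back
    # (observation, reward, done, info). Frame preprocessing/stacking to the
    # expected (4, 84, 84) shape is assumed to happen inside an env wrapper.
    env = gym.make(env_name)
    obs = env.reset()
    step = 0
    while True:
        action = child_conn.recv()
        obs, reward, done, info = env.step(int(action))
        step += 1
        if done or step >= max_episode_step:
            obs = env.reset()
            step = 0
            done = True
        child_conn.send((obs, reward, done, info))


def workers_initialize(args):
    # Hypothetical initializer returning the triple unpacked by paralle_train:
    # the worker processes plus the parent and child ends of their pipes.
    workers, parent_conns, children_conns = [], [], []
    for _ in range(args.num_worker):
        parent_conn, child_conn = mp.Pipe()
        worker = mp.Process(target=worker_process,
                            args=(child_conn, args.env, args.max_episode_step),
                            daemon=True)
        worker.start()
        workers.append(worker)
        parent_conns.append(parent_conn)
        children_conns.append(child_conn)
    return workers, parent_conns, children_conns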
def experiment(hidden_size=64,
               lr=3e-4,
               num_steps=2048,
               mini_batch_size=32,
               ppo_epochs=10,
               threshold_reward=10,
               max_episodes=15,
               nrmlz_adv=True,
               gamma=0.99,
               tau=0.95,
               clip_gradients=True):
    '''
    :param hidden_size: number of neurons in each hidden layer of the model
    :param lr: learning rate
    :param num_steps: maximum number of environment steps collected per episode
    :param mini_batch_size: mini-batch size for the PPO update
    :param ppo_epochs: number of optimization epochs per PPO update
    :param threshold_reward: average score over the last 100 episodes at which training stops
    :param max_episodes: maximum number of training episodes
    :param nrmlz_adv: True if advantages should be normalized before the PPO update
    :param gamma: discount factor
    :param tau: GAE parameter (lambda)
    :param clip_gradients: True if gradients should be clipped during the PPO update
    :return: deque of the last 100 test scores, list of per-episode test rewards, and list of their moving averages
    '''

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    scores_window = deque(maxlen=100)
    test_rewards = []
    moving_averages = []

    env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = PPOAgent(learning_rate=lr,
                     state_size=state_size,
                     action_size=action_size,
                     hidden_size=hidden_size,
                     num_agents=num_agents,
                     random_seed=0,
                     ppo_epochs=ppo_epochs,
                     mini_batch_size=mini_batch_size,
                     normalize_advantages=nrmlz_adv,
                     clip_gradients=clip_gradients,
                     gamma=gamma,
                     tau=tau,
                     device=device)

    for episode in tqdm(range(max_episodes)):
        log_probs = []
        values = []
        states_list = []
        actions_list = []
        rewards = []
        masks = []
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        for duration in range(num_steps):

            state = torch.FloatTensor(state).to(device)
            action, value, log_prob = agent.act(state)
            env_info = env.step(action.cpu().data.numpy())[
                brain_name]  # send all actions to the environment

            next_state = env_info.vector_observations  # get next state (for each agent)
            reward = env_info.rewards  # get reward (for each agent)
            dones = np.array(env_info.local_done)  # see if episode finished

            log_probs.append(log_prob)
            values.append(value)
            reward_t = torch.FloatTensor(reward).unsqueeze(1).to(device)
            masks_t = torch.FloatTensor(1 - dones)
            rewards.append(reward_t)
            masks.append(masks_t)
            states_list.append(state)
            actions_list.append(action)

            state = next_state

            if np.any(dones):
                break

        # bootstrap the value of the final state for the return/advantage computation
        next_state = torch.FloatTensor(state).to(device)
        _, next_value, _ = agent.act(next_state)
        agent.step(states=states_list,
                   actions=actions_list,
                   values=values,
                   log_probs=log_probs,
                   rewards=rewards,
                   masks=masks,
                   next_value=next_value)

        test_mean_reward = test_agent(env, brain_name, agent, device)
        test_rewards.append(test_mean_reward)
        scores_window.append(test_mean_reward)
        moving_averages.append(np.mean(scores_window))
        print('Episode {}, Total score this episode: {}, Last {} average: {}'.
              format(episode, test_mean_reward, min(episode, 100),
                     np.mean(scores_window)))
        if np.mean(scores_window) > threshold_reward:
            agent.save_model(
                f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}_r{threshold_reward}_adv{nrmlz_adv}.pth"
            )
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(episode, np.mean(scores_window)))
            break

    env.close()
    return scores_window, test_rewards, moving_averages
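For reference, a hypothetical invocation of experiment(); the Unity Reacher build path is hard-coded inside the function, so only hyperparameters are passed. The values below are illustrative rather than taken from the source; a mean score of +30 over 100 episodes is the usual target for the 20-agent Reacher task.

if __name__ == '__main__':
    # Illustrative call; adjust max_episodes and threshold_reward as needed.
    scores_window, test_rewards, moving_averages = experiment(
        hidden_size=64,
        lr=3e-4,
        num_steps=2048,
        mini_batch_size=32,
        ppo_epochs=10,
        threshold_reward=30,
        max_episodes=150,
        nrmlz_adv=True,
        gamma=0.99,
        tau=0.95,
        clip_gradients=True)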