Example no. 1
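        # Fragment of a Spinning-Up-style PPO collection loop; env, agent,
        # buffer, rew, v_t, local_steps_per_epoch, and numpy as np are
        # assumed to be defined in the surrounding (omitted) code.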
        ep_ret += rew
        ep_len += 1

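        # Trajectory ends: the episode finished, or the epoch ran out of steps.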
        if done or (t == local_steps_per_epoch - 1):
            # if not done:
            #     print("WARNING: trajectory cut off by epoch at %d steps." % ep_len)

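            # If the episode really ended, close the path with the final
            # reward; if the epoch cut it off, bootstrap with the value
            # estimate v_t.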
            last_val = rew if done else v_t
            buffer.finish_path(last_val)

            if done:
                rewards.append(ep_ret)
                obs, rew, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0


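    # One agent update per epoch on the collected batch.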
    agent.update(buffer.get())

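# Evaluate the trained policy for 10 rendered episodes.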
for i in range(10):
    obs, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    rewards = []
    while not d and ep_len < 1000:  # run until done, capped at 1000 steps
        act, _, _ = agent.get_action(obs)
        obs, r, d, _ = env.step(act[0])
        ep_len += 1
        ep_ret += r
        rewards.append(r)
        env.render()
    # Mean per-step reward over the episode just finished.
    print(np.mean(rewards))

# Per-step rewards of the last evaluation episode.
print(rewards)
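The loop above relies on a buffer with store/finish_path/get methods defined outside the excerpt. As a rough, hypothetical sketch of what finish_path(last_val) usually computes in Spinning-Up-style code (GAE-lambda advantages plus reward-to-go value targets, bootstrapped with last_val), assuming numpy and scipy are available:

import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    # Cumulative discounted sums along the time axis.
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


class Buffer:
    """Minimal on-policy buffer: one epoch of transitions, with GAE-lambda
    advantages computed each time a path is finished."""

    def __init__(self, obs_dim, size, gamma=0.99, lam=0.95):
        self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start = 0, 0

    def store(self, obs, act, rew, val):
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.ptr += 1

    def finish_path(self, last_val=0.0):
        # last_val bootstraps the tail of the trajectory: the caller passes
        # the final reward for finished episodes, or the value estimate for
        # episodes cut off by the epoch boundary.
        sl = slice(self.path_start, self.ptr)
        rews = np.append(self.rew_buf[sl], last_val)
        vals = np.append(self.val_buf[sl], last_val)
        # GAE-lambda advantage estimates.
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[sl] = discount_cumsum(deltas, self.gamma * self.lam)
        # Rewards-to-go, used as value-function targets.
        self.ret_buf[sl] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start = self.ptr

    def get(self):
        # Return the full batch with normalized advantages and reset pointers.
        self.ptr, self.path_start = 0, 0
        adv = (self.adv_buf - self.adv_buf.mean()) / (self.adv_buf.std() + 1e-8)
        return self.obs_buf, self.act_buf, adv, self.ret_buf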
Example no. 2
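            # Tail of an a2c-ppo-acktr-style training iteration; rollouts
            # (a RolloutStorage), agent, the per-step tensors, and imports
            # such as numpy as np, torch, and time live in omitted code.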
            obs = torch.from_numpy(obs)
            rollouts.insert(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)
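        # Value estimate for the final observation, used to bootstrap returns.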
        with torch.no_grad():
            agent.model.eval()
            next_value = agent.get_value(rollouts.obs[-1], rollouts.masks[-1],
                                         device)

        if using_pcnt:
            pcnt_dist.add_pcnt(rollouts, device)

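        # Convert rewards plus the bootstrap value into per-step return
        # targets (GAE if use_gae, plain discounted sums otherwise).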
        rollouts.compute_returns(next_value, use_gae, gamma, gae_lambda,
                                 use_proper_time_limits)

        agent.model.train()
        value_loss, action_loss, dist_entropy = agent.update(rollouts, device)

        rollouts.after_update()

        # Log every update; the original per-log_interval gate is kept
        # commented out below.
        # if j % args.log_interval == 0 and len(episode_rewards) > 1:
        all_return.append(np.mean(cumul_return))
        all_length.append(np.mean(episo_length))
        total_num_steps = (j + 1) * num_processes * num_steps
        end = time.time()
        print(
            "Updates {}, num timesteps {}\n"
            "Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
            "min/max reward {:.1f}/{:.1f}\n"
            "entropy loss {}, value loss {}, action loss {}".format(
                j, total_num_steps, len(cumul_return),
                np.mean(cumul_return), np.median(cumul_return),
                np.min(cumul_return), np.max(cumul_return),
                dist_entropy, value_loss, action_loss))
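
rollouts.compute_returns(...) comes from the a2c-ppo-acktr RolloutStorage, which writes the targets into rollouts.returns in place. As an illustrative sketch of the computation only, under the simplifying assumptions that tensors are shaped [T, N, 1], masks[t] is 0 where the episode ended at step t, and the use_proper_time_limits / bad_masks handling is ignored:

import torch


def compute_returns(rewards, values, masks, next_value,
                    use_gae=True, gamma=0.99, gae_lambda=0.95):
    # rewards, values, masks: [T, N, 1]; next_value: [N, 1].
    T = rewards.size(0)
    returns = torch.zeros_like(values)
    if use_gae:
        vals = torch.cat([values, next_value.unsqueeze(0)], dim=0)
        gae = torch.zeros_like(next_value)
        for t in reversed(range(T)):
            # TD residual, zeroed across episode boundaries by the mask.
            delta = rewards[t] + gamma * vals[t + 1] * masks[t] - vals[t]
            gae = delta + gamma * gae_lambda * masks[t] * gae
            returns[t] = gae + vals[t]
    else:
        # Plain discounted returns bootstrapped with next_value.
        running = next_value
        for t in reversed(range(T)):
            running = rewards[t] + gamma * running * masks[t]
            returns[t] = running
    return returns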