Example 1
0
File: main.py Project: km01/myrl
    # Returns of all completed episodes, pooled across every parallel env.
    global_rewards = []

    # Observation batch carried over from the previous step; None forces the
    # one-time initial reset below.
    obs_gotten = None

    # Collect rollout chunks until the frame budget is exhausted.
    # NOTE(review): frame_count is not incremented anywhere in the visible
    # lines — confirm it is updated further down in the loop body.
    while frame_count < max_frame:

        # Per-chunk transition storage: one entry appended per step.
        cache = {'obs': [], 'acts': [], 'rews': [], 'dones': []}
        # Gaussian policy parameters saved alongside the transitions —
        # presumably used later to compute log-probabilities; confirm.
        probs_cache = {'mu': [], 'sig': []}

        # Roll the vectorized envs forward n_steps, caching each transition.
        for _ in range(n_steps):
            # Reset only once, on the very first step overall; every later
            # iteration continues from the previous step's observation.
            obs = envs.reset() if obs_gotten is None else obs_gotten
            obs_in = torch.FloatTensor(obs).to(device)
            # Actor outputs mean and std of a diagonal Gaussian policy.
            mu, sig = actor(obs_in)
            with torch.no_grad():
                # Sample an action, then clamp it strictly inside [-2, 2]
                # (looks like Pendulum-style action bounds — TODO confirm);
                # the 1e-7 epsilon keeps values off the exact boundary.
                a = Normal(mu, sig).sample()
                a.clamp_(-2.0 + 1e-7, 2.0 - 1e-7)

            obs_gotten, rews, dones, _ = envs.step(a)

            # Accumulate the running return of each env's current episode;
            # when an episode ends, record its return and start a fresh
            # accumulator (rewards[i][-1]) for that env.
            for i in range(num_envs):
                rewards[i][-1] += rews[i]
                if dones[i]:
                    global_rewards.append(rewards[i][-1])
                    rewards[i].append(0.)

            cache['obs'].append(obs)
            cache['acts'].append(a)
            # Rewards are scaled by 0.1 before storage — presumably a reward
            # normalization trick for training stability; verify against the
            # update code that consumes cache['rews'].
            cache['rews'].append(rews * 0.1)
            cache['dones'].append(dones)

            probs_cache['mu'].append(mu)