Code Example #1
def _generate(device, env, policy, ob_scale, number_timesteps, gamma,
              timesteps_per_batch):
    """ Generate trajectories """
    record = ['o', 'a', 'r', 'done', 'vpred']
    export = ['o', 'a', 'r', 'vpred']
    trajectories = Trajectories(record, export, device, gamma, ob_scale)

    o = env.reset()
    infos = []
    for n in range(1, number_timesteps + 1):
        # sample action
        with torch.no_grad():
            logits, v = policy(scale_ob(o, device, ob_scale))
            dist = torch.distributions.Categorical(logits=logits)
            a = dist.sample().cpu().numpy()
            v = v.cpu().numpy()[:, 0]

        # take action in env
        o_, r, done, info = env.step(a)
        for d in info:
            if d.get('episode'):
                infos.append(d['episode'])

        # store batch data and update observation
        trajectories.append(o, a, r, done, v)
        if n % timesteps_per_batch == 0:
            with torch.no_grad():
                ob = scale_ob(o_, device, ob_scale)
                v_ = policy(ob)[1].cpu().numpy()[:, 0] * (1 - done)
            yield trajectories.export(v_) + (infos, )
            infos.clear()
        o = o_
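
The generator above yields one training batch every timesteps_per_batch steps, bootstrapping from the value estimate of the latest observation and masking it with (1 - done) so that finished episodes contribute no future value. For context, here is a minimal self-contained sketch of the bootstrapped discounted-return computation that a container such as Trajectories typically performs on export; the function name and argument layout are illustrative assumptions, not the project's actual API:

import numpy as np

def discounted_returns(rewards, dones, last_value, gamma):
    """Bootstrapped discounted returns for a batch of consecutive steps.

    rewards, dones: arrays of shape (T,); last_value is V(s) of the
    observation that follows the last stored step.
    """
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = last_value
    for t in reversed(range(len(rewards))):
        # reset the bootstrap at episode boundaries
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns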
Code Example #2
File: dqn.py  Project: mikepsn/RL-Experiments
def _generate(device, env, qnet, ob_scale,
              number_timesteps, param_noise,
              exploration_fraction, exploration_final_eps):
    """ Generate training batch sample """
    noise_scale = 1e-2
    action_dim = env.action_space.n
    explore_steps = number_timesteps * exploration_fraction

    o = env.reset()
    infos = dict()
    epret = eplen = 0
    for n in range(1, number_timesteps + 1):
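        # linearly anneal epsilon from 1.0 down to exploration_final_eps
        # over the first exploration_fraction of the run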
        epsilon = 1.0 - (1.0 - exploration_final_eps) * n / explore_steps
        epsilon = max(exploration_final_eps, epsilon)

        # sample action
        with torch.no_grad():
            ob = scale_ob(np.expand_dims(o, 0), device, ob_scale)
            q = qnet(ob)
            if not param_noise:
                if random.random() < epsilon:
                    a = int(random.random() * action_dim)
                else:
                    a = q.argmax(1).cpu().numpy()[0]
            else:
                # see Appendix C of `https://arxiv.org/abs/1706.01905`
                q_dict = deepcopy(qnet.state_dict())
                for _, m in qnet.named_modules():
                    if isinstance(m, nn.Linear):
                        std = torch.empty_like(m.weight).fill_(noise_scale)
                        m.weight.data.add_(torch.normal(0, std).to(device))
                        std = torch.empty_like(m.bias).fill_(noise_scale)
                        m.bias.data.add_(torch.normal(0, std).to(device))
                q_perturb = qnet(ob)
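                # adapt noise_scale so that the KL divergence between the
                # original and perturbed action distributions tracks the
                # current epsilon-greedy exploration level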
                kl_perturb = ((log_softmax(q, 1) - log_softmax(q_perturb, 1)) *
                              softmax(q, 1)).sum(-1).mean()
                kl_explore = -math.log(1 - epsilon + epsilon / action_dim)
                if kl_perturb < kl_explore:
                    noise_scale *= 1.01
                else:
                    noise_scale /= 1.01
                qnet.load_state_dict(q_dict)
                if random.random() < epsilon:
                    a = int(random.random() * action_dim)
                else:
                    a = q_perturb.argmax(1).cpu().numpy()[0]

        # take action in env
        o_, r, done, _ = env.step(a)
        epret += r
        eplen += 1
        if done:
            infos = {'eprewmean': epret, 'eplenmean': eplen}
            epret = eplen = 0

        # return data and update observation
        yield (o, [a], [r], o_, [int(done)], infos)
        infos = dict()
        o = o_ if not done else env.reset()
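
The generator yields one transition tuple (o, [a], [r], o_, [done], infos) per environment step, so a typical caller pushes each item into a replay buffer and periodically samples minibatches for the TD update. The sketch below shows such a consumer; the ReplayBuffer class and the driver loop are illustrative assumptions, not the project's actual code:

from collections import deque
import random

class ReplayBuffer:
    """Minimal FIFO replay buffer for (o, a, r, o_, done) transitions."""

    def __init__(self, capacity):
        self.storage = deque(maxlen=capacity)

    def add(self, transition):
        self.storage.append(transition)

    def sample(self, batch_size):
        return random.sample(self.storage, batch_size)

# Hypothetical driver loop (device, env, qnet, ob_scale, etc. come from the project):
# generator = _generate(device, env, qnet, ob_scale, number_timesteps,
#                       param_noise=False, exploration_fraction=0.1,
#                       exploration_final_eps=0.01)
# buffer = ReplayBuffer(capacity=100000)
# for o, a, r, o_, done, infos in generator:
#     buffer.add((o, a[0], r[0], o_, done[0]))
#     # sample a minibatch from buffer and take one TD/optimizer step here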