Example 1
File: x1.py Project: rstager/rnrrl
def objective(kwargs):
    cluster = kwargs.pop('cluster')
    gamma = kwargs.get('gamma')
    reg = kwargs.pop('reg')
    epochs = kwargs.pop('epochs')
    clip_tdq = kwargs.get('clip_tdq', None)  # per-step lower bound for TD-target clipping; None disables it

    actor, critic = make_models(cluster.env, reg=reg)
    agent = DDPGAgent(cluster, actor, critic, mode=2, **kwargs)

    def qpriority(obs0, a0, r0, obs1, done):
        # Replay priority: TD error of the target critic relative to the TD target, clipped to [1e-4, 1].
        tdq = TD_q(agent.target_actor, agent.target_critic, agent.gamma, obs1,
                   r0, done)
        q0 = agent.target_critic.predict([obs0, a0])
        if clip_tdq is not None:
            tdq = np.clip(tdq, clip_tdq / (1 - gamma), 0)
        epsilon = 0.00001
        priority = np.abs((q0 - tdq) / (tdq + epsilon))
        priority = np.clip(priority, 0.0001, 1)
        return priority.squeeze(axis=-1)

    eval = ActorCriticEval(cluster, agent.target_actor, agent.target_critic,
                           gamma)
    callbacks = []
    callbacks.append(eval)
    callbacks.append(
        PltQEval(cluster,
                 gamma, [('target', agent.target_actor, agent.target_critic),
                         ('ddpg', agent.actor, agent.critic)],
                 title="RL eval",
                 fignum=1))
    callbacks.append(
        PlotDist(cluster,
                 eval.hist,
                 title="actor/critic training trends",
                 fignum=2))
    memory = PrioritizedMemory(sz=1000000, updater=qpriority)
    agent.train(memory=memory,
                epochs=epochs,
                fignum=1,
                visualize=False,
                callbacks=callbacks)

    reward = eval.hist['reward']
    r1 = np.array(reward)
    n = min(int(r1.shape[0] * 0.2), 20)  # evaluate over the last 20% of epochs, at most 20
    if not np.isnan(r1).any():
        loss = -np.median(r1[-n:])  # negative median reward over the last n epochs
    else:
        loss = -np.inf
    print(
        "loss={loss:.0f} gamma={gamma:.3f} tau={tau:.3e} lr={lr:.3e} clr={clr:.3e} decay={decay:.3e}"
        .format(loss=loss, **kwargs))
    return {
        'loss': loss,
        'status': STATUS_OK,
    }
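The {'loss': ..., 'status': STATUS_OK} return value follows the hyperopt convention, so this objective is presumably minimized with hyperopt.fmin. A minimal sketch of such a call is shown below; the search ranges, the fixed reg/epochs values, and the make_cluster() factory are illustrative assumptions, and only the parameter names (cluster, reg, epochs, gamma, tau, lr, clr, decay) come from the code above.

import numpy as np
from hyperopt import fmin, tpe, hp

# Hypothetical search space: the ranges, the fixed reg/epochs values, and the
# make_cluster() helper are assumptions; hyperopt passes constant entries
# through to objective() unchanged.
space = {
    'cluster': make_cluster(),   # assumed factory for the training cluster/env
    'reg': 1e-4,                 # assumed fixed weight regularization
    'epochs': 100,               # assumed fixed training budget
    'gamma': hp.uniform('gamma', 0.9, 0.999),
    'tau': hp.loguniform('tau', np.log(1e-4), np.log(1e-1)),
    'lr': hp.loguniform('lr', np.log(1e-5), np.log(1e-2)),
    'clr': hp.loguniform('clr', np.log(1e-5), np.log(1e-2)),
    'decay': hp.loguniform('decay', np.log(1e-8), np.log(1e-3)),
}

best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50)
print(best)

Because the objective returns the negative median reward, minimizing the loss with fmin maximizes the evaluation reward.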
Example 2
def objective(kwargs):
    cluster = kwargs.pop('cluster')
    gamma = kwargs.get('gamma')
    reg = kwargs.pop('reg')
    epochs = kwargs.pop('epochs')

    actor, critic = make_models(cluster.env, reg=reg)
    agent = DDPGAgent(cluster, actor, critic, mode=2, **kwargs)
    eval = ActorCriticEval(cluster, agent.target_actor, agent.target_critic,
                           gamma)
    callbacks = []
    callbacks.append(eval)
    callbacks.append(
        PltQEval(cluster,
                 gamma, [('target', agent.target_actor, agent.target_critic),
                         ('ddpg', agent.actor, agent.critic)],
                 title="RL eval",
                 fignum=1))
    callbacks.append(
        PlotDist(cluster,
                 eval.hist,
                 title="actor/critic training trends",
                 fignum=2))
    agent.train(epochs=epochs, fignum=1, visualize=False, callbacks=callbacks)

    reward = np.array(eval.hist['reward'])
    n = min(int(reward.shape[0] * 0.2), 20)  # evaluate over the last 20% of epochs, at most 20
    loss = -np.median(reward[-n:, :])  # negative median reward over the last n epochs
    print(
        "loss={loss:.0f} gamma={gamma:.3f} tau={tau:.3e} lr={lr:.3e} clr={clr:.3e} decay={decay:.3e}"
        .format(loss=loss, **kwargs))
    return {
        'loss': loss,
        'status': STATUS_OK,
    }
Example 3
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.ndim = kwargs.get('ndim', 1)
    self.links = [1.0, 0.7, 0.3]
    ospace = 2 if self.ndim != 3 else 3
    # observation: velocities, goal coordinates, sin(theta), cos(theta)
    self.observation_space = spaces.Box(
        low=np.array([-1] * self.ndim + [-self.ndim] * ospace +
                     [-1] * self.ndim * 2),
        high=np.array([1] * self.ndim + [self.ndim] * ospace +
                      [1] * self.ndim * 2))