Example #1
# Render the environment if requested.
if RENDER_ENV:
    env.render()

# Select an action with the online actor and add Ornstein-Uhlenbeck
# exploration noise before stepping the environment.
a_t = actor.predict(s_t, ACTION_BOUND, target=False)
action = a_t + ou.sample(a_t[0])
s_t_1, r_t, done, info = env.step(action)

# Store the transition; each buffer row is [s (3), a, r, s' (3), done].
buff.store(s_t, a_t[0], r_t, np.reshape(s_t_1, (1, 3))[0], [done])

if buff.t_memory > MINIBATCH_SIZE:
    # Sample a minibatch and split it back into its columns.
    batch = buff.sample(MINIBATCH_SIZE)
    states_t = batch[:, 0:3]
    actions = batch[:, 3]
    rewards = batch[:, 4]
    states_t_1 = batch[:, 5:8]
    dones = batch[:, -1]

    # Build the TD targets y = r + GAMMA * Q'(s', mu'(s')) with the target
    # actor and target critic; terminal transitions use y = r.
    y = np.zeros((len(batch), 1))
    a_tgt = actor.predict(states_t_1, ACTION_BOUND, target=True)
    Q_tgt = critic.predict(states_t_1, a_tgt, target=True)
    for i in range(len(batch)):
        if dones[i]:
            y[i] = rewards[i]
        else:
            y[i] = rewards[i] + GAMMA * Q_tgt[i]

    # Critic update: regress Q(s, a) onto the TD targets.
    actions = actions[:, np.newaxis]
    loss += critic.weight_update(states_t, actions, y)

    # Actor update along the deterministic policy gradient dQ/da,
    # followed by soft updates of both target networks.
    a_for_dQ_da = actor.predict(states_t, ACTION_BOUND, target=False)
    dQ_da = critic.evaluate_action_gradient(states_t, a_for_dQ_da)
    actor.weight_update(states_t, dQ_da, ACTION_BOUND)
    actor.weight_update_target(TAU)
    critic.weight_update_target(TAU)

# Move on to the next state.
s_t = np.reshape(s_t_1, (1, 3))[0]
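The snippet above depends on an Ornstein-Uhlenbeck noise object (`ou.sample`) and a flat, array-backed replay buffer (`buff.store`, `buff.sample`, `buff.t_memory`) that are not shown. The sketch below is one minimal way those helpers could look, assuming a 3-dimensional state and 1-dimensional action (as in Pendulum-v0); the class names and internals are illustrative assumptions, not the original implementation.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck-style exploration noise (hypothetical stand-in for `ou`)."""
    def __init__(self, mu=0.0, theta=0.15, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma

    def sample(self, x):
        # Noise that pulls the current action x back toward mu, plus Gaussian jitter.
        return self.theta * (self.mu - x) + self.sigma * np.random.randn(*np.shape(x))

class ReplayBuffer:
    """Flat array buffer (hypothetical stand-in for `buff`);
    each row is [s (3 values), a, r, s' (3 values), done]."""
    def __init__(self, capacity=100000, row_size=9):
        self.memory = np.zeros((capacity, row_size))
        self.capacity = capacity
        self.t_memory = 0  # number of transitions stored so far

    def store(self, s, a, r, s_1, done):
        row = np.hstack([np.ravel(s), np.ravel(a), [r], np.ravel(s_1), np.ravel(done)])
        self.memory[self.t_memory % self.capacity] = row
        self.t_memory += 1

    def sample(self, n):
        # Uniform random minibatch over the filled part of the buffer.
        idx = np.random.randint(0, min(self.t_memory, self.capacity), size=n)
        return self.memory[idx]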
Example #2
loss2 = 0
if RENDER_ENV:
    env.render()

# Online actor picks the action; 1 / (1 + ii + j) adds exploration noise
# that decays with the episode index ii and the step index j.
a_t = actor.predict(np.reshape(s_t, (1, 3)), ACTION_BOUND, target=False) + 1. / (1. + ii + j)
s_t_1, r_t, done, info = env.step(a_t[0])

# Store the transition in the replay buffer.
buff.add(s_t, a_t[0], r_t, s_t_1, done)

if buff.count() > MINIBATCH_SIZE:
    # Sample a minibatch of (s, a, r, s', done) tuples.
    batch = buff.getBatch(MINIBATCH_SIZE)
    states_t = np.asarray([e[0] for e in batch])
    actions = np.asarray([e[1] for e in batch])
    rewards = np.asarray([e[2] for e in batch])
    states_t_1 = np.asarray([e[3] for e in batch])
    dones = np.asarray([e[4] for e in batch])

    # TD targets from the target actor and target critic.
    y = np.zeros((len(batch), action_dim))
    a_tgt = actor.predict(states_t_1, ACTION_BOUND, target=True)
    Q_tgt = critic.predict(states_t_1, a_tgt, target=True)
    for i in range(len(batch)):
        if dones[i]:
            y[i] = rewards[i]
        else:
            y[i] = rewards[i] + GAMMA * Q_tgt[i]

    # Critic update on the sampled minibatch.
    loss += critic.weight_update(states_t, actions, y)

    # Actor update: the very first update uses the action gradient dQ/da,
    # every later update uses the gradient of the critic's action loss.
    a_for_dQ_da = actor.predict(states_t, ACTION_BOUND, target=False)
    if count == 0:
        dQ_da = critic.evaluate_action_gradient(states_t, a_for_dQ_da)
        actor.weight_update(states_t, dQ_da, ACTION_BOUND)
        count = 1
    else:
        dL_da = critic.evaluate_action_loss(states_t, a_for_dQ_da, y)
        actor.weight_update(states_t, dL_da, ACTION_BOUND)
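Both examples keep separate target networks for the actor and critic; Example #1 refreshes them each step via `weight_update_target(TAU)`, and Example #2 presumably does the same outside the lines shown. The method body is not included in either snippet, but the standard DDPG soft (Polyak) update it stands for can be sketched as a plain function; the names below are illustrative, not the original code.

import numpy as np

def soft_update(online_weights, target_weights, tau):
    # Polyak averaging: theta_target <- tau * theta_online + (1 - tau) * theta_target.
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_weights, target_weights)]

# Toy usage with NumPy arrays standing in for per-layer weight matrices:
online = [np.ones((2, 2)), np.ones(2)]
target = [np.zeros((2, 2)), np.zeros(2)]
target = soft_update(online, target, tau=0.001)  # target creeps slowly toward online

With a small TAU (e.g. 0.001) the target networks change slowly, which is what keeps the TD targets in both examples stable during training.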