# Training step, variant 1: OU exploration noise, actor updated along dQ/da
if RENDER_ENV:
    env.render()

# Act with the online actor, then add Ornstein-Uhlenbeck exploration noise
a_t = actor.predict(np.reshape(s_t, (1, 3)), ACTION_BOUND, target=False)
action = a_t + ou.sample(a_t[0])

# Step the environment and store the transition in the replay buffer
s_t_1, r_t, done, info = env.step(action)
buff.store(s_t, a_t[0], r_t, np.reshape(s_t_1, (1, 3))[0], [done])

# Train once the buffer holds more than one minibatch
if buff.t_memory > MINIBATCH_SIZE:
    batch = buff.sample(MINIBATCH_SIZE)
    # Each row: state (3) | action (1) | reward (1) | next state (3) | done (1)
    states_t = batch[:, 0:3]
    actions = batch[:, 3]
    rewards = batch[:, 4]
    b_s_ = batch[:, 5:8]
    dones = batch[:, -1]

    # TD targets from the target actor and target critic
    y = np.zeros((len(batch), 1))
    a_tgt = actor.predict(b_s_, ACTION_BOUND, target=True)
    Q_tgt = critic.predict(b_s_, a_tgt, target=True)
    for i in range(len(batch)):
        if dones[i]:
            y[i] = rewards[i]
        else:
            y[i] = rewards[i] + GAMMA * Q_tgt[i]

    # Critic update on the TD targets (loss accumulated over the episode)
    actions = actions[:, np.newaxis]
    loss += critic.weight_update(states_t, actions, y)

    # Actor update along the critic's action gradient dQ/da
    a_for_dQ_da = actor.predict(states_t, ACTION_BOUND, target=False)
    dQ_da = critic.evaluate_action_gradient(states_t, a_for_dQ_da)
    actor.weight_update(states_t, dQ_da, ACTION_BOUND)

    # Soft (Polyak) update of both target networks
    actor.weight_update_target(TAU)
    critic.weight_update_target(TAU)

# Advance to the next state
s_t = np.reshape(s_t_1, (1, 3))[0]
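The loop above leaves two pieces implicit. First, `ou.sample` is assumed to return an Ornstein-Uhlenbeck noise increment for the current action; a minimal sketch of such a process follows, where the parameter values (mu, theta, sigma) are illustrative assumptions rather than values from the original code.

import numpy as np

class OU:
    """Ornstein-Uhlenbeck process for temporally correlated exploration."""
    def __init__(self, mu=0.0, theta=0.15, sigma=0.2):  # assumed defaults
        self.mu, self.theta, self.sigma = mu, theta, sigma

    def sample(self, x):
        # Noise increment for action x: mean-reverting drift toward mu
        # plus Gaussian diffusion (Euler step with dt = 1)
        return self.theta * (self.mu - x) + self.sigma * np.random.randn(*np.shape(x))

Second, `weight_update_target(TAU)` is presumably the standard soft (Polyak) update, theta_target <- TAU * theta + (1 - TAU) * theta_target, so the target networks track the online networks slowly and keep the TD targets stable.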
# Training step, variant 2: decaying additive noise; actor gradient switches
# from dQ/da (first update) to a loss-based dL/da thereafter
loss = 0
if RENDER_ENV:
    env.render()

# Act with the online actor; exploration noise decays with episode ii, step j
a_t = actor.predict(np.reshape(s_t, (1, 3)), ACTION_BOUND, target=False) + 1. / (1. + ii + j)

# Step the environment and store the transition in the replay buffer
s_t_1, r_t, done, info = env.step(a_t[0])
buff.add(s_t, a_t[0], r_t, s_t_1, done)

if buff.count() > MINIBATCH_SIZE:
    batch = buff.getBatch(MINIBATCH_SIZE)
    states_t = np.asarray([e[0] for e in batch])
    actions = np.asarray([e[1] for e in batch])
    rewards = np.asarray([e[2] for e in batch])
    states_t_1 = np.asarray([e[3] for e in batch])
    dones = np.asarray([e[4] for e in batch])

    # TD targets from the target actor and target critic
    # (action_dim is 1 here, so y holds one scalar target per transition)
    y = np.zeros((len(batch), action_dim))
    a_tgt = actor.predict(states_t_1, ACTION_BOUND, target=True)
    Q_tgt = critic.predict(states_t_1, a_tgt, target=True)
    for i in range(len(batch)):
        if dones[i]:
            y[i] = rewards[i]
        else:
            y[i] = rewards[i] + GAMMA * Q_tgt[i]

    # Critic update on the TD targets
    loss += critic.weight_update(states_t, actions, y)

    # Actor update: dQ/da on the first update, dL/da afterwards
    a_for_dQ_da = actor.predict(states_t, ACTION_BOUND, target=False)
    if count == 0:
        dQ_da = critic.evaluate_action_gradient(states_t, a_for_dQ_da)
        actor.weight_update(states_t, dQ_da, ACTION_BOUND)
        count = 1
    else:
        dL_da = critic.evaluate_action_loss(states_t, a_for_dQ_da, y)
        actor.weight_update(states_t, dL_da, ACTION_BOUND)

# Advance to the next state
s_t = s_t_1
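Both variants also assume a replay buffer; the second one calls `add`, `count`, and `getBatch` on it. A minimal sketch consistent with that interface (the capacity default is an assumption):

from collections import deque
import random

class ReplayBuffer:
    """FIFO experience replay, sampled uniformly at random."""
    def __init__(self, buffer_size=100000):  # assumed capacity
        self.buffer = deque(maxlen=buffer_size)

    def add(self, s_t, a_t, r_t, s_t_1, done):
        # Store one transition; the oldest is evicted once the buffer is full
        self.buffer.append((s_t, a_t, r_t, s_t_1, done))

    def count(self):
        return len(self.buffer)

    def getBatch(self, batch_size):
        # Uniform minibatch without replacement, breaking temporal correlation
        return random.sample(self.buffer, batch_size)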