# --- DDPG-style training loop, variant 1 (OU exploration noise) ---
# NOTE(review): this file is a whitespace-mangled paste — the original line breaks
# were lost, and this chunk is TRUNCATED mid-statement (it ends at a dangling
# `if dones[i]:` with no body). Restore formatting against the original source
# before editing; the code below is left byte-identical.
#
# What the visible code does:
#  * builds a CriticNet and a Memory replay buffer of row width 9
#    (presumably 3 state + 1 action + 1 reward + 3 next-state + 1 done,
#    matching the column slices batch[:,0:3]/[:,3]/[:,4]/[:,5:8]/[:,-1] — verify)
#  * per episode: resets env, reshapes the observation to a flat 3-vector
#  * per step: actor.predict gives a_t; OU noise `ou.sample(a_t[0])` is added to
#    form the executed `action`; env.step(action) advances the environment
#  * transition is stored, and once the buffer holds > MINIBATCH_SIZE rows, a
#    minibatch is sampled and target-network outputs (actor + critic, target=True)
#    are computed to build the TD target `y`
#
# BUG(review): the inner `for i in range(len(batch))` SHADOWS the outer episode
# index `i` from `for i in range(MAX_EPISODES)` — after the first replay update
# the episode counter is clobbered. Rename the inner index (e.g. `k`).
# BUG(review)?: `buff.store(..., a_t[0], ...)` records the PRE-noise action while
# `env.step(action)` executes the noisy one — off-policy replay usually stores the
# executed action. Confirm against the original intent.
critic = CriticNet(state_dim, action_dim, HIDDEN1_UNITS, HIDDEN2_UNITS, HIDDEN2_UNITS, action_dim) buff = Memory(BUFFER_SIZE, 9) step = 0 reward_result = [] for i in range(MAX_EPISODES): s_t = env.reset() s_t = np.reshape(s_t, (1, 3))[0] total_reward = 0. for j in range(MAX_EP_STEPS): loss = 0 if RENDER_ENV: env.render() a_t = actor.predict(s_t, ACTION_BOUND, target=False) action = a_t + ou.sample(a_t[0]) s_t_1, r_t, done, info = env.step(action) buff.store(s_t, a_t[0], r_t, np.reshape(s_t_1, (1, 3))[0], [done]) if buff.t_memory > MINIBATCH_SIZE: batch = buff.sample(MINIBATCH_SIZE) states_t = batch[:, 0:3] actions = batch[:, 3] rewards = batch[:, 4] b_s_ = batch[:, 5:8] dones = batch[:, -1] y = np.zeros((len(batch), 1)) a_tgt = actor.predict(b_s_, ACTION_BOUND, target=True) Q_tgt = critic.predict(b_s_, a_tgt, target=True) for i in range(len(batch)): if dones[i]:  # TRUNCATED here — remainder of the update (TD target fill-in, train steps) is not in this view
# --- DDPG-style training loop, variant 2 (decaying additive exploration noise) ---
# NOTE(review): same mangled-paste problem as the chunk above — line breaks lost,
# and the chunk is TRUNCATED at a dangling `if dones[i]:`. Code left byte-identical.
#
# Differences from variant 1, as visible here:
#  * wrapped in an outer `for times in range(max_time)` repetition loop
#  * episode index is `ii` (so the inner replay index `i` does NOT shadow it here)
#  * exploration: a decaying scalar `1./(1.+ii+j)` is ADDED to actor.predict output
#    instead of OU noise, and the noisy action a_t[0] is both executed and stored
#  * replay buffer uses a list-of-tuples API: buff.add(...), buff.count(),
#    buff.getBatch(n); fields are unpacked with per-column np.asarray list
#    comprehensions rather than array slicing
#  * TD target `y` is sized (len(batch), action_dim) here vs (len(batch), 1) in
#    variant 1 — presumably equivalent only when action_dim == 1; verify.
#  * `x = np.linspace(1, MAX_EPISODES, MAX_EPISODES)` and `count=0` are set up but
#    not used in the visible span — likely used by plotting/logging code past the cut
for times in range(max_time): step=0 x=np.linspace(1,MAX_EPISODES,MAX_EPISODES) for ii in range(MAX_EPISODES): s_t = env.reset() total_reward = 0. count=0 for j in range(MAX_EP_STEPS): loss=0; loss2 = 0; if RENDER_ENV: env.render() a_t = actor.predict(np.reshape(s_t,(1,3)), ACTION_BOUND, target=False)+1./(1.+ii+j) s_t_1, r_t, done, info = env.step(a_t[0]) buff.add(s_t, a_t[0], r_t, s_t_1, done) if buff.count() > MINIBATCH_SIZE: batch = buff.getBatch(MINIBATCH_SIZE) states_t = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) states_t_1 = np.asarray([e[3] for e in batch]) dones = np.asarray([e[4] for e in batch]) y=np.zeros((len(batch), action_dim)) a_tgt=actor.predict(states_t_1, ACTION_BOUND, target=True) Q_tgt = critic.predict(states_t_1, a_tgt,target=True) for i in range(len(batch)): if dones[i]:  # TRUNCATED here — remainder of the update is not in this view