if (t > 0 and episode > 200):
    # Obtain TD-error
    _, base_v = ppo.evaluate_state(s_old)
    _, target_v = ppo.evaluate_state(s)
    lambda_mix = lambda_max * (1 - np.exp(-factor * np.abs(
        r + GAMMA * np.squeeze(target_v) - np.squeeze(base_v))))
else:
    lambda_mix = 10.
lambda_actual = 10.
lambda_store[t] = lambda_mix

# Update ppo
if t == BATCH:  # or (terminal and t < BATCH):
    # Normalise rewards
    rewards = np.array(buffer_r)
    rolling_r.update(rewards)
    rewards = np.clip(rewards / rolling_r.std, -10, 10)

    v_final = [v * (1 - terminal)]  # v = 0 if terminal, otherwise use the predicted v
    values = np.array(buffer_v + v_final)
    terminals = np.array(buffer_terminal + [terminal])

    # Generalized Advantage Estimation - https://arxiv.org/abs/1506.02438
    delta = rewards + GAMMA * values[1:] * (1 - terminals[1:]) - values[:-1]
    advantage = discount(delta, GAMMA * LAMBDA, terminals)
    buffer_v = np.squeeze(np.array(buffer_v))[:, np.newaxis]
    returns = np.squeeze(advantage)[:, np.newaxis] + buffer_v
    advantage = (advantage - advantage.mean()) / np.maximum(advantage.std(), 1e-6)
    bs, ba, br, badv = np.reshape(buffer_s, (len(buffer_s),) + ppo.s_dim), np.vstack(buffer_a), \
        np.vstack(returns), np.vstack(advantage)
    experience.append([bs, ba, br, badv])

    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []

# Update ppo
if t >= BATCH:
    # Per batch normalisation of advantages
    advs = np.concatenate(list(zip(*experience))[3])
    for x in experience:
        x[3] = (x[3] - np.mean(advs)) / np.maximum(np.std(advs), 1e-6)

    # Update rolling reward stats
    rolling_r.update(np.array(batch_rewards))

    print("Training using %i episodes and %i steps..." % (len(experience), t))
    graph_summary = ppo.update(experience)
    t, experience, batch_rewards = 0, [], []

buffer_s.append(s)
buffer_a.append(a)
buffer_v.append(v)
buffer_terminal.append(terminal)
ep_a.append(a)

if not ppo.discrete:
    a = np.clip(a, env.action_space.low, env.action_space.high)
s, r, terminal, _ = env.step(a)
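# Note: discount() is called above but not defined in this listing. The sketch
# below is an assumed implementation, not taken from the source: a reversed
# discounted cumulative sum over the TD residuals that zeroes the running total
# at episode boundaries, which is the standard way to compute GAE
# (https://arxiv.org/abs/1506.02438). The terminal-index convention (terminals
# carrying one extra trailing flag, as built above from
# buffer_terminal + [terminal]) is also an assumption.
import numpy as np

def discount(x, gamma, terminals):
    """Reversed discounted cumulative sum of x, reset where an episode ends."""
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(x))):
        # Mask the bootstrap term when the *next* state was terminal, matching
        # the (1 - terminals[1:]) masking used in the delta computation above.
        running = x[i] + gamma * running * (1.0 - float(terminals[i + 1]))
        out[i] = running
    return out

# Used above as: advantage = discount(delta, GAMMA * LAMBDA, terminals)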
def work(self):
    hooks = [self.ppo.sync_replicas_hook]
    sess = tf.train.MonitoredTrainingSession(master=self.server.target,
                                             is_chief=(self.wid == 0),
                                             checkpoint_dir=SUMMARY_DIR,
                                             save_summaries_steps=None,
                                             save_summaries_secs=None,
                                             hooks=hooks)
    if self.wid == 0:
        writer = SummaryWriterCache.get(SUMMARY_DIR)

    t, episode, terminal = 0, 0, False
    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
    rolling_r = RunningStats()

    while not sess.should_stop() and not (episode > EP_MAX and self.wid == 0):
        s = self.env.reset()
        ep_r, ep_t, ep_a = 0, 0, []

        while True:
            a, v = self.ppo.evaluate_state(s, sess)

            # Update ppo
            if t == BATCH:  # or (terminal and t < BATCH):
                # Normalise rewards
                rewards = np.array(buffer_r)
                rolling_r.update(rewards)
                rewards = np.clip(rewards / rolling_r.std, -10, 10)

                v_final = [v * (1 - terminal)]  # v = 0 if terminal, otherwise use the predicted v
                values = np.array(buffer_v + v_final)
                terminals = np.array(buffer_terminal + [terminal])

                # Generalized Advantage Estimation - https://arxiv.org/abs/1506.02438
                delta = rewards + GAMMA * values[1:] * (1 - terminals[1:]) - values[:-1]
                advantage = discount(delta, GAMMA * LAMBDA, terminals)
                returns = advantage + np.array(buffer_v)
                advantage = (advantage - advantage.mean()) / np.maximum(advantage.std(), 1e-6)

                bs, ba, br, badv = np.reshape(buffer_s, (t,) + self.ppo.s_dim), np.vstack(buffer_a), \
                    np.vstack(returns), np.vstack(advantage)

                graph_summary = self.ppo.update(bs, ba, br, badv, sess)
                buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
                t = 0

            buffer_s.append(s)
            buffer_a.append(a)
            buffer_v.append(v)
            buffer_terminal.append(terminal)
            ep_a.append(a)

            if not self.ppo.discrete:
                a = np.clip(a, self.env.action_space.low, self.env.action_space.high)
            s, r, terminal, _ = self.env.step(a)
            buffer_r.append(r)
            ep_r += r
            ep_t += 1
            t += 1

            if terminal:
                # End of episode summary
                print('Worker_%i' % self.wid,
                      '| Episode: %i' % episode,
                      "| Reward: %.2f" % ep_r,
                      '| Steps: %i' % ep_t)

                if self.wid == 0:
                    worker_summary = tf.Summary()
                    worker_summary.value.add(tag="Reward", simple_value=ep_r)

                    # Create Action histograms for each dimension
                    actions = np.array(ep_a)
                    if self.ppo.discrete:
                        add_histogram(writer, "Action", actions, episode, bins=self.ppo.a_dim)
                    else:
                        for a in range(self.ppo.a_dim):
                            add_histogram(writer, "Action/Dim" + str(a), actions[:, a], episode)

                    try:
                        writer.add_summary(graph_summary, episode)
                    except NameError:
                        pass
                    writer.add_summary(worker_summary, episode)
                    writer.flush()

                episode += 1
                break

    self.env.close()
    print("Worker_%i finished" % self.wid)
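# Note: RunningStats is used above for reward normalisation but is not defined
# in this listing. The class below is a minimal sketch under the assumption
# that it tracks a running mean and standard deviation with Welford's online
# algorithm and exposes the .update() method and .std attribute the workers
# rely on; any other attribute names are illustrative only.
import numpy as np

class RunningStats(object):
    def __init__(self):
        self.n = 0        # number of samples folded in so far
        self.mean = 0.0   # running mean
        self._m2 = 0.0    # running sum of squared deviations from the mean
        self.std = 1.0    # running standard deviation (1.0 until enough data)

    def update(self, x):
        # Fold a batch of rewards into the running statistics one value at a time.
        for value in np.asarray(x, dtype=np.float64).flatten():
            self.n += 1
            delta = value - self.mean
            self.mean += delta / self.n
            self._m2 += delta * (value - self.mean)
        if self.n > 1:
            self.std = float(np.sqrt(self._m2 / (self.n - 1)))

# Usage mirrors the training loop above: rolling_r.update(rewards) followed by
# rewards = np.clip(rewards / rolling_r.std, -10, 10).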