print('Episode: %i' % episode, "| Reward: %.2f" % ep_r, '| Steps: %i' % ep_t)

worker_summary = tf.Summary()
worker_summary.value.add(tag="Reward", simple_value=ep_r)
worker_summary.value.add(tag="Reward/mean", simple_value=rolling_r.mean)
worker_summary.value.add(tag="Reward/std", simple_value=rolling_r.std)

# Create Action histograms for each dimension
actions = np.array(ep_a)
if ppo.discrete:
    add_histogram(ppo.writer, "Action", actions, episode, bins=ppo.a_dim)
else:
    for a in range(ppo.a_dim):
        add_histogram(ppo.writer, "Action/Dim" + str(a), actions[:, a], episode)

try:
    ppo.writer.add_summary(graph_summary, episode)
except NameError:  # No PPO update has run yet, so graph_summary is undefined
    pass

ppo.writer.add_summary(worker_summary, episode)
ppo.writer.flush()

# Save the model
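# The add_histogram helper used above (and in the worker below) is defined
# elsewhere in the repository. The sketch that follows illustrates the assumed
# interface, inferred from how it is called: bin a numpy array and write a
# TF1 HistogramProto through the given summary writer. Treat it as an
# assumption, not the repository's exact implementation.
import numpy as np
import tensorflow as tf


def add_histogram(writer, tag, values, step, bins=1000):
    """Write a histogram of `values` to TensorBoard under `tag` at `step`."""
    counts, bin_edges = np.histogram(values, bins=bins)

    hist = tf.HistogramProto()
    hist.min = float(np.min(values))
    hist.max = float(np.max(values))
    hist.num = int(np.prod(values.shape))
    hist.sum = float(np.sum(values))
    hist.sum_squares = float(np.sum(np.square(values)))

    # HistogramProto expects the right edge of each bucket, so drop the
    # leftmost edge returned by np.histogram
    for edge in bin_edges[1:]:
        hist.bucket_limit.append(edge)
    for count in counts:
        hist.bucket.append(int(count))

    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
    writer.add_summary(summary, step)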
def work(self):
    hooks = [self.ppo.sync_replicas_hook]
    sess = tf.train.MonitoredTrainingSession(master=self.server.target,
                                             is_chief=(self.wid == 0),
                                             checkpoint_dir=SUMMARY_DIR,
                                             save_summaries_steps=None,
                                             save_summaries_secs=None,
                                             hooks=hooks)
    if self.wid == 0:
        # Only the chief worker writes TensorBoard summaries
        writer = SummaryWriterCache.get(SUMMARY_DIR)

    t, episode, terminal = 0, 0, False
    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
    rolling_r = RunningStats()

    while not sess.should_stop() and not (episode > EP_MAX and self.wid == 0):
        s = self.env.reset()
        ep_r, ep_t, ep_a = 0, 0, []

        while True:
            a, v = self.ppo.evaluate_state(s, sess)

            # Update ppo
            if t == BATCH:  # or (terminal and t < BATCH):
                # Normalise rewards
                rewards = np.array(buffer_r)
                rolling_r.update(rewards)
                rewards = np.clip(rewards / rolling_r.std, -10, 10)

                v_final = [v * (1 - terminal)]  # v = 0 if terminal, otherwise use the predicted v
                values = np.array(buffer_v + v_final)
                terminals = np.array(buffer_terminal + [terminal])

                # Generalized Advantage Estimation - https://arxiv.org/abs/1506.02438
                delta = rewards + GAMMA * values[1:] * (1 - terminals[1:]) - values[:-1]
                advantage = discount(delta, GAMMA * LAMBDA, terminals)
                returns = advantage + np.array(buffer_v)
                advantage = (advantage - advantage.mean()) / np.maximum(advantage.std(), 1e-6)

                bs, ba, br, badv = np.reshape(buffer_s, (t,) + self.ppo.s_dim), np.vstack(buffer_a), \
                                   np.vstack(returns), np.vstack(advantage)

                graph_summary = self.ppo.update(bs, ba, br, badv, sess)
                buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
                t = 0

            buffer_s.append(s)
            buffer_a.append(a)
            buffer_v.append(v)
            buffer_terminal.append(terminal)
            ep_a.append(a)

            # Clip the action for the environment step only; the unclipped
            # sample stored above is what the policy update uses
            if not self.ppo.discrete:
                a = np.clip(a, self.env.action_space.low, self.env.action_space.high)
            s, r, terminal, _ = self.env.step(a)
            buffer_r.append(r)
            ep_r += r
            ep_t += 1
            t += 1

            if terminal:
                # End of episode summary
                print('Worker_%i' % self.wid, '| Episode: %i' % episode,
                      "| Reward: %.2f" % ep_r, '| Steps: %i' % ep_t)

                if self.wid == 0:
                    worker_summary = tf.Summary()
                    worker_summary.value.add(tag="Reward", simple_value=ep_r)

                    # Create Action histograms for each dimension
                    actions = np.array(ep_a)
                    if self.ppo.discrete:
                        add_histogram(writer, "Action", actions, episode, bins=self.ppo.a_dim)
                    else:
                        for a in range(self.ppo.a_dim):
                            add_histogram(writer, "Action/Dim" + str(a), actions[:, a], episode)

                    try:
                        writer.add_summary(graph_summary, episode)
                    except NameError:  # No PPO update has run yet
                        pass
                    writer.add_summary(worker_summary, episode)
                    writer.flush()

                episode += 1
                break

    self.env.close()
    print("Worker_%i finished" % self.wid)
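# Both fragments above also lean on two small helpers defined elsewhere in the
# repository: RunningStats (running mean/std used to normalise rewards) and
# discount (a discounted cumulative sum that resets at episode boundaries,
# which is what turns the TD residuals into GAE advantages,
# A_t = delta_t + (gamma * lambda) * (1 - done_t) * A_{t+1}).
# The sketches below are assumptions about their behaviour inferred from how
# they are called, not the repository's exact implementations.
import numpy as np


class RunningStats:
    """Track a running mean and standard deviation (Welford's algorithm)."""

    def __init__(self):
        self.n = 0
        self.mean = 0.0
        self._m2 = 0.0
        self.std = 1.0  # Avoid division by zero before the first update

    def update(self, x):
        for value in np.asarray(x, dtype=np.float64).flatten():
            self.n += 1
            delta = value - self.mean
            self.mean += delta / self.n
            self._m2 += delta * (value - self.mean)
        if self.n > 1:
            self.std = np.sqrt(self._m2 / (self.n - 1))


def discount(x, gamma, terminals):
    """Discounted cumulative sum over time, resetting at episode boundaries.

    Assumes `terminals` is offset by one relative to `x` (it carries the
    bootstrap flag), so terminals[i + 1] says whether step i ended an episode.
    """
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + gamma * running * (1.0 - terminals[i + 1])
        out[i] = running
    return out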