def append(self, step, s, a, n_s, r, d):
    # Intrinsic (low-level) reward for progress toward the current subgoal.
    self.sr = self.low_reward(s, self.sg, n_s)

    # Low Replay Buffer
    self.replay_buffer_low.append(
        s, self.sg, a, n_s, self.n_sg, self.sr, float(d))

    # High Replay Buffer: flush the accumulated high-level transition every buffer_freq steps.
    # buf layout: [state, goal (fg), action (sg), reward, n_state, done, state_arr, action_arr]
    if _is_update(step, self.buffer_freq, rem=1):
        if len(self.buf[6]) == self.buffer_freq:
            self.buf[4] = s
            self.buf[5] = float(d)
            self.replay_buffer_high.append(
                state=self.buf[0],
                goal=self.buf[1],
                action=self.buf[2],
                n_state=self.buf[4],
                reward=self.buf[3],
                done=self.buf[5],
                state_arr=np.array(self.buf[6]),
                action_arr=np.array(self.buf[7]))
        self.buf = [s, self.fg, self.sg, 0, None, None, [], []]

    self.buf[3] += self.reward_scaling * r
    self.buf[6].append(s)
    self.buf[7].append(a)
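# `_is_update(step, freq, rem=...)` is used throughout as a periodicity check but is
# not shown in this section. A minimal sketch of the assumed behaviour (trigger
# whenever `step % freq == rem`) is given below; the real helper in this codebase
# may differ, so the name is suffixed to mark it as illustrative only.
def _is_update_sketch(step, freq, rem=0):
    # True every `freq` steps, offset by `rem`; never triggers for non-positive freq.
    if freq <= 0:
        return False
    return step % freq == rem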
def log(self, global_step, data):
    losses, td_errors = data[0], data[1]

    # Logs: only write once training has actually started.
    if global_step >= self.args.start_training_steps and _is_update(global_step, self.args.writer_freq):
        for k, v in losses.items():
            self.logger.write('loss/%s' % (k), v, global_step)

        for k, v in td_errors.items():
            self.logger.write('td_error/%s' % (k), v, global_step)
def end_episode(self, episode, logger=None):
    if logger:
        # log
        logger.write('reward/Intrinsic Reward', self.episode_subreward, episode)

        # Save Model
        if _is_update(episode, self.model_save_freq):
            self.save(episode=episode)

    # Reset per-episode bookkeeping.
    self.episode_subreward = 0
    self.sr = 0
    self.buf = [None, None, None, 0, None, None, [], []]
def evaluate(self, e):
    # Print: periodically evaluate a copy of the agent and report summary statistics.
    if _is_update(e, self.args.print_freq):
        agent = copy.deepcopy(self.agent)
        rewards, success_rate = agent.evaluate_policy(self.env)
        self.logger.write('Success Rate', success_rate, e)

        print('episode:{episode:05d}, mean:{mean:.2f}, std:{std:.2f}, median:{median:.2f}, success:{success:.2f}'.format(
            episode=e,
            mean=np.mean(rewards),
            std=np.std(rewards),
            median=np.median(rewards),
            success=success_rate))
def end_episode(self, episode, logger=None):
    if logger:
        # Save Model
        if _is_update(episode, self.model_save_freq):
            self.save(episode=episode)
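# The methods above are callback-style hooks (store a transition, write logs,
# close out an episode, run a periodic evaluation). A hypothetical driver loop
# showing how such hooks might be wired together is sketched below; the `runner`
# object and the `choose_action`/`train` names on it are assumptions for
# illustration, not this repository's actual training script.
def _training_loop_sketch(env, runner, num_episodes, start_training_steps):
    global_step = 0
    for episode in range(num_episodes):
        s = env.reset()
        done = False
        step = 0
        while not done:
            a = runner.choose_action(s)              # assumed action-selection API
            n_s, r, done, _ = env.step(a)            # gym-style step
            runner.append(step, s, a, n_s, r, done)  # fill low/high replay buffers
            if global_step >= start_training_steps:
                losses, td_errors = runner.train(global_step)  # assumed to return dicts
                runner.log(global_step, (losses, td_errors))
            s = n_s
            step += 1
            global_step += 1
        runner.end_episode(episode, logger=runner.logger)
        runner.evaluate(episode)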