Example #1
    def offline(self, batch):
        # Relabel a sampled batch: observations are transformed, and the
        # reward and done flags are recomputed from the next observations.
        obs = self._batch_obs(batch.obs)
        rew = self._batch_rew(batch.nobs)
        nobs = self._batch_obs(batch.nobs)
        done = self._batch_done(batch.nobs)
        batch = Experience(obs, batch.act, rew, nobs, done)
        return batch
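
Every snippet in this listing passes transitions around as an Experience tuple with fields obs, act, rew, nobs and done (Example #5 unpacks a variant named s, a, r, n_s, d). The container itself is never shown; a minimal sketch of the kind of definition being assumed:

    from collections import namedtuple

    # Assumed transition container: observation, action, reward,
    # next observation, terminal flag.
    Experience = namedtuple("Experience", ["obs", "act", "rew", "nobs", "done"])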
Example #2
    def episode(self, train=True):
        # Run one environment episode: the task wraps observations and
        # relabels each transition before it is stored and trained on.
        metrics = {"episode/length": 0, "episode/reward_env": 0, "episode/reward_task": 0}
        metrics_reset_agent = self.agent.reset()
        metrics_reset_task = self.task.reset()
        self._metrics("reset_agent", metrics_reset_agent, metrics)
        self._metrics("reset_task", metrics_reset_task, metrics)
        done = False
        obs = self.env.reset()
        while not done:
            act = self.agent.act(self.task.obs(obs))
            nobs, rew, done, info = self.env.step(self._scale(act))
            step = self.task.online(Experience(obs, act, rew, nobs, done))
            if self.live:
                self.experience.append(step)
            metrics_train = self.agent.train(self.experience) if train else {}
            metrics_task = self.task.train(self.experience) if train else {}
            self._metrics("actions", {str(k): v for k, v in enumerate(act)}, metrics)
            self._metrics("observations", {str(k): v for k, v in enumerate(obs)}, metrics)
            self._metrics("train", metrics_train, metrics)
            self._metrics("task", metrics_task, metrics)
            metrics["episode/reward_env"] += rew        # raw environment reward
            metrics["episode/reward_task"] += step.rew  # task-relabelled reward
            metrics["episode/length"] += 1
            metrics["episode/buffer"] = len(self.experience)
            obs = nobs
            done = step.done  # the task may end the episode early
        return metrics
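
The _metrics helper is not part of the excerpt. A plausible sketch that matches the calls above, assuming it prefixes each key and accumulates per-step values into the running episode dictionary (the accumulation rule is a guess):

    def _metrics(self, prefix, new, metrics):
        # Hypothetical helper: merge a {name: value} dict into the episode
        # metrics under "prefix/name", summing values across calls.
        for k, v in new.items():
            key = prefix + "/" + k
            metrics[key] = metrics.get(key, 0) + v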
Example #3
    def online(self, step):
        # Goal-reaching reward: negative Euclidean distance between the last
        # three entries of the next observation and the task goal. The
        # episode ends when the goal is reached, the time limit is hit, or
        # the environment itself terminates.
        self.time += 1
        goal = np.array(self.params.TASK_GOAL).astype(np.float32)
        dist = np.linalg.norm(step.nobs[-3:] - goal)
        rew = -dist
        done = (dist < self.params.TASK_THRESHOLD
                or self.time >= self.params.TASK_TIME_LIMIT
                or step.done)
        step = Experience(step.obs, step.act, rew, step.nobs, done)
        return step
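
A toy check of the distance-based reward, with an assumed goal of [0, 0, 1], a six-dimensional observation and a threshold of 0.05 (none of these values come from the excerpt):

    import numpy as np

    goal = np.array([0.0, 0.0, 1.0], dtype=np.float32)    # assumed TASK_GOAL
    nobs = np.array([0.1, 0.2, 0.3, 0.0, 0.1, 0.9], dtype=np.float32)
    dist = np.linalg.norm(nobs[-3:] - goal)                # ~0.141
    rew = -dist                                            # approaches 0 near the goal
    done = bool(dist < 0.05)                               # False for this observation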
Example #4
    def offline(self, batch):
        # Discriminator-based intrinsic reward: the last L_SIZE entries of
        # each next observation are a one-hot latent code, and the reward is
        # log q(z | s') - log p(z) with a uniform prior over L_SIZE codes.
        pruned_nobs = batch.nobs[:, :-self.params.L_SIZE]
        logits = self.discr(pruned_nobs)
        target = self._from_one_hot(batch.nobs[:, -self.params.L_SIZE:])
        rew = -torch.nn.functional.cross_entropy(
            logits, target, reduction="none") - np.log(1.0 / self.params.L_SIZE)
        batch = Experience(batch.obs, batch.act,
                           rew.detach().unsqueeze(1), batch.nobs, batch.done)
        return batch
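
The _from_one_hot helper is not shown. Since cross_entropy with reduction="none" expects integer class indices as targets, a plausible implementation is an argmax over the one-hot slice:

    import torch

    def _from_one_hot(self, one_hot):
        # Hypothetical helper: convert (batch, L_SIZE) one-hot rows into the
        # integer class indices that cross_entropy expects.
        return torch.argmax(one_hot, dim=1)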
Example #5
    def episode_end(self, episode, step_count, agent):
        # Record the episode return, relabel every stored experience with its
        # Monte Carlo discounted return, and start training once the
        # discounted buffer is full.
        rewards = [e.r for e in self.experiences]
        self.reward_log.append(sum(rewards))

        discounteds = []
        for t, r in enumerate(rewards):
            d_r = [_r * (self.gamma**i) for i, _r in enumerate(rewards[t:])]
            d_r = sum(d_r)
            discounteds.append(d_r)

        for i, e in enumerate(self.experiences):
            s, a, r, n_s, d = e
            d_r = discounteds[i]
            d_e = Experience(s, a, d_r, n_s, d)
            self.d_experiences.append(d_e)

        if not self.training and len(self.d_experiences) == self.buffer_size:
            self.begin_train(i, agent)
            self.training = True

        if self.is_event(episode, self.report_interval):
            recent_rewards = self.reward_log[-self.report_interval:]
            self.logger.describe("reward", recent_rewards, episode=episode)
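
The nested loop above recomputes the tail sum for every step, which is quadratic in episode length. The same discounted returns can be accumulated in a single backward pass; a standalone sketch with toy values:

    gamma = 0.99                   # stands in for self.gamma
    rewards = [0.0, 0.0, 1.0]      # toy episode rewards
    discounteds, running = [], 0.0
    for r in reversed(rewards):    # G_t = r_t + gamma * G_{t+1}
        running = r + gamma * running
        discounteds.append(running)
    discounteds.reverse()          # [0.9801, 0.99, 1.0] == [gamma**2, gamma, 1.0]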
Example #6
    def offline(self, batch):
        # Reward is the squared difference between the outputs of the target
        # network (targn) and the predictor network (predn) on the batch
        # observations, summed over the feature dimension.
        rew = torch.sum((self.targn(batch.obs) - self.predn(batch.obs))**2,
                        dim=1)
        batch = Experience(batch.obs, batch.act, rew.detach(), batch.nobs,
                           batch.done)
        return batch
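
This looks like a random-network-distillation style bonus: targn would be a randomly initialized, frozen network and predn a network trained to imitate it, so the squared error stays large on rarely visited observations. A minimal sketch of such a pair, with assumed layer sizes:

    import torch

    obs_dim, feat_dim = 8, 32                  # assumed dimensions
    targn = torch.nn.Sequential(torch.nn.Linear(obs_dim, 64), torch.nn.ReLU(),
                                torch.nn.Linear(64, feat_dim))
    predn = torch.nn.Sequential(torch.nn.Linear(obs_dim, 64), torch.nn.ReLU(),
                                torch.nn.Linear(64, feat_dim))
    for p in targn.parameters():               # freeze the target network
        p.requires_grad_(False)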
Example #7
    def batch(self, buf):
        # Sample BATCH transitions, regroup them field by field, and turn
        # each field into a (BATCH, -1) float tensor on the GPU.
        return Experience(*map(
            lambda x: torch.FloatTensor(x).view(self.params.BATCH, -1).cuda(),
            zip(*random.sample(buf, self.params.BATCH))))
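
For illustration, the same field-wise batching can be reproduced on the CPU with a toy buffer (the .cuda() call in the original additionally assumes a GPU is available):

    import random
    import torch
    from collections import namedtuple

    Experience = namedtuple("Experience", ["obs", "act", "rew", "nobs", "done"])
    BATCH = 32

    buf = [Experience([i, i], [0.0], [1.0], [i + 1.0, i + 1.0], [0.0])
           for i in range(100)]
    fields = zip(*random.sample(buf, BATCH))   # regroup per field
    obs, act, rew, nobs, done = (torch.FloatTensor(x).view(BATCH, -1)
                                 for x in fields)
    # obs.shape == (32, 2), rew.shape == (32, 1)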
Example #8
    def online(self, step):
        # Apply the observation transform to both the current and the next
        # observation; action, reward and done pass through unchanged.
        return Experience(self.obs(step.obs), step.act, step.rew,
                          self.obs(step.nobs), step.done)
Example #9
    def decode(self, step):
        # Drop the trailing L_SIZE entries from both the observation and the
        # next observation before handing the transition back.
        pruned_obs = step.obs[:-self.params.L_SIZE]
        pruned_nobs = step.nobs[:-self.params.L_SIZE]
        return Experience(pruned_obs, step.act, step.rew, pruned_nobs,
                          step.done)