def offline(self, batch):
    # Relabel a sampled batch before it reaches the learner; note that both
    # the reward and the done flag are derived from the next observation.
    obs = self._batch_obs(batch.obs)
    rew = self._batch_rew(batch.nobs)
    nobs = self._batch_obs(batch.nobs)
    done = self._batch_done(batch.nobs)
    batch = Experience(obs, batch.act, rew, nobs, done)
    return batch
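from collections import namedtuple

# A minimal sketch of the Experience container assumed throughout these
# excerpts: a named tuple with the five fields used above. The actual
# definition is not shown here and may differ; note that the episode_end
# snippet further down unpacks its experiences as (s, a, r, n_s, d) and
# reads e.r, which suggests a separate definition with different field names.
Experience = namedtuple("Experience", ["obs", "act", "rew", "nobs", "done"])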
def episode(self, train=True):
    # Run one environment episode, optionally training the agent and the
    # task on the shared experience buffer after every step.
    metrics = {"episode/length": 0,
               "episode/reward_env": 0,
               "episode/reward_task": 0}
    metrics_reset_agent = self.agent.reset()
    metrics_reset_task = self.task.reset()
    self._metrics("reset_agent", metrics_reset_agent, metrics)
    self._metrics("reset_task", metrics_reset_task, metrics)
    done = False
    obs = self.env.reset()
    while not done:
        act = self.agent.act(self.task.obs(obs))
        nobs, rew, done, info = self.env.step(self._scale(act))
        step = self.task.online(Experience(obs, act, rew, nobs, done))
        if self.live:
            self.experience.append(step)
        metrics_train = self.agent.train(self.experience) if train else {}
        metrics_task = self.task.train(self.experience) if train else {}
        self._metrics("actions", {str(k): v for k, v in enumerate(act)}, metrics)
        self._metrics("observations", {str(k): v for k, v in enumerate(obs)}, metrics)
        self._metrics("train", metrics_train, metrics)
        self._metrics("task", metrics_task, metrics)
        metrics["episode/reward_env"] += rew
        metrics["episode/reward_task"] += step.rew
        metrics["episode/length"] += 1
        metrics["episode/buffer"] = len(self.experience)
        obs = nobs
        done = step.done  # the task may terminate the episode early
    return metrics
def online(self, step):
    # Goal-reaching task: the reward is the negative Euclidean distance
    # between the last three entries of the next observation and the goal.
    self.time += 1
    goal = np.array(self.params.TASK_GOAL).astype(np.float32)
    dist = np.linalg.norm(step.nobs[-3:] - goal)
    rew = -dist
    done = (dist < self.params.TASK_THRESHOLD
            or self.time >= self.params.TASK_TIME_LIMIT
            or step.done)
    step = Experience(step.obs, step.act, rew, step.nobs, done)
    return step
def offline(self, batch):
    # Discriminator-based intrinsic reward: the last L_SIZE entries of nobs
    # hold a one-hot latent/skill code, and the reward is the discriminator's
    # log-likelihood of that code minus log p(z) for a uniform prior,
    # i.e. -CE(logits, z) - log(1 / L_SIZE).
    pruned_nobs = batch.nobs[:, :-self.params.L_SIZE]
    logits = self.discr(pruned_nobs)
    target = self._from_one_hot(batch.nobs[:, -self.params.L_SIZE:])
    rew = -torch.nn.functional.cross_entropy(
        logits, target, reduction="none") - np.log(1.0 / self.params.L_SIZE)
    batch = Experience(batch.obs, batch.act, rew.detach().unsqueeze(1),
                       batch.nobs, batch.done)
    return batch
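import torch

# Sketch only: _from_one_hot is referenced above but not defined in this
# excerpt. Assuming the last L_SIZE entries of nobs hold a one-hot code, a
# plausible implementation recovers the integer class indices expected by
# cross_entropy via argmax. The real helper may differ.
def _from_one_hot(self, one_hot):
    # (batch, L_SIZE) one-hot rows -> (batch,) integer class indices
    return torch.argmax(one_hot, dim=1)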
def episode_end(self, episode, step_count, agent):
    # Convert the episode's raw rewards into discounted returns, store the
    # relabeled experiences, and start training once the buffer is full.
    rewards = [e.r for e in self.experiences]
    self.reward_log.append(sum(rewards))

    discounteds = []
    for t, r in enumerate(rewards):
        d_r = [_r * (self.gamma ** i) for i, _r in enumerate(rewards[t:])]
        d_r = sum(d_r)
        discounteds.append(d_r)

    for i, e in enumerate(self.experiences):
        s, a, r, n_s, d = e
        d_r = discounteds[i]
        d_e = Experience(s, a, d_r, n_s, d)
        self.d_experiences.append(d_e)

    if not self.training and len(self.d_experiences) == self.buffer_size:
        self.begin_train(i, agent)
        self.training = True

    if self.is_event(episode, self.report_interval):
        recent_rewards = self.reward_log[-self.report_interval:]
        self.logger.describe("reward", recent_rewards, episode=episode)
def offline(self, batch):
    # Prediction-error intrinsic reward (RND-style): the squared error
    # between a fixed target network and a trained predictor network.
    rew = torch.sum((self.targn(batch.obs) - self.predn(batch.obs)) ** 2, dim=1)
    batch = Experience(batch.obs, batch.act, rew.detach(), batch.nobs, batch.done)
    return batch
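import torch

# Sketch only: targn and predn are not defined in this excerpt. The usage
# above is consistent with a Random Network Distillation pair -- a frozen,
# randomly initialized target network and a trainable predictor with the
# same output size. _make_rnd_networks and its sizes are assumptions.
def _make_rnd_networks(obs_size, feat_size, hidden=64):
    targn = torch.nn.Sequential(
        torch.nn.Linear(obs_size, hidden), torch.nn.ReLU(),
        torch.nn.Linear(hidden, feat_size))
    predn = torch.nn.Sequential(
        torch.nn.Linear(obs_size, hidden), torch.nn.ReLU(),
        torch.nn.Linear(hidden, feat_size))
    for p in targn.parameters():
        p.requires_grad = False  # the target network stays frozen
    return targn, predn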
def batch(self, buf):
    # Sample BATCH transitions, transpose them field-wise, and turn each
    # field into a (BATCH, -1) float tensor on the GPU.
    return Experience(*map(
        lambda x: torch.FloatTensor(x).view(self.params.BATCH, -1).cuda(),
        zip(*random.sample(buf, self.params.BATCH))))
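# Usage sketch (the names below are illustrative, not taken from this
# excerpt): sample a training batch from the replay buffer, then let the
# task relabel its rewards offline before the agent update.
#
#   batch = self.batch(self.experience)   # Experience of (BATCH, -1) CUDA tensors
#   batch = self.task.offline(batch)      # rewards replaced by the task signal
#   self.agent.update(batch)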
def online(self, step):
    # Apply the task's observation transform to both obs and nobs.
    return Experience(self.obs(step.obs), step.act, step.rew,
                      self.obs(step.nobs), step.done)
def decode(self, step):
    # Strip the trailing L_SIZE latent-code entries from the observations
    # before the transition is consumed downstream.
    pruned_obs = step.obs[:-self.params.L_SIZE]
    pruned_nobs = step.nobs[:-self.params.L_SIZE]
    return Experience(pruned_obs, step.act, step.rew, pruned_nobs, step.done)