Example 1
def _postprocess_dqn(policy_graph, sample_batch):
    obs, actions, rewards, new_obs, dones = [
        list(x) for x in sample_batch.columns(
            ["obs", "actions", "rewards", "new_obs", "dones"])]

    # N-step Q adjustments
    if policy_graph.config["n_step"] > 1:
        adjust_nstep(
            policy_graph.config["n_step"], policy_graph.config["gamma"],
            obs, actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": obs, "actions": actions, "rewards": rewards,
        "new_obs": new_obs, "dones": dones,
        "weights": np.ones_like(rewards)})

    # Prioritize on the worker side
    if batch.count > 0 and policy_graph.config["worker_side_prioritization"]:
        td_errors = policy_graph.compute_td_error(
            batch["obs"], batch["actions"], batch["rewards"],
            batch["new_obs"], batch["dones"], batch["weights"])
        # Store |TD error| + eps in the "weights" column so downstream replay
        # can use these values as the initial priorities of the new transitions.
        new_priorities = (
            np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities

    return batch
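
The adjust_nstep helper is not shown in this excerpt. Below is a minimal, self-contained sketch of the in-place n-step rewrite it performs; the name nstep_adjust and the exact handling of episode boundaries are assumptions for illustration, not RLlib's actual implementation.

def nstep_adjust(n_step, gamma, obs, actions, rewards, new_obs, dones):
    # Fold each 1-step transition into an (up to) n-step transition in place:
    # rewards[i] becomes the discounted n-step return, while new_obs[i] and
    # dones[i] are moved forward to the state reached n steps later or at the
    # episode boundary. obs and actions are untouched; they are kept only to
    # mirror the call signature used above.
    length = len(rewards)
    for i in range(length):
        for j in range(1, n_step):
            if i + j >= length or dones[i]:
                break  # ran off the batch or reached a terminal transition
            new_obs[i] = new_obs[i + j]
            dones[i] = dones[i + j]
            rewards[i] += gamma ** j * rewards[i + j]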
Example 2
    def _optimize(self):
        with self.replay_timer:
            if isinstance(self.replay_buffer, PrioritizedReplayBuffer):
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_indexes) = self.replay_buffer.sample(
                     self.train_batch_size, beta=self.prioritized_replay_beta)
            else:
                (obses_t, actions, rewards, obses_tp1,
                 dones) = self.replay_buffer.sample(self.train_batch_size)
                # Uniform replay: unit importance weights and -1 as a
                # "no replay index" sentinel.
                weights = np.ones_like(rewards)
                batch_indexes = -np.ones_like(rewards)

            samples = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes
            })

        with self.grad_timer:
            # Compute and apply gradients locally; the returned TD errors
            # become the refreshed priorities of the sampled transitions.
            td_error = self.local_evaluator.compute_apply(samples)
            new_priorities = (np.abs(td_error) + self.prioritized_replay_eps)
            if isinstance(self.replay_buffer, PrioritizedReplayBuffer):
                self.replay_buffer.update_priorities(samples["batch_indexes"],
                                                     new_priorities)
            self.grad_timer.push_units_processed(samples.count)

        self.num_steps_trained += samples.count
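
The beta argument passed to replay_buffer.sample controls the importance-sampling correction that produces the "weights" column; the buffer's internals are not part of this excerpt. A rough sketch of how such weights are conventionally derived from stored priorities, assuming the scheme of Schaul et al.'s prioritized experience replay rather than the buffer's actual code:

import numpy as np

def importance_weights(priorities, alpha, beta):
    # Transition i is sampled with probability P(i) = p_i**alpha / sum_k p_k**alpha;
    # the sampling bias is corrected with w_i = (N * P(i))**(-beta),
    # normalized by the largest weight for stability.
    probs = np.asarray(priorities, dtype=np.float64) ** alpha
    probs /= probs.sum()
    weights = (len(probs) * probs) ** (-beta)
    return weights / weights.max()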
Example 3
    def replay(self):
        with self.replay_timer:
            # Don't start replaying until enough transitions have been stored.
            if len(self.replay_buffer) < self.replay_starts:
                return None

            (obses_t, actions, rewards, obses_tp1,
                dones, weights, batch_indexes) = self.replay_buffer.sample(
                    self.train_batch_size,
                    beta=self.prioritized_replay_beta)

            batch = SampleBatch({
                "obs": obses_t, "actions": actions, "rewards": rewards,
                "new_obs": obses_tp1, "dones": dones, "weights": weights,
                "batch_indexes": batch_indexes})
            return batch
Example 4
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(self.config["sample_batch_size"] +
                       self.config["n_step"] - 1):
            action = self.agent.act(self.state)
            next_state, reward, done, _ = self.env.step(action)
            next_state = to_rainbow(next_state)
            obs.append(self.state.data.cpu().numpy())
            actions.append(action)
            rewards.append(reward)
            new_obs.append(next_state.data.cpu().numpy())
            dones.append(1.0 if done else 0.0)
            self.state = next_state
            self.episode_rewards[-1] += reward
            self.episode_lengths[-1] += 1
            if done:
                self.state = to_rainbow(self.env.reset())
                self.agent.reset_noise()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            self.local_timestep += 1

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(self.config["n_step"], self.config["gamma"], obs,
                         actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": obs,
            "actions": actions,
            "rewards": rewards,
            "new_obs": new_obs,
            "dones": dones,
            "weights": np.ones_like(rewards)
        })
        assert batch.count == self.config["sample_batch_size"]

        td_errors = self.agent.compute_td_error(batch)
        # Pack (compress) the observations only after the TD errors have been
        # computed from the raw arrays.
        batch.data["obs"] = [pack(o) for o in batch["obs"]]
        batch.data["new_obs"] = [pack(o) for o in batch["new_obs"]]
        new_priorities = (np.abs(td_errors) +
                          self.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities

        return batch
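
The pack helper is not shown either; its role here is to compress/serialize each observation so the returned batch is cheaper to store and ship, which is why it runs only after compute_td_error has seen the raw arrays. A hypothetical stand-in that round-trips the same way (not the library's actual pack/unpack):

import base64
import pickle
import zlib

def pack(array):
    # Serialize and compress one observation into a compact byte string.
    return base64.b64encode(zlib.compress(pickle.dumps(array)))

def unpack(blob):
    # Inverse of pack: decode, decompress, and deserialize.
    return pickle.loads(zlib.decompress(base64.b64decode(blob)))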
Example 5
    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            update_eps = self.exploration.value(self.local_timestep)
            action = self.act(
                np.array(self.obs)[None], update_eps=update_eps)[0]
            obs_tp1, reward, done, _ = self.env.step(action)
            obs.append(self.obs)
            actions.append(action)
            rewards.append(np.sign(reward))  # clip rewards to {-1, 0, +1}
            new_obs.append(obs_tp1)
            dones.append(1.0 if done else 0.0)
            self.obs = obs_tp1
            self.episode_rewards[-1] += reward
            self.episode_lengths[-1] += 1
            if done:
                self.obs = self.env.reset()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            self.local_timestep += 1

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(
                self.config["n_step"], self.config["gamma"],
                obs, actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": obs, "actions": actions, "rewards": rewards,
            "new_obs": new_obs, "dones": dones,
            "weights": np.ones_like(rewards)})
        assert batch.count == self.config["sample_batch_size"]

#        td_errors = self.agent.compute_td_error(batch)
        batch.data["obs"] = [pack(o) for o in batch["obs"]]
        batch.data["new_obs"] = [pack(o) for o in batch["new_obs"]]
#        new_priorities = (
#            np.abs(td_errors) + self.config["prioritized_replay_eps"])
#        batch.data["weights"] = new_priorities

        return batch
Example 6
    def _replay(self):
        samples = {}
        with self.replay_timer:
            for policy_id, replay_buffer in self.replay_buffers.items():
                if isinstance(replay_buffer, PrioritizedReplayBuffer):
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_indexes) = replay_buffer.sample(
                         self.train_batch_size,
                         beta=self.prioritized_replay_beta)
                else:
                    (obses_t, actions, rewards, obses_tp1,
                     dones) = replay_buffer.sample(self.train_batch_size)
                    weights = np.ones_like(rewards)
                    batch_indexes = -np.ones_like(rewards)
                # Build one SampleBatch per policy inside the loop so every
                # policy's replay data is kept, not just the last one's.
                samples[policy_id] = SampleBatch({
                    "obs": obses_t,
                    "actions": actions,
                    "rewards": rewards,
                    "new_obs": obses_tp1,
                    "dones": dones,
                    "weights": weights,
                    "batch_indexes": batch_indexes
                })
        return MultiAgentBatch(samples, self.train_batch_size)
Example 7
    def sample(self):
        """sample rollouts from the environment, being called in step in PolicyOptimizer"""
        observations, rewards, actions, logprobs, dones, values = [], [], [], [], [], []
        done = False
        for step in range(self.config['steps_per_rollout']):
            value, action, logprob, mean = self.net.forward(
                to_variable(self.obs[np.newaxis], self.config['cuda']))
            action = (action.cpu().data.numpy()[0]
                      if self.config['cuda'] else action.data.numpy()[0])
            next_obs, reward, done, _ = self.env.step(action)

            if self.config['cuda']:
                # The network outputs carry a leading batch dimension of 1,
                # so take the single entry from each.
                value, logprob, mean = (value.data.cpu().numpy()[0],
                                        logprob.data.cpu().numpy()[0],
                                        mean.data.cpu().numpy()[0])
            else:
                value, logprob, mean = (value.data.numpy()[0],
                                        logprob.data.numpy()[0],
                                        mean.data.numpy()[0])

            observations.append(self.obs)
            actions.append(action)
            rewards.append(reward)
            logprobs.append(logprob)
            values.append(value)
            dones.append(done)

            self.obs = next_obs

            if done:
                # reset the environment
                self.obs = self.env.reset()

        if done:
            last_value = 0.0
        else:
            # bootstrap, we only need the last value to do this
            value, action, logprob, mean = self.net.forward(
                to_variable(self.obs[np.newaxis], self.config['cuda']))

            if self.config['cuda']:
                # Again drop the leading batch dimension, unpacking the
                # single bootstrap value.
                value, = value.data.cpu().numpy()[0]
            else:
                value, = value.data.numpy()[0]
            last_value = value

        # same as ppo_single/model/ppo.py
        observations = np.asarray(observations)
        rewards = np.asarray(rewards)
        logprobs = np.asarray(logprobs)
        dones = np.asarray(dones)
        values = np.asarray(values)
        actions = np.asarray(actions)
        returns = calculate_returns(rewards, dones, last_value,
                                    self.config['gamma'])
        return SampleBatch({
            'observations': observations,
            'rewards': rewards,
            'logprobs': logprobs,
            'dones': dones,
            'values': values,
            'actions': actions,
            'returns': returns[:-1]
        })
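
calculate_returns is referenced but not defined in this excerpt. The returns[:-1] slice implies it yields len(rewards) + 1 entries with the bootstrap value in the final slot; one sketch consistent with that convention (an assumption, not necessarily the project's actual helper):

import numpy as np

def calculate_returns(rewards, dones, last_value, gamma):
    # Discounted returns computed backwards from the bootstrap value, with
    # the future term zeroed at episode boundaries (dones).
    returns = np.zeros(len(rewards) + 1, dtype=np.float64)
    returns[-1] = last_value
    for t in reversed(range(len(rewards))):
        returns[t] = rewards[t] + gamma * returns[t + 1] * (1.0 - dones[t])
    return returns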