Example #1
# Assumed imports for this snippet; CPPRB refers to cpprb's ReplayBuffer.
from typing import Dict, Sequence

import numpy as np
from cpprb import ReplayBuffer as CPPRB


class ReplayBuffer:
    def __init__(self, size, env_dict, n_step_dict=None, min_storage=10000, done_string="done"):
        super().__init__()
        self.done_string = done_string
        self.min_storage = min_storage
        cpprb_args = {
            "size": size,
            "env_dict": env_dict,
            "Nstep": n_step_dict
        }
        self.buffer = CPPRB(**cpprb_args)

    def add(self, data: Sequence[Dict[str, np.ndarray]]) -> None:
        for d in data:
            self.buffer.add(**d)
            if d[self.done_string]:
                self.buffer.on_episode_end()

    def sample(self, size: int) -> Dict[str, np.ndarray]:
        if self.buffer.get_stored_size() < self.min_storage:
            print(
                f"stored sample {self.buffer.get_stored_size()} is smaller than mininum storage" +
                f"size {self.min_storage}. Returning None."
            )
            return None
        else:
            return self.buffer.sample(size)
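A minimal usage sketch for this wrapper (not part of the original source); the env_dict keys, shapes, and min_storage value are illustrative assumptions.

env_dict = {
    "obs": {"shape": (4,)},
    "act": {"shape": (1,)},
    "rew": {},
    "next_obs": {"shape": (4,)},
    "done": {},
}
rb = ReplayBuffer(size=int(1e5), env_dict=env_dict, min_storage=100)

# `add` takes a sequence of per-transition dicts; an episode is closed
# automatically whenever the "done" entry is truthy.
rb.add([
    {"obs": np.zeros(4), "act": np.zeros(1), "rew": 0.0,
     "next_obs": np.ones(4), "done": 0},
    {"obs": np.ones(4), "act": np.zeros(1), "rew": 1.0,
     "next_obs": np.zeros(4), "done": 1},
])

batch = rb.sample(32)  # returns None until min_storage transitions are stored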
Example #2
    def test_load_Nstep(self):
        """
        Load Nstep transitions
        """
        buffer_size = 10
        env_dict = {"done": {}}
        Nstep = {"size": 3, "gamma": 0.99}

        rb1 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)
        rb2 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)
        rb3 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)

        d = [0, 0, 0, 0, 1]

        rb1.add(done=d)
        rb1.on_episode_end()

        fname="Nstep.npz"
        rb1.save_transitions(fname)
        rb2.load_transitions(fname)
        rb3.load_transitions(v(1,fname))

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["done"], t2["done"])
        np.testing.assert_allclose(t1["done"], t3["done"])
Example #3
    def test_Nstep_incompatibility(self):
        """
        Raise ValueError when Nstep incompatibility
        """
        buffer_size = 10
        env_dict = {"done": {}}
        Nstep = {"size": 3, "gamma": 0.99}

        rb1 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)
        rb2 = ReplayBuffer(buffer_size, env_dict)
        rb3 = ReplayBuffer(buffer_size, env_dict)

        d = [0, 0, 0, 0, 1]

        rb1.add(done=d)
        rb1.on_episode_end()

        fname="Nstep_raise.npz"
        rb1.save_transitions(fname)

        with self.assertRaises(ValueError):
            rb2.load_transitions(fname)

        with self.assertRaises(ValueError):
            rb3.load_transitions(v(1,fname))
Example #4
    def test_stack_compress(self):
        bsize = 10
        odim = 2
        ssize = 2
        rb = ReplayBuffer(bsize, {"a": {
            "shape": (odim, ssize)
        }},
                          stack_compress="a")
        a = np.random.rand(odim, bsize + ssize - 1)

        for i in range(bsize):
            rb.add(a=a[:, i:i + ssize])

        _a = rb.get_all_transitions()["a"]
        for i in range(bsize):
            with self.subTest(i=i, label="without cache"):
                np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

        for i in range(bsize):
            rb._encode_sample([i])

        rb.clear()

        for i in range(bsize):
            rb.add(a=a[:, i:i + ssize])
            rb.on_episode_end()

        _a = rb.get_all_transitions()["a"]
        for i in range(bsize):
            with self.subTest(i=i, label="without cache"):
                np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

        for i in range(bsize):
            rb._encode_sample([i])
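Not part of the original test: stack_compress is typically used when consecutive observations overlap along their last dimension (e.g. frame stacking), so cpprb stores the overlapping slices only once. A hedged sketch with illustrative shapes:

frame_rb = ReplayBuffer(int(1e5),
                        {"obs": {"shape": (84, 84, 4), "dtype": np.uint8}},
                        stack_compress="obs")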
Example #5
    def test_has_next_of(self):
        bsize = 10
        rb = ReplayBuffer(bsize, {"a": {}}, next_of="a")
        a = np.random.rand(bsize + 1)

        for i in range(bsize):
            rb.add(a=a[i], next_a=a[i + 1])

        _next_a = np.ravel(rb.get_all_transitions()["next_a"])
        np.testing.assert_allclose(_next_a, a[1:bsize + 1])

        for i in range(bsize):
            rb._encode_sample([i])

        rb.clear()

        for i in range(bsize):
            rb.add(a=a[i], next_a=a[i + 1])
            rb.on_episode_end()

        _next_a = np.ravel(rb.get_all_transitions()["next_a"])
        np.testing.assert_allclose(_next_a, a[1:bsize + 1])

        for i in range(bsize):
            rb._encode_sample([i])
Example #6
def explorer(global_rb, env_dict, is_training_done, queue):
    local_buffer_size = int(1e+2)
    local_rb = ReplayBuffer(local_buffer_size, env_dict)

    model = MyModel()
    env = gym.make("CartPole-v1")

    obs = env.reset()
    while not is_training_done.is_set():
        if not queue.empty():
            w = queue.get()
            model.weights = w

        action = model.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        local_rb.add(obs=obs, act=action, rew=reward, next_obs=next_obs, done=done)

        if done:
            local_rb.on_episode_end()
            obs = env.reset()
        else:
            obs = next_obs

        if local_rb.get_stored_size() == local_buffer_size:
            local_sample = local_rb.get_all_transitions()
            local_rb.clear()

            absTD = model.abs_TD_error(local_sample)
            global_rb.add(**local_sample, priorities=absTD)
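The explorer above pushes transitions together with priorities into a shared global buffer. Below is a hedged sketch of a matching learner side, assuming global_rb is one of cpprb's prioritized buffers (whose sample() returns "indexes" and "weights" and which provides update_priorities()); MyModel.train_on_batch is a hypothetical training helper, and n_iterations/batch_size are illustrative.

def learner(global_rb, queue, is_training_done, n_iterations=10000, batch_size=256):
    # Minimal learner-side sketch: train from the shared prioritized buffer
    # and push updated weights back to explorers through the queue.
    model = MyModel()  # assumed to be the same model class as in the explorer

    for _ in range(n_iterations):
        if global_rb.get_stored_size() < batch_size:
            continue

        sample = global_rb.sample(batch_size)   # includes "weights" and "indexes"
        model.train_on_batch(sample)            # hypothetical training helper

        # Re-compute absolute TD errors and refresh priorities of the sampled
        # transitions so that informative transitions are drawn more often.
        global_rb.update_priorities(sample["indexes"],
                                    model.abs_TD_error(sample))

        queue.put(model.weights)

    is_training_done.set()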
Example #7
    def test_cache_next_of(self):
        stack_size = 3
        episode_len = 5
        rb = ReplayBuffer(32,
                          {"obs": {
                              "shape": (stack_size),
                              "dtype": np.int
                          }},
                          next_of="obs",
                          stack_compress="obs")

        obs = np.arange(episode_len + stack_size + 2, dtype=int)
        # [0,1,...,episode_len+stack_size+1]
        obs2 = obs + 3 * episode_len
        # [3*episode_len,...,4*episode_len+stack_size+1]

        # Add 1st episode
        for i in range(episode_len):
            rb.add(obs=obs[i:i + stack_size],
                   next_obs=obs[i + 1:i + 1 + stack_size])

        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len)
        for i in range(episode_len):
            with self.subTest(i=i):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])

        # Reset environment
        rb.on_episode_end()
        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len)
        for i in range(episode_len):
            with self.subTest(i=i):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])

        # Add 2nd episode
        for i in range(episode_len):
            rb.add(obs=obs2[i:i + stack_size],
                   next_obs=obs2[i + 1:i + 1 + stack_size])

        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), 2 * episode_len)
        for i in range(episode_len):
            with self.subTest(i=i):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])
        for i in range(episode_len):
            with self.subTest(i=i + episode_len):
                np.testing.assert_equal(s["obs"][i + episode_len],
                                        obs2[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i + episode_len],
                                        obs2[i + 1:i + 1 + stack_size])
Example #8
    def test_next_obs(self):
        buffer_size = 32
        nstep = 4
        gamma = 0.99
        rb = ReplayBuffer(buffer_size, {
            "next_obs": {},
            "done": {}
        },
                          Nstep={
                              "size": nstep,
                              "gamma": gamma,
                              "next": "next_obs"
                          })

        rb.add(next_obs=1, done=0)
        rb.add(next_obs=2, done=0)
        rb.add(next_obs=3, done=0)
        rb.add(next_obs=4, done=0)
        rb.add(next_obs=5, done=0)
        np.testing.assert_allclose(rb.get_all_transitions()["next_obs"],
                                   np.asarray([[4], [5]]))

        rb.add(next_obs=6, done=1)
        rb.on_episode_end()

        sample = rb.get_all_transitions()
        np.testing.assert_allclose(sample["next_obs"][sample["done"] == 0.0],
                                   np.asarray([4, 5, 6]))

        rb.add(next_obs=7, done=0)
        rb.add(next_obs=8, done=0)
        rb.add(next_obs=9, done=0)
        rb.add(next_obs=10, done=1)
        rb.on_episode_end()
        sample = rb.get_all_transitions()
        np.testing.assert_allclose(sample["next_obs"][sample["done"] == 0.0],
                                   np.asarray([4, 5, 6, 10]))
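For context (not part of the original test): an N-step configuration usually also names the reward key so cpprb accumulates the discounted n-step reward sum. A hedged sketch with illustrative keys and shapes:

nstep_rb = ReplayBuffer(32,
                        {"obs": {"shape": (4,)},
                         "act": {},
                         "rew": {},
                         "next_obs": {"shape": (4,)},
                         "done": {}},
                        Nstep={"size": 4,
                               "gamma": 0.99,
                               "rew": "rew",         # discounted n-step sum of rewards
                               "next": "next_obs"})  # taken from the n-step-ahead transition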
Example #9
def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True,
               record=False, record_project='benchmarking', record_name='trained',
               data_path='', config_name='test', max_len_rb=100, benchmark=False,
               log_prefix=''):
    assert env is not None, \
        "Environment not found!\n\n It looks like the environment wasn't saved, " + \
        "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \
        "page on Experiment Outputs for how to handle this situation."

    logger = EpochLogger()
    o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0
    ep_cost = 0
    local_steps_per_epoch = int(4000 / num_procs())

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    rew_mov_avg_10 = []
    cost_mov_avg_10 = []

    if benchmark:
        ep_costs = []
        ep_rewards = []

    if record:
        wandb.login()
        # 4 million env interactions
        wandb.init(project=record_project, name=record_name)

        rb = ReplayBuffer(size=10000,
                          env_dict={
                              "obs": {"shape": obs_dim},
                              "act": {"shape": act_dim},
                              "rew": {},
                              "next_obs": {"shape": obs_dim},
                              "done": {}})

        # columns = ['observation', 'action', 'reward', 'cost', 'done']
        # sim_data = pd.DataFrame(index=[0], columns=columns)

    while n < num_episodes:
        if render:
            env.render()
            time.sleep(1e-3)

        a = get_action(o)
        next_o, r, d, info = env.step(a)

        if record:
            # buf.store(next_o, a, r, None, info['cost'], None, None, None)
            done_int = int(d)
            rb.add(obs=o, act=a, rew=r, next_obs=next_o, done=done_int)

        ep_ret += r
        ep_len += 1
        ep_cost += info['cost']

        # Important!
        o = next_o

        if d or (ep_len == max_ep_len):
            # finish recording and save csv
            if record:
                rb.on_episode_end()

                # make directory if does not exist
                if not os.path.exists(data_path + config_name + '_episodes'):
                    os.makedirs(data_path + config_name + '_episodes')

                # buf = CostPOBuffer(obs_dim, act_dim, local_steps_per_epoch, 0.99, 0.99)

            if len(rew_mov_avg_10) >= 25:
                rew_mov_avg_10.pop(0)
                cost_mov_avg_10.pop(0)

            rew_mov_avg_10.append(ep_ret)
            cost_mov_avg_10.append(ep_cost)

            mov_avg_ret = np.mean(rew_mov_avg_10)
            mov_avg_cost = np.mean(cost_mov_avg_10)

            expert_metrics = {log_prefix + 'episode return': ep_ret,
                              log_prefix + 'episode cost': ep_cost,
                              # 'cumulative return': cum_ret,
                              # 'cumulative cost': cum_cost,
                              log_prefix + '25ep mov avg return': mov_avg_ret,
                              log_prefix + '25ep mov avg cost': mov_avg_cost
                              }

            if benchmark:
                ep_rewards.append(ep_ret)
                ep_costs.append(ep_cost)

            if record:
                wandb.log(expert_metrics)
            logger.store(EpRet=ep_ret, EpLen=ep_len, EpCost=ep_cost)
            print('Episode %d \t EpRet %.3f \t EpLen %d \t EpCost %d' % (n, ep_ret, ep_len, ep_cost))
            o, r, d, ep_ret, ep_len, ep_cost = env.reset(), 0, False, 0, 0, 0
            n += 1


    logger.log_tabular('EpRet', with_min_and_max=True)
    logger.log_tabular('EpLen', average_only=True)
    logger.dump_tabular()

    if record:
        print("saving final buffer")
        bufname_pk = data_path + config_name + '_episodes/sim_data_' + str(int(num_episodes)) + '_buffer.pkl'
        with open(bufname_pk, 'wb') as file_pi:
            pickle.dump(rb.get_all_transitions(), file_pi)
        wandb.finish()

        return rb

    if benchmark:
        return ep_rewards, ep_costs
Example #10
class OnPolicyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self):
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0
        episode_return = 0
        episode_cost = 0
        episode_start_time = time.time()
        total_steps = np.array(0, dtype=np.int32)
        n_epoisode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            for _ in range(self._policy.horizon):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, logp, val = self._policy.get_action_and_val(obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act
                next_obs, reward, done, info = self._env.step(env_act)
                # print('[DEBUG]  COST:', info['cost'])
                try:
                    cost = info['cost']
                except (TypeError, KeyError):
                    cost = 0
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward
                episode_cost += cost

                done_flag = done
                if (hasattr(self._env, "_max_episode_steps")
                        and episode_steps == self._env._max_episode_steps):
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=logp,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_epoisode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 6.4f} Cost: {4: 5.4f} FPS: {5:5.2f}"
                        .format(n_epoisode, int(total_steps), episode_steps,
                                episode_return, episode_cost, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    self.total_cost += episode_cost
                    cost_rate = self.total_cost / total_steps

                    wandb.log(
                        {
                            'Training_Return': episode_return,
                            'Training_Cost': episode_cost,
                            'Cost_Rate': cost_rate,
                            'FPS': fps
                        },
                        step=n_epoisode)
                    episode_steps = 0
                    episode_return = 0
                    episode_cost = 0
                    episode_start_time = time.time()

                if total_steps % self._test_interval == 0:
                    avg_test_return, avg_test_cost = self.evaluate_policy(
                        total_steps)
                    self.logger.info(
                        "Evaluation Total Steps: {0: 7} Average Reward {1: 6.4f} Average Cost {2: 5.4f} over {3: 2} episodes"
                        .format(total_steps, avg_test_return, avg_test_cost,
                                self._test_episodes))
                    wandb.log(
                        {
                            'Evaluation_Return': avg_test_return,
                            'Evaluation_Cost': avg_test_cost
                        },
                        step=n_epoisode)
                    # wandb.log({'Evaluation_Step': total_steps})

                    tf.summary.scalar(name="Common/average_test_return",
                                      data=avg_test_return)
                    self.writer.flush()

                if total_steps % self._save_model_interval == 0:
                    self.checkpoint_manager.save()

            self.finish_horizon(last_val=val)

            tf.summary.experimental.set_step(total_steps)

            # Train actor critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer.get_all_transitions()
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
                # Update normalizer
                if self._normalize_obs:
                    self._obs_normalizer.experience(samples["obs"])
            with tf.summary.record_if(total_steps %
                                      self._save_summary_interval == 0):
                for _ in range(self._policy.n_epoch):
                    samples = self.replay_buffer._encode_sample(
                        np.random.permutation(self._policy.horizon))
                    if self._normalize_obs:
                        samples["obs"] = self._obs_normalizer(samples["obs"],
                                                              update=False)
                    if self._policy.normalize_adv:
                        adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                    else:
                        adv = samples["adv"]
                    for idx in range(
                            int(self._policy.horizon /
                                self._policy.batch_size)):
                        target = slice(idx * self._policy.batch_size,
                                       (idx + 1) * self._policy.batch_size)
                        self._policy.train(states=samples["obs"][target],
                                           actions=samples["act"][target],
                                           advantages=adv[target],
                                           logp_olds=samples["logp"][target],
                                           returns=samples["ret"][target])

        tf.summary.flush()

    def finish_horizon(self, last_val=0):
        self.local_buffer.on_episode_end()
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"],
                               act=samples["act"],
                               done=samples["done"],
                               ret=rets,
                               adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        avg_test_return = 0.
        avg_test_cost = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy,
                                              self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            episode_cost = 0.
            frames = []
            obs = self._test_env.reset()
            for _ in range(self._episode_max_steps):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, _ = self._policy.get_action(obs, test=True)
                act = (act if is_discrete(self._env.action_space) else
                       np.clip(act, self._env.action_space.low,
                               self._env.action_space.high))
                next_obs, reward, done, info = self._test_env.step(act)
                try:
                    cost = info['cost']
                except (TypeError, KeyError):
                    cost = 0
                if self._save_test_path:
                    replay_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                episode_cost += cost
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}_cost{2:010.4f}".format(
                total_steps, i, episode_return, episode_cost)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
            avg_test_cost += episode_cost
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image(
                'train/input_img',
                images,
            )
        return avg_test_return / self._test_episodes, avg_test_cost / self._test_episodes
Example #11
    def test_smaller_episode_than_stack_frame(self):
        """
        `on_episode_end()` caches stack size.

        When episode length is smaller than stack size,
        `on_episode_end()` must avoid caching from previous episode.

        Since cache does not wraparound, this bug does not happen
        at the first episode.

        Ref: https://gitlab.com/ymd_h/cpprb/-/issues/108
        Ref: https://gitlab.com/ymd_h/cpprb/-/issues/110
        """
        stack_size = 4
        episode_len1 = 5
        episode_len2 = 2
        rb = ReplayBuffer(32,
                          {"obs": {
                              "shape": (stack_size),
                              "dtype": np.int
                          }},
                          next_of="obs",
                          stack_compress="obs")

        obs = np.arange(episode_len1 + stack_size + 2, dtype=int)
        obs2 = np.arange(episode_len2 + stack_size + 2, dtype=int) + 100

        self.assertEqual(rb.get_current_episode_len(), 0)

        # Add 1st episode
        for i in range(episode_len1):
            rb.add(obs=obs[i:i + stack_size],
                   next_obs=obs[i + 1:i + 1 + stack_size])

        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len1)
        self.assertEqual(rb.get_current_episode_len(), episode_len1)
        for i in range(episode_len1):
            with self.subTest(i=i):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])

        # Reset environment
        rb.on_episode_end()
        self.assertEqual(rb.get_current_episode_len(), 0)
        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len1)
        for i in range(episode_len1):
            with self.subTest(i=i):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])

        # Add 2nd episode
        for i in range(episode_len2):
            rb.add(obs=obs2[i:i + stack_size],
                   next_obs=obs2[i + 1:i + 1 + stack_size])

        self.assertEqual(rb.get_current_episode_len(), episode_len2)
        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len1 + episode_len2)
        for i in range(episode_len1):
            with self.subTest(i=i, v="obs"):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            with self.subTest(i=i, v="next_obs"):
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])
        for i in range(episode_len2):
            with self.subTest(i=i + episode_len1, v="obs"):
                np.testing.assert_equal(s["obs"][i + episode_len1],
                                        obs2[i:i + stack_size])
            with self.subTest(i=i + episode_len1, v="next_obs"):
                np.testing.assert_equal(s["next_obs"][i + episode_len1],
                                        obs2[i + 1:i + 1 + stack_size])

        rb.on_episode_end()
        self.assertEqual(rb.get_current_episode_len(), 0)
        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len1 + episode_len2)
        for i in range(episode_len1):
            with self.subTest(i=i, v="obs"):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            with self.subTest(i=i, v="next_obs"):
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])
        for i in range(episode_len2):
            with self.subTest(i=i + episode_len1, v="obs"):
                np.testing.assert_equal(s["obs"][i + episode_len1],
                                        obs2[i:i + stack_size])
            with self.subTest(i=i + episode_len1, v="next_obs"):
                np.testing.assert_equal(s["next_obs"][i + episode_len1],
                                        obs2[i + 1:i + 1 + stack_size])
Example #12
    def test_Nstep_discounts_with_done(self):
        buffer_size = 32
        step = 4
        gamma = 0.5

        rb = ReplayBuffer(buffer_size, {"done": {}},
                          Nstep={
                              "size": step,
                              "gamma": gamma
                          })

        rb.add(done=0)
        rb.add(done=0)
        rb.add(done=0)
        rb.add(done=1)
        rb.on_episode_end()

        np.testing.assert_allclose(rb.get_all_transitions()["done"],
                                   np.asarray([[0], [1], [1], [1]]))

        rb.add(done=0)
        rb.add(done=0)
        rb.add(done=0)
        rb.add(done=0)
        np.testing.assert_allclose(rb.get_all_transitions()["done"],
                                   np.asarray([[0], [1], [1], [1], [0]]))

        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(
            rb.get_all_transitions()["done"],
            np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1]]))

        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(
            rb.get_all_transitions()["done"],
            np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1]]))

        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(
            rb.get_all_transitions()["done"],
            np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1],
                        [1]]))

        rb.add(done=0)
        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(
            rb.get_all_transitions()["done"],
            np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1], [1],
                        [1], [1]]))

        rb.add(done=0)
        rb.add(done=0)
        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(
            rb.get_all_transitions()["done"],
            np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1], [1],
                        [1], [1], [1], [1], [1]]))

        rb.clear()
        self.assertEqual(rb.get_stored_size(), 0)
        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(rb.get_all_transitions()["done"],
                                   np.asarray([[1]]))
Example #13
observation = env.reset()

# Warming up
for n_step in range(100):
    action = env.action_space.sample()  # Random Action
    next_observation, reward, done, info = env.step(action)
    rb.add(obs=observation,
           act=action,
           rew=reward,
           next_obs=next_observation,
           done=done)
    observation = next_observation
    if done:
        observation = env.reset()
        rb.on_episode_end()

n_episode = 0
observation = env.reset()
for n_step in range(N_iteration):

    if np.random.rand() < egreedy:
        action = env.action_space.sample()
    else:
        Q = tf.squeeze(model(observation.reshape(1, -1)))
        action = np.argmax(Q)

    egreedy = decay_egreedy(egreedy)

    next_observation, reward, done, info = env.step(action)
    rb.add(obs=observation,
Example #14
class OnPolicyTrainer(Trainer):
    """
    Trainer class for on-policy reinforcement learning

    Command Line Args:

        * ``--max-steps`` (int): The maximum steps for training. The default is ``int(1e6)``
        * ``--episode-max-steps`` (int): The maximum steps for an episode. The default is ``int(1e3)``
        * ``--n-experiments`` (int): Number of experiments. The default is ``1``
        * ``--show-progress``: Call ``render`` function during training
        * ``--save-model-interval`` (int): Interval to save model. The default is ``int(1e4)``
        * ``--save-summary-interval`` (int): Interval to save summary. The default is ``int(1e3)``
        * ``--model-dir`` (str): Directory to restore model.
        * ``--dir-suffix`` (str): Suffix for directory that stores results.
        * ``--normalize-obs``: Whether to normalize observations
        * ``--logdir`` (str): Output directory name. The default is ``"results"``
        * ``--evaluate``: Whether to evaluate the trained model
        * ``--test-interval`` (int): Interval to evaluate trained model. The default is ``int(1e4)``
        * ``--show-test-progress``: Call ``render`` function during evaluation.
        * ``--test-episodes`` (int): Number of episodes at test. The default is ``5``
        * ``--save-test-path``: Save trajectories of evaluation.
        * ``--show-test-images``: Show input images to neural networks when an episode finishes
        * ``--save-test-movie``: Save rendering results.
        * ``--use-prioritized-rb``: Use prioritized experience replay
        * ``--use-nstep-rb``: Use Nstep experience replay
        * ``--n-step`` (int): Number of steps for N-step experience replay. The default is ``4``
        * ``--logging-level`` (DEBUG, INFO, WARNING): Choose logging level. The default is ``INFO``
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize On-Policy Trainer

        Args:
            policy: Policy to be trained
            env (gym.Env): Environment for training
            args (Namespace or dict): Configuration parameters specified on the command line
            test_env (gym.Env): Environment for test.
        """
        super().__init__(*args, **kwargs)

    def __call__(self):
        """
        Execute training
        """
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0
        episode_return = 0
        episode_start_time = time.time()
        total_steps = np.array(0, dtype=np.int32)
        n_epoisode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            for _ in range(self._policy.horizon):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, logp, val = self._policy.get_action_and_val(obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act
                next_obs, reward, done, _ = self._env.step(env_act)
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward

                done_flag = done
                if (hasattr(self._env, "_max_episode_steps")
                        and episode_steps == self._env._max_episode_steps):
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=logp,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_epoisode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_epoisode, int(total_steps), episode_steps,
                                episode_return, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/training_episode_length",
                                      data=episode_steps)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()

                if total_steps % self._test_interval == 0:
                    avg_test_return, avg_test_steps = self.evaluate_policy(
                        total_steps)
                    self.logger.info(
                        "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                        .format(total_steps, avg_test_return,
                                self._test_episodes))
                    tf.summary.scalar(name="Common/average_test_return",
                                      data=avg_test_return)
                    tf.summary.scalar(
                        name="Common/average_test_episode_length",
                        data=avg_test_steps)
                    self.writer.flush()

                if total_steps % self._save_model_interval == 0:
                    self.checkpoint_manager.save()

            self.finish_horizon(last_val=val)

            tf.summary.experimental.set_step(total_steps)

            # Train actor critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer.get_all_transitions()
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
                # Update normalizer
                if self._normalize_obs:
                    self._obs_normalizer.experience(samples["obs"])
            with tf.summary.record_if(total_steps %
                                      self._save_summary_interval == 0):
                for _ in range(self._policy.n_epoch):
                    samples = self.replay_buffer._encode_sample(
                        np.random.permutation(self._policy.horizon))
                    if self._normalize_obs:
                        samples["obs"] = self._obs_normalizer(samples["obs"],
                                                              update=False)
                    if self._policy.normalize_adv:
                        adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                    else:
                        adv = samples["adv"]
                    for idx in range(
                            int(self._policy.horizon /
                                self._policy.batch_size)):
                        target = slice(idx * self._policy.batch_size,
                                       (idx + 1) * self._policy.batch_size)
                        self._policy.train(states=samples["obs"][target],
                                           actions=samples["act"][target],
                                           advantages=adv[target],
                                           logp_olds=samples["logp"][target],
                                           returns=samples["ret"][target])

        tf.summary.flush()

    def finish_horizon(self, last_val=0):
        """
        Finish horizon
        """
        self.local_buffer.on_episode_end()
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"],
                               act=samples["act"],
                               done=samples["done"],
                               ret=rets,
                               adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        """
        Evaluate policy

        Args:
            total_steps (int): Current total steps of training
        """
        avg_test_return = 0.
        avg_test_steps = 0
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy,
                                              self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            avg_test_steps += 1
            for _ in range(self._episode_max_steps):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, _ = self._policy.get_action(obs, test=True)
                act = (act if is_discrete(self._env.action_space) else np.clip(
                    act, self._env.action_space.low,
                    self._env.action_space.high))
                next_obs, reward, done, _ = self._test_env.step(act)
                avg_test_steps += 1
                if self._save_test_path:
                    replay_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image(
                'train/input_img',
                images,
            )
        return avg_test_return / self._test_episodes, avg_test_steps / self._test_episodes