Example #1
    def test_has_next_of(self):
        bsize = 10
        rb = ReplayBuffer(bsize, {"a": {}}, next_of="a")
        a = np.random.rand(bsize + 1)

        for i in range(bsize):
            rb.add(a=a[i], next_a=a[i + 1])

        _next_a = np.ravel(rb.get_all_transitions()["next_a"])
        np.testing.assert_allclose(_next_a, a[1:bsize + 1])

        for i in range(bsize):
            rb._encode_sample([i])

        rb.clear()

        for i in range(bsize):
            rb.add(a=a[i], next_a=a[i + 1])
            rb.on_episode_end()

        _next_a = np.ravel(rb.get_all_transitions()["next_a"])
        np.testing.assert_allclose(_next_a, a[1:bsize + 1])

        for i in range(bsize):
            rb._encode_sample([i])
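The test above exercises cpprb's ``next_of`` option, which lets the buffer reconstruct the "next" field from the following entry instead of storing it twice. Below is a minimal, standalone usage sketch (not part of the original test); the field names and shapes are illustrative.

import numpy as np
from cpprb import ReplayBuffer

buffer_size = 16
rb = ReplayBuffer(buffer_size,
                  {"obs": {"shape": (4,)}, "act": {}, "rew": {}, "done": {}},
                  next_of="obs")  # exposes a "next_obs" field backed by "obs"

obs = np.zeros(4)
for _ in range(buffer_size):
    next_obs = obs + 1.0
    rb.add(obs=obs, act=0, rew=1.0, next_obs=next_obs, done=0.0)
    obs = next_obs
rb.on_episode_end()

batch = rb.sample(8)  # dict with "obs", "act", "rew", "done" and "next_obs"
assert batch["next_obs"].shape == (8, 4)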
Example #2
    def test_stack_compress(self):
        bsize = 10
        odim = 2
        ssize = 2
        rb = ReplayBuffer(bsize, {"a": {
            "shape": (odim, ssize)
        }},
                          stack_compress="a")
        a = np.random.rand(odim, bsize + ssize - 1)

        for i in range(bsize):
            rb.add(a=a[:, i:i + ssize])

        _a = rb.get_all_transitions()["a"]
        for i in range(bsize):
            with self.subTest(i=i, label="without cache"):
                np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

        for i in range(bsize):
            rb._encode_sample([i])

        rb.clear()

        for i in range(bsize):
            rb.add(a=a[:, i:i + ssize])
            rb.on_episode_end()

        _a = rb.get_all_transitions()["a"]
        for i in range(bsize):
            with self.subTest(i=i, label="without cache"):
                np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

        for i in range(bsize):
            rb._encode_sample([i])
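``stack_compress`` targets frame-stacked observations: consecutive stacks share all but one frame along the last axis, and the buffer stores the overlap only once. A standalone sketch of the typical Atari-style usage follows (the shapes are illustrative assumptions, not from the test).

import numpy as np
from cpprb import ReplayBuffer

frame_shape = (84, 84)   # assumed grayscale frame size
n_stack = 4              # frames stacked along the last axis
rb = ReplayBuffer(1000,
                  {"obs": {"shape": (*frame_shape, n_stack)}},
                  stack_compress="obs")

frames = np.random.rand(*frame_shape, n_stack + 9)  # 10 overlapping stacks
for i in range(10):
    rb.add(obs=frames[..., i:i + n_stack])
rb.on_episode_end()

batch = rb.sample(4)
assert batch["obs"].shape == (4, *frame_shape, n_stack)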
Example #3
    def test_buffer(self):

        buffer_size = 256
        obs_shape = (15,15)
        act_dim = 5

        N = 512

        erb = ReplayBuffer(buffer_size,{"obs":{"shape": obs_shape},
                                        "act":{"shape": act_dim},
                                        "rew":{},
                                        "next_obs":{"shape": obs_shape},
                                        "done":{}})

        for i in range(N):
            obs = np.full(obs_shape,i,dtype=np.double)
            act = np.full(act_dim,i,dtype=np.double)
            rew = i
            next_obs = obs + 1
            done = 0

            erb.add(obs=obs,act=act,rew=rew,next_obs=next_obs,done=done)

        es = erb._encode_sample(range(buffer_size))

        erb.sample(32)

        erb.clear()

        self.assertEqual(erb.get_next_index(),0)
        self.assertEqual(erb.get_stored_size(),0)
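For reference, ``sample`` (like ``_encode_sample``) returns a dict keyed by the names in ``env_dict``, with each value batched along the first axis; scalar fields such as ``rew`` and ``done`` get a trailing axis of length 1. A small standalone sketch mirroring the buffer layout above:

import numpy as np
from cpprb import ReplayBuffer

rb = ReplayBuffer(256, {"obs": {"shape": (15, 15)},
                        "act": {"shape": 5},
                        "rew": {},
                        "next_obs": {"shape": (15, 15)},
                        "done": {}})
for i in range(256):
    obs = np.full((15, 15), i, dtype=np.double)
    rb.add(obs=obs, act=np.full(5, i), rew=float(i), next_obs=obs + 1, done=0)

batch = rb.sample(32)
assert batch["obs"].shape == (32, 15, 15)
assert batch["rew"].shape == (32, 1)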
Example #4
def explorer(global_rb,env_dict,is_training_done,queue):
    local_buffer_size = int(1e+2)
    local_rb = ReplayBuffer(local_buffer_size,env_dict)

    model = MyModel()
    env = gym.make("CartPole-v1")

    obs = env.reset()
    while not is_training_done.is_set():
        if not queue.empty():
            w = queue.get()
            model.weights = w

        action = model.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        local_rb.add(obs=obs,act=action,rew=reward,next_obs=next_obs,done=done)

        if done:
            local_rb.on_episode_end()
            obs = env.reset()
        else:
            obs = next_obs

        if local_rb.get_stored_size() == local_buffer_size:
            local_sample = local_rb.get_all_transitions()
            local_rb.clear()

            absTD = model.abs_TD_error(local_sample)
            global_rb.add(**local_sample,priorities=absTD)
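This explorer only fills the shared buffer; the consuming side is not shown. As a rough sketch of the matching learner loop (the function and the ``model.train`` call are hypothetical, only the ``PrioritizedReplayBuffer`` calls are the actual cpprb interface): sample with an importance-sampling ``beta``, train on the batch, then feed the new absolute TD errors back via ``update_priorities``.

import numpy as np
from cpprb import PrioritizedReplayBuffer

def learner(global_rb: PrioritizedReplayBuffer, model, is_training_done,
            batch_size=32, beta=0.4):
    while not is_training_done.is_set():
        if global_rb.get_stored_size() < batch_size:
            continue
        sample = global_rb.sample(batch_size, beta)   # also returns "weights" and "indexes"
        abs_td = model.train(sample)                  # hypothetical: returns new |TD errors|
        global_rb.update_priorities(sample["indexes"], abs_td + 1e-6)

In the multi-process variants shown later, accesses to the shared buffer are additionally guarded with a lock.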
Example #5
class MeTrpoTrainer(MPCTrainer):
    def __init__(self, *args, n_eval_episodes_per_model=5, **kwargs):
        kwargs["n_dynamics_model"] = 5
        super().__init__(*args, **kwargs)
        self._n_eval_episodes_per_model = n_eval_episodes_per_model

        # Replay buffer to train policy
        self.replay_buffer = get_replay_buffer(self._policy, self._env)

        # Replay buffer to compute GAE
        rb_dict = {
            "size": self._episode_max_steps,
            "default_dtype": np.float32,
            "env_dict": {
                "obs": {"shape": self._env.observation_space.shape},
                "act": {"shape": self._env.action_space.shape},
                "next_obs": {"shape": self._env.observation_space.shape},
                "rew": {},
                "done": {},
                "logp": {},
                "val": {}}}
        self.local_buffer = ReplayBuffer(**rb_dict)

    def predict_next_state(self, obses, acts, idx=None):
        is_single_input = obses.ndim == acts.ndim and acts.ndim == 1
        if is_single_input:
            obses = np.expand_dims(obses, axis=0)
            acts = np.expand_dims(acts, axis=0)

        inputs = np.concatenate([obses, acts], axis=1)
        idx = np.random.randint(self._n_dynamics_model) if idx is None else idx
        obs_diffs = self._dynamics_models[idx].predict(inputs)

        if is_single_input:
            return obses[0] + obs_diffs

        return obses + obs_diffs

    def _make_inputs_output_pairs(self, n_epoch):
        samples = self.dynamics_buffer.sample(self.dynamics_buffer.get_stored_size())
        inputs = np.concatenate([samples["obs"], samples["act"]], axis=1)
        labels = samples["next_obs"] - samples["obs"]

        return inputs, labels

    def __call__(self):
        total_steps = 0
        tf.summary.experimental.set_step(total_steps)

        while True:
            # Collect (s, a, s') pairs in a real environment
            self.collect_transitions_real_env()
            total_steps += self._n_collect_steps
            tf.summary.experimental.set_step(total_steps)

            # Train dynamics models
            self.fit_dynamics(n_epoch=1)
            if self._debug:
                ret_real_env, ret_sim_env = self._evaluate_model()
                self.logger.info("Returns (real, sim) = ({: .3f}, {: .3f})".format(ret_real_env, ret_sim_env))

            # Prepare initial states for evaluation
            init_states_for_eval = np.array([
                self._env.reset() for _ in range(self._n_dynamics_model * self._n_eval_episodes_per_model)])

            # Returns to evaluate policy improvement
            returns_before_update = self._evaluate_current_return(init_states_for_eval)

            n_updates = 0
            improve_ratios = []
            while True:
                n_updates += 1

                # Generate samples using dynamics models (simulated env)
                average_return = self.collect_transitions_sim_env()

                # Update policy
                self.update_policy()

                # Evaluate policy improvement
                returns_after_update = self._evaluate_current_return(init_states_for_eval)
                n_improved = np.sum(returns_after_update > returns_before_update)
                improved_ratio = n_improved / (self._n_dynamics_model * self._n_eval_episodes_per_model)
                improve_ratios.append(improved_ratio)
                if improved_ratio < 0.7:
                    break
                returns_before_update = returns_after_update

            self.logger.info(
                "Training total steps: {0: 7} sim return: {1: .4f} n_update: {2:}, ratios: {3:}".format(
                    total_steps, average_return, n_updates, improve_ratios))
            tf.summary.scalar(name="mpc/n_updates", data=n_updates)

            # Evaluate policy in a real environment
            if total_steps // self._n_collect_steps % 10 == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info("Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes".format(
                    total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(
                    name="Common/average_test_return", data=avg_test_return)

    def _evaluate_model(self):
        ret_real_env, ret_sim_env = 0., 0.
        n_episodes = 10
        for _ in range(n_episodes):
            real_obs = self._env.reset()
            sim_obs = real_obs.copy()
            for _ in range(self._episode_max_steps):
                act, _ = self._policy.get_action(real_obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low, self._env.action_space.high)
                else:
                    env_act = act

                next_real_obs, rew, _, _ = self._env.step(env_act)
                ret_real_env += rew
                real_obs = next_real_obs

                next_sim_obs = self.predict_next_state(sim_obs, env_act)
                ret_sim_env += self._reward_fn(real_obs, act)[0]
                sim_obs = next_sim_obs

        ret_real_env /= n_episodes
        ret_sim_env /= n_episodes
        return ret_real_env, ret_sim_env

    def update_policy(self):
        # Compute mean and std for normalizing advantage
        if self._policy.normalize_adv:
            samples = self.replay_buffer.get_all_transitions()
            mean_adv = np.mean(samples["adv"])
            std_adv = np.std(samples["adv"])

        for _ in range(self._policy.n_epoch):
            samples = self.replay_buffer._encode_sample(np.random.permutation(self._policy.horizon))
            adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8) if self._policy.normalize_adv else samples["adv"]
            for idx in range(int(self._policy.horizon / self._policy.batch_size)):
                target = slice(idx * self._policy.batch_size,
                               (idx + 1) * self._policy.batch_size)
                self._policy.train(
                    states=samples["obs"][target],
                    actions=samples["act"][target],
                    advantages=adv[target],
                    logp_olds=samples["logp"][target],
                    returns=samples["ret"][target])

    def _evaluate_current_return(self, init_states):
        n_episodes = self._n_dynamics_model * self._n_eval_episodes_per_model
        assert init_states.shape[0] == n_episodes

        obses = init_states.copy()
        next_obses = np.zeros_like(obses)
        returns = np.zeros(shape=(n_episodes,), dtype=np.float32)

        for _ in range(self._episode_max_steps):
            acts, _ = self._policy.get_action(obses)
            for i in range(n_episodes):
                model_idx = i // self._n_eval_episodes_per_model
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(acts[i], self._env.action_space.low, self._env.action_space.high)
                else:
                    env_act = acts[i]
                next_obses[i] = self.predict_next_state(obses[i], env_act, idx=model_idx)
            returns += self._reward_fn(obses, acts)
            obses = next_obses

        return returns

    def _visualize_current_performance(self):
        obs = self._env.reset()
        for _ in range(self._episode_max_steps):
            act, _ = self._policy.get_action(obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low, self._env.action_space.high)
            else:
                env_act = act
            next_obs = self.predict_next_state(obs, env_act)

            self._env.state = np.array([np.arctan2(next_obs[1], next_obs[0]), next_obs[2]], dtype=np.float32)
            # print(obs, act, next_obs, self._env.state)
            self._env.render()
            obs = next_obs

    def collect_transitions_real_env(self):
        total_steps = 0
        episode_steps = 0
        obs = self._env.reset()
        while total_steps < self._n_collect_steps:
            episode_steps += 1
            total_steps += 1
            act, _ = self._policy.get_action(obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low, self._env.action_space.high)
            else:
                env_act = act
            next_obs, _, done, _ = self._env.step(env_act)
            self.dynamics_buffer.add(
                obs=obs, act=env_act, next_obs=next_obs)
            obs = next_obs
            if done or episode_steps == self._episode_max_steps:
                episode_steps = 0
                obs = self._env.reset()

    def collect_transitions_sim_env(self):
        """
        Generate transitions using dynamics model
        """
        self.replay_buffer.clear()
        n_episodes = 0
        ave_episode_return = 0
        while self.replay_buffer.get_stored_size() < self._policy.horizon:
            obs = self._env.reset()
            episode_return = 0.
            for _ in range(self._episode_max_steps):
                act, logp, val = self._policy.get_action_and_val(obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low, self._env.action_space.high)
                else:
                    env_act = act
                if self._debug:
                    next_obs, rew, _, _ = self._env.step(env_act)
                else:
                    next_obs = self.predict_next_state(obs, env_act)
                    rew = self._reward_fn(obs, act)[0]
                self.local_buffer.add(obs=obs, act=act, next_obs=next_obs, rew=rew,
                                      done=False, logp=logp, val=val)
                obs = next_obs
                episode_return += rew
            self.finish_horizon(last_val=val)
            ave_episode_return += episode_return
            n_episodes += 1
        return ave_episode_return / n_episodes

    def finish_horizon(self, last_val=0):
        """
        TODO: This code is completely identical to the one defined in on_policy_trainer.py. Reuse that implementation.
        """
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(
                deltas, self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(
            obs=samples["obs"], act=samples["act"], done=samples["done"],
            ret=rets, adv=advs, logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        avg_test_return = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(
                self._policy, self._test_env, size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            for _ in range(self._episode_max_steps):
                act, _ = self._policy.get_action(obs, test=True)
                act = (act if not hasattr(self._env.action_space, "high") else
                       np.clip(act, self._env.action_space.low, self._env.action_space.high))
                next_obs, reward, done, _ = self._test_env.step(act)
                if self._save_test_path:
                    replay_buffer.add(
                        obs=obs, act=act, next_obs=next_obs,
                        rew=reward, done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image('train/input_img', images, )
        return avg_test_return / self._test_episodes

    def _set_from_args(self, args):
        super()._set_from_args(args)
        self._n_collect_steps = args.n_collect_steps
        self._debug = args.debug

    @staticmethod
    def get_argument(parser=None):
        parser = MPCTrainer.get_argument(parser)
        parser.add_argument("--n-collect-steps", type=int, default=100)
        parser.add_argument("--debug", action='store_true')
        return parser
Example #6
class OnPolicyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self._test_interval % self._policy.horizon == 0, \
            "Test interval should be divisible by policy horizon"

    def __call__(self):
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0
        episode_return = 0
        episode_start_time = time.time()
        total_steps = np.array(0, dtype=np.int32)
        n_episode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            for _ in range(self._policy.horizon):
                act, logp, val = self._policy.get_action_and_val(obs)
                next_obs, reward, done, _ = self._env.step(act)
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward

                done_flag = done
                if hasattr(self._env, "_max_episode_steps") and \
                        episode_steps == self._env._max_episode_steps:
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=logp,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()
            self.finish_horizon(last_val=val)

            tf.summary.experimental.set_step(total_steps)

            # Train actor critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer._encode_sample(
                    np.arange(self._policy.horizon))
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
            with tf.summary.record_if(total_steps %
                                      self._save_summary_interval == 0):
                for _ in range(self._policy.n_epoch):
                    samples = self.replay_buffer._encode_sample(
                        np.random.permutation(self._policy.horizon))
                    if self._policy.normalize_adv:
                        adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                    else:
                        adv = samples["adv"]
                    for idx in range(
                            int(self._policy.horizon /
                                self._policy.batch_size)):
                        target = slice(idx * self._policy.batch_size,
                                       (idx + 1) * self._policy.batch_size)
                        self._policy.train(states=samples["obs"][target],
                                           actions=samples["act"][target],
                                           advantages=adv[target],
                                           logp_olds=samples["logp"][target],
                                           returns=samples["ret"][target])

            if total_steps % self._test_interval == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                self.writer.flush()

            if total_steps % self._save_model_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()

    def finish_horizon(self, last_val=0):
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"],
                               act=samples["act"],
                               done=samples["done"],
                               ret=rets,
                               adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        if self._normalize_obs:
            self._test_env.normalizer.set_params(
                *self._env.normalizer.get_params())
        avg_test_return = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy,
                                              self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            for _ in range(self._episode_max_steps):
                act, _ = self._policy.get_action(obs, test=True)
                act = act if not hasattr(self._env.action_space, "high") else \
                    np.clip(act, self._env.action_space.low, self._env.action_space.high)
                next_obs, reward, done, _ = self._test_env.step(act)
                if self._save_test_path:
                    replay_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image(
                'train/input_img',
                images,
            )
        return avg_test_return / self._test_episodes
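``finish_horizon`` relies on a ``discount_cumsum`` helper to turn the TD residuals into GAE advantages and the rewards into value targets. tf2rl ships its own implementation; the following is a typical equivalent sketch (the ``scipy`` filter trick is an assumption, not necessarily how tf2rl implements it).

from scipy.signal import lfilter

def discount_cumsum(x, discount):
    """y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ..."""
    return lfilter([1.0], [1.0, -float(discount)], x[::-1], axis=0)[::-1]

# With deltas[t] = rew[t] + gamma * val[t+1] - val[t], the GAE advantages are
# discount_cumsum(deltas, gamma * lam) and the value-function targets are
# discount_cumsum(rews, gamma)[:-1], exactly as used in finish_horizon above.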
Example #7
def explorer(global_rb,
             queue,
             trained_steps,
             is_training_done,
             lock,
             env_fn,
             policy_fn,
             set_weights_fn,
             noise_level,
             n_env=64,
             n_thread=4,
             buffer_size=1024,
             episode_max_steps=1000,
             gpu=0):
    """
    Collect transitions and store them in the prioritized replay buffer.

    :param global_rb (multiprocessing.managers.AutoProxy[PrioritizedReplayBuffer]):
        Prioritized replay buffer shared among multiple explorers and a single learner.
        Because this object is shared across processes, operations on it must be
        guarded with the `lock` object.
    :param queue (multiprocessing.Queue):
        A FIFO shared with the `learner` and `evaluator` to receive the latest network weights.
        The queue is process-safe, so no locking is required when using it.
    :param trained_steps (multiprocessing.Value):
        Number of gradient steps applied so far.
    :param is_training_done (multiprocessing.Event):
        multiprocessing.Event object sharing the status of training.
    :param lock (multiprocessing.Lock):
        multiprocessing.Lock used to synchronize access with other processes.
    :param env_fn (function):
        Function that creates an environment.
    :param policy_fn (function):
        Function that creates an explorer policy.
    :param set_weights_fn (function):
        Function that sets network weights obtained from the queue.
    :param noise_level (float):
        Noise level for exploration. For epsilon-greedy policies such as DQN variants
        this is epsilon; for DDPG variants it is the variance of the Gaussian exploration noise.
    :param n_env (int):
        Number of environments to run in parallel. If greater than 1,
        `MultiThreadEnv` is used.
    :param n_thread (int):
        Number of threads used in `MultiThreadEnv`.
    :param buffer_size (int):
        Size of the local buffer. Once it is filled with transitions, they are added to `global_rb`.
    :param episode_max_steps (int):
        Maximum number of steps per episode.
    :param gpu (int):
        GPU id. If set to -1, this process uses only the CPU.
    """
    import_tf()
    logger = logging.getLogger("tf2rl")

    if n_env > 1:
        envs = MultiThreadEnv(env_fn=env_fn,
                              batch_size=n_env,
                              thread_pool=n_thread,
                              max_episode_steps=episode_max_steps)
        env = envs._sample_env
    else:
        env = env_fn()

    policy = policy_fn(env=env,
                       name="Explorer",
                       memory_capacity=global_rb.get_buffer_size(),
                       noise_level=noise_level,
                       gpu=gpu)

    kwargs = get_default_rb_dict(buffer_size, env)
    if n_env > 1:
        kwargs["env_dict"]["priorities"] = {}
    local_rb = ReplayBuffer(**kwargs)
    local_idx = np.arange(buffer_size).astype(np.int64)  # np.int alias was removed in NumPy >= 1.24

    if n_env == 1:
        s = env.reset()
        episode_steps = 0
        total_reward = 0.
        total_rewards = []

    start = time.time()
    n_sample, n_sample_old = 0, 0

    while not is_training_done.is_set():
        if n_env == 1:
            n_sample += 1
            episode_steps += 1
            a = policy.get_action(s)
            s_, r, done, _ = env.step(a)
            done_flag = done
            if episode_steps == env._max_episode_steps:
                done_flag = False
            total_reward += r
            local_rb.add(obs=s, act=a, rew=r, next_obs=s_, done=done_flag)

            s = s_
            if done or episode_steps == episode_max_steps:
                s = env.reset()
                total_rewards.append(total_reward)
                total_reward = 0
                episode_steps = 0
        else:
            n_sample += n_env
            obses = envs.py_observation()
            actions = policy.get_action(obses, tensor=True)
            next_obses, rewards, dones, _ = envs.step(actions)
            td_errors = policy.compute_td_error(states=obses,
                                                actions=actions,
                                                next_states=next_obses,
                                                rewards=rewards,
                                                dones=dones)
            local_rb.add(obs=obses,
                         act=actions,
                         next_obs=next_obses,
                         rew=rewards,
                         done=dones,
                         priorities=np.abs(td_errors + 1e-6))

        # Periodically copy weights of explorer
        if not queue.empty():
            set_weights_fn(policy, queue.get())

        # Add collected experiences to global replay buffer
        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb._encode_sample(local_idx)
            if n_env > 1:
                priorities = np.squeeze(samples["priorities"])
            else:
                td_errors = policy.compute_td_error(
                    states=samples["obs"],
                    actions=samples["act"],
                    next_states=samples["next_obs"],
                    rewards=samples["rew"],
                    dones=samples["done"])
                priorities = np.abs(np.squeeze(td_errors)) + 1e-6
            lock.acquire()
            global_rb.add(obs=samples["obs"],
                          act=samples["act"],
                          rew=samples["rew"],
                          next_obs=samples["next_obs"],
                          done=samples["done"],
                          priorities=priorities)
            lock.release()
            local_rb.clear()

            msg = "Grad: {0: 6d}\t".format(trained_steps.value)
            msg += "Samples: {0: 7d}\t".format(n_sample)
            msg += "TDErr: {0:.5f}\t".format(np.average(priorities))
            if n_env == 1:
                ave_rew = (0 if len(total_rewards) == 0 else
                           sum(total_rewards) / len(total_rewards))
                msg += "AveEpiRew: {0:.3f}\t".format(ave_rew)
                total_rewards = []
            msg += "FPS: {0:.2f}".format(
                (n_sample - n_sample_old) / (time.time() - start))
            logger.info(msg)

            start = time.time()
            n_sample_old = n_sample
Example #8
def explorer(global_rb,
             queue,
             trained_steps,
             is_training_done,
             lock,
             buffer_size=1024,
             episode_max_steps=1000,
             epsilon=0.5,
             transitions=None):
    tf = import_tf()
    env = _env()
    stacked_frames = deque(maxlen=4)
    policy = Agent()
    policy.epsilon = epsilon
    env_dict = {
        "obs": {
            "shape": state_size
        },
        "act": {},
        "rew": {},
        "next_obs": {
            "shape": state_size
        },
        "done": {}
    }
    local_rb = ReplayBuffer(buffer_size,
                            env_dict=env_dict,
                            default_dtype=np.float16)
    local_idx = np.arange(buffer_size).astype(np.int64)  # np.int alias was removed in NumPy >= 1.24

    s = env.reset()
    s = stack_frames(stacked_frames, s, True)
    episode_steps = 0
    total_reward = 0.
    total_rewards = []

    start = time.time()
    n_sample, n_sample_old = 0, 0

    while not is_training_done.is_set():
        transitions.value += 1
        n_sample += 1
        episode_steps += 1
        a = policy.acting(s)
        s_, r, done, _ = env.step(a)
        done_flag = done
        if episode_steps == episode_max_steps:
            done_flag = False
        total_reward += r
        s_ = stack_frames(stacked_frames, s_, False)
        policy.n_step_buffer.append((s, a, r, s_, done_flag))
        if len(policy.n_step_buffer) == policy.n_step:
            reward, next_state, done = policy.get_n_step_info(
                policy.n_step_buffer, policy.gamma)
            state, action = policy.n_step_buffer[0][:2]
            local_rb.add(obs=state,
                         act=action,
                         rew=reward,
                         next_obs=next_state,
                         done=done)

        s = s_
        if done or episode_steps == episode_max_steps:
            s = env.reset()
            s = stack_frames(stacked_frames, s, True)
            total_rewards.append(total_reward)
            total_reward = 0
            episode_steps = 0

        if not queue.empty():
            set_weights_fn(policy, queue.get())

        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb._encode_sample(local_idx)

            samples1 = {key: value[:50] for key, value in samples.items()}
            samples2 = {key: value[50:100] for key, value in samples.items()}
            samples3 = {key: value[100:150] for key, value in samples.items()}
            samples4 = {key: value[150:200] for key, value in samples.items()}

            for samples in [samples1, samples2, samples3, samples4]:
                td_errors = policy.compute_td_error(samples["obs"],
                                                    samples["act"],
                                                    samples["rew"],
                                                    samples["next_obs"],
                                                    samples["done"])
                priorities = td_errors.numpy() + 1e-6
                samples['priority'] = priorities

            samples = {
                key: np.concatenate(
                    (value, samples2[key], samples3[key], samples4[key]))
                for key, value in samples1.items()
            }

            global_rb.add(obs=samples["obs"],
                          act=samples["act"],
                          rew=samples["rew"],
                          next_obs=samples["next_obs"],
                          done=samples["done"],
                          priorities=samples['priority'])
            local_rb.clear()

            ave_rew = (0 if len(total_rewards) == 0 else sum(total_rewards) /
                       len(total_rewards))

            total_rewards = []
            start = time.time()
            n_sample_old = n_sample
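The n-step bookkeeping above depends on ``policy.get_n_step_info``, which collapses the ``n_step_buffer`` deque into a single n-step transition. A common implementation looks like the sketch below (the actual ``Agent`` method may differ); each stored tuple is ``(s, a, r, s_, done)`` as appended above.

def get_n_step_info(n_step_buffer, gamma):
    """Collapse a deque of (s, a, r, s_, done) tuples into an n-step target."""
    rew, next_obs, done = n_step_buffer[-1][-3:]
    # Walk backwards through the earlier transitions, discounting the tail
    # and cutting it off at the first terminal state encountered.
    for transition in reversed(list(n_step_buffer)[:-1]):
        r, n_o, d = transition[-3:]
        rew = r + gamma * rew * (1 - d)
        next_obs, done = (n_o, d) if d else (next_obs, done)
    return rew, next_obs, done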
Example #9
class HindsightReplayBuffer:
    """
    Replay Buffer class for Hindsight Experience Replay

    Ref: https://arxiv.org/abs/1707.01495
    """
    def __init__(self,
                 size: int,
                 env_dict: Dict,
                 max_episode_len: int,
                 reward_func: Callable,
                 *,
                 goal_func: Optional[Callable] = None,
                 goal_shape: Optional[Iterable[int]] = None,
                 state: str = "obs",
                 action: str = "act",
                 next_state: str = "next_obs",
                 strategy: str = "future",
                 additional_goals: int = 4,
                 prioritized=True,
                 **kwargs):
        """
        Initialize HindsightReplayBuffer

        Parameters
        ----------
        size : int
            Buffer Size
        env_dict : dict of dict
            Dictionary specifying environments. The keys of env_dict become
            environment names. The values of env_dict, which are also dicts,
            define "shape" (default 1) and "dtype" (falling back to `default_dtype`).
        max_episode_len : int
            Maximum episode length.
        reward_func : Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray]
            Batch calculation of reward function SxAxG -> R.
        goal_func : Callable[[np.ndarray], np.ndarray], optional
            Batch extraction function for goal from state: S->G.
            If ``None`` (default), identity function is used (goal = state).
        goal_shape : Iterable[int], optional
            Shape of goal. If ``None`` (default), state shape is used.
        state : str, optional
            State name in ``env_dict``. The default is "obs".
        action : str, optional
            Action name in ``env_dict``. The default is "act".
        next_state : str, optional
            Next state name in ``env_dict``. The default is "next_obs".
        strategy : ["future", "episode", "random", "final"], optional
            Goal sampling strategy.
            "future" selects one of the future states in the same episode.
            "episode" selects states in the same episode.
            "random" selects from the all states in replay buffer.
            "final" selects the final state in the episode. For "final",
            ``additonal_goals`` is ignored.
            The default is "future"
        additional_goals : int, optional
            Number of additional goals. The default is ``4``.
        prioritized : bool, optional
            Whether to use Prioritized Experience Replay. The default is ``True``.
        """
        self.max_episode_len = max_episode_len
        self.reward_func = reward_func
        self.goal_func = goal_func or (lambda s: s)

        self.state = state
        self.action = action
        self.next_state = next_state

        self.strategy = strategy
        known_strategy = ["future", "episode", "random", "final"]
        if self.strategy not in known_strategy:
            raise ValueError(f"Unknown Strategy: {strategy}. " +
                             f"Known Strategies: {known_strategy}")

        self.additional_goals = additional_goals
        if self.strategy == "final":
            self.additional_goals = 1

        self.prioritized = prioritized

        if goal_shape:
            goal_dict = {**env_dict[state], "shape": goal_shape}
            self.goal_shape = np.array(goal_shape, ndmin=1)
        else:
            goal_dict = env_dict[state]
            self.goal_shape = np.array(env_dict[state].get("shape", 1),
                                       ndmin=1)
        RB = PrioritizedReplayBuffer if self.prioritized else ReplayBuffer
        self.rb = RB(size, {
            **env_dict, "rew": {},
            "goal": goal_dict
        }, **kwargs)

        self.episode_rb = ReplayBuffer(self.max_episode_len, env_dict)

        self.rng = np.random.default_rng()

    def add(self, **kwargs):
        r"""Add transition(s) into replay buffer.

        Multiple sets of transitions can be added simultaneously.

        Parameters
        ----------
        **kwargs : array like or float or int
            Transitions to be stored.
        """
        if self.episode_rb.get_stored_size() >= self.max_episode_len:
            raise ValueError("Exceed Max Episode Length")
        self.episode_rb.add(**kwargs)

    def sample(self, batch_size: int, **kwargs):
        r"""Sample the stored transitions randomly with speciped size

        Parameters
        ----------
        batch_size : int
            sampled batch size

        Returns
        -------
        sample : dict of ndarray
            Batch of ``batch_size`` sampled transitions, which might contain
            the same transition multiple times.
        """
        return self.rb.sample(batch_size, **kwargs)

    def on_episode_end(self, goal):
        """
        Terminate the current episode and set hindsight goal

        Parameters
        ----------
        goal : array-like
            Original goal state of this episode.
        """
        episode_len = self.episode_rb.get_stored_size()
        if episode_len == 0:
            return None

        trajectory = self.episode_rb.get_all_transitions()
        add_shape = (trajectory[self.state].shape[0], *self.goal_shape)

        goal = np.broadcast_to(np.asarray(goal), add_shape)
        rew = self.reward_func(trajectory[self.next_state],
                               trajectory[self.action], goal)

        self.rb.add(**trajectory, goal=goal, rew=rew)

        if self.strategy == "future":
            idx = np.zeros((self.additional_goals, episode_len),
                           dtype=np.int64)
            for i in range(episode_len):
                idx[:, i] = self.rng.integers(low=i,
                                              high=episode_len,
                                              size=self.additional_goals)
            for i in range(self.additional_goals):
                goal = self.goal_func(trajectory[self.next_state][idx[i]])
                rew = self.reward_func(trajectory[self.next_state],
                                       trajectory[self.action], goal)
                self.rb.add(**trajectory, rew=rew, goal=goal)
        elif self.strategy == "episode":
            idx = self.rng.integers(low=0,
                                    high=episode_len,
                                    size=(self.additional_goals, episode_len))
            for _i in idx:
                goal = self.goal_func(trajectory[self.next_state][_i])
                rew = self.reward_func(trajectory[self.next_state],
                                       trajectory[self.action], goal)
                self.rb.add(**trajectory, rew=rew, goal=goal)
        elif self.strategy == "final":
            goal = self.goal_func(
                np.broadcast_to(trajectory[self.next_state][-1],
                                trajectory[self.next_state].shape))
            rew = self.reward_func(trajectory[self.next_state],
                                   trajectory[self.action], goal)
            self.rb.add(**trajectory, rew=rew, goal=goal)
        else:  # random
            # Note 1:
            #   Goal selection should not be prioritized,
            #   so we create the indices manually.
            # Note 2:
            #   Since we cannot access internal data directly,
            #   we have to extract a set of transitions.
            #   Although this has overhead, it is fine
            #   because the "random" strategy is used only for
            #   strategy comparison.
            idx = self.rng.integers(low=0,
                                    high=self.rb.get_stored_size(),
                                    size=self.additional_goals * episode_len)
            goal = self.goal_func(self.rb._encode_sample(idx)[self.next_state])
            goal = goal.reshape(
                (self.additional_goals, episode_len, *(goal.shape[1:])))
            for g in goal:
                rew = self.reward_func(trajectory[self.next_state],
                                       trajectory[self.action], g)
                self.rb.add(**trajectory, rew=rew, goal=g)

        self.episode_rb.clear()
        self.rb.on_episode_end()

    def clear(self):
        """
        Clear replay buffer
        """
        self.rb.clear()
        self.episode_rb.clear()

    def get_stored_size(self):
        """
        Get stored size

        Returns
        -------
        int
            stored size
        """
        return self.rb.get_stored_size()

    def get_buffer_size(self):
        """
        Get buffer size

        Returns
        -------
        int
            buffer size
        """
        return self.rb.get_buffer_size()

    def get_all_transitions(self, shuffle: bool = False):
        r"""
        Get all transitions stored in replay buffer.

        Parameters
        ----------
        shuffle : bool, optional
            When True, transitions are shuffled. The default value is False.

        Returns
        -------
        transitions : dict of numpy.ndarray
            All transitions stored in this replay buffer.
        """
        return self.rb.get_all_transitions(shuffle)

    def update_priorities(self, indexes, priorities):
        """
        Update priorities

        Parameters
        ----------
        indexes : array_like
            indexes to update priorities
        priorities : array_like
            priorities to update

        Raises
        ------
        TypeError: When ``indexes`` or ``priorities`` are ``None``
        ValueError: When this buffer is constructed with ``prioritized=False``
        """
        if not self.prioritized:
            raise ValueError("Buffer is constructed without PER")

        self.rb.update_priorities(indexes, priorities)

    def get_max_priority(self):
        """
        Get max priority

        Returns
        -------
        float
            Max priority of stored priorities

        Raises
        ------
        ValueError: When this buffer is constructed with ``prioritized=False``
        """
        if not self.prioritized:
            raise ValueError("Buffer is constructed without PER")

        return self.rb.get_max_priority()
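A short usage sketch of the class above (the environment shapes and the sparse reward function are illustrative assumptions; a ``HindsightReplayBuffer`` with this interface is also shipped by cpprb itself):

import numpy as np

def reward_func(next_obs, act, goal):
    # Batched sparse reward: 0 when the state is close to the goal, otherwise -1.
    return -(np.linalg.norm(next_obs - goal, axis=-1) > 0.05).astype(np.float32)

hrb = HindsightReplayBuffer(size=int(1e5),
                            env_dict={"obs": {"shape": (3,)},
                                      "act": {"shape": (1,)},
                                      "next_obs": {"shape": (3,)}},
                            max_episode_len=50,
                            reward_func=reward_func,
                            strategy="future",
                            additional_goals=4,
                            prioritized=True)

# During an episode, transitions are added without reward or goal:
#     hrb.add(obs=obs, act=act, next_obs=next_obs)
# At the end of the episode, the original goal is passed in and the buffer
# relabels hindsight goals and recomputes rewards internally:
#     hrb.on_episode_end(goal=episode_goal)
# Sampling then yields "obs", "act", "next_obs", "rew" and "goal" fields:
#     batch = hrb.sample(256)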
Example #10
class MeTrpoTrainer(MPCTrainer):
    """
    Trainer class for Model-Ensemble Trust-Region Policy Optimization (ME-TRPO): https://arxiv.org/abs/1802.10592

    Command Line Args:

        * ``--max-steps`` (int): The maximum steps for training. The default is ``int(1e6)``
        * ``--episode-max-steps`` (int): The maximum steps for an episode. The default is ``int(1e3)``
        * ``--n-experiments`` (int): Number of experiments. The default is ``1``
        * ``--show-progress``: Call ``render`` function during training
        * ``--save-model-interval`` (int): Interval to save model. The default is ``int(1e4)``
        * ``--save-summary-interval`` (int): Interval to save summary. The default is ``int(1e3)``
        * ``--model-dir`` (str): Directory to restore model.
        * ``--dir-suffix`` (str): Suffix for directory that stores results.
        * ``--normalize-obs``: Whether to normalize observations
        * ``--logdir`` (str): Output directory name. The default is ``"results"``
        * ``--evaluate``: Whether to evaluate the trained model
        * ``--test-interval`` (int): Interval to evaluate trained model. The default is ``int(1e4)``
        * ``--show-test-progress``: Call ``render`` function during evaluation.
        * ``--test-episodes`` (int): Number of episodes at test. The default is ``5``
        * ``--save-test-path``: Save trajectories of evaluation.
        * ``--show-test-images``: Show input images to neural networks when an episode finishes
        * ``--save-test-movie``: Save rendering results.
        * ``--use-prioritized-rb``: Use prioritized experience replay
        * ``--use-nstep-rb``: Use Nstep experience replay
        * ``--n-step`` (int): Number of steps for nstep experience reward. The default is ``4``
        * ``--logging-level`` (DEBUG, INFO, WARNING): Choose logging level. The default is ``INFO``
        * ``--gpu`` (int): The default is ``0``
        * ``--max-iter`` (int): Maximum iteration. The default is ``100``
        * ``--horizon`` (int): Number of steps to online horizon
        * ``--n-sample`` (int): Number of samples. The default is ``1000``
        * ``--batch-size`` (int): Batch size. The default is ``512``.
        * ``--n-collect-steps`` (int): Number of steps to collect. The default is ``100``
        * ``--debug``: Enable debug
    """
    def __init__(self, *args, n_eval_episodes_per_model=5, **kwargs):
        """
        Initialize ME-TRPO

        Args:
            policy: Policy to be trained
            env (gym.Env): Environment for training
            args (Namespace or dict): Config parameters specified on the command line
            test_env (gym.Env): Environment for testing
            reward_fn (callable): Reward function
            buffer_size (int): The default is ``int(1e6)``
            lr (float): Learning rate for the dynamics model. The default is ``0.001``.
            n_eval_episodes_per_model (int): Number of evaluation episodes per model. The default is ``5``
        """
        kwargs["n_dynamics_model"] = 5
        super().__init__(*args, **kwargs)
        self._n_eval_episodes_per_model = n_eval_episodes_per_model

        # Replay buffer to train policy
        self.replay_buffer = get_replay_buffer(self._policy, self._env)

        # Replay buffer to compute GAE
        rb_dict = {
            "size": self._episode_max_steps,
            "default_dtype": np.float32,
            "env_dict": {
                "obs": {
                    "shape": self._env.observation_space.shape
                },
                "act": {
                    "shape": self._env.action_space.shape
                },
                "next_obs": {
                    "shape": self._env.observation_space.shape
                },
                "rew": {},
                "done": {},
                "logp": {},
                "val": {}
            }
        }
        self.local_buffer = ReplayBuffer(**rb_dict)

    def predict_next_state(self, obses, acts, idx=None):
        """
        Predict Next State

        Args:
            obses: Observation(s)
            acts: Action(s)
            idx (int): Index of the dynamics model to use. If ``None`` (default), one is chosen randomly.

        Returns:
            np.ndarray: next state
        """
        is_single_input = obses.ndim == acts.ndim and acts.ndim == 1
        if is_single_input:
            obses = np.expand_dims(obses, axis=0)
            acts = np.expand_dims(acts, axis=0)

        inputs = np.concatenate([obses, acts], axis=1)
        idx = np.random.randint(self._n_dynamics_model) if idx is None else idx
        obs_diffs = self._dynamics_models[idx].predict(inputs)

        if is_single_input:
            return obses[0] + obs_diffs

        return obses + obs_diffs

    def _make_inputs_output_pairs(self, n_epoch):
        samples = self.dynamics_buffer.sample(
            self.dynamics_buffer.get_stored_size())
        inputs = np.concatenate([samples["obs"], samples["act"]], axis=1)
        labels = samples["next_obs"] - samples["obs"]

        return inputs, labels

    def __call__(self):
        """
        Execute Training
        """
        total_steps = 0
        tf.summary.experimental.set_step(total_steps)

        while True:
            # Collect (s, a, s') pairs in a real environment
            self.collect_transitions_real_env()
            total_steps += self._n_collect_steps
            tf.summary.experimental.set_step(total_steps)

            # Train dynamics models
            self.fit_dynamics(n_epoch=1)
            if self._debug:
                ret_real_env, ret_sim_env = self._evaluate_model()
                self.logger.info(
                    "Returns (real, sim) = ({: .3f}, {: .3f})".format(
                        ret_real_env, ret_sim_env))

            # Prepare initial states for evaluation
            init_states_for_eval = np.array([
                self._env.reset()
                for _ in range(self._n_dynamics_model *
                               self._n_eval_episodes_per_model)
            ])

            # Returns to evaluate policy improvement
            returns_before_update = self._evaluate_current_return(
                init_states_for_eval)

            n_updates = 0
            improve_ratios = []
            while True:
                n_updates += 1

                # Generate samples using dynamics models (simulated env)
                average_return = self.collect_transitions_sim_env()

                # Update policy
                self.update_policy()

                # Evaluate policy improvement
                returns_after_update = self._evaluate_current_return(
                    init_states_for_eval)
                n_improved = np.sum(
                    returns_after_update > returns_before_update)
                improved_ratio = n_improved / (self._n_dynamics_model *
                                               self._n_eval_episodes_per_model)
                improve_ratios.append(improved_ratio)
                if improved_ratio < 0.7:
                    break
                returns_before_update = returns_after_update

            self.logger.info(
                "Training total steps: {0: 7} sim return: {1: .4f} n_update: {2:}, ratios: {3:}"
                .format(total_steps, average_return, n_updates,
                        improve_ratios))
            tf.summary.scalar(name="mpc/n_updates", data=n_updates)

            # Evaluate policy in a real environment
            if total_steps // self._n_collect_steps % 10 == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)

    def _evaluate_model(self):
        ret_real_env, ret_sim_env = 0., 0.
        n_episodes = 10
        for _ in range(n_episodes):
            real_obs = self._env.reset()
            sim_obs = real_obs.copy()
            for _ in range(self._episode_max_steps):
                act, _ = self._policy.get_action(real_obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act

                next_real_obs, rew, _, _ = self._env.step(env_act)
                ret_real_env += rew
                real_obs = next_real_obs

                next_sim_obs = self.predict_next_state(sim_obs, env_act)
                ret_sim_env += self._reward_fn(real_obs, act)[0]
                sim_obs = next_sim_obs

        ret_real_env /= n_episodes
        ret_sim_env /= n_episodes
        return ret_real_env, ret_sim_env

    def update_policy(self):
        """
        Update Policy
        """
        # Compute mean and std for normalizing advantage
        if self._policy.normalize_adv:
            samples = self.replay_buffer.get_all_transitions()
            mean_adv = np.mean(samples["adv"])
            std_adv = np.std(samples["adv"])

        for _ in range(self._policy.n_epoch):
            samples = self.replay_buffer._encode_sample(
                np.random.permutation(self._policy.horizon))
            adv = (samples["adv"] - mean_adv) / (
                std_adv +
                1e-8) if self._policy.normalize_adv else samples["adv"]
            for idx in range(
                    int(self._policy.horizon / self._policy.batch_size)):
                target = slice(idx * self._policy.batch_size,
                               (idx + 1) * self._policy.batch_size)
                self._policy.train(states=samples["obs"][target],
                                   actions=samples["act"][target],
                                   advantages=adv[target],
                                   logp_olds=samples["logp"][target],
                                   returns=samples["ret"][target])

    def _evaluate_current_return(self, init_states):
        n_episodes = self._n_dynamics_model * self._n_eval_episodes_per_model
        assert init_states.shape[0] == n_episodes

        obses = init_states.copy()
        next_obses = np.zeros_like(obses)
        returns = np.zeros(shape=(n_episodes, ), dtype=np.float32)

        for _ in range(self._episode_max_steps):
            acts, _ = self._policy.get_action(obses)
            for i in range(n_episodes):
                model_idx = i // self._n_eval_episodes_per_model
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(acts[i], self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = acts[i]
                next_obses[i] = self.predict_next_state(obses[i],
                                                        env_act,
                                                        idx=model_idx)
            returns += self._reward_fn(obses, acts)
            obses = next_obses

        return returns

    def _visualize_current_performance(self):
        obs = self._env.reset()
        for _ in range(self._episode_max_steps):
            act, _ = self._policy.get_action(obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low,
                                  self._env.action_space.high)
            else:
                env_act = act
            next_obs = self.predict_next_state(obs, env_act)

            self._env.state = np.array(
                [np.arctan2(next_obs[1], next_obs[0]), next_obs[2]],
                dtype=np.float32)
            # print(obs, act, next_obs, self._env.state)
            self._env.render()
            obs = next_obs

    def collect_transitions_real_env(self):
        """
        Collect transitions from the real environment
        """
        total_steps = 0
        episode_steps = 0
        obs = self._env.reset()
        while total_steps < self._n_collect_steps:
            episode_steps += 1
            total_steps += 1
            act, _ = self._policy.get_action(obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low,
                                  self._env.action_space.high)
            else:
                env_act = act
            next_obs, _, done, _ = self._env.step(env_act)
            self.dynamics_buffer.add(obs=obs, act=env_act, next_obs=next_obs)
            obs = next_obs
            if done or episode_steps == self._episode_max_steps:
                episode_steps = 0
                obs = self._env.reset()

    def collect_transitions_sim_env(self):
        """
        Generate transitions using dynamics model
        """
        self.replay_buffer.clear()
        n_episodes = 0
        ave_episode_return = 0
        while self.replay_buffer.get_stored_size() < self._policy.horizon:
            obs = self._env.reset()
            episode_return = 0.
            for _ in range(self._episode_max_steps):
                act, logp, val = self._policy.get_action_and_val(obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act
                if self._debug:
                    next_obs, rew, _, _ = self._env.step(env_act)
                else:
                    next_obs = self.predict_next_state(obs, env_act)
                    rew = self._reward_fn(obs, act)[0]
                self.local_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=rew,
                                      done=False,
                                      logp=logp,
                                      val=val)
                obs = next_obs
                episode_return += rew
            self.finish_horizon(last_val=val)
            ave_episode_return += episode_return
            n_episodes += 1
        return ave_episode_return / n_episodes

    def finish_horizon(self, last_val=0):
        """
        TODO: This code is completely identical to the one defined in on_policy_trainer.py. Reuse it.
        """
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"],
                               act=samples["act"],
                               done=samples["done"],
                               ret=rets,
                               adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        avg_test_return = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy,
                                              self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            for _ in range(self._episode_max_steps):
                act, _ = self._policy.get_action(obs, test=True)
                act = (act if not hasattr(self._env.action_space, "high") else
                       np.clip(act, self._env.action_space.low,
                               self._env.action_space.high))
                next_obs, reward, done, _ = self._test_env.step(act)
                if self._save_test_path:
                    replay_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image(
                'train/input_img',
                images,
            )
        return avg_test_return / self._test_episodes

    def _set_from_args(self, args):
        super()._set_from_args(args)
        self._n_collect_steps = args.n_collect_steps
        self._debug = args.debug

    @staticmethod
    def get_argument(parser=None):
        parser = MPCTrainer.get_argument(parser)
        parser.add_argument("--n-collect-steps", type=int, default=100)
        parser.add_argument("--debug", action='store_true')
        return parser
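
The fragment above wires its command-line flags through a chained get_argument/_set_from_args pattern: each trainer extends its parent's parser, and the parsed namespace is read back via _set_from_args. Below is a minimal, self-contained sketch of that pattern; BaseTrainer and ModelBasedTrainer are hypothetical stand-ins for the real classes.

import argparse


class BaseTrainer:
    """Hypothetical base trainer; only the argparse plumbing is sketched."""

    def _set_from_args(self, args):
        self._max_steps = args.max_steps

    @staticmethod
    def get_argument(parser=None):
        parser = parser or argparse.ArgumentParser()
        parser.add_argument("--max-steps", type=int, default=int(1e6))
        return parser


class ModelBasedTrainer(BaseTrainer):
    def _set_from_args(self, args):
        super()._set_from_args(args)
        # Dashes in flag names become underscores on the parsed namespace
        self._n_collect_steps = args.n_collect_steps
        self._debug = args.debug

    @staticmethod
    def get_argument(parser=None):
        parser = BaseTrainer.get_argument(parser)
        parser.add_argument("--n-collect-steps", type=int, default=100)
        parser.add_argument("--debug", action='store_true')
        return parser


if __name__ == "__main__":
    args = ModelBasedTrainer.get_argument().parse_args()
    print(args.max_steps, args.n_collect_steps, args.debug)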
Beispiel #11
0
    def test_Nstep_discounts_with_done(self):
        buffer_size = 32
        step = 4
        gamma = 0.5

        rb = ReplayBuffer(buffer_size, {"done": {}},
                          Nstep={
                              "size": step,
                              "gamma": gamma
                          })

        rb.add(done=0)
        rb.add(done=0)
        rb.add(done=0)
        rb.add(done=1)
        rb.on_episode_end()

        np.testing.assert_allclose(rb.get_all_transitions()["done"],
                                   np.asarray([[0], [1], [1], [1]]))

        rb.add(done=0)
        rb.add(done=0)
        rb.add(done=0)
        rb.add(done=0)
        np.testing.assert_allclose(rb.get_all_transitions()["done"],
                                   np.asarray([[0], [1], [1], [1], [0]]))

        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(
            rb.get_all_transitions()["done"],
            np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1]]))

        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(
            rb.get_all_transitions()["done"],
            np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1]]))

        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(
            rb.get_all_transitions()["done"],
            np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1],
                        [1]]))

        rb.add(done=0)
        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(
            rb.get_all_transitions()["done"],
            np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1], [1],
                        [1], [1]]))

        rb.add(done=0)
        rb.add(done=0)
        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(
            rb.get_all_transitions()["done"],
            np.asarray([[0], [1], [1], [1], [0], [0], [1], [1], [1], [1], [1],
                        [1], [1], [1], [1], [1]]))

        rb.clear()
        self.assertEqual(rb.get_stored_size(), 0)
        rb.add(done=1)
        rb.on_episode_end()
        np.testing.assert_allclose(rb.get_all_transitions()["done"],
                                   np.asarray([[1]]))
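
The test above only checks how the done flag is stitched together under Nstep. As a companion, here is a minimal sketch of the reward side of the same feature, assuming cpprb's Nstep configuration also accepts "rew" and "next" keys (as its documentation describes), so that stored rewards become discounted n-step sums and next_obs is shifted forward.

import numpy as np
from cpprb import ReplayBuffer

nstep_rb = ReplayBuffer(32,
                        {"obs": {}, "rew": {}, "next_obs": {}, "done": {}},
                        Nstep={"size": 4,
                               "gamma": 0.5,
                               "rew": "rew",
                               "next": "next_obs"})

for i in range(8):
    nstep_rb.add(obs=i, rew=1.0, next_obs=i + 1, done=0)
nstep_rb.on_episode_end()

# Wherever a full 4-step window exists, the stored reward should be
# 1 + 0.5 + 0.25 + 0.125 = 1.875.
print(nstep_rb.get_all_transitions()["rew"])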
Beispiel #12
0
def explorer(global_rb, queue, trained_steps, n_transition,
             is_training_done, lock, env_fn, policy_fn,
             buffer_size=1024, max_transition=None,
             episode_max_steps=1000):
    """
    Collect transitions and store them in the prioritized replay buffer.
    Args:
        global_rb:
            Prioritized replay buffer shared among multiple explorers and a single learner.
            This object is shared across processes, so any operation on it must be
            guarded with the `lock` object.
        queue:
            A FIFO queue shared with the learner to receive the latest network parameters.
            It is process safe, so no lock is needed when using it.
        trained_steps:
            Number of applied gradient steps.
        n_transition:
            Number of collected transitions.
        is_training_done:
            multiprocessing.Event object to share the status of training.
        lock:
            multiprocessing.Lock to block other processes. It must be released once the
            operation is done.
        env_fn:
            Method object to generate an environment.
        policy_fn:
            Method object to generate an explorer.
        buffer_size:
            Size of the local buffer. Once it is filled with transitions, they are added to `global_rb`.
        max_transition:
            Maximum number of steps to explore. The default is None.
        episode_max_steps:
            Maximum number of steps in an episode.
    """
    env = env_fn()
    policy = policy_fn(env, "Explorer", global_rb.get_buffer_size())
    local_rb = ReplayBuffer(obs_shape=env.observation_space.shape,
                            act_dim=env.action_space.low.size,
                            size=buffer_size)

    s = env.reset()
    episode_steps = 0
    total_reward = 0.
    total_rewards = []
    start = time.time()
    sample_at_start = 0

    while not is_training_done.is_set():
        # Periodically copy weights of explorer
        if not queue.empty():
            actor_weights, critic_weights, critic_target_weights = queue.get()
            update_target_variables(policy.actor.weights, actor_weights, tau=1.)
            update_target_variables(policy.critic.weights, critic_weights, tau=1.)
            update_target_variables(policy.critic_target.weights, critic_target_weights, tau=1.)

        n_transition.value += 1
        episode_steps += 1
        a = policy.get_action(s)
        s_, r, done, _ = env.step(a)
        done_flag = done
        if episode_steps == env._max_episode_steps:
            done_flag = False
        total_reward += r
        local_rb.add(s, a, r, s_, done_flag)

        s = s_
        if done or episode_steps == episode_max_steps:
            s = env.reset()
            total_rewards.append(total_reward)
            total_reward = 0
            episode_steps = 0

        # Add collected experiences to global replay buffer
        if local_rb.get_stored_size() == buffer_size - 1:
            temp_n_transition = n_transition.value
            samples = local_rb.sample(local_rb.get_stored_size())
            states, next_states, actions, rewards, done = samples["obs"], samples["next_obs"], samples["act"], samples["rew"], samples["done"]
            done = np.array(done, dtype=np.float64)
            td_errors = policy.compute_td_error(
                states, actions, next_states, rewards, done)
            print("Grad: {0: 6d}\tSamples: {1: 7d}\tTDErr: {2:.5f}\tAveEpiRew: {3:.3f}\tFPS: {4:.2f}".format(
                trained_steps.value, n_transition.value, np.average(np.abs(td_errors).flatten()),
                sum(total_rewards) / len(total_rewards), (temp_n_transition - sample_at_start) / (time.time() - start)))
            total_rewards = []
            lock.acquire()
            global_rb.add(
                states, actions, rewards, next_states, done,
                priorities=np.abs(td_errors)+1e-6)
            lock.release()
            local_rb.clear()
            start = time.time()
            sample_at_start = n_transition.value

        if max_transition is not None and n_transition.value >= max_transition:
            is_training_done.set()
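
The explorer above expects a set of shared multiprocessing primitives. Below is a minimal launch sketch, assuming explorer is importable from the listing above and that env_fn/policy_fn factories and a process-shared global_rb are provided by the surrounding project; only the multiprocessing plumbing is shown.

import multiprocessing as mp


def launch_explorers(global_rb, env_fn, policy_fn,
                     n_explorers=2, max_transition=int(1e5)):
    queue = mp.Queue()                 # learner -> explorers: latest weights
    trained_steps = mp.Value("i", 0)   # gradient steps applied so far
    n_transition = mp.Value("i", 0)    # transitions collected so far
    is_training_done = mp.Event()
    lock = mp.Lock()

    workers = [
        mp.Process(target=explorer,
                   args=(global_rb, queue, trained_steps, n_transition,
                         is_training_done, lock, env_fn, policy_fn),
                   kwargs={"max_transition": max_transition})
        for _ in range(n_explorers)
    ]
    for w in workers:
        w.start()
    return workers, queue, is_training_done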
Beispiel #13
0
class OnPolicyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super(OnPolicyTrainer, self).__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0  # number of steps in the current trajectory
        episode_return = 0  # cumulative reward of the current episode
        episode_start_time = time.time()
        total_steps = 0
        n_episode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            for _ in range(self._policy.horizon):
                if self._normalize_obs:
                    obs = np.expand_dims(obs, axis=0)
                    obs = self._obs_normalizer(obs, update=False)
                    obs = np.squeeze(obs, axis=0)

                act, logp, val = self._policy.get_action_and_val(obs)
                next_obs, reward, done, _ = self._env.step(act)
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward
                done_flag = done
                if hasattr(self._env, "_max_episode_steps"
                           ) and episode_steps == self._env._max_episode_steps:
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=logp,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()

                # Evaluate the policy at the test interval
                if total_steps % self._test_interval == 0:
                    avg_test_return = self.evaluate_policy(total_steps)
                    tf.summary.scalar(name="Common/average_test_return",
                                      data=avg_test_return)
                    self.writer.flush()

                # Save model parameters every '_save_model_interval' steps
                if total_steps % self._save_model_interval == 0:
                    self.checkpoint_manager.save()

            self.finish_horizon(last_val=val)
            tf.summary.experimental.set_step(total_steps)

            # Train actor critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer._encode_sample(
                    np.arange(self._policy.horizon))
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
                if self._normalize_obs:
                    self._obs_normalizer.experience(samples["obs"])

            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer._encode_sample(
                    np.random.permutation(self._policy.horizon))
                if self._normalize_obs:
                    samples["obs"] = self._obs_normalizer(samples["obs"],
                                                          update=False)
                if self._policy.normalize_adv:
                    samples["adv"] = (samples["adv"] - mean_adv) / std_adv

                for idx in range(
                        int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=samples["adv"][target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

        tf.summary.flush()

    # Compute GAE-Lambda; called at the end of each trajectory or when an epoch terminates
    def finish_horizon(self, last_val=0):
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        # Temporal-difference errors [δ0, δ1, δ2, ..., δt]
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"],
                               act=samples["act"],
                               done=samples["done"],
                               ret=rets,
                               adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        avg_test_return = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy,
                                              self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            for _ in range(self._episode_max_steps):
                if self._normalize_obs:
                    obs = np.expand_dims(obs, axis=0)
                    obs = self._obs_normalizer(obs, update=False)
                    obs = np.squeeze(obs, axis=0)

                act, _ = self._policy.get_action(obs, test=True)
                act = act if not hasattr(self._env.action_space, "high") else \
                    np.clip(act, self._env.action_space.low, self._env.action_space.high)
                next_obs, reward, done, _ = self._test_env.step(act)
                if self._save_test_path:
                    replay_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()

                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return

        return avg_test_return / self._test_episodes
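
The trainer above builds its buffers through the get_replay_buffer/get_default_rb_dict helpers. Below is a minimal sketch of the same two-buffer layout written directly against cpprb, for a hypothetical environment with a 4-dimensional observation and a 1-dimensional continuous action; the key names mirror what finish_horizon reads and writes.

from cpprb import ReplayBuffer

obs_shape, act_dim, horizon = (4,), 1, 2048

# Per-episode buffer: raw transitions plus log-probabilities and value estimates.
local_buffer = ReplayBuffer(horizon,
                            {"obs": {"shape": obs_shape},
                             "act": {"shape": act_dim},
                             "next_obs": {"shape": obs_shape},
                             "rew": {},
                             "done": {},
                             "logp": {},
                             "val": {}})

# Training buffer: what finish_horizon() stores after computing advantages
# and rewards-to-go.
replay_buffer = ReplayBuffer(horizon,
                             {"obs": {"shape": obs_shape},
                              "act": {"shape": act_dim},
                              "done": {},
                              "ret": {},
                              "adv": {},
                              "logp": {}})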
Beispiel #14
0
class OnPolicyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self._test_interval % self._policy.horizon == 0, \
            "Test interval should be divisible by policy horizon"

    def __call__(self):
        total_steps = 0
        n_episode = 0

        # TODO: clean codes
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            n_episode, total_rewards = self._collect_sample(
                n_episode, total_steps)
            total_steps += self._policy.horizon
            tf.summary.experimental.set_step(total_steps)

            if len(total_rewards) > 0:
                avg_training_return = sum(total_rewards) / len(total_rewards)
                tf.summary.scalar(name="Common/training_return",
                                  data=avg_training_return)

            # Train actor critic
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer.sample(self._policy.horizon)
                if self._policy.normalize_adv:
                    adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(
                        samples["adv"])
                else:
                    adv = samples["adv"]
                for idx in range(
                        int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=adv[target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

            if total_steps % self._test_interval == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                self.writer.flush()

            if total_steps % self._model_save_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()

    def _collect_sample(self, n_episode, total_steps):
        episode_steps = 0
        episode_return = 0
        episode_returns = []
        episode_start_time = time.time()
        obs = self._env.reset()
        for _ in range(self._policy.horizon):
            act, logp, val = self._policy.get_action_and_val(obs)
            # TODO: Clean code
            clipped_act = act if not hasattr(self._env.action_space, "high") else \
                np.clip(act, self._env.action_space.low, self._env.action_space.high)
            next_obs, reward, done, _ = self._env.step(clipped_act)
            if self._show_progress:
                self._env.render()
            episode_steps += 1
            episode_return += reward

            done_flag = done
            if hasattr(self._env, "_max_episode_steps") and \
                    episode_steps == self._env._max_episode_steps:
                done_flag = False
            self.local_buffer.add(obs=obs,
                                  act=act,
                                  next_obs=next_obs,
                                  rew=reward,
                                  done=done_flag,
                                  logp=logp,
                                  val=val)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                total_steps += episode_steps
                self.finish_horizon()
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                    .format(n_episode, int(total_steps), episode_steps,
                            episode_return, fps))

                tf.summary.scalar(name="Common/fps", data=fps)
                episode_returns.append(episode_return)
                episode_steps = 0
                episode_return = 0
                episode_start_time = time.time()
        self.finish_horizon(last_val=val)
        return n_episode, episode_returns

    def finish_horizon(self, last_val=0):
        """
        Call this at the end of a trajectory, or when one gets cut off
        by an epoch ending. This looks back in the buffer to where the
        trajectory started, and uses rewards and value estimates from
        the whole trajectory to compute advantage estimates with GAE-Lambda,
        as well as compute the rewards-to-go for each state, to use as
        the targets for the value function.
        The "last_val" argument should be 0 if the trajectory ended
        because the agent reached a terminal state (died), and otherwise
        should be V(s_T), the value function estimated for the last state.
        This allows us to bootstrap the reward-to-go calculation to account
        for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
        """
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"],
                               act=samples["act"],
                               done=samples["done"],
                               ret=rets,
                               adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        avg_test_return = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy,
                                              self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            done = False
            for _ in range(self._episode_max_steps):
                act, _ = self._policy.get_action(obs, test=True)
                act = act if not hasattr(self._env.action_space, "high") else \
                    np.clip(act, self._env.action_space.low, self._env.action_space.high)
                next_obs, reward, done, _ = self._test_env.step(act)
                if self._save_test_path:
                    replay_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image(
                'train/input_img',
                images,
            )
        return avg_test_return / self._test_episodes
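
The finish_horizon docstring above describes the GAE-Lambda and rewards-to-go computation but relies on an external discount_cumsum. Below is a tiny numeric sketch with one possible NumPy implementation of discount_cumsum; the project presumably ships its own, so this version is only illustrative.

import numpy as np


def discount_cumsum(x, discount):
    """Reverse discounted cumulative sum: y[t] = sum_k discount**k * x[t + k]."""
    y = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y


# Tiny numeric check of the recipe used in finish_horizon().
rews, vals, last_val = np.array([1.0, 1.0, 1.0]), np.array([0.5, 0.4, 0.3]), 0.2
discount, lam = 0.99, 0.95

rews_ext = np.append(rews, last_val)
vals_ext = np.append(vals, last_val)
deltas = rews_ext[:-1] + discount * vals_ext[1:] - vals_ext[:-1]
advs = discount_cumsum(deltas, discount * lam)   # GAE-Lambda advantages
rets = discount_cumsum(rews_ext, discount)[:-1]  # rewards-to-go (value targets)
print(advs, rets)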
Beispiel #15
0
def explorer(global_rb,
             queue,
             trained_steps,
             is_training_done,
             lock,
             buffer_size=1024,
             episode_max_steps=1000):
    env = gym.make('CartPole-v1')
    policy = Agent()
    env_dict = {
        "obs": {
            "shape": (state_size, )
        },
        "act": {},
        "rew": {},
        "next_obs": {
            "shape": (state_size, )
        },
        "done": {}
    }
    local_rb = ReplayBuffer(buffer_size, env_dict=env_dict)
    local_idx = np.arange(buffer_size).astype(np.int64)

    s = env.reset()
    episode_steps = 0
    total_reward = 0.
    total_rewards = []

    start = time.time()
    n_sample, n_sample_old = 0, 0

    while not is_training_done.is_set():
        n_sample += 1
        episode_steps += 1
        a = policy.acting(s)
        s_, r, done, _ = env.step(a)
        done_flag = done
        if episode_steps == env._max_episode_steps:
            done_flag = False
        total_reward += r
        policy.n_step_buffer.append((s, a, r, s_, done_flag))
        if len(policy.n_step_buffer) == policy.n_step:
            reward, next_state, done = policy.get_n_step_info(
                policy.n_step_buffer, policy.gamma)
            state, action = policy.n_step_buffer[0][:2]
            local_rb.add(obs=state,
                         act=action,
                         rew=reward,
                         next_obs=next_state,
                         done=done)

        s = s_
        if done or episode_steps == episode_max_steps:
            s = env.reset()
            total_rewards.append(total_reward)
            total_reward = 0
            episode_steps = 0

        if not queue.empty():
            set_weights_fn(policy, queue.get())

        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb._encode_sample(local_idx)
            td_errors = policy.compute_td_error(samples["obs"], samples["act"],
                                                samples["rew"],
                                                samples["next_obs"],
                                                samples["done"])
            priorities = np.abs(np.squeeze(td_errors)) + 1e-6

            lock.acquire()
            global_rb.add(obs=samples["obs"],
                          act=samples["act"],
                          rew=samples["rew"],
                          next_obs=samples["next_obs"],
                          done=samples["done"],
                          priorities=priorities)
            lock.release()
            local_rb.clear()

            ave_rew = (0 if len(total_rewards) == 0 else sum(total_rewards) /
                       len(total_rewards))

            total_rewards = []
            start = time.time()
            n_sample_old = n_sample
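
The explorer above delegates n-step aggregation to Agent.get_n_step_info, which is not shown in this listing. Below is a minimal sketch of one plausible implementation over the same (s, a, r, s_, done) tuples, summing discounted rewards and cutting the window at the first terminal transition; the real Agent may differ.

from collections import deque


def get_n_step_info(n_step_buffer, gamma):
    """Aggregate a window of (s, a, r, s_, done) tuples into one n-step transition."""
    reward, next_state, done = n_step_buffer[-1][-3:]
    for _, _, r, s_, d in reversed(list(n_step_buffer)[:-1]):
        reward = r + gamma * reward * (1.0 - d)
        next_state, done = (s_, d) if d else (next_state, done)
    return reward, next_state, done


n_step_buffer = deque(maxlen=3)
for t in range(3):
    n_step_buffer.append((t, 0, 1.0, t + 1, False))
print(get_n_step_info(n_step_buffer, gamma=0.99))  # -> (2.9701, 3, False)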
Beispiel #16
0
class OnPolicyTrainer(Trainer):
    """
    Trainer class for on-policy reinforcement learning

    Command Line Args:

        * ``--max-steps`` (int): The maximum steps for training. The default is ``int(1e6)``
        * ``--episode-max-steps`` (int): The maximum steps for an episode. The default is ``int(1e3)``
        * ``--n-experiments`` (int): Number of experiments. The default is ``1``
        * ``--show-progress``: Call ``render`` function during training
        * ``--save-model-interval`` (int): Interval to save model. The default is ``int(1e4)``
        * ``--save-summary-interval`` (int): Interval to save summary. The default is ``int(1e3)``
        * ``--model-dir`` (str): Directory to restore model.
        * ``--dir-suffix`` (str): Suffix for directory that stores results.
        * ``--normalize-obs``: Whether to normalize observations
        * ``--logdir`` (str): Output directory name. The default is ``"results"``
        * ``--evaluate``: Whether to evaluate the trained model
        * ``--test-interval`` (int): Interval to evaluate trained model. The default is ``int(1e4)``
        * ``--show-test-progress``: Call ``render`` function during evaluation.
        * ``--test-episodes`` (int): Number of episodes at test. The default is ``5``
        * ``--save-test-path``: Save trajectories of evaluation.
        * ``--show-test-images``: Show input images to neural networks when an episode finishes
        * ``--save-test-movie``: Save rendering results.
        * ``--use-prioritized-rb``: Use prioritized experience replay
        * ``--use-nstep-rb``: Use Nstep experience replay
        * ``--n-step`` (int): Number of steps for nstep experience reward. The default is ``4``
        * ``--logging-level`` (DEBUG, INFO, WARNING): Choose logging level. The default is ``INFO``
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize On-Policy Trainer

        Args:
            policy: Policy to be trained
            env (gym.Env): Environment for training
            args (Namespace or dict): Config parameters specified via command line
            test_env (gym.Env): Environment for testing.
        """
        super().__init__(*args, **kwargs)

    def __call__(self):
        """
        Execute training
        """
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0
        episode_return = 0
        episode_start_time = time.time()
        total_steps = np.array(0, dtype=np.int32)
        n_episode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            for _ in range(self._policy.horizon):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, logp, val = self._policy.get_action_and_val(obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act
                next_obs, reward, done, _ = self._env.step(env_act)
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward

                done_flag = done
                if (hasattr(self._env, "_max_episode_steps")
                        and episode_steps == self._env._max_episode_steps):
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=logp,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/training_episode_length",
                                      data=episode_steps)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()

                if total_steps % self._test_interval == 0:
                    avg_test_return, avg_test_steps = self.evaluate_policy(
                        total_steps)
                    self.logger.info(
                        "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                        .format(total_steps, avg_test_return,
                                self._test_episodes))
                    tf.summary.scalar(name="Common/average_test_return",
                                      data=avg_test_return)
                    tf.summary.scalar(
                        name="Common/average_test_episode_length",
                        data=avg_test_steps)
                    self.writer.flush()

                if total_steps % self._save_model_interval == 0:
                    self.checkpoint_manager.save()

            self.finish_horizon(last_val=val)

            tf.summary.experimental.set_step(total_steps)

            # Train actor critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer.get_all_transitions()
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
                # Update normalizer
                if self._normalize_obs:
                    self._obs_normalizer.experience(samples["obs"])
            with tf.summary.record_if(total_steps %
                                      self._save_summary_interval == 0):
                for _ in range(self._policy.n_epoch):
                    samples = self.replay_buffer._encode_sample(
                        np.random.permutation(self._policy.horizon))
                    if self._normalize_obs:
                        samples["obs"] = self._obs_normalizer(samples["obs"],
                                                              update=False)
                    if self._policy.normalize_adv:
                        adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                    else:
                        adv = samples["adv"]
                    for idx in range(
                            int(self._policy.horizon /
                                self._policy.batch_size)):
                        target = slice(idx * self._policy.batch_size,
                                       (idx + 1) * self._policy.batch_size)
                        self._policy.train(states=samples["obs"][target],
                                           actions=samples["act"][target],
                                           advantages=adv[target],
                                           logp_olds=samples["logp"][target],
                                           returns=samples["ret"][target])

        tf.summary.flush()

    def finish_horizon(self, last_val=0):
        """
        Finish horizon
        """
        self.local_buffer.on_episode_end()
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"],
                               act=samples["act"],
                               done=samples["done"],
                               ret=rets,
                               adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        """
        Evaluate policy

        Args:
            total_steps (int): Current total steps of training
        """
        avg_test_return = 0.
        avg_test_steps = 0
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy,
                                              self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            avg_test_steps += 1
            for _ in range(self._episode_max_steps):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, _ = self._policy.get_action(obs, test=True)
                act = (act if is_discrete(self._env.action_space) else np.clip(
                    act, self._env.action_space.low,
                    self._env.action_space.high))
                next_obs, reward, done, _ = self._test_env.step(act)
                avg_test_steps += 1
                if self._save_test_path:
                    replay_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image(
                'train/input_img',
                images,
            )
        return avg_test_return / self._test_episodes, avg_test_steps / self._test_episodes
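
All of the on-policy trainers above share the same minibatch scheme: compute advantage statistics over the whole horizon, reshuffle once per epoch, then train on contiguous slices of batch_size. Below is a minimal stand-alone sketch of that scheme; the dummy arrays stand in for replay_buffer.get_all_transitions(), and policy.train is where the real update would go.

import numpy as np


def iterate_minibatches(samples, batch_size, normalize_adv=True, eps=1e-8):
    """Yield shuffled minibatches with horizon-wide advantage normalization."""
    horizon = len(samples["adv"])
    adv = samples["adv"]
    if normalize_adv:
        adv = (adv - adv.mean()) / (adv.std() + eps)
    perm = np.random.permutation(horizon)
    for start in range(0, horizon - batch_size + 1, batch_size):
        idx = perm[start:start + batch_size]
        yield {"obs": samples["obs"][idx],
               "act": samples["act"][idx],
               "adv": adv[idx],
               "logp": samples["logp"][idx],
               "ret": samples["ret"][idx]}


dummy = {"obs": np.random.rand(8, 3),
         "act": np.random.rand(8, 1),
         "adv": np.random.rand(8),
         "logp": np.random.rand(8),
         "ret": np.random.rand(8)}
for batch in iterate_minibatches(dummy, batch_size=4):
    pass  # policy.train(states=batch["obs"], ...) would go here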