Example #1
# Standard-library pieces used below; PrioritizedReplayBuffer and
# get_default_rb_dict come from the surrounding library (cpprb / tf2rl).
from multiprocessing import Event, Value
from multiprocessing.managers import SyncManager


def prepare_experiment(env, args):
    # Manager to share the prioritized replay buffer (PER) between a learner
    # and explorers
    SyncManager.register('PrioritizedReplayBuffer', PrioritizedReplayBuffer)
    manager = SyncManager()
    manager.start()

    kwargs = get_default_rb_dict(args.replay_buffer_size, env)
    kwargs["check_for_update"] = True
    global_rb = manager.PrioritizedReplayBuffer(**kwargs)

    # Queues to share network parameters between the learner and explorers
    n_queue = 1 if args.n_env > 1 else args.n_explorer
    n_queue += 1  # for evaluation
    queues = [manager.Queue() for _ in range(n_queue)]

    # Event object to share training status. If the event is set, all
    # explorers stop sampling transitions.
    is_training_done = Event()

    # Lock to guard writes to the shared replay buffer
    lock = manager.Lock()

    # Shared-memory counter for the number of gradient updates applied so far
    trained_steps = Value('i', 0)

    return global_rb, queues, is_training_done, lock, trained_steps
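
A minimal sketch of consuming the returned handles, assuming `env` and an `args` namespace with `replay_buffer_size`, `n_env`, and `n_explorer` already exist:

global_rb, queues, is_training_done, lock, trained_steps = \
    prepare_experiment(env, args)

# The proxy buffer is shared across processes, so access goes through `lock`
with lock:
    capacity = global_rb.get_buffer_size()

is_training_done.set()  # signals every explorer to stop sampling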
Example #2
    def __call__(self):
        total_steps = 0
        n_episode = 0

        # TODO: clean up this code
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            n_episode, total_rewards = self._collect_sample(
                n_episode, total_steps)
            total_steps += self._policy.horizon
            tf.summary.experimental.set_step(total_steps)

            if len(total_rewards) > 0:
                avg_training_return = sum(total_rewards) / len(total_rewards)
                tf.summary.scalar(name="Common/training_return",
                                  data=avg_training_return)

            # Train actor critic
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer.sample(self._policy.horizon)
                if self._policy.normalize_adv:
                    # Epsilon guards against zero std when all advantages
                    # in the horizon are identical
                    adv = (samples["adv"] - np.mean(samples["adv"])) / (
                        np.std(samples["adv"]) + 1e-8)
                else:
                    adv = samples["adv"]
                for idx in range(
                        int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=adv[target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

            if total_steps % self._test_interval == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                self.writer.flush()

            if total_steps % self._model_save_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()
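
The training loop above (also in Examples #4 and #6) walks the horizon in contiguous minibatch slices. A standalone illustration of the indexing, with hypothetical sizes:

import numpy as np

horizon, batch_size = 8, 4  # toy values for illustration only
data = np.arange(horizon)
for idx in range(horizon // batch_size):
    target = slice(idx * batch_size, (idx + 1) * batch_size)
    print(data[target])  # [0 1 2 3], then [4 5 6 7]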
Example #3
def get_replay_buffer(policy,
                      env,
                      use_prioritized_rb=False,
                      use_nstep_rb=False,
                      n_step=1,
                      size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs["size"] = size

    # On-policy agent: drop next_obs/rew and store log-prob, return,
    # and advantage entries instead
    if not issubclass(type(policy), OffPolicyAgent):
        kwargs["size"] = policy.horizon
        kwargs["env_dict"].pop("next_obs")
        kwargs["env_dict"].pop("rew")
        kwargs["env_dict"]["logp"] = {}
        kwargs["env_dict"]["ret"] = {}
        kwargs["env_dict"]["adv"] = {}
        if is_discrete(env.action_space):
            kwargs["env_dict"]["act"]["dtype"] = np.int32
        return ReplayBuffer(**kwargs)

    # N-step: compress n_step transitions into a single sample
    if use_nstep_rb:
        kwargs["Nstep"] = {
            "size": n_step,
            "gamma": policy.discount,
            "rew": "rew",
            "next": "next_obs"
        }

    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)
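
A sketch of the four buffer configurations this helper can return, assuming `policy` and `env` objects already exist:

rb = get_replay_buffer(policy, env)                            # plain FIFO
per = get_replay_buffer(policy, env, use_prioritized_rb=True)  # prioritized
nstep = get_replay_buffer(policy, env, use_nstep_rb=True, n_step=4)
per_nstep = get_replay_buffer(policy, env, use_prioritized_rb=True,
                              use_nstep_rb=True, n_step=4)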
Example #4
    def __call__(self):
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0
        episode_return = 0
        episode_start_time = time.time()
        total_steps = np.array(0, dtype=np.int32)
        n_episode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            for _ in range(self._policy.horizon):
                act, logp, val = self._policy.get_action_and_val(obs)
                next_obs, reward, done, _ = self._env.step(act)
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward

                done_flag = done
                if hasattr(self._env, "_max_episode_steps") and \
                        episode_steps == self._env._max_episode_steps:
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=logp,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()
            self.finish_horizon(last_val=val)

            tf.summary.experimental.set_step(total_steps)

            # Train actor critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer._encode_sample(
                    np.arange(self._policy.horizon))
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
            with tf.summary.record_if(total_steps %
                                      self._save_summary_interval == 0):
                for _ in range(self._policy.n_epoch):
                    samples = self.replay_buffer._encode_sample(
                        np.random.permutation(self._policy.horizon))
                    if self._policy.normalize_adv:
                        adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                    else:
                        adv = samples["adv"]
                    for idx in range(
                            int(self._policy.horizon /
                                self._policy.batch_size)):
                        target = slice(idx * self._policy.batch_size,
                                       (idx + 1) * self._policy.batch_size)
                        self._policy.train(states=samples["obs"][target],
                                           actions=samples["act"][target],
                                           advantages=adv[target],
                                           logp_olds=samples["logp"][target],
                                           returns=samples["ret"][target])

            if total_steps % self._test_interval == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                self.writer.flush()

            if total_steps % self._save_model_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()
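
Example #4 computes the normalization statistics once over the full horizon and reuses them for every epoch; the added 1e-8 guards against a zero standard deviation. The step in isolation, with toy values:

import numpy as np

adv = np.array([0.5, 1.5, -1.0, 2.0])  # toy advantages
mean_adv, std_adv = np.mean(adv), np.std(adv)
normalized = (adv - mean_adv) / (std_adv + 1e-8)  # zero mean, ~unit variance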
Example #5
def explorer(global_rb,
             queue,
             trained_steps,
             is_training_done,
             lock,
             env_fn,
             policy_fn,
             set_weights_fn,
             noise_level,
             n_env=64,
             n_thread=4,
             buffer_size=1024,
             episode_max_steps=1000,
             gpu=0):
    """
    Collect transitions and store them to prioritized replay buffer.

    :param global_rb (multiprocessing.managers.AutoProxy[PrioritizedReplayBuffer]):
        Prioritized replay buffer sharing with multiple explorers and only one learner.
        This object is shared over processes, so it must be locked when trying to
        operate something with `lock` object.
    :param queue (multiprocessing.Queue):
        A FIFO shared with the `learner` and `evaluator` to get the latest network weights.
        This is process safe, so you don't need to lock process when use this.
    :param trained_steps (multiprocessing.Value):
        Number of steps to apply gradients.
    :param is_training_done (multiprocessing.Event):
        multiprocessing.Event object to share the status of training.
    :param lock (multiprocessing.Lock):
        multiprocessing.Lock to lock other processes.
    :param env_fn (function):
        Method object to generate an environment.
    :param policy_fn (function):
        Method object to generate an explorer.
    :param set_weights_fn (function):
        Method object to set network weights gotten from queue.
    :param noise_level (float):
        Noise level for exploration. For epsilon-greedy policy like DQN variants,
        this will be epsilon, and if DDPG variants this will be variance for Normal distribution.
    :param n_env (int):
        Number of environments to distribute. If this is set to be more than 1,
        `MultiThreadEnv` will be used.
    :param n_thread (int):
        Number of thread used in `MultiThreadEnv`.
    :param buffer_size (int):
        Size of local buffer. If this is filled with transitions, add them to `global_rb`
    :param episode_max_steps (int):
        Maximum number of steps of an episode.
    :param gpu (int):
        GPU id. If this is set to -1, then this process uses only CPU.
    """
    import_tf()
    logger = logging.getLogger("tf2rl")

    if n_env > 1:
        envs = MultiThreadEnv(env_fn=env_fn,
                              batch_size=n_env,
                              thread_pool=n_thread,
                              max_episode_steps=episode_max_steps)
        env = envs._sample_env
    else:
        env = env_fn()

    policy = policy_fn(env=env,
                       name="Explorer",
                       memory_capacity=global_rb.get_buffer_size(),
                       noise_level=noise_level,
                       gpu=gpu)

    kwargs = get_default_rb_dict(buffer_size, env)
    if n_env > 1:
        kwargs["env_dict"]["priorities"] = {}
    local_rb = ReplayBuffer(**kwargs)
    local_idx = np.arange(buffer_size, dtype=np.int64)  # np.int was removed in NumPy 1.24

    if n_env == 1:
        s = env.reset()
        episode_steps = 0
        total_reward = 0.
        total_rewards = []

    start = time.time()
    n_sample, n_sample_old = 0, 0

    while not is_training_done.is_set():
        if n_env == 1:
            n_sample += 1
            episode_steps += 1
            a = policy.get_action(s)
            s_, r, done, _ = env.step(a)
            done_flag = done
            if hasattr(env, "_max_episode_steps") and \
                    episode_steps == env._max_episode_steps:
                done_flag = False
            total_reward += r
            local_rb.add(obs=s, act=a, rew=r, next_obs=s_, done=done_flag)

            s = s_
            if done or episode_steps == episode_max_steps:
                s = env.reset()
                total_rewards.append(total_reward)
                total_reward = 0
                episode_steps = 0
        else:
            n_sample += n_env
            obses = envs.py_observation()
            actions = policy.get_action(obses, tensor=True)
            next_obses, rewards, dones, _ = envs.step(actions)
            td_errors = policy.compute_td_error(states=obses,
                                                actions=actions,
                                                next_states=next_obses,
                                                rewards=rewards,
                                                dones=dones)
            local_rb.add(obs=obses,
                         act=actions,
                         next_obs=next_obses,
                         rew=rewards,
                         done=dones,
                         # Epsilon outside the abs keeps priorities strictly
                         # positive, matching the flush path below
                         priorities=np.abs(td_errors) + 1e-6)

        # Periodically copy weights of explorer
        if not queue.empty():
            set_weights_fn(policy, queue.get())

        # Add collected experiences to global replay buffer
        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb._encode_sample(local_idx)
            if n_env > 1:
                priorities = np.squeeze(samples["priorities"])
            else:
                td_errors = policy.compute_td_error(
                    states=samples["obs"],
                    actions=samples["act"],
                    next_states=samples["next_obs"],
                    rewards=samples["rew"],
                    dones=samples["done"])
                priorities = np.abs(np.squeeze(td_errors)) + 1e-6
            lock.acquire()
            global_rb.add(obs=samples["obs"],
                          act=samples["act"],
                          rew=samples["rew"],
                          next_obs=samples["next_obs"],
                          done=samples["done"],
                          priorities=priorities)
            lock.release()
            local_rb.clear()

            msg = "Grad: {0: 6d}\t".format(trained_steps.value)
            msg += "Samples: {0: 7d}\t".format(n_sample)
            msg += "TDErr: {0:.5f}\t".format(np.average(priorities))
            if n_env == 1:
                ave_rew = (0 if len(total_rewards) == 0 else
                           sum(total_rewards) / len(total_rewards))
                msg += "AveEpiRew: {0:.3f}\t".format(ave_rew)
                total_rewards = []
            msg += "FPS: {0:.2f}".format(
                (n_sample - n_sample_old) / (time.time() - start))
            logger.info(msg)

            start = time.time()
            n_sample_old = n_sample
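
Putting Examples #1 and #5 together, a hedged launch sketch; `env_fn`, `policy_fn`, `set_weights_fn`, and the noise level are placeholders the caller must supply:

from multiprocessing import Process

processes = [
    Process(target=explorer,
            args=(global_rb, queues[i], trained_steps, is_training_done,
                  lock, env_fn, policy_fn, set_weights_fn),
            kwargs={"noise_level": 0.3, "n_env": 1, "gpu": -1})
    for i in range(args.n_explorer)
]
for p in processes:
    p.start()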
Example #6
    def __call__(self):
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        episode_steps = 0
        episode_return = 0
        episode_cost = 0
        episode_start_time = time.time()
        total_steps = np.array(0, dtype=np.int32)
        n_episode = 0
        obs = self._env.reset()

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            for _ in range(self._policy.horizon):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, logp, val = self._policy.get_action_and_val(obs)
                if not is_discrete(self._env.action_space):
                    env_act = np.clip(act, self._env.action_space.low,
                                      self._env.action_space.high)
                else:
                    env_act = act
                next_obs, reward, done, info = self._env.step(env_act)
                try:
                    cost = info['cost']
                except (TypeError, KeyError):
                    cost = 0
                if self._show_progress:
                    self._env.render()

                episode_steps += 1
                total_steps += 1
                episode_return += reward
                episode_cost += cost

                done_flag = done
                if (hasattr(self._env, "_max_episode_steps")
                        and episode_steps == self._env._max_episode_steps):
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=logp,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    tf.summary.experimental.set_step(total_steps)
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 6.4f} Cost: {4: 5.4f} FPS: {5:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, episode_cost, fps))
                    tf.summary.scalar(name="Common/training_return",
                                      data=episode_return)
                    tf.summary.scalar(name="Common/fps", data=fps)
                    self.total_cost += episode_cost
                    cost_rate = self.total_cost / total_steps

                    wandb.log(
                        {
                            'Training_Return': episode_return,
                            'Training_Cost': episode_cost,
                            'Cost_Rate': cost_rate,
                            'FPS': fps
                        },
                        step=n_episode)
                    episode_steps = 0
                    episode_return = 0
                    episode_cost = 0
                    episode_start_time = time.time()

                if total_steps % self._test_interval == 0:
                    avg_test_return, avg_test_cost = self.evaluate_policy(
                        total_steps)
                    self.logger.info(
                        "Evaluation Total Steps: {0: 7} Average Reward {1: 6.4f} Average Cost {2: 5.4f} over {3: 2} episodes"
                        .format(total_steps, avg_test_return, avg_test_cost,
                                self._test_episodes))
                    wandb.log(
                        {
                            'Evaluation_Return': avg_test_return,
                            'Evaluation_Cost': avg_test_cost
                        },
                        step=n_episode)

                    tf.summary.scalar(name="Common/average_test_return",
                                      data=avg_test_return)
                    self.writer.flush()

                if total_steps % self._save_model_interval == 0:
                    self.checkpoint_manager.save()

            self.finish_horizon(last_val=val)

            tf.summary.experimental.set_step(total_steps)

            # Train actor critic
            if self._policy.normalize_adv:
                samples = self.replay_buffer.get_all_transitions()
                mean_adv = np.mean(samples["adv"])
                std_adv = np.std(samples["adv"])
                # Update normalizer
                if self._normalize_obs:
                    self._obs_normalizer.experience(samples["obs"])
            with tf.summary.record_if(total_steps %
                                      self._save_summary_interval == 0):
                for _ in range(self._policy.n_epoch):
                    samples = self.replay_buffer._encode_sample(
                        np.random.permutation(self._policy.horizon))
                    if self._normalize_obs:
                        samples["obs"] = self._obs_normalizer(samples["obs"],
                                                              update=False)
                    if self._policy.normalize_adv:
                        adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                    else:
                        adv = samples["adv"]
                    for idx in range(
                            int(self._policy.horizon /
                                self._policy.batch_size)):
                        target = slice(idx * self._policy.batch_size,
                                       (idx + 1) * self._policy.batch_size)
                        self._policy.train(states=samples["obs"][target],
                                           actions=samples["act"][target],
                                           advantages=adv[target],
                                           logp_olds=samples["logp"][target],
                                           returns=samples["ret"][target])

        tf.summary.flush()
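
Example #6 reads a per-step safety cost from `info`, defaulting to 0 when the environment does not report one. The pattern isolated as a helper (purely illustrative, not part of the source):

def extract_cost(info):
    # Handles info=None (TypeError) and a dict without 'cost' (KeyError)
    try:
        return info['cost']
    except (TypeError, KeyError):
        return 0

assert extract_cost({'cost': 0.5}) == 0.5
assert extract_cost({}) == 0
assert extract_cost(None) == 0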
Example #7
    def __call__(self):
        total_steps = 0
        episode_steps = 0
        episode_return = 0
        episode_start_time = time.time()
        n_episode = 0
        test_step_threshold = self._test_interval

        # TODO: clean up this code
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        obs = self._env.reset()
        while total_steps < self._max_steps:
            for _ in range(self._policy.horizon):
                action, log_pi, val = self._policy.get_action_and_val(obs)
                next_obs, reward, done, _ = self._env.step(action)
                if self._show_progress:
                    self._env.render()
                episode_steps += 1
                episode_return += reward
                total_steps += 1

                done_flag = done
                if hasattr(self._env, "_max_episode_steps") and \
                        episode_steps == self._env._max_episode_steps:
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=action,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=log_pi,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, fps))

                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()

            self.finish_horizon(last_val=val)
            tf.summary.experimental.set_step(total_steps)
            samples = self.replay_buffer.sample(self._policy.horizon)
            # Normalize advantages
            if self._policy.normalize_adv:
                adv = (samples["adv"] - np.mean(samples["adv"])) / (
                    np.std(samples["adv"]) + 1e-8)
            else:
                adv = samples["adv"]
            # Train actor (a single update per horizon)
            self._policy.train_actor(samples["obs"], samples["act"], adv,
                                     samples["logp"])
            # Train Critic
            for _ in range(5):
                self._policy.train_critic(samples["obs"], samples["ret"])
            if total_steps > test_step_threshold:
                test_step_threshold += self._test_interval
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                tf.summary.scalar(name="Common/fps", data=fps)

                self.writer.flush()

            if total_steps % self._model_save_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()
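
Unlike Examples #2 and #4, which test `total_steps % test_interval == 0`, Example #7 gates evaluation on a moving threshold. Because the check runs only once per horizon, `total_steps` can jump past an exact multiple of the interval; the threshold pattern still fires. A standalone illustration with made-up numbers:

test_interval, horizon = 1000, 512
threshold = test_interval
total_steps = 0
while total_steps < 3000:
    total_steps += horizon        # evaluation is only checked once per horizon
    if total_steps > threshold:   # robust to stepping past an exact multiple
        threshold += test_interval
        print("evaluate at", total_steps)  # 1024, 2048, 3072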