コード例 #1
0
ファイル: ddpg_evaluator.py プロジェクト: yxz9087/ray
class DDPGEvaluator(PolicyEvaluator):
    def __init__(self, registry, env_creator, config):
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]))

        # contains model, target_model
        self.model = DDPGModel(registry, self.env, config)

        self.sampler = SyncSampler(self.env,
                                   self.model.model,
                                   NoFilter(),
                                   config["num_local_steps"],
                                   horizon=config["horizon"])

    def sample(self):
        """Returns a batch of samples."""

        rollout = self.sampler.get_data()
        rollout.data["weights"] = np.ones_like(rollout.data["rewards"])

        # since each sample is one step, no discounting needs to be applied;
        # this does not involve config["gamma"]
        samples = process_rollout(rollout,
                                  NoFilter(),
                                  gamma=1.0,
                                  use_gae=False)

        return samples

    def update_target(self):
        """Updates target critic and target actor."""
        self.model.update_target()

    def compute_gradients(self, samples):
        """Returns critic, actor gradients."""
        return self.model.compute_gradients(samples)

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.model.apply_gradients(grads)

    def compute_apply(self, samples):
        grads, _ = self.compute_gradients(samples)
        self.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        return self.model.get_weights()

    def set_weights(self, weights):
        """Sets model weights."""
        self.model.set_weights(weights)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()
コード例 #2
0
    def __init__(self, env_creator, config, logdir, is_remote):
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space, config["model"])
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        self.inputs = [
            ("obs", self.observations),
            ("value_targets", self.value_targets),
            ("advantages", self.advantages),
            ("actions", self.actions),
            ("logprobs", self.prev_logits),
            ("vf_preds", self.prev_vf_preds)
        ]
        self.common_policy = self.build_tf_loss([ph for _, ph in self.inputs])

        # References to the model weights
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {"obs_filter": self.obs_filter,
                        "rew_filter": self.rew_filter}
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter,
            self.config["horizon"], self.config["horizon"])
コード例 #3
0
    def __init__(self, registry, env_creator, config):
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        self.config = config

        self.policy = PGPolicy(registry, self.env.observation_space,
                               self.env.action_space, config)
        self.sampler = SyncSampler(
                        self.env, self.policy, NoFilter(),
                        config["batch_size"], horizon=config["horizon"])
コード例 #4
0
ファイル: ddpg_evaluator.py プロジェクト: zxsimple/ray
    def __init__(self, registry, env_creator, config):
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]))

        # contains model, target_model
        self.model = DDPGModel(registry, self.env, config)

        self.sampler = SyncSampler(
                        self.env, self.model.model, NoFilter(),
                        config["num_local_steps"], horizon=config["horizon"])
コード例 #5
0
class PGEvaluator(Evaluator):
    """Evaluator for simple policy gradient."""
    def __init__(self, registry, env_creator, config):
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        self.config = config

        self.policy = PGPolicy(registry, self.env.observation_space,
                               self.env.action_space, config)
        self.sampler = SyncSampler(self.env,
                                   self.policy,
                                   NoFilter(),
                                   config["batch_size"],
                                   horizon=config["horizon"])

    def sample(self):
        rollout = self.sampler.get_data()
        samples = process_rollout(rollout,
                                  NoFilter(),
                                  gamma=self.config["gamma"],
                                  use_gae=False)
        return samples

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def compute_gradients(self, samples):
        """ Returns gradient w.r.t. samples."""
        gradient, info = self.policy.compute_gradients(samples)
        return gradient

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.policy.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        return self.policy.get_weights()

    def set_weights(self, weights):
        """Sets model weights."""
        return self.policy.set_weights(weights)
コード例 #6
0
ファイル: pg_evaluator.py プロジェクト: adgirish/ray
class PGEvaluator(Evaluator):
    """Evaluator for simple policy gradient."""

    def __init__(self, registry, env_creator, config):
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        self.config = config

        self.policy = PGPolicy(registry, self.env.observation_space,
                               self.env.action_space, config)
        self.sampler = SyncSampler(
                        self.env, self.policy, NoFilter(),
                        config["batch_size"], horizon=config["horizon"])

    def sample(self):
        rollout = self.sampler.get_data()
        samples = process_rollout(
                    rollout, NoFilter(),
                    gamma=self.config["gamma"], use_gae=False)
        return samples

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def compute_gradients(self, samples):
        """ Returns gradient w.r.t. samples."""
        gradient, info = self.policy.compute_gradients(samples)
        return gradient

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.policy.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        return self.policy.get_weights()

    def set_weights(self, weights):
        """Sets model weights."""
        return self.policy.set_weights(weights)
コード例 #7
0
ファイル: pg_evaluator.py プロジェクト: adgirish/ray
    def __init__(self, registry, env_creator, config):
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        self.config = config

        self.policy = PGPolicy(registry, self.env.observation_space,
                               self.env.action_space, config)
        self.sampler = SyncSampler(
                        self.env, self.policy, NoFilter(),
                        config["batch_size"], horizon=config["horizon"])
コード例 #8
0
class PPOEvaluator(TFMultiGPUSupport):
    """
    Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the shared
    network weights. When run as a remote agent, only this graph is used.
    """
    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(name="newkl",
                                       shape=(),
                                       dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(tf.float32,
                                           shape=(None, ) +
                                           self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None, ))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None, ))

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space, config["model"])
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(tf.float32,
                                          shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, ))

        self.inputs = [("obs", self.observations),
                       ("value_targets", self.value_targets),
                       ("advantages", self.advantages),
                       ("actions", self.actions),
                       ("logprobs", self.prev_logits),
                       ("vf_preds", self.prev_vf_preds)]
        self.common_policy = self.build_tf_loss([ph for _, ph in self.inputs])

        # References to the model weights
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(config["observation_filter"],
                                     self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(self.env, self.common_policy,
                                   self.obs_filter, self.config["horizon"],
                                   self.config["horizon"])

    def tf_loss_inputs(self):
        return self.inputs

    def build_tf_loss(self, input_placeholders):
        obs, vtargets, advs, acts, plog, pvf_preds = input_placeholders
        return ProximalPolicyGraph(self.env.observation_space,
                                   self.env.action_space, obs, vtargets, advs,
                                   acts, plog, pvf_preds, self.logit_dim,
                                   self.kl_coeff, self.distribution_class,
                                   self.config, self.sess, self.registry)

    def init_extra_ops(self, device_losses):
        self.extra_ops = OrderedDict()
        with tf.name_scope("test_outputs"):
            policies = device_losses
            self.extra_ops["loss"] = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.extra_ops["policy_loss"] = tf.reduce_mean(
                tf.stack(
                    values=[policy.mean_policy_loss for policy in policies]),
                0)
            self.extra_ops["vf_loss"] = tf.reduce_mean(
                tf.stack(values=[policy.mean_vf_loss for policy in policies]),
                0)
            self.extra_ops["kl"] = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.extra_ops["entropy"] = tf.reduce_mean(
                tf.stack(values=[policy.mean_entropy for policy in policies]),
                0)

    def extra_apply_grad_fetches(self):
        return list(self.extra_ops.values())

    def extra_apply_grad_feed_dict(self):
        return {self.kl_coeff: self.kl_coeff_val}

    def update_kl(self, sampled_kl):
        if sampled_kl > 2.0 * self.kl_target:
            self.kl_coeff_val *= 1.5
        elif sampled_kl < 0.5 * self.kl_target:
            self.kl_coeff_val *= 0.5

    def save(self):
        filters = self.get_filters(flush_after=True)
        return pickle.dumps({
            "filters": filters,
            "kl_coeff_val": self.kl_coeff_val,
            "kl_target": self.kl_target,
        })

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])
        self.kl_coeff_val = objs["kl_coeff_val"]
        self.kl_target = objs["kl_target"]

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def sample(self):
        """Returns experience samples from this Evaluator. Observation
        filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            last_r = 0.0  # note: not needed since we don't truncate rollouts
            samples = compute_advantages(rollout,
                                         last_r,
                                         self.config["gamma"],
                                         self.config["lambda"],
                                         use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters
コード例 #9
0
    def __init__(self,
                 env_creator,
                 policy_graph,
                 tf_session_creator=None,
                 batch_steps=100,
                 batch_mode="truncate_episodes",
                 episode_horizon=None,
                 preprocessor_pref="rllib",
                 sample_async=False,
                 compress_observations=False,
                 num_envs=1,
                 observation_filter="NoFilter",
                 env_config=None,
                 model_config=None,
                 policy_config=None):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                env config dict.
            policy_graph (class): A class implementing rllib.PolicyGraph or
                rllib.TFPolicyGraph.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to include
                in each sample batch returned from this evaluator.
            batch_mode (str): One of the following batch modes:
                "truncate_episodes": Each call to sample() will return a batch
                    of exactly `batch_steps` in size. Episodes may be truncated
                    in order to meet this size requirement. When
                    `num_envs > 1`, episodes will be truncated to sequences of
                    `batch_size / num_envs` in length.
                "complete_episodes": Each call to sample() will return a batch
                    of at least `batch_steps in size. Episodes will not be
                    truncated, but multiple episodes may be packed within one
                    batch to meet the batch size. Note that when
                    `num_envs > 1`, episode steps will be buffered until the
                    episode completes, and hence batches may contain
                    significant amounts of off-policy data.
            episode_horizon (int): Whether to stop episodes at this horizon.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations
                returned.
            num_envs (int): If more than one, will create multiple envs
                and vectorize the computation of actions. This has no effect if
                if the env already implements VectorEnv.
            observation_filter (str): Name of observation filter to use.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy.
        """

        env_config = env_config or {}
        policy_config = policy_config or {}
        model_config = model_config or {}
        self.env_creator = env_creator
        self.policy_graph = policy_graph
        self.batch_steps = batch_steps
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations

        self.env = env_creator(env_config)
        if isinstance(self.env, VectorEnv) or \
                isinstance(self.env, ServingEnv) or \
                isinstance(self.env, AsyncVectorEnv):

            def wrap(env):
                return env  # we can't auto-wrap these env types
        elif is_atari(self.env) and \
                "custom_preprocessor" not in model_config and \
                preprocessor_pref == "deepmind":

            def wrap(env):
                return wrap_deepmind(env, dim=model_config.get("dim", 80))
        else:

            def wrap(env):
                return ModelCatalog.get_preprocessor_as_wrapper(
                    env, model_config)

        self.env = wrap(self.env)

        def make_env():
            return wrap(env_creator(env_config))

        self.policy_map = {}

        if issubclass(policy_graph, TFPolicyGraph):
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.sess = tf_session_creator()
                else:
                    self.sess = tf.Session(config=tf.ConfigProto(
                        gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.sess.as_default():
                    policy = policy_graph(self.env.observation_space,
                                          self.env.action_space, policy_config)
        else:
            policy = policy_graph(self.env.observation_space,
                                  self.env.action_space, policy_config)
        self.policy_map = {"default": policy}

        self.obs_filter = get_filter(observation_filter,
                                     self.env.observation_space.shape)
        self.filters = {"obs_filter": self.obs_filter}

        # Always use vector env for consistency even if num_envs = 1
        if not isinstance(self.env, AsyncVectorEnv):
            if isinstance(self.env, ServingEnv):
                self.vector_env = _ServingEnvToAsync(self.env)
            else:
                if not isinstance(self.env, VectorEnv):
                    self.env = VectorEnv.wrap(make_env, [self.env],
                                              num_envs=num_envs)
                self.vector_env = _VectorEnvToAsync(self.env)
        else:
            self.vector_env = self.env

        if self.batch_mode == "truncate_episodes":
            if batch_steps % num_envs != 0:
                raise ValueError(
                    "In 'truncate_episodes' batch mode, `batch_steps` must be "
                    "evenly divisible by `num_envs`. Got {} and {}.".format(
                        batch_steps, num_envs))
            batch_steps = batch_steps // num_envs
            pack_episodes = True
        elif self.batch_mode == "complete_episodes":
            batch_steps = float("inf")  # never cut episodes
            pack_episodes = False  # sampler will return 1 episode per poll
        else:
            raise ValueError("Unsupported batch mode: {}".format(
                self.batch_mode))
        if sample_async:
            self.sampler = AsyncSampler(self.vector_env,
                                        self.policy_map["default"],
                                        self.obs_filter,
                                        batch_steps,
                                        horizon=episode_horizon,
                                        pack=pack_episodes)
            self.sampler.start()
        else:
            self.sampler = SyncSampler(self.vector_env,
                                       self.policy_map["default"],
                                       self.obs_filter,
                                       batch_steps,
                                       horizon=episode_horizon,
                                       pack=pack_episodes)
コード例 #10
0
class CommonPolicyEvaluator(PolicyEvaluator):
    """Policy evaluator implementation that operates on a rllib.PolicyGraph.

    TODO: multi-agent
    TODO: multi-gpu

    Examples:
        # Create a policy evaluator and using it to collect experiences.
        >>> evaluator = CommonPolicyEvaluator(
              env_creator=lambda _: gym.make("CartPole-v0"),
              policy_graph=PGPolicyGraph)
        >>> print(evaluator.sample().keys())
        {"obs": [[...]], "actions": [[...]], "rewards": [[...]],
         "dones": [[...]], "new_obs": [[...]]}

        # Creating policy evaluators using optimizer_cls.make().
        >>> optimizer = LocalSyncOptimizer.make(
              evaluator_cls=CommonPolicyEvaluator,
              evaluator_args={
                "env_creator": lambda _: gym.make("CartPole-v0"),
                "policy_graph": PGPolicyGraph,
              },
              num_workers=10)
        >>> for _ in range(10): optimizer.step()
    """
    @classmethod
    def as_remote(cls, num_cpus=None, num_gpus=None):
        return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls)

    def __init__(self,
                 env_creator,
                 policy_graph,
                 tf_session_creator=None,
                 batch_steps=100,
                 batch_mode="truncate_episodes",
                 episode_horizon=None,
                 preprocessor_pref="rllib",
                 sample_async=False,
                 compress_observations=False,
                 num_envs=1,
                 observation_filter="NoFilter",
                 env_config=None,
                 model_config=None,
                 policy_config=None):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                env config dict.
            policy_graph (class): A class implementing rllib.PolicyGraph or
                rllib.TFPolicyGraph.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to include
                in each sample batch returned from this evaluator.
            batch_mode (str): One of the following batch modes:
                "truncate_episodes": Each call to sample() will return a batch
                    of exactly `batch_steps` in size. Episodes may be truncated
                    in order to meet this size requirement. When
                    `num_envs > 1`, episodes will be truncated to sequences of
                    `batch_size / num_envs` in length.
                "complete_episodes": Each call to sample() will return a batch
                    of at least `batch_steps in size. Episodes will not be
                    truncated, but multiple episodes may be packed within one
                    batch to meet the batch size. Note that when
                    `num_envs > 1`, episode steps will be buffered until the
                    episode completes, and hence batches may contain
                    significant amounts of off-policy data.
            episode_horizon (int): Whether to stop episodes at this horizon.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations
                returned.
            num_envs (int): If more than one, will create multiple envs
                and vectorize the computation of actions. This has no effect if
                if the env already implements VectorEnv.
            observation_filter (str): Name of observation filter to use.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy.
        """

        env_config = env_config or {}
        policy_config = policy_config or {}
        model_config = model_config or {}
        self.env_creator = env_creator
        self.policy_graph = policy_graph
        self.batch_steps = batch_steps
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations

        self.env = env_creator(env_config)
        if isinstance(self.env, VectorEnv) or \
                isinstance(self.env, ServingEnv) or \
                isinstance(self.env, AsyncVectorEnv):

            def wrap(env):
                return env  # we can't auto-wrap these env types
        elif is_atari(self.env) and \
                "custom_preprocessor" not in model_config and \
                preprocessor_pref == "deepmind":

            def wrap(env):
                return wrap_deepmind(env, dim=model_config.get("dim", 80))
        else:

            def wrap(env):
                return ModelCatalog.get_preprocessor_as_wrapper(
                    env, model_config)

        self.env = wrap(self.env)

        def make_env():
            return wrap(env_creator(env_config))

        self.policy_map = {}

        if issubclass(policy_graph, TFPolicyGraph):
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.sess = tf_session_creator()
                else:
                    self.sess = tf.Session(config=tf.ConfigProto(
                        gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.sess.as_default():
                    policy = policy_graph(self.env.observation_space,
                                          self.env.action_space, policy_config)
        else:
            policy = policy_graph(self.env.observation_space,
                                  self.env.action_space, policy_config)
        self.policy_map = {"default": policy}

        self.obs_filter = get_filter(observation_filter,
                                     self.env.observation_space.shape)
        self.filters = {"obs_filter": self.obs_filter}

        # Always use vector env for consistency even if num_envs = 1
        if not isinstance(self.env, AsyncVectorEnv):
            if isinstance(self.env, ServingEnv):
                self.vector_env = _ServingEnvToAsync(self.env)
            else:
                if not isinstance(self.env, VectorEnv):
                    self.env = VectorEnv.wrap(make_env, [self.env],
                                              num_envs=num_envs)
                self.vector_env = _VectorEnvToAsync(self.env)
        else:
            self.vector_env = self.env

        if self.batch_mode == "truncate_episodes":
            if batch_steps % num_envs != 0:
                raise ValueError(
                    "In 'truncate_episodes' batch mode, `batch_steps` must be "
                    "evenly divisible by `num_envs`. Got {} and {}.".format(
                        batch_steps, num_envs))
            batch_steps = batch_steps // num_envs
            pack_episodes = True
        elif self.batch_mode == "complete_episodes":
            batch_steps = float("inf")  # never cut episodes
            pack_episodes = False  # sampler will return 1 episode per poll
        else:
            raise ValueError("Unsupported batch mode: {}".format(
                self.batch_mode))
        if sample_async:
            self.sampler = AsyncSampler(self.vector_env,
                                        self.policy_map["default"],
                                        self.obs_filter,
                                        batch_steps,
                                        horizon=episode_horizon,
                                        pack=pack_episodes)
            self.sampler.start()
        else:
            self.sampler = SyncSampler(self.vector_env,
                                       self.policy_map["default"],
                                       self.obs_filter,
                                       batch_steps,
                                       horizon=episode_horizon,
                                       pack=pack_episodes)

    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Return:
            SampleBatch from evaluating the current policies.
        """

        batches = [self.sampler.get_data()]
        steps_so_far = batches[0].count
        while steps_so_far < self.batch_steps:
            batch = self.sampler.get_data()
            steps_so_far += batch.count
            batches.append(batch)
        batch = SampleBatch.concat_samples(batches)

        if self.compress_observations:
            batch["obs"] = [pack(o) for o in batch["obs"]]
            batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch

    def for_policy(self, func):
        """Apply the given function to this evaluator's default policy."""

        return func(self.policy_map["default"])

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters

    def get_weights(self):
        return self.policy_map["default"].get_weights()

    def set_weights(self, weights):
        return self.policy_map["default"].set_weights(weights)

    def compute_gradients(self, samples):
        return self.policy_map["default"].compute_gradients(samples)

    def apply_gradients(self, grads):
        return self.policy_map["default"].apply_gradients(grads)

    def compute_apply(self, samples):
        grad_fetch, apply_fetch = self.policy_map["default"].compute_apply(
            samples)
        return grad_fetch

    def save(self):
        filters = self.get_filters(flush_after=True)
        state = self.policy_map["default"].get_state()
        return pickle.dumps({"filters": filters, "state": state})

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])
        self.policy_map["default"].set_state(objs["state"])
コード例 #11
0
ファイル: ppo_evaluator.py プロジェクト: adgirish/ray
class PPOEvaluator(Evaluator):
    """
    Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the shared
    network weights. When run as a remote agent, only this graph is used.
    """

    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = int(
                config["sgd_batchsize"] / len(devices)) * len(devices)
            assert self.batch_size % len(devices) == 0
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.value_targets, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_policy_loss for policy in policies]), 0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_vf_loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {"obs_filter": self.obs_filter,
                        "rew_filter": self.rew_filter}
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter,
            self.config["horizon"], self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        use_gae = self.config["use_gae"]
        dummy = np.zeros_like(trajectories["advantages"])
        return self.par_opt.load_data(
            self.sess,
            [trajectories["observations"],
             trajectories["value_targets"] if use_gae else dummy,
             trajectories["advantages"],
             trajectories["actions"],
             trajectories["logprobs"],
             trajectories["vf_preds"] if use_gae else dummy],
            full_trace=full_trace)

    def run_sgd_minibatch(
            self, batch_index, kl_coeff, full_trace, file_writer):
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[
                self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
                self.mean_kl, self.mean_entropy],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def compute_gradients(self, samples):
        raise NotImplementedError

    def apply_gradients(self, grads):
        raise NotImplementedError

    def save(self):
        filters = self.get_filters(flush_after=True)
        return pickle.dumps({"filters": filters})

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def sample(self):
        """Returns experience samples from this Evaluator. Observation
        filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            samples = process_rollout(
                rollout, self.rew_filter, self.config["gamma"],
                self.config["lambda"], use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters
コード例 #12
0
class Runner(object):
    """
    Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the shared
    network weights. When run as a remote agent, only this graph is used.
    """

    def __init__(self, env_creator, config, logdir, is_remote):
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = create_and_wrap(env_creator, config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        # TODO(rliaw): pull this into model_catalog
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(
                tf.float32, shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None,))
        else:
            raise NotImplemented(
                "action space" + str(type(action_space)) +
                "currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by devices"
        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.value_targets, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_policy_loss for policy in policies]), 0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_vf_loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.sampler = SyncSampler(
            self.env, self.common_policy, obs_filter,
            self.config["horizon"], self.config["horizon"])
        self.reward_filter = MeanStdFilter((), clip=5.0)
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        use_gae = self.config["use_gae"]
        dummy = np.zeros_like(trajectories["advantages"])
        return self.par_opt.load_data(
            self.sess,
            [trajectories["observations"],
             trajectories["value_targets"] if use_gae else dummy,
             trajectories["advantages"],
             trajectories["actions"].squeeze(),
             trajectories["logprobs"],
             trajectories["vf_preds"] if use_gae else dummy],
            full_trace=full_trace)

    def run_sgd_minibatch(
            self, batch_index, kl_coeff, full_trace, file_writer):
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[
                self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
                self.mean_kl, self.mean_entropy],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def save(self):
        obs_filter = self.sampler.get_obs_filter()
        return pickle.dumps([obs_filter, self.reward_filter])

    def restore(self, objs):
        objs = pickle.loads(objs)
        obs_filter = objs[0]
        rew_filter = objs[1]
        self.update_filters(obs_filter, rew_filter)

    def get_weights(self):
        return self.variables.get_weights()

    def load_weights(self, weights):
        self.variables.set_weights(weights)

    def update_filters(self, obs_filter=None, rew_filter=None):
        if rew_filter:
            # No special handling required since outside of threaded code
            self.reward_filter = rew_filter.copy()
        if obs_filter:
            self.sampler.update_obs_filter(obs_filter)

    def get_obs_filter(self):
        return self.sampler.get_obs_filter()

    def compute_steps(self, config, obs_filter, rew_filter):
        """Compute multiple rollouts and concatenate the results.

        Args:
            config: Configuration parameters
            obs_filter: Function that is applied to each of the
                observations.
            reward_filter: Function that is applied to each of the rewards.

        Returns:
            states: List of states.
            total_rewards: Total rewards of the trajectories.
            trajectory_lengths: Lengths of the trajectories.
        """
        num_steps_so_far = 0
        trajectories = []
        self.update_filters(obs_filter, rew_filter)

        while num_steps_so_far < config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            trajectory = process_rollout(
                rollout, self.reward_filter, config["gamma"],
                config["lambda"], use_gae=config["use_gae"])
            num_steps_so_far += trajectory["rewards"].shape[0]
            trajectories.append(trajectory)
        metrics = self.sampler.get_metrics()
        total_rewards, trajectory_lengths = zip(*[
            (c.episode_reward, c.episode_length) for c in metrics])
        updated_obs_filter = self.sampler.get_obs_filter(flush=True)
        return (
            concatenate(trajectories),
            total_rewards,
            trajectory_lengths,
            updated_obs_filter,
            self.reward_filter)
コード例 #13
0
ファイル: ppo_evaluator.py プロジェクト: adgirish/ray
    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = int(
                config["sgd_batchsize"] / len(devices)) * len(devices)
            assert self.batch_size % len(devices) == 0
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.value_targets, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_policy_loss for policy in policies]), 0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_vf_loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {"obs_filter": self.obs_filter,
                        "rew_filter": self.rew_filter}
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter,
            self.config["horizon"], self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())
コード例 #14
0
class CommonPolicyEvaluator(PolicyEvaluator):
    """Policy evaluator implementation that operates on a rllib.PolicyGraph.

    TODO: multi-gpu

    Examples:
        # Create a policy evaluator and using it to collect experiences.
        >>> evaluator = CommonPolicyEvaluator(
              env_creator=lambda _: gym.make("CartPole-v0"),
              policy_graph=PGPolicyGraph)
        >>> print(evaluator.sample())
        SampleBatch({
            "obs": [[...]], "actions": [[...]], "rewards": [[...]],
            "dones": [[...]], "new_obs": [[...]]})

        # Creating policy evaluators using optimizer_cls.make().
        >>> optimizer = SyncSamplesOptimizer.make(
              evaluator_cls=CommonPolicyEvaluator,
              evaluator_args={
                "env_creator": lambda _: gym.make("CartPole-v0"),
                "policy_graph": PGPolicyGraph,
              },
              num_workers=10)
        >>> for _ in range(10): optimizer.step()

        # Creating a multi-agent policy evaluator
        >>> evaluator = CommonPolicyEvaluator(
              env_creator=lambda _: MultiAgentTrafficGrid(num_cars=25),
              policy_graph={
                  # Use an ensemble of two policies for car agents
                  "car_policy1":
                    (PGPolicyGraph, Box(...), Discrete(...), {"gamma": 0.99}),
                  "car_policy2":
                    (PGPolicyGraph, Box(...), Discrete(...), {"gamma": 0.95}),
                  # Use a single shared policy for all traffic lights
                  "traffic_light_policy":
                    (PGPolicyGraph, Box(...), Discrete(...), {}),
              },
              policy_mapping_fn=lambda agent_id:
                random.choice(["car_policy1", "car_policy2"])
                if agent_id.startswith("car_") else "traffic_light_policy")
        >>> print(evaluator.sample().keys())
        MultiAgentBatch({
            "car_policy1": SampleBatch(...),
            "car_policy2": SampleBatch(...),
            "traffic_light_policy": SampleBatch(...)})
    """
    @classmethod
    def as_remote(cls, num_cpus=None, num_gpus=None):
        return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls)

    def __init__(self,
                 env_creator,
                 policy_graph,
                 policy_mapping_fn=None,
                 tf_session_creator=None,
                 batch_steps=100,
                 batch_mode="truncate_episodes",
                 episode_horizon=None,
                 preprocessor_pref="rllib",
                 sample_async=False,
                 compress_observations=False,
                 num_envs=1,
                 observation_filter="NoFilter",
                 env_config=None,
                 model_config=None,
                 policy_config=None,
                 worker_index=0):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                EnvContext wrapped configuration.
            policy_graph (class|dict): Either a class implementing
                PolicyGraph, or a dictionary of policy id strings to
                (PolicyGraph, obs_space, action_space, config) tuples. If a
                dict is specified, then we are in multi-agent mode and a
                policy_mapping_fn should also be set.
            policy_mapping_fn (func): A function that maps agent ids to
                policy ids in multi-agent mode. This function will be called
                each time a new agent appears in an episode, to bind that agent
                to a policy for the duration of the episode.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to include
                in each sample batch returned from this evaluator.
            batch_mode (str): One of the following batch modes:
                "truncate_episodes": Each call to sample() will return a batch
                    of exactly `batch_steps` in size. Episodes may be truncated
                    in order to meet this size requirement. When
                    `num_envs > 1`, episodes will be truncated to sequences of
                    `batch_size / num_envs` in length.
                "complete_episodes": Each call to sample() will return a batch
                    of at least `batch_steps in size. Episodes will not be
                    truncated, but multiple episodes may be packed within one
                    batch to meet the batch size. Note that when
                    `num_envs > 1`, episode steps will be buffered until the
                    episode completes, and hence batches may contain
                    significant amounts of off-policy data.
            episode_horizon (int): Whether to stop episodes at this horizon.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations
                returned.
            num_envs (int): If more than one, will create multiple envs
                and vectorize the computation of actions. This has no effect if
                if the env already implements VectorEnv.
            observation_filter (str): Name of observation filter to use.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy. In the
                multi-agent case, this config will be merged with the
                per-policy configs specified by `policy_graph`.
            worker_index (int): For remote evaluators, this should be set to a
                non-zero and unique value. This index is passed to created envs
                through EnvContext so that envs can be configured per worker.
        """

        env_context = EnvContext(env_config or {}, worker_index)
        policy_config = policy_config or {}
        self.policy_config = policy_config
        model_config = model_config or {}
        policy_mapping_fn = (policy_mapping_fn
                             or (lambda agent_id: DEFAULT_POLICY_ID))
        self.env_creator = env_creator
        self.policy_graph = policy_graph
        self.batch_steps = batch_steps
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations

        self.env = env_creator(env_context)
        if isinstance(self.env, VectorEnv) or \
                isinstance(self.env, ServingEnv) or \
                isinstance(self.env, MultiAgentEnv) or \
                isinstance(self.env, AsyncVectorEnv):

            def wrap(env):
                return env  # we can't auto-wrap these env types
        elif is_atari(self.env) and \
                "custom_preprocessor" not in model_config and \
                preprocessor_pref == "deepmind":

            def wrap(env):
                return wrap_deepmind(env, dim=model_config.get("dim", 80))
        else:

            def wrap(env):
                return ModelCatalog.get_preprocessor_as_wrapper(
                    env, model_config)

        self.env = wrap(self.env)

        def make_env():
            return wrap(env_creator(env_context))

        self.tf_sess = None
        policy_dict = _validate_and_canonicalize(policy_graph, self.env)
        if _has_tensorflow_graph(policy_dict):
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.tf_sess = tf_session_creator()
                else:
                    self.tf_sess = tf.Session(config=tf.ConfigProto(
                        gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.tf_sess.as_default():
                    self.policy_map = self._build_policy_map(
                        policy_dict, policy_config)
        else:
            self.policy_map = self._build_policy_map(policy_dict,
                                                     policy_config)

        self.multiagent = self.policy_map.keys() != set(DEFAULT_POLICY_ID)

        self.filters = {
            policy_id: get_filter(observation_filter,
                                  policy.observation_space.shape)
            for (policy_id, policy) in self.policy_map.items()
        }

        # Always use vector env for consistency even if num_envs = 1
        self.async_env = AsyncVectorEnv.wrap_async(self.env,
                                                   make_env=make_env,
                                                   num_envs=num_envs)

        if self.batch_mode == "truncate_episodes":
            if batch_steps % num_envs != 0:
                raise ValueError(
                    "In 'truncate_episodes' batch mode, `batch_steps` must be "
                    "evenly divisible by `num_envs`. Got {} and {}.".format(
                        batch_steps, num_envs))
            batch_steps = batch_steps // num_envs
            pack_episodes = True
        elif self.batch_mode == "complete_episodes":
            batch_steps = float("inf")  # never cut episodes
            pack_episodes = False  # sampler will return 1 episode per poll
        else:
            raise ValueError("Unsupported batch mode: {}".format(
                self.batch_mode))
        if sample_async:
            self.sampler = AsyncSampler(self.async_env,
                                        self.policy_map,
                                        policy_mapping_fn,
                                        self.filters,
                                        batch_steps,
                                        horizon=episode_horizon,
                                        pack=pack_episodes,
                                        tf_sess=self.tf_sess)
            self.sampler.start()
        else:
            self.sampler = SyncSampler(self.async_env,
                                       self.policy_map,
                                       policy_mapping_fn,
                                       self.filters,
                                       batch_steps,
                                       horizon=episode_horizon,
                                       pack=pack_episodes,
                                       tf_sess=self.tf_sess)

    def _build_policy_map(self, policy_dict, policy_config):
        policy_map = {}
        for name, (cls, obs_space, act_space,
                   conf) in sorted(policy_dict.items()):
            merged_conf = policy_config.copy()
            merged_conf.update(conf)
            with tf.variable_scope(name):
                policy_map[name] = cls(obs_space, act_space, merged_conf)
        return policy_map

    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Return:
            SampleBatch|MultiAgentBatch from evaluating the current policies.
        """

        batches = [self.sampler.get_data()]
        steps_so_far = batches[0].count
        while steps_so_far < self.batch_steps:
            batch = self.sampler.get_data()
            steps_so_far += batch.count
            batches.append(batch)
        batch = batches[0].concat_samples(batches)

        if self.compress_observations:
            if isinstance(batch, MultiAgentBatch):
                for data in batch.policy_batches.values():
                    data["obs"] = [pack(o) for o in data["obs"]]
                    data["new_obs"] = [pack(o) for o in data["new_obs"]]
            else:
                batch["obs"] = [pack(o) for o in batch["obs"]]
                batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch

    def for_policy(self, func, policy_id=DEFAULT_POLICY_ID):
        """Apply the given function to the specified policy graph."""

        return func(self.policy_map[policy_id])

    def foreach_policy(self, func):
        """Apply the given function to each (policy, policy_id) tuple."""

        return [func(policy, pid) for pid, policy in self.policy_map.items()]

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters

    def get_weights(self):
        return {
            pid: policy.get_weights()
            for pid, policy in self.policy_map.items()
        }

    def set_weights(self, weights):
        for pid, w in weights.items():
            self.policy_map[pid].set_weights(w)

    def compute_gradients(self, samples):
        if isinstance(samples, MultiAgentBatch):
            grad_out, info_out = {}, {}
            if self.tf_sess is not None:
                builder = TFRunBuilder(self.tf_sess, "compute_gradients")
                for pid, batch in samples.policy_batches.items():
                    grad_out[pid], info_out[pid] = (
                        self.policy_map[pid].build_compute_gradients(
                            builder, batch))
                grad_out = {k: builder.get(v) for k, v in grad_out.items()}
                info_out = {k: builder.get(v) for k, v in info_out.items()}
            else:
                for pid, batch in samples.policy_batches.items():
                    grad_out[pid], info_out[pid] = (
                        self.policy_map[pid].compute_gradients(batch))
            return grad_out, info_out
        else:
            return self.policy_map[DEFAULT_POLICY_ID].compute_gradients(
                samples)

    def apply_gradients(self, grads):
        if isinstance(grads, dict):
            if self.tf_sess is not None:
                builder = TFRunBuilder(self.tf_sess, "apply_gradients")
                outputs = {
                    pid:
                    self.policy_map[pid].build_apply_gradients(builder, grad)
                    for pid, grad in grads.items()
                }
                return {k: builder.get(v) for k, v in outputs.items()}
            else:
                return {
                    pid: self.policy_map[pid].apply_gradients(g)
                    for pid, g in grads.items()
                }
        else:
            return self.policy_map[DEFAULT_POLICY_ID].apply_gradients(grads)

    def compute_apply(self, samples):
        if isinstance(samples, MultiAgentBatch):
            info_out = {}
            if self.tf_sess is not None:
                builder = TFRunBuilder(self.tf_sess, "compute_apply")
                for pid, batch in samples.policy_batches.items():
                    info_out[pid], _ = (
                        self.policy_map[pid].build_compute_apply(
                            builder, batch))
                info_out = {k: builder.get(v) for k, v in info_out.items()}
            else:
                for pid, batch in samples.policy_batches.items():
                    info_out[pid], _ = (
                        self.policy_map[pid].compute_apply(batch))
            return info_out
        else:
            grad_fetch, apply_fetch = (
                self.policy_map[DEFAULT_POLICY_ID].compute_apply(samples))
            return grad_fetch

    def save(self):
        filters = self.get_filters(flush_after=True)
        state = {
            pid: self.policy_map[pid].get_state()
            for pid in self.policy_map
        }
        return pickle.dumps({"filters": filters, "state": state})

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])
        for pid, state in objs["state"].items():
            self.policy_map[pid].set_state(state)
コード例 #15
0
class PPOEvaluator(Evaluator):
    """
    Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the shared
    network weights. When run as a remote agent, only this graph is used.
    """
    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter("has_inf_or_nan",
                                        tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(name="newkl",
                                       shape=(),
                                       dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(tf.float32,
                                           shape=(None, ) +
                                           self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None, ))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None, ))

        action_space = self.env.action_space
        # TODO(rliaw): pull this into model_catalog
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(tf.float32,
                                          shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None, ))
        else:
            raise NotImplemented("action space" + str(type(action_space)) +
                                 "currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(tf.float32,
                                          shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, ))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by devices"
        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(self.env.observation_space,
                                      self.env.action_space, obs, vtargets,
                                      advs, acts, plog, pvf_preds,
                                      self.logit_dim, self.kl_coeff,
                                      self.distribution_class, self.config,
                                      self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]), self.devices,
            [
                self.observations, self.value_targets, self.advantages,
                self.actions, self.prev_logits, self.prev_vf_preds
            ], self.per_device_batch_size, build_loss, self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(
                    values=[policy.mean_policy_loss for policy in policies]),
                0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[policy.mean_vf_loss for policy in policies]),
                0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[policy.mean_entropy for policy in policies]),
                0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(config["observation_filter"],
                                     self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(self.env, self.common_policy,
                                   self.obs_filter, self.config["horizon"],
                                   self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())

    def load_data(self, trajectories, full_trace):
        use_gae = self.config["use_gae"]
        dummy = np.zeros_like(trajectories["advantages"])
        return self.par_opt.load_data(self.sess, [
            trajectories["observations"],
            trajectories["value_targets"] if use_gae else dummy,
            trajectories["advantages"], trajectories["actions"],
            trajectories["logprobs"],
            trajectories["vf_preds"] if use_gae else dummy
        ],
                                      full_trace=full_trace)

    def run_sgd_minibatch(self, batch_index, kl_coeff, full_trace,
                          file_writer):
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[
                self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
                self.mean_kl, self.mean_entropy
            ],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def compute_gradients(self, samples):
        raise NotImplementedError

    def apply_gradients(self, grads):
        raise NotImplementedError

    def save(self):
        filters = self.get_filters(flush_after=True)
        return pickle.dumps({"filters": filters})

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def sample(self):
        """Returns experience samples from this Evaluator. Observation
        filter and reward filters are flushed here.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            samples = process_rollout(rollout,
                                      self.rew_filter,
                                      self.config["gamma"],
                                      self.config["lambda"],
                                      use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters
コード例 #16
0
    def __init__(self,
                 env_creator,
                 policy_graph,
                 tf_session_creator=None,
                 batch_steps=100,
                 batch_mode="truncate_episodes",
                 preprocessor_pref="rllib",
                 sample_async=False,
                 compress_observations=False,
                 observation_filter="NoFilter",
                 registry=None,
                 env_config=None,
                 model_config=None,
                 policy_config=None):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                env config dict.
            policy_graph (class): A class implementing rllib.PolicyGraph or
                rllib.TFPolicyGraph.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to include
                in each sample batch returned from this evaluator.
            batch_mode (str): One of the following choices:
                complete_episodes: each batch will be at least batch_steps
                    in size, and will include one or more complete episodes.
                truncate_episodes: each batch will be around batch_steps
                    in size, and include transitions from one episode only.
                pack_episodes: each batch will be exactly batch_steps in
                    size, and may include transitions from multiple episodes.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations
                returned.
            observation_filter (str): Name of observation filter to use.
            registry (tune.Registry): User-registered objects. Pass in the
                value from tune.registry.get_registry() if you're having
                trouble resolving things like custom envs.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy.
        """

        registry = registry or get_registry()
        env_config = env_config or {}
        policy_config = policy_config or {}
        model_config = model_config or {}

        assert batch_mode in [
            "complete_episodes", "truncate_episodes", "pack_episodes"
        ]
        self.env_creator = env_creator
        self.policy_graph = policy_graph
        self.batch_steps = batch_steps
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations

        self.env = env_creator(env_config)
        is_atari = hasattr(self.env.unwrapped, "ale")
        if is_atari and "custom_preprocessor" not in model_config and \
                preprocessor_pref == "deepmind":
            self.env = wrap_deepmind(self.env, dim=model_config.get("dim", 80))
        else:
            self.env = ModelCatalog.get_preprocessor_as_wrapper(
                registry, self.env, model_config)

        self.vectorized = hasattr(self.env, "vector_reset")
        self.policy_map = {}

        if issubclass(policy_graph, TFPolicyGraph):
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.sess = tf_session_creator()
                else:
                    self.sess = tf.Session(config=tf.ConfigProto(
                        gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.sess.as_default():
                    policy = policy_graph(self.env.observation_space,
                                          self.env.action_space, registry,
                                          policy_config)
        else:
            policy = policy_graph(self.env.observation_space,
                                  self.env.action_space, registry,
                                  policy_config)
        self.policy_map = {"default": policy}

        self.obs_filter = get_filter(observation_filter,
                                     self.env.observation_space.shape)
        self.filters = {"obs_filter": self.obs_filter}

        if self.vectorized:
            raise NotImplementedError("Vector envs not yet supported")
        else:
            if batch_mode not in [
                    "pack_episodes", "truncate_episodes", "complete_episodes"
            ]:
                raise NotImplementedError("Batch mode not yet supported")
            pack = batch_mode == "pack_episodes"
            if batch_mode == "complete_episodes":
                batch_steps = 999999
            if sample_async:
                self.sampler = AsyncSampler(self.env,
                                            self.policy_map["default"],
                                            self.obs_filter,
                                            batch_steps,
                                            pack=pack)
                self.sampler.start()
            else:
                self.sampler = SyncSampler(self.env,
                                           self.policy_map["default"],
                                           self.obs_filter,
                                           batch_steps,
                                           pack=pack)
コード例 #17
0
class CommonPolicyEvaluator(PolicyEvaluator):
    """Policy evaluator implementation that operates on a rllib.PolicyGraph.

    TODO: vector env
    TODO: multi-agent
    TODO: consumer buffering for multi-agent
    TODO: complete episode batch mode

    Examples:
        # Create a policy evaluator and using it to collect experiences.
        >>> evaluator = CommonPolicyEvaluator(
              env_creator=lambda _: gym.make("CartPole-v0"),
              policy_graph=PGPolicyGraph)
        >>> print(evaluator.sample().keys())
        {"obs": [[...]], "actions": [[...]], "rewards": [[...]],
         "dones": [[...]], "new_obs": [[...]]}

        # Creating policy evaluators using optimizer_cls.make().
        >>> optimizer = LocalSyncOptimizer.make(
              evaluator_cls=CommonPolicyEvaluator,
              evaluator_args={
                "env_creator": lambda _: gym.make("CartPole-v0"),
                "policy_graph": PGPolicyGraph,
              },
              num_workers=10)
        >>> for _ in range(10): optimizer.step()
    """
    @classmethod
    def as_remote(cls, num_cpus=None, num_gpus=None):
        return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls)

    def __init__(self,
                 env_creator,
                 policy_graph,
                 tf_session_creator=None,
                 batch_steps=100,
                 batch_mode="truncate_episodes",
                 preprocessor_pref="rllib",
                 sample_async=False,
                 compress_observations=False,
                 observation_filter="NoFilter",
                 registry=None,
                 env_config=None,
                 model_config=None,
                 policy_config=None):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                env config dict.
            policy_graph (class): A class implementing rllib.PolicyGraph or
                rllib.TFPolicyGraph.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to include
                in each sample batch returned from this evaluator.
            batch_mode (str): One of the following choices:
                complete_episodes: each batch will be at least batch_steps
                    in size, and will include one or more complete episodes.
                truncate_episodes: each batch will be around batch_steps
                    in size, and include transitions from one episode only.
                pack_episodes: each batch will be exactly batch_steps in
                    size, and may include transitions from multiple episodes.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations
                returned.
            observation_filter (str): Name of observation filter to use.
            registry (tune.Registry): User-registered objects. Pass in the
                value from tune.registry.get_registry() if you're having
                trouble resolving things like custom envs.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy.
        """

        registry = registry or get_registry()
        env_config = env_config or {}
        policy_config = policy_config or {}
        model_config = model_config or {}

        assert batch_mode in [
            "complete_episodes", "truncate_episodes", "pack_episodes"
        ]
        self.env_creator = env_creator
        self.policy_graph = policy_graph
        self.batch_steps = batch_steps
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations

        self.env = env_creator(env_config)
        is_atari = hasattr(self.env.unwrapped, "ale")
        if is_atari and "custom_preprocessor" not in model_config and \
                preprocessor_pref == "deepmind":
            self.env = wrap_deepmind(self.env, dim=model_config.get("dim", 80))
        else:
            self.env = ModelCatalog.get_preprocessor_as_wrapper(
                registry, self.env, model_config)

        self.vectorized = hasattr(self.env, "vector_reset")
        self.policy_map = {}

        if issubclass(policy_graph, TFPolicyGraph):
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.sess = tf_session_creator()
                else:
                    self.sess = tf.Session(config=tf.ConfigProto(
                        gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.sess.as_default():
                    policy = policy_graph(self.env.observation_space,
                                          self.env.action_space, registry,
                                          policy_config)
        else:
            policy = policy_graph(self.env.observation_space,
                                  self.env.action_space, registry,
                                  policy_config)
        self.policy_map = {"default": policy}

        self.obs_filter = get_filter(observation_filter,
                                     self.env.observation_space.shape)
        self.filters = {"obs_filter": self.obs_filter}

        if self.vectorized:
            raise NotImplementedError("Vector envs not yet supported")
        else:
            if batch_mode not in [
                    "pack_episodes", "truncate_episodes", "complete_episodes"
            ]:
                raise NotImplementedError("Batch mode not yet supported")
            pack = batch_mode == "pack_episodes"
            if batch_mode == "complete_episodes":
                batch_steps = 999999
            if sample_async:
                self.sampler = AsyncSampler(self.env,
                                            self.policy_map["default"],
                                            self.obs_filter,
                                            batch_steps,
                                            pack=pack)
                self.sampler.start()
            else:
                self.sampler = SyncSampler(self.env,
                                           self.policy_map["default"],
                                           self.obs_filter,
                                           batch_steps,
                                           pack=pack)

    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Return:
            SampleBatch from evaluating the current policies.
        """

        batch = self.policy_map["default"].postprocess_trajectory(
            self.sampler.get_data())

        if self.compress_observations:
            batch["obs"] = [pack(o) for o in batch["obs"]]
            batch["new_obs"] = [pack(o) for o in batch["new_obs"]]

        return batch

    def apply(self, func):
        """Apply the given function to this evaluator instance."""

        return func(self)

    def for_policy(self, func):
        """Apply the given function to this evaluator's default policy."""

        return func(self.policy_map["default"])

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters

    def get_weights(self):
        return self.policy_map["default"].get_weights()

    def set_weights(self, weights):
        return self.policy_map["default"].set_weights(weights)

    def compute_gradients(self, samples):
        return self.policy_map["default"].compute_gradients(samples)

    def apply_gradients(self, grads):
        return self.policy_map["default"].apply_gradients(grads)

    def compute_apply(self, samples):
        grad_fetch, apply_fetch = self.policy_map["default"].compute_apply(
            samples)
        return grad_fetch

    def save(self):
        filters = self.get_filters(flush_after=True)
        state = self.policy_map["default"].get_state()
        return pickle.dumps({"filters": filters, "state": state})

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])
        self.policy_map["default"].set_state(objs["state"])
コード例 #18
0
    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter("has_inf_or_nan",
                                        tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(name="newkl",
                                       shape=(),
                                       dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(tf.float32,
                                           shape=(None, ) +
                                           self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None, ))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None, ))

        action_space = self.env.action_space
        # TODO(rliaw): pull this into model_catalog
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(tf.float32,
                                          shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None, ))
        else:
            raise NotImplemented("action space" + str(type(action_space)) +
                                 "currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(tf.float32,
                                          shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, ))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by devices"
        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(self.env.observation_space,
                                      self.env.action_space, obs, vtargets,
                                      advs, acts, plog, pvf_preds,
                                      self.logit_dim, self.kl_coeff,
                                      self.distribution_class, self.config,
                                      self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]), self.devices,
            [
                self.observations, self.value_targets, self.advantages,
                self.actions, self.prev_logits, self.prev_vf_preds
            ], self.per_device_batch_size, build_loss, self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(
                    values=[policy.mean_policy_loss for policy in policies]),
                0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[policy.mean_vf_loss for policy in policies]),
                0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[policy.mean_entropy for policy in policies]),
                0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(config["observation_filter"],
                                     self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(self.env, self.common_policy,
                                   self.obs_filter, self.config["horizon"],
                                   self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())
コード例 #19
0
    def __init__(self,
                 env_creator,
                 policy_graph,
                 policy_mapping_fn=None,
                 tf_session_creator=None,
                 batch_steps=100,
                 batch_mode="truncate_episodes",
                 episode_horizon=None,
                 preprocessor_pref="rllib",
                 sample_async=False,
                 compress_observations=False,
                 num_envs=1,
                 observation_filter="NoFilter",
                 env_config=None,
                 model_config=None,
                 policy_config=None,
                 worker_index=0):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                EnvContext wrapped configuration.
            policy_graph (class|dict): Either a class implementing
                PolicyGraph, or a dictionary of policy id strings to
                (PolicyGraph, obs_space, action_space, config) tuples. If a
                dict is specified, then we are in multi-agent mode and a
                policy_mapping_fn should also be set.
            policy_mapping_fn (func): A function that maps agent ids to
                policy ids in multi-agent mode. This function will be called
                each time a new agent appears in an episode, to bind that agent
                to a policy for the duration of the episode.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to include
                in each sample batch returned from this evaluator.
            batch_mode (str): One of the following batch modes:
                "truncate_episodes": Each call to sample() will return a batch
                    of exactly `batch_steps` in size. Episodes may be truncated
                    in order to meet this size requirement. When
                    `num_envs > 1`, episodes will be truncated to sequences of
                    `batch_size / num_envs` in length.
                "complete_episodes": Each call to sample() will return a batch
                    of at least `batch_steps in size. Episodes will not be
                    truncated, but multiple episodes may be packed within one
                    batch to meet the batch size. Note that when
                    `num_envs > 1`, episode steps will be buffered until the
                    episode completes, and hence batches may contain
                    significant amounts of off-policy data.
            episode_horizon (int): Whether to stop episodes at this horizon.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations
                returned.
            num_envs (int): If more than one, will create multiple envs
                and vectorize the computation of actions. This has no effect if
                if the env already implements VectorEnv.
            observation_filter (str): Name of observation filter to use.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy. In the
                multi-agent case, this config will be merged with the
                per-policy configs specified by `policy_graph`.
            worker_index (int): For remote evaluators, this should be set to a
                non-zero and unique value. This index is passed to created envs
                through EnvContext so that envs can be configured per worker.
        """

        env_context = EnvContext(env_config or {}, worker_index)
        policy_config = policy_config or {}
        self.policy_config = policy_config
        model_config = model_config or {}
        policy_mapping_fn = (policy_mapping_fn
                             or (lambda agent_id: DEFAULT_POLICY_ID))
        self.env_creator = env_creator
        self.policy_graph = policy_graph
        self.batch_steps = batch_steps
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations

        self.env = env_creator(env_context)
        if isinstance(self.env, VectorEnv) or \
                isinstance(self.env, ServingEnv) or \
                isinstance(self.env, MultiAgentEnv) or \
                isinstance(self.env, AsyncVectorEnv):

            def wrap(env):
                return env  # we can't auto-wrap these env types
        elif is_atari(self.env) and \
                "custom_preprocessor" not in model_config and \
                preprocessor_pref == "deepmind":

            def wrap(env):
                return wrap_deepmind(env, dim=model_config.get("dim", 80))
        else:

            def wrap(env):
                return ModelCatalog.get_preprocessor_as_wrapper(
                    env, model_config)

        self.env = wrap(self.env)

        def make_env():
            return wrap(env_creator(env_context))

        self.tf_sess = None
        policy_dict = _validate_and_canonicalize(policy_graph, self.env)
        if _has_tensorflow_graph(policy_dict):
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.tf_sess = tf_session_creator()
                else:
                    self.tf_sess = tf.Session(config=tf.ConfigProto(
                        gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.tf_sess.as_default():
                    self.policy_map = self._build_policy_map(
                        policy_dict, policy_config)
        else:
            self.policy_map = self._build_policy_map(policy_dict,
                                                     policy_config)

        self.multiagent = self.policy_map.keys() != set(DEFAULT_POLICY_ID)

        self.filters = {
            policy_id: get_filter(observation_filter,
                                  policy.observation_space.shape)
            for (policy_id, policy) in self.policy_map.items()
        }

        # Always use vector env for consistency even if num_envs = 1
        self.async_env = AsyncVectorEnv.wrap_async(self.env,
                                                   make_env=make_env,
                                                   num_envs=num_envs)

        if self.batch_mode == "truncate_episodes":
            if batch_steps % num_envs != 0:
                raise ValueError(
                    "In 'truncate_episodes' batch mode, `batch_steps` must be "
                    "evenly divisible by `num_envs`. Got {} and {}.".format(
                        batch_steps, num_envs))
            batch_steps = batch_steps // num_envs
            pack_episodes = True
        elif self.batch_mode == "complete_episodes":
            batch_steps = float("inf")  # never cut episodes
            pack_episodes = False  # sampler will return 1 episode per poll
        else:
            raise ValueError("Unsupported batch mode: {}".format(
                self.batch_mode))
        if sample_async:
            self.sampler = AsyncSampler(self.async_env,
                                        self.policy_map,
                                        policy_mapping_fn,
                                        self.filters,
                                        batch_steps,
                                        horizon=episode_horizon,
                                        pack=pack_episodes,
                                        tf_sess=self.tf_sess)
            self.sampler.start()
        else:
            self.sampler = SyncSampler(self.async_env,
                                       self.policy_map,
                                       policy_mapping_fn,
                                       self.filters,
                                       batch_steps,
                                       horizon=episode_horizon,
                                       pack=pack_episodes,
                                       tf_sess=self.tf_sess)