Example 1
    def __init__(self,
                 registry,
                 env_creator,
                 config,
                 logdir,
                 start_sampler=True):
        env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        self.env = env
        policy_cls = get_policy_cls(config)
        # TODO(rliaw): should change this to be just env.observation_space
        self.policy = policy_cls(registry, env.observation_space.shape,
                                 env.action_space, config)
        self.config = config

        # Technically not needed when not remote
        self.obs_filter = get_filter(config["observation_filter"],
                                     env.observation_space.shape)
        self.rew_filter = get_filter(config["reward_filter"], ())
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = AsyncSampler(env, self.policy, self.obs_filter,
                                    config["batch_size"])
        if start_sampler and self.sampler.async:
            self.sampler.start()
        self.logdir = logdir
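
The same get_filter helper drives both filters above. A minimal standalone sketch of how such filters behave, assuming the helper comes from ray.rllib.utils.filter; the shape and filter names here are illustrative, not taken from the listing:

import numpy as np
from ray.rllib.utils.filter import get_filter  # assumed import path

# Illustrative filters, mirroring the obs_filter / rew_filter pair built above.
obs_filter = get_filter("MeanStdFilter", (4,))  # running mean/std normalization
rew_filter = get_filter("NoFilter", ())         # pass-through

obs = np.random.randn(4)
normalized_obs = obs_filter(obs, update=True)   # updates running stats, returns normalized obs
reward = rew_filter(1.0)                        # NoFilter passes the value through
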
Example 2
 def __init__(self, env_creator, config, logdir):
     self.env = env = create_and_wrap(env_creator, config["model"])
     policy_cls = get_policy_cls(config)
     # TODO(rliaw): should change this to be just env.observation_space
     self.policy = policy_cls(env.observation_space.shape, env.action_space)
     obs_filter = get_filter(config["observation_filter"],
                             env.observation_space.shape)
     self.rew_filter = get_filter(config["reward_filter"], ())
     self.sampler = AsyncSampler(env, self.policy, obs_filter,
                                 config["batch_size"])
     self.logdir = logdir
Example 3
File: a3c.py Project: reazrrr/ray
 def _init(self):
     self.env = create_and_wrap(self.env_creator, self.config["model"])
     policy_cls = get_policy_cls(self.config)
     self.policy = policy_cls(self.env.observation_space.shape,
                              self.env.action_space)
     self.obs_filter = get_filter(self.config["observation_filter"],
                                  self.env.observation_space.shape)
     self.rew_filter = get_filter(self.config["reward_filter"], ())
     self.agents = [
         RemoteRunner.remote(self.env_creator, self.config, self.logdir)
         for i in range(self.config["num_workers"])
     ]
     self.parameters = self.policy.get_weights()
Example 4
    def __init__(self, registry, sess, action_space, preprocessor,
                 observation_filter):
        self.sess = sess
        self.action_space = action_space
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(
            observation_filter, self.preprocessor.shape)
        self.inputs = tf.placeholder(
            tf.float32, [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, dist_type="deterministic")
        model = ModelCatalog.get_model(registry, self.inputs, dist_dim,
                                       options={"fcnet_hiddens": [32, 32]})
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum([np.prod(variable.shape.as_list())
                               for _, variable
                               in self.variables.variables.items()])
        self.sess.run(tf.global_variables_initializer())
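
For context, a deterministic action for one observation would be drawn by feeding the placeholder and evaluating the sampler op built above. A hedged sketch; the act helper is hypothetical and not part of the listing:

def act(policy, observation):
    # Preprocess, normalize with the running filter, add a batch dimension,
    # then run the deterministic sampler defined in __init__ above.
    obs = policy.preprocessor.transform(observation)
    obs = policy.observation_filter(obs, update=True)
    return policy.sess.run(policy.sampler, feed_dict={policy.inputs: obs[None]})[0]
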
Example 5
    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(name="newkl",
                                       shape=(),
                                       dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(tf.float32,
                                           shape=(None, ) +
                                           self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None, ))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None, ))

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space, config["model"])
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(tf.float32,
                                          shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, ))

        self.inputs = [("obs", self.observations),
                       ("value_targets", self.value_targets),
                       ("advantages", self.advantages),
                       ("actions", self.actions),
                       ("logprobs", self.prev_logits),
                       ("vf_preds", self.prev_vf_preds)]
        self.common_policy = self.build_tf_loss([ph for _, ph in self.inputs])

        # References to the model weights
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(config["observation_filter"],
                                     self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(self.env, self.common_policy,
                                   self.obs_filter, self.config["horizon"],
                                   self.config["horizon"])
Example 6
    def __init__(self,
                 sess,
                 action_space,
                 obs_space,
                 preprocessor,
                 observation_filter,
                 model_config,
                 action_noise_std=0.0):
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, model_config, dist_type="deterministic")

        model = ModelCatalog.get_model({
            "obs": self.inputs
        }, obs_space, action_space, dist_dim, model_config)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Example 7
    def __init__(self,
                 sess,
                 action_space,
                 obs_space,
                 preprocessor,
                 observation_filter,
                 model_config,
                 action_noise_std=0.0):
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, model_config, dist_type="deterministic")

        model = ModelCatalog.get_model({
            "obs": self.inputs
        }, obs_space, dist_dim, model_config)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Example 8
    def __init__(self, obs_space, action_space, config):
        self.action_space = action_space
        self.action_noise_std = config["action_noise_std"]
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(obs_space)
        self.observation_filter = get_filter(config["observation_filter"],
                                             self.preprocessor.shape)
        self.single_threaded = config.get("single_threaded", False)
        self.sess = make_session(single_threaded=self.single_threaded)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, config["model"], dist_type="deterministic")
        model = ModelCatalog.get_model({SampleBatch.CUR_OBS: self.inputs},
                                       obs_space, action_space, dist_dim,
                                       config["model"])
        dist = dist_class(model.outputs, model)
        self.sampler = dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Example 9
    def __init__(self, registry, env_creator, config, logdir, worker_index):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Box):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DDPG.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir)

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.ddpg_graph.copy_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0
        nb_actions = env.action_space.shape[-1]
        stddev = config["exploration_noise"]
        self.exploration_noise = OUNoise(mu=np.zeros(nb_actions),
                                         sigma=float(stddev) *
                                         np.ones(nb_actions))
        self.action_range = (-1., 1.)

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.ddpg_graph.critic_loss, self.ddpg_graph.action_loss),
            self.sess)
        self.max_action = env.action_space.high
        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        # Technically not needed when not remote
        self.obs_filter = get_filter(config["observation_filter"],
                                     env.observation_space.shape)
        self.rew_filter = get_filter(config["reward_filter"], ())
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }

        self.obs = self.env.reset()
Example 10
    def __init__(self, env_creator, config, logdir, start_sampler=True):
        self.env = env = create_and_wrap(env_creator, config["preprocessing"])
        policy_cls = get_policy_cls(config)
        # TODO(rliaw): should change this to be just env.observation_space
        self.policy = policy_cls(env.observation_space.shape, env.action_space,
                                 config)
        self.config = config

        # Technically not needed when not remote
        self.obs_filter = get_filter(config["observation_filter"],
                                     env.observation_space.shape)
        self.rew_filter = get_filter(config["reward_filter"], ())
        self.sampler = AsyncSampler(env, self.policy, self.obs_filter,
                                    config["batch_size"])
        if start_sampler and self.sampler.async:
            self.sampler.start()
        self.logdir = logdir
Example 11
    def __init__(self, obs_space, action_space, config):
        super().__init__(obs_space, action_space, config)
        self.action_noise_std = self.config["action_noise_std"]
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(
            self.observation_space)
        self.observation_filter = get_filter(self.config["observation_filter"],
                                             self.preprocessor.shape)

        self.single_threaded = self.config.get("single_threaded", False)
        if self.config["framework"] == "tf":
            self.sess = make_session(single_threaded=self.single_threaded)

            # Set graph-level seed.
            if config.get("seed") is not None:
                with self.sess.as_default():
                    tf1.set_random_seed(config["seed"])

            self.inputs = tf1.placeholder(tf.float32, [None] +
                                          list(self.preprocessor.shape))
        else:
            if not tf1.executing_eagerly():
                tf1.enable_eager_execution()
            self.sess = self.inputs = None
            if config.get("seed") is not None:
                # Tf2.x.
                if config.get("framework") == "tf2":
                    tf.random.set_seed(config["seed"])
                # Tf-eager.
                elif tf1 and config.get("framework") == "tfe":
                    tf1.set_random_seed(config["seed"])

        # Policy network.
        self.dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, self.config["model"], dist_type="deterministic")

        self.model = ModelCatalog.get_model_v2(
            obs_space=self.preprocessor.observation_space,
            action_space=self.action_space,
            num_outputs=dist_dim,
            model_config=self.config["model"],
        )

        self.sampler = None
        if self.sess:
            dist_inputs, _ = self.model({SampleBatch.CUR_OBS: self.inputs})
            dist = self.dist_class(dist_inputs, self.model)
            self.sampler = dist.sample()
            self.variables = ray.experimental.tf_utils.TensorFlowVariables(
                dist_inputs, self.sess)
            self.sess.run(tf1.global_variables_initializer())
        else:
            self.variables = ray.experimental.tf_utils.TensorFlowVariables(
                [], None, self.model.variables())

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
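
In the eager branch above no sampler op exists (self.sampler stays None), so an action would be computed by calling the model directly. A rough sketch under that assumption; imports (e.g. SampleBatch) are assumed to be the same as in the listing, and the helper itself is hypothetical:

def eager_act(policy, observation):
    # Eager-mode action computation mirroring the graph-mode sampler path.
    obs = policy.preprocessor.transform(observation)
    obs = policy.observation_filter(obs[None], update=True)
    dist_inputs, _ = policy.model({SampleBatch.CUR_OBS: obs})
    dist = policy.dist_class(dist_inputs, policy.model)
    return dist.sample().numpy()[0]
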
Example 12
def generate_policies(
    policy_id: str,
    policy_constructor_tuple: Tuple["PolicyClass", "gym.Space", "gym.Space",
                                    dict],
    policies: Dict[str, TFPolicy],
    policies_to_train: List[str],
    policy_config: dict,
    preprocessors: Dict[str, Any],
    obs_filters: Dict[str, Any],
    observation_filter: str,
):
    """
    Get policies for each ``agent_id``, and instantiate new ones
    for newly created agents.
    """

    policy_cls, obs_space, act_space, conf = policy_constructor_tuple

    if (policy_id in preprocessors) != (policy_id in policies):
        raise ValueError("'preprocessors' and 'policies' do not agree.")
    if (policy_id in obs_filters) != (policy_id in policies):
        raise ValueError("'obs_filters' and 'policies' do not agree.")

    # If we haven't seen this id, we instantiate a new policy.
    if policy_id not in policies:
        merged_conf = merge_dicts(policy_config, conf)

        # We assume ``self.preprocessing_enabled == True`` in ``RolloutWorker``.
        preprocessor = ModelCatalog.get_preprocessor_for_space(
            obs_space, merged_conf.get("model"))
        preprocessors[policy_id] = preprocessor
        obs_space = preprocessor.observation_space

        if tf and tf.executing_eagerly():
            if hasattr(policy_cls, "as_eager"):
                policy_cls = policy_cls.as_eager()
                if policy_config["eager_tracing"]:
                    policy_cls = policy_cls.with_tracing()
            elif not issubclass(policy_cls, TFPolicy):
                pass  # could be some other type of policy
            else:
                raise ValueError("This policy does not support eager "
                                 "execution: {}".format(policy_cls))
        if tf:
            with tf.variable_scope(policy_id):
                policies[policy_id] = policy_cls(obs_space, act_space,
                                                 merged_conf)
                policies_to_train.append(policy_id)
        else:
            policies[policy_id] = policy_cls(obs_space, act_space, merged_conf)
            policies_to_train.append(policy_id)

        obs_filters[policy_id] = get_filter(observation_filter,
                                            obs_space.shape)
    return policies, preprocessors, obs_filters, policies_to_train
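
A hedged usage sketch for generate_policies above; every concrete name here (MyTFPolicy, obs_space, act_space) is illustrative rather than taken from the listing:

policies, preprocessors, obs_filters, policies_to_train = generate_policies(
    policy_id="shared_policy",
    policy_constructor_tuple=(MyTFPolicy, obs_space, act_space, {}),
    policies={},
    policies_to_train=[],
    policy_config={"model": {}, "eager_tracing": False},
    preprocessors={},
    obs_filters={},
    observation_filter="MeanStdFilter",
)
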
Example 13
    def __init__(
            self, registry, env_creator, config, logdir, start_sampler=True):
        env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        self.env = env
        policy_cls = get_policy_cls(config)
        # TODO(rliaw): should change this to be just env.observation_space
        self.policy = policy_cls(
            registry, env.observation_space.shape, env.action_space, config)
        self.config = config

        # Technically not needed when not remote
        self.obs_filter = get_filter(
            config["observation_filter"], env.observation_space.shape)
        self.rew_filter = get_filter(config["reward_filter"], ())
        self.filters = {"obs_filter": self.obs_filter,
                        "rew_filter": self.rew_filter}
        self.sampler = AsyncSampler(env, self.policy, self.obs_filter,
                                    config["batch_size"])
        if start_sampler and self.sampler.async:
            self.sampler.start()
        self.logdir = logdir
Example 14
def before_init(policy, observation_space, action_space, config):
    policy.action_noise_std = config["action_noise_std"]
    policy.preprocessor = ModelCatalog.get_preprocessor_for_space(
        observation_space)
    policy.observation_filter = get_filter(config["observation_filter"],
                                           policy.preprocessor.shape)
    policy.single_threaded = config.get("single_threaded", False)

    def _set_flat_weights(policy, theta):
        pos = 0
        theta_dict = policy.model.state_dict()
        new_theta_dict = {}

        for k in sorted(theta_dict.keys()):
            shape = policy.param_shapes[k]
            num_params = int(np.prod(shape))
            new_theta_dict[k] = torch.from_numpy(
                np.reshape(theta[pos:pos + num_params], shape))
            pos += num_params
        policy.model.load_state_dict(new_theta_dict)

    def _get_flat_weights(policy):
        # Get the parameter tensors.
        theta_dict = policy.model.state_dict()
        # Flatten it into a single np.ndarray.
        theta_list = []
        for k in sorted(theta_dict.keys()):
            theta_list.append(torch.reshape(theta_dict[k], (-1, )))
        cat = torch.cat(theta_list, dim=0)
        return cat.numpy()

    type(policy).set_flat_weights = _set_flat_weights
    type(policy).get_flat_weights = _get_flat_weights

    def _compute_actions(policy, obs_batch, add_noise=False, update=True):
        observation = policy.preprocessor.transform(obs_batch)
        observation = policy.observation_filter(
            observation[None], update=update)

        observation = convert_to_torch_tensor(observation)
        dist_inputs, _ = policy.model({
            SampleBatch.CUR_OBS: observation
        }, [], None)
        dist = policy.dist_class(dist_inputs, policy.model)
        action = dist.sample().detach().numpy()
        action = unbatch_actions(action)
        if add_noise and isinstance(policy.action_space, gym.spaces.Box):
            action += np.random.randn(*action.shape) * policy.action_noise_std
        return action

    type(policy).compute_actions = _compute_actions
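
The flat get/set helpers patched in above are what make parameter-space perturbation straightforward. A minimal sketch of the round trip; the noise scale and obs_batch are illustrative, and `policy` is assumed to be an instance that has gone through before_init:

import numpy as np

theta = policy.get_flat_weights()              # all parameters as one 1-D numpy array
perturbed = theta + 0.02 * np.random.randn(theta.size)
policy.set_flat_weights(perturbed)             # evaluate a perturbed parameter vector
actions = policy.compute_actions(obs_batch, add_noise=True)
policy.set_flat_weights(theta)                 # restore the original parameters
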
Example 15
 def _init(self):
     self.global_step = 0
     self.kl_coeff = self.config["kl_coeff"]
     self.model = Runner(
         self.env_creator, self.config, self.logdir, False)
     self.agents = [
         RemoteRunner.remote(
             self.env_creator, self.config, self.logdir, True)
         for _ in range(self.config["num_workers"])]
     self.start_time = time.time()
     if self.config["write_logs"]:
         self.file_writer = tf.summary.FileWriter(
             self.logdir, self.model.sess.graph)
     else:
         self.file_writer = None
     self.saver = tf.train.Saver(max_to_keep=None)
     self.obs_filter = get_filter(
         self.config["observation_filter"],
         self.model.env.observation_space.shape)
Example 16
    def __init__(self,
                 sess,
                 action_space,
                 preprocessor,
                 observation_filter,
                 action_noise_std,
                 options={}):

        if len(preprocessor.shape) > 1:
            raise UnsupportedSpaceException(
                "Observation space {} is not supported with ARS.".format(
                    preprocessor.shape))

        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, dist_type="deterministic")

        model = ModelCatalog.get_model(self.inputs, dist_dim, options=options)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Example 17
    def __init__(self,
                 env_creator,
                 policy_graph,
                 tf_session_creator=None,
                 batch_steps=100,
                 batch_mode="truncate_episodes",
                 episode_horizon=None,
                 preprocessor_pref="rllib",
                 sample_async=False,
                 compress_observations=False,
                 num_envs=1,
                 observation_filter="NoFilter",
                 env_config=None,
                 model_config=None,
                 policy_config=None):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                env config dict.
            policy_graph (class): A class implementing rllib.PolicyGraph or
                rllib.TFPolicyGraph.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to include
                in each sample batch returned from this evaluator.
            batch_mode (str): One of the following batch modes:
                "truncate_episodes": Each call to sample() will return a batch
                    of exactly `batch_steps` in size. Episodes may be truncated
                    in order to meet this size requirement. When
                    `num_envs > 1`, episodes will be truncated to sequences of
                    `batch_steps / num_envs` in length.
                "complete_episodes": Each call to sample() will return a batch
                    of at least `batch_steps` in size. Episodes will not be
                    truncated, but multiple episodes may be packed within one
                    batch to meet the batch size. Note that when
                    `num_envs > 1`, episode steps will be buffered until the
                    episode completes, and hence batches may contain
                    significant amounts of off-policy data.
            episode_horizon (int): Horizon at which to stop episodes, if set.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations
                returned.
            num_envs (int): If more than one, will create multiple envs
                and vectorize the computation of actions. This has no effect
                if the env already implements VectorEnv.
            observation_filter (str): Name of observation filter to use.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy.
        """

        env_config = env_config or {}
        policy_config = policy_config or {}
        model_config = model_config or {}
        self.env_creator = env_creator
        self.policy_graph = policy_graph
        self.batch_steps = batch_steps
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations

        self.env = env_creator(env_config)
        if isinstance(self.env, VectorEnv) or \
                isinstance(self.env, ServingEnv) or \
                isinstance(self.env, AsyncVectorEnv):

            def wrap(env):
                return env  # we can't auto-wrap these env types
        elif is_atari(self.env) and \
                "custom_preprocessor" not in model_config and \
                preprocessor_pref == "deepmind":

            def wrap(env):
                return wrap_deepmind(env, dim=model_config.get("dim", 80))
        else:

            def wrap(env):
                return ModelCatalog.get_preprocessor_as_wrapper(
                    env, model_config)

        self.env = wrap(self.env)

        def make_env():
            return wrap(env_creator(env_config))

        self.policy_map = {}

        if issubclass(policy_graph, TFPolicyGraph):
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.sess = tf_session_creator()
                else:
                    self.sess = tf.Session(config=tf.ConfigProto(
                        gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.sess.as_default():
                    policy = policy_graph(self.env.observation_space,
                                          self.env.action_space, policy_config)
        else:
            policy = policy_graph(self.env.observation_space,
                                  self.env.action_space, policy_config)
        self.policy_map = {"default": policy}

        self.obs_filter = get_filter(observation_filter,
                                     self.env.observation_space.shape)
        self.filters = {"obs_filter": self.obs_filter}

        # Always use vector env for consistency even if num_envs = 1
        if not isinstance(self.env, AsyncVectorEnv):
            if isinstance(self.env, ServingEnv):
                self.vector_env = _ServingEnvToAsync(self.env)
            else:
                if not isinstance(self.env, VectorEnv):
                    self.env = VectorEnv.wrap(make_env, [self.env],
                                              num_envs=num_envs)
                self.vector_env = _VectorEnvToAsync(self.env)
        else:
            self.vector_env = self.env

        if self.batch_mode == "truncate_episodes":
            if batch_steps % num_envs != 0:
                raise ValueError(
                    "In 'truncate_episodes' batch mode, `batch_steps` must be "
                    "evenly divisible by `num_envs`. Got {} and {}.".format(
                        batch_steps, num_envs))
            batch_steps = batch_steps // num_envs
            pack_episodes = True
        elif self.batch_mode == "complete_episodes":
            batch_steps = float("inf")  # never cut episodes
            pack_episodes = False  # sampler will return 1 episode per poll
        else:
            raise ValueError("Unsupported batch mode: {}".format(
                self.batch_mode))
        if sample_async:
            self.sampler = AsyncSampler(self.vector_env,
                                        self.policy_map["default"],
                                        self.obs_filter,
                                        batch_steps,
                                        horizon=episode_horizon,
                                        pack=pack_episodes)
            self.sampler.start()
        else:
            self.sampler = SyncSampler(self.vector_env,
                                       self.policy_map["default"],
                                       self.obs_filter,
                                       batch_steps,
                                       horizon=episode_horizon,
                                       pack=pack_episodes)
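
A hedged construction sketch for the evaluator whose __init__ is shown above. The enclosing class name (PolicyEvaluator) and MyPolicyGraph are assumptions, since the listing only shows the constructor body; sample() is the method the docstring above refers to:

import gym

evaluator = PolicyEvaluator(                        # assumed class name
    env_creator=lambda env_config: gym.make("CartPole-v0"),
    policy_graph=MyPolicyGraph,                     # user-defined TFPolicyGraph subclass
    batch_steps=200,
    batch_mode="truncate_episodes",
    observation_filter="MeanStdFilter",
)
batch = evaluator.sample()
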
Example 18
    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = int(
                config["sgd_batchsize"] / len(devices)) * len(devices)
            assert self.batch_size % len(devices) == 0
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.value_targets, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_policy_loss for policy in policies]), 0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_vf_loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {"obs_filter": self.obs_filter,
                        "rew_filter": self.rew_filter}
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter,
            self.config["horizon"], self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())
Example 19
    def __init__(self,
                 env_creator,
                 policy_graph,
                 policy_mapping_fn=None,
                 policies_to_train=None,
                 tf_session_creator=None,
                 batch_steps=100,
                 batch_mode="truncate_episodes",
                 episode_horizon=None,
                 preprocessor_pref="deepmind",
                 sample_async=False,
                 compress_observations=False,
                 num_envs=1,
                 observation_filter="NoFilter",
                 clip_rewards=None,
                 clip_actions=True,
                 env_config=None,
                 model_config=None,
                 policy_config=None,
                 worker_index=0,
                 monitor_path=None,
                 log_dir=None,
                 log_level=None,
                 callbacks=None,
                 input_creator=lambda ioctx: ioctx.default_sampler_input(),
                 input_evaluation_method=None,
                 output_creator=lambda ioctx: NoopOutput()):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                EnvContext wrapped configuration.
            policy_graph (class|dict): Either a class implementing
                PolicyGraph, or a dictionary of policy id strings to
                (PolicyGraph, obs_space, action_space, config) tuples. If a
                dict is specified, then we are in multi-agent mode and a
                policy_mapping_fn should also be set.
            policy_mapping_fn (func): A function that maps agent ids to
                policy ids in multi-agent mode. This function will be called
                each time a new agent appears in an episode, to bind that agent
                to a policy for the duration of the episode.
            policies_to_train (list): Optional whitelist of policies to train,
                or None for all policies.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to include
                in each sample batch returned from this evaluator.
            batch_mode (str): One of the following batch modes:
                "truncate_episodes": Each call to sample() will return a batch
                    of at most `batch_steps * num_envs` in size. The batch will
                    be exactly `batch_steps * num_envs` in size if
                    postprocessing does not change batch sizes. Episodes may be
                    truncated in order to meet this size requirement.
                "complete_episodes": Each call to sample() will return a batch
                    of at least `batch_steps * num_envs` in size. Episodes will
                    not be truncated, but multiple episodes may be packed
                    within one batch to meet the batch size. Note that when
                    `num_envs > 1`, episode steps will be buffered until the
                    episode completes, and hence batches may contain
                    significant amounts of off-policy data.
            episode_horizon (int): Horizon at which to stop episodes, if set.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations.
                They can be decompressed with rllib/utils/compression.
            num_envs (int): If more than one, will create multiple envs
                and vectorize the computation of actions. This has no effect
                if the env already implements VectorEnv.
            observation_filter (str): Name of observation filter to use.
            clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
                experience postprocessing. Setting to None means clip for Atari
                only.
            clip_actions (bool): Whether to clip action values to the range
                specified by the policy action space.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy. In the
                multi-agent case, this config will be merged with the
                per-policy configs specified by `policy_graph`.
            worker_index (int): For remote evaluators, this should be set to a
                non-zero and unique value. This index is passed to created envs
                through EnvContext so that envs can be configured per worker.
            monitor_path (str): Write out episode stats and videos to this
                directory if specified.
            log_dir (str): Directory where logs can be placed.
            log_level (str): Set the root log level on creation.
            callbacks (dict): Dict of custom debug callbacks.
            input_creator (func): Function that returns an InputReader object
                for loading previous generated experiences.
            input_evaluation_method (str): How to evaluate the current policy.
                This only applies when the input is reading offline data.
                Options are:
                  - None: don't evaluate the policy. The episode reward and
                    other metrics will be NaN.
                  - "simulation": run the environment in the background, but
                    use this data for evaluation only and never for learning.
                  - "counterfactual": use counterfactual policy evaluation to
                    estimate performance.
            output_creator (func): Function that returns an OutputWriter object
                for saving generated experiences.
        """

        if log_level:
            logging.getLogger("ray.rllib").setLevel(log_level)

        env_context = EnvContext(env_config or {}, worker_index)
        policy_config = policy_config or {}
        self.policy_config = policy_config
        self.callbacks = callbacks or {}
        model_config = model_config or {}
        policy_mapping_fn = (policy_mapping_fn
                             or (lambda agent_id: DEFAULT_POLICY_ID))
        if not callable(policy_mapping_fn):
            raise ValueError(
                "Policy mapping function not callable. If you're using Tune, "
                "make sure to escape the function with tune.function() "
                "to prevent it from being evaluated as an expression.")
        self.env_creator = env_creator
        self.sample_batch_size = batch_steps * num_envs
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations
        self.preprocessing_enabled = True

        self.env = env_creator(env_context)
        if isinstance(self.env, MultiAgentEnv) or \
                isinstance(self.env, AsyncVectorEnv):

            def wrap(env):
                return env  # we can't auto-wrap these env types
        elif is_atari(self.env) and \
                not model_config.get("custom_preprocessor") and \
                preprocessor_pref == "deepmind":

            # Deepmind wrappers already handle all preprocessing
            self.preprocessing_enabled = False

            if clip_rewards is None:
                clip_rewards = True

            def wrap(env):
                env = wrap_deepmind(
                    env,
                    dim=model_config.get("dim"),
                    framestack=model_config.get("framestack"))
                if monitor_path:
                    env = _monitor(env, monitor_path)
                return env
        else:

            def wrap(env):
                if monitor_path:
                    env = _monitor(env, monitor_path)
                return env

        self.env = wrap(self.env)

        def make_env(vector_index):
            return wrap(
                env_creator(env_context.with_vector_index(vector_index)))

        self.tf_sess = None
        policy_dict = _validate_and_canonicalize(policy_graph, self.env)
        self.policies_to_train = policies_to_train or list(policy_dict.keys())
        if _has_tensorflow_graph(policy_dict):
            if (ray.worker._mode() != ray.worker.LOCAL_MODE
                    and not ray.get_gpu_ids()):
                logger.info("Creating policy evaluation worker {}".format(
                    worker_index) +
                            " on CPU (please ignore any CUDA init errors)")
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.tf_sess = tf_session_creator()
                else:
                    self.tf_sess = tf.Session(
                        config=tf.ConfigProto(
                            gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.tf_sess.as_default():
                    self.policy_map, self.preprocessors = \
                        self._build_policy_map(policy_dict, policy_config)
        else:
            self.policy_map, self.preprocessors = self._build_policy_map(
                policy_dict, policy_config)

        self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID}
        if self.multiagent:
            if not (isinstance(self.env, MultiAgentEnv)
                    or isinstance(self.env, AsyncVectorEnv)):
                raise ValueError(
                    "Have multiple policy graphs {}, but the env ".format(
                        self.policy_map) +
                    "{} is not a subclass of MultiAgentEnv?".format(self.env))

        self.filters = {
            policy_id: get_filter(observation_filter,
                                  policy.observation_space.shape)
            for (policy_id, policy) in self.policy_map.items()
        }

        # Always use vector env for consistency even if num_envs = 1
        self.async_env = AsyncVectorEnv.wrap_async(
            self.env, make_env=make_env, num_envs=num_envs)
        self.num_envs = num_envs

        if self.batch_mode == "truncate_episodes":
            unroll_length = batch_steps
            pack_episodes = True
        elif self.batch_mode == "complete_episodes":
            unroll_length = float("inf")  # never cut episodes
            pack_episodes = False  # sampler will return 1 episode per poll
        else:
            raise ValueError("Unsupported batch mode: {}".format(
                self.batch_mode))

        if input_evaluation_method == "simulation":
            logger.warning(
                "Requested 'simulation' input evaluation method: "
                "will discard all sampler outputs and keep only metrics.")
            sample_async = True
        elif input_evaluation_method == "counterfactual":
            raise NotImplementedError
        elif input_evaluation_method is None:
            pass
        else:
            raise ValueError("Unknown evaluation method: {}".format(
                input_evaluation_method))

        if sample_async:
            self.sampler = AsyncSampler(
                self.async_env,
                self.policy_map,
                policy_mapping_fn,
                self.preprocessors,
                self.filters,
                clip_rewards,
                unroll_length,
                self.callbacks,
                horizon=episode_horizon,
                pack=pack_episodes,
                tf_sess=self.tf_sess,
                clip_actions=clip_actions,
                blackhole_outputs=input_evaluation_method == "simulation")
            self.sampler.start()
        else:
            self.sampler = SyncSampler(
                self.async_env,
                self.policy_map,
                policy_mapping_fn,
                self.preprocessors,
                self.filters,
                clip_rewards,
                unroll_length,
                self.callbacks,
                horizon=episode_horizon,
                pack=pack_episodes,
                tf_sess=self.tf_sess,
                clip_actions=clip_actions)

        self.io_context = IOContext(log_dir, policy_config, worker_index, self)
        self.input_reader = input_creator(self.io_context)
        assert isinstance(self.input_reader, InputReader), self.input_reader
        self.output_writer = output_creator(self.io_context)
        assert isinstance(self.output_writer, OutputWriter), self.output_writer

        logger.debug("Created evaluator with env {} ({}), policies {}".format(
            self.async_env, self.env, self.policy_map))
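
The docstring above describes the multi-agent form of policy_graph. A hedged sketch of that dict plus a mapping function; the policy classes, spaces, and agent-id scheme are illustrative:

policy_graphs = {
    "learned": (PPOPolicyGraph, obs_space, act_space, {}),
    "scripted": (ScriptedPolicyGraph, obs_space, act_space, {}),
}

def policy_mapping_fn(agent_id):
    # Bind each agent to a policy id for the duration of the episode.
    return "learned" if agent_id.startswith("car_") else "scripted"
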
Example 20
def before_init(policy, observation_space, action_space, config):
    policy.action_noise_std = config["action_noise_std"]
    policy.action_space_struct = get_base_struct_from_space(action_space)
    policy.preprocessor = ModelCatalog.get_preprocessor_for_space(
        observation_space)
    policy.observation_filter = get_filter(config["observation_filter"],
                                           policy.preprocessor.shape)
    policy.single_threaded = config.get("single_threaded", False)

    def _set_flat_weights(policy, theta):
        pos = 0
        theta_dict = policy.model.state_dict()
        new_theta_dict = {}

        for k in sorted(theta_dict.keys()):
            shape = policy.param_shapes[k]
            num_params = int(np.prod(shape))
            new_theta_dict[k] = torch.from_numpy(
                np.reshape(theta[pos:pos + num_params], shape))
            pos += num_params
        policy.model.load_state_dict(new_theta_dict)

    def _get_flat_weights(policy):
        # Get the parameter tensors.
        theta_dict = policy.model.state_dict()
        # Flatten it into a single np.ndarray.
        theta_list = []
        for k in sorted(theta_dict.keys()):
            theta_list.append(torch.reshape(theta_dict[k], (-1, )))
        cat = torch.cat(theta_list, dim=0)
        return cat.cpu().numpy()

    type(policy).set_flat_weights = _set_flat_weights
    type(policy).get_flat_weights = _get_flat_weights

    def _compute_actions(policy,
                         obs_batch,
                         add_noise=False,
                         update=True,
                         **kwargs):
        # Batch is given as list -> Try converting to numpy first.
        if isinstance(obs_batch, list) and len(obs_batch) == 1:
            obs_batch = obs_batch[0]
        observation = policy.preprocessor.transform(obs_batch)
        observation = policy.observation_filter(observation[None],
                                                update=update)

        observation = convert_to_torch_tensor(observation, policy.device)
        dist_inputs, _ = policy.model({SampleBatch.CUR_OBS: observation}, [],
                                      None)
        dist = policy.dist_class(dist_inputs, policy.model)
        action = dist.sample()

        def _add_noise(single_action, single_action_space):
            single_action = single_action.detach().cpu().numpy()
            if add_noise and isinstance(single_action_space, gym.spaces.Box):
                single_action += np.random.randn(*single_action.shape) * \
                                 policy.action_noise_std
            return single_action

        action = tree.map_structure(_add_noise, action,
                                    policy.action_space_struct)
        action = unbatch(action)
        return action, [], {}

    def _compute_single_action(policy,
                               observation,
                               add_noise=False,
                               update=True,
                               **kwargs):
        action, state_outs, extra_fetches = policy.compute_actions(
            [observation], add_noise=add_noise, update=update, **kwargs)
        return action[0], state_outs, extra_fetches

    type(policy).compute_actions = _compute_actions
    type(policy).compute_single_action = _compute_single_action
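
With compute_single_action attached at the end of before_init, single observations can be fed straight from an environment loop. A small hedged sketch; env is assumed to match the policy's preprocessor space:

obs = env.reset()
done = False
while not done:
    action, _, _ = policy.compute_single_action(obs, add_noise=True, update=True)
    obs, reward, done, info = env.step(action)
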
Example 21
    def __init__(self,
                 env_creator,
                 policy_graph,
                 policy_mapping_fn=None,
                 policies_to_train=None,
                 tf_session_creator=None,
                 batch_steps=100,
                 batch_mode="truncate_episodes",
                 episode_horizon=None,
                 preprocessor_pref="deepmind",
                 sample_async=False,
                 compress_observations=False,
                 num_envs=1,
                 observation_filter="NoFilter",
                 clip_rewards=False,
                 env_config=None,
                 model_config=None,
                 policy_config=None,
                 worker_index=0,
                 monitor_path=None):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                EnvContext wrapped configuration.
            policy_graph (class|dict): Either a class implementing
                PolicyGraph, or a dictionary of policy id strings to
                (PolicyGraph, obs_space, action_space, config) tuples. If a
                dict is specified, then we are in multi-agent mode and a
                policy_mapping_fn should also be set.
            policy_mapping_fn (func): A function that maps agent ids to
                policy ids in multi-agent mode. This function will be called
                each time a new agent appears in an episode, to bind that agent
                to a policy for the duration of the episode.
            policies_to_train (list): Optional whitelist of policies to train,
                or None for all policies.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to include
                in each sample batch returned from this evaluator.
            batch_mode (str): One of the following batch modes:
                "truncate_episodes": Each call to sample() will return a batch
                    of at most `batch_steps` in size. The batch will be exactly
                    `batch_steps` in size if postprocessing does not change
                    batch sizes. Episodes may be truncated in order to meet
                    this size requirement. When `num_envs > 1`, episodes will
                    be truncated to sequences of `batch_steps / num_envs` in
                    length.
                "complete_episodes": Each call to sample() will return a batch
                    of at least `batch_steps` in size. Episodes will not be
                    truncated, but multiple episodes may be packed within one
                    batch to meet the batch size. Note that when
                    `num_envs > 1`, episode steps will be buffered until the
                    episode completes, and hence batches may contain
                    significant amounts of off-policy data.
            episode_horizon (int): Horizon at which to stop episodes, if set.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations.
                They can be decompressed with rllib/utils/compression.
            num_envs (int): If more than one, will create multiple envs
                and vectorize the computation of actions. This has no effect
                if the env already implements VectorEnv.
            observation_filter (str): Name of observation filter to use.
            clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
                experience postprocessing.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy. In the
                multi-agent case, this config will be merged with the
                per-policy configs specified by `policy_graph`.
            worker_index (int): For remote evaluators, this should be set to a
                non-zero and unique value. This index is passed to created envs
                through EnvContext so that envs can be configured per worker.
            monitor_path (str): Write out episode stats and videos to this
                directory if specified.
        """

        env_context = EnvContext(env_config or {}, worker_index)
        policy_config = policy_config or {}
        self.policy_config = policy_config
        model_config = model_config or {}
        policy_mapping_fn = (policy_mapping_fn
                             or (lambda agent_id: DEFAULT_POLICY_ID))
        self.env_creator = env_creator
        self.batch_steps = batch_steps
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations

        self.env = env_creator(env_context)
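        # Decide how to wrap the raw env: already-vectorized, serving, async,
        # and multi-agent envs pass through unchanged; Atari envs can use the
        # deepmind wrappers; everything else falls back to an RLlib
        # preprocessor wrapper.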
        if isinstance(self.env, (VectorEnv, ServingEnv, MultiAgentEnv,
                                 AsyncVectorEnv)):

            def wrap(env):
                return env  # we can't auto-wrap these env types
        elif is_atari(self.env) and \
                "custom_preprocessor" not in model_config and \
                preprocessor_pref == "deepmind":

            def wrap(env):
                env = wrap_deepmind(
                    env,
                    dim=model_config.get("dim", 84),
                    framestack=(not model_config.get("use_lstm")
                                and not model_config.get("no_framestack")))
                if monitor_path:
                    env = _monitor(env, monitor_path)
                return env
        else:

            def wrap(env):
                env = ModelCatalog.get_preprocessor_as_wrapper(
                    env, model_config)
                if monitor_path:
                    env = _monitor(env, monitor_path)
                return env

        self.env = wrap(self.env)

        def make_env(vector_index):
            return wrap(
                env_creator(env_context.with_vector_index(vector_index)))

        self.tf_sess = None
        policy_dict = _validate_and_canonicalize(policy_graph, self.env)
        self.policies_to_train = policies_to_train or list(policy_dict.keys())
        if _has_tensorflow_graph(policy_dict):
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.tf_sess = tf_session_creator()
                else:
                    self.tf_sess = tf.Session(config=tf.ConfigProto(
                        gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.tf_sess.as_default():
                    self.policy_map = self._build_policy_map(
                        policy_dict, policy_config)
        else:
            self.policy_map = self._build_policy_map(policy_dict,
                                                     policy_config)

        self.multiagent = self.policy_map.keys() != {DEFAULT_POLICY_ID}

        self.filters = {
            policy_id: get_filter(observation_filter,
                                  policy.observation_space.shape)
            for (policy_id, policy) in self.policy_map.items()
        }

        # Always use vector env for consistency even if num_envs = 1
        self.async_env = AsyncVectorEnv.wrap_async(self.env,
                                                   make_env=make_env,
                                                   num_envs=num_envs)
        self.num_envs = num_envs

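        # "truncate_episodes": split the target batch size evenly across the
        # vectorized envs and pack multiple episode fragments per batch.
        # "complete_episodes": never cut episodes; the sampler returns one
        # complete episode per poll.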
        if self.batch_mode == "truncate_episodes":
            if batch_steps % num_envs != 0:
                raise ValueError(
                    "In 'truncate_episodes' batch mode, `batch_steps` must be "
                    "evenly divisible by `num_envs`. Got {} and {}.".format(
                        batch_steps, num_envs))
            batch_steps = batch_steps // num_envs
            pack_episodes = True
        elif self.batch_mode == "complete_episodes":
            batch_steps = float("inf")  # never cut episodes
            pack_episodes = False  # sampler will return 1 episode per poll
        else:
            raise ValueError("Unsupported batch mode: {}".format(
                self.batch_mode))
        if sample_async:
            self.sampler = AsyncSampler(self.async_env,
                                        self.policy_map,
                                        policy_mapping_fn,
                                        self.filters,
                                        clip_rewards,
                                        batch_steps,
                                        horizon=episode_horizon,
                                        pack=pack_episodes,
                                        tf_sess=self.tf_sess)
            self.sampler.start()
        else:
            self.sampler = SyncSampler(self.async_env,
                                       self.policy_map,
                                       policy_mapping_fn,
                                       self.filters,
                                       clip_rewards,
                                       batch_steps,
                                       horizon=episode_horizon,
                                       pack=pack_episodes,
                                       tf_sess=self.tf_sess)
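
A minimal usage sketch for the evaluator above, showing how `batch_steps` interacts with `num_envs` in "truncate_episodes" mode: `batch_steps` must be evenly divisible by `num_envs`, and each env then contributes fragments of `batch_steps / num_envs` steps. The import path, `env_creator`, and `MyPolicyGraph` are illustrative assumptions and may differ between Ray versions.

import gym

from ray.rllib.evaluation import PolicyEvaluator  # exact path may vary


def env_creator(env_config):
    # env_config arrives wrapped in an EnvContext by the evaluator.
    return gym.make(env_config.get("env_id", "CartPole-v0"))


evaluator = PolicyEvaluator(
    env_creator,
    MyPolicyGraph,                    # hypothetical PolicyGraph subclass
    batch_steps=200,                  # target transitions per sample() call
    batch_mode="truncate_episodes",
    num_envs=4,                       # 200 / 4 = 50-step fragments per env
    observation_filter="MeanStdFilter",
)
batch = evaluator.sample()            # roughly 200 transitions per call
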
Esempio n. 22
0
    def __init__(self,
                 env_creator,
                 policy_graph,
                 tf_session_creator=None,
                 batch_steps=100,
                 batch_mode="truncate_episodes",
                 preprocessor_pref="rllib",
                 sample_async=False,
                 compress_observations=False,
                 observation_filter="NoFilter",
                 registry=None,
                 env_config=None,
                 model_config=None,
                 policy_config=None):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                env config dict.
            policy_graph (class): A class implementing rllib.PolicyGraph or
                rllib.TFPolicyGraph.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to include
                in each sample batch returned from this evaluator.
            batch_mode (str): One of the following choices:
                complete_episodes: each batch will be at least batch_steps
                    in size, and will include one or more complete episodes.
                truncate_episodes: each batch will be around batch_steps
                    in size, and include transitions from one episode only.
                pack_episodes: each batch will be exactly batch_steps in
                    size, and may include transitions from multiple episodes.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations
                returned.
            observation_filter (str): Name of observation filter to use.
            registry (tune.Registry): User-registered objects. Pass in the
                value from tune.registry.get_registry() if you're having
                trouble resolving things like custom envs.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy.
        """

        registry = registry or get_registry()
        env_config = env_config or {}
        policy_config = policy_config or {}
        model_config = model_config or {}

        assert batch_mode in [
            "complete_episodes", "truncate_episodes", "pack_episodes"
        ]
        self.env_creator = env_creator
        self.policy_graph = policy_graph
        self.batch_steps = batch_steps
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations

        self.env = env_creator(env_config)
        is_atari = hasattr(self.env.unwrapped, "ale")
        if is_atari and "custom_preprocessor" not in model_config and \
                preprocessor_pref == "deepmind":
            self.env = wrap_deepmind(self.env, dim=model_config.get("dim", 80))
        else:
            self.env = ModelCatalog.get_preprocessor_as_wrapper(
                registry, self.env, model_config)

        self.vectorized = hasattr(self.env, "vector_reset")
        self.policy_map = {}

        if issubclass(policy_graph, TFPolicyGraph):
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.sess = tf_session_creator()
                else:
                    self.sess = tf.Session(config=tf.ConfigProto(
                        gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.sess.as_default():
                    policy = policy_graph(self.env.observation_space,
                                          self.env.action_space, registry,
                                          policy_config)
        else:
            policy = policy_graph(self.env.observation_space,
                                  self.env.action_space, registry,
                                  policy_config)
        self.policy_map = {"default": policy}

        self.obs_filter = get_filter(observation_filter,
                                     self.env.observation_space.shape)
        self.filters = {"obs_filter": self.obs_filter}

        if self.vectorized:
            raise NotImplementedError("Vector envs not yet supported")
        else:
            if batch_mode not in [
                    "pack_episodes", "truncate_episodes", "complete_episodes"
            ]:
                raise NotImplementedError("Batch mode not yet supported")
            pack = batch_mode == "pack_episodes"
            if batch_mode == "complete_episodes":
                batch_steps = 999999
            if sample_async:
                self.sampler = AsyncSampler(self.env,
                                            self.policy_map["default"],
                                            self.obs_filter,
                                            batch_steps,
                                            pack=pack)
                self.sampler.start()
            else:
                self.sampler = SyncSampler(self.env,
                                           self.policy_map["default"],
                                           self.obs_filter,
                                           batch_steps,
                                           pack=pack)
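
The `tf_session_creator` argument above replaces the default session, which only sets `allow_growth=True`. A minimal sketch of such a creator, assuming TF1-style APIs; the memory fraction is an illustrative value, not a recommendation.

import tensorflow as tf


def tf_session_creator():
    # Mirror the evaluator's default (allow_growth=True), but also cap the
    # per-process GPU memory; 0.2 is an arbitrary illustrative value.
    gpu_options = tf.GPUOptions(
        allow_growth=True,
        per_process_gpu_memory_fraction=0.2)
    return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
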
Esempio n. 23
0
    def __init__(self,
                 env_creator,
                 policy,
                 policy_mapping_fn=None,
                 policies_to_train=None,
                 tf_session_creator=None,
                 rollout_fragment_length=100,
                 batch_mode="truncate_episodes",
                 episode_horizon=None,
                 preprocessor_pref="deepmind",
                 sample_async=False,
                 compress_observations=False,
                 num_envs=1,
                 observation_fn=None,
                 observation_filter="NoFilter",
                 clip_rewards=None,
                 clip_actions=True,
                 env_config=None,
                 model_config=None,
                 policy_config=None,
                 worker_index=0,
                 num_workers=0,
                 monitor_path=None,
                 log_dir=None,
                 log_level=None,
                 callbacks=None,
                 input_creator=lambda ioctx: ioctx.default_sampler_input(),
                 input_evaluation=frozenset([]),
                 output_creator=lambda ioctx: NoopOutput(),
                 remote_worker_envs=False,
                 remote_env_batch_wait_ms=0,
                 soft_horizon=False,
                 no_done_at_end=False,
                 seed=None,
                 extra_python_environs=None,
                 fake_sampler=False):
        """Initialize a rollout worker.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                EnvContext wrapped configuration.
            policy (class|dict): Either a class implementing
                Policy, or a dictionary of policy id strings to
                (Policy, obs_space, action_space, config) tuples. If a
                dict is specified, then we are in multi-agent mode and a
                policy_mapping_fn should also be set.
            policy_mapping_fn (func): A function that maps agent ids to
                policy ids in multi-agent mode. This function will be called
                each time a new agent appears in an episode, to bind that agent
                to a policy for the duration of the episode.
            policies_to_train (list): Optional whitelist of policies to train,
                or None for all policies.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicy.
            rollout_fragment_length (int): The target number of env transitions
                to include in each sample batch returned from this worker.
            batch_mode (str): One of the following batch modes:
                "truncate_episodes": Each call to sample() will return a batch
                    of at most `rollout_fragment_length * num_envs` in size.
                    The batch will be exactly
                    `rollout_fragment_length * num_envs` in size if
                    postprocessing does not change batch sizes. Episodes may be
                    truncated in order to meet this size requirement.
                "complete_episodes": Each call to sample() will return a batch
                    of at least `rollout_fragment_length * num_envs` in size.
                    Episodes will not be truncated, but multiple episodes may
                    be packed within one batch to meet the batch size. Note
                    that when `num_envs > 1`, episode steps will be buffered
                    until the episode completes, and hence batches may contain
                    significant amounts of off-policy data.
            episode_horizon (int): Horizon at which to stop episodes, if set.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations.
                They can be decompressed with rllib/utils/compression.
            num_envs (int): If more than one, will create multiple envs
                and vectorize the computation of actions. This has no effect
                if the env already implements VectorEnv.
            observation_fn (ObservationFunction): Optional multi-agent
                observation function.
            observation_filter (str): Name of observation filter to use.
            clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
                experience postprocessing. Setting to None means clip for Atari
                only.
            clip_actions (bool): Whether to clip action values to the range
                specified by the policy action space.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy. In the
                multi-agent case, this config will be merged with the
                per-policy configs specified by `policy`.
            worker_index (int): For remote workers, this should be set to a
                non-zero and unique value. This index is passed to created envs
                through EnvContext so that envs can be configured per worker.
            num_workers (int): For remote workers, how many workers altogether
                have been created?
            monitor_path (str): Write out episode stats and videos to this
                directory if specified.
            log_dir (str): Directory where logs can be placed.
            log_level (str): Set the root log level on creation.
            callbacks (DefaultCallbacks): Custom training callbacks.
            input_creator (func): Function that returns an InputReader object
                for loading previous generated experiences.
            input_evaluation (list): How to evaluate the policy performance.
                This only makes sense to set when the input is reading offline
                data. The possible values include:
                  - "is": the step-wise importance sampling estimator.
                  - "wis": the weighted step-wise is estimator.
                  - "simulation": run the environment in the background, but
                    use this data for evaluation only and never for learning.
            output_creator (func): Function that returns an OutputWriter object
                for saving generated experiences.
            remote_worker_envs (bool): If using num_envs > 1, whether to create
                those new envs in remote processes instead of in the current
                process. This adds overheads, but can make sense if your envs
                are slow to step or reset.
            remote_env_batch_wait_ms (float): Timeout that remote workers
                wait when polling environments. 0 (continue when at
                least one env is ready) is a reasonable default, but optimal
                value could be obtained by measuring your environment
                step / reset and model inference perf.
            soft_horizon (bool): Calculate rewards but don't reset the
                environment when the horizon is hit.
            no_done_at_end (bool): Ignore the done=True at the end of the
                episode and instead record done=False.
            seed (int): Set the seed of both np and tf to this value to
                ensure each remote worker has unique exploration behavior.
            extra_python_environs (dict): Extra Python environment variables
                to set for this worker's process.
            fake_sampler (bool): Use a fake (inf speed) sampler for testing.
        """
        self._original_kwargs = locals().copy()
        del self._original_kwargs["self"]

        global _global_worker
        _global_worker = self

        # set extra environs first
        if extra_python_environs:
            for key, value in extra_python_environs.items():
                os.environ[key] = str(value)

        def gen_rollouts():
            while True:
                yield self.sample()

        ParallelIteratorWorker.__init__(self, gen_rollouts, False)

        policy_config = policy_config or {}
        if (tf and policy_config.get("eager")
                and not policy_config.get("no_eager_on_workers")
                # This eager check is necessary for certain all-framework tests
                # that use tf's eager_mode() context generator.
                and not tf.executing_eagerly()):
            tf.enable_eager_execution()

        if log_level:
            logging.getLogger("ray.rllib").setLevel(log_level)

        if worker_index > 1:
            disable_log_once_globally()  # only need 1 worker to log
        elif log_level == "DEBUG":
            enable_periodic_logging()

        env_context = EnvContext(env_config or {}, worker_index)
        self.policy_config = policy_config
        if callbacks:
            self.callbacks = callbacks()
        else:
            from ray.rllib.agents.callbacks import DefaultCallbacks
            self.callbacks = DefaultCallbacks()
        self.worker_index = worker_index
        self.num_workers = num_workers
        model_config = model_config or {}
        policy_mapping_fn = (policy_mapping_fn
                             or (lambda agent_id: DEFAULT_POLICY_ID))
        if not callable(policy_mapping_fn):
            raise ValueError("Policy mapping function not callable?")
        self.env_creator = env_creator
        self.rollout_fragment_length = rollout_fragment_length * num_envs
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations
        self.preprocessing_enabled = True
        self.last_batch = None
        self.global_vars = None
        self.fake_sampler = fake_sampler

        self.env = _validate_env(env_creator(env_context))
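        # Decide how to wrap the raw env: multi-agent and BaseEnv instances
        # pass through unchanged; Atari envs can use the deepmind wrappers
        # (which also take over preprocessing); otherwise only the optional
        # gym monitor is attached.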
        if isinstance(self.env, (MultiAgentEnv, BaseEnv)):

            def wrap(env):
                return env  # we can't auto-wrap these env types
        elif is_atari(self.env) and \
                not model_config.get("custom_preprocessor") and \
                preprocessor_pref == "deepmind":

            # Deepmind wrappers already handle all preprocessing
            self.preprocessing_enabled = False

            if clip_rewards is None:
                clip_rewards = True

            def wrap(env):
                env = wrap_deepmind(
                    env,
                    dim=model_config.get("dim"),
                    framestack=model_config.get("framestack"))
                if monitor_path:
                    from gym import wrappers
                    env = wrappers.Monitor(env, monitor_path, resume=True)
                return env
        else:

            def wrap(env):
                if monitor_path:
                    from gym import wrappers
                    env = wrappers.Monitor(env, monitor_path, resume=True)
                return env

        self.env = wrap(self.env)

        def make_env(vector_index):
            return wrap(
                env_creator(
                    env_context.copy_with_overrides(
                        vector_index=vector_index, remote=remote_worker_envs)))

        self.tf_sess = None
        policy_dict = _validate_and_canonicalize(policy, self.env)
        self.policies_to_train = policies_to_train or list(policy_dict.keys())
        # set numpy and python seed
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)
            if not hasattr(self.env, "seed"):
                raise ValueError("Env doesn't support env.seed(): {}".format(
                    self.env))
            self.env.seed(seed)
            try:
                assert torch is not None
                torch.manual_seed(seed)
            except AssertionError:
                logger.info("Could not seed torch")
        if _has_tensorflow_graph(policy_dict) and not (tf and
                                                       tf.executing_eagerly()):
            if not tf:
                raise ImportError("Could not import tensorflow")
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.tf_sess = tf_session_creator()
                else:
                    self.tf_sess = tf.Session(
                        config=tf.ConfigProto(
                            gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.tf_sess.as_default():
                    # set graph-level seed
                    if seed is not None:
                        tf.set_random_seed(seed)
                    self.policy_map, self.preprocessors = \
                        self._build_policy_map(policy_dict, policy_config)
            if (ray.is_initialized()
                    and ray.worker._mode() != ray.worker.LOCAL_MODE):
                if not ray.get_gpu_ids():
                    logger.debug(
                        "Creating policy evaluation worker {}".format(
                            worker_index) +
                        " on CPU (please ignore any CUDA init errors)")
                elif not tf.test.is_gpu_available():
                    raise RuntimeError(
                        "GPUs were assigned to this worker by Ray, but "
                        "TensorFlow reports GPU acceleration is disabled. "
                        "This could be due to a bad CUDA or TF installation.")
        else:
            self.policy_map, self.preprocessors = self._build_policy_map(
                policy_dict, policy_config)

        self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID}
        if self.multiagent:
            if not isinstance(
                    self.env,
                    (BaseEnv, MultiAgentEnv, ExternalMultiAgentEnv)):
                raise ValueError(
                    "Have multiple policies {}, but the env ".format(
                        self.policy_map) +
                    "{} is not a subclass of BaseEnv, MultiAgentEnv or "
                    "ExternalMultiAgentEnv?".format(self.env))

        self.filters = {
            policy_id: get_filter(observation_filter,
                                  policy.observation_space.shape)
            for (policy_id, policy) in self.policy_map.items()
        }
        if self.worker_index == 0:
            logger.info("Built filter map: {}".format(self.filters))

        # Always use vector env for consistency even if num_envs = 1
        self.async_env = BaseEnv.to_base_env(
            self.env,
            make_env=make_env,
            num_envs=num_envs,
            remote_envs=remote_worker_envs,
            remote_env_batch_wait_ms=remote_env_batch_wait_ms)
        self.num_envs = num_envs

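        # "truncate_episodes": allow multiple episode fragments per batch,
        # targeting `rollout_fragment_length * num_envs` steps per sample().
        # "complete_episodes": never cut episodes; the sampler returns one
        # complete episode per poll.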
        if self.batch_mode == "truncate_episodes":
            pack_episodes = True
        elif self.batch_mode == "complete_episodes":
            rollout_fragment_length = float("inf")  # never cut episodes
            pack_episodes = False  # sampler will return 1 episode per poll
        else:
            raise ValueError("Unsupported batch mode: {}".format(
                self.batch_mode))

        self.io_context = IOContext(log_dir, policy_config, worker_index, self)
        self.reward_estimators = []
        for method in input_evaluation:
            if method == "simulation":
                logger.warning(
                    "Requested 'simulation' input evaluation method: "
                    "will discard all sampler outputs and keep only metrics.")
                sample_async = True
            elif method == "is":
                ise = ImportanceSamplingEstimator.create(self.io_context)
                self.reward_estimators.append(ise)
            elif method == "wis":
                wise = WeightedImportanceSamplingEstimator.create(
                    self.io_context)
                self.reward_estimators.append(wise)
            else:
                raise ValueError(
                    "Unknown evaluation method: {}".format(method))

        if sample_async:
            self.sampler = AsyncSampler(
                self,
                self.async_env,
                self.policy_map,
                policy_mapping_fn,
                self.preprocessors,
                self.filters,
                clip_rewards,
                rollout_fragment_length,
                self.callbacks,
                horizon=episode_horizon,
                pack=pack_episodes,
                tf_sess=self.tf_sess,
                clip_actions=clip_actions,
                blackhole_outputs="simulation" in input_evaluation,
                soft_horizon=soft_horizon,
                no_done_at_end=no_done_at_end,
                observation_fn=observation_fn)
            self.sampler.start()
        else:
            self.sampler = SyncSampler(
                self,
                self.async_env,
                self.policy_map,
                policy_mapping_fn,
                self.preprocessors,
                self.filters,
                clip_rewards,
                rollout_fragment_length,
                self.callbacks,
                horizon=episode_horizon,
                pack=pack_episodes,
                tf_sess=self.tf_sess,
                clip_actions=clip_actions,
                soft_horizon=soft_horizon,
                no_done_at_end=no_done_at_end,
                observation_fn=observation_fn)

        self.input_reader = input_creator(self.io_context)
        assert isinstance(self.input_reader, InputReader), self.input_reader
        self.output_writer = output_creator(self.io_context)
        assert isinstance(self.output_writer, OutputWriter), self.output_writer

        logger.debug(
            "Created rollout worker with env {} ({}), policies {}".format(
                self.async_env, self.env, self.policy_map))
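
A minimal multi-agent sketch for the rollout worker above, wiring up the (Policy class, obs_space, action_space, config) tuples and the `policy_mapping_fn` described in the docstring. `MyTFPolicy`, `MyMultiAgentEnv`, and the agent-id scheme are illustrative assumptions.

import gym

policies = {
    # policy_id -> (Policy class, obs_space, action_space, per-policy config)
    "learner": (MyTFPolicy, gym.spaces.Box(-1.0, 1.0, (4,)),
                gym.spaces.Discrete(2), {"gamma": 0.99}),
    "opponent": (MyTFPolicy, gym.spaces.Box(-1.0, 1.0, (4,)),
                 gym.spaces.Discrete(2), {}),
}


def policy_mapping_fn(agent_id):
    # Called once per new agent in an episode; binds that agent to a policy
    # for the rest of the episode.
    return "learner" if agent_id.startswith("learner") else "opponent"


worker = RolloutWorker(
    env_creator=lambda ctx: MyMultiAgentEnv(ctx),
    policy=policies,
    policy_mapping_fn=policy_mapping_fn,
    policies_to_train=["learner"],    # only this policy gets optimized
    rollout_fragment_length=100,
    batch_mode="complete_episodes",
)
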
Esempio n. 24
0
    def __init__(
            self,
            *,
            env_creator: Callable[[EnvContext], EnvType],
            validate_env: Optional[Callable[[EnvType, EnvContext],
                                            None]] = None,
            policy_spec: Union[type, Dict[
                str, Tuple[Optional[type], gym.Space, gym.Space,
                           PartialTrainerConfigDict]]] = None,
            policy_mapping_fn: Optional[Callable[[AgentID], PolicyID]] = None,
            policies_to_train: Optional[List[PolicyID]] = None,
            tf_session_creator: Optional[Callable[[], "tf1.Session"]] = None,
            rollout_fragment_length: int = 100,
            batch_mode: str = "truncate_episodes",
            episode_horizon: int = None,
            preprocessor_pref: str = "deepmind",
            sample_async: bool = False,
            compress_observations: bool = False,
            num_envs: int = 1,
            observation_fn: "ObservationFunction" = None,
            observation_filter: str = "NoFilter",
            clip_rewards: bool = None,
            clip_actions: bool = True,
            env_config: EnvConfigDict = None,
            model_config: ModelConfigDict = None,
            policy_config: TrainerConfigDict = None,
            worker_index: int = 0,
            num_workers: int = 0,
            monitor_path: str = None,
            log_dir: str = None,
            log_level: str = None,
            callbacks: Type["DefaultCallbacks"] = None,
            input_creator: Callable[[
                IOContext
            ], InputReader] = lambda ioctx: ioctx.default_sampler_input(),
            input_evaluation: List[str] = frozenset([]),
            output_creator: Callable[
                [IOContext], OutputWriter] = lambda ioctx: NoopOutput(),
            remote_worker_envs: bool = False,
            remote_env_batch_wait_ms: int = 0,
            soft_horizon: bool = False,
            no_done_at_end: bool = False,
            seed: int = None,
            extra_python_environs: dict = None,
            fake_sampler: bool = False,
            spaces: Optional[Dict[PolicyID, Tuple[gym.spaces.Space,
                                                  gym.spaces.Space]]] = None,
            policy: Union[type, Dict[
                str, Tuple[Optional[type], gym.Space, gym.Space,
                           PartialTrainerConfigDict]]] = None,
    ):
        """Initialize a rollout worker.

        Args:
            env_creator (Callable[[EnvContext], EnvType]): Function that
                returns a gym.Env given an EnvContext wrapped configuration.
            validate_env (Optional[Callable[[EnvType, EnvContext], None]]):
                Optional callable to validate the generated environment (only
                on worker=0).
            policy_spec (Union[type, Dict[str, Tuple[Type[Policy], gym.Space,
                gym.Space, PartialTrainerConfigDict]]]): Either a Policy class
                or a dict of policy id strings to
                (Policy class, obs_space, action_space, config)-tuples. If a
                dict is specified, then we are in multi-agent mode and a
                policy_mapping_fn can also be set (if not, will map all agents
                to DEFAULT_POLICY_ID).
            policy_mapping_fn (Optional[Callable[[AgentID], PolicyID]]): A
                callable that maps agent ids to policy ids in multi-agent mode.
                This function will be called each time a new agent appears in
                an episode, to bind that agent to a policy for the duration of
                the episode. If not provided, will map all agents to
                DEFAULT_POLICY_ID.
            policies_to_train (Optional[List[PolicyID]]): Optional list of
                policies to train, or None for all policies.
            tf_session_creator (Optional[Callable[[], tf1.Session]]): A
                function that returns a TF session. This is optional and only
                useful with TFPolicy.
            rollout_fragment_length (int): The target number of env transitions
                to include in each sample batch returned from this worker.
            batch_mode (str): One of the following batch modes:
                "truncate_episodes": Each call to sample() will return a batch
                    of at most `rollout_fragment_length * num_envs` in size.
                    The batch will be exactly
                    `rollout_fragment_length * num_envs` in size if
                    postprocessing does not change batch sizes. Episodes may be
                    truncated in order to meet this size requirement.
                "complete_episodes": Each call to sample() will return a batch
                    of at least `rollout_fragment_length * num_envs` in size.
                    Episodes will not be truncated, but multiple episodes may
                    be packed within one batch to meet the batch size. Note
                    that when `num_envs > 1`, episode steps will be buffered
                    until the episode completes, and hence batches may contain
                    significant amounts of off-policy data.
            episode_horizon (int): Horizon at which to stop episodes, if set.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations.
                They can be decompressed with rllib/utils/compression.
            num_envs (int): If more than one, will create multiple envs
                and vectorize the computation of actions. This has no effect
                if the env already implements VectorEnv.
            observation_fn (ObservationFunction): Optional multi-agent
                observation function.
            observation_filter (str): Name of observation filter to use.
            clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
                experience postprocessing. Setting to None means clip for Atari
                only.
            clip_actions (bool): Whether to clip action values to the range
                specified by the policy action space.
            env_config (EnvConfigDict): Config to pass to the env creator.
            model_config (ModelConfigDict): Config to use when creating the
                policy model.
            policy_config (TrainerConfigDict): Config to pass to the policy.
                In the multi-agent case, this config will be merged with the
                per-policy configs specified by `policy_spec`.
            worker_index (int): For remote workers, this should be set to a
                non-zero and unique value. This index is passed to created envs
                through EnvContext so that envs can be configured per worker.
            num_workers (int): For remote workers, how many workers altogether
                have been created?
            monitor_path (str): Write out episode stats and videos to this
                directory if specified.
            log_dir (str): Directory where logs can be placed.
            log_level (str): Set the root log level on creation.
            callbacks (DefaultCallbacks): Custom training callbacks.
            input_creator (Callable[[IOContext], InputReader]): Function that
                returns an InputReader object for loading previous generated
                experiences.
            input_evaluation (List[str]): How to evaluate the policy
                performance. This only makes sense to set when the input is
                reading offline data. The possible values include:
                  - "is": the step-wise importance sampling estimator.
                  - "wis": the weighted step-wise is estimator.
                  - "simulation": run the environment in the background, but
                    use this data for evaluation only and never for learning.
            output_creator (Callable[[IOContext], OutputWriter]): Function that
                returns an OutputWriter object for saving generated
                experiences.
            remote_worker_envs (bool): If using num_envs > 1, whether to create
                those new envs in remote processes instead of in the current
                process. This adds overheads, but can make sense if your envs
                are slow to step or reset.
            remote_env_batch_wait_ms (float): Timeout that remote workers
                wait when polling environments. 0 (continue when at
                least one env is ready) is a reasonable default, but optimal
                value could be obtained by measuring your environment
                step / reset and model inference perf.
            soft_horizon (bool): Calculate rewards but don't reset the
                environment when the horizon is hit.
            no_done_at_end (bool): Ignore the done=True at the end of the
                episode and instead record done=False.
            seed (int): Set the seed of both np and tf to this value to
                ensure each remote worker has unique exploration behavior.
            extra_python_environs (dict): Extra Python environment variables
                to set for this worker's process.
            fake_sampler (bool): Use a fake (inf speed) sampler for testing.
            spaces (Optional[Dict[PolicyID, Tuple[gym.spaces.Space,
                gym.spaces.Space]]]): An optional space dict mapping policy IDs
                to (obs_space, action_space)-tuples. This is used in case no
                Env is created on this RolloutWorker.
            policy: Deprecated arg. Use `policy_spec` instead.
        """
        # Deprecated arg.
        if policy is not None:
            deprecation_warning("policy", "policy_spec", error=False)
            policy_spec = policy
        assert policy_spec is not None, "Must provide `policy_spec` when " \
                                        "creating RolloutWorker!"

        self._original_kwargs: dict = locals().copy()
        del self._original_kwargs["self"]

        global _global_worker
        _global_worker = self

        # set extra environs first
        if extra_python_environs:
            for key, value in extra_python_environs.items():
                os.environ[key] = str(value)

        def gen_rollouts():
            while True:
                yield self.sample()

        ParallelIteratorWorker.__init__(self, gen_rollouts, False)

        policy_config: TrainerConfigDict = policy_config or {}
        if (tf1 and policy_config.get("framework") in ["tf2", "tfe"]
                # This eager check is necessary for certain all-framework tests
                # that use tf's eager_mode() context generator.
                and not tf1.executing_eagerly()):
            tf1.enable_eager_execution()

        if log_level:
            logging.getLogger("ray.rllib").setLevel(log_level)

        if worker_index > 1:
            disable_log_once_globally()  # only need 1 worker to log
        elif log_level == "DEBUG":
            enable_periodic_logging()

        env_context = EnvContext(env_config or {}, worker_index)
        self.env_context = env_context
        self.policy_config: TrainerConfigDict = policy_config
        if callbacks:
            self.callbacks: "DefaultCallbacks" = callbacks()
        else:
            from ray.rllib.agents.callbacks import DefaultCallbacks
            self.callbacks: "DefaultCallbacks" = DefaultCallbacks()
        self.worker_index: int = worker_index
        self.num_workers: int = num_workers
        model_config: ModelConfigDict = model_config or {}
        policy_mapping_fn = (policy_mapping_fn
                             or (lambda agent_id: DEFAULT_POLICY_ID))
        if not callable(policy_mapping_fn):
            raise ValueError("Policy mapping function not callable?")
        self.env_creator: Callable[[EnvContext], EnvType] = env_creator
        self.rollout_fragment_length: int = rollout_fragment_length * num_envs
        self.batch_mode: str = batch_mode
        self.compress_observations: bool = compress_observations
        self.preprocessing_enabled: bool = True
        self.last_batch: SampleBatchType = None
        self.global_vars: dict = None
        self.fake_sampler: bool = fake_sampler

        # No Env will be used in this particular worker (not needed).
        if worker_index == 0 and num_workers > 0 and \
                policy_config["create_env_on_driver"] is False:
            self.env = None
        # Create an env for this worker.
        else:
            self.env = _validate_env(env_creator(env_context))
            if validate_env is not None:
                validate_env(self.env, self.env_context)

            if isinstance(self.env, (BaseEnv, MultiAgentEnv)):

                def wrap(env):
                    return env  # we can't auto-wrap these env types

            elif is_atari(self.env) and \
                    not model_config.get("custom_preprocessor") and \
                    preprocessor_pref == "deepmind":

                # Deepmind wrappers already handle all preprocessing.
                self.preprocessing_enabled = False

                # If clip_rewards not explicitly set to False, switch it
                # on here (clip between -1.0 and 1.0).
                if clip_rewards is None:
                    clip_rewards = True

                def wrap(env):
                    env = wrap_deepmind(
                        env,
                        dim=model_config.get("dim"),
                        framestack=model_config.get("framestack"))
                    if monitor_path:
                        from gym import wrappers
                        env = wrappers.Monitor(env, monitor_path, resume=True)
                    return env
            else:

                def wrap(env):
                    if monitor_path:
                        from gym import wrappers
                        env = wrappers.Monitor(env, monitor_path, resume=True)
                    return env

            self.env: EnvType = wrap(self.env)

        def make_env(vector_index):
            return wrap(
                env_creator(
                    env_context.copy_with_overrides(
                        worker_index=worker_index,
                        vector_index=vector_index,
                        remote=remote_worker_envs)))

        self.make_env_fn = make_env

        self.tf_sess = None
        policy_dict = _validate_and_canonicalize(
            policy_spec, self.env, spaces=spaces)
        self.policies_to_train: List[PolicyID] = policies_to_train or list(
            policy_dict.keys())
        self.policy_map: Dict[PolicyID, Policy] = None
        self.preprocessors: Dict[PolicyID, Preprocessor] = None

        # set numpy and python seed
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)
            if not hasattr(self.env, "seed"):
                logger.info("Env doesn't support env.seed(): {}".format(
                    self.env))
            else:
                self.env.seed(seed)
            try:
                assert torch is not None
                torch.manual_seed(seed)
            except AssertionError:
                logger.info("Could not seed torch")
        if _has_tensorflow_graph(policy_dict) and not (
                tf1 and tf1.executing_eagerly()):
            if not tf1:
                raise ImportError("Could not import tensorflow")
            with tf1.Graph().as_default():
                if tf_session_creator:
                    self.tf_sess = tf_session_creator()
                else:
                    self.tf_sess = tf1.Session(
                        config=tf1.ConfigProto(
                            gpu_options=tf1.GPUOptions(allow_growth=True)))
                with self.tf_sess.as_default():
                    # set graph-level seed
                    if seed is not None:
                        tf1.set_random_seed(seed)
                    self.policy_map, self.preprocessors = \
                        self._build_policy_map(policy_dict, policy_config)
        else:
            self.policy_map, self.preprocessors = self._build_policy_map(
                policy_dict, policy_config)

        if (ray.is_initialized()
                and ray.worker._mode() != ray.worker.LOCAL_MODE):
            # Check available number of GPUs
            if not ray.get_gpu_ids():
                logger.debug(
                    "Creating policy evaluation worker {} on CPU (please "
                    "ignore any CUDA init errors)".format(worker_index))
            elif (policy_config["framework"] in ["tf2", "tf", "tfe"] and
                  not tf.config.experimental.list_physical_devices("GPU")) or \
                    (policy_config["framework"] == "torch" and
                     not torch.cuda.is_available()):
                raise RuntimeError(
                    "GPUs were assigned to this worker by Ray, but "
                    "your DL framework ({}) reports GPU acceleration is "
                    "disabled. This could be due to a bad CUDA- or {} "
                    "installation.".format(policy_config["framework"],
                                           policy_config["framework"]))

        self.multiagent: bool = set(
            self.policy_map.keys()) != {DEFAULT_POLICY_ID}
        if self.multiagent and self.env is not None:
            if not isinstance(
                    self.env,
                    (BaseEnv, MultiAgentEnv, ExternalMultiAgentEnv)):
                raise ValueError(
                    "Have multiple policies {}, but the env ".format(
                        self.policy_map) +
                    "{} is not a subclass of BaseEnv, MultiAgentEnv or "
                    "ExternalMultiAgentEnv?".format(self.env))

        self.filters: Dict[PolicyID, Filter] = {
            policy_id: get_filter(observation_filter,
                                  policy.observation_space.shape)
            for (policy_id, policy) in self.policy_map.items()
        }
        if self.worker_index == 0:
            logger.info("Built filter map: {}".format(self.filters))

        self.num_envs: int = num_envs

        if self.env is None:
            self.async_env = None
        elif "custom_vector_env" in policy_config:
            custom_vec_wrapper = policy_config["custom_vector_env"]
            self.async_env = custom_vec_wrapper(self.env)
        else:
            # Always use vector env for consistency even if num_envs = 1.
            self.async_env: BaseEnv = BaseEnv.to_base_env(
                self.env,
                make_env=make_env,
                num_envs=num_envs,
                remote_envs=remote_worker_envs,
                remote_env_batch_wait_ms=remote_env_batch_wait_ms)

        # `truncate_episodes`: Allow a batch to contain more than one episode
        # (fragments) and always make the batch `rollout_fragment_length`
        # long.
        if self.batch_mode == "truncate_episodes":
            pack = True
        # `complete_episodes`: Never cut episodes and sampler will return
        # exactly one (complete) episode per poll.
        elif self.batch_mode == "complete_episodes":
            rollout_fragment_length = float("inf")
            pack = False
        else:
            raise ValueError("Unsupported batch mode: {}".format(
                self.batch_mode))

        self.io_context: IOContext = IOContext(log_dir, policy_config,
                                               worker_index, self)
        self.reward_estimators: List[OffPolicyEstimator] = []
        for method in input_evaluation:
            if method == "simulation":
                logger.warning(
                    "Requested 'simulation' input evaluation method: "
                    "will discard all sampler outputs and keep only metrics.")
                sample_async = True
            elif method == "is":
                ise = ImportanceSamplingEstimator.create(self.io_context)
                self.reward_estimators.append(ise)
            elif method == "wis":
                wise = WeightedImportanceSamplingEstimator.create(
                    self.io_context)
                self.reward_estimators.append(wise)
            else:
                raise ValueError(
                    "Unknown evaluation method: {}".format(method))

        if self.env is None:
            self.sampler = None
        elif sample_async:
            self.sampler = AsyncSampler(
                worker=self,
                env=self.async_env,
                policies=self.policy_map,
                policy_mapping_fn=policy_mapping_fn,
                preprocessors=self.preprocessors,
                obs_filters=self.filters,
                clip_rewards=clip_rewards,
                rollout_fragment_length=rollout_fragment_length,
                callbacks=self.callbacks,
                horizon=episode_horizon,
                multiple_episodes_in_batch=pack,
                tf_sess=self.tf_sess,
                clip_actions=clip_actions,
                blackhole_outputs="simulation" in input_evaluation,
                soft_horizon=soft_horizon,
                no_done_at_end=no_done_at_end,
                observation_fn=observation_fn,
                _use_trajectory_view_api=policy_config.get(
                    "_use_trajectory_view_api", False))
            # Start the Sampler thread.
            self.sampler.start()
        else:
            self.sampler = SyncSampler(
                worker=self,
                env=self.async_env,
                policies=self.policy_map,
                policy_mapping_fn=policy_mapping_fn,
                preprocessors=self.preprocessors,
                obs_filters=self.filters,
                clip_rewards=clip_rewards,
                rollout_fragment_length=rollout_fragment_length,
                callbacks=self.callbacks,
                horizon=episode_horizon,
                multiple_episodes_in_batch=pack,
                tf_sess=self.tf_sess,
                clip_actions=clip_actions,
                soft_horizon=soft_horizon,
                no_done_at_end=no_done_at_end,
                observation_fn=observation_fn,
                _use_trajectory_view_api=policy_config.get(
                    "_use_trajectory_view_api", False))

        self.input_reader: InputReader = input_creator(self.io_context)
        self.output_writer: OutputWriter = output_creator(self.io_context)

        logger.debug(
            "Created rollout worker with env {} ({}), policies {}".format(
                self.async_env, self.env, self.policy_map))
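
The `input_creator` and `input_evaluation` arguments above are what switch this worker from live sampling to reading previously generated experiences and scoring the current policy on them. A minimal sketch, assuming `JsonReader` from `ray.rllib.offline`; the data path, `MyTFPolicy`, and the trimmed-down `policy_config` are illustrative assumptions (in practice the full Trainer config is passed).

import gym

from ray.rllib.offline import JsonReader

worker = RolloutWorker(
    env_creator=lambda ctx: gym.make("CartPole-v0"),
    policy_spec=MyTFPolicy,           # hypothetical Policy class
    rollout_fragment_length=50,
    # Read experiences from disk instead of the default sampler input.
    input_creator=lambda ioctx: JsonReader("/tmp/rllib-offline", ioctx),
    # Score the policy with the importance sampling ("is") and weighted
    # importance sampling ("wis") estimators over that offline data.
    input_evaluation=["is", "wis"],
    policy_config={"framework": "tf", "create_env_on_driver": True},
)
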
Esempio n. 25
0
    def __init__(self,
                 env_creator,
                 policy_graph,
                 policy_mapping_fn=None,
                 policies_to_train=None,
                 tf_session_creator=None,
                 batch_steps=100,
                 batch_mode="truncate_episodes",
                 episode_horizon=None,
                 preprocessor_pref="deepmind",
                 sample_async=False,
                 compress_observations=False,
                 num_envs=1,
                 observation_filter="NoFilter",
                 clip_rewards=None,
                 clip_actions=True,
                 env_config=None,
                 model_config=None,
                 policy_config=None,
                 worker_index=0,
                 monitor_path=None,
                 log_dir=None,
                 log_level=None,
                 callbacks=None,
                 input_creator=lambda ioctx: ioctx.default_sampler_input(),
                 input_evaluation_method=None,
                 output_creator=lambda ioctx: NoopOutput()):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                EnvContext wrapped configuration.
            policy_graph (class|dict): Either a class implementing
                PolicyGraph, or a dictionary of policy id strings to
                (PolicyGraph, obs_space, action_space, config) tuples. If a
                dict is specified, then we are in multi-agent mode and a
                policy_mapping_fn should also be set.
            policy_mapping_fn (func): A function that maps agent ids to
                policy ids in multi-agent mode. This function will be called
                each time a new agent appears in an episode, to bind that agent
                to a policy for the duration of the episode.
            policies_to_train (list): Optional whitelist of policies to train,
                or None for all policies.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to include
                in each sample batch returned from this evaluator.
            batch_mode (str): One of the following batch modes:
                "truncate_episodes": Each call to sample() will return a batch
                    of at most `batch_steps * num_envs` in size. The batch will
                    be exactly `batch_steps * num_envs` in size if
                    postprocessing does not change batch sizes. Episodes may be
                    truncated in order to meet this size requirement.
                "complete_episodes": Each call to sample() will return a batch
                    of at least `batch_steps * num_envs` in size. Episodes will
                    not be truncated, but multiple episodes may be packed
                    within one batch to meet the batch size. Note that when
                    `num_envs > 1`, episode steps will be buffered until the
                    episode completes, and hence batches may contain
                    significant amounts of off-policy data.
            episode_horizon (int): Horizon at which to stop episodes, if set.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously in
                the background, which improves throughput but can cause samples
                to be slightly off-policy.
            compress_observations (bool): If true, compress the observations.
                They can be decompressed with rllib/utils/compression.
            num_envs (int): If more than one, will create multiple envs
                and vectorize the computation of actions. This has no effect
                if the env already implements VectorEnv.
            observation_filter (str): Name of observation filter to use.
            clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
                experience postprocessing. Setting to None means clip for Atari
                only.
            clip_actions (bool): Whether to clip action values to the range
                specified by the policy action space.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy model.
            policy_config (dict): Config to pass to the policy. In the
                multi-agent case, this config will be merged with the
                per-policy configs specified by `policy_graph`.
            worker_index (int): For remote evaluators, this should be set to a
                non-zero and unique value. This index is passed to created envs
                through EnvContext so that envs can be configured per worker.
            monitor_path (str): Write out episode stats and videos to this
                directory if specified.
            log_dir (str): Directory where logs can be placed.
            log_level (str): Set the root log level on creation.
            callbacks (dict): Dict of custom debug callbacks.
            input_creator (func): Function that returns an InputReader object
                for loading previously generated experiences.
            input_evaluation_method (str): How to evaluate the current policy.
                This only applies when the input is reading offline data.
                Options are:
                  - None: don't evaluate the policy. The episode reward and
                    other metrics will be NaN.
                  - "simulation": run the environment in the background, but
                    use this data for evaluation only and never for learning.
            output_creator (func): Function that returns an OutputWriter object
                for saving generated experiences.
        """

        if log_level:
            logging.getLogger("ray.rllib").setLevel(log_level)

        env_context = EnvContext(env_config or {}, worker_index)
        policy_config = policy_config or {}
        self.policy_config = policy_config
        self.callbacks = callbacks or {}
        model_config = model_config or {}
        policy_mapping_fn = (policy_mapping_fn
                             or (lambda agent_id: DEFAULT_POLICY_ID))
        if not callable(policy_mapping_fn):
            raise ValueError(
                "Policy mapping function not callable. If you're using Tune, "
                "make sure to escape the function with tune.function() "
                "to prevent it from being evaluated as an expression.")
        self.env_creator = env_creator
        self.sample_batch_size = batch_steps * num_envs
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations
        self.preprocessing_enabled = True

        self.env = env_creator(env_context)
        if isinstance(self.env, MultiAgentEnv) or \
                isinstance(self.env, AsyncVectorEnv):

            def wrap(env):
                return env  # we can't auto-wrap these env types
        elif is_atari(self.env) and \
                not model_config.get("custom_preprocessor") and \
                preprocessor_pref == "deepmind":

            # Deepmind wrappers already handle all preprocessing
            self.preprocessing_enabled = False

            if clip_rewards is None:
                clip_rewards = True

            def wrap(env):
                env = wrap_deepmind(
                    env,
                    dim=model_config.get("dim"),
                    framestack=model_config.get("framestack"))
                if monitor_path:
                    env = _monitor(env, monitor_path)
                return env
        else:

            def wrap(env):
                if monitor_path:
                    env = _monitor(env, monitor_path)
                return env

        self.env = wrap(self.env)

        def make_env(vector_index):
            return wrap(
                env_creator(env_context.with_vector_index(vector_index)))

        self.tf_sess = None
        policy_dict = _validate_and_canonicalize(policy_graph, self.env)
        self.policies_to_train = policies_to_train or list(policy_dict.keys())
        if _has_tensorflow_graph(policy_dict):
            if (ray.worker._mode() != ray.worker.LOCAL_MODE
                    and not ray.get_gpu_ids()):
                logger.info("Creating policy evaluation worker {}".format(
                    worker_index) +
                            " on CPU (please ignore any CUDA init errors)")
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.tf_sess = tf_session_creator()
                else:
                    self.tf_sess = tf.Session(
                        config=tf.ConfigProto(
                            gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.tf_sess.as_default():
                    self.policy_map, self.preprocessors = \
                        self._build_policy_map(policy_dict, policy_config)
        else:
            self.policy_map, self.preprocessors = self._build_policy_map(
                policy_dict, policy_config)

        self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID}
        if self.multiagent:
            if not (isinstance(self.env, MultiAgentEnv)
                    or isinstance(self.env, AsyncVectorEnv)):
                raise ValueError(
                    "Have multiple policy graphs {}, but the env ".format(
                        self.policy_map) +
                    "{} is not a subclass of MultiAgentEnv?".format(self.env))

        self.filters = {
            policy_id: get_filter(observation_filter,
                                  policy.observation_space.shape)
            for (policy_id, policy) in self.policy_map.items()
        }

        # Always use vector env for consistency even if num_envs = 1
        self.async_env = AsyncVectorEnv.wrap_async(
            self.env, make_env=make_env, num_envs=num_envs)
        self.num_envs = num_envs

        if self.batch_mode == "truncate_episodes":
            unroll_length = batch_steps
            pack_episodes = True
        elif self.batch_mode == "complete_episodes":
            unroll_length = float("inf")  # never cut episodes
            pack_episodes = False  # sampler will return 1 episode per poll
        else:
            raise ValueError("Unsupported batch mode: {}".format(
                self.batch_mode))

        if input_evaluation_method == "simulation":
            logger.warning(
                "Requested 'simulation' input evaluation method: "
                "will discard all sampler outputs and keep only metrics.")
            sample_async = True
        elif input_evaluation_method is None:
            pass
        else:
            raise ValueError("Unknown evaluation method: {}".format(
                input_evaluation_method))

        if sample_async:
            self.sampler = AsyncSampler(
                self.async_env,
                self.policy_map,
                policy_mapping_fn,
                self.preprocessors,
                self.filters,
                clip_rewards,
                unroll_length,
                self.callbacks,
                horizon=episode_horizon,
                pack=pack_episodes,
                tf_sess=self.tf_sess,
                clip_actions=clip_actions,
                blackhole_outputs=input_evaluation_method == "simulation")
            self.sampler.start()
        else:
            self.sampler = SyncSampler(
                self.async_env,
                self.policy_map,
                policy_mapping_fn,
                self.preprocessors,
                self.filters,
                clip_rewards,
                unroll_length,
                self.callbacks,
                horizon=episode_horizon,
                pack=pack_episodes,
                tf_sess=self.tf_sess,
                clip_actions=clip_actions)

        self.io_context = IOContext(log_dir, policy_config, worker_index, self)
        self.input_reader = input_creator(self.io_context)
        assert isinstance(self.input_reader, InputReader), self.input_reader
        self.output_writer = output_creator(self.io_context)
        assert isinstance(self.output_writer, OutputWriter), self.output_writer

        logger.debug("Created evaluator with env {} ({}), policies {}".format(
            self.async_env, self.env, self.policy_map))
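
For reference, a minimal usage sketch of the evaluator above. The class and import paths are assumptions for this era of RLlib (exported as ray.rllib.evaluation.PolicyEvaluator, with PGPolicyGraph at the path shown); exact names depend on the Ray version, so treat this as illustrative rather than the canonical API.

import gym
import ray
from ray.rllib.evaluation import PolicyEvaluator
from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph

ray.init()

# Build a single-policy evaluator over CartPole and pull one sample batch.
evaluator = PolicyEvaluator(
    env_creator=lambda ctx: gym.make("CartPole-v0"),
    policy_graph=PGPolicyGraph,
    batch_steps=100,
    batch_mode="truncate_episodes")
batch = evaluator.sample()  # SampleBatch of roughly batch_steps transitions
print(batch.count)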
Example n. 26
0
    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter("has_inf_or_nan",
                                        tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(name="newkl",
                                       shape=(),
                                       dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(tf.float32,
                                           shape=(None, ) +
                                           self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None, ))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None, ))

        action_space = self.env.action_space
        # TODO(rliaw): pull this into model_catalog
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(tf.float32,
                                          shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None, ))
        else:
            raise NotImplementedError("action space " +
                                      str(type(action_space)) +
                                      " currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(tf.float32,
                                          shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, ))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by the number of devices"
        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(self.env.observation_space,
                                      self.env.action_space, obs, vtargets,
                                      advs, acts, plog, pvf_preds,
                                      self.logit_dim, self.kl_coeff,
                                      self.distribution_class, self.config,
                                      self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]), self.devices,
            [
                self.observations, self.value_targets, self.advantages,
                self.actions, self.prev_logits, self.prev_vf_preds
            ], self.per_device_batch_size, build_loss, self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(
                    values=[policy.mean_policy_loss for policy in policies]),
                0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[policy.mean_vf_loss for policy in policies]),
                0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[policy.mean_entropy for policy in policies]),
                0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(config["observation_filter"],
                                     self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(self.env, self.common_policy,
                                   self.obs_filter, self.config["horizon"],
                                   self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())
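
The kl_coeff placeholder fed into ProximalPolicyLoss above is the penalty weight of KL-penalized PPO. The adaptive rule that adjusts it between SGD rounds is not part of this snippet; a minimal sketch of that rule, assuming a target KL of 0.01 and the conventional thresholds and multipliers, is:

def update_kl_coeff(kl_coeff, sampled_kl, kl_target=0.01):
    # Standard adaptive-KL heuristic: raise the penalty when the measured
    # KL divergence between the old and new policies overshoots the target,
    # and relax it when the policy barely moved. The 2x/0.5x thresholds and
    # 1.5x/0.5x multipliers are the conventional choices.
    if sampled_kl > 2.0 * kl_target:
        kl_coeff *= 1.5
    elif sampled_kl < 0.5 * kl_target:
        kl_coeff *= 0.5
    return kl_coeff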
Example n. 27
0
def generate_policies(
    policy_id: str,
    policy_constructor_tuple: Tuple["PolicyClass", "gym.Space", "gym.Space",
                                    dict],
    policies: Dict[str, TFPolicy],
    policies_to_train: List[str],
    dead_policies: Set[str],
    policy_config: dict,
    preprocessors: Dict[str, Any],
    obs_filters: Dict[str, Any],
    observation_filter: str,
    tf_sess,
):
    """
    Get policies for each ``agent_id``, and instantiate new ones
    for newly created agents.
    """

    policy_cls, obs_space, act_space, conf = policy_constructor_tuple

    if (policy_id in preprocessors) != (policy_id in policies):
        raise ValueError("'preprocessors' and 'policies' do not agree.")
    if (policy_id in obs_filters) != (policy_id in policies):
        raise ValueError("'obs_filters' and 'policies' do not agree.")

    # If we haven't seen this id, we instantiate a new policy.
    if policy_id not in policies:

        # We assume configs are homogeneous.
        # Use a dead policy for this new agent.
        if dead_policies:
            dead_policy_id = dead_policies.pop()
            dead_preprocessor = preprocessors.pop(dead_policy_id)
            dead_obs_space = dead_preprocessor.observation_space
            dead_policy = policies.pop(dead_policy_id)
            dead_obs_filter = obs_filters.pop(dead_policy_id)

            start = time.time()
            # Run variable initializer ops, assuming tf model.
            trainable_model_variables = dead_policy.model.trainable_variables()
            sess = dead_policy.get_session()
            sess.run([var.initializer for var in trainable_model_variables])

            preprocessors[policy_id] = dead_preprocessor
            policies[policy_id] = dead_policy
            obs_filters[policy_id] = dead_obs_filter

            policies_to_train.append(policy_id)
            # DEBUG
            print("sampler.py: Reinitializing dead model: %fs" %
                  (time.time() - start))
        else:
            merged_conf = merge_dicts(policy_config, conf)

            # We assume ``self.preprocessing_enabled == True`` in ``RolloutWorker``.
            preprocessor = ModelCatalog.get_preprocessor_for_space(
                obs_space, merged_conf.get("model"))
            preprocessors[policy_id] = preprocessor
            obs_space = preprocessor.observation_space

            if tf and tf.executing_eagerly():
                if hasattr(policy_cls, "as_eager"):
                    policy_cls = policy_cls.as_eager()
                    if policy_config["eager_tracing"]:
                        policy_cls = policy_cls.with_tracing()
                elif not issubclass(policy_cls, TFPolicy):
                    pass  # could be some other type of policy
                else:
                    raise ValueError("This policy does not support eager "
                                     "execution: {}".format(policy_cls))

            if tf:
                # TODO: Is this necessary? Yes.
                with tf.variable_scope(policy_id):
                    # DEBUG
                    print("sampler.py: Default graph:", tf.get_default_graph())
                    print("sampler.py: Calling policy init.")
                    start = time.time()
                    policies[policy_id] = policy_cls(obs_space, act_space,
                                                     merged_conf)
                # DEBUG
                print("sampler.py: Done policy init: %fs" %
                      (time.time() - start))
                policies_to_train.append(policy_id)
            else:
                policies[policy_id] = policy_cls(obs_space, act_space,
                                                 merged_conf)
                policies_to_train.append(policy_id)

            # DEBUG
            # print("sampler.py: Getting new filter.")
            obs_filters[policy_id] = get_filter(observation_filter,
                                                obs_space.shape)
            # DEBUG
            # print("sampler.py: Got new filter.")
    return policies, preprocessors, obs_filters, policies_to_train, dead_policies
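
A hypothetical call site for generate_policies, showing how the bookkeeping dicts flow in and out. MyTFPolicy, obs_space, act_space, and the surrounding dicts are illustrative assumptions; "NoFilter" is RLlib's pass-through observation filter.

# The dicts are mutated in place and also returned, so the call can be
# chained inside a sampling loop as new agents appear.
policies, preprocessors, obs_filters, policies_to_train, dead_policies = \
    generate_policies(
        policy_id="agent_7",
        policy_constructor_tuple=(MyTFPolicy, obs_space, act_space, {}),
        policies=policies,
        policies_to_train=policies_to_train,
        dead_policies=dead_policies,
        policy_config={"eager_tracing": False},
        preprocessors=preprocessors,
        obs_filters=obs_filters,
        observation_filter="NoFilter",
        tf_sess=None,
    )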