Code example #1
    def testCustomActionDistribution(self):
        ray.init()
        # registration
        ModelCatalog.register_custom_action_dist("test",
                                                 CustomActionDistribution)
        action_space = Box(0, 1, shape=(5, 3), dtype=np.float32)

        # test retrieving it
        model_config = MODEL_DEFAULTS.copy()
        model_config["custom_action_dist"] = "test"
        dist_cls, param_shape = ModelCatalog.get_action_dist(
            action_space, model_config)
        self.assertEqual(str(dist_cls), str(CustomActionDistribution))
        self.assertEqual(param_shape, action_space.shape)

        # test the class works as a distribution
        dist_input = tf.placeholder(tf.float32, (None, ) + param_shape)
        dist = dist_cls(dist_input, model_config=model_config)
        self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
        self.assertIsInstance(dist.sample(), tf.Tensor)
        with self.assertRaises(NotImplementedError):
            dist.entropy()

        # test passing the options to it
        model_config["custom_options"].update({"output_dim": (3, )})
        dist_cls, param_shape = ModelCatalog.get_action_dist(
            action_space, model_config)
        self.assertEqual(param_shape, (3, ))
        dist_input = tf.placeholder(tf.float32, (None, ) + param_shape)
        dist = dist_cls(dist_input, model_config=model_config)
        self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
        self.assertIsInstance(dist.sample(), tf.Tensor)
        with self.assertRaises(NotImplementedError):
            dist.entropy()
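The CustomActionDistribution class registered above is defined elsewhere in the test module and is not shown on this page. A minimal sketch of what such a class might look like, assuming the TF action-distribution base class (exact import path and constructor signature vary across Ray versions): required_model_output_shape honors an optional "output_dim" in the custom options, sampling is uniform over the input shape, and entropy() is left to the base class so it raises NotImplementedError, matching the assertions in the test.

import tensorflow as tf

from ray.rllib.models.tf.tf_action_dist import TFActionDistribution


class CustomActionDistribution(TFActionDistribution):
    """Hypothetical custom distribution matching the test's expectations."""

    @staticmethod
    def required_model_output_shape(action_space, model_config):
        # Prefer an explicit "output_dim" from the custom options,
        # otherwise fall back to the raw action-space shape.
        custom_options = model_config.get("custom_options") or {}
        return custom_options.get("output_dim", action_space.shape)

    def _build_sample_op(self):
        # Draw a uniform sample with the same shape as the inputs;
        # entropy() is not overridden, so it still raises NotImplementedError.
        return tf.random.uniform(tf.shape(self.inputs))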
Code example #2
File: policies.py Project: robertnishihara/ray
    def __init__(self,
                 sess,
                 action_space,
                 obs_space,
                 preprocessor,
                 observation_filter,
                 model_config,
                 action_noise_std=0.0):
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, model_config, dist_type="deterministic")

        model = ModelCatalog.get_model({
            "obs": self.inputs
        }, obs_space, action_space, dist_dim, model_config)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Code example #3
    def __init__(self, state_values, cumulative_rewards, logits, actions,
                 action_space, beta):
        ma_adv_norm = tf.get_variable(
            name="moving_average_of_advantage_norm",
            dtype=tf.float32,
            initializer=100.0,
            trainable=False)
        # advantage estimation
        adv = cumulative_rewards - state_values
        # update averaged advantage norm
        update_adv_norm = tf.assign_add(
            ref=ma_adv_norm,
            value=1e-6 * (tf.reduce_mean(tf.square(adv)) - ma_adv_norm))

        # exponentially weighted advantages
        with tf.control_dependencies([update_adv_norm]):
            exp_advs = tf.exp(
                beta * tf.divide(adv, 1e-8 + tf.sqrt(ma_adv_norm)))

        # log\pi_\theta(a|s)
        dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
        action_dist = dist_cls(logits)
        logprobs = action_dist.logp(actions)

        self.loss = -1.0 * tf.reduce_mean(
            tf.stop_gradient(exp_advs) * logprobs)
Code example #4
    def __init__(self, obs_space, action_space, config):
        self.action_space = action_space
        self.action_noise_std = config["action_noise_std"]
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(obs_space)
        self.observation_filter = get_filter(config["observation_filter"],
                                             self.preprocessor.shape)
        self.single_threaded = config.get("single_threaded", False)
        self.sess = make_session(single_threaded=self.single_threaded)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, config["model"], dist_type="deterministic")
        model = ModelCatalog.get_model({SampleBatch.CUR_OBS: self.inputs},
                                       obs_space, action_space, dist_dim,
                                       config["model"])
        dist = dist_class(model.outputs, model)
        self.sampler = dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Code example #5
File: policies.py Project: vliviu/ray
    def __init__(self, sess, action_space, preprocessor, observation_filter,
                 action_noise_std):
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor

        if observation_filter == "MeanStdFilter":
            self.observation_filter = MeanStdFilter(self.preprocessor.shape,
                                                    clip=None)
        elif observation_filter == "NoFilter":
            self.observation_filter = NoFilter()
        else:
            raise Exception("Unknown observation_filter: " +
                            str("observation_filter"))

        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, dist_type="deterministic")
        model = ModelCatalog.get_model(self.inputs, dist_dim)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum([
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items()
        ])
        self.sess.run(tf.global_variables_initializer())
Code example #6
    def test_custom_multi_action_distribution(self):
        class Model:
            pass

        ray.init(
            object_store_memory=1000 * 1024 * 1024, ignore_reinit_error=True
        )  # otherwise fails sometimes locally
        # registration
        ModelCatalog.register_custom_action_dist("test", CustomMultiActionDistribution)
        s1 = Discrete(5)
        s2 = Box(0, 1, shape=(3,), dtype=np.float32)
        spaces = dict(action_1=s1, action_2=s2)
        action_space = Dict(spaces)
        # test retrieving it
        model_config = MODEL_DEFAULTS.copy()
        model_config["custom_action_dist"] = "test"
        dist_cls, param_shape = ModelCatalog.get_action_dist(action_space, model_config)
        self.assertIsInstance(dist_cls, partial)
        self.assertEqual(param_shape, s1.n + 2 * s2.shape[0])

        # test the class works as a distribution
        dist_input = tf1.placeholder(tf.float32, (None, param_shape))
        model = Model()
        model.model_config = model_config
        dist = dist_cls(dist_input, model=model)
        self.assertIsInstance(dist.sample(), dict)
        self.assertIn("action_1", dist.sample())
        self.assertIn("action_2", dist.sample())
        self.assertEqual(dist.sample()["action_1"].dtype, tf.int64)
        self.assertEqual(dist.sample()["action_2"].shape[1:], s2.shape)

        with self.assertRaises(NotImplementedError):
            dist.entropy()
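As with example #1, CustomMultiActionDistribution is defined outside the snippet. A plausible minimal sketch, assuming it extends RLlib's MultiActionDistribution: the catalog flattens the Dict space into s1.n logits for the Discrete part plus mean and log-std for each Box dimension (hence the expected s1.n + 2 * s2.shape[0] parameters) and returns the class pre-bound in a functools.partial, which is why the test checks isinstance(dist_cls, partial).

from ray.rllib.models.tf.tf_action_dist import MultiActionDistribution
from ray.rllib.utils.annotations import override


class CustomMultiActionDistribution(MultiActionDistribution):
    """Hypothetical multi-action distribution for the Dict space above."""

    @override(MultiActionDistribution)
    def entropy(self):
        # Deliberately unimplemented so the assertRaises(NotImplementedError)
        # check at the end of the test passes.
        raise NotImplementedError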
Code example #7
File: policies.py Project: vtpp2014/ray
    def _initialize(self, ob_space, ac_space, preprocessor, ac_noise_std):
        self.ac_space = ac_space
        self.ac_noise_std = ac_noise_std
        self.preprocessor_shape = preprocessor.transform_shape(ob_space.shape)

        with tf.variable_scope(type(self).__name__) as scope:
            # Observation normalization.
            ob_mean = tf.get_variable(
                'ob_mean', self.preprocessor_shape, tf.float32,
                tf.constant_initializer(np.nan), trainable=False)
            ob_std = tf.get_variable(
                'ob_std', self.preprocessor_shape, tf.float32,
                tf.constant_initializer(np.nan), trainable=False)
            in_mean = tf.placeholder(tf.float32, self.preprocessor_shape)
            in_std = tf.placeholder(tf.float32, self.preprocessor_shape)
            self._set_ob_mean_std = U.function([in_mean, in_std], [], updates=[
                tf.assign(ob_mean, in_mean),
                tf.assign(ob_std, in_std),
            ])

            inputs = tf.placeholder(
                tf.float32, [None] + list(self.preprocessor_shape))

            # TODO(ekl): we should do clipping in a standard RLlib preprocessor
            clipped_inputs = tf.clip_by_value(
                (inputs - ob_mean) / ob_std, -5.0, 5.0)

            # Policy network.
            dist_class, dist_dim = ModelCatalog.get_action_dist(
                self.ac_space, dist_type='deterministic')
            model = ModelCatalog.get_model(clipped_inputs, dist_dim)
            dist = dist_class(model.outputs)
            self._act = U.function([inputs], dist.sample())
        return scope
Code example #8
    def __init__(self, registry, sess, action_space, preprocessor,
                 observation_filter):
        self.sess = sess
        self.action_space = action_space
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(
            observation_filter, self.preprocessor.shape)
        self.inputs = tf.placeholder(
            tf.float32, [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, dist_type="deterministic")
        model = ModelCatalog.get_model(registry, self.inputs, dist_dim,
                                       options={"fcnet_hiddens": [32, 32]})
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum([np.prod(variable.shape.as_list())
                               for _, variable
                               in self.variables.variables.items()])
        self.sess.run(tf.global_variables_initializer())
Code example #9
File: policies.py Project: zbarry/ray
    def __init__(self,
                 sess,
                 action_space,
                 obs_space,
                 preprocessor,
                 observation_filter,
                 model_config,
                 action_noise_std=0.0):
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, model_config, dist_type="deterministic")

        model = ModelCatalog.get_model({
            "obs": self.inputs
        }, obs_space, dist_dim, model_config)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Code example #10
    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(name="newkl",
                                       shape=(),
                                       dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(tf.float32,
                                           shape=(None, ) +
                                           self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None, ))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None, ))

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space, config["model"])
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(tf.float32,
                                          shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, ))

        self.inputs = [("obs", self.observations),
                       ("value_targets", self.value_targets),
                       ("advantages", self.advantages),
                       ("actions", self.actions),
                       ("logprobs", self.prev_logits),
                       ("vf_preds", self.prev_vf_preds)]
        self.common_policy = self.build_tf_loss([ph for _, ph in self.inputs])

        # References to the model weights
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(config["observation_filter"],
                                     self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(self.env, self.common_policy,
                                   self.obs_filter, self.config["horizon"],
                                   self.config["horizon"])
Code example #11
File: marwil_policy.py Project: x-malet/ray
    def __init__(self, state_values, cumulative_rewards, logits, actions,
                 action_space, beta):
        ma_adv_norm = tf.get_variable(name="moving_average_of_advantage_norm",
                                      dtype=tf.float32,
                                      initializer=100.0,
                                      trainable=False)
        # advantage estimation
        adv = cumulative_rewards - state_values
        # update averaged advantage norm
        update_adv_norm = tf.assign_add(
            ref=ma_adv_norm,
            value=1e-6 * (tf.reduce_mean(tf.square(adv)) - ma_adv_norm))

        # exponentially weighted advantages
        with tf.control_dependencies([update_adv_norm]):
            exp_advs = tf.exp(beta *
                              tf.divide(adv, 1e-8 + tf.sqrt(ma_adv_norm)))

        # log\pi_\theta(a|s)
        dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
        action_dist = dist_cls(logits)
        logprobs = action_dist.logp(actions)

        self.loss = -1.0 * tf.reduce_mean(
            tf.stop_gradient(exp_advs) * logprobs)
Code example #12
File: sac_tf_policy.py Project: marload/ray
def _get_dist_class(policy: Policy,
                    config: TrainerConfigDict,
                    action_space: gym.spaces.Space) -> \
        Type[TFActionDistribution]:
    """Helper function to return a dist class based on config and action space.

    Args:
        policy (Policy): The policy for which to return the action
            dist class.
        config (TrainerConfigDict): The Trainer's config dict.
        action_space (gym.spaces.Space): The action space used.

    Returns:
        Type[TFActionDistribution]: A TF distribution class.
    """
    if hasattr(policy, "dist_class") and policy.dist_class is not None:
        return policy.dist_class
    elif config["model"].get("custom_action_dist"):
        action_dist_class, _ = ModelCatalog.get_action_dist(action_space,
                                                            config["model"],
                                                            framework="tf")
        return action_dist_class
    elif isinstance(action_space, Discrete):
        return Categorical
    elif isinstance(action_space, Simplex):
        return Dirichlet
    else:
        assert isinstance(action_space, Box)
        if config["normalize_actions"]:
            return SquashedGaussian if \
                not config["_use_beta_distribution"] else Beta
        else:
            return DiagGaussian
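A hedged usage illustration, not part of the original file: once the helper above has picked a distribution class, the SAC policy would typically instantiate it from the model's policy-head outputs. The names dist_inputs and model are assumed to exist in the surrounding policy-building code.

# Assumed wiring: choose the class once, then build the distribution
# from the current policy-head outputs.
dist_class = _get_dist_class(policy, config, action_space)
action_dist = dist_class(dist_inputs, model)
actions = action_dist.sample()       # stochastic actions
log_pis = action_dist.logp(actions)  # per-action log-probabilities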
Code example #13
    def __init__(self, obs_space, action_space, config):
        super().__init__(obs_space, action_space, config)
        self.action_noise_std = self.config["action_noise_std"]
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(
            self.observation_space)
        self.observation_filter = get_filter(self.config["observation_filter"],
                                             self.preprocessor.shape)

        self.single_threaded = self.config.get("single_threaded", False)
        if self.config["framework"] == "tf":
            self.sess = make_session(single_threaded=self.single_threaded)

            # Set graph-level seed.
            if config.get("seed") is not None:
                with self.sess.as_default():
                    tf1.set_random_seed(config["seed"])

            self.inputs = tf1.placeholder(tf.float32, [None] +
                                          list(self.preprocessor.shape))
        else:
            if not tf1.executing_eagerly():
                tf1.enable_eager_execution()
            self.sess = self.inputs = None
            if config.get("seed") is not None:
                # Tf2.x.
                if config.get("framework") == "tf2":
                    tf.random.set_seed(config["seed"])
                # Tf-eager.
                elif tf1 and config.get("framework") == "tfe":
                    tf1.set_random_seed(config["seed"])

        # Policy network.
        self.dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, self.config["model"], dist_type="deterministic")

        self.model = ModelCatalog.get_model_v2(
            obs_space=self.preprocessor.observation_space,
            action_space=self.action_space,
            num_outputs=dist_dim,
            model_config=self.config["model"],
        )

        self.sampler = None
        if self.sess:
            dist_inputs, _ = self.model({SampleBatch.CUR_OBS: self.inputs})
            dist = self.dist_class(dist_inputs, self.model)
            self.sampler = dist.sample()
            self.variables = ray.experimental.tf_utils.TensorFlowVariables(
                dist_inputs, self.sess)
            self.sess.run(tf1.global_variables_initializer())
        else:
            self.variables = ray.experimental.tf_utils.TensorFlowVariables(
                [], None, self.model.variables())

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
Code example #14
    def __init__(self, inputs, model, action_space, name):
        child_dist = []
        input_lens = []
        for action in action_space.spaces:
            dist, action_size = ModelCatalog.get_action_dist(action, {})
            child_dist.append(dist)
            input_lens.append(action_size)
        super().__init__(inputs, model, action_space, child_dist, input_lens)
        with tf.variable_scope(name):
            self.entropy_list = [s.entropy() for s in self.child_distributions]
Code example #15
    def test_custom_action_distribution(self):
        class Model():
            pass

        ray.init(object_store_memory=1000 * 1024 * 1024,
                 ignore_reinit_error=True)  # otherwise fails sometimes locally
        # registration
        ModelCatalog.register_custom_action_dist("test",
                                                 CustomActionDistribution)
        action_space = Box(0, 1, shape=(5, 3), dtype=np.float32)

        # test retrieving it
        model_config = MODEL_DEFAULTS.copy()
        model_config["custom_action_dist"] = "test"
        dist_cls, param_shape = ModelCatalog.get_action_dist(
            action_space, model_config)
        self.assertEqual(str(dist_cls), str(CustomActionDistribution))
        self.assertEqual(param_shape, action_space.shape)

        # test the class works as a distribution
        dist_input = tf1.placeholder(tf.float32, (None, ) + param_shape)
        model = Model()
        model.model_config = model_config
        dist = dist_cls(dist_input, model=model)
        self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
        self.assertIsInstance(dist.sample(), tf.Tensor)
        with self.assertRaises(NotImplementedError):
            dist.entropy()

        # test passing the options to it
        model_config["custom_model_config"].update({"output_dim": (3, )})
        dist_cls, param_shape = ModelCatalog.get_action_dist(
            action_space, model_config)
        self.assertEqual(param_shape, (3, ))
        dist_input = tf1.placeholder(tf.float32, (None, ) + param_shape)
        model.model_config = model_config
        dist = dist_cls(dist_input, model=model)
        self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
        self.assertIsInstance(dist.sample(), tf.Tensor)
        with self.assertRaises(NotImplementedError):
            dist.entropy()
Code example #16
def build_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(action_space, config["model"])

    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=POLICY_SCOPE,
        framework="tf",
    )

    return policy.model
Code example #17
    def _init(self, config, env_creator):
        self.env = env_creator(config["env_config"])
        self.state = {}
        self._policy = ImitationTFPolicy
        action_space = self.env.action_space
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        self.workers = self._make_workers(env_creator, self._policy, config,
                                          self.config["num_workers"])
        self.execution_plan = default_execution_plan
        # self.train_exec_impl = self.execution_plan(self.workers, config)
        self.train_exec_impl = None
        self.optimizer = ImitationMetrics(self.workers)
Code example #18
File: a2oc_policy.py Project: 7thStringofZhef/ray
def option_critic_make_model_and_action_dist(policy, obs_space, action_space, config):
    # Basic distribution class should be fine as long as I input the logits corresponding to the correct option
    dist_class, _ = ModelCatalog.get_action_dist(
        action_space,
        config,
        framework="torch"
    )
    # option critic vision network. May want to revise to register this as a custom model, then grab it
    model = OptionCriticVisionNetwork(
        obs_space,
        action_space,
        action_space.n,
        config, 
        'test')
    return model, dist_class
Code example #19
def get_distribution_inputs_and_class(policy,
                                      model,
                                      obs_batch,
                                      *,
                                      explore=True,
                                      is_training=False,
                                      **kwargs):
    model_out, _ = model({
        "obs": obs_batch,
        "is_training": is_training,
    }, [], None)
    dist_inputs = model.get_policy_output(model_out)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        model.action_space, policy.config["model"], framework="torch")
    return dist_inputs, dist_class, []  # []=state out
Code example #20
def make_mu_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"], framework="torch")
    
    base_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=logit_dim,
        model_config=config["model"],
        framework="torch")
    
    mu_model = MuZeroModel(obs_space, action_space, logit_dim, config["model"], name="MuZeroModel",
                           base_model=base_model)
    
    return mu_model
Code example #21
    def __init__(self, obs_space, action_space, config):
        """Target Network is updated by the master learner every
        trainer.update_target_frequency steps. All worker batches
        are importance sampled w.r. to the target network to ensure
        a more stable pi_old in PPO.
        """
        assert config[DELAY_UPDATE]
        _, logit_dim = ModelCatalog.get_action_dist(action_space,
                                                    config["model"])
        self.target_model = ModelCatalog.get_model_v2(obs_space,
                                                      action_space,
                                                      logit_dim,
                                                      config["model"],
                                                      name=TARGET_POLICY_SCOPE,
                                                      framework="tf")

        self.model_vars = self.model.variables()
        self.target_model_vars = self.target_model.variables()

        self.get_session().run(tf.initialize_variables(self.target_model_vars))

        self.tau_value = config.get("tau")
        self.tau = tf.placeholder(tf.float32, (), name="tau")
        assign_ops = []
        assert len(self.model_vars) == len(self.target_model_vars)
        for var, var_target in zip(self.model_vars, self.target_model_vars):
            assign_ops.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
        self.update_target_expr = tf.group(*assign_ops)

        @make_tf_callable(self.get_session(), True)
        def compute_clone_network_logits(ob):
            # def compute_clone_network_logits(ob, prev_action, prev_reward):
            # We do not support recurrent network now.
            feed_dict = {
                SampleBatch.CUR_OBS: tf.convert_to_tensor(ob),
                # SampleBatch.PREV_REWARDS: tf.convert_to_tensor(
                #     prev_reward),
                "is_training": tf.convert_to_tensor(False)
            }
            # if prev_action is not None:
            #     feed_dict[SampleBatch.PREV_ACTIONS] = tf.convert_to_tensor(
            #         prev_action)
            model_out, _ = self.target_model(feed_dict)
            return model_out

        self._compute_clone_network_logits = compute_clone_network_logits
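Neither this example nor example #23 shows how the soft-update ops built above are actually executed; a minimal sketch, assuming a method on the same policy class that feeds the tau placeholder:

    def update_target(self, tau=None):
        # Run the assign ops, feeding either an explicit tau or the default
        # value read from the config at construction time.
        tau = tau if tau is not None else self.tau_value
        self.get_session().run(self.update_target_expr,
                               feed_dict={self.tau: tau})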
Code example #22
File: es_torch_policy.py Project: tuyulers5/jav44
def make_model_and_action_dist(policy, observation_space, action_space,
                               config):
    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        action_space,
        config["model"],  # model_options
        dist_type="deterministic",
        framework="torch")
    model = ModelCatalog.get_model_v2(policy.preprocessor.observation_space,
                                      action_space,
                                      num_outputs=dist_dim,
                                      model_config=config["model"],
                                      framework="torch")
    # Make all model params not require any gradients.
    for p in model.parameters():
        p.requires_grad = False
    return model, dist_class
Code example #23
    def __init__(self, obs_space, action_space, config):
        assert config[DELAY_UPDATE]

        # Build the target network of this policy.
        _, logit_dim = ModelCatalog.get_action_dist(
            action_space, config["model"]
        )
        self.target_model = ModelCatalog.get_model_v2(
            obs_space,
            action_space,
            logit_dim,
            config["model"],
            name="target_func",
            framework="tf"
        )
        self.model_vars = self.model.variables()
        self.target_model_vars = self.target_model.variables()

        self.get_session().run(
            tf.variables_initializer(self.target_model_vars))

        # Here is the delayed update mechanism.
        self.tau_value = config.get("tau")
        self.tau = tf.placeholder(tf.float32, (), name="tau")
        assign_ops = []
        assert len(self.model_vars) == len(self.target_model_vars)
        for var, var_target in zip(self.model_vars, self.target_model_vars):
            assign_ops.append(
                var_target.
                    assign(self.tau * var + (1.0 - self.tau) * var_target)
            )
        self.update_target_expr = tf.group(*assign_ops)

        @make_tf_callable(self.get_session(), True)
        def compute_clone_network_logits(ob):
            feed_dict = {
                SampleBatch.CUR_OBS: tf.convert_to_tensor(ob),
                "is_training": tf.convert_to_tensor(False)
            }
            model_out, _ = self.target_model(feed_dict)
            return model_out

        self._compute_clone_network_logits = compute_clone_network_logits
Code example #24
File: nomad_model.py Project: ndalton12/AMPED
def make_nomad_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(action_space,
                                                config["model"],
                                                framework="torch")

    base_model = ModelCatalog.get_model_v2(obs_space=obs_space,
                                           action_space=action_space,
                                           num_outputs=logit_dim,
                                           model_config=config["model"],
                                           framework="torch")

    nomad_model = NomadModel(obs_space,
                             action_space,
                             logit_dim,
                             config["model"],
                             name="NomadModel",
                             base_model=base_model,
                             order=config["mcts_param"]["order"])

    return nomad_model
Code example #25
def build_appo_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(action_space, config["model"])

    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=POLICY_SCOPE,
        framework="torch" if config["use_pytorch"] else "tf")
    policy.model_variables = policy.model.variables()

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=TARGET_POLICY_SCOPE,
        framework="torch" if config["use_pytorch"] else "tf")
    policy.target_model_variables = policy.target_model.variables()

    return policy.model
Code example #26
File: policies.py Project: yuanfeng0905/ray
    def __init__(self,
                 sess,
                 action_space,
                 preprocessor,
                 observation_filter,
                 action_noise_std,
                 options={}):

        if len(preprocessor.shape) > 1:
            raise UnsupportedSpaceException(
                "Observation space {} is not supported with ARS.".format(
                    preprocessor.shape))

        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, dist_type="deterministic")

        model = ModelCatalog.get_model(self.inputs, dist_dim, options=options)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Code example #27
File: sac_policy.py Project: ziyuwan/pipeline-psro
def build_action_output(policy, model, input_dict, obs_space, action_space,
                        config):

    logits, _ = model({
        "obs": input_dict[SampleBatch.CUR_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"])

    action_dist = dist_class(logits, model)
    stochastic_actions = action_dist.sample()
    log_pis = action_dist.sampled_action_logp()

    deterministic_actions = tf.math.argmax(logits, axis=-1)

    actions = tf.cond(policy.stochastic, lambda: stochastic_actions,
                      lambda: deterministic_actions)

    action_probabilities = tf.cond(policy.stochastic, lambda: log_pis,
                                   lambda: tf.zeros_like(log_pis))
    policy.output_actions = actions
    return actions, action_probabilities
Code example #28
File: marwil_policy.py Project: x-malet/ray
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
        self.config = config

        dist_cls, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

        # Action inputs
        self.obs_t = tf.placeholder(tf.float32,
                                    shape=(None, ) + observation_space.shape)
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(tf.float32, [None],
                                         name="prev_reward")

        with tf.variable_scope(POLICY_SCOPE) as scope:
            self.model = ModelCatalog.get_model(
                {
                    "obs": self.obs_t,
                    "prev_actions": prev_actions_ph,
                    "prev_rewards": prev_rewards_ph,
                    "is_training": self._get_is_training_placeholder(),
                }, observation_space, action_space, logit_dim,
                self.config["model"])
            logits = self.model.outputs
            self.p_func_vars = scope_vars(scope.name)

        # Action outputs
        action_dist = dist_cls(logits)
        self.output_actions = action_dist.sample()

        # Training inputs
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

        # v network evaluation
        with tf.variable_scope(VALUE_SCOPE) as scope:
            state_values = self.model.value_function()
            self.v_func_vars = scope_vars(scope.name)
        self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
        self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
                                              logits, self.act_t, action_space)

        # which kind of objective to optimize
        objective = (self.p_loss.loss +
                     self.config["vf_coeff"] * self.v_loss.loss)
        self.explained_variance = tf.reduce_mean(
            explained_variance(self.cum_rew_t, state_values))

        # initialize TFPolicy
        self.sess = tf.get_default_session()
        self.loss_inputs = [
            (SampleBatch.CUR_OBS, self.obs_t),
            (SampleBatch.ACTIONS, self.act_t),
            (Postprocessing.ADVANTAGES, self.cum_rew_t),
        ]
        TFPolicy.__init__(self,
                          observation_space,
                          action_space,
                          self.sess,
                          obs_input=self.obs_t,
                          action_sampler=self.output_actions,
                          action_prob=action_dist.sampled_action_prob(),
                          loss=objective,
                          model=self.model,
                          loss_inputs=self.loss_inputs,
                          state_inputs=self.model.state_in,
                          state_outputs=self.model.state_out,
                          prev_action_input=prev_actions_ph,
                          prev_reward_input=prev_rewards_ph)
        self.sess.run(tf.global_variables_initializer())

        self.stats_fetches = {
            "total_loss": objective,
            "vf_explained_var": self.explained_variance,
            "policy_loss": self.p_loss.loss,
            "vf_loss": self.v_loss.loss
        }
Code example #29
File: agent.py Project: xgong/ray
    def __init__(
            self, name, batchsize, preprocessor, config, logdir, is_remote):
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = BatchedEnv(name, batchsize, preprocessor=preprocessor)
        if preprocessor.shape is None:
            preprocessor.shape = self.env.observation_space.shape
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.preprocessor = preprocessor
        self.sess = tf.Session(config=config_proto)
        if config["use_tf_debugger"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + preprocessor.shape)
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(
                tf.float32, shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None,))
        else:
            raise NotImplementedError(
                "action space " + str(type(action_space)) +
                " currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by devices"
        if is_remote:
            self.batch_size = 1
            self.per_device_batch_size = 1
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, advs, acts, plog):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, advs, acts, plog, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.advantages, self.actions,
             self.prev_logits],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(
                    values=[policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None)
        self.reward_filter = MeanStdFilter((), clip=5.0)
        self.sess.run(tf.global_variables_initializer())
Code example #30
    def required_model_output_shape(action_space, model_config):
        input_lens = []
        for action in action_space.spaces:
            dist, action_size = ModelCatalog.get_action_dist(action, {})
            input_lens.append(action_size)
        return sum(input_lens)
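Example #30's required_model_output_shape and example #14's __init__ belong to the same kind of custom multi-action distribution. A hedged sketch of how such a class (here called MyMultiActionDist, a made-up name) would be registered and retrieved through the catalog:

# Hypothetical wiring for a Tuple/Dict action space: register the custom
# distribution, point the model config at it, and let the catalog return
# the class together with the number of model outputs it requires.
ModelCatalog.register_custom_action_dist("my_multi_dist", MyMultiActionDist)

model_config = MODEL_DEFAULTS.copy()
model_config["custom_action_dist"] = "my_multi_dist"
dist_cls, num_outputs = ModelCatalog.get_action_dist(action_space, model_config)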
Code example #31
    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter("has_inf_or_nan",
                                        tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(name="newkl",
                                       shape=(),
                                       dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(tf.float32,
                                           shape=(None, ) +
                                           self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None, ))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None, ))

        action_space = self.env.action_space
        # TODO(rliaw): pull this into model_catalog
        if isinstance(action_space, gym.spaces.Box):
            self.actions = tf.placeholder(tf.float32,
                                          shape=(None, action_space.shape[0]))
        elif isinstance(action_space, gym.spaces.Discrete):
            self.actions = tf.placeholder(tf.int64, shape=(None, ))
        else:
            raise NotImplemented("action space" + str(type(action_space)) +
                                 "currently not supported")
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(tf.float32,
                                          shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, ))

        assert config["sgd_batchsize"] % len(devices) == 0, \
            "Batch size must be evenly divisible by devices"
        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = config["sgd_batchsize"]
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(self.env.observation_space,
                                      self.env.action_space, obs, vtargets,
                                      advs, acts, plog, pvf_preds,
                                      self.logit_dim, self.kl_coeff,
                                      self.distribution_class, self.config,
                                      self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]), self.devices,
            [
                self.observations, self.value_targets, self.advantages,
                self.actions, self.prev_logits, self.prev_vf_preds
            ], self.per_device_batch_size, build_loss, self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(
                    values=[policy.mean_policy_loss for policy in policies]),
                0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[policy.mean_vf_loss for policy in policies]),
                0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[policy.mean_entropy for policy in policies]),
                0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(config["observation_filter"],
                                     self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }
        self.sampler = SyncSampler(self.env, self.common_policy,
                                   self.obs_filter, self.config["horizon"],
                                   self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())
Code example #32
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
        self.config = config

        dist_cls, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

        # Action inputs
        self.obs_t = tf.placeholder(
            tf.float32, shape=(None, ) + observation_space.shape)
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")

        with tf.variable_scope(P_SCOPE) as scope:
            self.model = ModelCatalog.get_model({
                "obs": self.obs_t,
                "prev_actions": prev_actions_ph,
                "prev_rewards": prev_rewards_ph,
                "is_training": self._get_is_training_placeholder(),
            }, observation_space, action_space, logit_dim,
                                                self.config["model"])
            logits = self.model.outputs
            self.p_func_vars = _scope_vars(scope.name)

        # Action outputs
        action_dist = dist_cls(logits)
        self.output_actions = action_dist.sample()

        # Training inputs
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

        # v network evaluation
        with tf.variable_scope(V_SCOPE) as scope:
            state_values = self.model.value_function()
            self.v_func_vars = _scope_vars(scope.name)
        self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
        self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
                                              logits, self.act_t, action_space)

        # which kind of objective to optimize
        objective = (
            self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss)
        self.explained_variance = tf.reduce_mean(
            explained_variance(self.cum_rew_t, state_values))

        # initialize TFPolicyGraph
        self.sess = tf.get_default_session()
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("advantages", self.cum_rew_t),
        ]
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=self.obs_t,
            action_sampler=self.output_actions,
            action_prob=action_dist.sampled_action_prob(),
            loss=objective,
            model=self.model,
            loss_inputs=self.loss_inputs,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            prev_action_input=prev_actions_ph,
            prev_reward_input=prev_rewards_ph)
        self.sess.run(tf.global_variables_initializer())

        self.stats_fetches = {
            "total_loss": objective,
            "vf_explained_var": self.explained_variance,
            "policy_loss": self.p_loss.loss,
            "vf_loss": self.v_loss.loss
        }
Code example #33
    def __init__(self, env_creator, config, is_ext_train=False):
        self.local_steps = 0
        self.config = config
        self.summarize = config.get("summarize")
        env = ModelCatalog.get_preprocessor_as_wrapper(
            env_creator(self.config["env_config"]), self.config["model"])

        if is_ext_train:
            train_dataset = input_fn(
                self.config["inverse_model"]["ext_train_file_path"])
            valid_dataset = input_fn(
                self.config["inverse_model"]["ext_valid_file_path"])
            iterator = tf.data.Iterator.from_structure(
                train_dataset.output_types, train_dataset.output_shapes)
            next_element = iterator.get_next()
            self.x = next_element[0]
            self.ac = next_element[1]

            self.training_init_op = iterator.make_initializer(train_dataset)
            self.validation_init_op = iterator.make_initializer(valid_dataset)
        else:
            self.x = tf.placeholder(
                tf.float32,
                shape=[
                    None,
                    numpy.prod([2] + list(env.observation_space.shape))
                ])
            if isinstance(env.action_space, gym.spaces.Box):
                self.ac = tf.placeholder(tf.float32,
                                         [None] + list(env.action_space.shape),
                                         name="ac")
            elif isinstance(env.action_space, gym.spaces.Discrete):
                self.ac = tf.placeholder(tf.int64, [None], name="ac")
            else:
                raise NotImplementedError("action space" +
                                          str(type(env.action_space)) +
                                          "currently not supported")

        # Setup graph
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            env.action_space, self.config["model"])
        self._model = FullyConnectedNetwork(self.x, logit_dim, {})
        self.logits = self._model.outputs
        self.curr_dist = dist_class(self.logits)
        self.sample = self.curr_dist.sample()
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)

        # Setup loss
        log_prob = self.curr_dist.logp(self.ac)
        self.pi_loss = -tf.reduce_sum(log_prob)
        self.loss = self.pi_loss
        self.optimizer = tf.train.AdamOptimizer(self.config["lr"]).minimize(
            self.loss)

        # Setup similarity -> cosine similarity
        normalize_sample = tf.nn.l2_normalize(self.sample, 1)
        normalize_ac = tf.nn.l2_normalize(self.ac, 1)
        self.similarity = 1 - tf.losses.cosine_distance(
            normalize_sample, normalize_ac, dim=1)

        # Initialize
        self.initialize()
Code example #34
File: ppo_evaluator.py Project: adgirish/ray
    def __init__(self, registry, env_creator, config, logdir, is_remote):
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            self.batch_size = int(
                config["sgd_batchsize"] / len(devices)) * len(devices)
            assert self.batch_size % len(devices) == 0
            self.per_device_batch_size = int(self.batch_size / len(devices))

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.value_targets, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.loss for policy in policies]), 0)
            self.mean_policy_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_policy_loss for policy in policies]), 0)
            self.mean_vf_loss = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_vf_loss for policy in policies]), 0)
            self.mean_kl = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_kl for policy in policies]), 0)
            self.mean_entropy = tf.reduce_mean(
                tf.stack(values=[
                    policy.mean_entropy for policy in policies]), 0)

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {"obs_filter": self.obs_filter,
                        "rew_filter": self.rew_filter}
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter,
            self.config["horizon"], self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())