def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.marwil.marwil.DEFAULT_CONFIG, **config)
    self.config = config

    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # Action inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards_ph = tf.placeholder(tf.float32, [None], name="prev_reward")

    with tf.variable_scope(POLICY_SCOPE) as scope:
        self.model = ModelCatalog.get_model({
            "obs": self.obs_t,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph,
            "is_training": self._get_is_training_placeholder(),
        }, observation_space, action_space, logit_dim, self.config["model"])
        logits = self.model.outputs
        self.p_func_vars = scope_vars(scope.name)

    # Action outputs
    action_dist = dist_cls(logits)
    self.output_actions = action_dist.sample()

    # Training inputs
    self.act_t = tf.placeholder(tf.int32, [None], name="action")
    self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

    # value network evaluation
    with tf.variable_scope(VALUE_SCOPE) as scope:
        state_values = self.model.value_function()
        self.v_func_vars = scope_vars(scope.name)

    self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
    self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
                                          logits, self.act_t, action_space)

    # combined objective: policy loss plus weighted value loss
    objective = (
        self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss)

    self.explained_variance = tf.reduce_mean(
        explained_variance(self.cum_rew_t, state_values))

    # initialize TFPolicy
    self.sess = tf.get_default_session()
    self.loss_inputs = [
        (SampleBatch.CUR_OBS, self.obs_t),
        (SampleBatch.ACTIONS, self.act_t),
        (Postprocessing.ADVANTAGES, self.cum_rew_t),
    ]
    TFPolicy.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.obs_t,
        action_sampler=self.output_actions,
        action_prob=action_dist.sampled_action_prob(),
        loss=objective,
        model=self.model,
        loss_inputs=self.loss_inputs,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph)

    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        "total_loss": objective,
        "vf_explained_var": self.explained_variance,
        "policy_loss": self.p_loss.loss,
        "vf_loss": self.v_loss.loss,
    }
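# For reference, a minimal sketch of what the `_build_value_loss` and
# `_build_policy_loss` helpers used above might compute. This illustrates the
# MARWIL-style objective (mean-squared value error plus an exponentially
# advantage-weighted imitation term); the namedtuple containers, the `logp`
# argument, and the `beta` coefficient are assumptions for illustration, not
# the exact RLlib implementation.
import collections

import tensorflow as tf

ValueLossSketch = collections.namedtuple("ValueLossSketch", ["loss"])
PolicyLossSketch = collections.namedtuple("PolicyLossSketch", ["loss"])


def build_value_loss_sketch(state_values, cum_rewards):
    # Fit the value baseline to the observed cumulative rewards.
    return ValueLossSketch(
        0.5 * tf.reduce_mean(tf.square(state_values - cum_rewards)))


def build_policy_loss_sketch(state_values, cum_rewards, logp, beta=1.0):
    # Advantage of the logged action over the learned baseline;
    # stop_gradient keeps the policy term from training the value net.
    adv = tf.stop_gradient(cum_rewards - state_values)
    # Exponentially weighted imitation: actions that beat the baseline
    # get exponentially more weight in the log-likelihood objective.
    return PolicyLossSketch(-tf.reduce_mean(tf.exp(beta * adv) * logp))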
def variables(self):
    """Return the base model's variables plus any created in this scope."""
    var_list = super(ModelV1Wrapper, self).variables()
    for v in scope_vars(self.variable_scope):
        if v not in var_list:
            var_list.append(v)
    return var_list
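# Both `variables()` overrides in this section lean on a `scope_vars` helper.
# Below is a minimal sketch of such a helper under TF1, assuming it simply
# queries the graph collections for a scope; the real RLlib utility may
# differ in detail.
import tensorflow as tf


def scope_vars_sketch(scope, trainable_only=False):
    """Return the variables created inside `scope` (a name or VariableScope)."""
    scope_name = scope if isinstance(scope, str) else scope.name
    collection = (tf.GraphKeys.TRAINABLE_VARIABLES
                  if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES)
    return tf.get_collection(collection, scope=scope_name)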
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config)
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(
                action_space))
    if len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    self.config = config
    self.cur_noise_scale = 1.0
    self.cur_pure_exploration_phase = False
    self.dim_actions = action_space.shape[0]
    self.low_action = action_space.low
    self.high_action = action_space.high

    # create global step for counting the number of update operations
    self.global_step = tf.train.get_or_create_global_step()

    # use separate optimizers for actor & critic
    self._actor_optimizer = tf.train.AdamOptimizer(
        learning_rate=self.config["actor_lr"])
    self._critic_optimizer = tf.train.AdamOptimizer(
        learning_rate=self.config["critic_lr"])

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale")
    self.pure_exploration_phase = tf.placeholder(
        tf.bool, (), name="pure_exploration_phase")
    self.cur_observations = tf.placeholder(
        tf.float32,
        shape=(None, ) + observation_space.shape,
        name="cur_obs")

    with tf.variable_scope(POLICY_SCOPE) as scope:
        policy_out, self.policy_model = self._build_policy_network(
            self.cur_observations, observation_space, action_space)
        self.policy_vars = scope_vars(scope.name)

    # Noise vars for P network except for layer normalization vars
    if self.config["parameter_noise"]:
        self._build_parameter_noise([
            var for var in self.policy_vars if "LayerNorm" not in var.name
        ])

    # Action outputs
    with tf.variable_scope(ACTION_SCOPE):
        self.output_actions = self._add_exploration_noise(
            policy_out, self.stochastic, self.noise_scale,
            self.pure_exploration_phase, action_space)

    if self.config["smooth_target_policy"]:
        self.reset_noise_op = tf.no_op()
    else:
        with tf.variable_scope(ACTION_SCOPE, reuse=True):
            exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
            self.reset_noise_op = tf.assign(exploration_sample,
                                            self.dim_actions * [.0])

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32,
        shape=(None, ) + observation_space.shape,
        name="observation")
    self.act_t = tf.placeholder(
        tf.float32, shape=(None, ) + action_space.shape, name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    # policy network evaluation
    with tf.variable_scope(POLICY_SCOPE, reuse=True) as scope:
        prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
        self.policy_t, _ = self._build_policy_network(
            self.obs_t, observation_space, action_space)
        policy_batchnorm_update_ops = list(
            set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) -
            prev_update_ops)

    # target policy network evaluation
    with tf.variable_scope(POLICY_TARGET_SCOPE) as scope:
        policy_tp1, _ = self._build_policy_network(
            self.obs_tp1, observation_space, action_space)
        target_policy_vars = scope_vars(scope.name)

    # Action outputs
    with tf.variable_scope(ACTION_SCOPE, reuse=True):
        if config["smooth_target_policy"]:
            target_noise_clip = self.config["target_noise_clip"]
            clipped_normal_sample = tf.clip_by_value(
                tf.random_normal(
                    tf.shape(policy_tp1),
                    stddev=self.config["target_noise"]),
                -target_noise_clip, target_noise_clip)
            policy_tp1_smoothed = tf.clip_by_value(
                policy_tp1 + clipped_normal_sample,
                action_space.low * tf.ones_like(policy_tp1),
                action_space.high * tf.ones_like(policy_tp1))
        else:
            # no smoothing, just use deterministic actions
            policy_tp1_smoothed = policy_tp1

    # q network evaluation
    prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
    with tf.variable_scope(Q_SCOPE) as scope:
        # Q-values for given actions & observations in the current state
        q_t, self.q_model = self._build_q_network(
            self.obs_t, observation_space, action_space, self.act_t)
        self.q_func_vars = scope_vars(scope.name)
    self.stats = {
        "mean_q": tf.reduce_mean(q_t),
        "max_q": tf.reduce_max(q_t),
        "min_q": tf.reduce_min(q_t),
    }
    with tf.variable_scope(Q_SCOPE, reuse=True):
        # Q-values for current policy (no noise) in the current state
        q_t_det_policy, _ = self._build_q_network(
            self.obs_t, observation_space, action_space, self.policy_t)
    if self.config["twin_q"]:
        with tf.variable_scope(TWIN_Q_SCOPE) as scope:
            twin_q_t, self.twin_q_model = self._build_q_network(
                self.obs_t, observation_space, action_space, self.act_t)
            self.twin_q_func_vars = scope_vars(scope.name)
    q_batchnorm_update_ops = list(
        set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

    # target q network evaluation
    with tf.variable_scope(Q_TARGET_SCOPE) as scope:
        q_tp1, _ = self._build_q_network(
            self.obs_tp1, observation_space, action_space,
            policy_tp1_smoothed)
        target_q_func_vars = scope_vars(scope.name)
    if self.config["twin_q"]:
        with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope:
            twin_q_tp1, _ = self._build_q_network(
                self.obs_tp1, observation_space, action_space,
                policy_tp1_smoothed)
            twin_target_q_func_vars = scope_vars(scope.name)

    if self.config["twin_q"]:
        self.critic_loss, self.actor_loss, self.td_error \
            = self._build_actor_critic_loss(
                q_t, q_tp1, q_t_det_policy,
                twin_q_t=twin_q_t, twin_q_tp1=twin_q_tp1)
    else:
        self.critic_loss, self.actor_loss, self.td_error \
            = self._build_actor_critic_loss(
                q_t, q_tp1, q_t_det_policy)

    if config["l2_reg"] is not None:
        for var in self.policy_vars:
            if "bias" not in var.name:
                self.actor_loss += (
                    config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
        for var in self.q_func_vars:
            if "bias" not in var.name:
                self.critic_loss += (
                    config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
        if self.config["twin_q"]:
            for var in self.twin_q_func_vars:
                if "bias" not in var.name:
                    self.critic_loss += (
                        config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    self.tau_value = config.get("tau")
    self.tau = tf.placeholder(tf.float32, (), name="tau")
    update_target_expr = []
    for var, var_target in zip(
            sorted(self.q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    if self.config["twin_q"]:
        for var, var_target in zip(
                sorted(self.twin_q_func_vars, key=lambda v: v.name),
                sorted(twin_target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
    for var, var_target in zip(
            sorted(self.policy_vars, key=lambda v: v.name),
            sorted(target_policy_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    self.update_target_expr = tf.group(*update_target_expr)

    self.sess = tf.get_default_session()
    self.loss_inputs = [
        (SampleBatch.CUR_OBS, self.obs_t),
        (SampleBatch.ACTIONS, self.act_t),
        (SampleBatch.REWARDS, self.rew_t),
        (SampleBatch.NEXT_OBS, self.obs_tp1),
        (SampleBatch.DONES, self.done_mask),
        (PRIO_WEIGHTS, self.importance_weights),
    ]
    input_dict = dict(self.loss_inputs)

    if self.config["use_state_preprocessor"]:
        # Model self-supervised losses
        self.actor_loss = self.policy_model.custom_loss(
            self.actor_loss, input_dict)
        self.critic_loss = self.q_model.custom_loss(
            self.critic_loss, input_dict)
        if self.config["twin_q"]:
            self.critic_loss = self.twin_q_model.custom_loss(
                self.critic_loss, input_dict)

    TFPolicy.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.cur_observations,
        action_sampler=self.output_actions,
        loss=self.actor_loss + self.critic_loss,
        loss_inputs=self.loss_inputs,
        update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops)
    self.sess.run(tf.global_variables_initializer())

    # Note that this encompasses both the policy and Q-value networks and
    # their corresponding target networks
    self.variables = ray.experimental.tf_utils.TensorFlowVariables(
        tf.group(q_t_det_policy, q_tp1), self.sess)

    # Hard initial update
    self.update_target(tau=1.0)
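# For orientation, a minimal sketch of the DDPG/TD3-style losses that the
# `_build_actor_critic_loss` call above is expected to return. `gamma` is an
# assumed discount parameter; the class's actual builder also handles n-step
# returns, Huber clipping, and the twin-Q target minimum, which are omitted
# here.
import tensorflow as tf


def build_actor_critic_loss_sketch(q_t, q_tp1, q_t_det_policy, rew_t,
                                   done_mask, importance_weights,
                                   gamma=0.99):
    # One-step bootstrapped TD target; terminal transitions contribute
    # only the immediate reward.
    q_target = tf.stop_gradient(
        rew_t + gamma * (1.0 - done_mask) * q_tp1)
    td_error = q_t - q_target
    # Critic: importance-weighted squared TD error (the weights come from
    # prioritized replay, cf. `self.importance_weights` above).
    critic_loss = tf.reduce_mean(importance_weights * tf.square(td_error))
    # Actor: ascend the critic, i.e. minimize -Q(s, pi(s)).
    actor_loss = -tf.reduce_mean(q_t_det_policy)
    return critic_loss, actor_loss, td_error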
def variables(self):
    return super(ModelV1Wrapper, self).variables() + scope_vars(
        self.variable_scope)
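# Note the behavioral difference between the two `variables()` overrides in
# this section: the first deduplicates against the base class's list, while
# this one simply concatenates, so a variable that is both returned by the
# base class and created under `self.variable_scope` would appear twice here.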