Example #1
    def __init__(self, env, action_script, scale, to_learn, use_mask=True,
                 learn_residuals=False):
        """
        Args:
            env: GKP environment
            action_script: module or class with attributes corresponding to
                           action components such as 'alpha', 'phi', etc.
            scale: dictionary of scaling factors for action components
            to_learn: dictionary of bool values for action components
            use_mask: flag to control masking of action components
            learn_residuals (bool): flag to learn residual over the scripted
                protocol. If False, will learn actions from scratch. If True,
                will learn a residual to be added to scripted protocol.

        """
        super(ActionWrapper, self).__init__(env)

        self.scale = scale
        self.period = action_script.period # periodicity of the protocol
        self.to_learn = to_learn
        self.use_mask = use_mask
        self.mask = action_script.mask
        self.learn_residuals = learn_residuals
        
        # load the script of actions and convert to tensors
        self.script = action_script.script
        for a, val in self.script.items():
            self.script[a] = tf.constant(val, dtype=tf.float32)

        self._action_spec = {a : specs.BoundedTensorSpec(
            shape = C.shape[1:], dtype=tf.float32, minimum=-1, maximum=1)
            for a, C in self.script.items() if self.to_learn[a]}
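The final line builds one bounded, unit-range action spec per learnable component. A minimal standalone sketch of that pattern, assuming only TensorFlow and tf_agents.specs (the script and to_learn dictionaries below are hypothetical stand-ins for the action_script module):

import tensorflow as tf
from tf_agents import specs

# Hypothetical scripted protocol: two action components over 4 time steps.
script = {'alpha': tf.zeros([4, 2]), 'phi': tf.zeros([4, 1])}
to_learn = {'alpha': True, 'phi': False}

# One BoundedTensorSpec per learnable component; the leading (time) axis is dropped.
action_spec = {a: specs.BoundedTensorSpec(shape=c.shape[1:], dtype=tf.float32,
                                          minimum=-1, maximum=1)
               for a, c in script.items() if to_learn[a]}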
Example #2
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 batch_size=1,
                 policy_state_spec_name='policy_state_spec',
                 policy_state_name='policy_state',
                 initial_policy_state=None):
        batch_shape = (batch_size, )
        self._batch_shape = batch_shape
        minimum = np.asarray(1, dtype=np.int32)
        maximum = np.asarray(2, dtype=np.int32)
        self._maximum = maximum
        policy_state_spec = specs.BoundedTensorSpec(
            (),
            tf.int32,
            minimum=minimum,
            maximum=maximum,
            name=policy_state_spec_name)
        info_spec = action_spec
        self._policy_state = common.create_variable(name=policy_state_name,
                                                    initial_value=maximum,
                                                    shape=batch_shape,
                                                    dtype=tf.int32)
        if initial_policy_state is None:
            self._initial_policy_state = tf.fill([batch_size],
                                                 tf.constant(0, tf.int32))
        else:
            self._initial_policy_state = initial_policy_state

        super(TFPolicyMock, self).__init__(time_step_spec, action_spec,
                                           policy_state_spec, info_spec)
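The policy-state bookkeeping above can be reproduced on its own; a short sketch assuming the usual tf_agents.specs and tf_agents.utils.common imports (the batch size and bounds here are arbitrary):

import numpy as np
import tensorflow as tf
from tf_agents import specs
from tf_agents.utils import common

batch_size = 2
maximum = np.asarray(2, dtype=np.int32)

# Scalar integer policy state bounded in [1, 2].
policy_state_spec = specs.BoundedTensorSpec(
    (), tf.int32, minimum=1, maximum=maximum, name='policy_state_spec')
# One state entry per batch element, initialized at the upper bound.
policy_state = common.create_variable(name='policy_state', initial_value=maximum,
                                      shape=(batch_size,), dtype=tf.int32)
initial_policy_state = tf.fill([batch_size], tf.constant(0, tf.int32))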
Example #3
    def get_policy(self):
        def policy_fn(observation, dtype=tf.int32):
            if tf.rank(observation) < 1:
                observation = [observation]

            if self._latent_policy:
                embed = self._embed_state(observation)
            else:
                embed = tf.one_hot(observation, self._num_states)
            distribution = tf.matmul(
                embed, tf.nn.softmax(self._embed_policy_logits, axis=-1))

            policy_info = {'distribution': distribution}
            return (tfp.distributions.Categorical(probs=distribution,
                                                  dtype=dtype), policy_info)

        policy_info_spec = {
            'log_probability':
            specs.TensorSpec([], tf.float32),
            'distribution':
            specs.BoundedTensorSpec([self._num_actions],
                                    tf.float32,
                                    minimum=0.0,
                                    maximum=1.0)
        }
        return policy_fn, policy_info_spec
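The embedding-and-softmax step inside policy_fn can be exercised in isolation; a small sketch assuming TensorFlow and TensorFlow Probability, with hypothetical sizes and logits standing in for the instance attributes:

import tensorflow as tf
import tensorflow_probability as tfp

num_states, num_actions = 5, 3
policy_logits = tf.Variable(tf.zeros([num_states, num_actions]))  # stands in for self._embed_policy_logits

observation = tf.constant([0, 3])             # batch of integer state indices
embed = tf.one_hot(observation, num_states)   # [batch, num_states]
distribution = tf.matmul(embed, tf.nn.softmax(policy_logits, axis=-1))  # [batch, num_actions]
action_dist = tfp.distributions.Categorical(probs=distribution, dtype=tf.int32)
sampled_actions = action_dist.sample()        # [batch] integer actions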
Example #4
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 batch_size=1,
                 policy_state_spec_name='policy_state_spec',
                 policy_state_name='policy_state'):
        batch_shape = (batch_size, )
        self._batch_shape = batch_shape
        minimum = np.asarray(1, dtype=np.int32)
        maximum = np.asarray(2, dtype=np.int32)
        self._maximum = maximum
        policy_state_spec = specs.BoundedTensorSpec(
            (),
            tf.int32,
            minimum=minimum,
            maximum=maximum,
            name=policy_state_spec_name)
        info_spec = action_spec
        self._policy_state = tf.get_variable(
            name=policy_state_name,
            shape=batch_shape,
            dtype=tf.int32,
            initializer=tf.constant_initializer(maximum))
        self._initial_policy_state = tf.constant(0,
                                                 shape=batch_shape,
                                                 dtype=tf.int32)

        super(TFPolicyMock, self).__init__(time_step_spec, action_spec,
                                           policy_state_spec, info_spec)
Example #5
 def __init__(self, initial_state=0, dtype=tf.int64, scope='TFEnvironment'):
   self._dtype = dtype
   self._scope = scope
   self._initial_state = tf.cast(initial_state, dtype=self._dtype)
   observation_spec = specs.TensorSpec([1], self._dtype, 'observation')
   action_spec = specs.BoundedTensorSpec([], tf.int32, minimum=0, maximum=10)
   time_step_spec = ts.time_step_spec(observation_spec)
   super(TFEnvironmentMock, self).__init__(time_step_spec, action_spec)
   self._state = common.create_variable('state', initial_state,
                                        dtype=self._dtype)
   self.steps = common.create_variable('steps', 0)
   self.episodes = common.create_variable('episodes', 0)
   self.resets = common.create_variable('resets', 0)
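The same setup can be written as a self-contained script; a sketch assuming the usual tf_agents module paths for specs, time steps, and variable helpers:

import tensorflow as tf
from tf_agents import specs
from tf_agents.trajectories import time_step as ts
from tf_agents.utils import common

observation_spec = specs.TensorSpec([1], tf.int64, 'observation')
action_spec = specs.BoundedTensorSpec([], tf.int32, minimum=0, maximum=10)
time_step_spec = ts.time_step_spec(observation_spec)  # adds step_type/reward/discount specs

state = common.create_variable('state', 0, dtype=tf.int64)
steps = common.create_variable('steps', 0)
episodes = common.create_variable('episodes', 0)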
Example #6
 def __init__(self, initial_state=0, dtype=tf.int64, scope='TFEnvironment'):
     self._dtype = dtype
     self._scope = scope
     self._initial_state = tf.cast(initial_state, dtype=self._dtype)
     observation_spec = specs.TensorSpec([1], self._dtype, 'observation')
     action_spec = specs.BoundedTensorSpec([],
                                           tf.int32,
                                           minimum=0,
                                           maximum=10)
     time_step_spec = ts.time_step_spec(observation_spec)
     super(TFEnvironmentMock, self).__init__(time_step_spec, action_spec)
     with tf.compat.v1.variable_scope(self._scope):
         self._state = tf.Variable(initial_state,
                                   name='state',
                                   dtype=self._dtype)
         self.steps = tf.Variable(0, name='steps')
         self.episodes = tf.Variable(0, name='episodes')
         self.resets = tf.Variable(0, name='resets')
Example #7
 def testLoad(self):
   specs.ArraySpec([1, 2, 3], np.int32)
   specs.BoundedArraySpec([1, 2, 3], np.int32, 0, 1)
   specs.TensorSpec([1, 2, 3], np.int32)
   specs.BoundedTensorSpec([1, 2, 3], np.int32, 0, 1)
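Each of these constructors can be checked interactively; a quick sketch (again assuming the tf_agents imports, which the test above omits):

import numpy as np
import tensorflow as tf
from tf_agents import specs

tensor_spec = specs.TensorSpec([1, 2, 3], np.int32)
bounded_tensor = specs.BoundedTensorSpec([1, 2, 3], np.int32, 0, 1)

print(tensor_spec.is_compatible_with(tf.zeros([1, 2, 3], tf.int32)))  # True
print(bounded_tensor.minimum, bounded_tensor.maximum)                 # 0 1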
Example #8
    def __init__(self,
                 tf_env,
                 context_ranges=None,
                 context_shapes=None,
                 state_indices=None,
                 variable_indices=None,
                 gamma_index=None,
                 settable_context=False,
                 timers=None,
                 samplers=None,
                 reward_weights=None,
                 reward_fn=None,
                 random_sampler_mode='random',
                 normalizers=None,
                 context_transition_fn=None,
                 context_multi_transition_fn=None,
                 meta_action_every_n=None):
        self._tf_env = tf_env
        self.variable_indices = variable_indices
        self.gamma_index = gamma_index
        self._settable_context = settable_context
        self.timers = timers
        self._context_transition_fn = context_transition_fn
        self._context_multi_transition_fn = context_multi_transition_fn
        self._random_sampler_mode = random_sampler_mode

        # assign specs
        self._obs_spec = self._tf_env.observation_spec()
        self._context_shapes = tuple([
            shape if shape is not None else self._obs_spec.shape
            for shape in context_shapes
        ])
        self.context_specs = tuple([
            specs.TensorSpec(dtype=self._obs_spec.dtype, shape=shape)
            for shape in self._context_shapes
        ])
        if context_ranges is not None:
            self.context_ranges = context_ranges
        else:
            self.context_ranges = [None] * len(self._context_shapes)

        self.context_as_action_specs = tuple([
            specs.BoundedTensorSpec(
                shape=shape,
                dtype=(tf.float32 if self._obs_spec.dtype
                       in [tf.float32, tf.float64] else self._obs_spec.dtype),
                minimum=context_range[0],
                maximum=context_range[-1]) for shape, context_range in zip(
                    self._context_shapes, self.context_ranges)
        ])

        if state_indices is not None:
            self.state_indices = state_indices
        else:
            self.state_indices = [None] * len(self._context_shapes)
        if self.variable_indices is not None and self.n != len(
                self.variable_indices):
            raise ValueError(
                'variable_indices (%s) must have the same length as contexts (%s).'
                % (self.variable_indices, self.context_specs))
        assert self.n == len(self.context_ranges)
        assert self.n == len(self.state_indices)

        # assign reward/sampler fns
        self._sampler_fns = dict()
        self._samplers = dict()
        self._reward_fns = dict()

        # assign reward fns
        self._add_custom_reward_fns()
        reward_weights = reward_weights or None
        self._reward_fn = self._make_reward_fn(reward_fn, reward_weights)

        # assign samplers
        self._add_custom_sampler_fns()
        for mode, sampler_fns in samplers.items():
            self._make_sampler_fn(sampler_fns, mode)

        # create normalizers
        if normalizers is None:
            self._normalizers = [None] * len(self.context_specs)
        else:
            self._normalizers = [
                normalizer(tf.zeros(shape=spec.shape, dtype=spec.dtype))
                if normalizer is not None else None
                for normalizer, spec in zip(normalizers, self.context_specs)
            ]
        assert self.n == len(self._normalizers)

        self.meta_action_every_n = meta_action_every_n

        # create vars
        self.context_vars = {}
        self.timer_vars = {}
        self.create_vars(self.VAR_NAME)
        self.t = tf.Variable(tf.zeros(shape=(), dtype=tf.int32),
                             name='num_timer_steps')
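The context_as_action_specs construction above is the core of this initializer; a reduced sketch with hypothetical context shapes and (min, max) ranges, assuming tf_agents.specs:

import tensorflow as tf
from tf_agents import specs

context_shapes = ((2,),)              # e.g. a 2-D goal context
context_ranges = ((-10.0, 10.0),)
obs_dtype = tf.float32

context_as_action_specs = tuple(
    specs.BoundedTensorSpec(shape=shape,
                            dtype=obs_dtype,
                            minimum=context_range[0],
                            maximum=context_range[-1])
    for shape, context_range in zip(context_shapes, context_ranges))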
Example #9
def create_tf_policy_from_table(probability_table,
                                obs_to_index_fn,
                                return_distribution=False):
    """Creates a callable policy function given a table of state to distribution.

  Args:
    probability_table: A Tensor-like object determining the action distribution.
    obs_to_index_fn: A function mapping environment observation to index in
      table.
    return_distribution: Whether policy_fn should return a distribution. If not,
      returns sampled actions.

  Returns:
    policy_fn: A function mapping observations to action distribution or sampled
      actions and policy info.
    policy_info_spec: A spec that determines the type of objects returned by
      policy info.
  """
    probability_table = tf.convert_to_tensor(probability_table,
                                             dtype=tf.float32)
    n_actions = tf.shape(probability_table)[-1]

    def policy_fn(observation,
                  probability_table=probability_table,
                  obs_to_index_fn=obs_to_index_fn,
                  return_distribution=return_distribution,
                  dtype=tf.int32):
        state = obs_to_index_fn(observation)
        distribution = tf.gather(probability_table, state)
        batched = tf.rank(distribution) > 1
        if not batched:
            distributions = distribution[None, :]
        else:
            distributions = distribution

        batch_size = tf.shape(distributions)[0]

        actions = tf.random.categorical(tf.math.log(1e-8 + distributions),
                                        1,
                                        dtype=dtype)
        actions = tf.squeeze(actions, -1)
        probs = tf.gather_nd(
            distributions,
            tf.stack([tf.range(batch_size, dtype=dtype), actions], -1))

        if not batched:
            action = actions[0]
            log_prob = tf.math.log(1e-8 + probs[0])
        else:
            action = actions
            log_prob = tf.math.log(1e-8 + probs)

        if return_distribution:
            policy_info = {'distribution': distribution}
            return (tfp.distributions.Categorical(probs=distribution,
                                                  dtype=dtype), policy_info)
        else:
            policy_info = {
                'log_probability': log_prob,
                'distribution': distribution
            }
            return action, policy_info

    policy_info_spec = {
        'log_probability':
        specs.TensorSpec([], tf.float32),
        'distribution':
        specs.BoundedTensorSpec([n_actions],
                                tf.float32,
                                minimum=0.0,
                                maximum=1.0)
    }
    return policy_fn, policy_info_spec
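A possible usage sketch, assuming create_tf_policy_from_table and its tensorflow / tf_agents imports are in scope; the table and identity obs_to_index_fn are toy values:

import tensorflow as tf

# 3 states x 2 actions; observations are already integer state indices here.
table = [[0.9, 0.1],
         [0.5, 0.5],
         [0.1, 0.9]]
policy_fn, policy_info_spec = create_tf_policy_from_table(
    table, obs_to_index_fn=lambda obs: obs)

action, info = policy_fn(tf.constant(1))
print(action.numpy(), info['log_probability'].numpy())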