Example #1
    def __init__(self,
                 env_spec,
                 cond_policy,
                 hidden_layer_sizes=(100, 100),
                 reg=1e-3,
                 squash=True,
                 reparameterize=True,
                 name='gaussian_cond_policy',
                 joint=False,
                 opponent_policy=False,
                 agent_id=None):
        """
        Args:
            env_spec (`rllab.EnvSpec`): Specification of the environment
                to create the policy for.
            hidden_layer_sizes (`list` of `int`): Sizes for the Multilayer
                perceptron hidden layers.
            reg (`float`): Regularization coefficient for the Gaussian parameters.
            squash (`bool`): If True, squash the Gaussian action samples
                between -1 and 1 with tanh.
            reparameterize (`bool`): If True, gradients will flow directly through
                the action samples.
        """
        Serializable.quick_init(self, locals())

        if isinstance(env_spec, MAEnvSpec):
            assert agent_id is not None
            self._observation_dim = env_spec.observation_space[
                agent_id].flat_dim
            if joint:
                self._action_dim = env_spec.action_space.flat_dim
                if opponent_policy:
                    print('opponent_policy', opponent_policy)
                    self._action_dim = env_spec.action_space.opponent_flat_dim(
                        agent_id)
            else:
                self._action_dim = env_spec.action_space[agent_id].flat_dim

        else:
            self._action_dim = env_spec.action_space.flat_dim
            self._observation_dim = env_spec.observation_space.flat_dim
        self.cond_policy = cond_policy
        self._hidden_layers = hidden_layer_sizes
        self._is_deterministic = False
        self._fixed_h = None
        self._squash = squash
        self._reparameterize = reparameterize
        self._reg = reg
        self._observation_ph = tf.placeholder(
            dtype=tf.float32,
            shape=(None, self._observation_dim),
            name='observations',
        )
        self.name = name + '_agent_{}'.format(agent_id)
        self.build()

        self._scope_name = (tf.get_variable_scope().name + "/" +
                            self.name).lstrip("/")

        super(NNPolicy, self).__init__(env_spec)
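The `squash` and `reparameterize` flags documented above are easiest to see in isolation. Below is a minimal NumPy sketch of what they refer to, assuming the usual reparameterized Gaussian sample followed by a tanh squash; it is illustrative only and not the class's actual `build()` graph:

import numpy as np

# Minimal sketch of reparameterized sampling with a tanh squash.
# `reparameterized_tanh_sample` is a hypothetical helper, not part of the class.
def reparameterized_tanh_sample(mu, log_std, rng=np.random):
    eps = rng.standard_normal(mu.shape)       # noise drawn independently of the parameters
    raw_action = mu + np.exp(log_std) * eps   # gradients can flow through mu and log_std
    return np.tanh(raw_action)                # squash the sample into (-1, 1)

print(reparameterized_tanh_sample(np.zeros(2), np.log(0.5 * np.ones(2))))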
Example #2
    def __init__(self, env_spec, policies, agent_id, k, mu=1., name='g_level_k', correct_tanh=True):
        Serializable.quick_init(self, locals())
        self._policies = policies
        assert k > 1
        self._k = k
        self._mu = mu
        self._dists = self.level_distribution(self._k, self._mu)
        if correct_tanh:
            self._correction_factor = 1.
        else:
            self._correction_factor = 0.

        self._observation_dim = env_spec.observation_space[agent_id].flat_dim
        self._action_dim = env_spec.action_space[agent_id].flat_dim

        self._name = name + '_agent_{}'.format(agent_id)
        self._observation_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='{}_observation_agent_{}'.format(name, agent_id))

        self.agent_id = agent_id
        self._actions, self.all_actions = self.actions_for(self._observation_ph, reuse=True, all_action=True)

        super(GeneralizedMultiLevelPolicy, self).__init__(
            env_spec, self._observation_ph, self._actions, self._name)
Example #3
    def __init__(self, env_spec, q_functions):
        Serializable.quick_init(self, locals())

        self.q_functions = q_functions
        agent_id = 0
        joint = True
        if isinstance(env_spec, MAEnvSpec):
            assert agent_id is not None
            self._observation_dim = env_spec.observation_space[
                agent_id].flat_dim
            if joint:
                self._action_dim = env_spec.action_space.flat_dim
            else:
                self._action_dim = env_spec.action_space[agent_id].flat_dim
        else:
            self._action_dim = env_spec.action_space.flat_dim
            self._observation_dim = env_spec.observation_space.flat_dim

        self._observations_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='observations')
        self._actions_ph = tf.placeholder(tf.float32,
                                          shape=[None, self._action_dim],
                                          name='actions')

        self._output = self.output_for(self._observations_ph,
                                       self._actions_ph,
                                       reuse=True)
Example #4
    def __init__(self,
                 env_spec=None,
                 observation_space=None,
                 action_space=None,
                 hidden_layer_sizes=(100, 100),
                 name='value_function',
                 joint=False,
                 agent_id=None):
        Serializable.quick_init(self, locals())
        self._name = name + '_agent_{}'.format(agent_id)

        if env_spec is None:
            self._observation_dim = observation_space.flat_dim
            self._action_dim = action_space.flat_dim
        elif isinstance(env_spec, MAEnvSpec):
            assert agent_id is not None
            self._observation_dim = env_spec.observation_space[
                agent_id].flat_dim
            if joint:
                self._action_dim = env_spec.action_space.flat_dim
            else:
                self._action_dim = env_spec.action_space[agent_id].flat_dim
        else:
            self._action_dim = env_spec.action_space.flat_dim
            self._observation_dim = env_spec.observation_space.flat_dim

        self._observations_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='observations_agent_{}'.format(agent_id))
        super(NNVFunction,
              self).__init__(inputs=(self._observations_ph, ),
                             name=self._name,
                             hidden_layer_sizes=hidden_layer_sizes)
Example #5
    def __init__(self, env_spec, max_replay_buffer_size, joint=False, agent_id=None):
        super(SimpleReplayBuffer, self).__init__()
        Serializable.quick_init(self, locals())

        max_replay_buffer_size = int(max_replay_buffer_size)
        self.joint = joint
        self._env_spec = env_spec
        self.agent_id = agent_id
        if isinstance(env_spec, MAEnvSpec):
            assert agent_id is not None
            self._observation_dim = env_spec.observation_space[agent_id].flat_dim
            self._action_dim = env_spec.action_space[agent_id].flat_dim
            if joint:
                self._opponent_action_dim = env_spec.action_space.opponent_flat_dim(agent_id)
                print(agent_id, self._opponent_action_dim)
                self._opponent_actions = np.zeros(
                    (max_replay_buffer_size, self._opponent_action_dim))
        else:
            self._action_dim = env_spec.action_space.flat_dim
            self._observation_dim = env_spec.observation_space.flat_dim

        self._max_buffer_size = max_replay_buffer_size
        self._observations = np.zeros((max_replay_buffer_size,
                                       self._observation_dim))
        # It's a bit memory inefficient to save the observations twice,
        # but it makes the code *much* easier since you no longer have to
        # worry about termination conditions.
        self._next_obs = np.zeros((max_replay_buffer_size,
                                   self._observation_dim))
        self._actions = np.zeros((max_replay_buffer_size, self._action_dim))
        self._rewards = np.zeros(max_replay_buffer_size)
        self._terminals = np.zeros(max_replay_buffer_size, dtype='uint8')
        self._top = 0
        self._size = 0
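The constructor above only allocates the storage arrays; `_top` and `_size` are the write pointer and fill counter of a ring buffer that get advanced when samples are added. A self-contained sketch of that bookkeeping, with a hypothetical `RingBuffer` class and `add` method that are not part of the original code:

import numpy as np

class RingBuffer:
    """Hypothetical stand-in illustrating the _top/_size bookkeeping."""

    def __init__(self, max_size, obs_dim, act_dim):
        self._max_size = max_size
        self._observations = np.zeros((max_size, obs_dim))
        self._actions = np.zeros((max_size, act_dim))
        self._top = 0    # next index to write
        self._size = 0   # number of valid entries

    def add(self, obs, action):
        self._observations[self._top] = obs
        self._actions[self._top] = action
        self._top = (self._top + 1) % self._max_size       # wrap around
        self._size = min(self._size + 1, self._max_size)   # saturate at capacity

buf = RingBuffer(max_size=4, obs_dim=3, act_dim=2)
for _ in range(6):
    buf.add(np.random.randn(3), np.random.randn(2))
print(buf._size, buf._top)  # 4 2 -> buffer is full, oldest entries were overwritten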
Example #6
 def __setstate__(self, d):
     Serializable.__setstate__(self, d)
     global load_params
     if load_params:
         tf.get_default_session().run(
             tf.variables_initializer(self.get_params()))
         self.set_param_values(d["params"])
Example #7
    def __init__(self,
                 env_spec,
                 max_pool_size,
                 replacement_policy='stochastic',
                 replacement_prob=1.0,
                 max_skip_episode=10):
        Serializable.quick_init(self, locals())
        super(SimpleReplayPool, self).__init__(env_spec)

        max_pool_size = int(max_pool_size)

        self._max_pool_size = max_pool_size
        self._replacement_policy = replacement_policy
        self._replacement_prob = replacement_prob
        self._max_skip_episode = max_skip_episode
        self._observations = np.zeros((max_pool_size, self._observation_dim))
        self._actions = np.zeros((max_pool_size, self._action_dim))
        self._rewards = np.zeros(max_pool_size)
        # self._terminals[i] = a terminal was received at time i
        self._terminals = np.zeros(max_pool_size, dtype='uint8')
        # self._final_state[i] = state i was the final state in a rollout,
        # so it should never be sampled since it has no corresponding action.
        # In other words, we're saving the s_{t+1} after sampling a tuple of
        # (s_t, a_t, r_t, s_{t+1}, TERMINAL=TRUE)
        self._final_state = np.zeros(max_pool_size, dtype='uint8')
        self._bottom = 0
        self._top = 0
        self._size = 0
        self._env_info = dict()
Example #8
    def __init__(self,
                 env_spec,
                 hidden_layer_sizes=(100, 100),
                 name='qf',
                 joint=False,
                 agent_id=None):
        Serializable.quick_init(self, locals())
        if isinstance(env_spec, MAEnvSpec):
            assert agent_id is not None
            self._observation_dim = env_spec.observation_space[
                agent_id].flat_dim
            if joint:
                self._action_dim = env_spec.action_space.flat_dim
            else:
                self._action_dim = env_spec.action_space[agent_id].flat_dim
        else:
            self._action_dim = env_spec.action_space.flat_dim
            self._observation_dim = env_spec.observation_space.flat_dim

        self._obs_pl = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='observation',
        )

        self._action_pl = tf.placeholder(
            tf.float32,
            shape=[None, self._action_dim],
            name='actions',
        )

        super(NNQFunction,
              self).__init__(name, (self._obs_pl, self._action_pl),
                             hidden_layer_sizes)
Example #9
    def __init__(self, env_spec, obs_pl, action, scope_name=None):
        Serializable.quick_init(self, locals())

        self._observation_ph = obs_pl
        self._action = action
        self._scope_name = (tf.get_variable_scope().name
                            if not scope_name else scope_name)
        super(NNPolicy, self).__init__(env_spec)
Example #10
    def __init__(self, inputs, name, hidden_layer_sizes):
        Parameterized.__init__(self)
        Serializable.quick_init(self, locals())

        self._name = name
        self._inputs = inputs
        self._layer_sizes = list(hidden_layer_sizes) + [1]

        self._output = self._output_for(self._inputs)
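The `list(hidden_layer_sizes) + [1]` convention above appears throughout these value-function classes: the appended final layer has a single unit, so the network emits one scalar per input row. A small illustration of the resulting layer shapes (the 5-dimensional input is an arbitrary assumption):

import numpy as np

layer_sizes = list((100, 100)) + [1]           # same convention as above
dims = [5] + layer_sizes                       # hypothetical 5-dimensional input
weights = [np.zeros((d_in, d_out)) for d_in, d_out in zip(dims[:-1], dims[1:])]
print([w.shape for w in weights])              # [(5, 100), (100, 100), (100, 1)]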
Example #11
    def __setstate__(self, d):
        """Set Serializable state fo the RLAlgorithm instance."""

        Serializable.__setstate__(self, d)
        self._qf1.set_param_values(d['qf1-params'])
        self._qf2.set_param_values(d['qf2-params'])
        self._vf.set_param_values(d['vf-params'])
        self._policy.set_param_values(d['policy-params'])
        self._pool.__setstate__(d['pool'])
        self._env.__setstate__(d['env'])
Example #12
    def __init__(self,
                 env_spec=None,
                 observation_space=None,
                 action_space=None,
                 opponent_action_space=None,
                 hidden_layer_sizes=(100, 100),
                 squash=False,
                 squash_func=tf.tanh,
                 name='conditional_policy',
                 u_range=1.,
                 shift=None,
                 scale=None,
                 joint=False,
                 agent_id=None,
                 sampling=False):
        Serializable.quick_init(self, locals())
        self.agent_id = agent_id
        if env_spec is None:
            self._observation_dim = observation_space.flat_dim
            self._action_dim = action_space.flat_dim
            self._opponent_action_dim = opponent_action_space.flat_dim
        else:
            assert isinstance(env_spec, MAEnvSpec)
            assert agent_id is not None
            self._action_dim = env_spec.action_space[agent_id].flat_dim
            self._observation_dim = env_spec.observation_space[
                agent_id].flat_dim
            self._opponent_action_dim = env_spec.action_space.opponent_flat_dim(
                agent_id)
        print('opp dim', self._opponent_action_dim)
        self._layer_sizes = list(hidden_layer_sizes) + [
            self._opponent_action_dim
        ]
        self._squash = squash
        self._squash_func = squash_func
        self._u_range = u_range
        self.shift = shift
        self.scale = scale
        self.sampling = sampling
        self._name = name + '_agent_{}'.format(agent_id)

        self._observation_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='observation_{}_agent_{}'.format(name, agent_id))
        self._actions_ph = tf.placeholder(tf.float32,
                                          shape=[None, self._action_dim],
                                          name='actions_{}_agent_{}'.format(
                                              name, agent_id))
        self._opponent_actions = self.actions_for(self._observation_ph,
                                                  self._actions_ph)

        super(StochasticNNConditionalPolicy,
              self).__init__(env_spec, self._observation_ph,
                             self._opponent_actions, self._name)
Example #13
 def __init__(
         self,
         observation_space,
         action_space):
     """
     :type observation_space: Space
     :type action_space: Space
     """
     Serializable.quick_init(self, locals())
     self._observation_space = observation_space
     self._action_space = action_space
Example #14
    def __init__(self,
                 env_spec=None,
                 observation_space=None,
                 action_space=None,
                 nego_round=1,
                 hidden_layer_sizes=(100, 100),
                 squash=False,
                 squash_func=tf.tanh,
                 name='accstochastic_policy',
                 u_range=1.,
                 shift=None,
                 scale=None,
                 joint=False,
                 agent_id=None,
                 sampling=False):
        Serializable.quick_init(self, locals())
        if env_spec is None:
            self._observation_dim = observation_space
            self._action_dim = action_space
        elif isinstance(env_spec, MAEnvSpec):
            assert agent_id is not None
            self._observation_dim = env_spec.observation_space.flat_dim
            self._action_dim = env_spec.action_space.flat_dim
        else:
            self._action_dim = env_spec.action_space.flat_dim
            self._observation_dim = env_spec.observation_space.flat_dim**2
        self._layer_sizes = list(hidden_layer_sizes) + [1]
        self._squash = squash
        self._squash_func = squash_func
        self._u_range = u_range
        self.shift = shift
        self.scale = scale
        self._name = name + '_agent_{}'.format(agent_id)
        self.sampling = sampling
        self.agent_id = agent_id
        self.nego_round = nego_round

        self._observation_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='observation_{}_agent_{}'.format(name, agent_id))
        # self._actions_ph = tf.placeholder(
        #     tf.float32,
        #     shape=[None, self._observation_dim],
        #     name='actions_{}_agent_{}'.format(name, agent_id))
        # self._observation_ph = None
        # self._actions = None

        self._actions = self.actions_for(self._observation_ph)

        super(ACConditionedStochasticNNPolicy,
              self).__init__(env_spec, self._observation_ph, self._actions,
                             self._name)
Example #15
 def __init__(self,
              env_spec,
              agent_id,
              opponent=False,
              urange=[-1, 1.],
              if_softmax=False):
     Serializable.quick_init(self, locals())
     self._action_dim = env_spec.action_space[agent_id].flat_dim
     self._urange = urange
     self._if_softmax = if_softmax
     if opponent:
         self._action_dim = env_spec.action_space.opponent_flat_dim(
             agent_id)
     self._name = 'uniform_policy_{}'.format(agent_id)
     super(UniformPolicy, self).__init__(env_spec)
Example #16
 def __init__(
         self,
         observation_spaces,
         action_spaces):
     """
     :type observation_spaces: MASpace
     :type action_spaces: MASpace
     """
     Serializable.quick_init(self, locals())
     assert isinstance(observation_spaces, MASpace)
     assert isinstance(action_spaces, MASpace)
     self.agent_num = observation_spaces.agent_num
     self._observation_spaces = observation_spaces
     self._action_spaces = action_spaces
     self._env_specs = np.array([
         EnvSpec(observation_space, action_space)
         for observation_space, action_space in zip(
             observation_spaces, action_spaces)
     ])
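For reference, the list comprehension matters here: handing `np.array` a bare generator expression does not build an array of `EnvSpec` objects but a 0-d object array wrapping the generator. A quick standalone check:

import numpy as np

gen_arr = np.array(x * x for x in range(3))      # 0-d object array holding the generator
list_arr = np.array([x * x for x in range(3)])   # array([0, 1, 4])
print(gen_arr.shape, gen_arr.dtype)              # () object
print(list_arr.shape, list_arr.dtype)            # (3,) with an integer dtype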
Example #17
    def __init__(self,
                 env_spec,
                 K=2,
                 hidden_layer_sizes=(100, 100),
                 reg=1e-3,
                 squash=True,
                 reparameterize=False,
                 qf=None,
                 name='gmm_policy'):
        """
        Args:
            env_spec (`rllab.EnvSpec`): Specification of the environment
                to create the policy for.
            K (`int`): Number of mixture components.
            hidden_layer_sizes (`list` of `int`): Sizes for the Multilayer
                perceptron hidden layers.
            reg (`float`): Regularization coefficient for the GMM parameters.
            squash (`bool`): If True, squash the GMM action samples
                between -1 and 1 with tanh.
            qf (`ValueFunction`): Q-function approximator.
        """
        Serializable.quick_init(self, locals())

        self._hidden_layers = hidden_layer_sizes
        self._action_dim = env_spec.action_space.flat_dim
        self._Ds = env_spec.observation_space.flat_dim
        self._K = K
        self._is_deterministic = False
        self._fixed_h = None
        self._squash = squash
        self._qf = qf
        self._reg = reg

        # We can only reparameterize if there is a single component in the
        # GMM, in which case one should use sac.policies.GaussianPolicy.
        assert not reparameterize
        self._reparameterize = reparameterize

        self.name = name
        self.build()

        self._scope_name = (tf.get_variable_scope().name + "/" +
                            name).lstrip("/")

        # TODO.code_consolidation: This should probably call
        # `super(GMMPolicy, self).__init__`
        super(NNPolicy, self).__init__(env_spec)
Example #18
 def __init__(self,
              env_spec,
              agent_id=None,
              hidden_layer_sizes=(100, 100),
              name='vf'):
     Serializable.quick_init(self, locals())
     self._observation_dim = env_spec.observation_space.flat_dim
     if agent_id is not None and agent_id != 'all':
         self._observation_dim = env_spec.observation_space[
             agent_id].flat_dim
     self._obs_pl = tf.placeholder(
         tf.float32,
         shape=[None, self._observation_dim],
         name='observation',
     )
     super(NNVFunction, self).__init__(name, (self._obs_pl, ),
                                       hidden_layer_sizes)
Example #19
    def __init__(self, env_spec, q_functions):
        Serializable.quick_init(self, locals())

        self.q_functions = q_functions

        self._action_dim = env_spec.action_space.flat_dim
        self._observation_dim = env_spec.observation_space.flat_dim

        self._observations_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='observations')
        self._actions_ph = tf.placeholder(tf.float32,
                                          shape=[None, self._action_dim],
                                          name='actions')

        self._output = self.output_for(self._observations_ph,
                                       self._actions_ph,
                                       reuse=True)
Example #20
    def __getstate__(self):
        """Get Serializable state of the RLALgorithm instance."""

        d = Serializable.__getstate__(self)
        d.update({
            'qf1-params': self._qf1.get_param_values(),
            'qf2-params': self._qf2.get_param_values(),
            'vf-params': self._vf.get_param_values(),
            'policy-params': self._policy.get_param_values(),
            'pool': self._pool.__getstate__(),
            'env': self._env.__getstate__(),
        })
        return d
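Examples #11 and #20 are the two halves of one snapshot protocol: `__getstate__` packs each approximator's parameter values into a plain dict, and `__setstate__` pushes them back. A reduced sketch of how such a pair round-trips through pickle, using hypothetical `Param` and `DummyAlgo` stand-ins rather than the original classes:

import pickle

class Param:
    """Hypothetical parameter holder with the get/set interface used above."""
    def __init__(self, values):
        self._values = list(values)
    def get_param_values(self):
        return list(self._values)
    def set_param_values(self, values):
        self._values = list(values)

class DummyAlgo:
    def __init__(self, qf1):
        self._qf1 = qf1
    def __getstate__(self):
        return {'qf1-params': self._qf1.get_param_values()}
    def __setstate__(self, d):
        self._qf1 = Param([])
        self._qf1.set_param_values(d['qf1-params'])

restored = pickle.loads(pickle.dumps(DummyAlgo(Param([0.1, 0.2]))))
print(restored._qf1.get_param_values())  # [0.1, 0.2]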
Example #21
 def __init__(self,
              agent_num,
              game_name='pbeauty',
              p=0.67,
              reward_type='abs',
              action_low=-1.,
              action_high=1.):
     Serializable.quick_init(self, locals())
     self.agent_num = agent_num
     self.p = p
     self.game_name = game_name
     self.reward_type = reward_type
     self.action_range = [action_low, action_high]
     lows = np.array(
         [np.array([action_low]) for _ in range(self.agent_num)])
     highs = np.array(
         [np.array([action_high]) for _ in range(self.agent_num)])
     self.action_spaces = MABox(lows=lows, highs=highs)
     self.observation_spaces = MADiscrete([1] * self.agent_num)
     self.env_specs = MAEnvSpec(self.observation_spaces, self.action_spaces)
     self.t = 0
     self.rewards = np.zeros((self.agent_num, ))
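The reward itself is not shown in this snippet. Judging only from the constructor arguments (`game_name='pbeauty'`, `p=0.67`, `reward_type='abs'`), the payoff is presumably the negative distance between an agent's action and p times the mean action, as in the classic p-beauty contest; the sketch below is that assumption, not the environment's verified implementation:

import numpy as np

def pbeauty_rewards(actions, p=0.67):
    # Assumed 'abs' reward: penalize distance to p times the mean action.
    target = p * np.mean(actions)
    return -np.abs(actions - target)

print(pbeauty_rewards(np.array([0.2, -0.5, 0.9])))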
Example #22
    def __init__(self, env_spec, base_policy, conditional_policy, opponent_conditional_policy, agent_id, k, name='level_k'):
        Serializable.quick_init(self, locals())
        self._base_policy = base_policy
        self._conditional_policy = conditional_policy
        self._opponent_conditional_policy = opponent_conditional_policy
        self._k = k
        self._observation_dim = env_spec.observation_space[agent_id].flat_dim
        self._action_dim = env_spec.action_space[agent_id].flat_dim

        self._name = name + '_agent_{}'.format(agent_id)
        self._observation_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='{}_observation_agent_{}'.format(name, agent_id))

        # self._observation_ph = None
        # self._actions = None

        self.agent_id = agent_id
        self._actions, self.all_actions = self.actions_for(self._observation_ph, reuse=True, all_action=True)

        super(MultiLevelPolicy, self).__init__(
            env_spec, self._observation_ph, self._actions, self._name)
Example #23
    def __init__(self,
                 env_spec,
                 layer_sizes=(128, 16),
                 output_nonlinearity=None,
                 name='observations_preprocessor'):

        Parameterized.__init__(self)
        Serializable.quick_init(self, locals())

        self._name = name

        self._observation_dim = env_spec.observation_space.flat_dim

        obs_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._observation_dim),
            name='observations',
        )

        self._input_pls = (obs_ph, )
        self._layer_sizes = layer_sizes
        self._output_nonlinearity = output_nonlinearity

        self._output_t = self.get_output_for(obs_ph, reuse=tf.AUTO_REUSE)
Example #24
 def __getstate__(self):
     d = Serializable.__getstate__(self)
     global load_params
     if load_params:
         d["params"] = self.get_param_values()
     return d
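Examples #6 and #24 rely on a module-level `load_params` flag to decide whether parameter values travel with the serialized state. A compact sketch of that pattern with an illustrative `FlaggedModel` class (not the original code):

load_params = True  # module-level switch, as in the snippets above

class FlaggedModel:
    def __init__(self, values):
        self._values = list(values)
    def get_param_values(self):
        return list(self._values)
    def __getstate__(self):
        d = {}
        if load_params:            # only ship parameters when the flag is set
            d["params"] = self.get_param_values()
        return d

print(FlaggedModel([1.0, 2.0]).__getstate__())  # {'params': [1.0, 2.0]}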
Example #25
    def __init__(self,
                 env_spec=None,
                 observation_space=None,
                 action_space=None,
                 hidden_layer_sizes=(100, 100),
                 squash=False,
                 squash_func=tf.tanh,
                 name='policy',
                 noise_level=0.0,
                 u_range=1.,
                 shift=None,
                 scale=None,
                 joint=False,
                 opponent_policy=False,
                 agent_id=None,
                 sampling=False,
                 mu=0,
                 theta=0.15,
                 sigma=0.3):
        Serializable.quick_init(self, locals())
        if env_spec is None:
            self._observation_dim = observation_space.flat_dim
            self._action_dim = action_space.flat_dim
        elif isinstance(env_spec, MAEnvSpec):
            assert agent_id is not None
            self._observation_dim = env_spec.observation_space[
                agent_id].flat_dim
            if joint:
                self._action_dim = env_spec.action_space.flat_dim
                if opponent_policy:
                    print('opponent_policy', opponent_policy)
                    self._action_dim = env_spec.action_space.opponent_flat_dim(
                        agent_id)
            else:
                self._action_dim = env_spec.action_space[agent_id].flat_dim

        else:
            self._action_dim = env_spec.action_space.flat_dim
            self._observation_dim = env_spec.observation_space.flat_dim
        self._layer_sizes = list(hidden_layer_sizes) + [self._action_dim]
        print(self._layer_sizes)
        self._squash = squash
        self._squash_func = squash_func
        self.agent_id = agent_id
        self._u_range = u_range
        self.shift = shift
        self.scale = scale
        self._name = name + '_agent_{}'.format(agent_id)
        self.noise_level = noise_level
        self.sampling = sampling

        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self._action_dim) * self.mu

        self._observation_ph = tf.placeholder(
            tf.float32,
            shape=[None, self._observation_dim],
            name='observation_agent_{}'.format(agent_id))

        self._actions = self.actions_for(self._observation_ph)

        super(DeterministicNNPolicy,
              self).__init__(env_spec, self._observation_ph, self._actions,
                             self._name)
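The `mu`, `theta`, `sigma`, and `state` fields set above are not used in this snippet; they look like the state of Ornstein-Uhlenbeck exploration noise, as commonly paired with deterministic policies. Assuming that is the intent, a typical noise update would look like the following sketch:

import numpy as np

def ou_noise_step(state, mu=0.0, theta=0.15, sigma=0.3):
    # Mean-reverting step toward mu plus a Gaussian perturbation (dt = 1 assumed).
    return state + theta * (mu - state) + sigma * np.random.randn(*state.shape)

state = np.ones(2) * 0.0    # mirrors `self.state = np.ones(self._action_dim) * self.mu`
for _ in range(3):
    state = ou_noise_step(state)
print(state)                # this noise would be added to the deterministic action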
Example #26
    def __init__(self,
                 env_spec,
                 agent_id=None,
                 joint=False,
                 observation_space=None,
                 action_space=None,
                 mode="train",
                 squash=True,
                 bijector_config=None,
                 reparameterize=False,
                 observations_preprocessor=None,
                 fix_h_on_reset=False,
                 q_function=None,
                 n_map_action_candidates=100,
                 name="lsp_policy"):
        """Initialize LatentSpacePolicy.

        Args:
            env_spec (`rllab.EnvSpec`): Specification of the environment
                to create the policy for.
            bijector_config (`dict`): Parameter configuration for bijector.
            squash (`bool`): If True, squash the action samples between
                -1 and 1 with tanh.
            n_map_action_candidates (`int`): Number of action candidates for
                estimating the maximum a posteriori (deterministic) action.
        """
        Serializable.quick_init(self, locals())

        self._env_spec = env_spec

        if env_spec is None:
            self._observation_dim = observation_space.flat_dim
            self._action_dim = action_space.flat_dim
        elif isinstance(env_spec, MAEnvSpec):
            assert agent_id is not None
            self._observation_dim = env_spec.observation_space[agent_id].flat_dim
            if joint:
                self._action_dim = env_spec.action_space.flat_dim
            else:
                self._action_dim = env_spec.action_space[agent_id].flat_dim
        else:
            self._action_dim = env_spec.action_space.flat_dim
            self._observation_dim = env_spec.observation_space.flat_dim
        # self._layer_sizes = list(hidden_layer_sizes) + [self._action_dim]

        self._bijector_config = bijector_config
        self._mode = mode
        self._squash = squash
        self._reparameterize = reparameterize
        self._fix_h_on_reset = fix_h_on_reset
        self._q_function = q_function
        self._n_map_action_candidates = n_map_action_candidates

        self._action_dim = env_spec.action_space.flat_dim
        self._Ds = env_spec.observation_space.flat_dim
        self._fixed_h = None
        self._is_deterministic = False
        self._observations_preprocessor = observations_preprocessor

        self.name = name + '_agent_{}'.format(agent_id)
        self.build()

        self._scope_name = (
            tf.get_variable_scope().name + "/" + name
        ).lstrip("/")
        super(NNPolicy, self).__init__(env_spec)
Example #27
    def __init__(
        self,
        base_kwargs,
        agent_id,
        env,
        policy,
        initial_exploration_policy,
        qf1,
        qf2,
        vf,
        pool,
        target_entropy='auto',
        plotter=None,
        lr=3e-3,
        scale_reward=1,
        discount=0.99,
        tau=0.01,
        target_update_interval=1,
        action_prior='uniform',
        reparameterize=False,
        save_full_state=False,
    ):
        """
        Args:
            base_kwargs (dict): dictionary of base arguments that are directly
                passed to the base `RLAlgorithm` constructor.

            env (`rllab.Env`): rllab environment object.
            policy (`rllab.NNPolicy`): A policy function approximator.
            initial_exploration_policy (`Policy`): A policy used for initial
                exploration and not trained by the algorithm.

            qf1 (`ValueFunction`): First Q-function approximator.
            qf2 (`ValueFunction`): Second Q-function approximator. Usage of two
                Q-functions improves performance by reducing overestimation
                bias.
            vf (`ValueFunction`): Soft value function approximator.

            pool (`PoolBase`): Replay buffer to add gathered samples to.
            plotter (`QFPolicyPlotter`): Plotter instance to be used for
                visualizing the Q-function during training.

            lr (`float`): Learning rate used for the function approximators.
            discount (`float`): Discount factor for Q-function updates.
            tau (`float`): Soft value function target update weight.
            target_update_interval (`int`): Frequency at which target network
                updates occur in iterations.

            reparameterize (`bool`): If True, we use a gradient estimator for
                the policy derived using the reparameterization trick. Otherwise,
                we use a likelihood-ratio-based estimator.
            save_full_state (`bool`): If True, save the full class in the
                snapshot. See `self.get_snapshot` for more information.
        """

        Serializable.quick_init(self, locals())
        super(MASAC, self).__init__(**base_kwargs)

        self._env = env
        self._agent_id = agent_id
        self._policy = policy
        self._initial_exploration_policy = initial_exploration_policy
        self._qf1 = qf1
        self._qf2 = qf2
        # self._vf = vf
        self._pool = pool
        self._plotter = plotter

        self._policy_lr = lr
        self._qf_lr = lr
        self._vf_lr = lr
        self._scale_reward = scale_reward
        self._discount = discount
        self._tau = tau
        self._target_update_interval = target_update_interval
        self._action_prior = action_prior

        self._target_entropy = (-np.prod(self._env.action_space.shape) if
                                target_entropy == 'auto' else target_entropy)

        # The reparameterize setting must match between the algorithm and the
        # policy that actions are sampled from.
        assert reparameterize == self._policy._reparameterize
        self._reparameterize = reparameterize

        self._save_full_state = save_full_state

        self._observation_dim = self.env.observation_spaces[
            self._agent_id].flat_dim
        self._action_dim = self.env.action_spaces[self._agent_id].flat_dim
        # just for two agent case
        self._opponent_action_dim = self.env.action_spaces.opponent_flat_dim(
            self._agent_id)

        self._training_ops = list()

        self._init_placeholders()
        self._init_actor_update()
        self._init_critic_update()
        self._init_target_ops()

        # Initialize only the uninitialized variables. This avoids
        # re-initializing pre-trained policy, qf, and vf variables.
        uninit_vars = []
        for var in tf.global_variables():
            try:
                self._sess.run(var)
            except tf.errors.FailedPreconditionError:
                uninit_vars.append(var)
        self._sess.run(tf.variables_initializer(uninit_vars))
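The loop closing Example #27 is a common TF1 idiom: probing each global variable with a `run` call and collecting those that raise `FailedPreconditionError` identifies exactly the variables that still need initialization, so pre-trained weights are left untouched. A self-contained version of the same idiom, assuming a TF 1.x-style session API (variable names are illustrative):

import tensorflow as tf  # assumes TF 1.x semantics (or tensorflow.compat.v1)

sess = tf.Session()
pretrained = tf.get_variable('pretrained', shape=(2,), initializer=tf.zeros_initializer())
sess.run(pretrained.initializer)   # stands in for weights restored from a checkpoint
fresh = tf.get_variable('fresh', shape=(2,), initializer=tf.ones_initializer())

uninit_vars = []
for var in tf.global_variables():
    try:
        sess.run(var)              # fails if the variable has no value yet
    except tf.errors.FailedPreconditionError:
        uninit_vars.append(var)

sess.run(tf.variables_initializer(uninit_vars))
print([v.name for v in uninit_vars])  # ['fresh:0'] -> only the new variable was initialized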