def __init__(self,
             env_spec,
             cond_policy,
             hidden_layer_sizes=(100, 100),
             reg=1e-3,
             squash=True,
             reparameterize=True,
             name='gaussian_cond_policy',
             joint=False,
             opponent_policy=False,
             agent_id=None):
    """
    Args:
        env_spec (`rllab.EnvSpec`): Specification of the environment
            to create the policy for.
        hidden_layer_sizes (`list` of `int`): Sizes for the multilayer
            perceptron hidden layers.
        reg (`float`): Regularization coefficient for the Gaussian parameters.
        squash (`bool`): If True, squash the Gaussian action samples
            between -1 and 1 with tanh.
        reparameterize (`bool`): If True, gradients will flow directly
            through the action samples.
    """
    Serializable.quick_init(self, locals())

    if isinstance(env_spec, MAEnvSpec):
        assert agent_id is not None
        self._observation_dim = env_spec.observation_space[agent_id].flat_dim
        if joint:
            self._action_dim = env_spec.action_space.flat_dim
            if opponent_policy:
                print('opponent_policy', opponent_policy)
                self._action_dim = env_spec.action_space.opponent_flat_dim(agent_id)
        else:
            self._action_dim = env_spec.action_space[agent_id].flat_dim
    else:
        self._action_dim = env_spec.action_space.flat_dim
        self._observation_dim = env_spec.observation_space.flat_dim

    self.cond_policy = cond_policy
    self._hidden_layers = hidden_layer_sizes
    self._is_deterministic = False
    self._fixed_h = None
    self._squash = squash
    self._reparameterize = reparameterize
    self._reg = reg

    self._observation_ph = tf.placeholder(
        dtype=tf.float32,
        shape=(None, self._observation_dim),
        name='observations',
    )

    self.name = name + '_agent_{}'.format(agent_id)
    self.build()

    self._scope_name = (
        tf.get_variable_scope().name + "/" + self.name).lstrip("/")

    super(NNPolicy, self).__init__(env_spec)
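# Illustration (not part of the original class): the docstring above describes
# tanh-squashing of Gaussian action samples and reparameterized sampling. The
# usual construction is a = tanh(mu + sigma * eps) with a log-density
# correction for the tanh Jacobian. A self-contained numpy sketch of that
# assumed scheme (not this class's TF graph):

import numpy as np


def squashed_gaussian_sample(mu, log_std, eps=1e-6):
    std = np.exp(log_std)
    noise = np.random.randn(*mu.shape)
    pre_tanh = mu + std * noise                      # reparameterized sample
    action = np.tanh(pre_tanh)                       # squashed into (-1, 1)
    # Diagonal-Gaussian log-density of the pre-tanh sample ...
    log_prob = -0.5 * np.sum(
        noise ** 2 + 2. * log_std + np.log(2. * np.pi), axis=-1)
    # ... minus the log|det Jacobian| of tanh, i.e. sum of log(1 - tanh(x)^2).
    log_prob -= np.sum(np.log(1. - action ** 2 + eps), axis=-1)
    return action, log_prob


a, logp = squashed_gaussian_sample(mu=np.zeros((1, 2)), log_std=np.zeros((1, 2)))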
def __init__(self,
             env_spec,
             policies,
             agent_id,
             k,
             mu=1.,
             name='g_level_k',
             correct_tanh=True):
    Serializable.quick_init(self, locals())

    self._policies = policies
    assert k > 1
    self._k = k
    self._mu = mu
    self._dists = self.level_distribution(self._k, self._mu)
    if correct_tanh:
        self._correction_factor = 1.
    else:
        self._correction_factor = 0.

    self._observation_dim = env_spec.observation_space[agent_id].flat_dim
    self._action_dim = env_spec.action_space[agent_id].flat_dim
    self._name = name + '_agent_{}'.format(agent_id)
    self._observation_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='{}_observation_agent_{}'.format(name, agent_id))
    self.agent_id = agent_id
    self._actions, self.all_actions = self.actions_for(
        self._observation_ph, reuse=True, all_action=True)

    super(GeneralizedMultiLevelPolicy, self).__init__(
        env_spec, self._observation_ph, self._actions, self._name)
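# Illustration (not part of the original class): `level_distribution(k, mu)` is
# not shown here. A common choice in cognitive-hierarchy / level-k models is a
# Poisson(mu) weighting over reasoning levels, truncated at k and renormalized.
# A minimal sketch of that assumption (the helper name is hypothetical):

from math import factorial

import numpy as np


def poisson_level_weights(k, mu=1.):
    """Truncated, normalized Poisson(mu) weights over levels 0..k-1."""
    weights = np.array([mu ** i / factorial(i) for i in range(k)])
    return weights / weights.sum()


# e.g. poisson_level_weights(3, mu=1.) -> approximately [0.4, 0.4, 0.2]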
def __init__(self, env_spec, q_functions):
    Serializable.quick_init(self, locals())

    self.q_functions = q_functions
    agent_id = 0
    joint = True
    if isinstance(env_spec, MAEnvSpec):
        assert agent_id is not None
        self._observation_dim = env_spec.observation_space[agent_id].flat_dim
        if joint:
            self._action_dim = env_spec.action_space.flat_dim
        else:
            self._action_dim = env_spec.action_space[agent_id].flat_dim
    else:
        self._action_dim = env_spec.action_space.flat_dim
        self._observation_dim = env_spec.observation_space.flat_dim

    self._observations_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='observations')
    self._actions_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._action_dim],
        name='actions')

    self._output = self.output_for(
        self._observations_ph, self._actions_ph, reuse=True)
def __init__(self,
             env_spec=None,
             observation_space=None,
             action_space=None,
             hidden_layer_sizes=(100, 100),
             name='value_function',
             joint=False,
             agent_id=None):
    Serializable.quick_init(self, locals())

    self._name = name + '_agent_{}'.format(agent_id)
    if env_spec is None:
        # env_spec is None here, so use the explicitly passed spaces.
        self._observation_dim = observation_space.flat_dim
        self._action_dim = action_space.flat_dim
    elif isinstance(env_spec, MAEnvSpec):
        assert agent_id is not None
        self._observation_dim = env_spec.observation_space[agent_id].flat_dim
        if joint:
            self._action_dim = env_spec.action_space.flat_dim
        else:
            self._action_dim = env_spec.action_space[agent_id].flat_dim
    else:
        self._action_dim = env_spec.action_space.flat_dim
        self._observation_dim = env_spec.observation_space.flat_dim

    self._observations_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='observations_agent_{}'.format(agent_id))

    super(NNVFunction, self).__init__(
        inputs=(self._observations_ph, ),
        name=self._name,
        hidden_layer_sizes=hidden_layer_sizes)
def __init__(self, env_spec, max_replay_buffer_size, joint=False, agent_id=None):
    super(SimpleReplayBuffer, self).__init__()
    Serializable.quick_init(self, locals())

    max_replay_buffer_size = int(max_replay_buffer_size)

    self.joint = joint
    self._env_spec = env_spec
    self.agent_id = agent_id
    if isinstance(env_spec, MAEnvSpec):
        assert agent_id is not None
        self._observation_dim = env_spec.observation_space[agent_id].flat_dim
        self._action_dim = env_spec.action_space[agent_id].flat_dim
        if joint:
            self._opponent_action_dim = env_spec.action_space.opponent_flat_dim(agent_id)
            print(agent_id, self._opponent_action_dim)
            self._opponent_actions = np.zeros(
                (max_replay_buffer_size, self._opponent_action_dim))
    else:
        self._action_dim = env_spec.action_space.flat_dim
        self._observation_dim = env_spec.observation_space.flat_dim

    self._max_buffer_size = max_replay_buffer_size
    self._observations = np.zeros((max_replay_buffer_size, self._observation_dim))
    # It's a bit memory inefficient to save the observations twice,
    # but it makes the code *much* easier since you no longer have to
    # worry about termination conditions.
    self._next_obs = np.zeros((max_replay_buffer_size, self._observation_dim))
    self._actions = np.zeros((max_replay_buffer_size, self._action_dim))
    self._rewards = np.zeros(max_replay_buffer_size)
    self._terminals = np.zeros(max_replay_buffer_size, dtype='uint8')
    self._top = 0
    self._size = 0
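# Illustration (not part of the original class): the buffer above is a
# fixed-size ring indexed by `_top`, with `_size` capped at `_max_buffer_size`.
# The add/advance logic is not shown; a minimal sketch of how such a ring
# buffer is typically advanced (names are hypothetical, not this repo's API):

import numpy as np


class RingBufferSketch(object):
    def __init__(self, max_size, obs_dim, action_dim):
        self._max_buffer_size = max_size
        self._observations = np.zeros((max_size, obs_dim))
        self._actions = np.zeros((max_size, action_dim))
        self._rewards = np.zeros(max_size)
        self._top = 0
        self._size = 0

    def add_sample(self, obs, action, reward):
        self._observations[self._top] = obs
        self._actions[self._top] = action
        self._rewards[self._top] = reward
        # Advance the write head, wrapping around and overwriting old samples.
        self._top = (self._top + 1) % self._max_buffer_size
        self._size = min(self._size + 1, self._max_buffer_size)


buf = RingBufferSketch(max_size=4, obs_dim=2, action_dim=1)
for t in range(6):
    buf.add_sample(np.ones(2) * t, np.zeros(1), float(t))
assert buf._size == 4 and buf._top == 2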
def __setstate__(self, d):
    Serializable.__setstate__(self, d)
    global load_params
    if load_params:
        tf.get_default_session().run(
            tf.variables_initializer(self.get_params()))
        self.set_param_values(d["params"])
def __init__(self,
             env_spec,
             max_pool_size,
             replacement_policy='stochastic',
             replacement_prob=1.0,
             max_skip_episode=10):
    Serializable.quick_init(self, locals())
    super(SimpleReplayPool, self).__init__(env_spec)

    max_pool_size = int(max_pool_size)

    self._max_pool_size = max_pool_size
    self._replacement_policy = replacement_policy
    self._replacement_prob = replacement_prob
    self._max_skip_episode = max_skip_episode
    self._observations = np.zeros((max_pool_size, self._observation_dim))
    self._actions = np.zeros((max_pool_size, self._action_dim))
    self._rewards = np.zeros(max_pool_size)
    # self._terminals[i] = a terminal was received at time i
    self._terminals = np.zeros(max_pool_size, dtype='uint8')
    # self._final_state[i] = state i was the final state in a rollout,
    # so it should never be sampled since it has no corresponding action.
    # In other words, we're saving the s_{t+1} after sampling a tuple of
    # (s_t, a_t, r_t, s_{t+1}, TERMINAL=TRUE)
    self._final_state = np.zeros(max_pool_size, dtype='uint8')
    self._bottom = 0
    self._top = 0
    self._size = 0
    self._env_info = dict()
def __init__(self,
             env_spec,
             hidden_layer_sizes=(100, 100),
             name='qf',
             joint=False,
             agent_id=None):
    Serializable.quick_init(self, locals())

    if isinstance(env_spec, MAEnvSpec):
        assert agent_id is not None
        self._observation_dim = env_spec.observation_space[agent_id].flat_dim
        if joint:
            self._action_dim = env_spec.action_space.flat_dim
        else:
            self._action_dim = env_spec.action_space[agent_id].flat_dim
    else:
        self._action_dim = env_spec.action_space.flat_dim
        self._observation_dim = env_spec.observation_space.flat_dim

    self._obs_pl = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='observation',
    )
    self._action_pl = tf.placeholder(
        tf.float32,
        shape=[None, self._action_dim],
        name='actions',
    )

    super(NNQFunction, self).__init__(
        name, (self._obs_pl, self._action_pl), hidden_layer_sizes)
def __init__(self, env_spec, obs_pl, action, scope_name=None):
    Serializable.quick_init(self, locals())

    self._observation_ph = obs_pl
    self._action = action
    self._scope_name = (
        tf.get_variable_scope().name if not scope_name else scope_name)

    super(NNPolicy, self).__init__(env_spec)
def __init__(self, inputs, name, hidden_layer_sizes):
    Parameterized.__init__(self)
    Serializable.quick_init(self, locals())

    self._name = name
    self._inputs = inputs
    self._layer_sizes = list(hidden_layer_sizes) + [1]

    self._output = self._output_for(self._inputs)
def __setstate__(self, d): """Set Serializable state fo the RLAlgorithm instance.""" Serializable.__setstate__(self, d) self._qf1.set_param_values(d['qf1-params']) self._qf2.set_param_values(d['qf2-params']) self._vf.set_param_values(d['vf-params']) self._policy.set_param_values(d['policy-params']) self._pool.__setstate__(d['pool']) self._env.__setstate__(d['env'])
def __init__(self,
             env_spec=None,
             observation_space=None,
             action_space=None,
             opponent_action_space=None,
             hidden_layer_sizes=(100, 100),
             squash=False,
             squash_func=tf.tanh,
             name='conditional_policy',
             u_range=1.,
             shift=None,
             scale=None,
             joint=False,
             agent_id=None,
             sampling=False):
    Serializable.quick_init(self, locals())

    self.agent_id = agent_id
    if env_spec is None:
        self._observation_dim = observation_space.flat_dim
        self._action_dim = action_space.flat_dim
        self._opponent_action_dim = opponent_action_space.flat_dim
    else:
        assert isinstance(env_spec, MAEnvSpec)
        assert agent_id is not None
        self._action_dim = env_spec.action_space[agent_id].flat_dim
        self._observation_dim = env_spec.observation_space[agent_id].flat_dim
        self._opponent_action_dim = env_spec.action_space.opponent_flat_dim(agent_id)
        print('opp dim', self._opponent_action_dim)

    self._layer_sizes = list(hidden_layer_sizes) + [self._opponent_action_dim]
    self._squash = squash
    self._squash_func = squash_func
    self._u_range = u_range
    self.shift = shift
    self.scale = scale
    self.sampling = sampling
    self._name = name + '_agent_{}'.format(agent_id)

    self._observation_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='observation_{}_agent_{}'.format(name, agent_id))
    self._actions_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._action_dim],
        name='actions_{}_agent_{}'.format(name, agent_id))

    self._opponent_actions = self.actions_for(self._observation_ph, self._actions_ph)

    super(StochasticNNConditionalPolicy, self).__init__(
        env_spec, self._observation_ph, self._opponent_actions, self._name)
def __init__(self, observation_space, action_space):
    """
    :type observation_space: Space
    :type action_space: Space
    """
    Serializable.quick_init(self, locals())
    self._observation_space = observation_space
    self._action_space = action_space
def __init__(self,
             env_spec=None,
             observation_space=None,
             action_space=None,
             nego_round=1,
             hidden_layer_sizes=(100, 100),
             squash=False,
             squash_func=tf.tanh,
             name='accstochastic_policy',
             u_range=1.,
             shift=None,
             scale=None,
             joint=False,
             agent_id=None,
             sampling=False):
    Serializable.quick_init(self, locals())

    if env_spec is None:
        self._observation_dim = observation_space
        self._action_dim = action_space
    elif isinstance(env_spec, MAEnvSpec):
        assert agent_id is not None
        self._observation_dim = env_spec.observation_space.flat_dim
        self._action_dim = env_spec.action_space.flat_dim
    else:
        self._action_dim = env_spec.action_space.flat_dim
        self._observation_dim = env_spec.observation_space.flat_dim ** 2

    self._layer_sizes = list(hidden_layer_sizes) + [1]
    self._squash = squash
    self._squash_func = squash_func
    self._u_range = u_range
    self.shift = shift
    self.scale = scale
    self._name = name + '_agent_{}'.format(agent_id)
    self.sampling = sampling
    self.agent_id = agent_id
    self.nego_round = nego_round

    self._observation_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='observation_{}_agent_{}'.format(name, agent_id))

    self._actions = self.actions_for(self._observation_ph)

    super(ACConditionedStochasticNNPolicy, self).__init__(
        env_spec, self._observation_ph, self._actions, self._name)
def __init__(self,
             env_spec,
             agent_id,
             opponent=False,
             urange=[-1, 1.],
             if_softmax=False):
    Serializable.quick_init(self, locals())

    self._action_dim = env_spec.action_space[agent_id].flat_dim
    self._urange = urange
    self._if_softmax = if_softmax
    if opponent:
        self._action_dim = env_spec.action_space.opponent_flat_dim(agent_id)

    self._name = 'uniform_policy_{}'.format(agent_id)

    super(UniformPolicy, self).__init__(env_spec)
def __init__(self, observation_spaces, action_spaces):
    """
    :type observation_spaces: MASpace
    :type action_spaces: MASpace
    """
    Serializable.quick_init(self, locals())

    assert isinstance(observation_spaces, MASpace)
    assert isinstance(action_spaces, MASpace)
    self.agent_num = observation_spaces.agent_num
    self._observation_spaces = observation_spaces
    self._action_spaces = action_spaces
    # Build one single-agent EnvSpec per agent. Note: passing a bare generator
    # to np.array would yield a 0-d object array, so materialize a list first.
    self._env_specs = np.array([
        EnvSpec(observation_space, action_space)
        for observation_space, action_space in zip(observation_spaces, action_spaces)
    ])
def __init__(self,
             env_spec,
             K=2,
             hidden_layer_sizes=(100, 100),
             reg=1e-3,
             squash=True,
             reparameterize=False,
             qf=None,
             name='gmm_policy'):
    """
    Args:
        env_spec (`rllab.EnvSpec`): Specification of the environment
            to create the policy for.
        K (`int`): Number of mixture components.
        hidden_layer_sizes (`list` of `int`): Sizes for the multilayer
            perceptron hidden layers.
        reg (`float`): Regularization coefficient for the GMM parameters.
        squash (`bool`): If True, squash the GMM action samples
            between -1 and 1 with tanh.
        qf (`ValueFunction`): Q-function approximator.
    """
    Serializable.quick_init(self, locals())

    self._hidden_layers = hidden_layer_sizes
    self._action_dim = env_spec.action_space.flat_dim
    self._Ds = env_spec.observation_space.flat_dim
    self._K = K
    self._is_deterministic = False
    self._fixed_h = None
    self._squash = squash
    self._qf = qf
    self._reg = reg

    # We can only reparameterize if there is a single component in the GMM,
    # in which case one should use sac.policies.GaussianPolicy instead.
    assert not reparameterize
    self._reparameterize = reparameterize

    self.name = name
    self.build()

    self._scope_name = (
        tf.get_variable_scope().name + "/" + name).lstrip("/")

    # TODO.code_consolidation: This should probably call
    # `super(GMMPolicy, self).__init__`
    super(NNPolicy, self).__init__(env_spec)
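# Illustration (not part of the original class): the policy above models
# actions with a K-component Gaussian mixture and optionally squashes samples
# with tanh. Ancestral sampling from such a mixture, as a self-contained numpy
# sketch (not this class's TF graph; names are hypothetical):

import numpy as np


def sample_gmm_action(weights, mus, log_stds, squash=True):
    """weights: (K,), mus/log_stds: (K, action_dim)."""
    k = np.random.choice(len(weights), p=weights)   # pick a mixture component
    sample = mus[k] + np.exp(log_stds[k]) * np.random.randn(*mus[k].shape)
    return np.tanh(sample) if squash else sample


action = sample_gmm_action(
    weights=np.array([0.5, 0.5]),
    mus=np.zeros((2, 3)),
    log_stds=np.zeros((2, 3)))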
def __init__(self,
             env_spec,
             agent_id=None,
             hidden_layer_sizes=(100, 100),
             name='vf'):
    Serializable.quick_init(self, locals())

    self._observation_dim = env_spec.observation_space.flat_dim
    if agent_id is not None and agent_id != 'all':
        self._observation_dim = env_spec.observation_space[agent_id].flat_dim

    self._obs_pl = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='observation',
    )

    super(NNVFunction, self).__init__(name, (self._obs_pl, ), hidden_layer_sizes)
def __init__(self, env_spec, q_functions):
    Serializable.quick_init(self, locals())

    self.q_functions = q_functions
    self._action_dim = env_spec.action_space.flat_dim
    self._observation_dim = env_spec.observation_space.flat_dim

    self._observations_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='observations')
    self._actions_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._action_dim],
        name='actions')

    self._output = self.output_for(
        self._observations_ph, self._actions_ph, reuse=True)
def __getstate__(self): """Get Serializable state of the RLALgorithm instance.""" d = Serializable.__getstate__(self) d.update({ 'qf1-params': self._qf1.get_param_values(), 'qf2-params': self._qf2.get_param_values(), 'vf-params': self._vf.get_param_values(), 'policy-params': self._policy.get_param_values(), 'pool': self._pool.__getstate__(), 'env': self._env.__getstate__(), }) return d
def __init__(self,
             agent_num,
             game_name='pbeauty',
             p=0.67,
             reward_type='abs',
             action_low=-1.,
             action_high=1.):
    Serializable.quick_init(self, locals())

    self.agent_num = agent_num
    self.p = p
    self.game_name = game_name
    self.reward_type = reward_type
    self.action_range = [action_low, action_high]

    lows = np.array([np.array([action_low]) for _ in range(self.agent_num)])
    highs = np.array([np.array([action_high]) for _ in range(self.agent_num)])
    self.action_spaces = MABox(lows=lows, highs=highs)
    self.observation_spaces = MADiscrete([1] * self.agent_num)
    self.env_specs = MAEnvSpec(self.observation_spaces, self.action_spaces)

    self.t = 0
    self.rewards = np.zeros((self.agent_num, ))
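# Illustration (not part of the original class): the step/reward logic of the
# p-beauty contest is not shown above. In the standard game each agent is
# rewarded by how close its action is to p times the mean of all actions; with
# reward_type='abs' a natural choice is the negative absolute distance. A
# minimal numpy sketch of that assumption:

import numpy as np


def pbeauty_rewards(actions, p=0.67):
    """Hypothetical reward: -|a_i - p * mean(a)| for each agent i."""
    actions = np.asarray(actions, dtype=np.float64)
    target = p * actions.mean()
    return -np.abs(actions - target)


# Example: the agent closest to p * mean receives the least-negative reward.
print(pbeauty_rewards([0.1, 0.5, 0.9], p=0.67))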
def __init__(self,
             env_spec,
             base_policy,
             conditional_policy,
             opponent_conditional_policy,
             agent_id,
             k,
             name='level_k'):
    Serializable.quick_init(self, locals())

    self._base_policy = base_policy
    self._conditional_policy = conditional_policy
    self._opponent_conditional_policy = opponent_conditional_policy
    self._k = k
    self._observation_dim = env_spec.observation_space[agent_id].flat_dim
    self._action_dim = env_spec.action_space[agent_id].flat_dim
    self._name = name + '_agent_{}'.format(agent_id)
    self._observation_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='{}_observation_agent_{}'.format(name, agent_id))
    self.agent_id = agent_id
    self._actions, self.all_actions = self.actions_for(
        self._observation_ph, reuse=True, all_action=True)

    super(MultiLevelPolicy, self).__init__(
        env_spec, self._observation_ph, self._actions, self._name)
def __init__(self,
             env_spec,
             layer_sizes=(128, 16),
             output_nonlinearity=None,
             name='observations_preprocessor'):
    Parameterized.__init__(self)
    Serializable.quick_init(self, locals())

    self._name = name
    self._observation_dim = env_spec.observation_space.flat_dim

    obs_ph = tf.placeholder(
        tf.float32,
        shape=(None, self._observation_dim),
        name='observations',
    )

    self._input_pls = (obs_ph, )
    self._layer_sizes = layer_sizes
    self._output_nonlinearity = output_nonlinearity

    self._output_t = self.get_output_for(obs_ph, reuse=tf.AUTO_REUSE)
def __getstate__(self):
    d = Serializable.__getstate__(self)
    global load_params
    if load_params:
        d["params"] = self.get_param_values()
    return d
def __init__(self,
             env_spec=None,
             observation_space=None,
             action_space=None,
             hidden_layer_sizes=(100, 100),
             squash=False,
             squash_func=tf.tanh,
             name='policy',
             noise_level=0.0,
             u_range=1.,
             shift=None,
             scale=None,
             joint=False,
             opponent_policy=False,
             agent_id=None,
             sampling=False,
             mu=0,
             theta=0.15,
             sigma=0.3):
    Serializable.quick_init(self, locals())

    if env_spec is None:
        self._observation_dim = observation_space.flat_dim
        self._action_dim = action_space.flat_dim
    elif isinstance(env_spec, MAEnvSpec):
        assert agent_id is not None
        self._observation_dim = env_spec.observation_space[agent_id].flat_dim
        if joint:
            self._action_dim = env_spec.action_space.flat_dim
            if opponent_policy:
                print('opponent_policy', opponent_policy)
                self._action_dim = env_spec.action_space.opponent_flat_dim(agent_id)
        else:
            self._action_dim = env_spec.action_space[agent_id].flat_dim
    else:
        self._action_dim = env_spec.action_space.flat_dim
        self._observation_dim = env_spec.observation_space.flat_dim

    self._layer_sizes = list(hidden_layer_sizes) + [self._action_dim]
    print(self._layer_sizes)
    self._squash = squash
    self._squash_func = squash_func
    self.agent_id = agent_id
    self._u_range = u_range
    self.shift = shift
    self.scale = scale
    self._name = name + '_agent_{}'.format(agent_id)
    self.noise_level = noise_level
    self.sampling = sampling

    self.mu = mu
    self.theta = theta
    self.sigma = sigma
    self.state = np.ones(self._action_dim) * self.mu

    self._observation_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='observation_agent_{}'.format(agent_id))

    self._actions = self.actions_for(self._observation_ph)

    super(DeterministicNNPolicy, self).__init__(
        env_spec, self._observation_ph, self._actions, self._name)
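# Illustration (not part of the original class): the mu/theta/sigma/state
# attributes above match the usual parameterization of Ornstein-Uhlenbeck
# exploration noise (as popularized by DDPG); the actual noise update is not
# shown here. A minimal, self-contained sketch of that assumed scheme:

import numpy as np


class OUNoiseSketch(object):
    def __init__(self, action_dim, mu=0., theta=0.15, sigma=0.3):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(action_dim) * mu

    def sample(self):
        # Mean-reverting step: drift toward mu plus Gaussian diffusion.
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(*self.state.shape)
        self.state = self.state + dx
        return self.state


noise = OUNoiseSketch(action_dim=2)
# e.g. scale the noise by a noise_level and clip to the action range.
noisy_action = np.clip(np.zeros(2) + 0.1 * noise.sample(), -1., 1.)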
def __init__(self,
             env_spec,
             agent_id=None,
             observation_space=None,
             action_space=None,
             mode="train",
             squash=True,
             bijector_config=None,
             reparameterize=False,
             observations_preprocessor=None,
             fix_h_on_reset=False,
             q_function=None,
             n_map_action_candidates=100,
             name="lsp_policy",
             joint=False):  # `joint` was referenced below but missing from the signature
    """Initialize LatentSpacePolicy.

    Args:
        env_spec (`rllab.EnvSpec`): Specification of the environment
            to create the policy for.
        bijector_config (`dict`): Parameter configuration for bijector.
        squash (`bool`): If True, squash the action samples between
            -1 and 1 with tanh.
        n_map_action_candidates (`int`): Number of action candidates for
            estimating the maximum a posteriori (deterministic) action.
    """
    Serializable.quick_init(self, locals())

    self._env_spec = env_spec
    if env_spec is None:
        self._observation_dim = observation_space.flat_dim
        self._action_dim = action_space.flat_dim
    elif isinstance(env_spec, MAEnvSpec):
        assert agent_id is not None
        self._observation_dim = env_spec.observation_space[agent_id].flat_dim
        if joint:
            self._action_dim = env_spec.action_space.flat_dim
        else:
            self._action_dim = env_spec.action_space[agent_id].flat_dim
    else:
        self._action_dim = env_spec.action_space.flat_dim
        self._observation_dim = env_spec.observation_space.flat_dim

    self._bijector_config = bijector_config
    self._mode = mode
    self._squash = squash
    self._reparameterize = reparameterize
    self._fix_h_on_reset = fix_h_on_reset
    self._q_function = q_function
    self._n_map_action_candidates = n_map_action_candidates

    self._action_dim = env_spec.action_space.flat_dim
    self._Ds = env_spec.observation_space.flat_dim
    self._fixed_h = None
    self._is_deterministic = False
    self._observations_preprocessor = observations_preprocessor

    self.name = name + '_agent_{}'.format(agent_id)
    self.build()

    self._scope_name = (
        tf.get_variable_scope().name + "/" + name).lstrip("/")

    super(NNPolicy, self).__init__(env_spec)
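# Illustration (not part of the original class): the docstring above describes
# estimating a deterministic (MAP-like) action by scoring
# `n_map_action_candidates` sampled actions and keeping the best one. A
# self-contained numpy sketch of that idea (the sampler and Q-function here
# are stand-ins, not this repo's API):

import numpy as np


def map_action(observation, sample_actions, q_values, n_candidates=100):
    """Pick the candidate action with the highest Q-value."""
    candidates = sample_actions(observation, n_candidates)   # (N, action_dim)
    scores = q_values(observation, candidates)                # (N,)
    return candidates[np.argmax(scores)]


# Toy usage: candidates from a unit Gaussian, scored by distance to the origin.
obs = np.zeros(3)
best = map_action(
    obs,
    sample_actions=lambda o, n: np.random.randn(n, 2),
    q_values=lambda o, a: -np.linalg.norm(a, axis=1))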
def __init__(
        self,
        base_kwargs,
        agent_id,
        env,
        policy,
        initial_exploration_policy,
        qf1,
        qf2,
        vf,
        pool,
        target_entropy='auto',
        plotter=None,
        lr=3e-3,
        scale_reward=1,
        discount=0.99,
        tau=0.01,
        target_update_interval=1,
        action_prior='uniform',
        reparameterize=False,
        save_full_state=False,
):
    """
    Args:
        base_kwargs (dict): Dictionary of base arguments that are directly
            passed to the base `RLAlgorithm` constructor.
        env (`rllab.Env`): rllab environment object.
        policy (`rllab.NNPolicy`): A policy function approximator.
        initial_exploration_policy (`Policy`): A policy used for initial
            exploration; it is not trained by the algorithm.
        qf1 (`ValueFunction`): First Q-function approximator.
        qf2 (`ValueFunction`): Second Q-function approximator. Using two
            Q-functions improves performance by reducing overestimation bias.
        vf (`ValueFunction`): Soft value function approximator.
        pool (`PoolBase`): Replay buffer to add gathered samples to.
        plotter (`QFPolicyPlotter`): Plotter instance used for visualizing
            the Q-function during training.
        lr (`float`): Learning rate used for the function approximators.
        discount (`float`): Discount factor for Q-function updates.
        tau (`float`): Soft value function target update weight.
        target_update_interval (`int`): Frequency (in iterations) at which
            target network updates occur.
        reparameterize (`bool`): If True, use a gradient estimator for the
            policy derived with the reparameterization trick; otherwise use
            a likelihood-ratio-based estimator.
        save_full_state (`bool`): If True, save the full class in the
            snapshot. See `self.get_snapshot` for more information.
    """
    Serializable.quick_init(self, locals())
    super(MASAC, self).__init__(**base_kwargs)

    self._env = env
    self._agent_id = agent_id
    self._policy = policy
    self._initial_exploration_policy = initial_exploration_policy
    self._qf1 = qf1
    self._qf2 = qf2
    # self._vf = vf
    self._pool = pool
    self._plotter = plotter

    self._policy_lr = lr
    self._qf_lr = lr
    self._vf_lr = lr
    self._scale_reward = scale_reward
    self._discount = discount
    self._tau = tau
    self._target_update_interval = target_update_interval
    self._action_prior = action_prior
    self._target_entropy = (
        -np.prod(self._env.action_space.shape)
        if target_entropy == 'auto' else target_entropy)

    # The reparameterize flag must match between the algorithm and the
    # policy its actions are sampled from.
    assert reparameterize == self._policy._reparameterize
    self._reparameterize = reparameterize

    self._save_full_state = save_full_state

    self._observation_dim = self.env.observation_spaces[self._agent_id].flat_dim
    self._action_dim = self.env.action_spaces[self._agent_id].flat_dim
    # Just for the two-agent case.
    self._opponent_action_dim = self.env.action_spaces.opponent_flat_dim(self._agent_id)

    self._training_ops = list()

    self._init_placeholders()
    self._init_actor_update()
    self._init_critic_update()
    self._init_target_ops()

    # Initialize all uninitialized variables. This prevents initializing
    # pre-trained policy, qf, and vf variables.
    uninit_vars = []
    for var in tf.global_variables():
        try:
            self._sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninit_vars.append(var)
    self._sess.run(tf.variables_initializer(uninit_vars))
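# Illustration (not part of the original class): `tau` above weights the soft
# (Polyak) update of the target network built in `_init_target_ops`, which is
# not shown here. The standard form of that update, as a self-contained numpy
# sketch:

import numpy as np


def polyak_update(target_params, source_params, tau=0.01):
    """target <- tau * source + (1 - tau) * target, element-wise."""
    return [(1. - tau) * t + tau * s for t, s in zip(target_params, source_params)]


target = [np.zeros(3)]
source = [np.ones(3)]
for _ in range(10):
    target = polyak_update(target, source, tau=0.5)
# After a few updates the target parameters approach the source parameters.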