def test_clone_target(self):
    observation1_np = self.env.reset()
    observation2_np = self.env.step(self.env.action_space.sample())[0]
    observations_np = np.stack(
        (observation1_np, observation2_np)
    ).astype(np.float32)

    weights = self.policy.get_weights()
    actions_np = self.policy.get_actions_np([observations_np])
    log_pis_np = self.policy.log_pis_np([observations_np], actions_np)

    target_name = "{}_{}".format("target", self.policy._name)
    target_policy = Serializable.clone(self.policy, name=target_name)

    weights_2 = target_policy.get_weights()
    log_pis_np_2 = target_policy.log_pis_np([observations_np], actions_np)

    self.assertEqual(target_policy._name, target_name)
    self.assertIsNot(weights, weights_2)
    for weight, weight_2 in zip(weights, weights_2):
        np.testing.assert_array_equal(weight.shape, weight_2.shape)
    np.testing.assert_array_equal(log_pis_np.shape, log_pis_np_2.shape)
    np.testing.assert_equal(
        actions_np.shape,
        self.policy.get_actions_np([observations_np]).shape,
    )
def test_clone_target(self):
    observation1_np = self.env.reset()
    observation2_np = self.env.step(self.env.action_space.sample())[0]
    observations_np = np.stack(
        (observation1_np, observation2_np)
    ).astype(np.float32)

    weights = self.V.get_weights()
    values_np = self.V.get_values_np([observations_np])

    target_name = '{}_{}'.format('target', self.V._name)
    target_V = Serializable.clone(self.V, name=target_name)

    weights_2 = target_V.get_weights()

    self.assertEqual(target_V._name, target_name)
    self.assertIsNot(weights, weights_2)
    for weight, weight_2 in zip(weights, weights_2):
        np.testing.assert_array_equal(weight.shape, weight_2.shape)
    np.testing.assert_equal(
        values_np.shape,
        target_V.get_values_np([observations_np]).shape)
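# For reference: the two tests above rely only on `Serializable.clone`
# rebuilding a fresh network from the constructor arguments captured by
# `_Serializable__initialize(locals())`, with selected kwargs overridden.
# A minimal sketch of those assumed semantics (not the library's actual
# implementation; `_saved_kwargs` is a hypothetical storage attribute):
def _clone_sketch(obj, **overrides):
    kwargs = dict(obj._saved_kwargs)  # ctor args captured at __init__ time
    kwargs.update(overrides)          # e.g. name='target_...'
    return type(obj)(**kwargs)        # same architecture, fresh weights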
def __init__(self,
             env_specs,  # environment spec exposing observation_space / action_space
             policy,
             qfs,
             vf,
             replay_buffer,
             policy_optimizer=tf.optimizers.Adam(),
             qfs_optimizers=(tf.optimizers.Adam(), tf.optimizers.Adam()),
             vf_optimizer=tf.optimizers.Adam(),
             exploration_strategy=None,
             exploration_interval=10,
             target_update_tau=0.01,
             target_update_period=10,
             td_errors_loss_fn=None,
             alpha=0.05,
             gamma=0.95,
             reward_scale=1.0,
             gradient_clipping=None,
             train_sequence_length=None,
             name='SAC',
             agent_id=-1):
    self._Serializable__initialize(locals())
    self._env_specs = env_specs
    observation_space = self._env_specs.observation_space
    action_space = self._env_specs.action_space

    self._exploration_strategy = exploration_strategy

    self._target_vf = Serializable.clone(vf, name='target_vf')

    self._policy_optimizer = policy_optimizer
    self._qfs_optimizers = qfs_optimizers
    self._vf_optimizer = vf_optimizer

    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._td_errors_loss_fn = td_errors_loss_fn or tf.losses.Huber()
    self._gamma = gamma
    self._reward_scale = reward_scale
    self._gradient_clipping = gradient_clipping
    self._train_step = 0
    self._exploration_interval = exploration_interval
    self._exploration_status = False

    self.required_experiences = ['observation', 'actions', 'rewards',
                                 'next_observations', 'terminals',
                                 'annealing']

    # The twin Q-functions are held by the SAC agent itself rather than
    # by OffPolicyAgent.
    self._qfs = qfs
    self._vf = vf

    super(SACAgent, self).__init__(
        observation_space,
        action_space,
        policy,
        qfs,
        replay_buffer,
        train_sequence_length=train_sequence_length,
        name=name,
    )
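# Every agent in this section stores `target_update_tau` and
# `target_update_period`. A minimal sketch of the Polyak-style soft update
# these conventionally drive (the helper is illustrative, not this
# repository's API):
def _soft_update_sketch(source_variables, target_variables, tau=0.01):
    # Average source weights into target weights; tau=1.0 is a hard copy.
    for w, w_target in zip(source_variables, target_variables):
        w_target.assign((1.0 - tau) * w_target + tau * w)
# Typically invoked every `target_update_period` training steps, e.g. when
# `self._train_step % self._target_update_period == 0`.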
def __init__(self,
             env_specs,
             policy,
             qf,
             replay_buffer,
             opponent_policy,
             policy_optimizer=tf.optimizers.Adam(1e-3),
             qf_optimizer=tf.optimizers.Adam(1e-3),
             opponent_policy_optimizer=tf.optimizers.Adam(1e-3),
             opponent_prior_optimizer=tf.optimizers.Adam(1e-3),
             exploration_strategy=None,
             target_update_tau=0.01,
             target_update_period=1,
             td_errors_loss_fn=None,
             gamma=0.95,
             reward_scale=1.0,
             gradient_clipping=None,
             train_sequence_length=None,
             name='PR2',
             agent_id=-1):
    self._Serializable__initialize(locals())
    self._agent_id = agent_id
    self._env_specs = env_specs
    if self._agent_id >= 0:
        observation_space = self._env_specs.observation_space[
            self._agent_id]
        action_space = self._env_specs.action_space[self._agent_id]
        opponent_observation_flat_dim = (
            self._env_specs.observation_space.opponent_flat_dim(
                self._agent_id))
        opponent_action_flat_dim = (
            self._env_specs.action_space.opponent_flat_dim(
                self._agent_id))
    else:
        observation_space = self._env_specs.observation_space
        action_space = self._env_specs.action_space

    self._exploration_strategy = exploration_strategy

    self._target_policy = None
    self._opponent_policy = opponent_policy
    self._prior = Serializable.clone(
        self._opponent_policy,
        name='prior_{}'.format(self._agent_id))
    self._target_qf = Serializable.clone(
        qf, name='target_qf_agent_{}'.format(self._agent_id))

    self._actor_optimizer = policy_optimizer
    self._critic_optimizer = qf_optimizer
    self._opponent_policy_optimizer = opponent_policy_optimizer
    self._opponent_prior_optimizer = opponent_prior_optimizer

    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._td_errors_loss_fn = td_errors_loss_fn or tf.losses.Huber()
    self._gamma = gamma
    self._reward_scale = reward_scale
    self._gradient_clipping = gradient_clipping
    self._train_step = 0
    self._exploration_interval = 10
    self._exploration_status = True

    self.required_experiences = [
        'observation', 'actions', 'rewards', 'next_observations',
        'opponent_actions', 'terminals', 'annealing',
        'recent_observations', 'recent_opponent_actions'
    ]

    super(PR2SoftAgent, self).__init__(
        observation_space,
        action_space,
        policy,
        qf,
        replay_buffer,
        train_sequence_length=train_sequence_length,
        name=name)
def __init__(
    self,
    env_specs,
    policy,
    qf,
    replay_buffer,
    policy_optimizer=tf.optimizers.Adam(),
    qf_optimizer=tf.optimizers.Adam(),
    exploration_strategy=None,
    exploration_interval=10,
    target_update_tau=0.01,
    target_update_period=1,
    td_errors_loss_fn=None,
    gamma=0.95,
    reward_scale=1.0,
    gradient_clipping=None,
    train_sequence_length=None,
    name="MADDPG",
    agent_id=-1,
):
    self._Serializable__initialize(locals())
    self._agent_id = agent_id
    self._env_specs = env_specs
    if self._agent_id >= 0:
        observation_space = self._env_specs.observation_space[self._agent_id]
        action_space = self._env_specs.action_space[self._agent_id]
    else:
        observation_space = self._env_specs.observation_space
        action_space = self._env_specs.action_space

    self._exploration_strategy = exploration_strategy

    self._target_policy = Serializable.clone(
        policy, name="target_policy_agent_{}".format(self._agent_id))
    self._target_qf = Serializable.clone(
        qf, name="target_qf_agent_{}".format(self._agent_id))

    self._policy_optimizer = policy_optimizer
    self._qf_optimizer = qf_optimizer

    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._td_errors_loss_fn = td_errors_loss_fn or tf.losses.Huber()
    self._gamma = gamma
    self._reward_scale = reward_scale
    self._gradient_clipping = gradient_clipping
    self._train_step = 0
    self._exploration_interval = exploration_interval
    self._exploration_status = False

    self.required_experiences = [
        "observation",
        "actions",
        "rewards",
        "next_observations",
        "opponent_actions",
        "target_actions",
    ]

    super(MADDPGAgent, self).__init__(
        observation_space,
        action_space,
        policy,
        qf,
        replay_buffer,
        train_sequence_length=train_sequence_length,
        name=name,
    )
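# For context: `gamma`, `reward_scale`, and `td_errors_loss_fn` conventionally
# combine into a one-step TD target for the critic. A sketch under assumed
# batch keys and call conventions (not MADDPG's actual training step):
import tensorflow as tf

def _critic_loss_sketch(qf, target_qf, batch, gamma=0.95, reward_scale=1.0,
                        loss_fn=tf.losses.Huber()):
    # Bootstrapped target; `terminals` (if present) masks the bootstrap term.
    target_q = target_qf([batch["next_observations"], batch["target_actions"]])
    td_target = (reward_scale * batch["rewards"]
                 + gamma * (1.0 - batch["terminals"]) * target_q)
    return loss_fn(tf.stop_gradient(td_target),
                   qf([batch["observations"], batch["actions"]]))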
def __init__(
    self,
    env_specs,
    policy,
    qf,
    ind_qf,
    replay_buffer,
    opponent_policy,
    policy_optimizer=tf.optimizers.Adam(),
    qf_optimizer=tf.optimizers.Adam(),
    opponent_policy_optimizer=tf.optimizers.Adam(),
    value_n_particles=16,
    kernel_update_ratio=0.5,
    exploration_strategy=None,
    target_update_tau=0.01,
    target_update_period=1,
    td_errors_loss_fn=None,
    gamma=0.95,
    reward_scale=1.0,
    gradient_clipping=None,
    train_sequence_length=None,
    loss_type="svgd",
    name="PR2",
    agent_id=-1,
):
    self._Serializable__initialize(locals())
    self._agent_id = agent_id
    self._env_specs = env_specs
    self._value_n_particles = value_n_particles
    self._kernel_update_ratio = kernel_update_ratio
    self._loss_type = loss_type

    observation_space = self._env_specs.observation_space[self._agent_id]
    action_space = self._env_specs.action_space[self._agent_id]
    self._observation_flat_dim = (
        self._env_specs.observation_space.agent_flat_dim(self._agent_id))
    self._action_flat_dim = (
        self._env_specs.action_space.agent_flat_dim(self._agent_id))
    self._opponent_observation_flat_dim = (
        self._env_specs.observation_space.opponent_flat_dim(self._agent_id))
    self._opponent_action_flat_dim = (
        self._env_specs.action_space.opponent_flat_dim(self._agent_id))

    self._exploration_strategy = exploration_strategy

    self._ind_qf = ind_qf
    self._opponent_policy = opponent_policy
    self._prior = Serializable.clone(
        self._opponent_policy, name="prior_{}".format(self._agent_id)
    )
    self._target_policy = Serializable.clone(
        policy, name="target_policy_agent_{}".format(self._agent_id)
    )
    self._target_qf = Serializable.clone(
        qf, name="target_qf_agent_{}".format(self._agent_id)
    )

    self._actor_optimizer = policy_optimizer
    self._critic_optimizer = qf_optimizer
    self._opponent_policy_optimizer = opponent_policy_optimizer

    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._td_errors_loss_fn = td_errors_loss_fn or tf.losses.Huber()
    self._gamma = gamma
    self._reward_scale = reward_scale
    self._gradient_clipping = gradient_clipping
    self._train_step = 0
    self._exploration_interval = 10
    self._exploration_status = True

    self.required_experiences = [
        "observation",
        "actions",
        "rewards",
        "next_observations",
        "opponent_actions",
        "terminals",
        "annealing",
        "recent_observations",
        "recent_opponent_actions",
    ]

    super(PR2Agent, self).__init__(
        observation_space,
        action_space,
        policy,
        qf,
        replay_buffer,
        train_sequence_length=train_sequence_length,
        name=name,
    )
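# `value_n_particles` and `kernel_update_ratio` are the usual knobs of an
# SVGD-style update (`loss_type="svgd"`). A sketch of the RBF kernel with a
# median-distance bandwidth that such updates conventionally use; the helper
# and shapes are illustrative assumptions, not PR2Agent's code:
import tensorflow as tf

def _rbf_kernel_sketch(xs, ys):
    # xs: [n, d] fixed particles, ys: [m, d] updated particles.
    diffs = xs[:, None, :] - ys[None, :, :]          # [n, m, d]
    sq_dists = tf.reduce_sum(diffs ** 2, axis=-1)    # [n, m]
    flat = tf.sort(tf.reshape(sq_dists, [-1]))
    h = flat[tf.size(flat) // 2] / tf.math.log(
        tf.cast(tf.shape(xs)[0], tf.float32) + 1.0)  # median heuristic
    kappa = tf.exp(-sq_dists / (h + 1e-8))           # kernel values
    kappa_grad = -2.0 * diffs / (h + 1e-8) * kappa[..., None]  # d kappa / d xs
    return kappa, kappa_grad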
def __init__(
    self,
    env_specs,
    main_policy,
    opponent_policy,
    prior_policy,
    opponent_prior_policy,
    qf,
    replay_buffer,
    k=3,
    mu=0,
    policy_optimizer=tf.optimizers.Adam(),
    qf_optimizer=tf.optimizers.Adam(),
    opponent_policy_optimizer=tf.optimizers.Adam(1e-2),
    prior_optimizer=tf.optimizers.Adam(1e-2),
    exploration_strategy=None,
    target_update_tau=0.01,
    target_update_period=1,
    td_errors_loss_fn=None,
    gamma=0.95,
    reward_scale=1.0,
    gradient_clipping=None,
    train_sequence_length=None,
    name="PR2K",
    agent_id=-1,
):
    self._Serializable__initialize(locals())
    self._agent_id = agent_id
    self._env_specs = env_specs
    if self._agent_id >= 0:
        observation_space = self._env_specs.observation_space[self._agent_id]
        action_space = self._env_specs.action_space[self._agent_id]
        opponent_observation_flat_dim = (
            self._env_specs.observation_space.opponent_flat_dim(
                self._agent_id))
        opponent_action_flat_dim = (
            self._env_specs.action_space.opponent_flat_dim(self._agent_id))
    else:
        observation_space = self._env_specs.observation_space
        action_space = self._env_specs.action_space

    self._exploration_strategy = exploration_strategy

    self._target_policy = None
    self._mu = mu
    self._k = k
    self._opponent_policy = opponent_policy
    self._prior_policy = prior_policy
    self._opponent_prior_policy = opponent_prior_policy
    # Wrap the four policies into a single level-k reasoning policy.
    policy = LevelKPolicy(
        main_policy=main_policy,
        secondary_policy=opponent_policy,
        prior_policy=prior_policy,
        secondary_prior_policy=opponent_prior_policy,
    )
    self._target_qf = Serializable.clone(
        qf, name="target_qf_agent_{}".format(self._agent_id))

    self._actor_optimizer = policy_optimizer
    self._critic_optimizer = qf_optimizer
    self._opponent_policy_optimizer = opponent_policy_optimizer
    self._prior_optimizer = prior_optimizer

    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._td_errors_loss_fn = td_errors_loss_fn or tf.losses.Huber()
    self._gamma = gamma
    self._reward_scale = reward_scale
    self._gradient_clipping = gradient_clipping
    self._train_step = 0
    self._exploration_interval = 10
    self._exploration_status = True

    self.required_experiences = [
        "observation",
        "actions",
        "rewards",
        "next_observations",
        "opponent_actions",
        "terminals",
        "annealing",
        "recent_observations",
        "recent_opponent_actions",
    ]

    super(PR2KSoftAgent, self).__init__(
        observation_space,
        action_space,
        policy,
        qf,
        replay_buffer,
        train_sequence_length=train_sequence_length,
        name=name,
    )
def __init__(
    self,
    env_specs,
    policy,
    qf,
    replay_buffer,
    opponent_policy,
    policy_optimizer=tf.optimizers.Adam(0.01),
    qf_optimizer=tf.optimizers.Adam(0.01),
    opponent_policy_optimizer=tf.optimizers.Adam(0.01),
    opponent_prior_optimizer=tf.optimizers.Adam(0.01),
    exploration_strategy=None,
    target_update_tau=0.01,
    target_update_period=1,
    td_errors_loss_fn=None,
    gamma=0.95,
    reward_scale=1.0,
    gradient_clipping=None,
    train_sequence_length=None,
    name="ROMMEO",
    agent_id=-1,
    uniform=False,
    custom_b=False,
    bi=1.0,
    bj=1.0,
):
    self._Serializable__initialize(locals())
    self._agent_id = agent_id
    self._env_specs = env_specs
    self._uniform = uniform
    self._custom_b = custom_b
    self._bj = tf.constant(bj, dtype=tf.float32)
    self._bi = tf.constant(bi, dtype=tf.float32)
    if self._agent_id >= 0:
        observation_space = self._env_specs.observation_space[self._agent_id]
        action_space = self._env_specs.action_space[self._agent_id]
        opponent_observation_flat_dim = (
            self._env_specs.observation_space.opponent_flat_dim(
                self._agent_id))
        opponent_action_flat_dim = (
            self._env_specs.action_space.opponent_flat_dim(self._agent_id))
    else:
        observation_space = self._env_specs.observation_space
        action_space = self._env_specs.action_space

    self._exploration_strategy = exploration_strategy

    self._target_policy = None
    self._opponent_policy = opponent_policy
    # Clone the opponent model as a prior, with reparameterization disabled.
    self._prior = Serializable.clone(
        self._opponent_policy,
        name="prior_{}".format(self._agent_id),
        repara=False,
    )
    self._target_qf = Serializable.clone(
        qf, name="target_qf_agent_{}".format(self._agent_id)
    )

    self._opponent_policy_optimizer = opponent_policy_optimizer
    self._opponent_prior_optimizer = opponent_prior_optimizer
    self._actor_optimizer = policy_optimizer
    self._critic_optimizer = qf_optimizer

    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._td_errors_loss_fn = td_errors_loss_fn or tf.losses.Huber()
    self._gamma = gamma
    self._reward_scale = reward_scale
    self._gradient_clipping = gradient_clipping
    self._train_step = 0
    self._exploration_interval = 10
    self._exploration_status = True

    self.required_experiences = [
        "observation",
        "actions",
        "rewards",
        "next_observations",
        "opponent_actions",
        "terminals",
        "annealing",
        "recent_observations",
        "recent_opponent_actions",
    ]

    super(ROMMEOAgent, self).__init__(
        observation_space,
        action_space,
        policy,
        qf,
        replay_buffer,
        train_sequence_length=train_sequence_length,
        name=name,
    )