def log_diagnostics(self, iteration, batch):
    """Record diagnostic information to the logger.

    Records the mean, min, max, and standard deviation of the GMM
    means, component log-weights, and log standard deviations, as well
    as the log-probabilities of the sampled actions.
    """
    feeds = {self._observations_ph: batch['observations']}
    sess = tf_utils.get_default_session()
    mus, log_sigs, log_ws, log_pis = sess.run((
        self.distribution.mus_t,
        self.distribution.log_sigs_t,
        self.distribution.log_ws_t,
        self.distribution.log_p_t,
    ), feeds)

    logger.record_tabular('gmm-mus-mean', np.mean(mus))
    logger.record_tabular('gmm-mus-min', np.min(mus))
    logger.record_tabular('gmm-mus-max', np.max(mus))
    logger.record_tabular('gmm-mus-std', np.std(mus))
    logger.record_tabular('gmm-log-w-mean', np.mean(log_ws))
    logger.record_tabular('gmm-log-w-min', np.min(log_ws))
    logger.record_tabular('gmm-log-w-max', np.max(log_ws))
    logger.record_tabular('gmm-log-w-std', np.std(log_ws))
    logger.record_tabular('gmm-log-sigs-mean', np.mean(log_sigs))
    logger.record_tabular('gmm-log-sigs-min', np.min(log_sigs))
    logger.record_tabular('gmm-log-sigs-max', np.max(log_sigs))
    logger.record_tabular('gmm-log-sigs-std', np.std(log_sigs))
    logger.record_tabular('log_pi_mean', np.mean(log_pis))
    logger.record_tabular('log_pi_max', np.max(log_pis))
    logger.record_tabular('log_pi_min', np.min(log_pis))
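# Illustrative sketch (not from the original source): how `log_diagnostics`
# would typically be driven from a training/evaluation loop. `policy` and
# `pool` are hypothetical stand-ins; `pool` is assumed to expose a
# `random_batch(batch_size)` method returning a dict with an 'observations'
# key, matching the replay-buffer convention used elsewhere in this codebase.
def _example_log_gmm_diagnostics(policy, pool, iteration, batch_size=256):
    batch = pool.random_batch(batch_size)
    policy.log_diagnostics(iteration, batch)
    logger.dump_tabular()  # flush the values recorded via record_tabular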
def _eval(self, observations, actions):
    """Feed the given observations and actions and run the output op."""
    feeds = {
        self._observations_ph: observations,
        self._actions_ph: actions
    }

    return tf_utils.get_default_session().run(self._output, feeds)
def __init__(
        self,
        sampler,
        n_epochs=1000,
        n_train_repeat=1,
        n_initial_exploration_steps=10000,
        epoch_length=1000,
        eval_n_episodes=10,
        eval_deterministic=True,
        eval_render=False,
        control_interval=1
):
    """
    Args:
        sampler (`Sampler`): Sampler instance used to collect experience
            from the environment.
        n_epochs (`int`): Number of epochs to run the training for.
        n_train_repeat (`int`): Number of times to repeat the training
            for a single time step.
        n_initial_exploration_steps (`int`): Number of steps at the
            beginning of training during which actions are drawn from a
            separate exploration policy.
        epoch_length (`int`): Number of time steps per epoch.
        eval_n_episodes (`int`): Number of rollouts to evaluate.
        eval_deterministic (`bool`): Whether or not to run the policy in
            deterministic mode when evaluating it.
        eval_render (`bool`): Whether or not to render the evaluation
            environment.
        control_interval (`int`): Number of time steps for which each
            selected action is repeated before the policy is queried
            again.
    """
    self.sampler = sampler

    self._n_epochs = int(n_epochs)
    self._n_train_repeat = n_train_repeat
    self._epoch_length = epoch_length
    self._n_initial_exploration_steps = n_initial_exploration_steps
    self._control_interval = control_interval

    self._eval_n_episodes = eval_n_episodes
    self._eval_deterministic = eval_deterministic
    self._eval_render = eval_render

    self._sess = tf_utils.get_default_session()

    self._env = None
    self._policy = None
    self._pool = None
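# Illustrative sketch (assumption, not from the original source): the concrete
# learners below forward these base settings through a `base_kwargs` dict,
# which reaches this constructor via `super().__init__(**base_kwargs)`. The
# helper name and the particular values here are hypothetical.
def _example_base_kwargs(sampler):
    return dict(
        sampler=sampler,
        n_epochs=500,
        n_train_repeat=1,
        n_initial_exploration_steps=1000,
        epoch_length=1000,
        eval_n_episodes=10,
        eval_deterministic=True,
        eval_render=False,
        control_interval=1,
    )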
def _eval(self, inputs):
    """Feed the given input values and run the output op."""
    feeds = {pl: val for pl, val in zip(self._inputs, inputs)}

    return tf_utils.get_default_session().run(self._output, feeds)
def __init__(self,
             base_kwargs,
             agent_id,
             env,
             pool,
             qf,
             target_qf,
             policy,
             target_policy,
             opponent_policy=None,
             plotter=None,
             policy_lr=1E-3,
             qf_lr=1E-3,
             joint=False,
             opponent_modelling=False,
             td_target_update_interval=1,
             discount=0.99,
             tau=0.01,
             reward_scale=1,
             use_saved_qf=False,
             use_saved_policy=False,
             save_full_state=False,
             train_qf=True,
             train_policy=True,
             joint_policy=False,
             SGA=False):
    super(MADDPG, self).__init__(**base_kwargs)

    self._env = env
    self._pool = pool
    self.qf = qf
    self.target_qf = target_qf
    # self.target_qf._name = 'target_' + self.target_qf._name
    self._policy = policy
    self._target_policy = target_policy
    self.opponent_policy = opponent_policy
    # self._target_policy._name = 'target_' + self._target_policy._name
    self.plotter = plotter

    self._agent_id = agent_id
    self.joint = joint
    self.opponent_modelling = opponent_modelling

    self._qf_lr = qf_lr
    self._policy_lr = policy_lr
    self._discount = discount
    self._tau = tau
    self._reward_scale = reward_scale
    self.SGA = SGA

    self._qf_target_update_interval = td_target_update_interval

    self._save_full_state = save_full_state
    self._train_qf = train_qf
    self._train_policy = train_policy
    self.joint_policy = joint_policy

    self._observation_dim = self.env.observation_spaces[
        self._agent_id].flat_dim
    self._opponent_observation_dim = (
        self.env.observation_spaces.opponent_flat_dim(self._agent_id))
    self._action_dim = self.env.action_spaces[self._agent_id].flat_dim
    self._opponent_action_dim = self.env.action_spaces.opponent_flat_dim(
        self._agent_id)

    self._create_placeholders()

    self._training_ops = []
    self._target_ops = []

    self._create_q_update()
    self._create_p_update()
    if self.opponent_modelling:
        self._create_opponent_p_update()
    self._create_target_ops()

    if use_saved_qf:
        saved_qf_params = qf.get_param_values()
    if use_saved_policy:
        saved_policy_params = policy.get_param_values()

    self._sess = tf_utils.get_default_session()
    self._sess.run(tf.global_variables_initializer())

    if use_saved_qf:
        self.qf.set_param_values(saved_qf_params)
    if use_saved_policy:
        self.policy.set_param_values(saved_policy_params)
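# Illustrative sketch (assumption, not from the original source): wiring up
# one MADDPG learner per agent. The per-agent lists of pools, Q-functions,
# and policies are hypothetical stand-ins built elsewhere, and `agent_num`
# is assumed to be exposed by the multi-agent env wrapper.
def _example_build_maddpg(base_kwargs, env, pools, qfs, target_qfs,
                          policies, target_policies):
    agents = []
    for i in range(env.agent_num):
        agents.append(MADDPG(
            base_kwargs=base_kwargs,
            agent_id=i,
            env=env,
            pool=pools[i],
            qf=qfs[i],
            target_qf=target_qfs[i],
            policy=policies[i],
            target_policy=target_policies[i],
            policy_lr=1e-3,
            qf_lr=1e-3,
            tau=0.01,        # soft target-network update coefficient
            discount=0.99,
        ))
    return agents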
def __init__(self,
             base_kwargs,
             agent_id,
             env,
             pool,
             qf,
             target_qf,
             policy,
             plotter=None,
             policy_lr=1E-3,
             qf_lr=1E-3,
             tau=0.01,
             value_n_particles=16,
             td_target_update_interval=1,
             kernel_fn=adaptive_isotropic_gaussian_kernel,
             kernel_n_particles=16,
             kernel_update_ratio=0.5,
             discount=0.99,
             reward_scale=.1,
             joint=False,
             use_saved_qf=False,
             use_saved_policy=False,
             save_full_state=False,
             train_qf=True,
             train_policy=True,
             joint_policy=True,
             opponent_action_range=None,
             opponent_action_range_normalize=True):
    super(MASQL, self).__init__(**base_kwargs)

    self._env = env
    self._pool = pool
    self.qf = qf
    self.target_qf = target_qf
    self._policy = policy
    self.plotter = plotter

    self.agent_id = agent_id

    self._qf_lr = qf_lr
    self._policy_lr = policy_lr
    self._tau = tau
    self._discount = discount
    self._reward_scale = reward_scale

    self.joint_policy = joint_policy
    self.opponent_action_range = opponent_action_range
    self.opponent_action_range_normalize = opponent_action_range_normalize
    self.joint = joint

    self._value_n_particles = value_n_particles
    self._qf_target_update_interval = td_target_update_interval

    self._kernel_fn = kernel_fn
    self._kernel_n_particles = kernel_n_particles
    self._kernel_update_ratio = kernel_update_ratio

    self._save_full_state = save_full_state
    self._train_qf = train_qf
    self._train_policy = train_policy

    self._observation_dim = self.env.observation_spaces[
        self.agent_id].flat_dim
    self._action_dim = self.env.action_spaces[self.agent_id].flat_dim
    # Just for the two-agent case.
    self._opponent_action_dim = self.env.action_spaces.opponent_flat_dim(
        self.agent_id)

    self._create_placeholders()

    self._training_ops = []
    self._target_ops = []

    self._create_td_update()
    self._create_svgd_update()
    self._create_target_ops()

    if use_saved_qf:
        saved_qf_params = qf.get_param_values()
    if use_saved_policy:
        saved_policy_params = policy.get_param_values()

    self._sess = tf_utils.get_default_session()
    self._sess.run(tf.global_variables_initializer())

    if use_saved_qf:
        self.qf.set_param_values(saved_qf_params)
    if use_saved_policy:
        self.policy.set_param_values(saved_policy_params)
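# Illustrative sketch (assumption, not from the original source): a MASQL
# learner for agent 0, spelling out the soft-Q/SVGD-specific knobs from the
# constructor signature above. All objects passed in are hypothetical
# stand-ins built elsewhere.
def _example_build_masql(base_kwargs, env, pool, qf, target_qf, policy):
    return MASQL(
        base_kwargs=base_kwargs,
        agent_id=0,
        env=env,
        pool=pool,
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        value_n_particles=16,         # action samples for the soft value estimate
        kernel_n_particles=16,        # particles used by the SVGD kernel
        kernel_update_ratio=0.5,      # fraction of particles used as SVGD targets
        td_target_update_interval=1,  # steps between target Q-network updates
        reward_scale=0.1,
    )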