Example #1
    def log_diagnostics(self, iteration, batch):
        """Record diagnostic information to the logger.

        Records the mean, min, max, and standard deviation of the GMM
        means, log component weights, log standard deviations, and log
        probabilities.
        """

        feeds = {self._observations_ph: batch['observations']}
        sess = tf_utils.get_default_session()
        mus, log_sigs, log_ws, log_pis = sess.run((
            self.distribution.mus_t,
            self.distribution.log_sigs_t,
            self.distribution.log_ws_t,
            self.distribution.log_p_t,
        ), feeds)

        logger.record_tabular('gmm-mus-mean', np.mean(mus))
        logger.record_tabular('gmm-mus-min', np.min(mus))
        logger.record_tabular('gmm-mus-max', np.max(mus))
        logger.record_tabular('gmm-mus-std', np.std(mus))
        logger.record_tabular('gmm-log-w-mean', np.mean(log_ws))
        logger.record_tabular('gmm-log-w-min', np.min(log_ws))
        logger.record_tabular('gmm-log-w-max', np.max(log_ws))
        logger.record_tabular('gmm-log-w-std', np.std(log_ws))
        logger.record_tabular('gmm-log-sigs-mean', np.mean(log_sigs))
        logger.record_tabular('gmm-log-sigs-min', np.min(log_sigs))
        logger.record_tabular('gmm-log-sigs-max', np.max(log_sigs))
        logger.record_tabular('gmm-log-sigs-std', np.std(log_sigs))
        logger.record_tabular('log_pi_mean', np.mean(log_pis))
        logger.record_tabular('log_pi_max', np.max(log_pis))
        logger.record_tabular('log_pi_min', np.min(log_pis))
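
A minimal usage sketch of this diagnostic hook, assuming an rllab-style logger whose dump_tabular call flushes the recorded keys; the policy, pool, and iteration names at this call site are assumptions, not part of the snippet:

    # Hypothetical call site inside a training loop.
    batch = pool.random_batch(256)              # replay sample containing an 'observations' key
    policy.log_diagnostics(iteration, batch)    # records the gmm-* statistics shown above
    logger.dump_tabular()                       # write them out via the rllab-style logger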
Example #2
    def _eval(self, observations, actions):
        feeds = {
            self._observations_ph: observations,
            self._actions_ph: actions
        }

        return tf_utils.get_default_session().run(self._output, feeds)
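
A short sketch of how this helper might be called, assuming the surrounding class is a TF1-style Q-function whose placeholders accept NumPy batches; the qf instance and the array shapes below are assumptions:

    import numpy as np

    observations = np.zeros((32, 17), dtype=np.float32)   # hypothetical observation batch
    actions = np.zeros((32, 6), dtype=np.float32)         # hypothetical action batch
    q_values = qf._eval(observations, actions)            # runs self._output under the default session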
Example #3
    def __init__(
            self,
            sampler,
            n_epochs=1000,
            n_train_repeat=1,
            n_initial_exploration_steps=10000,
            epoch_length=1000,
            eval_n_episodes=10,
            eval_deterministic=True,
            eval_render=False,
            control_interval=1
    ):
        """
        Args:
            n_epochs (`int`): Number of epochs to run the training for.
            n_train_repeat (`int`): Number of times to repeat the training
                for a single time step.
            n_initial_exploration_steps (`int`): Number of initial steps to
                take using actions drawn from a separate exploration policy.
            epoch_length (`int`): Epoch length.
            eval_n_episodes (`int`): Number of rollouts to evaluate.
            eval_deterministic (`bool`): Whether or not to run the policy in
                deterministic mode when evaluating it.
            eval_render (`bool`): Whether or not to render the evaluation
                environment.
        """
        self.sampler = sampler

        self._n_epochs = int(n_epochs)
        self._n_train_repeat = n_train_repeat
        self._epoch_length = epoch_length
        self._n_initial_exploration_steps = n_initial_exploration_steps
        self._control_interval = control_interval

        self._eval_n_episodes = eval_n_episodes
        self._eval_deterministic = eval_deterministic
        self._eval_render = eval_render

        self._sess = tf_utils.get_default_session()

        self._env = None
        self._policy = None
        self._pool = None
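
The constructor only stores these settings. A hedged sketch of the schedule they imply is shown below: roughly n_epochs * epoch_length environment steps, with n_train_repeat gradient steps per environment step once the initial exploration phase is over, and an evaluation pass after each epoch. The _do_training and _evaluate hooks and the sampler.sample call are assumptions, not the class's actual API:

    def _train_sketch(self):
        total_steps = 0
        for epoch in range(self._n_epochs):
            for _ in range(self._epoch_length):
                self.sampler.sample()                      # collect one transition (assumed sampler API)
                total_steps += 1
                if total_steps < self._n_initial_exploration_steps:
                    continue                               # still in the exploration-policy phase
                for _ in range(self._n_train_repeat):
                    self._do_training(total_steps)         # hypothetical per-step gradient update
            if self._eval_n_episodes > 0:
                self._evaluate(epoch)                      # hypothetical evaluation hook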
Example #4
    def _eval(self, inputs):
        feeds = {pl: val for pl, val in zip(self._inputs, inputs)}

        return tf_utils.get_default_session().run(self._output, feeds)
Example #5
    def __init__(self,
                 base_kwargs,
                 agent_id,
                 env,
                 pool,
                 qf,
                 target_qf,
                 policy,
                 target_policy,
                 opponent_policy=None,
                 plotter=None,
                 policy_lr=1E-3,
                 qf_lr=1E-3,
                 joint=False,
                 opponent_modelling=False,
                 td_target_update_interval=1,
                 discount=0.99,
                 tau=0.01,
                 reward_scale=1,
                 use_saved_qf=False,
                 use_saved_policy=False,
                 save_full_state=False,
                 train_qf=True,
                 train_policy=True,
                 joint_policy=False,
                 SGA=False):
        super(MADDPG, self).__init__(**base_kwargs)

        self._env = env
        self._pool = pool
        self.qf = qf
        self.target_qf = target_qf
        # self.target_qf._name = 'target_' + self.target_qf._name
        self._policy = policy
        self._target_policy = target_policy
        self.opponent_policy = opponent_policy
        # self._target_policy._name = 'target_' + self._target_policy._name
        self.plotter = plotter

        self._agent_id = agent_id
        self.joint = joint
        self.opponent_modelling = opponent_modelling

        self._qf_lr = qf_lr
        self._policy_lr = policy_lr
        self._discount = discount
        self._tau = tau
        self._reward_scale = reward_scale
        self.SGA = SGA

        self._qf_target_update_interval = td_target_update_interval

        self._save_full_state = save_full_state
        self._train_qf = train_qf
        self._train_policy = train_policy
        self.joint_policy = joint_policy

        self._observation_dim = self.env.observation_spaces[
            self._agent_id].flat_dim
        self._opponent_observation_dim = self.env.observation_spaces.opponent_flat_dim(
            self._agent_id)
        self._action_dim = self.env.action_spaces[self._agent_id].flat_dim
        self._opponent_action_dim = self.env.action_spaces.opponent_flat_dim(
            self._agent_id)

        self._create_placeholders()

        self._training_ops = []
        self._target_ops = []

        self._create_q_update()
        self._create_p_update()
        if self.opponent_modelling:
            self._create_opponent_p_update()
        self._create_target_ops()

        if use_saved_qf:
            saved_qf_params = qf.get_param_values()
        if use_saved_policy:
            saved_policy_params = policy.get_param_values()

        self._sess = tf_utils.get_default_session()
        self._sess.run(tf.global_variables_initializer())

        if use_saved_qf:
            self.qf.set_param_values(saved_qf_params)
        if use_saved_policy:
            self.policy.set_param_values(saved_policy_params)
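
_create_target_ops is not shown in this snippet, but given tau and the td_target_update_interval argument, the target networks are presumably refreshed with a Polyak average every few TD updates. A hedged TF1-style sketch of such an op; the get_params_internal accessor is an assumption borrowed from rllab-style parameterized objects:

    def _create_target_ops_sketch(self):
        source_params = self.qf.get_params_internal()       # assumed rllab-style parameter accessor
        target_params = self.target_qf.get_params_internal()
        # Polyak averaging: target <- tau * source + (1 - tau) * target.
        self._target_ops = [
            tf.assign(target, self._tau * source + (1.0 - self._tau) * target)
            for target, source in zip(target_params, source_params)
        ]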
Example #6
    def __init__(self,
                 base_kwargs,
                 agent_id,
                 env,
                 pool,
                 qf,
                 target_qf,
                 policy,
                 plotter=None,
                 policy_lr=1E-3,
                 qf_lr=1E-3,
                 tau=0.01,
                 value_n_particles=16,
                 td_target_update_interval=1,
                 kernel_fn=adaptive_isotropic_gaussian_kernel,
                 kernel_n_particles=16,
                 kernel_update_ratio=0.5,
                 discount=0.99,
                 reward_scale=.1,
                 joint=False,
                 use_saved_qf=False,
                 use_saved_policy=False,
                 save_full_state=False,
                 train_qf=True,
                 train_policy=True,
                 joint_policy=True,
                 opponent_action_range=None,
                 opponent_action_range_normalize=True):
        super(MASQL, self).__init__(**base_kwargs)

        self._env = env
        self._pool = pool
        self.qf = qf
        self.target_qf = target_qf
        self._policy = policy
        self.plotter = plotter

        self.agent_id = agent_id

        self._qf_lr = qf_lr
        self._policy_lr = policy_lr
        self._tau = tau
        self._discount = discount
        self._reward_scale = reward_scale
        self.joint_policy = joint_policy
        self.opponent_action_range = opponent_action_range
        self.opponent_action_range_normalize = opponent_action_range_normalize

        self.joint = joint
        self._value_n_particles = value_n_particles
        self._qf_target_update_interval = td_target_update_interval

        self._kernel_fn = kernel_fn
        self._kernel_n_particles = kernel_n_particles
        self._kernel_update_ratio = kernel_update_ratio

        self._save_full_state = save_full_state
        self._train_qf = train_qf
        self._train_policy = train_policy

        self._observation_dim = self.env.observation_spaces[
            self.agent_id].flat_dim
        self._action_dim = self.env.action_spaces[self.agent_id].flat_dim
        # just for the two-agent case
        self._opponent_action_dim = self.env.action_spaces.opponent_flat_dim(
            self.agent_id)

        self._create_placeholders()

        self._training_ops = []
        self._target_ops = []

        self._create_td_update()
        self._create_svgd_update()
        self._create_target_ops()

        if use_saved_qf:
            saved_qf_params = qf.get_param_values()
        if use_saved_policy:
            saved_policy_params = policy.get_param_values()

        self._sess = tf_utils.get_default_session()
        self._sess.run(tf.global_variables_initializer())

        if use_saved_qf:
            self.qf.set_param_values(saved_qf_params)
        if use_saved_policy:
            self.policy.set_param_values(saved_policy_params)
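
In soft-Q-learning-style SVGD updates, kernel_n_particles and kernel_update_ratio typically split the sampled actions into a fixed set that defines the kernel and an updated set that receives the gradient. A hedged sketch of that split with the defaults above; the xs/ys keyword interface of kernel_fn is an assumption:

    # With kernel_n_particles=16 and kernel_update_ratio=0.5, 8 particles are updated and 8 stay fixed.
    n_updated_actions = int(self._kernel_n_particles * self._kernel_update_ratio)
    n_fixed_actions = self._kernel_n_particles - n_updated_actions
    # kernel = self._kernel_fn(xs=fixed_actions, ys=updated_actions)   # assumed kernel interface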