Code example #1
def get_pr2_agent(env,
                  agent_id,
                  hidden_layer_sizes,
                  max_replay_buffer_size,
                  policy_type="deter"):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    print(opponent_action_shape, "opponent_action_shape")
    if policy_type == "dete":
        policy_fn = DeterministicMLPPolicy
        exploration_strategy = OUExploration(action_space)
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
        exploration_strategy = None
    return PR2Agent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        ind_qf=MLPValueFunction(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="ind_qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        exploration_strategy=exploration_strategy,
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
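For context, here is a hedged sketch of how a factory like this might be wired into a two-agent experiment. The environment constructor below is a hypothetical placeholder (any object exposing env.env_specs with per-agent observation and action spaces would do), and the hyperparameter values are illustrative only, with the (128, 128) hidden sizes mirroring code example #2.

# Sketch only: build one PR2 agent per agent id in a two-agent environment.
# make_two_agent_env() is a hypothetical helper, not part of malib.
env = make_two_agent_env()

agents = [
    get_pr2_agent(
        env,
        agent_id=agent_id,
        hidden_layer_sizes=(128, 128),      # illustrative sizes
        max_replay_buffer_size=100000,      # illustrative capacity
        policy_type="dete",
    )
    for agent_id in range(2)
]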
Code example #2
File: test_value_function.py  Project: kiminh/malib
    def setUp(self):
        self.env = gym.envs.make('MountainCarContinuous-v0')
        self.hidden_layer_sizes = (128, 128)
        self.Q = MLPValueFunction(
            input_shapes=(self.env.observation_space.shape,
                          self.env.action_space.shape),
            output_shape=(1,),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='Q'
        )
        self.V = MLPValueFunction(
            input_shapes=(self.env.observation_space.shape,),
            output_shape=(1,),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='V'
        )
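As a complement, here is a hypothetical extra test (not part of the original file) that exercises the Q and V networks built in setUp. It assumes numpy is imported as np in the test module and reuses the conditions-list get_values_np API demonstrated in code example #6; the expected (2, 1) shapes follow from output_shape=(1,).

    def test_batch_evaluation(self):
        # Hypothetical companion test: evaluate Q and V on a batch of two
        # transitions, mirroring the usage in code example #6.
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)
        actions_np = np.stack((self.env.action_space.sample(),
                               self.env.action_space.sample()))

        q_values_np = self.Q.get_values_np([observations_np, actions_np])
        v_values_np = self.V.get_values_np([observations_np])

        self.assertEqual(q_values_np.shape, (2, 1))
        self.assertEqual(v_values_np.shape, (2, 1))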
Code example #3
def get_ddpgtom_agent(env, agent_id, hidden_layer_sizes,
                      max_replay_buffer_size):
    """Build a DDPG-ToM agent: the policy conditions on the predicted opponent
    action, the opponent model maps observations to opponent actions, and the
    Q-function takes the flattened joint action."""
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    return DDPGToMAgent(
        env_specs=env.env_specs,
        policy=DeterministicMLPPolicy(input_shapes=(observation_space.shape, (
            env.env_specs.action_space.opponent_flat_dim(agent_id), )),
                                      output_shape=action_space.shape,
                                      hidden_layer_sizes=hidden_layer_sizes,
                                      name='policy_agent_{}'.format(agent_id)),
        qf=MLPValueFunction(
            input_shapes=(observation_space.shape,
                          (env.env_specs.action_space.flat_dim, )),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='qf_agent_{}'.format(agent_id)),
        opponent_policy=DeterministicMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=(
                env.env_specs.action_space.opponent_flat_dim(agent_id), ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='opponent_policy_agent_{}'.format(agent_id)),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            opponent_action_dim=env.env_specs.action_space.opponent_flat_dim(
                agent_id),
            max_replay_buffer_size=max_replay_buffer_size),
        exploration_strategy=OUExploration(action_space),
        gradient_clipping=10.,
        agent_id=agent_id,
    )
Code example #4
def get_ddpg_agent(env,
                   agent_id,
                   hidden_layer_sizes,
                   max_replay_buffer_size,
                   policy_type='dete'):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    if policy_type == 'dete':
        policy_fn = DeterministicMLPPolicy
        exploration_strategy = OUExploration(action_space)
    elif policy_type == 'gumble':
        policy_fn = RelaxedSoftmaxMLPPolicy
        exploration_strategy = None
    return DDPGAgent(
        env_specs=env.env_specs,
        policy=policy_fn(input_shapes=(observation_space.shape, ),
                         output_shape=action_space.shape,
                         hidden_layer_sizes=hidden_layer_sizes,
                         name='policy_agent_{}'.format(agent_id)),
        qf=MLPValueFunction(input_shapes=(observation_space.shape,
                                          action_space.shape),
                            output_shape=(1, ),
                            hidden_layer_sizes=hidden_layer_sizes,
                            name='qf_agent_{}'.format(agent_id)),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size),
        exploration_strategy=exploration_strategy,
        gradient_clipping=10.,
        agent_id=agent_id,
    )
Code example #5
def get_rommeo_agent(
    env,
    agent_id,
    hidden_layer_sizes,
    max_replay_buffer_size,
    policy_type="gaussian",
    uniform=False,
    custom_b=False,
    bi=1.0,
    bj=1.0,
):
    """Build a ROMMEO agent: an opponent-conditioned policy, an opponent model,
    a joint Q-function over (observation, action, opponent action), and an
    indexed replay buffer that also stores opponent actions."""
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    return ROMMEOAgent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, opponent_action_shape),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
            repara=True,
            # smoothing_coefficient=0.5
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
            repara=True,
        ),
        gradient_clipping=10,
        agent_id=agent_id,
        name="ROMMEO_{}".format(agent_id),
        uniform=uniform,
        custom_b=custom_b,
        bi=bi,
        bj=bj,
    )
Code example #6
    def test_multi_output(self):
        Q5 = MLPValueFunction(
            input_shapes=(
                self.env.observation_space.shape,
                self.env.action_space.shape,
            ),
            output_shape=(5, ),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name="Q5",
        )
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        action1_np = self.env.action_space.sample()
        action2_np = self.env.action_space.sample()
        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)

        actions_np = np.stack((action1_np, action2_np))

        conditions = [observations_np, actions_np]

        q_values_np = Q5.get_values_np(conditions)
        q_values = Q5.get_values(conditions)

        self.assertEqual(q_values_np.shape, (2, 5))
        self.assertEqual(q_values.shape, (2, 5))
Code example #7
def get_pr2k_soft_agent(env,
                        agent_id,
                        hidden_layer_sizes,
                        max_replay_buffer_size,
                        k=2,
                        mu=0):
    """Build a PR2K soft agent: a main Gaussian policy conditioned on opponent
    actions, an opponent model, prior policies for both, a joint Q-function,
    and an indexed replay buffer; k and mu are passed through to PR2KSoftAgent."""
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    print(opponent_action_shape, "opponent_action_shape")
    return PR2KSoftAgent(
        env_specs=env.env_specs,
        main_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, opponent_action_shape),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        opponent_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="prior_policy_agent_{}".format(agent_id),
        ),
        opponent_prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_prior_policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        k=k,
        mu=mu,
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
Code example #8
def get_sac_agent(env,
                  hidden_layer_sizes,
                  max_replay_buffer_size,
                  policy_type="gaussian"):
    """
    Build a SAC agent for single-agent (single-player) learning: a stochastic
    policy, two Q-functions, a state value function, and a replay buffer.
    """
    observation_space = env.env_specs.observation_space[0]
    action_space = env.env_specs.action_space[0]
    env_specs = env.env_specs
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    # print('observation_space.shape', observation_space.shape)
    return SACAgent(
        env_specs=env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="{}_policy".format(policy_type),
        ),
        qfs=[
            MLPValueFunction(
                input_shapes=(observation_space.shape, action_space.shape),
                output_shape=(1, ),
                hidden_layer_sizes=hidden_layer_sizes,
                name="qf_{}".format(qf_id),
            ) for qf_id in range(2)
        ],
        vf=MLPValueFunction(
            input_shapes=(observation_space.shape, ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="vf",
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
        ),
    )
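For completeness, a similarly hedged sketch of calling the single-agent factory. The environment constructor is again a hypothetical placeholder for whatever supplies an env whose env_specs expose the single agent's spaces at index 0, and the hyperparameter values are illustrative.

# Sketch only: build a single-agent SAC learner.
# make_single_agent_env() is a hypothetical helper, not part of malib.
env = make_single_agent_env()

sac_agent = get_sac_agent(
    env,
    hidden_layer_sizes=(128, 128),      # illustrative sizes
    max_replay_buffer_size=100000,      # illustrative capacity
    policy_type="gaussian",
)
# The returned SACAgent bundles the Gaussian policy, the two Q-functions,
# the state value function, and the replay buffer constructed above.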
Code example #9
def get_pr2_soft_agent(env,
                       agent_id,
                       hidden_layer_sizes,
                       max_replay_buffer_size,
                       policy_type="gaussian"):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    return PR2SoftAgent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        gradient_clipping=10.0,
        agent_id=agent_id,
    )