Example 1
def get_ddpgtom_agent(env, agent_id, hidden_layer_sizes,
                      max_replay_buffer_size):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    return DDPGToMAgent(
        env_specs=env.env_specs,
        policy=DeterministicMLPPolicy(input_shapes=(observation_space.shape, (
            env.env_specs.action_space.opponent_flat_dim(agent_id), )),
                                      output_shape=action_space.shape,
                                      hidden_layer_sizes=hidden_layer_sizes,
                                      name='policy_agent_{}'.format(agent_id)),
        qf=MLPValueFunction(
            input_shapes=(observation_space.shape,
                          (env.env_specs.action_space.flat_dim, )),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='qf_agent_{}'.format(agent_id)),
        opponent_policy=DeterministicMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=(
                env.env_specs.action_space.opponent_flat_dim(agent_id), ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='opponent_policy_agent_{}'.format(agent_id)),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            opponent_action_dim=env.env_specs.action_space.opponent_flat_dim(
                agent_id),
            max_replay_buffer_size=max_replay_buffer_size),
        exploration_strategy=OUExploration(action_space),
        gradient_clipping=10.,
        agent_id=agent_id,
    )
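A minimal usage sketch for the factory above, assuming a malib-style environment whose env_specs exposes per-agent observation/action spaces and an agent_num count (both already used by the examples in this section); make_env, the environment name, and the hyperparameter values are placeholders, not part of the original code.

# Hypothetical usage: one DDPGToM agent per player.
env = make_env("two_player_matrix_game")  # placeholder constructor
agents = [
    get_ddpgtom_agent(
        env,
        agent_id=i,
        hidden_layer_sizes=(100, 100),
        max_replay_buffer_size=int(1e5),
    )
    for i in range(env.env_specs.agent_num)
]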
Example 2
def get_rommeo_agent(
    env,
    agent_id,
    hidden_layer_sizes,
    max_replay_buffer_size,
    policy_type="gaussian",
    uniform=False,
    custom_b=False,
    bi=1.0,
    bj=1.0,
):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    return ROMMEOAgent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, opponent_action_shape),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
            repara=True,
            # smoothing_coefficient=0.5
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
            repara=True,
        ),
        gradient_clipping=10,
        agent_id=agent_id,
        name="ROMMEO_{}".format(agent_id),
        uniform=uniform,
        custom_b=custom_b,
        bi=bi,
        bj=bj,
    )
Example 3
def get_ddpg_agent(env,
                   agent_id,
                   hidden_layer_sizes,
                   max_replay_buffer_size,
                   policy_type='dete'):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    if policy_type == 'dete':
        policy_fn = DeterministicMLPPolicy
        exploration_strategy = OUExploration(action_space)
    elif policy_type == 'gumble':
        policy_fn = RelaxedSoftmaxMLPPolicy
        exploration_strategy = None
    return DDPGAgent(
        env_specs=env.env_specs,
        policy=policy_fn(input_shapes=(observation_space.shape, ),
                         output_shape=action_space.shape,
                         hidden_layer_sizes=hidden_layer_sizes,
                         name='policy_agent_{}'.format(agent_id)),
        qf=MLPValueFunction(input_shapes=(observation_space.shape,
                                          action_space.shape),
                            output_shape=(1, ),
                            hidden_layer_sizes=hidden_layer_sizes,
                            name='qf_agent_{}'.format(agent_id)),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size),
        exploration_strategy=exploration_strategy,
        gradient_clipping=10.,
        agent_id=agent_id,
    )
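The policy_type string dispatch above recurs in several of the factories in this section; a minimal sketch of how it could be centralized, assuming only the policy classes already referenced by these examples (resolve_policy_fn and _POLICY_FNS are hypothetical names, not part of the original code):

# Hypothetical helper: map policy_type strings to policy classes so each
# factory does not repeat the same if/elif block.
_POLICY_FNS = {
    "dete": DeterministicMLPPolicy,
    "deter": DeterministicMLPPolicy,
    "gaussian": GaussianMLPPolicy,
    "gumble": RelaxedSoftmaxMLPPolicy,  # spelling kept as in the factories
}

def resolve_policy_fn(policy_type):
    try:
        return _POLICY_FNS[policy_type]
    except KeyError:
        raise ValueError("unknown policy_type: {}".format(policy_type))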
Example 4
def get_pr2k_soft_agent(env,
                        agent_id,
                        hidden_layer_sizes,
                        max_replay_buffer_size,
                        k=2,
                        mu=0):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    print(opponent_action_shape, "opponent_action_shape")
    return PR2KSoftAgent(
        env_specs=env.env_specs,
        main_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, opponent_action_shape),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        opponent_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="prior_policy_agent_{}".format(agent_id),
        ),
        opponent_prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_prior_policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        k=k,
        mu=mu,
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
Example 5
def get_pr2_agent(env,
                  agent_id,
                  hidden_layer_sizes,
                  max_replay_buffer_size,
                  policy_type="deter"):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    print(opponent_action_shape, "opponent_action_shape")
    if policy_type == "deter":
        policy_fn = DeterministicMLPPolicy
        exploration_strategy = OUExploration(action_space)
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
        exploration_strategy = None
    return PR2Agent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        ind_qf=MLPValueFunction(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="ind_qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        exploration_strategy=exploration_strategy,
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
Example 6
def get_pr2_soft_agent(env,
                       agent_id,
                       hidden_layer_sizes,
                       max_replay_buffer_size,
                       policy_type="gaussian"):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    return PR2SoftAgent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
Example 7
def get_sac_agent(env,
                  hidden_layer_sizes,
                  max_replay_buffer_size,
                  policy_type="gaussian"):
    """
    SAC agent for single player learning.
    """
    observation_space = env.env_specs.observation_space[0]
    action_space = env.env_specs.action_space[0]
    env_specs = env.env_specs
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    # print('observation_space.shape', observation_space.shape)
    return SACAgent(
        env_specs=env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="{}_policy".format(policy_type),
        ),
        qfs=[
            MLPValueFunction(
                input_shapes=(observation_space.shape, action_space.shape),
                output_shape=(1, ),
                hidden_layer_sizes=hidden_layer_sizes,
                name="qf_{}".format(qf_id),
            ) for qf_id in range(2)
        ],
        vf=MLPValueFunction(
            input_shapes=(observation_space.shape, ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="vf",
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
        ),
    )
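A usage sketch for the single-player SAC factory above, under the same placeholder assumptions as the earlier sketch (hypothetical make_env and hyperparameter values); note that no agent_id is needed here:

# Hypothetical usage of the single-player SAC factory.
env = make_env("single_player_task")  # placeholder constructor
sac_agent = get_sac_agent(
    env,
    hidden_layer_sizes=(256, 256),
    max_replay_buffer_size=int(1e6),
    policy_type="gaussian",
)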
Example 8
def get_commnet_agent(env,
                      agent_id,
                      hidden_layer_sizes,
                      max_replay_buffer_size,
                      policy_type="deter"):
    observation_space = env.env_specs.observation_space[agent_id]
    n = env.env_specs.agent_num
    action_space = env.env_specs.action_space[agent_id]
    if policy_type == "deter":
        policy_fn = DeterministicMLPPolicy
        exploration_strategy = OUExploration(action_space)
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
        exploration_strategy = None
    return FullyCentralizedAgent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=((n, ) + observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        qf=CommNetValueFunction(
            input_shapes=((n, ) + observation_space.shape,
                          (n, ) + action_space.shape),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=n * observation_space.shape[0],
            action_dim=n * action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            reward_dim=n,
            terminal_dim=n,
        ),
        exploration_strategy=exploration_strategy,
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
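A note on the buffer sizing above: because FullyCentralizedAgent consumes the joint observation and action of all agents, the replay buffer stores flattened vectors of size n * obs_dim and n * act_dim, with per-agent rewards and terminals (reward_dim=n, terminal_dim=n). A small illustrative check, assuming an env built as in the earlier sketches:

# Shape bookkeeping implied by the centralized factory above.
n = env.env_specs.agent_num
obs_dim = env.env_specs.observation_space[0].shape[0]
act_dim = env.env_specs.action_space[0].shape[0]
joint_obs_dim = n * obs_dim  # observation_dim handed to IndexedReplayBuffer
joint_act_dim = n * act_dim  # action_dim handed to IndexedReplayBuffer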