Ejemplo n.º 1
0
def experiment(variant):
    num_agent = variant['num_agent']
    from differential_game import DifferentialGame
    expl_env = DifferentialGame(game_name=args.exp_name)
    eval_env = DifferentialGame(game_name=args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], []
    qf2_n, target_qf2_n = [], []
    for i in range(num_agent):
        from rlkit.torch.networks import FlattenMlp
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2,
        )
        target_qf = copy.deepcopy(qf)
        from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
        policy = TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * 2,
        )
        target_policy = copy.deepcopy(policy)
        eval_policy = policy
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            from rlkit.exploration_strategies.ou_strategy import OUStrategy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=OUStrategy(
                    action_space=expl_env.action_space),
                policy=policy,
            )

        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)
        if variant['trainer_kwargs']['double_q']:
            qf2 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2,
            )
            target_qf2 = copy.deepcopy(qf2)
            qf2_n.append(qf2)
            target_qf2_n.append(target_qf2)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    trainer = MADDPGTrainer(qf_n=qf_n,
                            target_qf_n=target_qf_n,
                            policy_n=policy_n,
                            target_policy_n=target_policy_n,
                            qf2_n=qf2_n,
                            target_qf2_n=target_qf2_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 2
0
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_obs = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    obs_gnn_1 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )

    graph_builder_eval = FullGraphBuilder(
        input_node_dim=graph_builder_obs.output_node_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    if variant['concat_emb']:
        gnn_out_dim = int(obs_dim + variant['graph_kwargs']['node_dim'] *
                          variant['graph_kwargs']['num_conv_layers'])
    else:
        gnn_out_dim = variant['graph_kwargs']['node_dim']
    from rlkit.torch.networks.networks import FlattenMlp
    post_mlp1 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    from rlkit.torch.networks.graph_r2g_qnet2 import R2GQNet
    qf1 = R2GQNet(
        obs_gnn=obs_gnn_1,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp1,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf1 = copy.deepcopy(qf1)

    obs_gnn_2 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    post_mlp2 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    qf2 = R2GQNet(
        obs_gnn=obs_gnn_2,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp2,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf2 = copy.deepcopy(qf2)

    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    cgca = GNNNet(
        graph_builder_ca,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
            (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ), nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    graph_builder_policy = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env,
                                             eval_policy_n,
                                             shared_encoder=obs_gnn_1)
    expl_path_collector = MAMdpPathCollector(expl_env,
                                             expl_policy_n,
                                             shared_encoder=obs_gnn_1)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn12 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cactor=cactor,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
Ejemplo n.º 3
0
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    gnn1 = GNNNet(
        graph_builder_1,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = nn.Sequential(
        gnn1,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf1 = copy.deepcopy(qf1)

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    gnn2 = GNNNet(
        graph_builder_2,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    qf2 = nn.Sequential(
        gnn2,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf2 = copy.deepcopy(qf2)

    policy_n, eval_policy_n, expl_policy_n = [], [], []
    for i in range(num_agent):
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.masac.masac_gnn import MASACGNNTrainer
    trainer = MASACGNNTrainer(env=expl_env,
                              qf1=qf1,
                              target_qf1=target_qf1,
                              qf2=qf2,
                              target_qf2=target_qf2,
                              policy_n=policy_n,
                              **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 4
0
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(make_env(args.exp_name,discrete_action_space=False,world_args=variant['world_args']))
    eval_env = ParticleEnv(make_env(args.exp_name,discrete_action_space=False,world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
                input_node_dim=obs_dim+action_dim,
                num_node=num_agent,
                batch_size=variant['algorithm_kwargs']['batch_size'],
                contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(
            graph_builder_1,
            obs_dim,
            action_dim,
            output_activation='lrelu0.2',
            **variant['graph_kwargs']
        )
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(input_size=variant['graph_kwargs']['node_dim']+action_dim,
                    output_size=1,
                    hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*(variant['qf_kwargs']['num_layer']-1),
                    hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                    )
    target_qf1 = copy.deepcopy(qf1)

    graph_builder_2 = FullGraphBuilder(
                input_node_dim=obs_dim+action_dim,
                num_node=num_agent,
                batch_size=variant['algorithm_kwargs']['batch_size'],
                contain_self_loop=False)
    cg2 = GraphContextNet(
            graph_builder_2,
            obs_dim,
            action_dim,
            output_activation='lrelu0.2',
            **variant['graph_kwargs']
        )
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(input_size=variant['graph_kwargs']['node_dim']+action_dim,
                    output_size=1,
                    hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*(variant['qf_kwargs']['num_layer']-1),
                    hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                    )
    target_qf2 = copy.deepcopy(qf2)


    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(input_size=obs_dim,
                        output_size=variant['policy_kwargs']['hidden_dim'],
                        hidden_sizes=[variant['policy_kwargs']['hidden_dim']]*(variant['policy_kwargs']['num_layer']-1),
                        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                        output_activation=nn.LeakyReLU(negative_slope=0.2),
                        ),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim),
                                nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)])
            )
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                                    exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                                    policy=policy,
                                )
        else:
            expl_policy = policy
        
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)
        
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.masac.masac_gnn_gcontext import MASACGNNTrainer
    trainer = MASACGNNTrainer(
        env=expl_env,
        cg1=cg1,
        target_cg1=target_cg1,
        qf1=qf1,
        target_qf1=target_qf1,
        cg2=cg2,
        target_cg2=target_cg2,
        qf2=qf2,
        target_qf2=target_qf2,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
Ejemplo n.º 5
0
def experiment(variant):
    num_agent = variant['num_agent']
    from differential_game import DifferentialGame
    expl_env = DifferentialGame(game_name=args.exp_name)
    eval_env = DifferentialGame(game_name=args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(input_node_dim=obs_dim + action_dim,
                                       num_node=num_agent,
                                       contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(
        graph_builder_1,
        obs_dim,
        action_dim,
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )
    target_cg1 = copy.deepcopy(cg1)
    qf1 = nn.Sequential(
        nn.Linear(variant['graph_kwargs']['hidden_dim'] + action_dim,
                  variant['qf_kwargs']['hidden_dim']), nn.ReLU(),
        nn.Linear(variant['qf_kwargs']['hidden_dim'], 1))
    target_qf1 = copy.deepcopy(qf1)

    graph_builder_2 = FullGraphBuilder(input_node_dim=obs_dim + action_dim,
                                       num_node=num_agent,
                                       contain_self_loop=False)
    cg2 = GraphContextNet(
        graph_builder_2,
        obs_dim,
        action_dim,
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )
    target_cg2 = copy.deepcopy(cg2)
    qf2 = nn.Sequential(
        nn.Linear(variant['graph_kwargs']['hidden_dim'] + action_dim,
                  variant['qf_kwargs']['hidden_dim']), nn.ReLU(),
        nn.Linear(variant['qf_kwargs']['hidden_dim'], 1))
    target_qf2 = copy.deepcopy(qf2)

    graph_builder_ca = FullGraphBuilder(input_node_dim=obs_dim + action_dim,
                                        num_node=num_agent,
                                        contain_self_loop=False)
    cgca = GraphContextNet(
        graph_builder_ca,
        obs_dim,
        action_dim,
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        nn.Linear(variant['graph_kwargs']['hidden_dim'],
                  variant['cactor_kwargs']['hidden_dim']), nn.ReLU(),
        SplitLayer(layers=[
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            nn.Linear(obs_dim, variant['policy_kwargs']['hidden_dim']),
            nn.ReLU(),
            nn.Linear(variant['policy_kwargs']['hidden_dim'],
                      variant['policy_kwargs']['hidden_dim']), nn.ReLU(),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn2 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            cg1=cg1,
                            target_cg1=target_cg1,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            cg2=cg2,
                            target_cg2=target_cg2,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cactor=cactor,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 6
0
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, vf_n = [], []
    policy_optimizer_n, vf_optimizer_n = None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(input_size=obs_dim,
                        output_size=variant['policy_kwargs']['hidden_dim'],
                        hidden_sizes=[variant['policy_kwargs']['hidden_dim']]*(variant['policy_kwargs']['num_layer']-1),
                        ),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim),
                                nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)])
            )
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        
        vf = FlattenMlp(
            input_size=obs_dim,
            output_size=1,
            hidden_sizes=[variant['vf_kwargs']['hidden_dim']]*variant['vf_kwargs']['num_layer'],
        )

        policy_n.append(policy)
        vf_n.append(vf)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    
    if variant['random_exploration']:
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
        expl_policy_n = [PolicyWrappedWithExplorationStrategy(
                                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                                policy=policy,
                            ) for i in range(num_agent)]
    else:
        expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.irl.irl_ppo import IRLPPOTrainer
    trainer = IRLPPOTrainer(
        env = expl_env,
        policy_n=policy_n,
        vf_n=vf_n,
        policy_optimizer_n=policy_optimizer_n,
        vf_optimizer_n=vf_optimizer_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchOnlineRLAlgorithm
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 7
0
def experiment(variant):
    import sys
    sys.path.append("./particle-graph-envs")
    from make_env import make_env
    expl_env = make_env(args.exp_name,
                        discrete_action_space=False,
                        world_args=variant['world_args'])
    eval_env = make_env(args.exp_name,
                        discrete_action_space=False,
                        world_args=variant['world_args'])
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from particle_graph import ParticleGraphBuilder
    graph_builder_1 = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    from rlkit.torch.networks.gnn_networks import GNNNet
    gnn1 = GNNNet(
        graph_builder_1,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SelectLayer
    qf1 = nn.Sequential(
        gnn1, SelectLayer(dim=1, index=torch.arange(num_agent)),
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf1 = copy.deepcopy(qf1)

    graph_builder_2 = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    gnn2 = GNNNet(
        graph_builder_2,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    qf2 = nn.Sequential(
        gnn2, SelectLayer(dim=1, index=torch.arange(num_agent)),
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf2 = copy.deepcopy(qf2)

    policy_n, eval_policy_n, expl_policy_n = [], [], []
    for agent in range(num_agent):
        graph_builder_policy = ParticleGraphBuilder(
            num_agents=expl_env.scenario.num_agents,
            num_landmarks=expl_env.scenario.num_landmarks,
            batch_size=variant['algorithm_kwargs']['batch_size'],
            append_action=False,
            contain_self_loop=False,
        )
        gnn_policy = GNNNet(
            graph_builder_policy,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        )
        from rlkit.torch.networks.layers import SplitLayer, FlattenLayer
        policy = nn.Sequential(
            gnn_policy, SelectLayer(dim=1, index=agent), FlattenLayer(),
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env,
                                             eval_policy_n,
                                             shared_obs=True)
    expl_path_collector = MAMdpPathCollector(expl_env,
                                             expl_policy_n,
                                             shared_obs=True)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent,
                                      shared_obs=True)

    from rlkit.torch.masac.masac_gnn import MASACGNNTrainer
    trainer = MASACGNNTrainer(env=expl_env,
                              qf1=qf1,
                              target_qf1=target_qf1,
                              qf2=qf2,
                              target_qf2=target_qf2,
                              policy_n=policy_n,
                              shared_obs=True,
                              **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 8
0
def experiment(variant):
    num_agent = variant['num_agent']
    from sequential_differential_game import SequentialDifferentialGame
    expl_env = SequentialDifferentialGame(**variant['env_kwargs'])
    eval_env = SequentialDifferentialGame(**variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, eval_policy_n, expl_policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = \
        [], [], [], [], [], [], []
    for i in range(num_agent):
        from rlkit.torch.layers import SplitLayer, ReshapeLayer
        weight_head = nn.Linear(variant['policy_kwargs']['hidden_dim'],
                                variant['policy_kwargs']['m'])
        mean_head = nn.Sequential(
            nn.Linear(variant['policy_kwargs']['hidden_dim'],
                      action_dim * variant['policy_kwargs']['m']),
            ReshapeLayer(shape=[variant['policy_kwargs']['m'], action_dim]))
        logstd_head = nn.Sequential(
            nn.Linear(variant['policy_kwargs']['hidden_dim'],
                      action_dim * variant['policy_kwargs']['m']),
            ReshapeLayer(shape=[variant['policy_kwargs']['m'], action_dim]))
        policy = nn.Sequential(
            nn.Linear(obs_dim, variant['policy_kwargs']['hidden_dim']),
            nn.ReLU(),
            nn.Linear(variant['policy_kwargs']['hidden_dim'],
                      variant['policy_kwargs']['hidden_dim']), nn.ReLU(),
            SplitLayer(layers=[weight_head, mean_head, logstd_head]))
        from rlkit.torch.policies.mix_tanh_gaussian_policy import MixTanhGaussianPolicy
        policy = MixTanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        from rlkit.torch.networks import FlattenMlp
        qf1 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2,
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2,
        )
        target_qf2 = copy.deepcopy(qf2)
        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.masac.masac import MASACTrainer
    trainer = MASACTrainer(env=expl_env,
                           qf1_n=qf1_n,
                           target_qf1_n=target_qf1_n,
                           qf2_n=qf2_n,
                           target_qf2_n=target_qf2_n,
                           policy_n=policy_n,
                           **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 9
0
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    from rlkit.envs.ma_wrappers import MAProbDiscreteEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf1_n, qf2_n, cactor_n, policy_n, target_qf1_n, target_qf2_n, target_policy_n, expl_policy_n, eval_policy_n = \
        [], [], [], [], [], [], [], [], []
    for i in range(num_agent):
        qf1 = FlattenMlp(input_size=(obs_dim * num_agent +
                                     action_dim * num_agent),
                         output_size=1,
                         **variant['qf_kwargs'])
        qf2 = FlattenMlp(input_size=(obs_dim * num_agent +
                                     action_dim * num_agent),
                         output_size=1,
                         **variant['qf_kwargs'])
        cactor = TanhGaussianPolicy(obs_dim=(obs_dim * num_agent + action_dim *
                                             (num_agent - 1)),
                                    action_dim=action_dim,
                                    **variant['cactor_kwargs'])
        policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                    action_dim=action_dim,
                                    **variant['policy_kwargs'])
        target_qf1 = copy.deepcopy(qf1)
        target_qf2 = copy.deepcopy(qf2)
        target_policy = copy.deepcopy(policy)
        expl_policy = policy
        eval_policy = MakeDeterministic(policy)
        qf1_n.append(qf1)
        qf2_n.append(qf2)
        cactor_n.append(cactor)
        policy_n.append(policy)
        target_qf1_n.append(target_qf1)
        target_qf2_n.append(target_qf2)
        target_policy_n.append(target_policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)
    trainer = PRGTrainer(env=expl_env,
                         qf1_n=qf1_n,
                         target_qf1_n=target_qf1_n,
                         qf2_n=qf2_n,
                         target_qf2_n=target_qf2_n,
                         policy_n=policy_n,
                         target_policy_n=target_policy_n,
                         cactor_n=cactor_n,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 10
0
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, vf_n = [], []
    policy_optimizer_n, vf_optimizer_n = None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy, return_raw_action=True)

        vf = FlattenMlp(
            input_size=obs_dim,
            output_size=1,
            hidden_sizes=[variant['vf_kwargs']['hidden_dim']] *
            variant['vf_kwargs']['num_layer'],
        )

        policy_n.append(policy)
        vf_n.append(vf)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env,
                                             expl_policy_n,
                                             collect_raw_actions=True)

    from rlkit.torch.irl.irl_ppo import IRLPPOTrainer
    trainer = IRLPPOTrainer(env=expl_env,
                            policy_n=policy_n,
                            vf_n=vf_n,
                            policy_optimizer_n=policy_optimizer_n,
                            vf_optimizer_n=vf_optimizer_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchOnlineRLAlgorithm
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 11
0
def experiment(variant):
    import gym
    import robosumo.envs
    from robosumo_env_wrapper import RoboSumoEnv
    expl_env = RoboSumoEnv(
        gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
        **variant['world_args'])
    eval_env = RoboSumoEnv(
        gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
        **variant['world_args'])
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, qf_n = [], []
    policy_optimizer_n, qf_optimizer_n = None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy, return_raw_action=True)

        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            variant['qf_kwargs']['num_layer'],
        )

        policy_n.append(policy)
        qf_n.append(qf)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env,
                                             expl_policy_n,
                                             collect_raw_actions=True)

    from rlkit.torch.coma.coma import COMATrainer
    trainer = COMATrainer(env=expl_env,
                          policy_n=policy_n,
                          qf_n=qf_n,
                          policy_optimizer_n=policy_optimizer_n,
                          qf_optimizer_n=qf_optimizer_n,
                          **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchOnlineRLAlgorithm
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 12
0
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(make_env(args.exp_name,discrete_action_space=False,world_args=variant['world_args']))
    eval_env = ParticleEnv(make_env(args.exp_name,discrete_action_space=False,world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = \
        [], [], [], [], []
    log_alpha_n = None
    qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, alpha_optimizer_n = \
        None, None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(input_size=obs_dim,
                        output_size=variant['policy_kwargs']['hidden_dim'],
                        hidden_sizes=[variant['policy_kwargs']['hidden_dim']]*(variant['policy_kwargs']['num_layer']-1),
                        ),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim),
                                nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)])
            )
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        
        qf1 = FlattenMlp(
            input_size=(obs_dim+action_dim),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*variant['qf_kwargs']['num_layer'],
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim+action_dim),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*variant['qf_kwargs']['num_layer'],
        )
        target_qf2 = copy.deepcopy(qf2)

        policy_n.append(policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.irl.irl_sac import IRLSACTrainer
    trainer = IRLSACTrainer(
        env = expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        log_alpha_n=log_alpha_n,
        qf1_optimizer_n=qf1_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        alpha_optimizer_n=alpha_optimizer_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 13
0
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_epoch = variant['load_kwargs']['load_epoch']
        load_data = torch.load('{}/itr_{}.pkl'.format(load_dir, load_epoch),
                               map_location='cpu')
        qf1_n = load_data['trainer/qf1_n']
        target_qf1_n = load_data['trainer/target_qf1_n']
        qf2_n = load_data['trainer/qf2_n']
        target_qf2_n = load_data['trainer/target_qf2_n']
        cactor_n = load_data['trainer/cactor_n']
        policy_n = load_data['trainer/policy_n']
        log_alpha_n = load_data['trainer/log_alpha_n']

        qf1_optimizer_n = load_data['trainer/qf1_optimizer_n']
        qf2_optimizer_n = load_data['trainer/qf2_optimizer_n']
        policy_optimizer_n = load_data['trainer/policy_optimizer_n']
        cactor_optimizer_n = load_data['trainer/cactor_optimizer_n']
        alpha_optimizer_n = load_data['trainer/alpha_optimizer_n']
        if args.ce:
            log_calpha_n = load_data['trainer/log_calpha_n']
            calpha_optimizer_n = load_data['trainer/calpha_optimizer_n']

        replay_buffer = load_data['replay_buffer']
    else:
        qf1_n, qf2_n, cactor_n, policy_n = [], [], [], []
        target_qf1_n, target_qf2_n = [], []
        log_alpha_n, log_calpha_n = None, None
        qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, cactor_optimizer_n, alpha_optimizer_n, calpha_optimizer_n  = \
            None, None, None, None, None, None
        for i in range(num_agent):
            from rlkit.torch.networks import FlattenMlp
            qf1 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                variant['qf_kwargs']['num_layer'],
            )
            target_qf1 = copy.deepcopy(qf1)
            qf2 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                variant['qf_kwargs']['num_layer'],
            )
            target_qf2 = copy.deepcopy(qf2)
            from rlkit.torch.layers import SplitLayer
            if variant['trainer_kwargs']['dec_cactor']:
                input_size = obs_dim + action_dim * (num_agent - 1)
            else:
                input_size = obs_dim * num_agent + action_dim * (num_agent - 1)
            cactor = nn.Sequential(
                FlattenMlp(
                    input_size=input_size,
                    output_size=variant['cactor_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
                    (variant['cactor_kwargs']['num_layer'] - 1),
                ),
                SplitLayer(layers=[
                    nn.Linear(variant['cactor_kwargs']['hidden_dim'],
                              action_dim),
                    nn.Linear(variant['cactor_kwargs']['hidden_dim'],
                              action_dim)
                ]))
            from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
            cactor = TanhGaussianPolicy(module=cactor)

            policy = nn.Sequential(
                FlattenMlp(
                    input_size=obs_dim,
                    output_size=variant['policy_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                    (variant['policy_kwargs']['num_layer'] - 1),
                ),
                SplitLayer(layers=[
                    nn.Linear(variant['policy_kwargs']['hidden_dim'],
                              action_dim),
                    nn.Linear(variant['policy_kwargs']['hidden_dim'],
                              action_dim)
                ]))
            policy = TanhGaussianPolicy(module=policy)

            qf1_n.append(qf1)
            qf2_n.append(qf2)
            cactor_n.append(cactor)
            policy_n.append(policy)
            target_qf1_n.append(target_qf1)
            target_qf2_n.append(target_qf2)

            from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
            replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                              expl_env,
                                              num_agent=num_agent)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.prg.prg import PRGTrainer
    trainer = PRGTrainer(env=expl_env,
                         qf1_n=qf1_n,
                         target_qf1_n=target_qf1_n,
                         qf2_n=qf2_n,
                         target_qf2_n=target_qf2_n,
                         policy_n=policy_n,
                         cactor_n=cactor_n,
                         log_alpha_n=log_alpha_n,
                         log_calpha_n=log_calpha_n,
                         qf1_optimizer_n=qf1_optimizer_n,
                         qf2_optimizer_n=qf2_optimizer_n,
                         policy_optimizer_n=policy_optimizer_n,
                         cactor_optimizer_n=cactor_optimizer_n,
                         alpha_optimizer_n=alpha_optimizer_n,
                         calpha_optimizer_n=calpha_optimizer_n,
                         **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 14
0
def experiment(variant):
    num_agent = variant['num_agent']
    from sequential_differential_game import SequentialDifferentialGame
    expl_env = SequentialDifferentialGame(**variant['env_kwargs'])
    eval_env = SequentialDifferentialGame(**variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_ca = FullGraphBuilder(
                input_node_dim=obs_dim+action_dim,
                num_node=num_agent,
                batch_size=variant['algorithm_kwargs']['batch_size'],
                contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cgca = GraphContextNet(
            graph_builder_ca,
            obs_dim,
            action_dim,
            use_attention=variant['graph_kwargs']['use_attention'],
            num_layer=variant['graph_kwargs']['num_layer'],
            node_dim=variant['graph_kwargs']['hidden_dim'],
            output_activation='relu',
        )
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(input_size=variant['graph_kwargs']['hidden_dim'],
                    output_size=variant['cactor_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['cactor_kwargs']['hidden_dim']]*(variant['cactor_kwargs']['num_layer']-1),
                    ),
        nn.ReLU(),
        SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim),
                            nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)])
        )
    cactor = TanhGaussianPolicy(module=cactor)


    policy_n, expl_policy_n, eval_policy_n = [], [], []
    qf1_n, qf2_n, target_qf1_n, target_qf2_n = [], [], [], []
    for i in range(num_agent):
        qf1 = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*variant['qf_kwargs']['num_layer'],
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*variant['qf_kwargs']['num_layer'],
        )
        target_qf2 = copy.deepcopy(qf2)

        policy = nn.Sequential(
            FlattenMlp(input_size=obs_dim,
                        output_size=variant['policy_kwargs']['hidden_dim'],
                        hidden_sizes=[variant['policy_kwargs']['hidden_dim']]*(variant['policy_kwargs']['num_layer']-1),
                        ),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim),
                                nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)])
            )
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy
        
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)
        qf1_n.append(qf1)
        qf2_n.append(qf2)
        target_qf1_n.append(target_qf1)
        target_qf2_n.append(target_qf2)
        
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn3_onlyca import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        cactor=cactor,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 15
0
def experiment(variant):
    num_agent = variant['num_agent']
    from rlkit.envs.zmq_env import ZMQEnv
    expl_env = ZMQEnv(variant['port'])
    eval_env = expl_env
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf_n, qf2_n, policy_n, target_qf_n, target_qf2_n, target_policy_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], [], [], []
    for i in range(num_agent):
        qf = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            **variant['qf_kwargs']
        )
        qf2 = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            **variant['qf_kwargs']
        )
        policy = GumbelSoftmaxMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        target_qf = copy.deepcopy(qf)
        target_qf2 = copy.deepcopy(qf2)
        target_policy = copy.deepcopy(policy)
        eval_policy = ArgmaxDiscretePolicy(policy,use_preactivation=True)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        qf_n.append(qf)
        qf2_n.append(qf2)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_qf2_n.append(target_qf2)
        target_policy_n.append(target_policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 16
0
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=True,
                 discrete_action_input=True))
    eval_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=True,
                 discrete_action_input=True))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    policy_n, target_policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], [], [], []
    for i in range(num_agent):
        policy = SoftmaxMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])
        target_policy = copy.deepcopy(policy)
        qf1 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim *
                                     (num_agent - 1)),
                         output_size=action_dim,
                         **variant['qf_kwargs'])
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim *
                                     (num_agent - 1)),
                         output_size=action_dim,
                         **variant['qf_kwargs'])
        target_qf2 = copy.deepcopy(qf2)
        eval_policy = ArgmaxDiscretePolicy(policy)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        policy_n.append(policy)
        target_policy_n.append(target_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)
    trainer = PRGDiscreteTrainer(env=expl_env,
                                 qf1_n=qf1_n,
                                 target_qf1_n=target_qf1_n,
                                 qf2_n=qf2_n,
                                 target_qf2_n=target_qf2_n,
                                 policy_n=policy_n,
                                 target_policy_n=target_policy_n,
                                 **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 17
0
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(make_env(args.exp_name,discrete_action_space=False,world_args=variant['world_args']))
    eval_env = ParticleEnv(make_env(args.exp_name,discrete_action_space=False,world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from simple_spread_graph import SimpleSpreadGraphBuilder
    graph_builder_1 = SimpleSpreadGraphBuilder(
                                num_agents=expl_env.scenario.num_agents,
                                num_landmarks=expl_env.scenario.num_landmarks,
                                batch_size=variant['algorithm_kwargs']['batch_size'],
                                append_action=True,
                                single_observe=False,
                                contain_self_loop=False,
                            )
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    from rlkit.torch.networks.layers import SelectLayer
    cg1= nn.Sequential(
        GraphContextNet(
                graph_builder_1,
                graph_builder_1.output_node_dim-action_dim,
                action_dim,
                output_activation='lrelu0.2',
                **variant['graph_kwargs']
            ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
        )
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(input_size=variant['graph_kwargs']['node_dim']+action_dim,
                    output_size=1,
                    hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*(variant['qf_kwargs']['num_layer']-1),
                    hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                    )
    target_qf1 = copy.deepcopy(qf1)

    graph_builder_2 = SimpleSpreadGraphBuilder(
                                num_agents=expl_env.scenario.num_agents,
                                num_landmarks=expl_env.scenario.num_landmarks,
                                batch_size=variant['algorithm_kwargs']['batch_size'],
                                append_action=True,
                                single_observe=False,
                                contain_self_loop=False,
                            )
    cg2 = nn.Sequential(
        GraphContextNet(
                graph_builder_2,
                graph_builder_2.output_node_dim-action_dim,
                action_dim,
                output_activation='lrelu0.2',
                **variant['graph_kwargs']
            ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
        )
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(input_size=variant['graph_kwargs']['node_dim']+action_dim,
                    output_size=1,
                    hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*(variant['qf_kwargs']['num_layer']-1),
                    hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                    )
    target_qf2 = copy.deepcopy(qf2)


    graph_builder_ca = SimpleSpreadGraphBuilder(
                                num_agents=expl_env.scenario.num_agents,
                                num_landmarks=expl_env.scenario.num_landmarks,
                                batch_size=variant['algorithm_kwargs']['batch_size'],
                                append_action=True,
                                single_observe=False,
                                contain_self_loop=False,
                            )
    from rlkit.torch.networks.gnn_networks import GNNNet
    cgca = nn.Sequential(
            GNNNet(
                pre_graph_builder=graph_builder_ca,
                node_dim=variant['graph_kwargs']['node_dim'],
                conv_type='GSage',
                num_conv_layers=variant['graph_kwargs']['num_layer'],
                hidden_activation='lrelu0.2',
                output_activation='lrelu0.2',
            ),
            SelectLayer(dim=1, index=torch.arange(num_agent)),
        )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        FlattenMlp(input_size=variant['graph_kwargs']['node_dim'],
                    output_size=variant['cactor_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['cactor_kwargs']['hidden_dim']]*(variant['cactor_kwargs']['num_layer']-1),
                    hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                    output_activation=nn.LeakyReLU(negative_slope=0.2),
                    ),
        nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim),
                            nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)])
        )
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(input_size=obs_dim,
                        output_size=variant['policy_kwargs']['hidden_dim'],
                        hidden_sizes=[variant['policy_kwargs']['hidden_dim']]*(variant['policy_kwargs']['num_layer']-1),
                        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                        output_activation=nn.LeakyReLU(negative_slope=0.2),
                        ),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim),
                                nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)])
            )
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy
        
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)
        
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn3 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1,
        target_cg1=target_cg1,
        qf1=qf1,
        target_qf1=target_qf1,
        cg2=cg2,
        target_cg2=target_cg2,
        qf2=qf2,
        target_qf2=target_qf2,
        cgca=cgca,
        cactor=cactor,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)
    
    algorithm.train()
Ejemplo n.º 18
0
def experiment(variant):
    import gym
    import robosumo.envs
    from robosumo_env_wrapper import RoboSumoEnv
    expl_env = RoboSumoEnv(
        gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
        **variant['world_args'])
    eval_env = RoboSumoEnv(
        gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
        **variant['world_args'])
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf1_n, qf2_n, cactor_n, policy_n = [], [], [], []
    target_qf1_n, target_qf2_n = [], []
    log_alpha_n, log_calpha_n = None, None
    qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, cactor_optimizer_n, alpha_optimizer_n, calpha_optimizer_n  = \
        None, None, None, None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        qf1 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            variant['qf_kwargs']['num_layer'],
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            variant['qf_kwargs']['num_layer'],
        )
        target_qf2 = copy.deepcopy(qf2)
        from rlkit.torch.networks.layers import SplitLayer
        if variant['trainer_kwargs']['dec_cactor']:
            input_size = obs_dim + action_dim * (num_agent - 1)
        else:
            input_size = obs_dim * num_agent + action_dim * (num_agent - 1)
        cactor = nn.Sequential(
            FlattenMlp(
                input_size=input_size,
                output_size=variant['cactor_kwargs']['hidden_dim'],
                hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
                (variant['cactor_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        cactor = TanhGaussianPolicy(module=cactor)

        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)

        qf1_n.append(qf1)
        qf2_n.append(qf2)
        cactor_n.append(cactor)
        policy_n.append(policy)
        target_qf1_n.append(target_qf1)
        target_qf2_n.append(target_qf2)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.r2g.r2g import R2GTrainer
    trainer = R2GTrainer(env=expl_env,
                         qf1_n=qf1_n,
                         target_qf1_n=target_qf1_n,
                         qf2_n=qf2_n,
                         target_qf2_n=target_qf2_n,
                         policy_n=policy_n,
                         cactor_n=cactor_n,
                         log_alpha_n=log_alpha_n,
                         log_calpha_n=log_calpha_n,
                         qf1_optimizer_n=qf1_optimizer_n,
                         qf2_optimizer_n=qf2_optimizer_n,
                         policy_optimizer_n=policy_optimizer_n,
                         cactor_optimizer_n=cactor_optimizer_n,
                         alpha_optimizer_n=alpha_optimizer_n,
                         calpha_optimizer_n=calpha_optimizer_n,
                         **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 19
0
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, exploration_policy_n = \
        [], [], [], [], []
    qf2_n, target_qf2_n = [], []
    for i in range(num_agent):
        qf = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            **variant['qf_kwargs']
        )
        policy = TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        target_qf = copy.deepcopy(qf)
        target_policy = copy.deepcopy(policy)
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=OUStrategy(action_space=expl_env.action_space),
            policy=policy,
        )
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)
        exploration_policy_n.append(exploration_policy)
        if variant['trainer_kwargs']['double_q']:
            qf2 = FlattenMlp(
                input_size=(obs_dim*num_agent+action_dim*num_agent),
                output_size=1,
                **variant['qf_kwargs']
            )
            target_qf2 = copy.deepcopy(qf2)
            qf2_n.append(qf2)
            target_qf2_n.append(target_qf2)

    eval_path_collector = MAMdpPathCollector(eval_env, policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, exploration_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        qf2_n = qf2_n,
        target_qf2_n = target_qf2_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 20
0
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = \
        [], [], [], [], []
    log_alpha_n = None
    qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, alpha_optimizer_n = \
        None, None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(input_size=obs_dim,
                        output_size=variant['policy_kwargs']['hidden_dim'],
                        hidden_sizes=[variant['policy_kwargs']['hidden_dim']]*(variant['policy_kwargs']['num_layer']-1),
                        ),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim),
                                nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)])
            )
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        
        qf1 = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*variant['qf_kwargs']['num_layer'],
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*variant['qf_kwargs']['num_layer'],
        )
        target_qf2 = copy.deepcopy(qf2)

        policy_n.append(policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]

    if variant['random_exploration']:
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
        expl_policy_n = [PolicyWrappedWithExplorationStrategy(
                                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                                policy=policy,
                            ) for i in range(num_agent)]
    else:
        expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.masac.masac import MASACTrainer
    trainer = MASACTrainer(
        env = expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        log_alpha_n=log_alpha_n,
        qf1_optimizer_n=qf1_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        alpha_optimizer_n=alpha_optimizer_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 21
0
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_epoch = variant['load_kwargs']['load_epoch']
        load_data = torch.load('{}/itr_{}.pkl'.format(load_dir, load_epoch),
                               map_location='cpu')
        qf_n = load_data['trainer/qf_n']
        target_qf_n = load_data['trainer/target_qf_n']
        qf2_n, target_qf2_n = [], []
        policy_n = load_data['trainer/policy_n']
        target_policy_n = load_data['trainer/target_policy_n']

        qf_optimizer_n = load_data['trainer/qf_optimizer_n']
        qf2_optimizer_n = None
        policy_optimizer_n = load_data['trainer/policy_optimizer_n']

        replay_buffer = load_data['replay_buffer']
    else:
        qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = \
            [], [], [], [], [], []
        qf2_n, target_qf2_n = [], []
        qf_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
        for i in range(num_agent):
            from rlkit.torch.networks.networks import FlattenMlp
            qf = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                variant['qf_kwargs']['num_layer'],
            )
            target_qf = copy.deepcopy(qf)
            from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
            policy = TanhMlpPolicy(
                input_size=obs_dim,
                output_size=action_dim,
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                variant['policy_kwargs']['num_layer'],
            )
            target_policy = copy.deepcopy(policy)

            qf_n.append(qf)
            policy_n.append(policy)
            target_qf_n.append(target_qf)
            target_policy_n.append(target_policy)

        from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
        replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                          expl_env,
                                          num_agent=num_agent)

    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    expl_policy_n = [
        PolicyWrappedWithExplorationStrategy(
            exploration_strategy=OUStrategy(
                action_space=expl_env.action_space),
            policy=policy,
        ) for policy in policy_n
    ]
    eval_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    trainer = MADDPGTrainer(qf_n=qf_n,
                            target_qf_n=target_qf_n,
                            policy_n=policy_n,
                            target_policy_n=target_policy_n,
                            qf2_n=qf2_n,
                            target_qf2_n=target_qf2_n,
                            qf_optimizer_n=qf_optimizer_n,
                            qf2_optimizer_n=qf2_optimizer_n,
                            policy_optimizer_n=policy_optimizer_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 22
0
def experiment(variant):
    import sys
    sys.path.append("./particle-graph-envs")
    from make_env import make_env
    expl_env = make_env(args.exp_name,
                        discrete_action_space=False,
                        world_args=variant['world_args'])
    eval_env = make_env(args.exp_name,
                        discrete_action_space=False,
                        world_args=variant['world_args'])
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from particle_graph import ParticleGraphBuilder
    graph_builder_obs = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=False,
        contain_self_loop=False,
    )
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_eval = FullGraphBuilder(
        input_node_dim=graph_builder_obs.output_node_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.networks import FlattenMlp
    post_mlp1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'],
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    from rlkit.torch.networks.graph_r2g_qnet import R2GQNet
    qf1 = R2GQNet(
        obs_graph_builder=graph_builder_obs,
        eval_graph_builder=graph_builder_eval,
        obs_dim=graph_builder_obs.output_node_dim,
        action_dim=action_dim,
        post_mlp=post_mlp1,
        normalize_emb=False,
        output_activation=None,
        **variant['graph_kwargs'],
    )
    target_qf1 = copy.deepcopy(qf1)

    post_mlp2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'],
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    from rlkit.torch.networks.graph_r2g_qnet import R2GQNet
    qf2 = R2GQNet(
        obs_graph_builder=graph_builder_obs,
        eval_graph_builder=graph_builder_eval,
        obs_dim=graph_builder_obs.output_node_dim,
        action_dim=action_dim,
        post_mlp=post_mlp2,
        normalize_emb=False,
        output_activation=None,
        **variant['graph_kwargs'],
    )
    target_qf2 = copy.deepcopy(qf2)

    graph_builder_ca = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    from rlkit.torch.networks.gnn_networks import GNNNet
    from rlkit.torch.networks.layers import SelectLayer
    cgca = nn.Sequential(
        GNNNet(
            graph_builder_ca,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
            (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ), nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for agent in range(num_agent):
        graph_builder_policy = ParticleGraphBuilder(
            num_agents=expl_env.scenario.num_agents,
            num_landmarks=expl_env.scenario.num_landmarks,
            batch_size=variant['algorithm_kwargs']['batch_size'],
            append_action=False,
            contain_self_loop=False,
        )
        gnn_policy = GNNNet(
            graph_builder_policy,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        )
        from rlkit.torch.networks.layers import SplitLayer, FlattenLayer
        policy = nn.Sequential(
            gnn_policy, SelectLayer(dim=1, index=agent), FlattenLayer(),
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env,
                                             eval_policy_n,
                                             shared_obs=True)
    expl_path_collector = MAMdpPathCollector(expl_env,
                                             expl_policy_n,
                                             shared_obs=True)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent,
                                      shared_obs=True)

    from rlkit.torch.r2g.r2g_gnn11 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cactor=cactor,
                            policy_n=policy_n,
                            shared_obs=True,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
Ejemplo n.º 23
0
def experiment(variant):
    import gym
    import robosumo.envs
    from robosumo_env_wrapper import RoboSumoEnv
    expl_env = RoboSumoEnv(gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name,args.exp_name)),**variant['world_args'])
    eval_env = RoboSumoEnv(gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name,args.exp_name)),**variant['world_args'])
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], []
    qf2_n, target_qf2_n = [], []
    qf_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        qf = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*variant['qf_kwargs']['num_layer'],
        )
        target_qf = copy.deepcopy(qf)
        from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
        policy = TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=[variant['policy_kwargs']['hidden_dim']]*variant['policy_kwargs']['num_layer'],
        )
        target_policy = copy.deepcopy(policy)
        
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    expl_policy_n = [PolicyWrappedWithExplorationStrategy(
                            exploration_strategy=OUStrategy(action_space=expl_env.action_space),
                            policy=policy,
                        ) for policy in policy_n]
    eval_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        qf2_n = qf2_n,
        target_qf2_n = target_qf2_n,
        qf_optimizer_n=qf_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 24
0
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(graph_builder_1,
                          obs_dim,
                          action_dim,
                          output_activation='lrelu0.2',
                          **variant['graph_kwargs'])
    target_cg1 = copy.deepcopy(cg1)

    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GraphContextNet(graph_builder_2,
                          obs_dim,
                          action_dim,
                          output_activation='lrelu0.2',
                          **variant['graph_kwargs'])
    target_cg2 = copy.deepcopy(cg2)

    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cgca = GraphContextNet(graph_builder_ca,
                           obs_dim,
                           action_dim,
                           output_activation='lrelu0.2',
                           **variant['graph_kwargs'])

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    qf1_n, target_qf1_n, qf2_n, target_qf2_n = [], [], [], []
    cactor_n = []
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        qf1 = FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'] + action_dim,
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'] + action_dim,
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        )
        target_qf2 = copy.deepcopy(qf2)

        from rlkit.torch.networks.layers import SplitLayer
        cactor = nn.Sequential(
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['cactor_kwargs']['hidden_dim'],
                hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
                (variant['cactor_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            ), nn.LeakyReLU(negative_slope=0.2),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        cactor = TanhGaussianPolicy(module=cactor)

        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)
        cactor_n.append(cactor)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn4 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            cg1=cg1,
                            target_cg1=target_cg1,
                            qf1_n=qf1_n,
                            target_qf1_n=target_qf1_n,
                            cg2=cg2,
                            target_cg2=target_cg2,
                            qf2_n=qf2_n,
                            target_qf2_n=target_qf2_n,
                            cgca=cgca,
                            cactor_n=cactor_n,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Ejemplo n.º 25
0
def experiment(variant):
    from matrix_game import MatrixGame
    expl_env = MatrixGame(game_name=args.exp_name)
    eval_env = MatrixGame(game_name=args.exp_name)
    num_agent = eval_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], [], []
    for i in range(num_agent):
        policy = SoftmaxMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        qf1 = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*(num_agent-1)),
            output_size=action_dim,
            **variant['qf_kwargs']
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*(num_agent-1)),
            output_size=action_dim,
            **variant['qf_kwargs']
        )
        target_qf2 = copy.deepcopy(qf2)
        eval_policy = ArgmaxDiscretePolicy(policy)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        policy_n.append(policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MASACDiscreteTrainer(
        env = expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()