Code example #1
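Note: each listing on this page is the body of an experiment() launcher only; the original script defines its imports and command-line arguments at module level. A minimal preamble sketch for this example (illustrative, not copied from the repository) would look roughly like:

import argparse
import copy
import os.path as osp

import torch
import torch.nn as nn

import rlkit.torch.pytorch_util as ptu
# get_generic_ma_path_information is a multi-agent logging helper from the
# maxiaoba/rlkit fork; its exact module path is not shown in this excerpt.

parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', type=str)  # experiment() reads args.exp_name
args = parser.parse_args()
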
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(graph_builder_1,
                          obs_dim,
                          action_dim,
                          output_activation='lrelu0.2',
                          **variant['graph_kwargs'])
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)

    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GraphContextNet(graph_builder_2,
                          obs_dim,
                          action_dim,
                          output_activation='lrelu0.2',
                          **variant['graph_kwargs'])
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)

    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    cgca = GNNNet(
        pre_graph_builder=graph_builder_ca,
        node_dim=variant['graph_kwargs']['node_dim'],
        conv_type='GSage',
        num_conv_layers=variant['graph_kwargs']['num_layer'],
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
            (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ), nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn3 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            cg1=cg1,
                            target_cg1=target_cg1,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            cg2=cg2,
                            target_cg2=target_cg2,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cgca=cgca,
                            cactor=cactor,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
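
For reference, here is a sketch of the variant dictionary this experiment() expects. The keys are the ones accessed above; the concrete values are illustrative assumptions, not the settings used in the original runs:

variant = dict(
    world_args=dict(),                     # forwarded to make_env
    replay_buffer_size=int(1e6),
    algorithm_kwargs=dict(                 # consumed by TorchBatchRLAlgorithm
        batch_size=256,                    # also used to size the graph builders
        num_epochs=100,
        max_path_length=25,
        num_eval_steps_per_epoch=500,
        num_expl_steps_per_train_loop=500,
        num_trains_per_train_loop=100,
        min_num_steps_before_training=500,
    ),
    graph_kwargs=dict(node_dim=16, num_layer=2),
    qf_kwargs=dict(hidden_dim=64, num_layer=2),
    # the cactor's SplitLayer is built with policy_kwargs['hidden_dim'], so
    # cactor_kwargs['hidden_dim'] must match it
    cactor_kwargs=dict(hidden_dim=64, num_layer=2),
    policy_kwargs=dict(hidden_dim=64, num_layer=2),
    trainer_kwargs=dict(),                 # forwarded to R2GGNNTrainer
)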
Code example #2
def experiment(variant):
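    # Assumes the same module-level preamble as Code example #1 (copy, torch,
    # torch.nn as nn, os.path as osp, rlkit.torch.pytorch_util as ptu, and
    # get_generic_ma_path_information); no argparse args are needed here.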
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_obs = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    obs_gnn_1 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )

    graph_builder_eval = FullGraphBuilder(
        input_node_dim=graph_builder_obs.output_node_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
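    # gnn_out_dim is the per-agent feature size fed to the post-MLPs below:
    # with concat_emb it covers the raw observation plus one node embedding per
    # conv layer; otherwise only the final node embedding of size node_dim.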
    if variant['concat_emb']:
        gnn_out_dim = int(obs_dim + variant['graph_kwargs']['node_dim'] *
                          variant['graph_kwargs']['num_conv_layers'])
    else:
        gnn_out_dim = variant['graph_kwargs']['node_dim']
    from rlkit.torch.networks.networks import FlattenMlp
    post_mlp1 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    from rlkit.torch.networks.graph_r2g_qnet2 import R2GQNet
    qf1 = R2GQNet(
        obs_gnn=obs_gnn_1,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp1,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf1 = copy.deepcopy(qf1)

    obs_gnn_2 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    post_mlp2 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    qf2 = R2GQNet(
        obs_gnn=obs_gnn_2,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp2,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf2 = copy.deepcopy(qf2)

    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    cgca = GNNNet(
        graph_builder_ca,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
            (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ), nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    graph_builder_policy = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env,
                                             eval_policy_n,
                                             shared_encoder=obs_gnn_1)
    expl_path_collector = MAMdpPathCollector(expl_env,
                                             expl_policy_n,
                                             shared_encoder=obs_gnn_1)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn12 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cactor=cactor,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
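
These experiment() functions are typically driven by a small launcher that builds a variant like the one sketched after Code example #1 and sets up rlkit's logger first. A hypothetical minimal version (the experiment prefix string is a placeholder):

if __name__ == "__main__":
    import torch
    import rlkit.torch.pytorch_util as ptu
    from rlkit.launchers.launcher_util import setup_logger

    variant = dict()  # fill in as sketched after Code example #1
    setup_logger('r2g-example', variant=variant)  # creates the snapshot dir used above
    ptu.set_gpu_mode(torch.cuda.is_available())
    experiment(variant)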
Code example #3
def experiment(variant):
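    # Same module-level preamble assumed as in Code example #1 (copy, torch,
    # nn, osp, ptu, get_generic_ma_path_information); no argparse args needed.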
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(graph_builder_1,
                          obs_dim,
                          action_dim,
                          output_activation='lrelu0.2',
                          **variant['graph_kwargs'])
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)

    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GraphContextNet(graph_builder_2,
                          obs_dim,
                          action_dim,
                          output_activation='lrelu0.2',
                          **variant['graph_kwargs'])
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.masac.masac_gnn_gcontext import MASACGNNTrainer
    trainer = MASACGNNTrainer(env=expl_env,
                              cg1=cg1,
                              target_cg1=target_cg1,
                              qf1=qf1,
                              target_qf1=target_qf1,
                              cg2=cg2,
                              target_cg2=target_cg2,
                              qf2=qf2,
                              target_qf2=target_qf2,
                              policy_n=policy_n,
                              **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
Code example #4
File: r2g_gnn10_gaussian.py  Project: maxiaoba/rlkit
def experiment(variant):
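    # Same module-level preamble assumed as in Code example #1, including the
    # argparse `args` namespace whose args.exp_name is read below.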
    import sys
    sys.path.append("./particle-graph-envs")
    from make_env import make_env
    expl_env = make_env(args.exp_name,
                        discrete_action_space=False,
                        world_args=variant['world_args'])
    eval_env = make_env(args.exp_name,
                        discrete_action_space=False,
                        world_args=variant['world_args'])
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from particle_graph import ParticleGraphBuilder
    graph_builder_cg1 = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    from rlkit.torch.networks.gnn_networks import GNNNet
    from rlkit.torch.networks.layers import SelectLayer
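    # The particle graph contains landmark nodes in addition to agent nodes
    # (the builder takes num_agents and num_landmarks), so SelectLayer keeps
    # only the first num_agent node embeddings along the node dimension (dim 1).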
    cg1 = nn.Sequential(
        GNNNet(
            graph_builder_cg1,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)

    graph_builder_cg2 = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    from rlkit.torch.networks.gnn_networks import GNNNet
    cg2 = nn.Sequential(
        GNNNet(
            graph_builder_cg2,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)

    graph_builder_ca = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    cgca = nn.Sequential(
        GNNNet(
            graph_builder_ca,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
            (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ), nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for agent in range(num_agent):
        graph_builder_policy = ParticleGraphBuilder(
            num_agents=expl_env.scenario.num_agents,
            num_landmarks=expl_env.scenario.num_landmarks,
            batch_size=variant['algorithm_kwargs']['batch_size'],
            append_action=False,
            contain_self_loop=False,
        )
        gnn_policy = GNNNet(
            graph_builder_policy,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        )
        from rlkit.torch.networks.layers import SplitLayer, FlattenLayer
        policy = nn.Sequential(
            gnn_policy, SelectLayer(dim=1, index=agent), FlattenLayer(),
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env,
                                             eval_policy_n,
                                             shared_obs=True)
    expl_path_collector = MAMdpPathCollector(expl_env,
                                             expl_policy_n,
                                             shared_obs=True)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent,
                                      shared_obs=True)

    from rlkit.torch.r2g.r2g_gnn10 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            cg1=cg1,
                            target_cg1=target_cg1,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            cg2=cg2,
                            target_cg2=target_cg2,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cactor=cactor,
                            policy_n=policy_n,
                            shared_obs=True,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
Code example #5
def awac_rig_experiment(
    max_path_length,
    qf_kwargs,
    trainer_kwargs,
    replay_buffer_kwargs,
    policy_kwargs,
    algo_kwargs,
    train_vae_kwargs,
    policy_class=TanhGaussianPolicy,
    env_id=None,
    env_class=None,
    env_kwargs=None,
    reward_kwargs=None,
    observation_key='latent_observation',
    desired_goal_key='latent_desired_goal',
    state_observation_key='state_observation',
    state_goal_key='state_desired_goal',
    image_goal_key='image_desired_goal',
    path_loader_class=MDPPathLoader,
    demo_replay_buffer_kwargs=None,
    path_loader_kwargs=None,
    env_demo_path='',
    env_offpolicy_data_path='',
    debug=False,
    epsilon=1.0,
    exploration_policy_kwargs=None,
    evaluation_goal_sampling_mode=None,
    exploration_goal_sampling_mode=None,
    add_env_demos=False,
    add_env_offpolicy_data=False,
    save_paths=False,
    load_demos=False,
    pretrain_policy=False,
    pretrain_rl=False,
    save_pretrained_algorithm=False,

    # Video parameters
    save_video=True,
    save_video_kwargs=None,
    renderer_kwargs=None,
    imsize=84,
    pretrained_vae_path="",
    presampled_goals_path="",
    init_camera=None,
    qf_class=ConcatMlp,
):
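    # NOTE: only the function body is excerpted; the original rlkit launcher
    # imports the names used below at module level (ConcatMlp, ConcatCNN,
    # TanhGaussianPolicy, MDPPathLoader, EnvRenderer, InsertImageEnv,
    # ContextualEnv, GraspingRewardFn, PriorDistribution,
    # StateImageGoalDiagnosticsFn, RemapKeyFn, get_gym_env,
    # load_local_or_remote_file, train_vae, ContextualRelabelingReplayBuffer,
    # ContextualPathCollector, MakeDeterministic, create_exploration_policy,
    # AWACTrainer, TorchBatchRLAlgorithm, RIGVideoSaveFunction, logger,
    # numpy as np, torch, os.path as osp, and ptu).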

    #Kwarg Definitions
    if exploration_policy_kwargs is None:
        exploration_policy_kwargs = {}
    if demo_replay_buffer_kwargs is None:
        demo_replay_buffer_kwargs = {}
    if path_loader_kwargs is None:
        path_loader_kwargs = {}
    if not save_video_kwargs:
        save_video_kwargs = {}
    if not renderer_kwargs:
        renderer_kwargs = {}
    if reward_kwargs is None:
        # guard: reward_kwargs.get(...) is called below even when no reward
        # settings are passed
        reward_kwargs = {}

    if debug:
        max_path_length = 5
        algo_kwargs['batch_size'] = 5
        algo_kwargs['num_epochs'] = 5
        algo_kwargs['num_eval_steps_per_epoch'] = 100
        algo_kwargs['num_expl_steps_per_train_loop'] = 100
        algo_kwargs['num_trains_per_train_loop'] = 10
        algo_kwargs['min_num_steps_before_training'] = 100
        trainer_kwargs['bc_num_pretrain_steps'] = min(
            10, trainer_kwargs.get('bc_num_pretrain_steps', 0))
        trainer_kwargs['q_num_pretrain1_steps'] = min(
            10, trainer_kwargs.get('q_num_pretrain1_steps', 0))
        trainer_kwargs['q_num_pretrain2_steps'] = min(
            10, trainer_kwargs.get('q_num_pretrain2_steps', 0))

    #Environment Wrapping
    renderer = EnvRenderer(init_camera=init_camera, **renderer_kwargs)

    def contextual_env_distrib_and_reward(env_id, env_class, env_kwargs,
                                          goal_sampling_mode,
                                          presampled_goals_path):
        state_env = get_gym_env(env_id,
                                env_class=env_class,
                                env_kwargs=env_kwargs)
        renderer = EnvRenderer(init_camera=init_camera, **renderer_kwargs)
        img_env = InsertImageEnv(state_env, renderer=renderer)

        # encoded_env = EncoderWrappedEnv(
        #     img_env,
        #     model,
        #     dict(image_observation="latent_observation", ),
        # )
        # if goal_sampling_mode == "vae_prior":
        #     latent_goal_distribution = PriorDistribution(
        #         model.representation_size,
        #         desired_goal_key,
        #     )
        #     diagnostics = StateImageGoalDiagnosticsFn({}, )
        # elif goal_sampling_mode == "presampled":
        #     diagnostics = state_env.get_contextual_diagnostics
        #     image_goal_distribution = PresampledPathDistribution(
        #         presampled_goals_path,
        #     )

        #     latent_goal_distribution = AddLatentDistribution(
        #         image_goal_distribution,
        #         image_goal_key,
        #         desired_goal_key,
        #         model,
        #     )
        # elif goal_sampling_mode == "reset_of_env":
        #     state_goal_env = get_gym_env(env_id, env_class=env_class, env_kwargs=env_kwargs)
        #     state_goal_distribution = GoalDictDistributionFromMultitaskEnv(
        #         state_goal_env,
        #         desired_goal_keys=[state_goal_key],
        #     )
        #     image_goal_distribution = AddImageDistribution(
        #         env=state_env,
        #         base_distribution=state_goal_distribution,
        #         image_goal_key=image_goal_key,
        #         renderer=renderer,
        #     )
        #     latent_goal_distribution = AddLatentDistribution(
        #         image_goal_distribution,
        #         image_goal_key,
        #         desired_goal_key,
        #         model,
        #     )
        #     no_goal_distribution = PriorDistribution(
        #         representation_size=0,
        #         key="no_goal",
        #     )
        #     diagnostics = state_goal_env.get_contextual_diagnostics
        # else:
        #     error
        diagnostics = StateImageGoalDiagnosticsFn({}, )
        no_goal_distribution = PriorDistribution(
            representation_size=0,
            key="no_goal",
        )

        reward_fn = GraspingRewardFn(
            # img_env, # state_env,
            # observation_key=observation_key,
            # desired_goal_key=desired_goal_key,
            # **reward_kwargs
        )

        env = ContextualEnv(
            img_env,  # state_env,
            context_distribution=no_goal_distribution,
            reward_fn=reward_fn,
            observation_key=observation_key,
            contextual_diagnostics_fns=[diagnostics],
        )
        return env, no_goal_distribution, reward_fn

    #VAE Setup
    if pretrained_vae_path:
        model = load_local_or_remote_file(pretrained_vae_path)
    else:
        model = train_vae(train_vae_kwargs, env_kwargs, env_id, env_class,
                          imsize, init_camera)
    path_loader_kwargs['model_path'] = pretrained_vae_path

    #Environment Definitions
    expl_env, expl_context_distrib, expl_reward = contextual_env_distrib_and_reward(
        env_id, env_class, env_kwargs, exploration_goal_sampling_mode,
        presampled_goals_path)
    eval_env, eval_context_distrib, eval_reward = contextual_env_distrib_and_reward(
        env_id, env_class, env_kwargs, evaluation_goal_sampling_mode,
        presampled_goals_path)
    path_loader_kwargs['env'] = eval_env

    #AWAC Code
    if add_env_demos:
        path_loader_kwargs["demo_paths"].append(env_demo_path)
    if add_env_offpolicy_data:
        path_loader_kwargs["demo_paths"].append(env_offpolicy_data_path)

    #Key Setting
    context_key = desired_goal_key
    obs_dim = (expl_env.observation_space.spaces[observation_key].low.size +
               expl_env.observation_space.spaces[context_key].low.size)
    action_dim = expl_env.action_space.low.size

    state_rewards = reward_kwargs.get('reward_type', 'dense') == 'wrapped_env'
    # if state_rewards:
    #     mapper = RemapKeyFn({context_key: observation_key, state_goal_key: state_observation_key})
    #     obs_keys = [state_observation_key, observation_key]
    #     cont_keys = [state_goal_key, context_key]
    # else:
    mapper = RemapKeyFn({context_key: observation_key})
    obs_keys = [observation_key]
    cont_keys = [context_key]

    #Replay Buffer
    def concat_context_to_obs(batch, replay_buffer, obs_dict, next_obs_dict,
                              new_contexts):
        obs = batch['observations']
        next_obs = batch['next_observations']
        context = batch[context_key]
        batch['observations'] = np.concatenate([obs, context], axis=1)
        batch['next_observations'] = np.concatenate([next_obs, context],
                                                    axis=1)
        return batch

    replay_buffer = ContextualRelabelingReplayBuffer(
        env=eval_env,
        context_keys=cont_keys,
        observation_keys=obs_keys,
        observation_key=observation_key,
        context_distribution=expl_context_distrib,
        sample_context_from_obs_dict_fn=mapper,
        reward_fn=eval_reward,
        post_process_batch_fn=concat_context_to_obs,
        **replay_buffer_kwargs)
    replay_buffer_kwargs.update(demo_replay_buffer_kwargs)
    demo_train_buffer = ContextualRelabelingReplayBuffer(
        env=eval_env,
        context_keys=cont_keys,
        observation_keys=obs_keys,
        observation_key=observation_key,
        context_distribution=expl_context_distrib,
        sample_context_from_obs_dict_fn=mapper,
        reward_fn=eval_reward,
        post_process_batch_fn=concat_context_to_obs,
        **replay_buffer_kwargs)
    demo_test_buffer = ContextualRelabelingReplayBuffer(
        env=eval_env,
        context_keys=cont_keys,
        observation_keys=obs_keys,
        observation_key=observation_key,
        context_distribution=expl_context_distrib,
        sample_context_from_obs_dict_fn=mapper,
        reward_fn=eval_reward,
        post_process_batch_fn=concat_context_to_obs,
        **replay_buffer_kwargs)

    #Neural Network Architecture
    def create_qf():
        # return ConcatMlp(
        #     input_size=obs_dim + action_dim,
        #     output_size=1,
        #     **qf_kwargs
        # )
        if qf_class is ConcatMlp:
            qf_kwargs["input_size"] = obs_dim + action_dim
        if qf_class is ConcatCNN:
            qf_kwargs["added_fc_input_size"] = action_dim
        return qf_class(output_size=1, **qf_kwargs)

    qf1 = create_qf()
    qf2 = create_qf()
    target_qf1 = create_qf()
    target_qf2 = create_qf()

    policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **policy_kwargs,
    )

    #Path Collectors
    eval_path_collector = ContextualPathCollector(
        eval_env,
        MakeDeterministic(policy),
        observation_key=observation_key,
        context_keys_for_policy=[
            context_key,
        ],
    )
    exploration_policy = create_exploration_policy(expl_env, policy,
                                                   **exploration_policy_kwargs)
    expl_path_collector = ContextualPathCollector(
        expl_env,
        exploration_policy,
        observation_key=observation_key,
        context_keys_for_policy=[
            context_key,
        ],
    )

    #Algorithm
    trainer = AWACTrainer(env=eval_env,
                          policy=policy,
                          qf1=qf1,
                          qf2=qf2,
                          target_qf1=target_qf1,
                          target_qf2=target_qf2,
                          **trainer_kwargs)

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=max_path_length,
        **algo_kwargs)

    algorithm.to(ptu.device)

    #Video Saving
    if save_video:

        expl_video_func = RIGVideoSaveFunction(
            model,
            expl_path_collector,
            "train",
            # decode_goal_image_key="image_decoded_goal",
            # reconstruction_key="image_reconstruction",
            rows=2,
            columns=5,
            unnormalize=True,
            imsize=imsize,
            image_format=renderer.output_image_format,
            **save_video_kwargs)
        algorithm.post_train_funcs.append(expl_video_func)

        eval_video_func = RIGVideoSaveFunction(
            model,
            eval_path_collector,
            "eval",
            # goal_image_key=image_goal_key,
            # decode_goal_image_key="image_decoded_goal",
            # reconstruction_key="image_reconstruction",
            num_imgs=4,
            rows=2,
            columns=5,
            unnormalize=True,
            imsize=imsize,
            image_format=renderer.output_image_format,
            **save_video_kwargs)
        algorithm.post_train_funcs.append(eval_video_func)

    #AWAC Code
    if save_paths:
        algorithm.post_train_funcs.append(save_paths)

    if load_demos:
        path_loader = path_loader_class(
            trainer,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            # reward_fn=eval_reward, # omit reward because its recomputed later
            **path_loader_kwargs)
        path_loader.load_demos()
    if pretrain_policy:
        trainer.pretrain_policy_with_bc(
            policy,
            demo_train_buffer,
            demo_test_buffer,
            trainer.bc_num_pretrain_steps,
        )
    if pretrain_rl:
        trainer.pretrain_q_with_bc_data()

    if save_pretrained_algorithm:
        p_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.p')
        pt_path = osp.join(logger.get_snapshot_dir(), 'pretrain_algorithm.pt')
        data = algorithm._get_snapshot()
        data['algorithm'] = algorithm
        torch.save(data, open(pt_path, "wb"))
        torch.save(data, open(p_path, "wb"))

    algorithm.train()