def experiment(variant):
    """Build and train an R2G-GNN agent on ``MultiDifferentialGame``.

    The function creates separate exploration and evaluation environments,
    twin graph Q-networks with deep-copied targets, a centralized actor
    (``cactor``), one Tanh-Gaussian policy per agent, and a GNN encoder
    (``shared_gnn``) that is handed to the path collectors and the trainer.
    It then saves the untrained snapshot as ``itr_-1.pkl`` in the logger's
    snapshot directory and starts training.

    Relies on module-level names ``nn``, ``copy``, ``torch``, ``osp``,
    ``ptu`` and ``get_generic_ma_path_information``.

    Args:
        variant: Nested configuration dict. Keys read here:
            ``env_kwargs``, ``algorithm_kwargs`` (must contain
            ``batch_size``), ``concat_emb``, ``graph_kwargs`` (``node_dim``
            and, when ``concat_emb`` is truthy, ``num_conv_layers``),
            ``qf_kwargs`` / ``cactor_kwargs`` / ``policy_kwargs`` (each with
            ``hidden_dim`` and ``num_layer``), ``random_exploration``,
            ``replay_buffer_size`` and ``trainer_kwargs``.
    """
    from multi_differential_game import MultiDifferentialGame
    from rlkit.core import logger
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    from rlkit.torch.networks.gnn_networks import GNNNet
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    from rlkit.torch.networks.graph_r2g_qnet import R2GQNet
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    from rlkit.torch.r2g.r2g_gnn11 import R2GGNNTrainer
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm

    random_exploration = variant['random_exploration']
    if random_exploration:
        # Only needed when exploration actions are drawn at random.
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy

    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    graph_kwargs = variant['graph_kwargs']
    qf_kwargs = variant['qf_kwargs']
    cactor_kwargs = variant['cactor_kwargs']
    policy_kwargs = variant['policy_kwargs']
    batch_size = variant['algorithm_kwargs']['batch_size']
    node_dim = graph_kwargs['node_dim']

    def leaky_relu():
        # A fresh activation module per use site, as in the original wiring.
        return nn.LeakyReLU(negative_slope=0.2)

    def hidden_sizes(net_kwargs):
        # ``num_layer`` counts the output layer, hence the ``- 1``.
        return [net_kwargs['hidden_dim']] * (net_kwargs['num_layer'] - 1)

    def full_graph(input_node_dim):
        # Fully connected agent graph without self loops.
        return FullGraphBuilder(
            input_node_dim=input_node_dim,
            num_node=num_agent,
            batch_size=batch_size,
            contain_self_loop=False,
        )

    def gaussian_head(in_dim):
        # Two parallel linear heads wrapped in a SplitLayer and consumed by
        # TanhGaussianPolicy.
        return SplitLayer(layers=[
            nn.Linear(in_dim, action_dim),
            nn.Linear(in_dim, action_dim),
        ])

    # ---- critics -------------------------------------------------------
    graph_builder_obs = full_graph(obs_dim)
    graph_builder_eval = full_graph(graph_builder_obs.output_node_dim)

    if variant['concat_emb']:
        gnn_out_dim = int(obs_dim + node_dim * graph_kwargs['num_conv_layers'])
    else:
        gnn_out_dim = node_dim

    def build_qf():
        # Both critics share the graph builders but own their post-MLP.
        post_mlp = FlattenMlp(
            input_size=gnn_out_dim,
            output_size=1,
            hidden_sizes=hidden_sizes(qf_kwargs),
            hidden_activation=leaky_relu(),
        )
        return R2GQNet(
            obs_graph_builder=graph_builder_obs,
            eval_graph_builder=graph_builder_eval,
            obs_dim=obs_dim,
            action_dim=action_dim,
            post_mlp=post_mlp,
            normalize_emb=False,
            output_activation=None,
            concat_emb=variant['concat_emb'],
            **graph_kwargs,
        )

    qf1 = build_qf()
    target_qf1 = copy.deepcopy(qf1)
    qf2 = build_qf()
    target_qf2 = copy.deepcopy(qf2)

    # ---- centralized actor --------------------------------------------
    graph_builder_ca = full_graph(obs_dim + action_dim)
    cgca = GNNNet(
        graph_builder_ca,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **graph_kwargs,
    )
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=node_dim,
            output_size=cactor_kwargs['hidden_dim'],
            hidden_sizes=hidden_sizes(cactor_kwargs),
            hidden_activation=leaky_relu(),
            output_activation=leaky_relu(),
        ),
        # Kept as in the original: a second LeakyReLU on top of the MLP's own
        # output activation.
        leaky_relu(),
        # The head consumes the cactor MLP output, so its input width must be
        # the cactor hidden size (not the per-agent policy hidden size).
        gaussian_head(cactor_kwargs['hidden_dim']),
    )
    cactor = TanhGaussianPolicy(module=cactor)

    # ---- per-agent policies -------------------------------------------
    graph_builder_policy = full_graph(obs_dim)
    shared_gnn = GNNNet(
        graph_builder_policy,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **graph_kwargs,
    )

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for _ in range(num_agent):
        # The policy MLP takes ``node_dim`` inputs; ``shared_gnn`` is passed
        # to the collectors as ``shared_encoder`` rather than embedded here.
        policy = nn.Sequential(
            FlattenMlp(
                input_size=node_dim,
                output_size=policy_kwargs['hidden_dim'],
                hidden_sizes=hidden_sizes(policy_kwargs),
                hidden_activation=leaky_relu(),
                output_activation=leaky_relu(),
            ),
            gaussian_head(policy_kwargs['hidden_dim']),
        )
        policy = TanhGaussianPolicy(module=policy)
        eval_policy = MakeDeterministic(policy)
        if random_exploration:
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(
                    expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    # ---- data collection, trainer, algorithm --------------------------
    eval_path_collector = MAMdpPathCollector(
        eval_env, eval_policy_n, shared_encoder=shared_gnn)
    expl_path_collector = MAMdpPathCollector(
        expl_env, expl_policy_n, shared_encoder=shared_gnn)

    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    trainer = R2GGNNTrainer(
        env=expl_env,
        qf1=qf1,
        target_qf1=target_qf1,
        qf2=qf2,
        target_qf2=target_qf2,
        cactor=cactor,
        policy_n=policy_n,
        shared_gnn=shared_gnn,
        **variant['trainer_kwargs']
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)

    # save init params
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
def experiment(variant):
    """Build and train an R2G-GNN agent on a particle-graph environment.

    The function creates exploration and evaluation environments through
    ``make_env`` (loaded from ``./particle-graph-envs``), twin graph
    Q-networks with deep-copied targets, a centralized actor (``cactor``),
    and one GNN-based Tanh-Gaussian policy per agent. Collectors, replay
    buffer and trainer are all created with ``shared_obs=True``. It then
    saves the untrained snapshot as ``itr_-1.pkl`` in the logger's snapshot
    directory and starts training.

    Relies on module-level names ``args`` (for ``args.exp_name``), ``nn``,
    ``copy``, ``torch``, ``osp``, ``ptu`` and
    ``get_generic_ma_path_information``.

    Args:
        variant: Nested configuration dict. Keys read here:
            ``world_args``, ``algorithm_kwargs`` (must contain
            ``batch_size``), ``graph_kwargs`` (must contain ``node_dim``),
            ``qf_kwargs`` / ``cactor_kwargs`` / ``policy_kwargs`` (each with
            ``hidden_dim`` and ``num_layer``), ``random_exploration``,
            ``replay_buffer_size`` and ``trainer_kwargs``.
    """
    import sys
    # Must run before the environment-package imports below.
    sys.path.append("./particle-graph-envs")
    from make_env import make_env
    from particle_graph import ParticleGraphBuilder
    from rlkit.core import logger
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    from rlkit.torch.networks.gnn_networks import GNNNet
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    from rlkit.torch.networks.graph_r2g_qnet import R2GQNet
    from rlkit.torch.networks.layers import FlattenLayer, SelectLayer, SplitLayer
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    from rlkit.torch.r2g.r2g_gnn11 import R2GGNNTrainer
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm

    random_exploration = variant['random_exploration']
    if random_exploration:
        # Only needed when exploration actions are drawn at random.
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy

    expl_env = make_env(args.exp_name, discrete_action_space=False,
                        world_args=variant['world_args'])
    eval_env = make_env(args.exp_name, discrete_action_space=False,
                        world_args=variant['world_args'])
    num_agent = expl_env.num_agents
    action_dim = eval_env.action_space.low.size

    graph_kwargs = variant['graph_kwargs']
    qf_kwargs = variant['qf_kwargs']
    cactor_kwargs = variant['cactor_kwargs']
    policy_kwargs = variant['policy_kwargs']
    batch_size = variant['algorithm_kwargs']['batch_size']
    node_dim = graph_kwargs['node_dim']

    def leaky_relu():
        # A fresh activation module per use site, as in the original wiring.
        return nn.LeakyReLU(negative_slope=0.2)

    def hidden_sizes(net_kwargs):
        # ``num_layer`` counts the output layer, hence the ``- 1``.
        return [net_kwargs['hidden_dim']] * (net_kwargs['num_layer'] - 1)

    def particle_graph(append_action):
        # Graph builder sized from the scenario's agents and landmarks, no
        # self loops.
        return ParticleGraphBuilder(
            num_agents=expl_env.scenario.num_agents,
            num_landmarks=expl_env.scenario.num_landmarks,
            batch_size=batch_size,
            append_action=append_action,
            contain_self_loop=False,
        )

    def particle_gnn(graph_builder):
        return GNNNet(
            graph_builder,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **graph_kwargs,
        )

    def gaussian_head(in_dim):
        # Two parallel linear heads wrapped in a SplitLayer and consumed by
        # TanhGaussianPolicy.
        return SplitLayer(layers=[
            nn.Linear(in_dim, action_dim),
            nn.Linear(in_dim, action_dim),
        ])

    # ---- critics -------------------------------------------------------
    graph_builder_obs = particle_graph(append_action=False)
    graph_builder_eval = FullGraphBuilder(
        input_node_dim=graph_builder_obs.output_node_dim,
        num_node=num_agent,
        batch_size=batch_size,
        contain_self_loop=False,
    )

    def build_qf():
        # Both critics share the graph builders but own their post-MLP.
        post_mlp = FlattenMlp(
            input_size=node_dim,
            output_size=1,
            hidden_sizes=hidden_sizes(qf_kwargs),
            hidden_activation=leaky_relu(),
        )
        return R2GQNet(
            obs_graph_builder=graph_builder_obs,
            eval_graph_builder=graph_builder_eval,
            obs_dim=graph_builder_obs.output_node_dim,
            action_dim=action_dim,
            post_mlp=post_mlp,
            normalize_emb=False,
            output_activation=None,
            **graph_kwargs,
        )

    qf1 = build_qf()
    target_qf1 = copy.deepcopy(qf1)
    qf2 = build_qf()
    target_qf2 = copy.deepcopy(qf2)

    # ---- centralized actor --------------------------------------------
    graph_builder_ca = particle_graph(append_action=True)
    cgca = nn.Sequential(
        particle_gnn(graph_builder_ca),
        # Select indices 0..num_agent-1 along dim 1 of the GNN output.
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=node_dim,
            output_size=cactor_kwargs['hidden_dim'],
            hidden_sizes=hidden_sizes(cactor_kwargs),
            hidden_activation=leaky_relu(),
            output_activation=leaky_relu(),
        ),
        # Kept as in the original: a second LeakyReLU on top of the MLP's own
        # output activation.
        leaky_relu(),
        # The head consumes the cactor MLP output, so its input width must be
        # the cactor hidden size (not the per-agent policy hidden size).
        gaussian_head(cactor_kwargs['hidden_dim']),
    )
    cactor = TanhGaussianPolicy(module=cactor)

    # ---- per-agent policies -------------------------------------------
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for agent in range(num_agent):
        # Each agent owns its graph builder and GNN, and selects index
        # ``agent`` along dim 1 of the GNN output.
        graph_builder_policy = particle_graph(append_action=False)
        gnn_policy = particle_gnn(graph_builder_policy)
        policy = nn.Sequential(
            gnn_policy,
            SelectLayer(dim=1, index=agent),
            FlattenLayer(),
            FlattenMlp(
                input_size=node_dim,
                output_size=policy_kwargs['hidden_dim'],
                hidden_sizes=hidden_sizes(policy_kwargs),
                hidden_activation=leaky_relu(),
                output_activation=leaky_relu(),
            ),
            gaussian_head(policy_kwargs['hidden_dim']),
        )
        policy = TanhGaussianPolicy(module=policy)
        eval_policy = MakeDeterministic(policy)
        if random_exploration:
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(
                    expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    # ---- data collection, trainer, algorithm --------------------------
    eval_path_collector = MAMdpPathCollector(
        eval_env, eval_policy_n, shared_obs=True)
    expl_path_collector = MAMdpPathCollector(
        expl_env, expl_policy_n, shared_obs=True)

    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        num_agent=num_agent,
        shared_obs=True,
    )

    trainer = R2GGNNTrainer(
        env=expl_env,
        qf1=qf1,
        target_qf1=target_qf1,
        qf2=qf2,
        target_qf2=target_qf2,
        cactor=cactor,
        policy_n=policy_n,
        shared_obs=True,
        **variant['trainer_kwargs']
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)

    # save init params
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()