def experiment(variant):
    import gym
    import robosumo.envs  # noqa: F401 -- registers the RoboSumo-* gym envs
    from robosumo_env_wrapper import RoboSumoEnv
    expl_env = RoboSumoEnv(
        gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
        **variant['world_args'])
    eval_env = RoboSumoEnv(
        gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
        **variant['world_args'])
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = [], [], [], [], []
    # Left as None; the trainer is expected to construct these itself.
    log_alpha_n = None
    qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, alpha_optimizer_n = \
        None, None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        # Gaussian policy head: one linear layer for the mean, one for the log-std.
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        # Centralized critics take the joint observation and joint action.
        qf1 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            variant['qf_kwargs']['num_layer'],
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            variant['qf_kwargs']['num_layer'],
        )
        target_qf2 = copy.deepcopy(qf2)
        policy_n.append(policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.torch.masac.masac import MASACTrainer
    trainer = MASACTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        log_alpha_n=log_alpha_n,
        qf1_optimizer_n=qf1_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        alpha_optimizer_n=alpha_optimizer_n,
        **variant['trainer_kwargs']
    )
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
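# All of these launchers read their hyperparameters from a nested `variant`
# dict and rely on module-level names (nn, copy, torch, ptu, osp, args,
# get_generic_ma_path_information) supplied by the surrounding launcher
# script. A hedged sketch of the config shape the MASAC launcher above reads:
# the key names follow its dict accesses, every value is a placeholder.
example_variant = dict(
    world_args=dict(),                      # forwarded to RoboSumoEnv
    policy_kwargs=dict(hidden_dim=64, num_layer=2),
    qf_kwargs=dict(hidden_dim=64, num_layer=2),
    trainer_kwargs=dict(),                  # forwarded to MASACTrainer
    algorithm_kwargs=dict(batch_size=256),  # forwarded to TorchBatchRLAlgorithm
    replay_buffer_size=int(1e6),
)
# experiment(example_variant)  # args.exp_name must be set by the launcher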
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from simple_spread_graph import SimpleSpreadGraphBuilder
    graph_builder_1 = SimpleSpreadGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        single_observe=False,
        contain_self_loop=True,
    )
    from rlkit.torch.networks.gnn_networks import GNNNet
    gnn1 = GNNNet(
        graph_builder_1,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SelectLayer
    # Critic: GNN over the full graph, keep the agent node embeddings, then
    # map each node embedding to a scalar Q-value.
    qf1 = nn.Sequential(
        gnn1,
        SelectLayer(dim=1, index=torch.arange(num_agent)),
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf1 = copy.deepcopy(qf1)
    graph_builder_2 = SimpleSpreadGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        single_observe=False,
        contain_self_loop=True,
    )
    gnn2 = GNNNet(
        graph_builder_2,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    qf2 = nn.Sequential(
        gnn2,
        SelectLayer(dim=1, index=torch.arange(num_agent)),
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf2 = copy.deepcopy(qf2)

    policy_n, eval_policy_n, expl_policy_n = [], [], []
    for i in range(num_agent):
        # Per-agent policy: its own GNN over a single-agent observation graph,
        # keeping node 0 (the agent's own embedding) before flattening.
        graph_builder_policy = SimpleSpreadGraphBuilder(
            num_agents=expl_env.scenario.num_agents,
            num_landmarks=expl_env.scenario.num_landmarks,
            batch_size=variant['algorithm_kwargs']['batch_size'],
            append_action=False,
            single_observe=True,
            contain_self_loop=True,
        )
        gnn_policy = GNNNet(
            graph_builder_policy,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        )
        from rlkit.torch.networks.layers import SplitLayer, FlattenLayer
        policy = nn.Sequential(
            gnn_policy,
            SelectLayer(dim=1, index=0),
            FlattenLayer(),
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)
    from rlkit.torch.masac.masac_gnn import MASACGNNTrainer
    trainer = MASACGNNTrainer(env=expl_env,
                              qf1=qf1,
                              target_qf1=target_qf1,
                              qf2=qf2,
                              target_qf2=target_qf2,
                              policy_n=policy_n,
                              **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
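# The GNN critics above keep only the agent nodes of the graph output via
# SelectLayer, and the per-agent policies keep a single node before
# flattening. The real layer lives in rlkit.torch.networks.layers; a minimal
# sketch of the behavior these calls assume (an index_select along one
# dimension, keeping the selected dim so FlattenLayer can squeeze it):
import torch
import torch.nn as nn

class SelectLayerSketch(nn.Module):
    """Sketch only: pick index (int or 1-D tensor) along `dim` of the input."""
    def __init__(self, dim, index):
        super().__init__()
        self.dim = dim
        self.register_buffer('index', torch.as_tensor(index).reshape(-1))

    def forward(self, x):
        # e.g. SelectLayer(dim=1, index=torch.arange(num_agent)) keeps the
        # first num_agent node embeddings of a (batch, node, feature) tensor.
        return torch.index_select(x, self.dim, self.index)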
def experiment(variant):
    import sys
    sys.path.append("./particle-graph-envs")
    from make_env import make_env
    expl_env = make_env(args.exp_name, discrete_action_space=False,
                        world_args=variant['world_args'])
    eval_env = make_env(args.exp_name, discrete_action_space=False,
                        world_args=variant['world_args'])
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from particle_graph import ParticleGraphBuilder
    graph_builder_cg1 = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    from rlkit.torch.networks.gnn_networks import GNNNet
    from rlkit.torch.networks.layers import SelectLayer
    # Context GNN for the first critic: embeds the joint state-action graph
    # and keeps the agent node embeddings.
    cg1 = nn.Sequential(
        GNNNet(
            graph_builder_cg1,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)
    graph_builder_cg2 = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    cg2 = nn.Sequential(
        GNNNet(
            graph_builder_cg2,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)

    graph_builder_ca = ParticleGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True,
        contain_self_loop=False,
    )
    cgca = nn.Sequential(
        GNNNet(
            graph_builder_ca,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    # Centralized actor ("cactor") on top of the context GNN; the linear heads
    # must take the cactor MLP's output width as input.
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
            (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for agent in range(num_agent):
        graph_builder_policy = ParticleGraphBuilder(
            num_agents=expl_env.scenario.num_agents,
            num_landmarks=expl_env.scenario.num_landmarks,
            batch_size=variant['algorithm_kwargs']['batch_size'],
            append_action=False,
            contain_self_loop=False,
        )
        gnn_policy = GNNNet(
            graph_builder_policy,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        )
        from rlkit.torch.networks.layers import FlattenLayer
        policy = nn.Sequential(
            gnn_policy,
            SelectLayer(dim=1, index=agent),
            FlattenLayer(),
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n,
                                             shared_obs=True)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n,
                                             shared_obs=True)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent, shared_obs=True)
    from rlkit.torch.r2g.r2g_gnn10 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            cg1=cg1,
                            target_cg1=target_cg1,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            cg2=cg2,
                            target_cg2=target_cg2,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cactor=cactor,
                            policy_n=policy_n,
                            shared_obs=True,
                            **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
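# The function above writes an 'itr_-1.pkl' snapshot before training so the
# untrained networks stay recoverable. A hedged sketch of reading one back;
# the 'trainer/...' keys mirror the load branches further down in this file
# and depend on what the trainer exposes in its snapshot dict:
def load_init_policies(snapshot_dir):
    import torch
    # snapshot_dir is a placeholder for logger._snapshot_dir at save time.
    load_data = torch.load('{}/itr_{}.pkl'.format(snapshot_dir, -1),
                           map_location='cpu')
    return load_data['trainer/policy_n']  # assumed key, as in the MADDPG loader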
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    gnn1 = GNNNet(
        graph_builder_1,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = nn.Sequential(
        gnn1,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf1 = copy.deepcopy(qf1)
    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    gnn2 = GNNNet(
        graph_builder_2,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    qf2 = nn.Sequential(
        gnn2,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf2 = copy.deepcopy(qf2)

    # One GNN encoder over the joint observation graph is shared by all
    # agents' policies; each policy head consumes its own node embedding.
    graph_builder_policy = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    shared_gnn = GNNNet(
        graph_builder_policy,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    policy_n, eval_policy_n, expl_policy_n = [], [], []
    for i in range(num_agent):
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n,
                                             shared_encoder=shared_gnn)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n,
                                             shared_encoder=shared_gnn)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)
    from rlkit.torch.masac.masac_gnn import MASACGNNTrainer
    trainer = MASACGNNTrainer(env=expl_env,
                              qf1=qf1,
                              target_qf1=target_qf1,
                              qf2=qf2,
                              target_qf2=target_qf2,
                              policy_n=policy_n,
                              shared_gnn=shared_gnn,
                              **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    num_agent = variant['num_agent']
    from differential_game import DifferentialGame
    expl_env = DifferentialGame(game_name=args.exp_name)
    eval_env = DifferentialGame(game_name=args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cgca = GraphContextNet(
        graph_builder_ca,
        obs_dim,
        action_dim,
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    # Centralized actor on top of the graph context network; the two linear
    # heads read the cactor hidden layer's output.
    cactor = nn.Sequential(
        cgca,
        nn.Linear(variant['graph_kwargs']['hidden_dim'],
                  variant['cactor_kwargs']['hidden_dim']),
        nn.ReLU(),
        SplitLayer(layers=[
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n, qf1_n, qf2_n, target_qf1_n, target_qf2_n = \
        [], [], [], [], [], [], []
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        qf1 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2,
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2,
        )
        target_qf2 = copy.deepcopy(qf2)
        policy = nn.Sequential(
            nn.Linear(obs_dim, variant['policy_kwargs']['hidden_dim']),
            nn.ReLU(),
            nn.Linear(variant['policy_kwargs']['hidden_dim'],
                      variant['policy_kwargs']['hidden_dim']),
            nn.ReLU(),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)
        qf1_n.append(qf1)
        qf2_n.append(qf2)
        target_qf1_n.append(target_qf1)
        target_qf2_n.append(target_qf2)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)
    from rlkit.torch.r2g.r2g_gnn3_onlyca import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        cactor=cactor,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
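# Every stochastic policy in this file ends with SplitLayer(layers=[...])
# feeding TanhGaussianPolicy(module=...), so the wrapped module must emit a
# (mean, log_std) pair. A minimal sketch of what such a layer is assumed to
# do (apply each head to the same input and return the outputs together):
import torch.nn as nn

class SplitLayerSketch(nn.Module):
    """Sketch only: run several heads on one input, return all outputs."""
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        # With the two nn.Linear heads used above, this yields the
        # (mean, log_std) pair consumed by TanhGaussianPolicy.
        return tuple(layer(x) for layer in self.layers)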
def experiment(variant):
    num_agent = variant['num_agent']
    from differential_game import DifferentialGame
    expl_env = DifferentialGame(game_name=args.exp_name)
    eval_env = DifferentialGame(game_name=args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(
        graph_builder_1,
        obs_dim,
        action_dim,
        use_attention=variant['graph_kwargs']['use_attention'],
        num_layer=variant['graph_kwargs']['num_layer'],
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )
    target_cg1 = copy.deepcopy(cg1)
    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GraphContextNet(
        graph_builder_2,
        obs_dim,
        action_dim,
        use_attention=variant['graph_kwargs']['use_attention'],
        num_layer=variant['graph_kwargs']['num_layer'],
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )
    target_cg2 = copy.deepcopy(cg2)
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cgca = GraphContextNet(
        graph_builder_ca,
        obs_dim,
        action_dim,
        use_attention=variant['graph_kwargs']['use_attention'],
        num_layer=variant['graph_kwargs']['num_layer'],
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    qf1_n, target_qf1_n, qf2_n, target_qf2_n = [], [], [], []
    cactor_n = []
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        qf1 = FlattenMlp(
            input_size=variant['graph_kwargs']['hidden_dim'] + action_dim,
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=variant['graph_kwargs']['hidden_dim'] + action_dim,
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
        )
        target_qf2 = copy.deepcopy(qf2)
        from rlkit.torch.networks.layers import SplitLayer
        # Per-agent centralized actor; the linear heads must match the cactor
        # MLP's output width.
        cactor = nn.Sequential(
            FlattenMlp(
                input_size=variant['graph_kwargs']['hidden_dim'],
                output_size=variant['cactor_kwargs']['hidden_dim'],
                hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
                (variant['cactor_kwargs']['num_layer'] - 1),
            ),
            nn.ReLU(),
            SplitLayer(layers=[
                nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        cactor = TanhGaussianPolicy(module=cactor)
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)
        cactor_n.append(cactor)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)
    from rlkit.torch.r2g.r2g_gnn4 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            cg1=cg1,
                            target_cg1=target_cg1,
                            qf1_n=qf1_n,
                            target_qf1_n=target_qf1_n,
                            cg2=cg2,
                            target_cg2=target_cg2,
                            qf2_n=qf2_n,
                            target_qf2_n=target_qf2_n,
                            cgca=cgca,
                            cactor_n=cactor_n,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = [], [], [], [], []
    log_alpha_n = None
    qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, alpha_optimizer_n = \
        None, None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        qf1 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            variant['qf_kwargs']['num_layer'],
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            variant['qf_kwargs']['num_layer'],
        )
        target_qf2 = copy.deepcopy(qf2)
        policy_n.append(policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    if variant['random_exploration']:
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
        # Wrap each agent's own policy with the exploration strategy.
        expl_policy_n = [
            PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            ) for policy in policy_n
        ]
    else:
        expl_policy_n = policy_n
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.torch.masac.masac import MASACTrainer
    trainer = MASACTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        log_alpha_n=log_alpha_n,
        qf1_optimizer_n=qf1_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        alpha_optimizer_n=alpha_optimizer_n,
        **variant['trainer_kwargs']
    )
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import gym
    import robosumo.envs  # noqa: F401 -- registers the RoboSumo-* gym envs
    from robosumo_env_wrapper import RoboSumoEnv
    expl_env = RoboSumoEnv(
        gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
        **variant['world_args'])
    eval_env = RoboSumoEnv(
        gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
        **variant['world_args'])
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, vf_n = [], []
    policy_optimizer_n, vf_optimizer_n = None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        # Raw (pre-tanh) actions are returned so the trainer can recompute
        # log-probabilities later.
        policy = TanhGaussianPolicy(module=policy, return_raw_action=True)
        vf = FlattenMlp(
            input_size=obs_dim,
            output_size=1,
            hidden_sizes=[variant['vf_kwargs']['hidden_dim']] *
            variant['vf_kwargs']['num_layer'],
        )
        policy_n.append(policy)
        vf_n.append(vf)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n,
                                             collect_raw_actions=True)
    from rlkit.torch.irl.irl_ppo import IRLPPOTrainer
    trainer = IRLPPOTrainer(env=expl_env,
                            policy_n=policy_n,
                            vf_n=vf_n,
                            policy_optimizer_n=policy_optimizer_n,
                            vf_optimizer_n=vf_optimizer_n,
                            **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchOnlineRLAlgorithm
    # On-policy algorithm: no replay buffer is passed here.
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
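# The PPO launcher above asks policies for raw (pre-tanh) actions
# (return_raw_action=True) and stores them (collect_raw_actions=True):
# recomputing the log-probability of a tanh-squashed Gaussian from the stored
# raw action u is numerically safer than inverting a = tanh(u). A sketch of
# the standard change-of-variables correction,
# log pi(a) = log N(u; mu, sigma) - sum_i log(1 - tanh(u_i)^2):
import torch

def tanh_gaussian_log_prob(mean, log_std, raw_action, eps=1e-6):
    normal = torch.distributions.Normal(mean, log_std.exp())
    log_prob = normal.log_prob(raw_action)
    # Correction for the tanh squashing; eps guards the log at saturation.
    log_prob = log_prob - torch.log(1 - torch.tanh(raw_action).pow(2) + eps)
    return log_prob.sum(dim=-1)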
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf1_n, qf2_n, cactor_n, policy_n = [], [], [], []
    target_qf1_n, target_qf2_n = [], []
    log_alpha_n, log_calpha_n = None, None
    qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, \
        cactor_optimizer_n, alpha_optimizer_n, calpha_optimizer_n = \
        None, None, None, None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        qf1 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            variant['qf_kwargs']['num_layer'],
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            variant['qf_kwargs']['num_layer'],
        )
        target_qf2 = copy.deepcopy(qf2)
        from rlkit.torch.networks.layers import SplitLayer
        # Centralized actor input: own observation (all observations in the
        # centralized case) plus the other agents' actions.
        if variant['trainer_kwargs']['dec_cactor']:
            input_size = obs_dim + action_dim * (num_agent - 1)
        else:
            input_size = obs_dim * num_agent + action_dim * (num_agent - 1)
        cactor = nn.Sequential(
            FlattenMlp(
                input_size=input_size,
                output_size=variant['cactor_kwargs']['hidden_dim'],
                hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
                (variant['cactor_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        cactor = TanhGaussianPolicy(module=cactor)
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        qf1_n.append(qf1)
        qf2_n.append(qf2)
        cactor_n.append(cactor)
        policy_n.append(policy)
        target_qf1_n.append(target_qf1)
        target_qf2_n.append(target_qf2)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.torch.r2g.r2g import R2GTrainer
    trainer = R2GTrainer(env=expl_env,
                         qf1_n=qf1_n,
                         target_qf1_n=target_qf1_n,
                         qf2_n=qf2_n,
                         target_qf2_n=target_qf2_n,
                         policy_n=policy_n,
                         cactor_n=cactor_n,
                         log_alpha_n=log_alpha_n,
                         log_calpha_n=log_calpha_n,
                         qf1_optimizer_n=qf1_optimizer_n,
                         qf2_optimizer_n=qf2_optimizer_n,
                         policy_optimizer_n=policy_optimizer_n,
                         cactor_optimizer_n=cactor_optimizer_n,
                         alpha_optimizer_n=alpha_optimizer_n,
                         calpha_optimizer_n=calpha_optimizer_n,
                         **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(
        graph_builder_1,
        obs_dim,
        action_dim,
        use_attention=variant['graph_kwargs']['use_attention'],
        num_layer=variant['graph_kwargs']['num_layer'],
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['hidden_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
    )
    target_qf1 = copy.deepcopy(qf1)
    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GraphContextNet(
        graph_builder_2,
        obs_dim,
        action_dim,
        use_attention=variant['graph_kwargs']['use_attention'],
        num_layer=variant['graph_kwargs']['num_layer'],
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['hidden_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
    )
    target_qf2 = copy.deepcopy(qf2)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    cactor_n = []
    for i in range(num_agent):
        from rlkit.torch.networks.layers import SplitLayer
        if variant['trainer_kwargs']['dec_cactor']:
            input_size = obs_dim + action_dim * (num_agent - 1)
        else:
            input_size = obs_dim * num_agent + action_dim * (num_agent - 1)
        cactor = nn.Sequential(
            FlattenMlp(
                input_size=input_size,
                output_size=variant['cactor_kwargs']['hidden_dim'],
                hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
                (variant['cactor_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        cactor = TanhGaussianPolicy(module=cactor)
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)
        cactor_n.append(cactor)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)
    from rlkit.torch.r2g.r2g_gnn3_onlyq import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            cg1=cg1,
                            target_cg1=target_cg1,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            cg2=cg2,
                            target_cg2=target_cg2,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cactor_n=cactor_n,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import gym
    import robosumo.envs
    from robosumo_env_wrapper import RoboSumoEnv
    expl_env = RoboSumoEnv(
        gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
        **variant['world_args'])
    eval_env = RoboSumoEnv(
        gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
        **variant['world_args'])
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], []
    qf2_n, target_qf2_n = [], []
    qf_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            variant['qf_kwargs']['num_layer'],
        )
        target_qf = copy.deepcopy(qf)
        from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
        policy = TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
            variant['policy_kwargs']['num_layer'],
        )
        target_policy = copy.deepcopy(policy)
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)
    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    expl_policy_n = [
        PolicyWrappedWithExplorationStrategy(
            exploration_strategy=OUStrategy(action_space=expl_env.action_space),
            policy=policy,
        ) for policy in policy_n
    ]
    eval_policy_n = policy_n
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        qf_optimizer_n=qf_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        **variant['trainer_kwargs']
    )
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
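# The MADDPG launchers wrap each deterministic TanhMlpPolicy with OUStrategy
# for temporally correlated exploration noise. A minimal sketch of the
# Ornstein-Uhlenbeck process such a strategy is typically built on (the
# parameter values here are illustrative, not rlkit's defaults):
import numpy as np

class OUNoiseSketch:
    def __init__(self, dim, mu=0.0, theta=0.15, sigma=0.3):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(dim) * mu

    def reset(self):
        self.state = np.ones_like(self.state) * self.mu

    def sample(self):
        # Mean-reverting random walk: drift toward mu plus Gaussian noise,
        # giving smoother exploration than i.i.d. noise per step.
        self.state = self.state + self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(*self.state.shape)
        return self.state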
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_cg1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    cg1 = GNNNet(
        graph_builder_cg1,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    target_cg1 = copy.deepcopy(cg1)
    # graph_builder_qf1 = FullGraphBuilder(
    #     input_node_dim=action_dim+variant['graph_kwargs']['node_dim'],
    #     num_node=num_agent,
    #     batch_size=variant['algorithm_kwargs']['batch_size'],
    #     contain_self_loop=False)
    # qf1 = nn.Sequential(
    #     GNNNet(
    #         graph_builder_qf1,
    #         hidden_activation='lrelu0.2',
    #         output_activation='lrelu0.2',
    #         **variant['graph_kwargs'],
    #     ),
    #     nn.Linear(variant['graph_kwargs']['node_dim'],1)
    # )
    # target_qf1 = copy.deepcopy(qf1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)
    graph_builder_cg2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GNNNet(
        graph_builder_cg2,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    target_cg2 = copy.deepcopy(cg2)
    # graph_builder_qf2 = FullGraphBuilder(
    #     input_node_dim=action_dim+variant['graph_kwargs']['node_dim'],
    #     num_node=num_agent,
    #     batch_size=variant['algorithm_kwargs']['batch_size'],
    #     contain_self_loop=False)
    # qf2 = nn.Sequential(
    #     GNNNet(
    #         graph_builder_qf2,
    #         hidden_activation='lrelu0.2',
    #         output_activation='lrelu0.2',
    #         **variant['graph_kwargs'],
    #     ),
    #     nn.Linear(variant['graph_kwargs']['node_dim'],1)
    # )
    # target_qf2 = copy.deepcopy(qf2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cgca = GNNNet(
        graph_builder_ca,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    # Centralized actor; the linear heads must match the cactor MLP's output
    # width.
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
            (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)
    from rlkit.torch.r2g.r2g_gnn10 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1,
        target_cg1=target_cg1,
        qf1=qf1,
        target_qf1=target_qf1,
        cg2=cg2,
        target_cg2=target_cg2,
        qf2=qf2,
        target_qf2=target_qf2,
        cactor=cactor,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)

    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_epoch = variant['load_kwargs']['load_epoch']
        load_data = torch.load('{}/itr_{}.pkl'.format(load_dir, load_epoch),
                               map_location='cpu')
        qf_n = load_data['trainer/qf_n']
        target_qf_n = load_data['trainer/target_qf_n']
        qf2_n, target_qf2_n = [], []
        policy_n = load_data['trainer/policy_n']
        target_policy_n = load_data['trainer/target_policy_n']
        qf_optimizer_n = load_data['trainer/qf_optimizer_n']
        qf2_optimizer_n = None
        policy_optimizer_n = load_data['trainer/policy_optimizer_n']
        replay_buffer = load_data['replay_buffer']
    else:
        qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = \
            [], [], [], [], [], []
        qf2_n, target_qf2_n = [], []
        qf_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
        for i in range(num_agent):
            from rlkit.torch.networks.networks import FlattenMlp
            qf = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                variant['qf_kwargs']['num_layer'],
            )
            target_qf = copy.deepcopy(qf)
            from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
            policy = TanhMlpPolicy(
                input_size=obs_dim,
                output_size=action_dim,
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                variant['policy_kwargs']['num_layer'],
            )
            target_policy = copy.deepcopy(policy)
            qf_n.append(qf)
            policy_n.append(policy)
            target_qf_n.append(target_qf)
            target_policy_n.append(target_policy)
        from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
        replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                          expl_env, num_agent=num_agent)

    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    expl_policy_n = [
        PolicyWrappedWithExplorationStrategy(
            exploration_strategy=OUStrategy(action_space=expl_env.action_space),
            policy=policy,
        ) for policy in policy_n
    ]
    eval_policy_n = policy_n
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    trainer = MADDPGTrainer(qf_n=qf_n,
                            target_qf_n=target_qf_n,
                            policy_n=policy_n,
                            target_policy_n=target_policy_n,
                            qf2_n=qf2_n,
                            target_qf2_n=target_qf2_n,
                            qf_optimizer_n=qf_optimizer_n,
                            qf2_optimizer_n=qf2_optimizer_n,
                            policy_optimizer_n=policy_optimizer_n,
                            **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    num_agent = variant['num_agent']
    from sequential_differential_game import SequentialDifferentialGame
    expl_env = SequentialDifferentialGame(**variant['env_kwargs'])
    eval_env = SequentialDifferentialGame(**variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    if variant['load_kwargs']['load']:
        # Resume all networks, optimizers, and the replay buffer from a snapshot.
        load_dir = variant['load_kwargs']['load_dir']
        load_epoch = variant['load_kwargs']['load_epoch']
        load_data = torch.load('{}/itr_{}.pkl'.format(load_dir, load_epoch), map_location='cpu')
        qf1_n = load_data['trainer/qf1_n']
        target_qf1_n = load_data['trainer/target_qf1_n']
        qf2_n = load_data['trainer/qf2_n']
        target_qf2_n = load_data['trainer/target_qf2_n']
        cactor_n = load_data['trainer/cactor_n']
        policy_n = load_data['trainer/policy_n']
        log_alpha_n = load_data['trainer/log_alpha_n']
        qf1_optimizer_n = load_data['trainer/qf1_optimizer_n']
        qf2_optimizer_n = load_data['trainer/qf2_optimizer_n']
        policy_optimizer_n = load_data['trainer/policy_optimizer_n']
        cactor_optimizer_n = load_data['trainer/cactor_optimizer_n']
        alpha_optimizer_n = load_data['trainer/alpha_optimizer_n']
        if args.ce:
            log_calpha_n = load_data['trainer/log_calpha_n']
            calpha_optimizer_n = load_data['trainer/calpha_optimizer_n']
        else:
            # Without a separate cactor entropy temperature, fall back to None so
            # the R2GTrainer call below does not hit a NameError.
            log_calpha_n, calpha_optimizer_n = None, None
        replay_buffer = load_data['replay_buffer']
    else:
        qf1_n, qf2_n, cactor_n, policy_n = [], [], [], []
        target_qf1_n, target_qf2_n = [], []
        log_alpha_n, log_calpha_n = None, None
        qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, cactor_optimizer_n, alpha_optimizer_n, calpha_optimizer_n = \
            None, None, None, None, None, None
        for i in range(num_agent):
            from rlkit.torch.networks.networks import FlattenMlp
            # Twin centralized critics over the joint observation-action.
            qf1 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'],
            )
            target_qf1 = copy.deepcopy(qf1)
            qf2 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'],
            )
            target_qf2 = copy.deepcopy(qf2)
            from rlkit.torch.networks.layers import SplitLayer
            # Conditional actor (cactor): predicts this agent's action given the
            # other agents' actions, from either its own or the joint observation.
            if variant['trainer_kwargs']['dec_cactor']:
                input_size = obs_dim + action_dim * (num_agent - 1)
            else:
                input_size = obs_dim * num_agent + action_dim * (num_agent - 1)
            cactor = nn.Sequential(
                FlattenMlp(
                    input_size=input_size,
                    output_size=variant['cactor_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1),
                ),
                SplitLayer(layers=[
                    nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
                    nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)
                ]))
            from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
            cactor = TanhGaussianPolicy(module=cactor)
            # Decentralized stochastic policy with split mean/log-std heads.
            policy = nn.Sequential(
                FlattenMlp(
                    input_size=obs_dim,
                    output_size=variant['policy_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1),
                ),
                SplitLayer(layers=[
                    nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                    nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
                ]))
            policy = TanhGaussianPolicy(module=policy)
            qf1_n.append(qf1)
            qf2_n.append(qf2)
            cactor_n.append(cactor)
            policy_n.append(policy)
            target_qf1_n.append(target_qf1)
            target_qf2_n.append(target_qf2)
        from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
        replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.r2g.r2g import R2GTrainer
    trainer = R2GTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        cactor_n=cactor_n,
        log_alpha_n=log_alpha_n,
        log_calpha_n=log_calpha_n,
        qf1_optimizer_n=qf1_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        cactor_optimizer_n=cactor_optimizer_n,
        alpha_optimizer_n=alpha_optimizer_n,
        calpha_optimizer_n=calpha_optimizer_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
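# ---------------------------------------------------------------------------
# The load branch above indexes the snapshot by rlkit's convention: trainer
# state is namespaced under 'trainer/'-prefixed keys, with the buffer stored
# at top level as 'replay_buffer'. A small sketch of inspecting such a file;
# the path is hypothetical.
# ---------------------------------------------------------------------------
def inspect_snapshot(path='./logs/r2g_run/itr_50.pkl'):
    import torch
    snapshot = torch.load(path, map_location='cpu')
    # Networks are stored as ready-to-use torch modules, which is why the
    # loader above can pass them straight back into R2GTrainer.
    trainer_keys = sorted(k for k in snapshot if k.startswith('trainer/'))
    print(trainer_keys)  # e.g. ['trainer/cactor_n', 'trainer/policy_n', ...]
    return snapshot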
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, vf_n = [], []
    policy_optimizer_n, vf_optimizer_n = None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        # Stochastic policy with split mean/log-std heads.
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        # Per-agent state-value baseline for the PPO update.
        vf = FlattenMlp(
            input_size=obs_dim,
            output_size=1,
            hidden_sizes=[variant['vf_kwargs']['hidden_dim']] * variant['vf_kwargs']['num_layer'],
        )
        policy_n.append(policy)
        vf_n.append(vf)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    if variant['random_exploration']:
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
        # Wrap each agent's own policy; the original comprehension iterated over
        # range(num_agent) while reusing the last `policy` bound by the loop above.
        expl_policy_n = [
            PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            ) for policy in policy_n
        ]
    else:
        expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.irl.irl_ppo import IRLPPOTrainer
    trainer = IRLPPOTrainer(
        env=expl_env,
        policy_n=policy_n,
        vf_n=vf_n,
        policy_optimizer_n=policy_optimizer_n,
        vf_optimizer_n=vf_optimizer_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchOnlineRLAlgorithm
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    # Shared observation encoder over the fully connected agent graph.
    graph_builder_obs = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    obs_gnn_1 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    graph_builder_eval = FullGraphBuilder(
        input_node_dim=graph_builder_obs.output_node_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    if variant['concat_emb']:
        # Node embedding concatenates the raw observation with each conv layer's output.
        gnn_out_dim = int(obs_dim + variant['graph_kwargs']['node_dim'] * variant['graph_kwargs']['num_conv_layers'])
    else:
        gnn_out_dim = variant['graph_kwargs']['node_dim']

    from rlkit.torch.networks.networks import FlattenMlp
    post_mlp1 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    from rlkit.torch.networks.graph_r2g_qnet2 import R2GQNet
    qf1 = R2GQNet(
        obs_gnn=obs_gnn_1,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp1,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf1 = copy.deepcopy(qf1)

    # Second critic for the twin-Q update, with its own observation encoder.
    obs_gnn_2 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    post_mlp2 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    qf2 = R2GQNet(
        obs_gnn=obs_gnn_2,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp2,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf2 = copy.deepcopy(qf2)

    # Conditional actor (cactor) over the joint observation-action graph.
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cgca = GNNNet(
        graph_builder_ca,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        nn.LeakyReLU(negative_slope=0.2),
        # Mean/log-std heads sized to the cactor trunk; the original indexed
        # policy_kwargs here, which only matches when both hidden dims agree.
        SplitLayer(layers=[
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    graph_builder_policy = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        # Per-agent head on top of the shared encoder output (the collectors
        # below apply obs_gnn_1 before each policy sees its node embedding).
        policy = nn.Sequential(
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n, shared_encoder=obs_gnn_1)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n, shared_encoder=obs_gnn_1)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn12 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        qf1=qf1,
        target_qf1=target_qf1,
        qf2=qf2,
        target_qf2=target_qf2,
        cactor=cactor,
        policy_n=policy_n,
        **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # Save the untrained parameters so runs can be compared from epoch -1.
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
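# ---------------------------------------------------------------------------
# The launcher above writes an itr_-1.pkl snapshot before training, so the
# untrained weights stay recoverable. One use, sketched under the assumption
# that two runs logged to sibling directories and that the trainer snapshots
# its policies under 'trainer/policy_n' (as the loaders in this file do), is
# checking that both runs started from identical initializations.
# ---------------------------------------------------------------------------
def same_initialization(dir_a='./logs/run_a', dir_b='./logs/run_b'):
    import torch
    import os.path as osp
    init_a = torch.load(osp.join(dir_a, 'itr_-1.pkl'), map_location='cpu')
    init_b = torch.load(osp.join(dir_b, 'itr_-1.pkl'), map_location='cpu')
    # Compare the first agent's policy parameters element-wise; matching
    # seeds should make every tensor equal.
    params_a = init_a['trainer/policy_n'][0].parameters()
    params_b = init_b['trainer/policy_n'][0].parameters()
    return all(torch.equal(pa, pb) for pa, pb in zip(params_a, params_b))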
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n = [], [], [], []
    qf2_n, target_qf2_n = [], []
    qf_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        # Centralized critic over the joint observation-action.
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'],
        )
        target_qf = copy.deepcopy(qf)
        from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
        # Decentralized deterministic actor.
        policy = TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * variant['policy_kwargs']['num_layer'],
        )
        target_policy = copy.deepcopy(policy)
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    if variant['random_exploration']:
        from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
        # Wrap each agent's own policy; the original comprehension iterated over
        # range(num_agent) while reusing the last `policy` bound by the loop above.
        expl_policy_n = [
            PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            ) for policy in policy_n
        ]
    else:
        expl_policy_n = [
            PolicyWrappedWithExplorationStrategy(
                exploration_strategy=OUStrategy(action_space=expl_env.action_space),
                policy=policy,
            ) for policy in policy_n
        ]
    eval_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        qf_optimizer_n=qf_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    # First context-network/critic pair over the joint observation-action graph.
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(
        graph_builder_1,
        obs_dim,
        action_dim,
        output_activation='lrelu0.2',
        **variant['graph_kwargs'])
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)

    # Twin context-network/critic pair.
    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GraphContextNet(
        graph_builder_2,
        obs_dim,
        action_dim,
        output_activation='lrelu0.2',
        **variant['graph_kwargs'])
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)

    # GNN that embeds the joint observation-action graph for the conditional actor.
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    cgca = GNNNet(
        pre_graph_builder=graph_builder_ca,
        node_dim=variant['graph_kwargs']['node_dim'],
        conv_type='GSage',
        num_conv_layers=variant['graph_kwargs']['num_layer'],
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        nn.LeakyReLU(negative_slope=0.2),
        # Mean/log-std heads sized to the cactor trunk; the original indexed
        # policy_kwargs here, which only matches when both hidden dims agree.
        SplitLayer(layers=[
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        # Decentralized stochastic policy over the agent's own observation.
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn3 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1,
        target_cg1=target_cg1,
        qf1=qf1,
        target_qf1=target_qf1,
        cg2=cg2,
        target_cg2=target_cg2,
        qf2=qf2,
        target_qf2=target_qf2,
        cgca=cgca,
        cactor=cactor,
        policy_n=policy_n,
        **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # Save the untrained parameters so runs can be compared from epoch -1.
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, qf_n = [], []
    policy_optimizer_n, qf_optimizer_n = None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        # return_raw_action=True keeps the pre-tanh action alongside the squashed
        # one, so the exploration collector below can store it.
        policy = TanhGaussianPolicy(module=policy, return_raw_action=True)
        # Centralized critic over the joint observation-action.
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'],
        )
        policy_n.append(policy)
        qf_n.append(qf)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n, collect_raw_actions=True)

    from rlkit.torch.coma.coma import COMATrainer
    trainer = COMATrainer(
        env=expl_env,
        policy_n=policy_n,
        qf_n=qf_n,
        policy_optimizer_n=policy_optimizer_n,
        qf_optimizer_n=qf_optimizer_n,
        **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchOnlineRLAlgorithm
    # On-policy training: no replay buffer; paths are consumed as collected.
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
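# ---------------------------------------------------------------------------
# Minimal driver for any experiment() above. These launchers assume a
# module-level `args` namespace; setup_logger and set_gpu_mode are standard
# rlkit utilities. The variant here is a stub and must be populated (see
# example_maddpg_variant() earlier) before the call will actually run.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import argparse
    from rlkit.launchers.launcher_util import setup_logger
    import rlkit.torch.pytorch_util as ptu

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default='simple_spread')
    parser.add_argument('--gpu', action='store_true')
    args = parser.parse_args()

    variant = dict()  # fill in as sketched in example_maddpg_variant()
    setup_logger(args.exp_name, variant=variant)  # creates the log/snapshot dir
    ptu.set_gpu_mode(args.gpu)  # algorithm.to(ptu.device) then targets the GPU
    experiment(variant)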