def experiment(variant):
    """Set up and run multi-agent discrete-action SAC on a particle environment.

    Two ``ParticleEnv`` instances are created (exploration and evaluation).
    For every agent the function builds a ``SoftmaxMlpPolicy``, two
    ``FlattenMlp`` critics with deep-copied targets, an argmax evaluation
    policy and an epsilon-greedy exploration policy wrapped around it.
    Everything is handed to ``MASACDiscreteTrainer`` and trained through
    ``TorchBatchRLAlgorithm``.

    Args:
        variant: Configuration mapping. Keys read here: ``policy_kwargs``,
            ``qf_kwargs``, ``replay_buffer_size``, ``trainer_kwargs`` and
            ``algorithm_kwargs``.

    The scenario name is taken from ``args.exp_name``; ``args`` is resolved
    from the enclosing scope, not from ``variant``.
    """
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv

    def new_env():
        # Discrete action space and discrete action input, as in both envs.
        return ParticleEnv(
            make_env(args.exp_name,
                     discrete_action_space=True,
                     discrete_action_input=True))

    expl_env = new_env()
    eval_env = new_env()
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    # Each critic sees every observation plus the other agents' actions and
    # emits one value per own action.
    critic_input_size = obs_dim * num_agent + action_dim * (num_agent - 1)

    policy_n, eval_policy_n, expl_policy_n = [], [], []
    qf1_n, target_qf1_n = [], []
    qf2_n, target_qf2_n = [], []
    for _ in range(num_agent):
        actor = SoftmaxMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        critic_a = FlattenMlp(
            input_size=critic_input_size,
            output_size=action_dim,
            **variant['qf_kwargs']
        )
        critic_a_target = copy.deepcopy(critic_a)
        critic_b = FlattenMlp(
            input_size=critic_input_size,
            output_size=action_dim,
            **variant['qf_kwargs']
        )
        critic_b_target = copy.deepcopy(critic_b)

        greedy_actor = ArgmaxDiscretePolicy(actor)
        # Exploration wraps the argmax policy, not the raw softmax policy.
        exploring_actor = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            greedy_actor,
        )

        policy_n.append(actor)
        qf1_n.append(critic_a)
        target_qf1_n.append(critic_a_target)
        qf2_n.append(critic_b)
        target_qf2_n.append(critic_b_target)
        eval_policy_n.append(greedy_actor)
        expl_policy_n.append(exploring_actor)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    trainer = MASACDiscreteTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Set up and run the graph-based R2G trainer with a shared central actor.

    Builds two ``GraphContextNet`` + ``FlattenMlp`` critic pairs (each with a
    deep-copied target), a ``GNNNet`` context encoder and a single central
    actor (``cactor``), and one ``TanhGaussianPolicy`` per agent. The
    components are passed to ``R2GGNNTrainer`` and trained through
    ``TorchBatchRLAlgorithm``. The initial snapshot is written to
    ``itr_-1.pkl`` in the logger's snapshot directory before training starts.

    Args:
        variant: Configuration mapping. Keys read here: ``world_args``,
            ``algorithm_kwargs`` (including ``batch_size``), ``graph_kwargs``
            (including ``node_dim`` and ``num_layer``), ``qf_kwargs``,
            ``cactor_kwargs``, ``policy_kwargs`` (each with ``hidden_dim``
            and ``num_layer``), ``replay_buffer_size`` and ``trainer_kwargs``.

    The scenario name is taken from ``args.exp_name``; ``args`` is resolved
    from the enclosing scope, not from ``variant``.
    """
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # --- Critic 1: graph context network followed by an MLP Q head. ---
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(graph_builder_1,
                          obs_dim,
                          action_dim,
                          output_activation='lrelu0.2',
                          **variant['graph_kwargs'])
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)

    # --- Critic 2: same architecture, independently initialised. ---
    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GraphContextNet(graph_builder_2,
                          obs_dim,
                          action_dim,
                          output_activation='lrelu0.2',
                          **variant['graph_kwargs'])
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)

    # --- Context encoder feeding the central actor. ---
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    cgca = GNNNet(
        pre_graph_builder=graph_builder_ca,
        node_dim=variant['graph_kwargs']['node_dim'],
        conv_type='GSage',
        num_conv_layers=variant['graph_kwargs']['num_layer'],
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
    )

    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    # The output heads must consume the width the cactor MLP produces. The
    # previous code sized them from policy_kwargs, which only worked when the
    # two hidden sizes happened to be equal.
    cactor_hidden_dim = variant['cactor_kwargs']['hidden_dim']
    cactor = nn.Sequential(
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=cactor_hidden_dim,
            hidden_sizes=[cactor_hidden_dim] *
            (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        # NOTE(review): the MLP already ends in LeakyReLU, so this applies the
        # activation a second time (the per-agent policies below do not).
        # Kept as-is to preserve existing behavior; confirm it is intended.
        nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(cactor_hidden_dim, action_dim),
            nn.Linear(cactor_hidden_dim, action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    # --- Per-agent decentralised policies. ---
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    policy_hidden_dim = variant['policy_kwargs']['hidden_dim']
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for _ in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=policy_hidden_dim,
                hidden_sizes=[policy_hidden_dim] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(policy_hidden_dim, action_dim),
                nn.Linear(policy_hidden_dim, action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        # Evaluation uses the deterministic wrapper; exploration samples
        # from the stochastic policy itself.
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn3 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            cg1=cg1,
                            target_cg1=target_cg1,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            cg2=cg2,
                            target_cg2=target_cg2,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cgca=cgca,
                            cactor=cactor,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # Save the freshly initialised parameters before any training happens.
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
# Resume from a previously saved results file when one exists; otherwise
# start with an empty result table.
if os.path.isfile(log_file):
    import joblib
    results = joblib.load(log_file)
else:
    results = {}

import sys
sys.path.append("./multiagent-particle-envs")
from make_env import make_env
from particle_env_wrapper import ParticleEnv

# World configuration comes from the command-line arguments; the boundary is
# the fixed [-1, 1] x [-1, 1] box when requested, otherwise None.
world_args = {
    'num_agents': args.num_ag,
    'num_adversaries': args.num_adv,
    'num_landmarks': args.num_l,
    'boundary': [[-1., -1.], [1., 1.]] if args.boundary else None,
}
env = ParticleEnv(
    make_env(args.exp_name,
             discrete_action_space=False,
             world_args=world_args))

for seed in seeds:
    print('seed: ', seed)
    # Keep entries that were loaded from disk; only create missing ones.
    if seed not in results:
        results[seed] = {}
    with torch.no_grad():
        players = []
        for pid, player_dir in enumerate(P_paths):
            # Checkpoint path: <pre_dir>/<player dir>/seed<seed>[/itr_<epoch>.pkl]
            d_path = '/'.join((pre_dir, player_dir, 'seed' + str(seed)))
            if args.epoch:
                d_path = d_path + '/itr_{}.pkl'.format(args.epoch)
def experiment(variant):
    """Set up and run multi-agent SAC with GNN critics and GNN policies.

    Two critics are built, each a ``GNNNet`` over a
    ``SimpleSpreadGraphBuilder`` graph (actions appended) followed by a
    ``SelectLayer`` over the agent nodes and a ``FlattenMlp`` head. Each
    agent gets its own GNN-based ``TanhGaussianPolicy``. Training is driven
    by ``MASACGNNTrainer`` inside ``TorchBatchRLAlgorithm``.

    Args:
        variant: Configuration mapping. Keys read here: ``world_args``,
            ``algorithm_kwargs`` (including ``batch_size``), ``graph_kwargs``
            (including ``node_dim``), ``qf_kwargs``, ``policy_kwargs``,
            ``random_exploration``, ``replay_buffer_size`` and
            ``trainer_kwargs``.

    The scenario name is taken from ``args.exp_name``; ``args`` is resolved
    from the enclosing scope, not from ``variant``.
    """
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv

    def new_env():
        return ParticleEnv(
            make_env(args.exp_name,
                     discrete_action_space=False,
                     world_args=variant['world_args']))

    expl_env = new_env()
    eval_env = new_env()
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from simple_spread_graph import SimpleSpreadGraphBuilder
    from rlkit.torch.networks.gnn_networks import GNNNet
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SelectLayer

    def new_graph_builder(append_action, single_observe):
        # All builders share the scenario sizes, batch size and self loops.
        return SimpleSpreadGraphBuilder(
            num_agents=expl_env.scenario.num_agents,
            num_landmarks=expl_env.scenario.num_landmarks,
            batch_size=variant['algorithm_kwargs']['batch_size'],
            append_action=append_action,
            single_observe=single_observe,
            contain_self_loop=True,
        )

    def new_gnn(builder):
        return GNNNet(
            builder,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        )

    def new_critic():
        # Graph with actions appended -> agent nodes -> scalar Q head.
        critic_gnn = new_gnn(
            new_graph_builder(append_action=True, single_observe=False))
        return nn.Sequential(
            critic_gnn,
            SelectLayer(dim=1, index=torch.arange(num_agent)),
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                (variant['qf_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            ))

    qf1 = new_critic()
    target_qf1 = copy.deepcopy(qf1)
    qf2 = new_critic()
    target_qf2 = copy.deepcopy(qf2)

    from rlkit.torch.networks.layers import SplitLayer, FlattenLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy

    policy_n, eval_policy_n, expl_policy_n = [], [], []
    for _ in range(num_agent):
        # Policy graph: single-agent observation, no actions appended.
        actor_gnn = new_gnn(
            new_graph_builder(append_action=False, single_observe=True))
        actor_width = variant['policy_kwargs']['hidden_dim']
        actor_net = nn.Sequential(
            actor_gnn,
            SelectLayer(dim=1, index=0),
            FlattenLayer(),
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=actor_width,
                hidden_sizes=[actor_width] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(actor_width, action_dim),
                nn.Linear(actor_width, action_dim)
            ]))
        policy = TanhGaussianPolicy(module=actor_net)
        eval_policy = MakeDeterministic(policy)

        if not variant['random_exploration']:
            expl_policy = policy
        else:
            # prob_random_action=1.0: every exploration action is random.
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )

        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.masac.masac_gnn import MASACGNNTrainer
    trainer = MASACGNNTrainer(env=expl_env,
                              qf1=qf1,
                              target_qf1=target_qf1,
                              qf2=qf2,
                              target_qf2=target_qf2,
                              policy_n=policy_n,
                              **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Set up and run the R2G trainer with graph critics and per-agent cactors.

    Two critics are built, each a ``GraphContextNet`` over a
    ``FullGraphBuilder`` graph plus a ``FlattenMlp`` Q head, both with
    deep-copied targets. Every agent gets a central actor (``cactor``) and a
    decentralised policy, both wrapped in ``TanhGaussianPolicy``. Training is
    driven by ``R2GGNNTrainer`` (``r2g_gnn3_onlyq``) inside
    ``TorchBatchRLAlgorithm``.

    Args:
        variant: Configuration mapping. Keys read here: ``world_args``,
            ``algorithm_kwargs`` (including ``batch_size``), ``graph_kwargs``
            (``use_attention``, ``num_layer``, ``hidden_dim``),
            ``qf_kwargs``, ``cactor_kwargs``, ``policy_kwargs``,
            ``trainer_kwargs`` (including ``dec_cactor``) and
            ``replay_buffer_size``.

    The scenario name is taken from ``args.exp_name``; ``args`` is resolved
    from the enclosing scope, not from ``variant``.
    """
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv

    def new_env():
        return ParticleEnv(
            make_env(args.exp_name,
                     discrete_action_space=False,
                     world_args=variant['world_args']))

    expl_env = new_env()
    eval_env = new_env()
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    from rlkit.torch.networks.networks import FlattenMlp

    def new_context_critic():
        """Return (context net, its target, Q head, its target)."""
        builder = FullGraphBuilder(
            input_node_dim=obs_dim + action_dim,
            num_node=num_agent,
            batch_size=variant['algorithm_kwargs']['batch_size'],
            contain_self_loop=False)
        context_net = GraphContextNet(
            builder,
            obs_dim,
            action_dim,
            use_attention=variant['graph_kwargs']['use_attention'],
            num_layer=variant['graph_kwargs']['num_layer'],
            node_dim=variant['graph_kwargs']['hidden_dim'],
            output_activation='relu',
        )
        context_target = copy.deepcopy(context_net)
        q_head = FlattenMlp(
            input_size=variant['graph_kwargs']['hidden_dim'] + action_dim,
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
        )
        q_target = copy.deepcopy(q_head)
        return context_net, context_target, q_head, q_target

    cg1, target_cg1, qf1, target_qf1 = new_context_critic()
    cg2, target_cg2, qf2, target_qf2 = new_context_critic()

    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    from rlkit.torch.policies.make_deterministic import MakeDeterministic

    def new_gaussian_policy(in_dim, net_cfg):
        """MLP trunk with two linear heads, wrapped as a TanhGaussianPolicy."""
        width = net_cfg['hidden_dim']
        body = nn.Sequential(
            FlattenMlp(
                input_size=in_dim,
                output_size=width,
                hidden_sizes=[width] * (net_cfg['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(width, action_dim),
                nn.Linear(width, action_dim)
            ]))
        return TanhGaussianPolicy(module=body)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    cactor_n = []
    for _ in range(num_agent):
        # A decentralised cactor sees only its own observation plus the other
        # agents' actions; otherwise it sees every observation as well.
        own_obs_only = variant['trainer_kwargs']['dec_cactor']
        cactor_in_dim = (obs_dim if own_obs_only else obs_dim * num_agent) \
            + action_dim * (num_agent - 1)

        cactor = new_gaussian_policy(cactor_in_dim, variant['cactor_kwargs'])
        policy = new_gaussian_policy(obs_dim, variant['policy_kwargs'])

        policy_n.append(policy)
        expl_policy_n.append(policy)
        eval_policy_n.append(MakeDeterministic(policy))
        cactor_n.append(cactor)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn3_onlyq import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            cg1=cg1,
                            target_cg1=target_cg1,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            cg2=cg2,
                            target_cg2=target_cg2,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cactor_n=cactor_n,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Set up and run the sequential R2G trainer, optionally resuming a run.

    When ``variant['load_kwargs']['load']`` is truthy, networks, entropy
    coefficients, optimizers and the replay buffer are restored from
    ``<load_dir>/itr_<load_epoch>.pkl``. Otherwise fresh per-agent twin
    critics (with deep-copied targets), central actors and policies are
    created together with an empty replay buffer. The components are passed
    to ``R2GTrainer`` and trained through ``TorchBatchRLAlgorithm``.

    Args:
        variant: Configuration mapping. Keys read here: ``world_args``,
            ``load_kwargs`` (``load``, ``load_dir``, ``load_epoch``),
            ``qf_kwargs``, ``cactor_kwargs``, ``policy_kwargs``,
            ``trainer_kwargs`` (including ``dec_cactor``),
            ``replay_buffer_size`` and ``algorithm_kwargs``.

    The scenario name and the ``ce`` flag are taken from ``args``, which is
    resolved from the enclosing scope, not from ``variant``.
    """
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_epoch = variant['load_kwargs']['load_epoch']
        # NOTE: torch.load unpickles the file, so only point load_dir at
        # checkpoints from a trusted source.
        load_data = torch.load('{}/itr_{}.pkl'.format(load_dir, load_epoch),
                               map_location='cpu')
        qf1_n = load_data['trainer/qf1_n']
        target_qf1_n = load_data['trainer/target_qf1_n']
        qf2_n = load_data['trainer/qf2_n']
        target_qf2_n = load_data['trainer/target_qf2_n']
        cactor_n = load_data['trainer/cactor_n']
        policy_n = load_data['trainer/policy_n']
        log_alpha_n = load_data['trainer/log_alpha_n']
        qf1_optimizer_n = load_data['trainer/qf1_optimizer_n']
        qf2_optimizer_n = load_data['trainer/qf2_optimizer_n']
        policy_optimizer_n = load_data['trainer/policy_optimizer_n']
        cactor_optimizer_n = load_data['trainer/cactor_optimizer_n']
        alpha_optimizer_n = load_data['trainer/alpha_optimizer_n']
        # The calpha entries are only read when args.ce is set. Default them
        # to None (the same value the fresh-start branch passes) so the
        # trainer call below does not hit unbound names when resuming
        # without args.ce.
        log_calpha_n, calpha_optimizer_n = None, None
        if args.ce:
            log_calpha_n = load_data['trainer/log_calpha_n']
            calpha_optimizer_n = load_data['trainer/calpha_optimizer_n']
        replay_buffer = load_data['replay_buffer']
    else:
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy

        qf1_n, qf2_n, cactor_n, policy_n = [], [], [], []
        target_qf1_n, target_qf2_n = [], []
        # None is passed through to the trainer for everything that has no
        # restored state.
        log_alpha_n, log_calpha_n = None, None
        qf1_optimizer_n, qf2_optimizer_n = None, None
        policy_optimizer_n, cactor_optimizer_n = None, None
        alpha_optimizer_n, calpha_optimizer_n = None, None

        for _ in range(num_agent):
            # Centralised critics: all observations and all actions.
            qf1 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                variant['qf_kwargs']['num_layer'],
            )
            target_qf1 = copy.deepcopy(qf1)
            qf2 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                variant['qf_kwargs']['num_layer'],
            )
            target_qf2 = copy.deepcopy(qf2)

            # A decentralised cactor sees only its own observation plus the
            # other agents' actions; otherwise it sees every observation.
            if variant['trainer_kwargs']['dec_cactor']:
                input_size = obs_dim + action_dim * (num_agent - 1)
            else:
                input_size = obs_dim * num_agent + action_dim * (num_agent - 1)
            cactor = nn.Sequential(
                FlattenMlp(
                    input_size=input_size,
                    output_size=variant['cactor_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
                    (variant['cactor_kwargs']['num_layer'] - 1),
                ),
                SplitLayer(layers=[
                    nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
                    nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)
                ]))
            cactor = TanhGaussianPolicy(module=cactor)

            policy = nn.Sequential(
                FlattenMlp(
                    input_size=obs_dim,
                    output_size=variant['policy_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                    (variant['policy_kwargs']['num_layer'] - 1),
                ),
                SplitLayer(layers=[
                    nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                    nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
                ]))
            policy = TanhGaussianPolicy(module=policy)

            qf1_n.append(qf1)
            qf2_n.append(qf2)
            cactor_n.append(cactor)
            policy_n.append(policy)
            target_qf1_n.append(target_qf1)
            target_qf2_n.append(target_qf2)

        from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
        replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                          expl_env,
                                          num_agent=num_agent)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.r2g.r2g_sequential import R2GTrainer
    trainer = R2GTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        cactor_n=cactor_n,
        log_alpha_n=log_alpha_n,
        log_calpha_n=log_calpha_n,
        qf1_optimizer_n=qf1_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        cactor_optimizer_n=cactor_optimizer_n,
        alpha_optimizer_n=alpha_optimizer_n,
        calpha_optimizer_n=calpha_optimizer_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
# Load the saved parameters for the requested experiment / checkpoint / seed
# onto the CPU.
data_path = './Data/{}_mpl{}/{}/seed{}/params.pkl'.format(
    args.exp_name, args.mpl, args.cp_path, args.seed)
data = torch.load(data_path, map_location='cpu')
# Wrap every stored central actor in MakeDeterministic.
cactor_n = [MakeDeterministic(stored) for stored in data['trainer/cactor_n']]

# args.cp is an underscore-separated list of integers.
cp = [int(field) for field in args.cp.split("_")]
print(cp)

import sys
sys.path.append("./multiagent-particle-envs")
from make_env import make_env
from particle_env_wrapper import ParticleEnv

# Continuous actions are used here; the discrete variant would pass
# discrete_action_space=True, discrete_action_input=True instead.
env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False))
o_n = env.reset()
num_agent = env.num_agent
max_path_length = args.mpl

# Rollout bookkeeping: step counter, per-agent done flags and a per-agent
# accumulator initialised to zero.
path_length = 0
done = np.array([False] * num_agent)
c_r = np.zeros(num_agent)

with torch.no_grad():
    while True:
        path_length += 1
        # NOTE(review): policy_n is not defined in the visible part of this
        # script; presumably it is loaded elsewhere. Verify.
        a_n = []
        for agent_policy, agent_obs in zip(policy_n, o_n):
            action, _agent_info = agent_policy.get_action(agent_obs)
            a_n.append(action)
        ca_n = []
def experiment(variant):
    """Set up and run MADDPG on a particle environment, optionally resuming.

    When ``variant['load_kwargs']['load']`` is truthy, the critics, policies,
    their targets, the optimizers and the replay buffer are restored from
    ``<load_dir>/itr_<load_epoch>.pkl``. Otherwise a fresh centralised
    ``FlattenMlp`` critic and a ``TanhMlpPolicy`` (each with a deep-copied
    target) are created per agent together with an empty replay buffer.
    Exploration wraps every policy in an ``OUStrategy``; evaluation uses the
    policies directly. Training is driven by ``MADDPGTrainer`` inside
    ``TorchBatchRLAlgorithm``.

    Args:
        variant: Configuration mapping. Keys read here: ``world_args``,
            ``load_kwargs`` (``load``, ``load_dir``, ``load_epoch``),
            ``qf_kwargs``, ``policy_kwargs``, ``replay_buffer_size``,
            ``trainer_kwargs`` and ``algorithm_kwargs``.

    The scenario name is taken from ``args.exp_name``; ``args`` is resolved
    from the enclosing scope, not from ``variant``.
    """
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv

    def new_env():
        return ParticleEnv(
            make_env(args.exp_name,
                     discrete_action_space=False,
                     world_args=variant['world_args']))

    expl_env = new_env()
    eval_env = new_env()
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    if not variant['load_kwargs']['load']:
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy

        qf_n, target_qf_n = [], []
        policy_n, target_policy_n = [], []
        # The second-critic lists stay empty and the optimizers stay None;
        # they are handed to the trainer unchanged.
        qf2_n, target_qf2_n = [], []
        qf_optimizer_n = qf2_optimizer_n = policy_optimizer_n = None

        for _ in range(num_agent):
            # Centralised critic: all observations and all actions.
            critic = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                variant['qf_kwargs']['num_layer'],
            )
            critic_target = copy.deepcopy(critic)
            actor = TanhMlpPolicy(
                input_size=obs_dim,
                output_size=action_dim,
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                variant['policy_kwargs']['num_layer'],
            )
            actor_target = copy.deepcopy(actor)

            qf_n.append(critic)
            policy_n.append(actor)
            target_qf_n.append(critic_target)
            target_policy_n.append(actor_target)

        from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
        replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                          expl_env,
                                          num_agent=num_agent)
    else:
        load_dir = variant['load_kwargs']['load_dir']
        load_epoch = variant['load_kwargs']['load_epoch']
        checkpoint_file = '{}/itr_{}.pkl'.format(load_dir, load_epoch)
        load_data = torch.load(checkpoint_file, map_location='cpu')

        qf_n = load_data['trainer/qf_n']
        target_qf_n = load_data['trainer/target_qf_n']
        # No second critic is restored from the checkpoint.
        qf2_n, target_qf2_n = [], []
        policy_n = load_data['trainer/policy_n']
        target_policy_n = load_data['trainer/target_policy_n']
        qf_optimizer_n = load_data['trainer/qf_optimizer_n']
        qf2_optimizer_n = None
        policy_optimizer_n = load_data['trainer/policy_optimizer_n']
        replay_buffer = load_data['replay_buffer']

    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    expl_policy_n = []
    for agent_policy in policy_n:
        expl_policy_n.append(
            PolicyWrappedWithExplorationStrategy(
                exploration_strategy=OUStrategy(
                    action_space=expl_env.action_space),
                policy=agent_policy,
            ))
    eval_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    trainer = MADDPGTrainer(qf_n=qf_n,
                            target_qf_n=target_qf_n,
                            policy_n=policy_n,
                            target_policy_n=target_policy_n,
                            qf2_n=qf2_n,
                            target_qf2_n=target_qf2_n,
                            qf_optimizer_n=qf_optimizer_n,
                            qf2_optimizer_n=qf2_optimizer_n,
                            policy_optimizer_n=policy_optimizer_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Set up and run independent PPO (``IRLPPOTrainer``) on a particle env.

    For every agent a ``TanhGaussianPolicy`` (constructed with
    ``return_raw_action=True``) and a ``FlattenMlp`` value function over the
    agent's own observation are created. Exploration uses the stochastic
    policies and collects raw actions; evaluation uses ``MakeDeterministic``
    wrappers. Training is driven by ``TorchOnlineRLAlgorithm``, which is
    constructed without a replay buffer.

    Args:
        variant: Configuration mapping. Keys read here: ``world_args``,
            ``policy_kwargs``, ``vf_kwargs``, ``trainer_kwargs`` and
            ``algorithm_kwargs``.

    The scenario name is taken from ``args.exp_name``; ``args`` is resolved
    from the enclosing scope, not from ``variant``.
    """
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv

    def new_env():
        return ParticleEnv(
            make_env(args.exp_name,
                     discrete_action_space=False,
                     world_args=variant['world_args']))

    expl_env = new_env()
    eval_env = new_env()
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy

    policy_n, vf_n = [], []
    # No optimizers are supplied; None is passed through to the trainer.
    policy_optimizer_n = vf_optimizer_n = None
    for _ in range(num_agent):
        actor_width = variant['policy_kwargs']['hidden_dim']
        actor_net = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=actor_width,
                hidden_sizes=[actor_width] *
                (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(actor_width, action_dim),
                nn.Linear(actor_width, action_dim)
            ]))
        actor = TanhGaussianPolicy(module=actor_net, return_raw_action=True)
        value_fn = FlattenMlp(
            input_size=obs_dim,
            output_size=1,
            hidden_sizes=[variant['vf_kwargs']['hidden_dim']] *
            variant['vf_kwargs']['num_layer'],
        )
        policy_n.append(actor)
        vf_n.append(value_fn)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = []
    for actor in policy_n:
        eval_policy_n.append(MakeDeterministic(actor))
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    # Only the exploration collector records raw actions.
    expl_path_collector = MAMdpPathCollector(expl_env,
                                             expl_policy_n,
                                             collect_raw_actions=True)

    from rlkit.torch.irl.irl_ppo import IRLPPOTrainer
    trainer = IRLPPOTrainer(env=expl_env,
                            policy_n=policy_n,
                            vf_n=vf_n,
                            policy_optimizer_n=policy_optimizer_n,
                            vf_optimizer_n=vf_optimizer_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchOnlineRLAlgorithm
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Set up and launch R2G-GNN training on a multi-agent particle env.

    Builds two critic stacks (observation-graph net, context-graph net and
    Q-network, each with a deep-copied target), a centralized actor stack
    (observation-graph net, context-graph net and a tanh-Gaussian actor),
    and one GNN-based tanh-Gaussian policy per agent. Everything is handed
    to ``R2GGNNTrainer`` and run inside a ``TorchBatchRLAlgorithm``. The
    initial snapshot is saved as ``itr_-1.pkl`` in the logger's snapshot
    directory before training starts.

    Args:
        variant: Experiment configuration. Keys read here are
            ``world_args``, ``algorithm_kwargs`` (incl. ``batch_size``),
            ``graph_kwargs`` (incl. ``node_dim``, ``num_layer``),
            ``qf_kwargs``, ``cactor_kwargs`` and ``policy_kwargs`` (each
            with ``hidden_dim`` and ``num_layer``), ``random_exploration``,
            ``replay_buffer_size`` and ``trainer_kwargs``.

    The scenario name and the number of policy GNN layers come from the
    module-level ``args`` (``args.exp_name`` and ``args.glayer``).
    """
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name, discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from simple_spread_graph import SimpleSpreadGraphBuilder
    from rlkit.torch.networks.gnn_networks import GNNNet
    from rlkit.torch.networks.layers import SelectLayer, SplitLayer, FlattenLayer
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    from rlkit.torch.policies.make_deterministic import MakeDeterministic

    graph_kwargs = variant['graph_kwargs']
    node_dim = graph_kwargs['node_dim']
    batch_size = variant['algorithm_kwargs']['batch_size']

    def build_obs_graph_net():
        # GNN over the full observation graph, keeping the first
        # `num_agent` nodes of its output along dim 1.
        builder = SimpleSpreadGraphBuilder(
            num_agents=expl_env.scenario.num_agents,
            num_landmarks=expl_env.scenario.num_landmarks,
            batch_size=batch_size,
            append_action=False,
            single_observe=False,
            contain_self_loop=True,
        )
        return nn.Sequential(
            GNNNet(
                builder,
                node_dim=node_dim,
                conv_type='GSage',
                num_conv_layers=graph_kwargs['num_layer'],
                hidden_activation='lrelu0.2',
                output_activation='lrelu0.2',
            ),
            SelectLayer(dim=1, index=torch.arange(num_agent)),
        )

    def build_context_graph_net():
        # Context network over a fully connected agent graph whose node
        # inputs are sized node_dim + action_dim.
        builder = FullGraphBuilder(
            input_node_dim=node_dim + action_dim,
            num_node=num_agent,
            batch_size=batch_size,
            contain_self_loop=False)
        return GraphContextNet(
            builder,
            node_dim,
            action_dim,
            output_activation='lrelu0.2',
            **graph_kwargs)

    def build_qf():
        # Scalar Q head on top of (node embedding, action).
        return FlattenMlp(
            input_size=node_dim + action_dim,
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        )

    # Construction order below matches the original script so parameter
    # initialisation consumes the RNG in the same sequence.
    og1 = build_obs_graph_net()
    target_og1 = copy.deepcopy(og1)
    cg1 = build_context_graph_net()
    target_cg1 = copy.deepcopy(cg1)
    qf1 = build_qf()
    target_qf1 = copy.deepcopy(qf1)

    og2 = build_obs_graph_net()
    target_og2 = copy.deepcopy(og2)
    cg2 = build_context_graph_net()
    target_cg2 = copy.deepcopy(cg2)
    qf2 = build_qf()
    target_qf2 = copy.deepcopy(qf2)

    ogca = build_obs_graph_net()
    cgca = build_context_graph_net()

    # The heads consume the MLP output, so their input width must be the
    # centralized actor's own hidden_dim. Previously they were sized with
    # policy_kwargs['hidden_dim'], which only worked when both settings
    # happened to be equal.
    cactor_hidden_dim = variant['cactor_kwargs']['hidden_dim']
    cactor = nn.Sequential(
        FlattenMlp(
            input_size=node_dim,
            output_size=cactor_hidden_dim,
            hidden_sizes=[cactor_hidden_dim] *
            (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        # NOTE(review): the MLP above is already given a LeakyReLU output
        # activation; this second one is kept as in the original script.
        # Confirm whether the double activation is intended.
        nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(cactor_hidden_dim, action_dim),
            nn.Linear(cactor_hidden_dim, action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for _ in range(num_agent):
        graph_builder_policy = SimpleSpreadGraphBuilder(
            num_agents=expl_env.scenario.num_agents,
            num_landmarks=expl_env.scenario.num_landmarks,
            batch_size=batch_size,
            append_action=False,
            single_observe=True,
            contain_self_loop=True,
        )
        gnn_policy = GNNNet(
            graph_builder_policy,
            hidden_activation='lrelu0.2',
            output_activation='lrelu0.2',
            conv_type='GSage',
            node_dim=node_dim,
            num_conv_layers=args.glayer,
        )
        policy = nn.Sequential(
            gnn_policy,
            # Keep only node 0 of the policy graph output.
            SelectLayer(dim=1, index=0),
            FlattenLayer(),
            FlattenMlp(
                input_size=node_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            # Exploration strategy imports stay conditional, as before.
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            # prob_random_action=1.0: exploration actions are always random.
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn8 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            og1=og1,
                            target_og1=target_og1,
                            cg1=cg1,
                            target_cg1=target_cg1,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            og2=og2,
                            target_og2=target_og2,
                            cg2=cg2,
                            target_cg2=target_cg2,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            ogca=ogca,
                            cgca=cgca,
                            cactor=cactor,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
# Remaining CLI options; `parser` is created earlier in the script.
parser.add_argument('--num_adv', type=int, default=None)
parser.add_argument('--num_l', type=int, default=None)
args = parser.parse_args()

# World configuration forwarded to make_env. The boundary is the fixed box
# [[-1, -1], [1, 1]] when --boundary is set, otherwise None.
world_args = dict(num_agents=args.num_ag,
                  num_adversaries=args.num_adv,
                  num_landmarks=args.num_l,
                  obsid=args.obsid,
                  boundary=([[-1., -1.], [1., 1.]] if args.boundary else None))

# make_env / particle_env_wrapper are imported after extending sys.path
# with the particle-env checkout directory.
import sys
sys.path.append("./multiagent-particle-envs")
from make_env import make_env
from particle_env_wrapper import ParticleEnv
env = ParticleEnv(
    make_env(args.exp_name, discrete_action_space=False, world_args=world_args))
obs_dim = env.observation_space.low.size
action_dim = env.action_space.low.size
num_agent = env.num_agent

# Fully connected graph builder: one node per agent, each node sized for
# an observation plus an action, no self loops, fixed batch_size of 256.
from rlkit.torch.networks.graph_builders import FullGraphBuilder
gb = FullGraphBuilder(input_node_dim=obs_dim + action_dim,
                      num_node=num_agent,
                      batch_size=256,
                      contain_self_loop=False)

# Two environment resets stacked into one tensor as a small sample batch.
obs1 = env.reset()
obs2 = env.reset()
obs_batch = torch.tensor([obs1, obs2])
def experiment(variant):
    """Set up and launch multi-agent IRL-SAC training on a particle env.

    Each agent gets a tanh-Gaussian policy plus two Q-networks, each with a
    deep-copied target. Exploration uses the stochastic policies and
    evaluation uses deterministic wrappers around them. Training is run by
    ``IRLSACTrainer`` inside a ``TorchBatchRLAlgorithm``.

    Args:
        variant: Experiment configuration. Keys read here are
            ``world_args``, ``policy_kwargs`` (``hidden_dim``,
            ``num_layer``), ``qf_kwargs`` (``hidden_dim``, ``num_layer``),
            ``replay_buffer_size``, ``trainer_kwargs`` and
            ``algorithm_kwargs``.

    The scenario name comes from the module-level ``args.exp_name``.
    """
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv

    def new_env():
        # Exploration and evaluation use separate but identically
        # configured environment instances.
        return ParticleEnv(
            make_env(args.exp_name,
                     discrete_action_space=False,
                     world_args=variant['world_args']))

    expl_env = new_env()
    eval_env = new_env()
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    from rlkit.torch.irl.irl_sac import IRLSACTrainer
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm

    def build_policy():
        # Shared MLP trunk followed by two linear heads (one SplitLayer).
        cfg = variant['policy_kwargs']
        width = cfg['hidden_dim']
        trunk = FlattenMlp(
            input_size=obs_dim,
            output_size=width,
            hidden_sizes=[width] * (cfg['num_layer'] - 1),
        )
        heads = SplitLayer(layers=[
            nn.Linear(width, action_dim),
            nn.Linear(width, action_dim),
        ])
        return TanhGaussianPolicy(module=nn.Sequential(trunk, heads))

    def build_critic():
        # Per-agent Q-network over the agent's own observation and action.
        cfg = variant['qf_kwargs']
        return FlattenMlp(
            input_size=(obs_dim + action_dim),
            output_size=1,
            hidden_sizes=[cfg['hidden_dim']] * cfg['num_layer'],
        )

    policy_n = []
    qf1_n, target_qf1_n = [], []
    qf2_n, target_qf2_n = [], []
    for _ in range(num_agent):
        policy_n.append(build_policy())
        # First critic (and its target) is built before the second one.
        for online_n, target_n in ((qf1_n, target_qf1_n),
                                   (qf2_n, target_qf2_n)):
            critic = build_critic()
            online_n.append(critic)
            target_n.append(copy.deepcopy(critic))

    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    eval_policy_n = [MakeDeterministic(agent_policy) for agent_policy in policy_n]
    expl_policy_n = policy_n

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    # No log-alpha values or optimizers are supplied to the trainer.
    trainer = IRLSACTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        log_alpha_n=None,
        qf1_optimizer_n=None,
        qf2_optimizer_n=None,
        policy_optimizer_n=None,
        alpha_optimizer_n=None,
        **variant['trainer_kwargs']
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
# Last CLI option; `parser` is created earlier in the script.
parser.add_argument('--single_observe', action='store_true', default=False)
args = parser.parse_args()

# make_env / particle_env_wrapper are imported after extending sys.path
# with the particle-env checkout directory.
import sys
sys.path.append("./multiagent-particle-envs")
from make_env import make_env
from particle_env_wrapper import ParticleEnv

# World configuration forwarded to make_env. obsid and absobs are fixed
# here; the boundary is the box [[-1, -1], [1, 1]] when --boundary is set,
# otherwise None.
world_args=dict(
    num_agents=args.num_ag,
    num_adversaries=args.num_adv,
    num_landmarks=args.num_l,
    obsid=False,
    absobs=True,
    boundary=([[-1.,-1.],[1.,1.]] if args.boundary else None)
)
env = ParticleEnv(make_env(args.exp_name,discrete_action_space=False,world_args=world_args))
o_n = env.reset()
num_agent = env.num_agent

def check_graph(gb, obs):
    """Feed one observation through graph builder ``gb`` and print the result.

    Args:
        gb: Callable that takes a batched observation tensor and returns
            ``(x, edge_index)``.
        obs: Observation to inspect. With ``--single_observe`` only
            ``obs[0]`` is used; otherwise the whole ``obs`` is used.
    """
    print('obs: ',obs)
    # Wrap in a list so the tensor has a leading batch dimension of 1.
    if args.single_observe:
        obs_batch = torch.tensor([obs[0]])
    else:
        obs_batch = torch.tensor([obs])
    x, edge_index = gb(obs_batch)
    print('x: ',x)
    print('edge_index: ',edge_index)
    # NOTE(review): `Data` is not defined in the visible code; presumably
    # torch_geometric.data.Data. Confirm against the file's imports.
    data = Data(x=x, edge_index=edge_index)