def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    eval_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # One Gaussian policy and one centralized Q-function per agent.
    policy_n, qf_n = [], []
    policy_optimizer_n, qf_optimizer_n = None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1)),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy, return_raw_action=True)
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'])
        policy_n.append(policy)
        qf_n.append(qf)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n, collect_raw_actions=True)

    from rlkit.torch.coma.coma import COMATrainer
    trainer = COMATrainer(
        env=expl_env, policy_n=policy_n, qf_n=qf_n,
        policy_optimizer_n=policy_optimizer_n, qf_optimizer_n=qf_optimizer_n,
        **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchOnlineRLAlgorithm
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
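# Illustrative only: a minimal `variant` dict with the keys the launcher above reads.
# The hidden sizes, epoch counts, and trainer settings are placeholder assumptions,
# not the values used in the original experiments.
# variant = dict(
#     world_args=dict(),
#     policy_kwargs=dict(hidden_dim=64, num_layer=2),
#     qf_kwargs=dict(hidden_dim=64, num_layer=2),
#     trainer_kwargs=dict(),
#     algorithm_kwargs=dict(batch_size=256, num_epochs=100),
# )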
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Graph builders over raw observation nodes and over embedded nodes.
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_obs = FullGraphBuilder(
        input_node_dim=obs_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    graph_builder_eval = FullGraphBuilder(
        input_node_dim=graph_builder_obs.output_node_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)

    # Twin R2G Q-networks with their target copies.
    from rlkit.torch.networks.networks import FlattenMlp
    post_mlp1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'],
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    from rlkit.torch.networks.graph_r2g_qnet import R2GQNet
    qf1 = R2GQNet(
        obs_graph_builder=graph_builder_obs, eval_graph_builder=graph_builder_eval,
        obs_dim=graph_builder_obs.output_node_dim, action_dim=action_dim,
        post_mlp=post_mlp1, normalize_emb=False, output_activation=None,
        **variant['graph_kwargs'],
    )
    target_qf1 = copy.deepcopy(qf1)
    post_mlp2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'],
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    qf2 = R2GQNet(
        obs_graph_builder=graph_builder_obs, eval_graph_builder=graph_builder_eval,
        obs_dim=graph_builder_obs.output_node_dim, action_dim=action_dim,
        post_mlp=post_mlp2, normalize_emb=False, output_activation=None,
        **variant['graph_kwargs'],
    )
    target_qf2 = copy.deepcopy(qf2)

    # Centralized conditional actor (cactor) built on a GNN over (obs, action) nodes.
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    from rlkit.torch.networks.layers import SelectLayer
    cgca = nn.Sequential(
        GNNNet(
            graph_builder_ca,
            hidden_activation='lrelu0.2', output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        ),
        SelectLayer(dim=1, index=torch.arange(num_agent)),
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                           nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
    cactor = TanhGaussianPolicy(module=cactor)

    # Per-agent decentralized GNN policies over the shared observation graph.
    graph_builder_policy = FullGraphBuilder(
        input_node_dim=obs_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for agent in range(num_agent):
        gnn_policy = GNNNet(
            graph_builder_policy,
            hidden_activation='lrelu0.2', output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        )
        from rlkit.torch.networks.layers import SplitLayer, FlattenLayer
        policy = nn.Sequential(
            gnn_policy,
            SelectLayer(dim=1, index=agent),
            FlattenLayer(),
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n, shared_obs=True)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n, shared_obs=True)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent, shared_obs=False)

    from rlkit.torch.r2g.r2g_gnn11 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        qf1=qf1, target_qf1=target_qf1, qf2=qf2, target_qf2=target_qf2,
        cactor=cactor, policy_n=policy_n, shared_obs=True,
        **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
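# Illustrative only: the `variant` keys consumed by the graph-based launcher above.
# Values are placeholder assumptions, not the settings used in the original runs;
# `graph_kwargs` is also splatted into GNNNet/R2GQNet and may carry further keys.
# variant = dict(
#     graph_kwargs=dict(node_dim=16),
#     qf_kwargs=dict(hidden_dim=64, num_layer=2),
#     cactor_kwargs=dict(hidden_dim=64, num_layer=2),
#     policy_kwargs=dict(hidden_dim=64, num_layer=2),
#     random_exploration=False,
#     replay_buffer_size=int(1e6),
#     trainer_kwargs=dict(),
#     algorithm_kwargs=dict(batch_size=256, num_epochs=100),
# )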
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim
    max_path_length = variant['trainer_kwargs']['max_path_length']

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_data = torch.load(load_dir + '/params.pkl', map_location='cpu')
        policy = load_data['trainer/policy']
        vf = load_data['trainer/value_function']
    else:
        # LSTM policy with a supervised head that predicts labels alongside actions.
        hidden_dim = variant['lstm_kwargs']['hidden_dim']
        num_layers = variant['lstm_kwargs']['num_layers']
        a_0 = np.zeros(action_dim)
        h_0 = np.zeros(hidden_dim * num_layers)
        c_0 = np.zeros(hidden_dim * num_layers)
        latent_0 = (h_0, c_0)
        from lstm_net import LSTMNet
        lstm_net = LSTMNet(obs_dim, action_dim, hidden_dim, num_layers)
        decoder = nn.Linear(hidden_dim, action_dim)
        from layers import ReshapeLayer
        sup_learner = nn.Sequential(
            nn.Linear(hidden_dim, int(label_num * label_dim)),
            ReshapeLayer(shape=(label_num, label_dim)),
        )
        from sup_softmax_lstm_policy import SupSoftmaxLSTMPolicy
        policy = SupSoftmaxLSTMPolicy(
            a_0=a_0, latent_0=latent_0,
            obs_dim=obs_dim, action_dim=action_dim,
            lstm_net=lstm_net, decoder=decoder, sup_learner=sup_learner,
        )
        print('parameters: ', np.sum([p.view(-1).shape[0] for p in policy.parameters()]))
        vf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=1)

    vf_criterion = nn.MSELoss()
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy = MakeDeterministic(policy)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)

    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim, action_dim=action_dim, label_dim=label_num,
        max_replay_buffer_size=int(1e6), max_path_length=max_path_length, recurrent=True,
    )

    from rlkit.torch.vpg.ppo_sup_vanilla import PPOSupVanillaTrainer
    trainer = PPOSupVanillaTrainer(
        policy=policy, value_function=vf, vf_criterion=vf_criterion,
        replay_buffer=replay_buffer, recurrent=True,
        **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
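# Illustrative only: the `variant` keys the traffic launcher above expects. Values
# are placeholder assumptions rather than the original experiment settings.
# variant = dict(
#     env_kwargs=dict(),
#     lstm_kwargs=dict(hidden_dim=32, num_layers=1),
#     load_kwargs=dict(load=False, load_dir=None),
#     trainer_kwargs=dict(max_path_length=200),
#     algorithm_kwargs=dict(),
# )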
        args.epoch) + '.pkl'
    plot_file = pre_path + '/' + 'seed' + str(args.seed) + '/cactor_ep' + str(args.epoch)
else:
    d_path = pre_path + '/' + 'seed' + str(args.seed) + '/params.pkl'
    plot_file = pre_path + '/' + 'seed' + str(args.seed) + '/cactor'
data = torch.load(d_path, map_location='cpu')
if 'trainer/ogca' in data.keys():
    ogca = data['trainer/ogca']
else:
    ogca = None
if 'trainer/cactor' in data.keys():
    cactor = data['trainer/cactor']
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    cactor = MakeDeterministic(cactor)
else:
    cactor = None
    cactor_n = data['trainer/cactor_n']
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    cactor_n = [MakeDeterministic(cactor) for cactor in cactor_n]
if 'trainer/cgca' in data.keys():
    cgca = data['trainer/cgca']
else:
    cgca = None
if 'trainer/cgca_n' in data.keys():
    cgca_n = data['trainer/cgca_n']
with torch.no_grad():
    for agent in range(args.num_ag):
def experiment(variant):
    num_agent = variant['num_agent']
    from differential_game import DifferentialGame
    expl_env = DifferentialGame(game_name=args.exp_name)
    eval_env = DifferentialGame(game_name=args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Twin graph-context critics with their target copies.
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(
        graph_builder_1, obs_dim, action_dim,
        node_dim=variant['graph_kwargs']['hidden_dim'], output_activation='relu')
    target_cg1 = copy.deepcopy(cg1)
    qf1 = nn.Sequential(
        nn.Linear(variant['graph_kwargs']['hidden_dim'] + action_dim, variant['qf_kwargs']['hidden_dim']),
        nn.ReLU(),
        nn.Linear(variant['qf_kwargs']['hidden_dim'], 1))
    target_qf1 = copy.deepcopy(qf1)
    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    cg2 = GraphContextNet(
        graph_builder_2, obs_dim, action_dim,
        node_dim=variant['graph_kwargs']['hidden_dim'], output_activation='relu')
    target_cg2 = copy.deepcopy(cg2)
    qf2 = nn.Sequential(
        nn.Linear(variant['graph_kwargs']['hidden_dim'] + action_dim, variant['qf_kwargs']['hidden_dim']),
        nn.ReLU(),
        nn.Linear(variant['qf_kwargs']['hidden_dim'], 1))
    target_qf2 = copy.deepcopy(qf2)

    # Per-agent policies and conditional actors (cactors).
    policy_n, cactor_n, expl_policy_n, eval_policy_n = [], [], [], []
    for i in range(num_agent):
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            nn.Linear(obs_dim, variant['policy_kwargs']['hidden_dim']),
            nn.ReLU(),
            nn.Linear(variant['policy_kwargs']['hidden_dim'], variant['policy_kwargs']['hidden_dim']),
            nn.ReLU(),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        cactor = nn.Sequential(
            nn.Linear((obs_dim + action_dim * (num_agent - 1)), variant['cactor_kwargs']['hidden_dim']),
            nn.ReLU(),
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], variant['cactor_kwargs']['hidden_dim']),
            nn.ReLU(),
            SplitLayer(layers=[nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)]))
        cactor = TanhGaussianPolicy(module=cactor)
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)
        cactor_n.append(cactor)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn3_onlyq import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1, target_cg1=target_cg1, qf1=qf1, target_qf1=target_qf1,
        cg2=cg2, target_cg2=target_cg2, qf2=qf2, target_qf2=target_qf2,
        cactor_n=cactor_n, policy_n=policy_n,
        **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf1_n, qf2_n, cactor_n, policy_n = [], [], [], []
    target_qf1_n, target_qf2_n = [], []
    log_alpha_n, log_calpha_n = None, None
    qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, cactor_optimizer_n, alpha_optimizer_n, calpha_optimizer_n = \
        None, None, None, None, None, None
    for i in range(num_agent):
        # Twin centralized Q-functions per agent.
        from rlkit.torch.networks.networks import FlattenMlp
        qf1 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'])
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'])
        target_qf2 = copy.deepcopy(qf2)

        # Conditional actor that predicts the other agents' actions.
        from rlkit.torch.networks.layers import SplitLayer
        if variant['trainer_kwargs']['dec_cactor']:
            input_size = obs_dim + action_dim
        else:
            input_size = obs_dim * num_agent + action_dim
        cactor = nn.Sequential(
            FlattenMlp(
                input_size=input_size,
                output_size=variant['cactor_kwargs']['hidden_dim'],
                hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1)),
            SplitLayer(layers=[nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim * (num_agent - 1)),
                               nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim * (num_agent - 1))]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        cactor = TanhGaussianPolicy(module=cactor)

        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1)),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        policy = TanhGaussianPolicy(module=policy)

        qf1_n.append(qf1)
        qf2_n.append(qf2)
        cactor_n.append(cactor)
        policy_n.append(policy)
        target_qf1_n.append(target_qf1)
        target_qf2_n.append(target_qf2)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    if variant['random_exploration']:
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
        expl_policy_n = [
            PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            ) for i in range(num_agent)
        ]
    else:
        expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.pr2.pr2 import PR2Trainer
    trainer = PR2Trainer(
        env=expl_env,
        qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n=qf2_n, target_qf2_n=target_qf2_n,
        policy_n=policy_n, cactor_n=cactor_n,
        log_alpha_n=log_alpha_n, log_calpha_n=log_calpha_n,
        qf1_optimizer_n=qf1_optimizer_n, qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n, cactor_optimizer_n=cactor_optimizer_n,
        alpha_optimizer_n=alpha_optimizer_n, calpha_optimizer_n=calpha_optimizer_n,
        **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    eval_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Twin GNN critics built on the simple-spread scenario graph.
    from simple_spread_graph import SimpleSpreadGraphBuilder
    graph_builder_1 = SimpleSpreadGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True, single_observe=False, contain_self_loop=False,
    )
    from rlkit.torch.networks.gnn_networks import GNNNet
    gnn1 = GNNNet(
        graph_builder_1,
        hidden_activation='lrelu0.2', output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SelectLayer
    qf1 = nn.Sequential(
        gnn1,
        SelectLayer(dim=1, index=torch.arange(num_agent)),
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf1 = copy.deepcopy(qf1)
    graph_builder_2 = SimpleSpreadGraphBuilder(
        num_agents=expl_env.scenario.num_agents,
        num_landmarks=expl_env.scenario.num_landmarks,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        append_action=True, single_observe=False, contain_self_loop=False,
    )
    gnn2 = GNNNet(
        graph_builder_2,
        hidden_activation='lrelu0.2', output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    qf2 = nn.Sequential(
        gnn2,
        SelectLayer(dim=1, index=torch.arange(num_agent)),
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf2 = copy.deepcopy(qf2)

    # Per-agent decentralized policies.
    policy_n, eval_policy_n, expl_policy_n = [], [], []
    for i in range(num_agent):
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.masac.masac_gnn import MASACGNNTrainer
    trainer = MASACGNNTrainer(
        env=expl_env,
        qf1=qf1, target_qf1=target_qf1, qf2=qf2, target_qf2=target_qf2,
        policy_n=policy_n,
        **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    eval_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Twin GNN context encoders and Q-heads with target copies.
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_cg1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    cg1 = GNNNet(
        graph_builder_cg1,
        hidden_activation='lrelu0.2', output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)
    graph_builder_cg2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    cg2 = GNNNet(
        graph_builder_cg2,
        hidden_activation='lrelu0.2', output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)

    # Centralized conditional actor (cactor) on top of its own GNN encoder.
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    cgca = GNNNet(
        graph_builder_ca,
        hidden_activation='lrelu0.2', output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                           nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn10 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1, target_cg1=target_cg1, qf1=qf1, target_qf1=target_qf1,
        cg2=cg2, target_cg2=target_cg2, qf2=qf2, target_qf2=target_qf2,
        cactor=cactor, policy_n=policy_n,
        **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim
    max_path_length = variant['trainer_kwargs']['max_path_length']

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_data = torch.load(load_dir + '/params.pkl', map_location='cpu')
        policy = load_data['trainer/policy']
        vf = load_data['trainer/value_function']
    else:
        # GNN-LSTM policy: per-node LSTMs around a traffic-graph GNN, with a
        # supervised head predicting labels for the non-ego vehicles.
        hidden_dim = variant['lstm_kwargs']['hidden_dim']
        num_lstm_layers = variant['lstm_kwargs']['num_layers']
        node_dim = variant['gnn_kwargs']['node_dim']
        node_num = expl_env.max_veh_num + 1
        input_node_dim = int(obs_dim / node_num)
        a_0 = np.zeros(action_dim)
        h1_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        c1_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        h2_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        c2_0 = np.zeros((node_num, hidden_dim * num_lstm_layers))
        latent_0 = (h1_0, c1_0, h2_0, c2_0)
        from lstm_net import LSTMNet
        lstm1_ego = LSTMNet(input_node_dim, action_dim, hidden_dim, num_lstm_layers)
        lstm1_other = LSTMNet(input_node_dim, 0, hidden_dim, num_lstm_layers)
        lstm2_ego = LSTMNet(node_dim, 0, hidden_dim, num_lstm_layers)
        lstm2_other = LSTMNet(node_dim, 0, hidden_dim, num_lstm_layers)
        from graph_builder import TrafficGraphBuilder
        gb = TrafficGraphBuilder(
            input_dim=hidden_dim, node_num=node_num,
            ego_init=torch.tensor([0., 1.]), other_init=torch.tensor([1., 0.]),
        )
        from gnn_net import GNNNet
        gnn = GNNNet(
            pre_graph_builder=gb,
            node_dim=variant['gnn_kwargs']['node_dim'],
            conv_type=variant['gnn_kwargs']['conv_type'],
            num_conv_layers=variant['gnn_kwargs']['num_layers'],
            hidden_activation=variant['gnn_kwargs']['activation'],
        )
        from gnn_lstm2_net import GNNLSTM2Net
        policy_net = GNNLSTM2Net(node_num, gnn, lstm1_ego, lstm1_other, lstm2_ego, lstm2_other)
        from layers import FlattenLayer, SelectLayer
        decoder = nn.Sequential(
            SelectLayer(-2, 0), FlattenLayer(2), nn.ReLU(), nn.Linear(hidden_dim, action_dim))
        from layers import ReshapeLayer
        sup_learner = nn.Sequential(
            SelectLayer(-2, np.arange(1, node_num)),
            nn.ReLU(),
            nn.Linear(hidden_dim, label_dim),
        )
        from sup_softmax_lstm_policy import SupSoftmaxLSTMPolicy
        policy = SupSoftmaxLSTMPolicy(
            a_0=a_0, latent_0=latent_0,
            obs_dim=obs_dim, action_dim=action_dim,
            lstm_net=policy_net, decoder=decoder, sup_learner=sup_learner,
        )
        print('parameters: ', np.sum([p.view(-1).shape[0] for p in policy.parameters()]))
        vf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=1)

    vf_criterion = nn.MSELoss()
    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy = MakeDeterministic(policy)
    expl_policy = policy
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)

    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim, action_dim=action_dim, label_dim=label_num,
        max_replay_buffer_size=int(1e6), max_path_length=max_path_length, recurrent=True,
    )

    from rlkit.torch.vpg.ppo_sup import PPOSupTrainer
    trainer = PPOSupTrainer(
        policy=policy, value_function=vf, vf_criterion=vf_criterion,
        replay_buffer=replay_buffer, recurrent=True,
        **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    eval_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    if variant['load_kwargs']['load']:
        # Resume networks, optimizers, and the replay buffer from a saved snapshot.
        load_dir = variant['load_kwargs']['load_dir']
        load_epoch = variant['load_kwargs']['load_epoch']
        load_data = torch.load('{}/itr_{}.pkl'.format(load_dir, load_epoch), map_location='cpu')
        qf1_n = load_data['trainer/qf1_n']
        target_qf1_n = load_data['trainer/target_qf1_n']
        qf2_n = load_data['trainer/qf2_n']
        target_qf2_n = load_data['trainer/target_qf2_n']
        cactor_n = load_data['trainer/cactor_n']
        policy_n = load_data['trainer/policy_n']
        log_alpha_n = load_data['trainer/log_alpha_n']
        qf1_optimizer_n = load_data['trainer/qf1_optimizer_n']
        qf2_optimizer_n = load_data['trainer/qf2_optimizer_n']
        policy_optimizer_n = load_data['trainer/policy_optimizer_n']
        cactor_optimizer_n = load_data['trainer/cactor_optimizer_n']
        alpha_optimizer_n = load_data['trainer/alpha_optimizer_n']
        if args.ce:
            log_calpha_n = load_data['trainer/log_calpha_n']
            calpha_optimizer_n = load_data['trainer/calpha_optimizer_n']
        replay_buffer = load_data['replay_buffer']
    else:
        qf1_n, qf2_n, cactor_n, policy_n = [], [], [], []
        target_qf1_n, target_qf2_n = [], []
        log_alpha_n, log_calpha_n = None, None
        qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, cactor_optimizer_n, alpha_optimizer_n, calpha_optimizer_n = \
            None, None, None, None, None, None
        for i in range(num_agent):
            from rlkit.torch.networks import FlattenMlp
            qf1 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'])
            target_qf1 = copy.deepcopy(qf1)
            qf2 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'])
            target_qf2 = copy.deepcopy(qf2)

            from rlkit.torch.layers import SplitLayer
            if variant['trainer_kwargs']['dec_cactor']:
                input_size = obs_dim + action_dim * (num_agent - 1)
            else:
                input_size = obs_dim * num_agent + action_dim * (num_agent - 1)
            cactor = nn.Sequential(
                FlattenMlp(
                    input_size=input_size,
                    output_size=variant['cactor_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1)),
                SplitLayer(layers=[nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
                                   nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)]))
            from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
            cactor = TanhGaussianPolicy(module=cactor)

            policy = nn.Sequential(
                FlattenMlp(
                    input_size=obs_dim,
                    output_size=variant['policy_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1)),
                SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                                   nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
            policy = TanhGaussianPolicy(module=policy)

            qf1_n.append(qf1)
            qf2_n.append(qf2)
            cactor_n.append(cactor)
            policy_n.append(policy)
            target_qf1_n.append(target_qf1)
            target_qf2_n.append(target_qf2)

        from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
        replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.prg.prg import PRGTrainer
    trainer = PRGTrainer(
        env=expl_env,
        qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n=qf2_n, target_qf2_n=target_qf2_n,
        policy_n=policy_n, cactor_n=cactor_n,
        log_alpha_n=log_alpha_n, log_calpha_n=log_calpha_n,
        qf1_optimizer_n=qf1_optimizer_n, qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n, cactor_optimizer_n=cactor_optimizer_n,
        alpha_optimizer_n=alpha_optimizer_n, calpha_optimizer_n=calpha_optimizer_n,
        **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    eval_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Per-agent Gaussian policies and twin decentralized Q-functions.
    policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = [], [], [], [], []
    log_alpha_n = None
    qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n, alpha_optimizer_n = None, None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        from rlkit.torch.networks.layers import SplitLayer
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1)),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        qf1 = FlattenMlp(
            input_size=(obs_dim + action_dim), output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'])
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim + action_dim), output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'])
        target_qf2 = copy.deepcopy(qf2)
        policy_n.append(policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.irl.irl_sac import IRLSACTrainer
    trainer = IRLSACTrainer(
        env=expl_env,
        qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n=qf2_n, target_qf2_n=target_qf2_n,
        policy_n=policy_n, log_alpha_n=log_alpha_n,
        qf1_optimizer_n=qf1_optimizer_n, qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n, alpha_optimizer_n=alpha_optimizer_n,
        **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    eval_env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Twin GNN context encoders and Q-heads with target copies.
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent, contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    cg1 = GNNNet(
        graph_builder_1,
        node_dim=variant['graph_kwargs']['hidden_dim'],
        conv_type=variant['graph_kwargs']['conv_type'],
        num_conv_layers=variant['graph_kwargs']['num_layer'],
        hidden_activation='relu', output_activation='relu')
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['hidden_dim'] + action_dim, output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1))
    target_qf1 = copy.deepcopy(qf1)
    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent, contain_self_loop=False)
    cg2 = GNNNet(
        graph_builder_2,
        node_dim=variant['graph_kwargs']['hidden_dim'],
        conv_type=variant['graph_kwargs']['conv_type'],
        num_conv_layers=variant['graph_kwargs']['num_layer'],
        hidden_activation='relu', output_activation='relu')
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['hidden_dim'] + action_dim, output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1))
    target_qf2 = copy.deepcopy(qf2)

    # Centralized conditional actor built on its own graph builder.
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent, contain_self_loop=False)
    cgca = GNNNet(
        graph_builder_ca,
        node_dim=variant['graph_kwargs']['hidden_dim'],
        conv_type=variant['graph_kwargs']['conv_type'],
        num_conv_layers=variant['graph_kwargs']['num_layer'],
        hidden_activation='relu', output_activation='relu')
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['hidden_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1)),
        nn.ReLU(),
        SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                           nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
    cactor = TanhGaussianPolicy(module=cactor)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1)),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn2 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1, target_cg1=target_cg1, qf1=qf1, target_qf1=target_qf1,
        cg2=cg2, target_cg2=target_cg2, qf2=qf2, target_qf2=target_qf2,
        cactor=cactor, policy_n=policy_n,
        **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    num_agent = variant['num_agent']
    from sequential_differential_game import SequentialDifferentialGame
    expl_env = SequentialDifferentialGame(**variant['env_kwargs'])
    eval_env = SequentialDifferentialGame(**variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Graph-context encoders shared by the twin critics and the cactors.
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(
        graph_builder_1, obs_dim, action_dim,
        use_attention=variant['graph_kwargs']['use_attention'],
        num_layer=variant['graph_kwargs']['num_layer'],
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu')
    target_cg1 = copy.deepcopy(cg1)
    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    cg2 = GraphContextNet(
        graph_builder_2, obs_dim, action_dim,
        use_attention=variant['graph_kwargs']['use_attention'],
        num_layer=variant['graph_kwargs']['num_layer'],
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu')
    target_cg2 = copy.deepcopy(cg2)
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    cgca = GraphContextNet(
        graph_builder_ca, obs_dim, action_dim,
        use_attention=variant['graph_kwargs']['use_attention'],
        num_layer=variant['graph_kwargs']['num_layer'],
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu')

    # Per-agent Q-heads, cactors, and policies.
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    qf1_n, target_qf1_n, qf2_n, target_qf2_n = [], [], [], []
    cactor_n = []
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        qf1 = FlattenMlp(
            input_size=variant['graph_kwargs']['hidden_dim'] + action_dim, output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1))
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=variant['graph_kwargs']['hidden_dim'] + action_dim, output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1))
        target_qf2 = copy.deepcopy(qf2)

        from rlkit.torch.networks.layers import SplitLayer
        cactor = nn.Sequential(
            FlattenMlp(
                input_size=variant['graph_kwargs']['hidden_dim'],
                output_size=variant['cactor_kwargs']['hidden_dim'],
                hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1)),
            nn.ReLU(),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        cactor = TanhGaussianPolicy(module=cactor)

        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1)),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)
        cactor_n.append(cactor)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn4 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1, target_cg1=target_cg1, qf1_n=qf1_n, target_qf1_n=target_qf1_n,
        cg2=cg2, target_cg2=target_cg2, qf2_n=qf2_n, target_qf2_n=target_qf2_n,
        cgca=cgca, cactor_n=cactor_n, policy_n=policy_n,
        **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
from make_env import make_env
from particle_env_wrapper import ParticleEnv

world_args = dict(
    num_agents=args.num_ag, num_adversaries=args.num_adv, num_landmarks=args.num_l,
    obsid=args.obsid,
    boundary=([[-1., -1.], [1., 1.]] if args.boundary else None))
env = ParticleEnv(make_env(args.exp_name, discrete_action_space=False, world_args=world_args))
o_n = env.reset()
num_agent = env.num_agent

cactor_n = data['trainer/cactor_n']
from rlkit.torch.policies.make_deterministic import MakeDeterministic
cactor_n = [MakeDeterministic(cactor) for cactor in cactor_n]

# Sweep a 100x100 grid of joint actions and record each cactor's output.
xs = np.linspace(-1, 1, 100)
ys = np.linspace(-1, 1, 100)
cxs = dict()
cys = dict()
o_n = env.reset()
o_n = [torch.tensor(o) for o in o_n]
with torch.no_grad():
    for i, x in enumerate(xs):
        for j, y in enumerate(ys):
            actions = torch.tensor([[x, y]] * num_agent).reshape(-1).float()
            for agent in range(num_agent):
                if not (agent in cxs.keys()):
                    cxs[agent] = np.zeros((100, 100))
    game_name=args.exp_name,
    agent_num=args.num_ag,
)
from multi_differential_game import MultiDifferentialGame
env = MultiDifferentialGame(**env_kwargs)

# Sweep the opponents' action and record each conditional actor's response.
opponent_as = np.linspace(-1, 1, 100)
c1s = []
c2s = []
d_path = pre_path + '/' + 'seed' + str(args.seed) + '/params.pkl'
data = torch.load(d_path, map_location='cpu')
cnets = data['trainer/cactor_n']
from rlkit.torch.policies.make_deterministic import MakeDeterministic
cnets = [MakeDeterministic(cnet) for cnet in cnets]
cs = dict()
with torch.no_grad():
    for agent in range(args.num_ag):
        cs[agent] = []
        if agent < int(args.num_ag / 2):
            for a2 in opponent_as:
                o_n = env.reset()
                o_n = [torch.tensor(o) for o in o_n]
                actions = torch.zeros(args.num_ag)
                actions[:int(args.num_ag / 2)] = 0.
                actions[int(args.num_ag / 2):] = a2
                c_input = torch.cat([*o_n, actions[:agent], actions[agent + 1:]], dim=-1).float()
def experiment(variant):
    num_agent = variant['num_agent']
    from differential_game import DifferentialGame
    expl_env = DifferentialGame(game_name=args.exp_name)
    eval_env = DifferentialGame(game_name=args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Per-agent twin centralized Q-functions, conditional actors, and policies.
    qf1_n, qf2_n, cactor_n, policy_n, target_qf1_n, target_qf2_n, expl_policy_n, eval_policy_n = \
        [], [], [], [], [], [], [], []
    for i in range(num_agent):
        from rlkit.torch.networks import FlattenMlp
        qf1 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2)
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent), output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2)
        target_qf2 = copy.deepcopy(qf2)

        from rlkit.torch.networks.layers import SplitLayer
        cactor = nn.Sequential(
            nn.Linear((obs_dim * num_agent + action_dim * (num_agent - 1)), variant['cactor_kwargs']['hidden_dim']),
            nn.ReLU(),
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], variant['cactor_kwargs']['hidden_dim']),
            nn.ReLU(),
            SplitLayer(layers=[nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        cactor = TanhGaussianPolicy(module=cactor)

        policy = nn.Sequential(
            nn.Linear(obs_dim, variant['policy_kwargs']['hidden_dim']),
            nn.ReLU(),
            nn.Linear(variant['policy_kwargs']['hidden_dim'], variant['policy_kwargs']['hidden_dim']),
            nn.ReLU(),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        qf1_n.append(qf1)
        qf2_n.append(qf2)
        cactor_n.append(cactor)
        policy_n.append(policy)
        target_qf1_n.append(target_qf1)
        target_qf2_n.append(target_qf2)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g import R2GTrainer
    trainer = R2GTrainer(
        env=expl_env,
        qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n=qf2_n, target_qf2_n=target_qf2_n,
        policy_n=policy_n, cactor_n=cactor_n,
        **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
pre_dir = './Data/' + args.exp_name \
    + ('bd' if args.boundary else '') \
    + (('ag' + str(args.num_ag)) if args.num_ag else '') \
    + (('adv' + str(args.num_adv)) if args.num_adv else '') \
    + (('l' + str(args.num_l)) if args.num_l else '') \
    + '_mpl' + str(args.mpl)
if args.epoch:
    data_path = '{}/{}/seed{}/itr_{}.pkl'.format(pre_dir, args.log_dir, args.seed, args.epoch)
else:
    data_path = '{}/{}/seed{}/params.pkl'.format(pre_dir, args.log_dir, args.seed)
data = torch.load(data_path, map_location='cpu')

# Wrap the loaded policies so that evaluation uses deterministic / argmax actions.
policy_n = data['trainer/policy_n']
if isinstance(policy_n[0], TanhGaussianPolicy):
    policy_n = [MakeDeterministic(policy) for policy in policy_n]
elif isinstance(policy_n[0], GumbelSoftmaxMlpPolicy):
    policy_n = [ArgmaxDiscretePolicy(policy, use_preactivation=True) for policy in policy_n]
if 'trainer/shared_gnn' in data.keys():
    shared_gnn = data['trainer/shared_gnn']
elif 'R2GGNN12Share' in args.log_dir:
    shared_gnn = data['trainer/qf1'].obs_gnn
else:
    shared_gnn = None

world_args = dict(
    num_agents=args.num_ag,
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Twin GNN critics over the full (obs, action) graph.
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_q = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim, num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    gnn1 = GNNNet(
        graph_builder_q,
        hidden_activation='lrelu0.2', output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SelectLayer
    qf1 = nn.Sequential(
        gnn1,
        SelectLayer(dim=1, index=torch.arange(num_agent)),
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf1 = copy.deepcopy(qf1)
    gnn2 = GNNNet(
        graph_builder_q,
        hidden_activation='lrelu0.2', output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    qf2 = nn.Sequential(
        gnn2,
        SelectLayer(dim=1, index=torch.arange(num_agent)),
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
        ))
    target_qf2 = copy.deepcopy(qf2)

    # Per-agent GNN policies over the shared observation graph.
    policy_n, eval_policy_n, expl_policy_n = [], [], []
    for agent in range(num_agent):
        graph_builder = FullGraphBuilder(
            input_node_dim=obs_dim, num_node=num_agent,
            batch_size=variant['algorithm_kwargs']['batch_size'], contain_self_loop=False)
        gnn_policy = GNNNet(
            graph_builder,
            hidden_activation='lrelu0.2', output_activation='lrelu0.2',
            **variant['graph_kwargs'],
        )
        from rlkit.torch.networks.layers import SplitLayer, FlattenLayer
        policy = nn.Sequential(
            gnn_policy,
            SelectLayer(dim=1, index=agent),
            FlattenLayer(),
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                               nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]))
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n, shared_obs=True)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n, shared_obs=True)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent, shared_obs=False)

    from rlkit.torch.masac.masac_gnn import MASACGNNTrainer
    trainer = MASACGNNTrainer(
        env=expl_env,
        qf1=qf1, target_qf1=target_qf1, qf2=qf2, target_qf2=target_qf2,
        policy_n=policy_n, shared_obs=True,
        **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env, evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
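# Illustrative only: the `experiment(variant)` launchers above rely on module-level
# globals (`args`, `ptu`, `nn`, `np`, `torch`, `copy`, `osp`, and the logging helpers
# such as `get_generic_ma_path_information`) that are set up outside the snippets shown
# here. A minimal sketch of that wiring, assuming standard rlkit utilities; the argument
# names beyond `exp_name` and `seed` are placeholders.
#
# import argparse, copy, os.path as osp
# import numpy as np
# import torch
# import torch.nn as nn
# import rlkit.torch.pytorch_util as ptu
# from rlkit.launchers.launcher_util import setup_logger
#
# parser = argparse.ArgumentParser()
# parser.add_argument('--exp_name', type=str, default='simple_spread')
# parser.add_argument('--seed', type=int, default=0)
# args = parser.parse_args()
#
# if __name__ == "__main__":
#     variant = dict()  # see the per-launcher key sketches above
#     setup_logger(args.exp_name, variant=variant)
#     ptu.set_gpu_mode(torch.cuda.is_available())
#     experiment(variant)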