def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=4)
    eval_env = CartPoleEnv(mode=4)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], []
    for i in range(num_agent):
        # Centralized critic: conditions on the observations and actions of all agents.
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            **variant['qf_kwargs']
        )
        # Decentralized actor: conditions only on this agent's own observation.
        policy = GumbelSoftmaxMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        target_qf = copy.deepcopy(qf)
        target_policy = copy.deepcopy(policy)
        eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    from rlkit.envs.ma_wrappers import MAProbDiscreteEnv
    expl_env = MAProbDiscreteEnv(CartPoleEnv(mode=4))
    eval_env = MAProbDiscreteEnv(CartPoleEnv(mode=4))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, exploration_policy_n = \
        [], [], [], [], []
    for i in range(num_agent):
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            **variant['qf_kwargs']
        )
        policy = SoftmaxMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        target_qf = copy.deepcopy(qf)
        target_policy = copy.deepcopy(policy)
        exploration_policy = policy
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)
        exploration_policy_n.append(exploration_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, exploration_policy_n)
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm

    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n = [], [], [], []
    qf2_n, target_qf2_n = [], []
    qf_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
    for i in range(num_agent):
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'],
        )
        target_qf = copy.deepcopy(qf)
        policy = TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * variant['policy_kwargs']['num_layer'],
        )
        target_policy = copy.deepcopy(policy)
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)

    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    if variant['random_exploration']:
        # Epsilon-greedy with prob_random_action=1.0 acts uniformly at random.
        expl_policy_n = [
            PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(
                    expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            ) for policy in policy_n
        ]
    else:
        expl_policy_n = [
            PolicyWrappedWithExplorationStrategy(
                exploration_strategy=OUStrategy(action_space=expl_env.action_space),
                policy=policy,
            ) for policy in policy_n
        ]
    eval_policy_n = policy_n

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        qf_optimizer_n=qf_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
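# Illustrative only: a minimal `variant` for the differential-game experiment
# above, inferred from the keys it reads. All concrete values, and the keys
# inside `env_kwargs`, are placeholder assumptions, not taken from the script.
example_variant = dict(
    env_kwargs=dict(game_name='zero_sum', agent_num=2),  # hypothetical keys
    random_exploration=False,  # True switches OU noise to uniform-random actions
    qf_kwargs=dict(hidden_dim=64, num_layer=2),
    policy_kwargs=dict(hidden_dim=64, num_layer=2),
    replay_buffer_size=int(1E6),
    trainer_kwargs=dict(),
    algorithm_kwargs=dict(),
)
# experiment(example_variant)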
def experiment(variant):
    import gym
    import robosumo.envs
    from robosumo_env_wrapper import RoboSumoEnv
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm

    env_id = 'RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)
    expl_env = RoboSumoEnv(gym.make(env_id), **variant['world_args'])
    eval_env = RoboSumoEnv(gym.make(env_id), **variant['world_args'])
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n = [], [], [], []
    qf2_n, target_qf2_n = [], []
    qf_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
    for i in range(num_agent):
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'],
        )
        target_qf = copy.deepcopy(qf)
        policy = TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * variant['policy_kwargs']['num_layer'],
        )
        target_policy = copy.deepcopy(policy)
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)

    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    expl_policy_n = [
        PolicyWrappedWithExplorationStrategy(
            exploration_strategy=OUStrategy(action_space=expl_env.action_space),
            policy=policy,
        ) for policy in policy_n
    ]
    eval_policy_n = policy_n

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        qf_optimizer_n=qf_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm

    expl_env = ParticleEnv(make_env(
        args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    eval_env = ParticleEnv(make_env(
        args.exp_name, discrete_action_space=False, world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    if variant['load_kwargs']['load']:
        # Resume training: restore networks, optimizers, and the replay buffer
        # from a saved snapshot.
        load_dir = variant['load_kwargs']['load_dir']
        load_epoch = variant['load_kwargs']['load_epoch']
        load_data = torch.load(
            '{}/itr_{}.pkl'.format(load_dir, load_epoch), map_location='cpu')
        qf_n = load_data['trainer/qf_n']
        target_qf_n = load_data['trainer/target_qf_n']
        qf2_n, target_qf2_n = [], []
        policy_n = load_data['trainer/policy_n']
        target_policy_n = load_data['trainer/target_policy_n']
        qf_optimizer_n = load_data['trainer/qf_optimizer_n']
        qf2_optimizer_n = None
        policy_optimizer_n = load_data['trainer/policy_optimizer_n']
        replay_buffer = load_data['replay_buffer']
    else:
        # Fresh start: build per-agent networks and an empty replay buffer.
        qf_n, policy_n, target_qf_n, target_policy_n = [], [], [], []
        qf2_n, target_qf2_n = [], []
        qf_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
        for i in range(num_agent):
            qf = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * variant['qf_kwargs']['num_layer'],
            )
            target_qf = copy.deepcopy(qf)
            policy = TanhMlpPolicy(
                input_size=obs_dim,
                output_size=action_dim,
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * variant['policy_kwargs']['num_layer'],
            )
            target_policy = copy.deepcopy(policy)
            qf_n.append(qf)
            policy_n.append(policy)
            target_qf_n.append(target_qf)
            target_policy_n.append(target_policy)
        replay_buffer = MAEnvReplayBuffer(
            variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    expl_policy_n = [
        PolicyWrappedWithExplorationStrategy(
            exploration_strategy=OUStrategy(action_space=expl_env.action_space),
            policy=policy,
        ) for policy in policy_n
    ]
    eval_policy_n = policy_n

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        qf_optimizer_n=qf_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
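# Illustrative only: the `load_kwargs` shape the particle-env experiment above
# expects when resuming from `<load_dir>/itr_<load_epoch>.pkl`. The directory
# and epoch are placeholders.
example_load_kwargs = dict(
    load=True,
    load_dir='./data/maddpg-run',  # hypothetical run directory
    load_epoch=100,
)
# Set load=False to build fresh networks and an empty replay buffer instead.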
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, exploration_policy_n = \
        [], [], [], [], []
    qf2_n, target_qf2_n = [], []
    for i in range(num_agent):
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            **variant['qf_kwargs']
        )
        policy = TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        target_qf = copy.deepcopy(qf)
        target_policy = copy.deepcopy(policy)
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=OUStrategy(action_space=expl_env.action_space),
            policy=policy,
        )
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)
        exploration_policy_n.append(exploration_policy)
        if variant['trainer_kwargs']['double_q']:
            qf2 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                **variant['qf_kwargs']
            )
            target_qf2 = copy.deepcopy(qf2)
            qf2_n.append(qf2)
            target_qf2_n.append(target_qf2)

    eval_path_collector = MAMdpPathCollector(eval_env, policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, exploration_policy_n)
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
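# Illustrative only: a minimal `variant` for the CartPoleEnv(mode=3) experiment
# above. `hidden_sizes` is assumed to be what FlattenMlp and TanhMlpPolicy
# accept through **kwargs here; all values are placeholders.
example_variant = dict(
    num_agent=2,
    qf_kwargs=dict(hidden_sizes=[64, 64]),
    policy_kwargs=dict(hidden_sizes=[64, 64]),
    replay_buffer_size=int(1E6),
    trainer_kwargs=dict(double_q=True),  # builds the twin critics qf2_n / target_qf2_n
    algorithm_kwargs=dict(),
)
# experiment(example_variant)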