def sac_lr(self, train_config, device, dtype):
    # not used for training, only used for testing apis
    c = train_config
    actor = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).type(dtype).to(device),
        device,
        device,
    )
    critic = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
    )
    critic_t = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
    )
    critic2 = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
    )
    critic2_t = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
    )
    lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)], logger=logger)
    with pytest.raises(TypeError, match="missing .+ positional argument"):
        _ = SAC(
            actor,
            critic,
            critic_t,
            critic2,
            critic2_t,
            t.optim.Adam,
            nn.MSELoss(reduction="sum"),
            replay_device="cpu",
            replay_size=c.replay_size,
            lr_scheduler=LambdaLR,
        )
    sac = SAC(
        actor,
        critic,
        critic_t,
        critic2,
        critic2_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
        lr_scheduler=LambdaLR,
        lr_scheduler_args=((lr_func,), (lr_func,), (lr_func,)),
    )
    return sac
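
# A minimal sketch (not part of the original fixtures) of what the
# lr_scheduler / lr_scheduler_args pair in sac_lr expands to: one LambdaLR
# per optimizer, with lr_func as its positional lr_lambda argument.
# ``params`` is a hypothetical parameter iterable.
def _lr_scheduler_wiring_sketch(params):
    lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)], logger=logger)
    optimizer = t.optim.Adam(params, lr=1e-3)
    # omitting the positional lr_lambda here is what triggers the
    # TypeError asserted in sac_lr above
    return LambdaLR(optimizer, lr_func)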
def dqn_per_train(self, train_config):
    c = train_config
    # cpu is faster for testing full training.
    q_net = smw(QNet(c.observe_dim, c.action_num), "cpu", "cpu")
    q_net_t = smw(QNet(c.observe_dim, c.action_num), "cpu", "cpu")
    dqn_per = DQNPer(
        q_net,
        q_net_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return dqn_per
def a2c_train(self, train_config):
    c = train_config
    # cpu is faster for testing full training.
    actor = smw(Actor(c.observe_dim, c.action_num), "cpu", "cpu")
    critic = smw(Critic(c.observe_dim), "cpu", "cpu")
    a2c = A2C(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return a2c
def test_mode(self, train_config, device, dtype):
    c = train_config
    q_net = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    q_net_t = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    # constructing with an unknown mode should fail immediately
    with pytest.raises(ValueError, match="Unknown DQN mode"):
        _ = DQN(
            q_net,
            q_net_t,
            t.optim.Adam,
            nn.MSELoss(reduction="sum"),
            replay_device="cpu",
            replay_size=c.replay_size,
            mode="invalid_mode",
        )
    # "double" is a valid mode, so construction succeeds
    dqn = DQN(
        q_net,
        q_net_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
        mode="double",
    )
    old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
    action = t.zeros([1, 1], dtype=t.int)
    dqn.store_episode(
        [
            {
                "state": {"state": old_state},
                "action": {"action": action},
                "next_state": {"state": state},
                "reward": 0,
                "terminal": False,
            }
            for _ in range(3)
        ]
    )
    # an invalid mode set after construction should fail on update
    dqn.mode = "invalid_mode"
    with pytest.raises(ValueError, match="Unknown DQN mode"):
        dqn.update(update_value=True, update_target=True, concatenate_samples=True)
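
# For reference, a sketch (hypothetical helper, mirroring the dicts built in
# test_mode above) of the single-transition format that store_episode
# consumes: nested dicts keyed by model input names, plus reward/terminal.
def _make_episode_sketch(observe_dim, length=3):
    state = t.zeros([1, observe_dim])
    action = t.zeros([1, 1], dtype=t.int)
    return [
        {
            "state": {"state": state},
            "action": {"action": action},
            "next_state": {"state": state},
            "reward": 0,
            "terminal": False,
        }
        for _ in range(length)
    ]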
def dqn(self, train_config, request):
    c = train_config
    q_net = smw(QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
    q_net_t = smw(QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
    dqn = DQN(
        q_net,
        q_net_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
        mode=request.param,
    )
    return dqn
def ppo(self, train_config, device, dtype):
    c = train_config
    actor = smw(
        Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    critic = smw(Critic(c.observe_dim).type(dtype).to(device), device, device)
    ppo = PPO(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return ppo
def dqn_per(self, train_config, device, dtype):
    c = train_config
    q_net = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    q_net_t = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    dqn_per = DQNPer(
        q_net,
        q_net_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return dqn_per
def rainbow(self, train_config):
    c = train_config
    q_net = smw(QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
    q_net_t = smw(QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
    rainbow = RAINBOW(
        q_net,
        q_net_t,
        t.optim.Adam,
        c.value_min,
        c.value_max,
        reward_future_steps=c.reward_future_steps,
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return rainbow
def a2c(self, train_config, device, dtype):
    c = train_config
    actor = smw(
        Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    critic = smw(Critic(c.observe_dim).type(dtype).to(device), device, device)
    a2c = A2C(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return a2c
def maddpg_lr(self, train_config, device, dtype):
    c = train_config
    actor = smw(
        Actor(c.test_observe_dim, c.test_action_dim, c.test_action_range)
        .type(dtype)
        .to(device),
        device,
        device,
    )
    critic = smw(
        Critic(
            c.test_observe_dim * c.test_agent_num,
            c.test_action_dim * c.test_agent_num,
        )
        .type(dtype)
        .to(device),
        device,
        device,
    )
    lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)], logger=logger)
    with pytest.raises(TypeError, match="missing .+ positional argument"):
        _ = MADDPG(
            [deepcopy(actor) for _ in range(c.test_agent_num)],
            [deepcopy(actor) for _ in range(c.test_agent_num)],
            [deepcopy(critic) for _ in range(c.test_agent_num)],
            [deepcopy(critic) for _ in range(c.test_agent_num)],
            t.optim.Adam,
            nn.MSELoss(reduction="sum"),
            replay_device="cpu",
            replay_size=c.replay_size,
            lr_scheduler=LambdaLR,
        )
    maddpg = MADDPG(
        [deepcopy(actor) for _ in range(c.test_agent_num)],
        [deepcopy(actor) for _ in range(c.test_agent_num)],
        [deepcopy(critic) for _ in range(c.test_agent_num)],
        [deepcopy(critic) for _ in range(c.test_agent_num)],
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
        lr_scheduler=LambdaLR,
        lr_scheduler_args=(
            [(lr_func,)] * c.test_agent_num,
            [(lr_func,)] * c.test_agent_num,
        ),
    )
    return maddpg
def rainbow_train(self, train_config):
    c = train_config
    # cpu is faster for testing full training.
    q_net = smw(QNet(c.observe_dim, c.action_num), "cpu", "cpu")
    q_net_t = smw(QNet(c.observe_dim, c.action_num), "cpu", "cpu")
    rainbow = RAINBOW(
        q_net,
        q_net_t,
        t.optim.Adam,
        c.value_min,
        c.value_max,
        reward_future_steps=c.reward_future_steps,
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return rainbow
def dqn_vis(self, train_config, device, dtype, tmpdir, request):
    c = train_config
    tmp_dir = tmpdir.make_numbered_dir()
    q_net = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    q_net_t = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    dqn = DQN(
        q_net,
        q_net_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
        mode=request.param,
        visualize=True,
        visualize_dir=str(tmp_dir),
    )
    return dqn
def hddpg(self, train_config):
    c = train_config
    actor = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
        c.device,
        c.device,
    )
    actor_t = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
        c.device,
        c.device,
    )
    critic = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device
    )
    critic_t = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device
    )
    hddpg = HDDPG(
        actor,
        actor_t,
        critic,
        critic_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return hddpg
def dqn_per_vis(self, train_config, tmpdir):
    c = train_config
    tmp_dir = tmpdir.make_numbered_dir()
    q_net = smw(QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
    q_net_t = smw(QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
    dqn_per = DQNPer(
        q_net,
        q_net_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
        visualize=True,
        visualize_dir=str(tmp_dir),
    )
    return dqn_per
def ppo_vis(self, train_config, tmpdir):
    # not used for training, only used for testing apis
    c = train_config
    tmp_dir = tmpdir.make_numbered_dir()
    actor = smw(Actor(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
    critic = smw(Critic(c.observe_dim).to(c.device), c.device, c.device)
    ppo = PPO(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device=c.device,
        replay_size=c.replay_size,
        visualize=True,
        visualize_dir=str(tmp_dir),
    )
    return ppo
def rainbow_vis(self, train_config, device, dtype, tmpdir):
    c = train_config
    tmp_dir = tmpdir.make_numbered_dir()
    q_net = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    q_net_t = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    rainbow = RAINBOW(
        q_net,
        q_net_t,
        t.optim.Adam,
        c.value_min,
        c.value_max,
        reward_future_steps=c.reward_future_steps,
        replay_device="cpu",
        replay_size=c.replay_size,
        visualize=True,
        visualize_dir=str(tmp_dir),
    )
    return rainbow
def a3c():
    c = TestA3C.c
    actor = smw(Actor(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
    critic = smw(Critic(c.observe_dim).to(c.device), c.device, c.device)
    # in all test scenarios, all processes will be used as reducers
    servers = grad_server_helper(
        [lambda: Actor(c.observe_dim, c.action_num), lambda: Critic(c.observe_dim)],
        learning_rate=5e-3,
    )
    a3c = A3C(
        actor,
        critic,
        nn.MSELoss(reduction="sum"),
        servers,
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return a3c
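
# Note on the fixture above: unlike the synchronous A2C fixtures, A3C is
# given no optimizer class; parameter updates are performed remotely by the
# gradient reduction servers created via grad_server_helper, which is why
# the learning_rate=5e-3 argument lives there instead of on a local Adam.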
def a2c_vis(self, train_config, device, dtype, tmpdir):
    # not used for training, only used for testing apis
    c = train_config
    tmp_dir = tmpdir.make_numbered_dir()
    actor = smw(
        Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    critic = smw(Critic(c.observe_dim).type(dtype).to(device), device, device)
    a2c = A2C(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
        visualize=True,
        visualize_dir=str(tmp_dir),
    )
    return a2c
def sac(self, train_config):
    c = train_config
    actor = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
        c.device,
        c.device,
    )
    critic = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device
    )
    critic_t = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device
    )
    critic2 = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device
    )
    critic2_t = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device
    )
    sac = SAC(
        actor,
        critic,
        critic_t,
        critic2,
        critic2_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device=c.device,
        replay_size=c.replay_size,
    )
    return sac
def ddpg_train(self, train_config):
    c = train_config
    # cpu is faster for testing full training.
    actor = smw(Actor(c.observe_dim, c.action_dim, c.action_range), "cpu", "cpu")
    actor_t = smw(Actor(c.observe_dim, c.action_dim, c.action_range), "cpu", "cpu")
    critic = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
    critic_t = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
    ddpg = DDPG(
        actor,
        actor_t,
        critic,
        critic_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return ddpg
def dqn_apex(device, dtype):
    c = TestDQNApex.c
    q_net = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    q_net_t = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    servers = model_server_helper(model_num=1)
    world = get_world()
    # processes 0 and 1 will be workers, and 2 will be the trainer
    apex_group = world.create_rpc_group("apex", ["0", "1", "2"])
    dqn_apex = DQNApex(
        q_net,
        q_net_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        apex_group,
        servers,
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return dqn_apex
def gail_lr(self, train_config, device, dtype):
    # not used for training, only used for testing apis
    c = train_config
    actor = smw(
        Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    critic = smw(Critic(c.observe_dim).type(dtype).to(device), device, device)
    discriminator = smw(
        Discriminator(c.observe_dim, c.action_num).type(dtype).to(device),
        device,
        device,
    )
    ppo = PPO(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)], logger=logger)
    with pytest.raises(TypeError, match="missing .+ positional argument"):
        _ = GAIL(
            discriminator,
            ppo,
            t.optim.Adam,
            expert_replay_device="cpu",
            expert_replay_size=c.replay_size,
            lr_scheduler=LambdaLR,
        )
    gail = GAIL(
        discriminator,
        ppo,
        t.optim.Adam,
        expert_replay_device="cpu",
        expert_replay_size=c.replay_size,
        lr_scheduler=LambdaLR,
        lr_scheduler_args=((lr_func,),),
    )
    return gail
def maddpg_cont(self, train_config, device, dtype):
    c = train_config
    actor = smw(
        Actor(c.test_observe_dim, c.test_action_dim, c.test_action_range)
        .type(dtype)
        .to(device),
        device,
        device,
    )
    critic = smw(
        Critic(
            c.test_observe_dim * c.test_agent_num,
            c.test_action_dim * c.test_agent_num,
        )
        .type(dtype)
        .to(device),
        device,
        device,
    )
    maddpg = MADDPG(
        [deepcopy(actor) for _ in range(c.test_agent_num)],
        [deepcopy(actor) for _ in range(c.test_agent_num)],
        [deepcopy(critic) for _ in range(c.test_agent_num)],
        [deepcopy(critic) for _ in range(c.test_agent_num)],
        [list(range(c.test_agent_num))] * c.test_agent_num,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return maddpg
def test_criterion(self, train_config):
    c = train_config
    q_net = smw(QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
    q_net_t = smw(QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
    with pytest.raises(
        RuntimeError, match="Criterion does not have the 'reduction' property"
    ):
        # a bare callable lacks the "reduction" attribute the algorithm
        # requires, so construction must fail
        def criterion(a, b):
            return a - b

        _ = DQNPer(
            q_net,
            q_net_t,
            t.optim.Adam,
            criterion,
            replay_device="cpu",
            replay_size=c.replay_size,
        )
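
# By contrast, a criterion is accepted when it exposes a ``reduction``
# attribute, as nn.MSELoss does. A minimal hypothetical example (inferred
# from the error message asserted above, not from documented API):
class _ElementwiseLoss(nn.Module):
    """Sketch of a custom criterion shape usable with DQNPer."""

    def __init__(self):
        super().__init__()
        # presumably read by DQNPer so it can obtain per-sample losses
        # and apply prioritized-replay importance weights
        self.reduction = "none"

    def forward(self, a, b):
        return (a - b) ** 2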
def sac_train(self, train_config):
    c = train_config
    actor = smw(Actor(c.observe_dim, c.action_dim, c.action_range), "cpu", "cpu")
    critic = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
    critic_t = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
    critic2 = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
    critic2_t = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
    sac = SAC(
        actor,
        critic,
        critic_t,
        critic2,
        critic2_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return sac
def maddpg_train(self, train_config):
    c = train_config
    # For simplicity, the prey is trained together with the predators;
    # predators can observe the prey, and vice versa.
    actor = smw(ActorDiscrete(c.observe_dim, c.action_num), "cpu", "cpu")
    critic = smw(
        Critic(c.observe_dim * c.agent_num, c.action_num * c.agent_num),
        "cpu",
        "cpu",
    )
    # setting the visible indexes to [[0], [1], [2]] instead would be
    # equivalent to training three independent DDPG agents
    maddpg = MADDPG(
        [deepcopy(actor) for _ in range(3)],
        [deepcopy(actor) for _ in range(3)],
        [deepcopy(critic) for _ in range(3)],
        [deepcopy(critic) for _ in range(3)],
        [[0, 1, 2], [0, 1, 2], [0, 1, 2]],
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
        pool_type="thread",
    )
    return maddpg
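
# Hypothetical illustration of the visible-index argument used above:
# entry i lists which agents' observations/actions agent i's critic sees.
#   [[0, 1, 2]] * 3  -> every critic sees the full joint state/action
#   [[0], [1], [2]]  -> each critic sees only its own agent, reducing
#                       MADDPG to independent DDPG, as noted above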
def hddpg_vis(self, train_config, tmpdir):
    # not used for training, only used for testing apis
    c = train_config
    tmp_dir = tmpdir.make_numbered_dir()
    actor = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
        c.device,
        c.device,
    )
    actor_t = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
        c.device,
        c.device,
    )
    critic = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device
    )
    critic_t = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device
    )
    hddpg = HDDPG(
        actor,
        actor_t,
        critic,
        critic_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
        visualize=True,
        visualize_dir=str(tmp_dir),
    )
    return hddpg
def test_criterion(self, train_config):
    c = train_config
    actor = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
        c.device,
        c.device,
    )
    actor_t = smw(
        Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
        c.device,
        c.device,
    )
    critic = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device
    )
    critic_t = smw(
        Critic(c.observe_dim, c.action_dim).to(c.device), c.device, c.device
    )
    with pytest.raises(
        RuntimeError, match="Criterion does not have the 'reduction' property"
    ):

        def criterion(a, b):
            return a - b

        _ = DDPGPer(
            actor,
            actor_t,
            critic,
            critic_t,
            t.optim.Adam,
            criterion,
            replay_device="cpu",
            replay_size=c.replay_size,
        )
def ddpg_apex(device, dtype, discrete=False):
    c = TestDDPGApex.c
    if not discrete:
        actor = smw(
            Actor(c.observe_dim, c.action_dim, c.action_range)
            .type(dtype)
            .to(device),
            device,
            device,
        )
        actor_t = smw(
            Actor(c.observe_dim, c.action_dim, c.action_range)
            .type(dtype)
            .to(device),
            device,
            device,
        )
    else:
        actor = smw(
            ActorDiscrete(c.observe_dim, c.action_dim).type(dtype).to(device),
            device,
            device,
        )
        actor_t = smw(
            ActorDiscrete(c.observe_dim, c.action_dim).type(dtype).to(device),
            device,
            device,
        )
    critic = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
    )
    critic_t = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
    )
    servers = model_server_helper(model_num=2)
    world = get_world()
    # processes 0 and 1 will be workers, and 2 will be the trainer
    apex_group = world.create_rpc_group("worker", ["0", "1", "2"])
    ddpg_apex = DDPGApex(
        actor,
        actor_t,
        critic,
        critic_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        apex_group,
        servers,
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return ddpg_apex
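
# Note: model_num=2 above presumably reserves one model-server slot per
# network the workers synchronize (actor and critic), whereas the dqn_apex
# fixture only needs model_num=1 for its single Q-network. This is an
# inference from comparing the two fixtures, not documented behavior.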
def gail_train(self, train_config):
    c = train_config
    actor = smw(Actor(c.observe_dim, c.action_num), "cpu", "cpu")
    critic = smw(Critic(c.observe_dim), "cpu", "cpu")
    discriminator = smw(Discriminator(c.observe_dim, c.action_num), "cpu", "cpu")
    ppo = PPO(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    gail = GAIL(
        discriminator,
        ppo,
        t.optim.Adam,
        expert_replay_device="cpu",
        expert_replay_size=c.replay_size,
    )
    return gail