Example #1
 def train_config(self):
     disable_view_window()
     c = Config()
     # the cooperative environment provided in
     # https://github.com/openai/multiagent-particle-envs
     c.env_name = "simple_spread"
     c.env = create_env(c.env_name)
     c.env.discrete_action_input = True
     c.agent_num = 3
     c.action_num = c.env.action_space[0].n
     c.observe_dim = c.env.observation_space[0].shape[0]
     # for continuous-action tests
     c.test_action_dim = 5
     c.test_action_range = 1
     c.test_observe_dim = 5
     c.test_agent_num = 3
     c.max_episodes = 1000
     c.max_steps = 200
     c.replay_size = 100000
     # from https://github.com/wsjeon/maddpg-rllib/tree/master/plots
     # PROBLEM: I have no idea how they calculate the rewards
     # I cannot replicate their reward curve
     c.solved_reward = -15
     c.solved_repeat = 5
     return c
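
A short, hedged sketch of the assumption behind the indexing above: in the multiagent-particle-envs API, action_space and observation_space are per-agent lists, and the config only reads agent 0's spaces because the three agents in "simple_spread" are assumed to be homogeneous (create_env is the same helper used in the snippet).

env = create_env("simple_spread")
for i in range(3):
    # every agent is expected to expose the same discrete action count
    # and observation shape as agent 0
    assert env.action_space[i].n == env.action_space[0].n
    assert env.observation_space[i].shape == env.observation_space[0].shape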
Example #2
    def train_config(self, gpu):
        disable_view_window()
        c = Config()
        # Note: on-policy algorithms such as PPO and A2C do not
        # work well in Pendulum (reason unknown)
        # and MountainCarContinuous (sparse returns)
        c.env_name = "CartPole-v0"
        c.env = unwrap_time_limit(gym.make(c.env_name))
        c.observe_dim = 4
        c.action_num = 2
        # maximum and minimum of reward value
        # since the reward is 1 for every step, the maximum q value should be
        # below 20 (reward_future_steps) * (1 + discount ** n_steps) < 40
        c.value_max = 40
        c.value_min = 0
        c.reward_future_steps = 20
        c.max_episodes = 1000
        c.max_steps = 200
        c.replay_size = 100000

        # RAINBOW is not very stable (without dueling and noisy linear)
        # compared to other DQNs
        c.solved_reward = 180
        c.solved_repeat = 5
        c.device = gpu
        return c
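
A small, hedged sketch of the q-value bound described in the comment above; the discount factor of 0.99 is an assumption for illustration, not a value taken from the snippet.

def q_value_upper_bound(reward_future_steps=20, discount=0.99):
    # with a reward of 1 per step, the truncated n-step return is at most
    # the geometric sum of discount ** k for k < reward_future_steps (<= 20)
    n_step_return = sum(discount ** k for k in range(reward_future_steps))
    # adding a bootstrapped tail bounded by the same quantity gives the
    # "20 * (1 + discount ** n_steps) < 40" bound from the comment
    return n_step_return * (1 + discount ** reward_future_steps)

print(q_value_upper_bound())  # roughly 33, safely below c.value_max = 40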
Example #3
 def train_config(self):
     disable_view_window()
     c = Config()
     c.env_name = "Pendulum-v0"
     c.env = unwrap_time_limit(gym.make(c.env_name))
     c.observe_dim = 3
     c.action_dim = 1
     c.action_range = 2
     c.max_episodes = 1000
     c.max_steps = 200
     c.replay_size = 100000
     c.solved_reward = -400
     c.solved_repeat = 5
     return c
Example #4
 def train_config(self):
     disable_view_window()
     c = Config()
     # Note: on-policy algorithms such as PPO and A2C do not
     # work well in Pendulum (reason unknown)
     # and MountainCarContinuous (sparse returns)
     c.env_name = "CartPole-v0"
     c.env = unwrap_time_limit(gym.make(c.env_name))
     c.observe_dim = 4
     c.action_num = 2
     c.max_episodes = 2000  # the actor learns a little bit slower
     c.max_steps = 200
     c.replay_size = 10000
     c.solved_reward = 150
     c.solved_repeat = 5
     return c
Example #5
 def train_config(self, pytestconfig):
     disable_view_window()
     c = Config()
     c.env_name = "Pendulum-v0"
     c.env = unwrap_time_limit(gym.make(c.env_name))
     c.observe_dim = 3
     c.action_dim = 1
     c.action_range = 2
     c.max_episodes = 1000
     c.max_steps = 200
     c.noise_param = (0, 0.2)
     c.noise_mode = "normal"
     c.noise_interval = 2
     c.replay_size = 100000
     c.solved_reward = -150
     c.solved_repeat = 5
     c.device = "cpu"
     return c
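
A hedged illustration of how the (mean, std) pair in c.noise_param could be applied in "normal" mode; this is a stand-in sketch for intuition, not machin's actual act_with_noise implementation.

import torch as t

def add_normal_noise(action, noise_param=(0, 0.2), action_range=2):
    # sample Gaussian noise with the given mean and standard deviation,
    # then clamp the perturbed action back into the valid range
    mean, std = noise_param
    noise = t.randn_like(action) * std + mean
    return (action + noise).clamp(-action_range, action_range)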
Example #6
class TestIMPALA(object):
    # configs and definitions
    disable_view_window()
    c = Config()
    # Note: on-policy algorithms such as PPO and A3C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    c.max_episodes = 2000
    c.max_steps = 200
    c.replay_size = 10000
    c.solved_reward = 150
    c.solved_repeat = 5

    @staticmethod
    def impala(device, dtype, use_lr_sch=False):
        c = TestIMPALA.c
        actor = smw(
            Actor(c.observe_dim, c.action_num).type(dtype).to(device), device,
            device)
        critic = smw(
            Critic(c.observe_dim).type(dtype).to(device), device, device)
        servers = model_server_helper(model_num=1)
        world = get_world()
        # processes 0 and 1 will be workers, and 2 will be the trainer
        impala_group = world.create_rpc_group("impala", ["0", "1", "2"])

        if use_lr_sch:
            lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)],
                                             logger=default_logger)
            impala = IMPALA(actor,
                            critic,
                            t.optim.Adam,
                            nn.MSELoss(reduction='sum'),
                            impala_group,
                            servers,
                            lr_scheduler=LambdaLR,
                            lr_scheduler_args=((lr_func, ), (lr_func, )))
        else:
            impala = IMPALA(actor, critic, t.optim.Adam,
                            nn.MSELoss(reduction='sum'), impala_group, servers)
        return impala

    ########################################################################
    # Test for IMPALA acting
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["device", "dtype"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_act(_, device, dtype):
        c = TestIMPALA.c
        impala = TestIMPALA.impala(device, dtype)

        state = t.zeros([1, c.observe_dim], dtype=dtype)
        impala.act({"state": state})
        return True

    ########################################################################
    # Test for IMPALA action evaluation
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["device", "dtype"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_eval_action(_, device, dtype):
        c = TestIMPALA.c
        impala = TestIMPALA.impala(device, dtype)

        state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        impala._eval_act({"state": state}, {"action": action})
        return True

    ########################################################################
    # Test for IMPALA criticizing
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["device", "dtype"],
               timeout=180)
    @WorldTestBase.setup_world
    def test__criticize(_, device, dtype):
        c = TestIMPALA.c
        impala = TestIMPALA.impala(device, dtype)

        state = t.zeros([1, c.observe_dim], dtype=dtype)
        impala._criticize({"state": state})
        return True

    ########################################################################
    # Test for IMPALA storage
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["device", "dtype"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_store_step(_, device, dtype):
        c = TestIMPALA.c
        impala = TestIMPALA.impala(device, dtype)

        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)

        with pytest.raises(NotImplementedError):
            impala.store_transition({
                "state": {
                    "state": old_state
                },
                "action": {
                    "action": action
                },
                "next_state": {
                    "state": state
                },
                "reward": 0,
                "action_log_prob": 0.1,
                "terminal": False
            })
        return True

    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["device", "dtype"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_store_episode(_, device, dtype):
        c = TestIMPALA.c
        impala = TestIMPALA.impala(device, dtype)

        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        episode = [{
            "state": {
                "state": old_state
            },
            "action": {
                "action": action
            },
            "next_state": {
                "state": state
            },
            "reward": 0,
            "action_log_prob": 0.1,
            "terminal": False
        } for _ in range(3)]
        impala.store_episode(episode)
        return True

    ########################################################################
    # Test for IMPALA update
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["device", "dtype"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_update(rank, device, dtype):
        c = TestIMPALA.c
        impala = TestIMPALA.impala(device, dtype)

        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        if rank == 0:
            # episode length = 3
            impala.store_episode([{
                "state": {
                    "state": old_state
                },
                "action": {
                    "action": action
                },
                "next_state": {
                    "state": state
                },
                "reward": 0,
                "action_log_prob": 0.1,
                "terminal": False
            } for _ in range(3)])
        elif rank == 1:
            # episode length = 2
            impala.store_episode([{
                "state": {
                    "state": old_state
                },
                "action": {
                    "action": action
                },
                "next_state": {
                    "state": state
                },
                "reward": 0,
                "action_log_prob": 0.1,
                "terminal": False
            } for _ in range(2)])
        if rank == 2:
            sleep(2)
            impala.update(update_value=True,
                          update_target=True,
                          concatenate_samples=True)
        return True

    ########################################################################
    # Test for IMPALA save & load
    ########################################################################
    # Skipped, it is the same as the base framework

    ########################################################################
    # Test for IMPALA lr_scheduler
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["device", "dtype"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_lr_scheduler(_, device, dtype):
        impala = TestIMPALA.impala(device, dtype)

        impala.update_lr_scheduler()
        return True

    ########################################################################
    # Test for IMPALA full training.
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=1800)
    @WorldTestBase.setup_world
    def test_full_train(rank):
        c = TestIMPALA.c
        impala = TestIMPALA.impala("cpu", t.float32)

        # perform manual syncing to decrease the number of rpc calls
        impala.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        world = get_world()
        all_group = world.create_rpc_group("all", ["0", "1", "2"])
        all_group.pair("{}_running".format(rank), True)
        default_logger.info("{}, pid {}".format(rank, os.getpid()))
        if rank == 0:
            all_group.pair("episode", episode)

        if rank in (0, 1):
            while episode < c.max_episodes:
                # wait for trainer to keep up
                sleep(0.2)
                episode.count()

                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32)

                impala.manual_sync()
                tmp_observations = []
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        action, action_log_prob, *_ = impala.act(
                            {"state": old_state.unsqueeze(0)})
                        state, reward, terminal, _ = env.step(action.item())
                        state = t.tensor(state, dtype=t.float32).flatten()
                        total_reward += float(reward)

                        tmp_observations.append({
                            "state": {
                                "state": old_state.unsqueeze(0)
                            },
                            "action": {
                                "action": action
                            },
                            "next_state": {
                                "state": state.unsqueeze(0)
                            },
                            "reward":
                            float(reward),
                            "action_log_prob":
                            action_log_prob.item(),
                            "terminal":
                            terminal or step == c.max_steps
                        })
                impala.store_episode(tmp_observations)

                smoother.update(total_reward)
                step.reset()
                terminal = False

                default_logger.info("Process {} Episode {} "
                                    "total reward={:.2f}".format(
                                        rank, episode, smoother.value))

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")

                        all_group.unpair("{}_running".format(rank))
                        while (all_group.is_paired("0_running")
                               or all_group.is_paired("1_running")):
                            # wait for all workers to join
                            sleep(1)
                        # wait for trainer
                        sleep(5)
                        return True
                else:
                    reward_fulfilled.reset()
        else:
            # wait for some samples
            # Note: the number of entries in buffer means "episodes"
            # rather than steps here!
            while impala.replay_buffer.all_size() < 5:
                sleep(0.1)
            while (all_group.is_paired("0_running")
                   or all_group.is_paired("1_running")):
                impala.update()
            return True

        raise RuntimeError("IMPALA Training failed.")
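
For reference, a minimal sketch of the episode-entry schema the IMPALA tests above pass to store_episode; the keys are taken from the snippets themselves, the tensor values are placeholders, and the extra "action_log_prob" field (the behavior policy's log probability, needed for IMPALA's off-policy correction) is what distinguishes these entries from the plain DQN-style transitions in the later examples.

import torch as t

entry = {
    "state": {"state": t.zeros([1, 4])},                 # named observation tensors
    "action": {"action": t.zeros([1, 1], dtype=t.int)},  # named action tensors
    "next_state": {"state": t.zeros([1, 4])},            # same keys as "state"
    "reward": 0.0,
    "action_log_prob": 0.1,
    "terminal": False,
}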
Example #7
from machin.utils.logging import default_logger as logger
from torch.distributions import Categorical

import gym
import torch as t
import torch.nn as nn

from util import convert

# configurations
env = gym.make("Frostbite-ram-v0")
action_num = env.action_space.n
max_episodes = 20000

# disable view window in rendering
disable_view_window()


class RecurrentActor(nn.Module):
    def __init__(self, action_num):
        super(RecurrentActor, self).__init__()
        self.gru = nn.GRU(128, 256, batch_first=True)
        self.fc1 = nn.Linear(256, 256)
        self.fc2 = nn.Linear(256, action_num)

    def forward(self, mem, hidden, action=None):
        hidden = hidden.transpose(0, 1)
        a, hidden = self.gru(mem.unsqueeze(1), hidden)
        a = self.fc2(t.relu(self.fc1(t.relu(a.flatten(start_dim=1)))))
        probs = t.softmax(a, dim=1)
        dist = Categorical(probs=probs)
        # assumed completion: the original snippet is truncated here; sample a
        # new action, or evaluate the log probability of the one passed in
        act = action if action is not None else dist.sample()
        act_log_prob = dist.log_prob(act.flatten())
        act_entropy = dist.entropy()
        return act, act_log_prob, act_entropy, hidden.transpose(0, 1)
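
A quick usage sketch for the recurrent actor above; the shapes and the return signature follow the assumed completion of forward(), not a verified original.

actor = RecurrentActor(action_num)
mem = t.zeros([1, 128])          # one 128-dim RAM observation, batch size 1
hidden = t.zeros([1, 1, 256])    # (batch, num_layers, hidden_size) before transpose
action, log_prob, entropy, hidden = actor(mem, hidden)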
Example #8
class TestDQNApex(object):
    # configs and definitions
    disable_view_window()
    c = Config()
    # Note: on-policy algorithms such as PPO and A2C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    c.max_episodes = 2000
    c.max_steps = 200
    c.replay_size = 100000
    c.solved_reward = 190
    c.solved_repeat = 5

    @staticmethod
    def dqn_apex():
        c = TestDQNApex.c
        q_net = smw(
            QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
        q_net_t = smw(
            QNet(c.observe_dim, c.action_num).to(c.device), c.device, c.device)
        servers = model_server_helper(model_num=1)
        world = get_world()
        # processes 0 and 1 will be workers, and 2 will be the trainer
        apex_group = world.create_rpc_group("apex", ["0", "1", "2"])
        dqn_apex = DQNApex(q_net,
                           q_net_t,
                           t.optim.Adam,
                           nn.MSELoss(reduction='sum'),
                           apex_group,
                           servers,
                           replay_device="cpu",
                           replay_size=c.replay_size)
        return dqn_apex

    ########################################################################
    # Test for DQNApex acting
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["gpu"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_act(_, gpu):
        c = TestDQNApex.c
        c.device = gpu
        dqn_apex = TestDQNApex.dqn_apex()
        state = t.zeros([1, c.observe_dim])
        dqn_apex.act_discrete({"state": state})
        dqn_apex.act_discrete({"state": state}, True)
        dqn_apex.act_discrete_with_noise({"state": state})
        dqn_apex.act_discrete_with_noise({"state": state}, True)
        return True

    ########################################################################
    # Test for DQNApex criticizing
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["gpu"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_criticize(_, gpu):
        c = TestDQNApex.c
        c.device = gpu
        dqn_apex = TestDQNApex.dqn_apex()
        state = t.zeros([1, c.observe_dim])
        dqn_apex._criticize({"state": state})
        dqn_apex._criticize({"state": state}, True)
        return True

    ########################################################################
    # Test for DQNApex storage
    ########################################################################
    # Skipped, it is the same as DQN

    ########################################################################
    # Test for DQNApex update
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["gpu"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_update(rank, gpu):
        c = TestDQNApex.c
        c.device = gpu
        dqn_apex = TestDQNApex.dqn_apex()
        old_state = state = t.zeros([1, c.observe_dim])
        action = t.zeros([1, 1], dtype=t.int)
        if rank in (0, 1):
            dqn_apex.store_episode([{
                "state": {
                    "state": old_state
                },
                "action": {
                    "action": action
                },
                "next_state": {
                    "state": state
                },
                "reward": 0,
                "terminal": False
            } for _ in range(3)])
            dqn_apex.manual_sync()
        if rank == 2:
            sleep(2)
            dqn_apex.update(update_value=True,
                            update_target=True,
                            concatenate_samples=True)
        return True

    ########################################################################
    # Test for DQNApex save & load
    ########################################################################
    # Skipped, it is the same as DQN

    ########################################################################
    # Test for DQNApex lr_scheduler
    ########################################################################
    # Skipped, it is the same as DQN

    ########################################################################
    # Test for DQNApex full training.
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["gpu"],
               timeout=1800)
    @WorldTestBase.setup_world
    def test_full_train(rank, gpu):
        c = TestDQNApex.c
        c.device = gpu
        dqn_apex = TestDQNApex.dqn_apex()
        # perform manual syncing to decrease the number of rpc calls
        dqn_apex.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        world = get_world()
        all_group = world.create_rpc_group("all", ["0", "1", "2"])
        all_group.pair("{}_running".format(rank), True)

        if rank in (0, 1):
            while episode < c.max_episodes:
                # wait for trainer to keep up
                sleep(0.2)
                episode.count()

                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32, device=c.device)

                dqn_apex.manual_sync()
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        # agent model inference
                        action = dqn_apex.act_discrete_with_noise(
                            {"state": old_state.unsqueeze(0)})
                        state, reward, terminal, _ = env.step(action.item())
                        state = t.tensor(state,
                                         dtype=t.float32,
                                         device=c.device).flatten()
                        total_reward += float(reward)

                        dqn_apex.store_transition({
                            "state": {
                                "state": old_state.unsqueeze(0)
                            },
                            "action": {
                                "action": action
                            },
                            "next_state": {
                                "state": state.unsqueeze(0)
                            },
                            "reward":
                            float(reward),
                            "terminal":
                            terminal or step == c.max_steps
                        })
                smoother.update(total_reward)
                step.reset()
                terminal = False

                default_logger.info(
                    "Process {} Episode {} total reward={:.2f}".format(
                        rank, episode, smoother.value))

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")

                        all_group.unpair("{}_running".format(rank))
                        while (all_group.is_paired("0_running")
                               or all_group.is_paired("1_running")):
                            # wait for all workers to join
                            sleep(1)
                        # wait for trainer
                        sleep(5)
                        return True
                else:
                    reward_fulfilled.reset()
        else:
            # wait for some samples
            while dqn_apex.replay_buffer.all_size() < 500:
                sleep(0.1)
            while (all_group.is_paired("0_running")
                   or all_group.is_paired("1_running")):
                dqn_apex.update()
            return True

        raise RuntimeError("DQN-Apex Training failed.")
Example #9
class TestDDPGApex(object):
    # configs and definitions
    disable_view_window()
    c = Config()
    c.env_name = "Pendulum-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 3
    c.action_dim = 1
    c.action_range = 2
    c.max_episodes = 2000
    c.max_steps = 200
    c.noise_param = (0, 0.2)
    c.noise_mode = "normal"
    c.replay_size = 100000
    # takes too many computing resources
    # lower the standard for faster validation
    c.solved_reward = -300
    c.solved_repeat = 5

    @staticmethod
    def ddpg_apex(discrete=False):
        c = TestDDPGApex.c
        if not discrete:
            actor = smw(
                Actor(c.observe_dim, c.action_dim,
                      c.action_range).to(c.device), c.device, c.device)
            actor_t = smw(
                Actor(c.observe_dim, c.action_dim,
                      c.action_range).to(c.device), c.device, c.device)
        else:
            actor = smw(
                ActorDiscrete(c.observe_dim, c.action_dim).to(c.device),
                c.device, c.device)
            actor_t = smw(
                ActorDiscrete(c.observe_dim, c.action_dim).to(c.device),
                c.device, c.device)
        critic = smw(
            Critic(c.observe_dim, c.action_dim).to(c.device), c.device,
            c.device)
        critic_t = smw(
            Critic(c.observe_dim, c.action_dim).to(c.device), c.device,
            c.device)

        servers = model_server_helper(model_num=2)
        world = get_world()
        # processes 0 and 1 will be workers, and 2 will be the trainer
        apex_group = world.create_rpc_group("worker", ["0", "1", "2"])
        ddpg_apex = DDPGApex(actor,
                             actor_t,
                             critic,
                             critic_t,
                             t.optim.Adam,
                             nn.MSELoss(reduction='sum'),
                             apex_group,
                             servers,
                             replay_device="cpu",
                             replay_size=c.replay_size)
        return ddpg_apex

    ########################################################################
    # Test for DDPGApex continuous domain acting
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["gpu"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_contiguous_act(_, gpu):
        c = TestDDPGApex.c
        c.device = gpu
        ddpg_apex = TestDDPGApex.ddpg_apex()
        state = t.zeros([1, c.observe_dim])
        ddpg_apex.act({"state": state})
        ddpg_apex.act({"state": state}, use_target=True)
        ddpg_apex.act_with_noise({"state": state},
                                 noise_param=(0, 1.0),
                                 mode="uniform")
        ddpg_apex.act_with_noise({"state": state},
                                 noise_param=(0, 1.0),
                                 mode="uniform",
                                 use_target=True)
        return True

    ########################################################################
    # Test for DDPGApex discrete domain acting
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["gpu"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_discrete_act(_, gpu):
        c = TestDDPGApex.c
        c.device = gpu
        ddpg_apex = TestDDPGApex.ddpg_apex(discrete=True)
        state = t.zeros([1, c.observe_dim])
        ddpg_apex.act_discrete({"state": state})
        ddpg_apex.act_discrete({"state": state}, use_target=True)
        ddpg_apex.act_discrete_with_noise({"state": state})
        ddpg_apex.act_discrete_with_noise({"state": state}, use_target=True)
        return True

    ########################################################################
    # Test for DDPGApex criticizing
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["gpu"],
               timeout=180)
    @WorldTestBase.setup_world
    def test__criticize(_, gpu):
        c = TestDDPGApex.c
        c.device = gpu
        ddpg_apex = TestDDPGApex.ddpg_apex()
        state = t.zeros([1, c.observe_dim])
        action = t.zeros([1, c.action_dim])
        ddpg_apex._criticize({"state": state}, {"action": action})
        ddpg_apex._criticize({"state": state}, {"action": action},
                             use_target=True)
        return True

    ########################################################################
    # Test for DDPGApex storage
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex update
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["gpu"],
               timeout=180)
    @WorldTestBase.setup_world
    def test_update(rank, gpu):
        c = TestDDPGApex.c
        c.device = gpu
        ddpg_apex = TestDDPGApex.ddpg_apex()
        old_state = state = t.zeros([1, c.observe_dim])
        action = t.zeros([1, c.action_dim])
        if rank in (0, 1):
            ddpg_apex.store_transition({
                "state": {
                    "state": old_state
                },
                "action": {
                    "action": action
                },
                "next_state": {
                    "state": state
                },
                "reward": 0,
                "terminal": False
            })
            sleep(5)
            ddpg_apex.manual_sync()
        if rank == 2:
            sleep(2)
            ddpg_apex.update(update_value=True,
                             update_policy=True,
                             update_target=True,
                             concatenate_samples=True)
        return True

    ########################################################################
    # Test for DDPGApex save & load
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex lr_scheduler
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex full training.
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True],
               pass_through=["gpu"],
               timeout=1800)
    @WorldTestBase.setup_world
    def test_full_train(rank, gpu):
        c = TestDDPGApex.c
        c.device = gpu
        ddpg_apex = TestDDPGApex.ddpg_apex()
        # perform manual syncing to decrease the number of rpc calls
        ddpg_apex.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        world = get_world()
        all_group = world.create_rpc_group("all", ["0", "1", "2"])
        all_group.pair("{}_running".format(rank), True)
        default_logger.info("{}, pid {}".format(rank, os.getpid()))
        if rank == 0:
            all_group.pair("episode", episode)

        if rank in (0, 1):
            while episode < c.max_episodes:
                # wait for trainer to keep up
                sleep(0.2)
                episode.count()

                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32, device=c.device)

                ddpg_apex.manual_sync()
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        action = ddpg_apex.act_with_noise(
                            {"state": old_state.unsqueeze(0)},
                            noise_param=c.noise_param,
                            mode=c.noise_mode)

                        state, reward, terminal, _ = env.step(
                            action.cpu().numpy())
                        state = t.tensor(state,
                                         dtype=t.float32,
                                         device=c.device).flatten()
                        total_reward += float(reward)

                        ddpg_apex.store_transition({
                            "state": {
                                "state": old_state.unsqueeze(0)
                            },
                            "action": {
                                "action": action
                            },
                            "next_state": {
                                "state": state.unsqueeze(0)
                            },
                            "reward":
                            float(reward),
                            "terminal":
                            terminal or step == c.max_steps
                        })

                smoother.update(total_reward)
                step.reset()
                terminal = False

                default_logger.info("Process {} Episode {} "
                                    "total reward={:.2f}".format(
                                        rank, episode, smoother.value))

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")

                        all_group.unpair("{}_running".format(rank))
                        while (all_group.is_paired("0_running")
                               or all_group.is_paired("1_running")):
                            # wait for all workers to join
                            sleep(1)
                        # wait for trainer
                        sleep(5)
                        return True
                else:
                    reward_fulfilled.reset()
        else:
            # wait for some samples
            while ddpg_apex.replay_buffer.all_size() < 500:
                sleep(0.1)
            while (all_group.is_paired("0_running")
                   or all_group.is_paired("1_running")):
                ddpg_apex.update()
            return True

        raise RuntimeError("DDPG-Apex Training failed.")
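
The Smooth helper used in the training loops above acts as a running average of episode rewards; a minimal stand-in with the same interface (update() and .value), assuming exponential smoothing, is sketched below. This is not machin's actual implementation.

class Smooth:
    def __init__(self, factor=0.9):
        self.factor = factor
        self.value = 0.0

    def update(self, new_value):
        # exponentially weighted moving average of incoming episode rewards
        self.value = self.factor * self.value + (1 - self.factor) * new_value
        return self.value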
Example #10
class TestIMPALA:
    # configs and definitions
    disable_view_window()
    c = Config()
    # Note: on-policy algorithms such as PPO and A3C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    c.max_episodes = 20000
    c.max_steps = 200
    c.replay_size = 10000
    c.solved_reward = 150
    c.solved_repeat = 5

    @staticmethod
    def impala(device, dtype, use_lr_sch=False):
        c = TestIMPALA.c
        actor = smw(
            Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device
        )
        critic = smw(Critic(c.observe_dim).type(dtype).to(device), device, device)
        servers = model_server_helper(model_num=1)
        world = get_world()
        # processes 0 and 1 will be workers, and 2 will be the trainer
        impala_group = world.create_rpc_group("impala", ["0", "1", "2"])

        if use_lr_sch:
            lr_func = gen_learning_rate_func(
                [(0, 1e-3), (200000, 3e-4)], logger=default_logger
            )
            impala = IMPALA(
                actor,
                critic,
                t.optim.Adam,
                nn.MSELoss(reduction="sum"),
                impala_group,
                servers,
                lr_scheduler=LambdaLR,
                lr_scheduler_args=((lr_func,), (lr_func,)),
            )
        else:
            impala = IMPALA(
                actor,
                critic,
                t.optim.Adam,
                nn.MSELoss(reduction="sum"),
                impala_group,
                servers,
            )
        return impala

    ########################################################################
    # Test for IMPALA acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @setup_world
    def test_act(_, device, dtype):
        c = TestIMPALA.c
        impala = TestIMPALA.impala(device, dtype)

        state = t.zeros([1, c.observe_dim], dtype=dtype)
        impala.act({"state": state})
        return True

    ########################################################################
    # Test for IMPALA action evaluation
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @setup_world
    def test_eval_action(_, device, dtype):
        c = TestIMPALA.c
        impala = TestIMPALA.impala(device, dtype)

        state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        impala._eval_act({"state": state}, {"action": action})
        return True

    ########################################################################
    # Test for IMPALA criticizing
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @setup_world
    def test__criticize(_, device, dtype):
        c = TestIMPALA.c
        impala = TestIMPALA.impala(device, dtype)

        state = t.zeros([1, c.observe_dim], dtype=dtype)
        impala._criticize({"state": state})
        return True

    ########################################################################
    # Test for IMPALA storage
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @setup_world
    def test_store_episode(_, device, dtype):
        c = TestIMPALA.c
        impala = TestIMPALA.impala(device, dtype)

        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        episode = [
            {
                "state": {"state": old_state},
                "action": {"action": action},
                "next_state": {"state": state},
                "reward": 0,
                "action_log_prob": 0.1,
                "terminal": False,
            }
            for _ in range(3)
        ]
        impala.store_episode(episode)
        return True

    ########################################################################
    # Test for IMPALA update
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @setup_world
    def test_update(rank, device, dtype):
        c = TestIMPALA.c
        impala = TestIMPALA.impala(device, dtype)

        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        if rank == 0:
            # episode length = 3
            impala.store_episode(
                [
                    {
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": 0,
                        "action_log_prob": 0.1,
                        "terminal": False,
                    }
                    for _ in range(3)
                ]
            )
        elif rank == 1:
            # episode length = 2
            impala.store_episode(
                [
                    {
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": 0,
                        "action_log_prob": 0.1,
                        "terminal": False,
                    }
                    for _ in range(2)
                ]
            )
        if rank == 2:
            sleep(2)
            impala.update(
                update_value=True, update_target=True, concatenate_samples=True
            )
        return True

    ########################################################################
    # Test for IMPALA save & load
    ########################################################################
    # Skipped, it is the same as the base framework

    ########################################################################
    # Test for IMPALA lr_scheduler
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @setup_world
    def test_lr_scheduler(_, device, dtype):
        impala = TestIMPALA.impala(device, dtype)

        impala.update_lr_scheduler()
        return True

    ########################################################################
    # Test for IMPALA config & init
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=180)
    @setup_world
    def test_config_init(rank):
        c = TestIMPALA.c
        config = IMPALA.generate_config({})
        config["frame_config"]["models"] = ["Actor", "Critic"]
        config["frame_config"]["model_kwargs"] = [
            {"state_dim": c.observe_dim, "action_num": c.action_num},
            {"state_dim": c.observe_dim},
        ]
        impala = IMPALA.init_from_config(config)

        old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32)
        action = t.zeros([1, 1], dtype=t.int)

        if rank == 0:
            # episode length = 3
            impala.store_episode(
                [
                    {
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": 0,
                        "action_log_prob": 0.1,
                        "terminal": False,
                    }
                    for _ in range(3)
                ]
            )
        elif rank == 1:
            # episode length = 2
            impala.store_episode(
                [
                    {
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": 0,
                        "action_log_prob": 0.1,
                        "terminal": False,
                    }
                    for _ in range(2)
                ]
            )
        if rank == 2:
            sleep(2)
            impala.update(
                update_value=True, update_target=True, concatenate_samples=True
            )
        return True

    ########################################################################
    # Test for IMPALA full training.
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=1800)
    @setup_world
    def test_full_train(rank):
        training_group = get_world().create_rpc_group("training", ["0", "1", "2"])

        c = TestIMPALA.c
        impala = TestIMPALA.impala("cpu", t.float32)

        # perform manual syncing to decrease the number of rpc calls
        impala.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False
        env = c.env
        env.seed(rank)

        # make sure all things are initialized.
        training_group.barrier()

        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")

        while episode < c.max_episodes:
            episode.count()

            if rank in (0, 1):
                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32)

                impala.manual_sync()
                tmp_observations = []
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        action, action_log_prob, *_ = impala.act(
                            {"state": old_state.unsqueeze(0)}
                        )
                        state, reward, terminal, _ = env.step(action.item())
                        state = t.tensor(state, dtype=t.float32).flatten()
                        total_reward += float(reward)

                        tmp_observations.append(
                            {
                                "state": {"state": old_state.unsqueeze(0)},
                                "action": {"action": action},
                                "next_state": {"state": state.unsqueeze(0)},
                                "reward": float(reward),
                                "action_log_prob": action_log_prob.item(),
                                "terminal": terminal or step == c.max_steps,
                            }
                        )
                impala.store_episode(tmp_observations)

                smoother.update(total_reward)
                step.reset()
                terminal = False

                default_logger.info(
                    "Process {} Episode {} "
                    "total reward={:.2f}".format(rank, episode, smoother.value)
                )

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")
                        try:
                            training_group.pair("solved", True)
                        except KeyError:
                            # already solved in another process
                            pass
                else:
                    reward_fulfilled.reset()
            else:
                # wait for some samples
                if episode.get() > 200:
                    for _ in range(100):
                        impala.update()
                    default_logger.info("Updated 100 times.")

            training_group.barrier()
            if training_group.is_paired("solved"):
                return True

        raise RuntimeError("IMPALA Training failed.")
Example #11
class TestA3C:
    # configs and definitions
    disable_view_window()
    c = Config()
    # Note: on-policy algorithms such as PPO and A3C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    c.max_episodes = 3000
    c.max_steps = 200
    c.replay_size = 10000
    c.solved_reward = 150
    c.solved_repeat = 5

    @staticmethod
    def a3c(device, dtype):
        c = TestA3C.c
        actor = smw(
            Actor(c.observe_dim, c.action_num).type(dtype).to(device), device,
            device)
        critic = smw(
            Critic(c.observe_dim).type(dtype).to(device), device, device)
        # in all test scenarios, all processes will be used as reducers
        servers = grad_server_helper(
            [
                lambda: Actor(c.observe_dim, c.action_num),
                lambda: Critic(c.observe_dim)
            ],
            learning_rate=5e-3,
        )
        a3c = A3C(
            actor,
            critic,
            nn.MSELoss(reduction="sum"),
            servers,
            replay_device="cpu",
            replay_size=c.replay_size,
        )
        return a3c

    ########################################################################
    # Test for A3C acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_act(_, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        a3c.act({"state": state})
        return True

    ########################################################################
    # Test for A3C action evaluation
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_eval_action(_, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        a3c._eval_act({"state": state}, {"action": action})
        return True

    ########################################################################
    # Test for A3C criticizing
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test__criticize(_, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        a3c._criticize({"state": state})
        return True

    ########################################################################
    # Test for A3C storage
    ########################################################################
    # Skipped, it is the same as A2C

    ########################################################################
    # Test for A3C update
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_update(rank, device, dtype):
        c = TestA3C.c
        a3c = TestA3C.a3c(device, dtype)
        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)

        begin = time()
        while time() - begin < 5:
            a3c.store_episode([{
                "state": {
                    "state": old_state
                },
                "action": {
                    "action": action
                },
                "next_state": {
                    "state": state
                },
                "reward": 0,
                "terminal": False,
            } for _ in range(3)])
            a3c.update(
                update_value=True,
                update_policy=True,
                update_target=True,
                concatenate_samples=True,
            )
            sleep(0.01)

        if rank == 1:
            # pull the newest model
            a3c.manual_sync()
        return True

    ########################################################################
    # Test for A3C save & load
    ########################################################################
    # Skipped, it is the same as A2C

    ########################################################################
    # Test for A3C lr_scheduler
    ########################################################################
    # Skipped, it is the same as A2C

    ########################################################################
    # Test for A3C config & init
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=180)
    @WorldTestBase.setup_world
    def test_config_init(rank):
        c = TestA3C.c
        config = A3C.generate_config({})
        config["frame_config"]["models"] = ["Actor", "Critic"]
        config["frame_config"]["model_kwargs"] = [
            {
                "state_dim": c.observe_dim,
                "action_num": c.action_num
            },
            {
                "state_dim": c.observe_dim
            },
        ]
        a3c = A3C.init_from_config(config)

        old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32)
        action = t.zeros([1, 1], dtype=t.int)

        begin = time()
        while time() - begin < 5:
            a3c.store_episode([{
                "state": {
                    "state": old_state
                },
                "action": {
                    "action": action
                },
                "next_state": {
                    "state": state
                },
                "reward": 0,
                "terminal": False,
            } for _ in range(3)])
            a3c.update()
            sleep(0.01)

        if rank == 1:
            # pull the newest model
            a3c.manual_sync()

        return True

    ########################################################################
    # Test for A3C full training.
    ########################################################################
    @staticmethod
    @pytest.mark.parametrize("gae_lambda", [0.0, 0.5, 1.0])
    @run_multi(expected_results=[True, True, True],
               pass_through=["gae_lambda"],
               timeout=1800)
    @WorldTestBase.setup_world
    def test_full_train(rank, gae_lambda):
        c = TestA3C.c
        a3c = TestA3C.a3c("cpu", t.float32)
        a3c.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")
        while episode < c.max_episodes:
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            a3c.manual_sync()
            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = a3c.act({"state": old_state.unsqueeze(0)})[0]
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append({
                        "state": {
                            "state": old_state.unsqueeze(0)
                        },
                        "action": {
                            "action": action
                        },
                        "next_state": {
                            "state": state.unsqueeze(0)
                        },
                        "reward":
                        float(reward),
                        "terminal":
                        terminal or step == c.max_steps,
                    })

            # update
            a3c.store_episode(tmp_observations)
            a3c.update()

            smoother.update(total_reward)
            step.reset()
            terminal = False

            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")
                    return True
            else:
                reward_fulfilled.reset()

        raise RuntimeError("A3C Training failed.")
Exemple #12
0
class TestDDPGApex:
    # configs and definitions
    disable_view_window()
    c = Config()
    # Use CartPole-v0 instead, since Pendulum training is too slow on the test machine.
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    # use action_dim instead of action_num, since DDPG treats the action
    # as a continuous vector even in discrete mode
    c.action_dim = 2
    # action_range is only used in continuous mode tests
    c.action_range = 1
    c.max_episodes = 20000
    c.max_steps = 200
    c.replay_size = 100000
    c.solved_reward = 150
    c.solved_repeat = 5

    @staticmethod
    def ddpg_apex(device, dtype, discrete=False):
        c = TestDDPGApex.c
        if not discrete:
            actor = smw(
                Actor(c.observe_dim, c.action_dim,
                      c.action_range).type(dtype).to(device),
                device,
                device,
            )
            actor_t = smw(
                Actor(c.observe_dim, c.action_dim,
                      c.action_range).type(dtype).to(device),
                device,
                device,
            )
        else:
            actor = smw(
                ActorDiscrete(c.observe_dim,
                              c.action_dim).type(dtype).to(device),
                device,
                device,
            )
            actor_t = smw(
                ActorDiscrete(c.observe_dim,
                              c.action_dim).type(dtype).to(device),
                device,
                device,
            )
        critic = smw(
            Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device,
            device)
        critic_t = smw(
            Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device,
            device)

        servers = model_server_helper(model_num=2)
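        # model_server_helper creates the model server handles used to push
        # and pull models between workers and the trainer; model_num=2
        # presumably reserves one server slot per synchronized model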
        world = get_world()
        # process 0 and 1 will be workers, and 2 will be trainer
        apex_group = world.create_rpc_group("worker", ["0", "1", "2"])
        ddpg_apex = DDPGApex(
            actor,
            actor_t,
            critic,
            critic_t,
            t.optim.Adam,
            nn.MSELoss(reduction="sum"),
            apex_group,
            servers,
            replay_device="cpu",
            replay_size=c.replay_size,
        )
        return ddpg_apex

    ########################################################################
    # Test for DDPGApex contiguous domain acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_contiguous_act(_, device, dtype):
        c = TestDDPGApex.c
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        ddpg_apex.act({"state": state})
        ddpg_apex.act({"state": state}, use_target=True)
        ddpg_apex.act_with_noise({"state": state},
                                 noise_param=(0, 1.0),
                                 mode="uniform")
        ddpg_apex.act_with_noise({"state": state},
                                 noise_param=(0, 1.0),
                                 mode="uniform",
                                 use_target=True)
        return True

    ########################################################################
    # Test for DDPGApex discrete domain acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_discrete_act(_, device, dtype):
        c = TestDDPGApex.c
        c.device = device
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype, discrete=True)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        ddpg_apex.act_discrete({"state": state})
        ddpg_apex.act_discrete({"state": state}, use_target=True)
        ddpg_apex.act_discrete_with_noise({"state": state})
        ddpg_apex.act_discrete_with_noise({"state": state}, use_target=True)
        return True

    ########################################################################
    # Test for DDPGApex criticizing
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test__criticize(_, device, dtype):
        c = TestDDPGApex.c
        c.device = device
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, c.action_dim], dtype=dtype)
        ddpg_apex._criticize({"state": state}, {"action": action})
        ddpg_apex._criticize({"state": state}, {"action": action},
                             use_target=True)
        return True

    ########################################################################
    # Test for DDPGApex storage
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex update
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_update(rank, device, dtype):
        c = TestDDPGApex.c
        c.device = device
        ddpg_apex = TestDDPGApex.ddpg_apex(device, dtype)
        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, c.action_dim], dtype=dtype)
        if rank in (0, 1):
            ddpg_apex.store_transition({
                "state": {
                    "state": old_state
                },
                "action": {
                    "action": action
                },
                "next_state": {
                    "state": state
                },
                "reward": 0,
                "terminal": False,
            })
            sleep(5)
            ddpg_apex.manual_sync()
        if rank == 2:
            sleep(2)
            ddpg_apex.update(
                update_value=True,
                update_policy=True,
                update_target=True,
                concatenate_samples=True,
            )
        return True

    ########################################################################
    # Test for DDPGApex save & load
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex lr_scheduler
    ########################################################################
    # Skipped, it is the same as DDPG

    ########################################################################
    # Test for DDPGApex config & init
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=180)
    @WorldTestBase.setup_world
    def test_config_init(rank):
        c = TestDDPGApex.c
        config = DDPGApex.generate_config({})
        config["frame_config"]["models"] = [
            "Actor", "Actor", "Critic", "Critic"
        ]
        config["frame_config"][
            "model_kwargs"] = [{
                "state_dim": c.observe_dim,
                "action_dim": c.action_dim,
                "action_range": c.action_range,
            }] * 2 + [{
                "state_dim": c.observe_dim,
                "action_dim": c.action_dim
            }] * 2
        ddpg_apex = DDPGApex.init_from_config(config)

        old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32)
        action = t.zeros([1, c.action_dim], dtype=t.float32)
        if rank in (1, 2):
            ddpg_apex.store_transition({
                "state": {
                    "state": old_state
                },
                "action": {
                    "action": action
                },
                "next_state": {
                    "state": state
                },
                "reward": 0,
                "terminal": False,
            })
            sleep(5)
            ddpg_apex.manual_sync()
        if rank == 0:
            sleep(2)
            ddpg_apex.update()

        return True

    ########################################################################
    # Test for DDPGApex full training.
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=1800)
    @WorldTestBase.setup_world
    def test_full_train(rank):
        c = TestDDPGApex.c
        ddpg_apex = TestDDPGApex.ddpg_apex("cpu", t.float32, discrete=True)
        # perform manual syncing to decrease the number of rpc calls
        ddpg_apex.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        avg_step = Smooth()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        world = get_world()
        all_group = world.create_rpc_group("all", ["0", "1", "2"])
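        # every process pairs a "<rank>_running" flag below; the trainer keeps
        # updating until both worker flags ("0_running" and "1_running") have
        # been unpaired, which signals that the environment is solved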
        all_group.pair(f"{rank}_running", True)
        default_logger.info(f"{rank}, pid {os.getpid()}")
        if rank == 0:
            all_group.pair("episode", episode)

        if rank in (0, 1):
            while episode < c.max_episodes:
                # wait for trainer to keep up
                sleep(0.2)
                episode.count()

                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32)

                ddpg_apex.manual_sync()
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        action, probs = ddpg_apex.act_discrete_with_noise(
                            {"state": old_state.unsqueeze(0)})

                        state, reward, terminal, _ = env.step(
                            action.cpu().item())
                        state = t.tensor(state, dtype=t.float32).flatten()
                        total_reward += float(reward)

                        ddpg_apex.store_transition({
                            "state": {
                                "state": old_state.unsqueeze(0)
                            },
                            "action": {
                                "action": probs
                            },
                            "next_state": {
                                "state": state.unsqueeze(0)
                            },
                            "reward":
                            float(reward),
                            "terminal":
                            terminal or step == c.max_steps,
                        })

                smoother.update(total_reward)
                avg_step.update(step.get())
                step.reset()
                terminal = False

                default_logger.info("Process {} Episode {} "
                                    "total reward={:.2f}".format(
                                        rank, episode, smoother.value))

                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")

                        all_group.unpair(f"{rank}_running")
                        while all_group.is_paired(
                                "0_running") or all_group.is_paired(
                                    "1_running"):
                            # wait for all workers to join
                            sleep(1)
                        # wait for trainer
                        sleep(5)
                        return True
                else:
                    reward_fulfilled.reset()
        else:
            # wait for some samples
            while ddpg_apex.replay_buffer.all_size() < 500:
                sleep(0.1)
            while all_group.is_paired("0_running") or all_group.is_paired(
                    "1_running"):
                ddpg_apex.update()
                default_logger.info(f"Updated")
            return True

        raise RuntimeError("DDPG-Apex Training failed.")
Exemple #13
0
class TestARS:
    # configs and definitions
    # CartPole-v0 can be solved:
    #   within 200 episodes using a single-layer Actor
    #   within 400 episodes using a double-layer Actor

    # However, ARS fails to solve Pendulum-v0:
    #   Actor((st, 16)->(16, a)), noise_std=0.01, lr=0.05, rollout=9, optim=Adam
    #   reaches a mean score of -700 at 10000 episodes
    #   Actor((st, a)), noise_std=0.01, lr=0.05, rollout=9, optim=Adam
    #   reaches a mean score of -1100 at 15000 episodes
    # and the Adam optimizer works better than SGD here
    disable_view_window()
    c = Config()
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    c.max_episodes = 1000
    c.max_steps = 200
    c.solved_reward = 150
    c.solved_repeat = 5

    @staticmethod
    def ars(device, dtype):
        c = TestARS.c
        actor = smw(
            ActorDiscrete(c.observe_dim, c.action_num).type(dtype).to(device),
            device,
            device,
        )
        servers = model_server_helper(model_num=1)
        world = get_world()
        ars_group = world.create_rpc_group("ars", ["0", "1", "2"])
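        # ARS evaluates randomly perturbed copies of the actor and uses their
        # rollout rewards to estimate an update direction; the tests below
        # iterate over these copies via get_actor_types(). rollout_num /
        # used_rollout_num are assumed to control how many perturbations are
        # generated and how many of the best are used per update.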
        ars = ARS(
            actor,
            t.optim.SGD,
            ars_group,
            servers,
            noise_std_dev=0.1,
            learning_rate=0.1,
            noise_size=1000000,
            rollout_num=6,
            used_rollout_num=6,
            normalize_state=True,
        )
        return ars

    @staticmethod
    def ars_lr(device, dtype):
        c = TestARS.c
        actor = smw(
            ActorDiscrete(c.observe_dim, c.action_num).type(dtype).to(device),
            device,
            device,
        )
        lr_func = gen_learning_rate_func([(0, 1e-3), (200000, 3e-4)],
                                         logger=default_logger)
        servers = model_server_helper(model_num=1)
        world = get_world()
        ars_group = world.create_rpc_group("ars", ["0", "1", "2"])
        ars = ARS(
            actor,
            t.optim.SGD,
            ars_group,
            servers,
            noise_size=1000000,
            lr_scheduler=LambdaLR,
            lr_scheduler_args=((lr_func, ), ),
        )
        return ars

    ########################################################################
    # Test for ARS acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_act(_, device, dtype):
        c = TestARS.c
        ars = TestARS.ars(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        ars.act({"state": state}, "original")
        ars.act({"state": state}, ars.get_actor_types()[0])
        with pytest.raises(ValueError):
            ars.act({"state": state}, "some_invalid_actor_type")
        return True

    ########################################################################
    # Test for ARS storage
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_store_reward(_, device, dtype):
        ars = TestARS.ars(device, dtype)
        ars.store_reward(0.0, ars.get_actor_types()[0])
        with pytest.raises(ValueError):
            ars.store_reward(1.0, "some_invalid_actor_type")
        return True

    ########################################################################
    # Test for ARS update
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_update(_, device, dtype):
        c = TestARS.c
        ars = TestARS.ars(device, dtype)
        for at in ars.get_actor_types():
            # get action will cause filters to initialize
            _action = ars.act(
                {"state": t.zeros([1, c.observe_dim], dtype=dtype)}, at)
            if at.startswith("neg"):
                ars.store_reward(1.0, at)
            else:
                ars.store_reward(0.0, at)
        ars.update()
        return True

    ########################################################################
    # Test for ARS save & load
    ########################################################################
    # Skipped, it is the same as base

    ########################################################################
    # Test for ARS lr_scheduler
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @WorldTestBase.setup_world
    def test_lr_scheduler(_, device, dtype):
        ars = TestARS.ars_lr(device, dtype)
        ars.update_lr_scheduler()
        return True

    ########################################################################
    # Test for ARS config & init
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=180)
    @WorldTestBase.setup_world
    def test_config_init(_):
        c = TestARS.c
        config = ARS.generate_config({})
        config["frame_config"]["models"] = ["ActorDiscrete"]
        config["frame_config"]["model_kwargs"] = [{
            "state_dim": c.observe_dim,
            "action_dim": c.action_num
        }]
        ars = ARS.init_from_config(config)

        for at in ars.get_actor_types():
            # get action will cause filters to initialize
            _action = ars.act(
                {"state": t.zeros([1, c.observe_dim], dtype=t.float32)}, at)
            if at.startswith("neg"):
                ars.store_reward(1.0, at)
            else:
                ars.store_reward(0.0, at)
        ars.update()
        return True

    ########################################################################
    # Test for ARS full training.
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=1800)
    @WorldTestBase.setup_world
    def test_full_train(rank):
        c = TestARS.c
        ars = TestARS.ars("cpu", t.float32)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False

        env = c.env
        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")
        while episode < c.max_episodes:
            episode.count()

            all_reward = 0
            for at in ars.get_actor_types():
                total_reward = 0

                # batch size = 1
                state = t.tensor(env.reset(), dtype=t.float32)
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        # agent model inference
                        action = ars.act({"state": state.unsqueeze(0)}, at)
                        state, reward, terminal, __ = env.step(action)
                        state = t.tensor(state, dtype=t.float32)
                        total_reward += float(reward)
                step.reset()
                terminal = False
                ars.store_reward(total_reward, at)
                all_reward += total_reward

            # update
            ars.update()
            smoother.update(all_reward / len(ars.get_actor_types()))
            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")
                    raise SafeExit
            else:
                reward_fulfilled.reset()

        raise RuntimeError("ARS Training failed.")
Exemple #14
0
class TestDQNApex:
    # configs and definitions
    disable_view_window()
    c = Config()
    # Note: online policy algorithms such as PPO and A2C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    c.max_episodes = 20000
    c.max_steps = 200
    c.replay_size = 100000
    c.solved_reward = 150
    c.solved_repeat = 5

    @staticmethod
    def dqn_apex(device, dtype):
        c = TestDQNApex.c
        q_net = smw(
            QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
        )
        q_net_t = smw(
            QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
        )
        servers = model_server_helper(model_num=1)
        world = get_world()
        # process 0 and 1 will be workers, and 2 will be trainer
        apex_group = world.create_rpc_group("apex", ["0", "1", "2"])
        dqn_apex = DQNApex(
            q_net,
            q_net_t,
            t.optim.Adam,
            nn.MSELoss(reduction="sum"),
            apex_group,
            servers,
            replay_device="cpu",
            replay_size=c.replay_size,
        )
        return dqn_apex

    ########################################################################
    # Test for DQNApex acting
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @setup_world
    def test_act(_, device, dtype):
        c = TestDQNApex.c
        dqn_apex = TestDQNApex.dqn_apex(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        dqn_apex.act_discrete({"state": state})
        dqn_apex.act_discrete({"state": state}, True)
        dqn_apex.act_discrete_with_noise({"state": state})
        dqn_apex.act_discrete_with_noise({"state": state}, True)
        return True

    ########################################################################
    # Test for DQNApex criticizing
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @setup_world
    def test_criticize(_, device, dtype):
        c = TestDQNApex.c
        dqn_apex = TestDQNApex.dqn_apex(device, dtype)
        state = t.zeros([1, c.observe_dim], dtype=dtype)
        dqn_apex._criticize({"state": state})
        dqn_apex._criticize({"state": state}, True)
        return True

    ########################################################################
    # Test for DQNApex storage
    ########################################################################
    # Skipped, it is the same as DQN

    ########################################################################
    # Test for DQNApex update
    ########################################################################
    @staticmethod
    @run_multi(
        expected_results=[True, True, True],
        pass_through=["device", "dtype"],
        timeout=180,
    )
    @setup_world
    def test_update(rank, device, dtype):
        c = TestDQNApex.c
        dqn_apex = TestDQNApex.dqn_apex(device, dtype)
        old_state = state = t.zeros([1, c.observe_dim], dtype=dtype)
        action = t.zeros([1, 1], dtype=t.int)
        if rank in (0, 1):
            dqn_apex.store_episode(
                [
                    {
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": 0,
                        "terminal": False,
                    }
                    for _ in range(3)
                ]
            )
            dqn_apex.manual_sync()
        if rank == 2:
            sleep(2)
            dqn_apex.update()
        return True

    ########################################################################
    # Test for DQNApex save & load
    ########################################################################
    # Skipped, it is the same as DQN

    ########################################################################
    # Test for DQNApex lr_scheduler
    ########################################################################
    # Skipped, it is the same as DQN

    ########################################################################
    # Test for DQNApex config & init
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=180)
    @setup_world
    def test_config_init(rank):
        c = TestDQNApex.c
        config = DQNApex.generate_config({})
        config["frame_config"]["models"] = ["QNet", "QNet"]
        config["frame_config"]["model_kwargs"] = [
            {"state_dim": c.observe_dim, "action_num": c.action_num}
        ] * 2
        dqn_apex = DQNApex.init_from_config(config)

        old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32)
        action = t.zeros([1, 1], dtype=t.int)
        if rank in (1, 2):
            dqn_apex.store_episode(
                [
                    {
                        "state": {"state": old_state},
                        "action": {"action": action},
                        "next_state": {"state": state},
                        "reward": 0,
                        "terminal": False,
                    }
                    for _ in range(3)
                ]
            )
            dqn_apex.manual_sync()
        if rank == 0:
            sleep(2)
            dqn_apex.update(
                update_value=True, update_target=True, concatenate_samples=True
            )
        return True

    ########################################################################
    # Test for DQNApex full training.
    ########################################################################
    @staticmethod
    @run_multi(expected_results=[True, True, True], timeout=1800)
    @setup_world
    def test_full_train(rank):
        training_group = get_world().create_rpc_group("training", ["0", "1", "2"])
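        # this rpc group is only used for synchronization: a barrier at the
        # end of every episode and a shared "solved" flag that lets any
        # process signal the others to stop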

        c = TestDQNApex.c
        dqn_apex = TestDQNApex.dqn_apex("cpu", t.float32)
        # perform manual syncing to decrease the number of rpc calls
        dqn_apex.set_sync(False)

        # begin training
        episode, step = Counter(), Counter()
        reward_fulfilled = Counter()
        smoother = Smooth()
        terminal = False
        env = c.env
        env.seed(rank)

        # make sure all things are initialized.
        training_group.barrier()

        # for cpu usage viewing
        default_logger.info(f"{rank}, pid {os.getpid()}")

        while episode < c.max_episodes:
            episode.count()

            if rank in (0, 1):
                # batch size = 1
                total_reward = 0
                state = t.tensor(env.reset(), dtype=t.float32)

                dqn_apex.manual_sync()
                tmp_observations = []
                while not terminal and step <= c.max_steps:
                    step.count()
                    with t.no_grad():
                        old_state = state
                        # agent model inference
                        action = dqn_apex.act_discrete_with_noise(
                            {"state": old_state.unsqueeze(0)}
                        )
                        state, reward, terminal, _ = env.step(action.item())
                        state = t.tensor(state, dtype=t.float32).flatten()
                        total_reward += float(reward)

                        tmp_observations.append(
                            {
                                "state": {"state": old_state.unsqueeze(0)},
                                "action": {"action": action},
                                "next_state": {"state": state.unsqueeze(0)},
                                "reward": float(reward),
                                "terminal": terminal or step == c.max_steps,
                            }
                        )
                dqn_apex.store_episode(tmp_observations)
                smoother.update(total_reward)
                step.reset()
                terminal = False

                default_logger.info(
                    "Process {} Episode {} total reward={:.2f}".format(
                        rank, episode, smoother.value
                    )
                )
                if smoother.value > c.solved_reward:
                    reward_fulfilled.count()
                    if reward_fulfilled >= c.solved_repeat:
                        default_logger.info("Environment solved!")
                        try:
                            training_group.pair(f"solved", True)
                        except KeyError:
                            # already solved in another process
                            pass
                else:
                    reward_fulfilled.reset()

            else:
                # trainer: only start updating after the workers have run
                # enough episodes to fill the buffer with some samples
                if episode.get() > 200:
                    for _ in range(100):
                        dqn_apex.update()
                    default_logger.info("Updated 100 times.")

            training_group.barrier()
            if training_group.is_paired("solved"):
                return True

        raise RuntimeError("DQN-Apex Training failed.")