Example 1
    def train_config(self, gpu):
        disable_view_window()
        c = Config()
        # Note: online policy algorithms such as PPO and A2C do not
        # work well in Pendulum (reason unknown)
        # and MountainCarContinuous (sparse returns)
        c.env_name = "CartPole-v0"
        c.env = unwrap_time_limit(gym.make(c.env_name))
        c.observe_dim = 4
        c.action_num = 2
        # maximum and minimum of the value estimate
        # since the reward is 1 for every step, the maximum q value should be
        # below 20 (reward_future_steps) * (1 + discount ** n_steps) < 40
        c.value_max = 40
        c.value_min = 0
        c.reward_future_steps = 20
        c.max_episodes = 1000
        c.max_steps = 200
        c.replay_size = 100000

        # RAINBOW is not very stable (without dueling and noisy linear layers)
        # compared to other DQN variants
        c.solved_reward = 180
        c.solved_repeat = 5
        c.device = gpu
        return c
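
These snippets lean on a few helpers that are not shown here (Config, disable_view_window, unwrap_time_limit). As a minimal sketch, assuming Config is a plain attribute container and unwrap_time_limit simply strips gym's TimeLimit wrapper so that c.max_steps controls episode length, they might look like this:

import gym

class Config:
    """Assumed: a plain attribute bag; each train_config sets fields ad hoc."""
    pass

def unwrap_time_limit(env):
    # Assumed behavior: remove gym's TimeLimit wrapper so the training loop,
    # not the registry default, decides when an episode ends.
    return env.env if isinstance(env, gym.wrappers.TimeLimit) else env
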
Example 2
 def train_config(self):
     disable_view_window()
     c = Config()
     # the cooperative environment provided in
     # https://github.com/openai/multiagent-particle-envs
     c.env_name = "simple_spread"
     c.env = create_env(c.env_name)
     c.env.discrete_action_input = True
     c.agent_num = 3
     c.action_num = c.env.action_space[0].n
     c.observe_dim = c.env.observation_space[0].shape[0]
     # for continuous-action tests
     c.test_action_dim = 5
     c.test_action_range = 1
     c.test_observe_dim = 5
     c.test_agent_num = 3
     c.max_episodes = 1000
     c.max_steps = 200
     c.replay_size = 100000
     # from https://github.com/wsjeon/maddpg-rllib/tree/master/plots
     # PROBLEM: I have no idea how they calculate the rewards
     # I cannot replicate their reward curve
     c.solved_reward = -15
     c.solved_repeat = 5
     return c
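
create_env is not shown either; the linked openai/multiagent-particle-envs repository builds environments through its make_env.py recipe, and the sketch below assumes create_env follows the same pattern (a MultiAgentEnv wired to a loaded scenario), which the actual helper may not match exactly:

from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios

def create_env(scenario_name):
    # Sketch following make_env.py from openai/multiagent-particle-envs.
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    world = scenario.make_world()
    return MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                         scenario.observation)
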
Example 3
 def train_config(self):
     disable_view_window()
     c = Config()
     c.env_name = "Pendulum-v0"
     c.env = unwrap_time_limit(gym.make(c.env_name))
     c.observe_dim = 3
     c.action_dim = 1
     c.action_range = 2
     c.max_episodes = 1000
     c.max_steps = 200
     c.replay_size = 100000
     c.solved_reward = -400
     c.solved_repeat = 5
     return c
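
Every example sets solved_reward and solved_repeat. The usual reading, given here only as an assumed convention, is that training counts as converged once the last solved_repeat episode rewards all reach solved_reward:

from collections import deque

def make_solved_check(c):
    # Assumed convention: track the last c.solved_repeat episode rewards and
    # report "solved" when all of them reach c.solved_reward.
    window = deque(maxlen=c.solved_repeat)

    def check(episode_reward):
        window.append(episode_reward)
        return (len(window) == c.solved_repeat
                and all(r >= c.solved_reward for r in window))

    return check
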
Example 4
 def train_config(self):
     disable_view_window()
     c = Config()
     # Note: online policy algorithms such as PPO and A2C do not
     # work well in Pendulum (reason unknown)
     # and MountainCarContinuous (sparse returns)
     c.env_name = "CartPole-v0"
     c.env = unwrap_time_limit(gym.make(c.env_name))
     c.observe_dim = 4
     c.action_num = 2
     c.max_episodes = 2000  # the actor learns a bit more slowly
     c.max_steps = 200
     c.replay_size = 10000
     c.solved_reward = 150
     c.solved_repeat = 5
     return c
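
The remaining fields (env, max_episodes, max_steps) map directly onto a standard episode loop. A hypothetical driver, with the agent's action selection and update elided, could look like this under the old gym step API that CartPole-v0 implies:

def run_training(c, select_action):
    # Hypothetical skeleton: select_action stands in for the agent under test.
    for episode in range(c.max_episodes):
        state = c.env.reset()
        total_reward = 0.0
        for _ in range(c.max_steps):
            action = select_action(state)
            state, reward, done, _ = c.env.step(action)
            total_reward += reward
            if done:
                break
        # ...agent update and solved_reward bookkeeping would go here
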
Example 5
 def train_config(self, pytestconfig):
     disable_view_window()
     c = Config()
     c.env_name = "Pendulum-v0"
     c.env = unwrap_time_limit(gym.make(c.env_name))
     c.observe_dim = 3
     c.action_dim = 1
     c.action_range = 2
     c.max_episodes = 1000
     c.max_steps = 200
     c.noise_param = (0, 0.2)
     c.noise_mode = "normal"
     c.noise_interval = 2
     c.replay_size = 100000
     c.solved_reward = -150
     c.solved_repeat = 5
     c.device = "cpu"
     return c
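
noise_param, noise_mode and noise_interval configure exploration noise for the continuous-action agent under test. The exact scheme belongs to the algorithm, but as an assumed illustration, "normal" mode with param (0, 0.2) would mean Gaussian noise with mean 0 and standard deviation 0.2, applied every noise_interval steps and clipped to the action range:

import numpy as np

def noisy_action(action, step, c):
    # Illustration only: Gaussian exploration noise applied every
    # c.noise_interval steps, then clipped to [-action_range, action_range].
    if c.noise_mode == "normal" and step % c.noise_interval == 0:
        mean, std = c.noise_param
        action = action + np.random.normal(mean, std, size=np.shape(action))
    return np.clip(action, -c.action_range, c.action_range)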