def train_config(self, gpu):
    disable_view_window()
    c = Config()
    # Note: online policy algorithms such as PPO and A2C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    # maximum and minimum of reward value
    # since reward is 1 for every step, the maximum q value should be
    # below 20 (reward_future_steps) * (1 + discount ** n_steps) < 40
    c.value_max = 40
    c.value_min = 0
    c.reward_future_steps = 20
    c.max_episodes = 1000
    c.max_steps = 200
    c.replay_size = 100000
    # RAINBOW is not very stable (without dueling and noisy linear)
    # compared to other DQNs
    c.solved_reward = 180
    c.solved_repeat = 5
    c.device = gpu
    return c
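
# A quick sanity check of the value bound cited in the comment above,
# sketched here for illustration only; the discount of 0.99 is an
# assumption, not a value taken from this config.
def _check_value_bound(reward_future_steps=20, discount=0.99):
    # with reward == 1 per step, the n-step return is at most
    # reward_future_steps * (1 + discount ** reward_future_steps)
    bound = reward_future_steps * (1 + discount ** reward_future_steps)
    assert bound < 40  # ~36.4, so value_max = 40 is a safe ceiling
    return bound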
def train_config(self):
    disable_view_window()
    c = Config()
    # the cooperative environment provided in
    # https://github.com/openai/multiagent-particle-envs
    c.env_name = "simple_spread"
    c.env = create_env(c.env_name)
    c.env.discrete_action_input = True
    c.agent_num = 3
    c.action_num = c.env.action_space[0].n
    c.observe_dim = c.env.observation_space[0].shape[0]
    # for continuous tests
    c.test_action_dim = 5
    c.test_action_range = 1
    c.test_observe_dim = 5
    c.test_agent_num = 3
    c.max_episodes = 1000
    c.max_steps = 200
    c.replay_size = 100000
    # from https://github.com/wsjeon/maddpg-rllib/tree/master/plots
    # PROBLEM: I have no idea how they calculate the rewards,
    # so I cannot replicate their reward curve
    c.solved_reward = -15
    c.solved_repeat = 5
    return c
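
# A minimal sketch of how a config like the one above might be consumed,
# assuming the multiagent-particle-envs API (env.reset() and env.step()
# take and return per-agent lists); the random policy here is purely
# illustrative and not part of any test.
def _rollout_one_episode(c):
    import random
    observations = c.env.reset()
    total_reward = 0.0
    for _ in range(c.max_steps):
        # with discrete_action_input = True each agent expects an
        # integer action index
        actions = [random.randrange(c.action_num) for _ in range(c.agent_num)]
        observations, rewards, dones, _ = c.env.step(actions)
        total_reward += sum(rewards)
        if all(dones):
            break
    return total_reward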
def train_config(self):
    disable_view_window()
    c = Config()
    # Note: online policy algorithms such as PPO and A2C do not
    # work well in Pendulum (reason unknown)
    # and MountainCarContinuous (sparse returns)
    c.env_name = "CartPole-v0"
    c.env = unwrap_time_limit(gym.make(c.env_name))
    c.observe_dim = 4
    c.action_num = 2
    c.max_episodes = 2000  # the actor learns a little bit slower
    c.max_steps = 200
    c.replay_size = 10000
    c.solved_reward = 150
    c.solved_repeat = 5
    return c
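
# A sketch of how solved_reward / solved_repeat are typically used as a
# stopping criterion; the smoothing factor and the episode runner
# (run_episode) are assumptions for illustration, not part of the configs
# above.
def _train_until_solved(c, run_episode, smooth=0.9):
    smoothed_reward = 0.0
    consecutive_solved = 0
    for episode in range(c.max_episodes):
        episode_reward = run_episode(c)  # hypothetical helper
        smoothed_reward = smoothed_reward * smooth + episode_reward * (1 - smooth)
        if smoothed_reward > c.solved_reward:
            consecutive_solved += 1
            if consecutive_solved >= c.solved_repeat:
                return True  # environment considered solved
        else:
            consecutive_solved = 0
    return False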