def main(env_id, double, render):
    """Entry point: build an environment and train (or render) a DQN agent.

    Args:
        env_id: Gym environment id; must be 'GridWorld-v0' or 'Pong-ram-v0'.
        double: If True, use double Q-learning (and check the double loss).
        render: If True, run greedy evaluation rollouts instead of training.

    Raises:
        ValueError: If ``env_id`` is not one of the two supported ids.
    """
    env, cfg = _build_env_and_config(env_id)
    # Fix: enter the logger session via ``with`` so __exit__ runs and the
    # session is properly closed/flushed -- the original called __enter__
    # directly and never exited the session.
    with logger.session(cfg['log_dir']):
        env.seed(42)
        # Initialize the replay buffer that we will use.
        replay_buffer = ReplayBuffer(max_size=10000)
        # Initialize DQN training procedure.
        dqn = DQN(
            env=env,
            get_obs_dim=cfg['get_obs_dim'],
            get_act_dim=cfg['get_act_dim'],
            obs_preprocessor=cfg['obs_preprocessor'],
            replay_buffer=replay_buffer,
            # Q-value parameters: hidden layers are only used for Pong.
            q_dim_hid=[256, 256] if env_id == 'Pong-ram-v0' else [],
            opt_batch_size=64,
            # DQN gamma parameter
            discount=0.99,
            # Training procedure length
            initial_step=cfg['initial_step'],
            max_steps=cfg['max_steps'],
            learning_start_itr=cfg['max_steps'] // 100,
            # Frequency of copying the actual Q to the target Q
            target_q_update_freq=cfg['target_q_update_freq'],
            # Frequency of updating the Q-value function
            train_q_freq=4,
            # Double Q
            double_q=double,
            # Exploration parameters
            initial_eps=1.0,
            final_eps=0.05,
            fraction_eps=0.1,
            # Logging
            log_freq=cfg['log_freq'],
            render=render,
        )
        if env_id == 'Pong-ram-v0':
            _warm_start_pong(dqn)
        if env_id == 'GridWorld-v0':
            _run_gridworld_loss_test(dqn, double)
        if render:
            dqn.test(epsilon=0.0)
        else:
            # Train the agent!
            dqn.train()
        # Close gym environment.
        env.close()


def _build_env_and_config(env_id):
    """Create the gym environment and per-environment hyperparameters.

    Returns:
        A ``(env, cfg)`` pair where ``cfg`` is a dict of the settings that
        differ between the two supported environments.

    Raises:
        ValueError: For any unsupported ``env_id``.
    """
    if env_id == 'GridWorld-v0':
        # Side-effect import: presumably registers GridWorld-v0 with gym
        # -- TODO confirm against simpledqn/gridworld_env.
        from simpledqn import gridworld_env  # noqa: F401
        env = gym.make('GridWorld-v0')
        cfg = dict(
            get_obs_dim=lambda x: x.observation_space.n,
            get_act_dim=lambda x: x.action_space.n,
            obs_preprocessor=preprocess_obs_gridworld,
            max_steps=100000,
            log_freq=1000,
            target_q_update_freq=100,
            initial_step=0,
            log_dir="data/local/dqn_gridworld",
        )
    elif env_id == 'Pong-ram-v0':
        env = EpisodicLifeEnv(NoopResetEnv(gym.make('Pong-ram-v0')))
        cfg = dict(
            get_obs_dim=lambda x: x.observation_space.shape[0],
            get_act_dim=lambda x: x.action_space.n,
            obs_preprocessor=preprocess_obs_ram,
            max_steps=10000000,
            log_freq=10000,
            target_q_update_freq=1000,
            initial_step=1000000,
            log_dir="data/local/dqn_pong",
        )
    else:
        raise ValueError(
            "Unsupported environment: must be one of 'GridWorld-v0' 'Pong-ram-v0'")
    return env, cfg


def _warm_start_pong(dqn):
    """Warm-start the Q networks and replay buffer from saved Pong data."""
    # Warm start Q-function and its target copy from the same weights.
    dqn._q.set_params(dqn._q.load('simpledqn/weights_warm_start.pkl'))
    dqn._qt.set_params(dqn._qt.load('simpledqn/weights_warm_start.pkl'))
    # Warm start replay buffer with pre-collected transitions.
    dqn._replay_buffer.load('simpledqn/replay_buffer_warm_start.pkl')
    print("Warm-starting Pong training!")


def _run_gridworld_loss_test(dqn, double):
    """Sanity-check the (double) Q-learning loss against a known value.

    The two branches of the original code were identical except for the
    expected target and the loss method under test; they are merged here.
    On mismatch the user is prompted to abort or continue.
    """
    # Fixed random inputs so the expected loss value is deterministic.
    test_args = dict(
        l_obs=nprs(0).rand(64, 16).astype(np.float32),
        l_act=nprs(1).randint(0, 3, size=(64,)),
        l_rew=nprs(2).randint(0, 3, size=(64,)).astype(np.float32),
        l_next_obs=nprs(3).rand(64, 16).astype(np.float32),
        l_done=nprs(4).randint(0, 2, size=(64,)).astype(np.float32),
    )
    if double:
        tgt = np.array([1.9066928625106812], dtype=np.float32)
        actual_var = dqn.compute_double_q_learning_loss(**test_args)
        test_name = "compute_double_q_learning_loss"
    else:
        tgt = np.array([1.909377098083496], dtype=np.float32)
        actual_var = dqn.compute_q_learning_loss(**test_args)
        test_name = "compute_q_learning_loss"
    assert isinstance(
        actual_var, C.Variable), "%s should return a Chainer variable" % test_name
    actual = actual_var.data
    try:
        assert_allclose(tgt, actual)
        print("Test for %s passed!" % test_name)
    except AssertionError as e:
        print("Warning: test for %s didn't pass!" % test_name)
        print(e)
        input(
            "** Test failed. Press Ctrl+C to exit or press enter to continue training anyways")
def main(env_id, double, render):
    """Train or evaluate a DQN agent on GridWorld-v0 or Pong-ram-v0.

    NOTE(review): this is a byte-identical re-definition of the ``main``
    that appears earlier in this file and it shadows that copy; one of
    the two definitions should be deleted.

    Args:
        env_id: Gym environment id; must be 'GridWorld-v0' or 'Pong-ram-v0'.
        double: If True, use double Q-learning (and check the double loss).
        render: If True, run greedy evaluation rollouts instead of training.

    Raises:
        ValueError: If ``env_id`` is not one of the two supported ids.
    """
    if env_id == 'GridWorld-v0':
        # Side-effect import: presumably registers GridWorld-v0 with gym
        # -- TODO confirm against simpledqn/gridworld_env.
        from simpledqn import gridworld_env  # noqa: F401
        env = gym.make('GridWorld-v0')

        def get_obs_dim(x):
            return x.observation_space.n

        def get_act_dim(x):
            return x.action_space.n

        obs_preprocessor = preprocess_obs_gridworld
        max_steps = 100000
        log_freq = 1000
        target_q_update_freq = 100
        initial_step = 0
        log_dir = "data/local/dqn_gridworld"
    elif env_id == 'Pong-ram-v0':
        env = EpisodicLifeEnv(NoopResetEnv(gym.make('Pong-ram-v0')))

        def get_obs_dim(x):
            return x.observation_space.shape[0]

        def get_act_dim(x):
            return x.action_space.n

        obs_preprocessor = preprocess_obs_ram
        max_steps = 10000000
        log_freq = 10000
        target_q_update_freq = 1000
        initial_step = 1000000
        log_dir = "data/local/dqn_pong"
    else:
        raise ValueError(
            "Unsupported environment: must be one of 'GridWorld-v0' 'Pong-ram-v0'")

    # Fix: enter the logger session via ``with`` so __exit__ runs and the
    # session is properly closed/flushed -- the original called __enter__
    # directly and never exited the session.
    with logger.session(log_dir):
        env.seed(42)
        # Initialize the replay buffer that we will use.
        replay_buffer = ReplayBuffer(max_size=10000)
        # Initialize DQN training procedure.
        dqn = DQN(
            env=env,
            get_obs_dim=get_obs_dim,
            get_act_dim=get_act_dim,
            obs_preprocessor=obs_preprocessor,
            replay_buffer=replay_buffer,
            # Q-value parameters: hidden layers are only used for Pong.
            q_dim_hid=[256, 256] if env_id == 'Pong-ram-v0' else [],
            opt_batch_size=64,
            # DQN gamma parameter
            discount=0.99,
            # Training procedure length
            initial_step=initial_step,
            max_steps=max_steps,
            learning_start_itr=max_steps // 100,
            # Frequency of copying the actual Q to the target Q
            target_q_update_freq=target_q_update_freq,
            # Frequency of updating the Q-value function
            train_q_freq=4,
            # Double Q
            double_q=double,
            # Exploration parameters
            initial_eps=1.0,
            final_eps=0.05,
            fraction_eps=0.1,
            # Logging
            log_freq=log_freq,
            render=render,
        )

        if env_id == 'Pong-ram-v0':
            # Warm start Q-function and its target copy from saved weights.
            dqn._q.set_params(dqn._q.load('simpledqn/weights_warm_start.pkl'))
            dqn._qt.set_params(dqn._qt.load('simpledqn/weights_warm_start.pkl'))
            # Warm start replay buffer with pre-collected transitions.
            dqn._replay_buffer.load('simpledqn/replay_buffer_warm_start.pkl')
            print("Warm-starting Pong training!")

        if env_id == 'GridWorld-v0':
            # Sanity-check the loss implementation against known values on
            # fixed random inputs (deterministic via per-field seeds).
            test_args = dict(
                l_obs=nprs(0).rand(64, 16).astype(np.float32),
                l_act=nprs(1).randint(0, 3, size=(64,)),
                l_rew=nprs(2).randint(0, 3, size=(64,)).astype(np.float32),
                l_next_obs=nprs(3).rand(64, 16).astype(np.float32),
                l_done=nprs(4).randint(0, 2, size=(64,)).astype(np.float32),
            )
            # Merge the duplicated double / non-double branches: only the
            # expected target and the loss method under test differ.
            if not double:
                tgt = np.array([1.909377098083496], dtype=np.float32)
                actual_var = dqn.compute_q_learning_loss(**test_args)
                test_name = "compute_q_learning_loss"
            else:
                tgt = np.array([1.9066928625106812], dtype=np.float32)
                actual_var = dqn.compute_double_q_learning_loss(**test_args)
                test_name = "compute_double_q_learning_loss"
            assert isinstance(
                actual_var, C.Variable), "%s should return a Chainer variable" % test_name
            actual = actual_var.data
            try:
                assert_allclose(tgt, actual)
                print("Test for %s passed!" % test_name)
            except AssertionError as e:
                print("Warning: test for %s didn't pass!" % test_name)
                print(e)
                input(
                    "** Test failed. Press Ctrl+C to exit or press enter to continue training anyways")

        if render:
            dqn.test(epsilon=0.0)
        else:
            # Train the agent!
            dqn.train()
        # Close gym environment.
        env.close()