def play(env, session, timesteps_num):

    def stopping_criterion(env, t):
        return env.stop_criterion(t)

    ##########################
    # learning rate schedule #
    ##########################
    iterations_num = float(timesteps_num) / 4.0
    lr_multiplier = 1.0
    lr_schedule = utils.PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier),
         (iterations_num / 10, 1e-4 * lr_multiplier),
         (iterations_num / 2, 5e-5 * lr_multiplier)],
        outside_value=5e-5 * lr_multiplier)

    #################
    # set optimizer #
    #################
    OptimizerSpec = namedtuple('OptimizerSpec', ['constructor', 'kwargs', 'lr_schedule'])
    optimizer = OptimizerSpec(constructor=tf.train.AdamOptimizer,
                              kwargs=dict(epsilon=1e-4),
                              lr_schedule=lr_schedule)

    ########################
    # exploration schedule #
    ########################
    exploration_schedule = utils.PiecewiseSchedule(
        [(0, 1.0),
         (1e6, 0.1),
         (iterations_num / 2, 0.01)],
        outside_value=0.01)

    #################
    # play the game #
    #################
    '''
    worker_max_num = multiprocessing.cpu_count()
    numworkers = 10
    assert numworkers < worker_max_num
    for i in range(numworkers):
        # args must be a tuple: (str(i),), not (str(i))
        t = threading.Thread(target=work, args=(str(i),))
        t.start()
    '''
    dqn_worker(env=env,
               name='dqn_worker',
               optimizer_spec=optimizer,
               session=session,
               exploration=exploration_schedule,
               replay_buffer_size=1000000,
               batch_size=32,
               gamma=0.99,
               learn_start=50000,
               learn_freq=4,
               history_frames_num=4,
               target_update_freq=10000,
               grad_norm_clipping=10,
               stop_criterion=stopping_criterion)

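# Hypothetical multi-worker launch corresponding to the commented-out block in
# play() above. `work` is assumed to be a callable that wraps dqn_worker with a
# per-worker name; this is a sketch, not the repo's actual launcher.
import multiprocessing
import threading

def launch_workers(work, num_workers=10):
    assert num_workers < multiprocessing.cpu_count()
    threads = []
    for i in range(num_workers):
        # args must be a tuple, hence the trailing comma
        t = threading.Thread(target=work, args=(str(i),))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
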
def main():
    args = get_args()

    env = make_atari_env(args.env, args.seed)
    benchmark_env = make_atari_env(args.env, args.seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4)

    exploration_schedule = utils.PiecewiseSchedule(
        [
            (0, 1.0),
            (args.prepopulate, 1.0),
            (args.prepopulate + args.explore_time, args.final_eps),
        ],
        outside_value=args.final_eps,
    )

    if not args.legacy:
        assert args.train_freq == 4  # Training frequency is undefined for DQN(lambda)
        replay_memory = make_replay_memory(args.return_est, args.mem_size, args.history_len,
                                           args.discount, args.cache_size, args.block_size,
                                           args.priority)
    else:
        assert args.cache_size == 80000  # Cache-related args are undefined for legacy DQN
        assert args.priority == 0.0
        assert args.block_size == 100
        replay_memory = make_legacy_replay_memory(args.return_est, args.mem_size,
                                                  args.history_len, args.discount)

    with utils.make_session(args.seed) as session:
        dqn.learn(
            session,
            env,
            benchmark_env,
            atari_cnn,
            replay_memory,
            optimizer,
            exploration_schedule,
            args.timesteps,
            args.batch_size,
            args.prepopulate,
            args.update_freq,
            train_freq=args.train_freq,
            grad_clip=args.grad_clip,
            log_every_n_steps=1000,
        )
    env.close()

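# Hypothetical sketch of get_args(). The flag names are taken from the attribute
# accesses in main() above; the real parser lives elsewhere in the repo, and the
# default values below are assumptions for illustration only.
import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='pong')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--timesteps', type=int, default=int(10e6))
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--prepopulate', type=int, default=50000)
    parser.add_argument('--explore_time', type=int, default=int(1e6))
    parser.add_argument('--final_eps', type=float, default=0.1)
    parser.add_argument('--train_freq', type=int, default=4)
    parser.add_argument('--update_freq', type=int, default=10000)
    parser.add_argument('--grad_clip', type=float, default=40.)
    parser.add_argument('--return_est', type=str, default='nstep-1')
    parser.add_argument('--mem_size', type=int, default=int(1e6))
    parser.add_argument('--history_len', type=int, default=4)
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--cache_size', type=int, default=80000)
    parser.add_argument('--block_size', type=int, default=100)
    parser.add_argument('--priority', type=float, default=0.0)
    parser.add_argument('--legacy', action='store_true')
    return parser.parse_args()
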
def create_scheduler(type='medium'):
    if type == 'none':
        return None
    if type == 'linear':
        return utils.LinearSchedule(200000, 1.0, 0.0)

    if type == 'medium':
        endpoints = [(0, 0), (2000, 0.1), (7000, 0.25), (40000, 0.5), (200000, 1.0)]
    elif type == 'high':
        endpoints = [(0, 0), (3000, 0.1), (15000, 0.25), (80000, 0.5), (500000, 1.0)]
    elif type == 'low':
        endpoints = [(0, 0), (1000, 0.1), (3000, 0.25), (20000, 0.5), (100000, 1.0)]
    elif type == 'tiny':
        endpoints = [(0, 0), (1000, 0.1), (2000, 0.25), (5000, 0.5), (20000, 1.0)]
    elif type == 'exp':
        endpoints = [(0, 0), (1000, 0.01), (5000, 0.1), (10000, 0.5), (20000, 0.75),
                     (50000, 0.9), (100000, 0.95), (200000, 1.0)]
    else:
        raise ValueError('Unknown scheduler type: {}'.format(type))

    print('Building PiecewiseScheduler with <endpoints> = {}'.format(endpoints))
    scheduler = utils.PiecewiseSchedule(endpoints, outside_value=1.0)
    return scheduler

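# Usage sketch: assuming utils.PiecewiseSchedule follows the common baselines-style
# interface, value(t) linearly interpolates between the endpoints above and returns
# outside_value (here 1.0) past the final endpoint.
def demo_scheduler():
    scheduler = create_scheduler('medium')
    for t in (0, 2000, 40000, 500000):
        print('t = {:>6d} -> schedule value = {:.3f}'.format(t, scheduler.value(t)))
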
def main():
    args = get_args()
    utils.set_global_seeds(args.seed)

    env = make_atari_env(args.env, args.seed)
    benchmark_env = make_atari_env(args.env, args.seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-4)

    n_timesteps = 10000000
    learning_starts = 50000

    exploration_schedule = utils.PiecewiseSchedule(
        [(0, 1.0), (learning_starts, 1.0), (learning_starts + 1e6, 0.1)],
        outside_value=0.1,
    )

    replay_memory = NStepReplayMemory(
        size=1000000,
        history_len=args.history_len,
        discount=0.99,
        nsteps=args.nsteps,
    )

    q_func = AtariRecurrentConvNet() if args.recurrent else AtariConvNet()

    dqn.learn(
        env,
        benchmark_env,
        q_func,
        replay_memory,
        optimizer=optimizer,
        exploration=exploration_schedule,
        max_timesteps=n_timesteps,
        batch_size=32,
        learning_starts=learning_starts,
        learning_freq=4,
        target_update_freq=10000,
        grad_clip=40.,
        log_every_n_steps=50000,
    )
    env.close()

def main():
    seed = 0
    utils.set_global_seeds(seed)

    name = 'CartPole-v0'
    env = make_continuouscontrol_env(name, seed)
    benchmark_env = make_continuouscontrol_env(name, seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)

    n_timesteps = 500000
    learning_starts = 50000

    exploration_schedule = utils.PiecewiseSchedule(
        [(0, 1.0), (learning_starts, 1.0), (learning_starts + 3e5, 0.1)],
        outside_value=0.1,
    )

    replay_memory = NStepReplayMemory(
        size=500000,
        history_len=1,
        discount=0.99,
        nsteps=1,
    )

    dqn.learn(
        env,
        benchmark_env,
        CartPoleNet(),
        replay_memory,
        optimizer=optimizer,
        exploration=exploration_schedule,
        max_timesteps=n_timesteps,
        batch_size=32,
        learning_starts=learning_starts,
        learning_freq=4,
        target_update_freq=10000,
        log_every_n_steps=10000,
    )
    env.close()

def main():
    seed = 0
    name = 'CartPole-v0'

    env = make_gym_env(name, seed)
    benchmark_env = make_gym_env(name, seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)

    prepopulate = 50000

    exploration_schedule = utils.PiecewiseSchedule(
        [(0, 1.0), (prepopulate, 1.0), (prepopulate + 3e5, 0.1)],
        outside_value=0.1,
    )

    replay_memory = make_replay_memory(return_est='nstep-5', capacity=500000, history_len=1,
                                       discount=0.99, cache_size=80000, block_size=100,
                                       priority=0.0)

    with utils.make_session(seed) as session:
        dqn.learn(
            session,
            env,
            benchmark_env,
            cartpole_mlp,
            replay_memory,
            optimizer=optimizer,
            exploration=exploration_schedule,
            max_timesteps=500000,
            batch_size=32,
            prepopulate=prepopulate,
            target_update_freq=10000,
            train_freq=4,
            log_every_n_steps=10000,
        )
    env.close()

def play(env, session, timesteps_num):

    ###################
    # build q network #
    ###################
    def build_cnn(input, act_num, scope, reuse=False):
        with tf.variable_scope(scope, reuse=reuse):
            out = input
            with tf.variable_scope('convnet'):
                out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4,
                                           activation_fn=tf.nn.relu)
                out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2,
                                           activation_fn=tf.nn.relu)
                out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1,
                                           activation_fn=tf.nn.relu)
            out = layers.flatten(out)
            with tf.variable_scope('action_value'):
                out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu)
                out = layers.fully_connected(out, num_outputs=act_num, activation_fn=None)
            return out

    def stopping_criterion(env, t):
        # Note: t is the step count of the wrapped env, which differs from the
        # step count of the underlying env.
        return utils.get_wrapper_by_name(env, "Monitor").get_total_steps() >= timesteps_num

    ##########################
    # learning rate schedule #
    ##########################
    iterations_num = float(timesteps_num) / 4.0
    lr_multiplier = 1.0
    lr_schedule = utils.PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier),
         (iterations_num / 10, 1e-4 * lr_multiplier),
         (iterations_num / 2, 5e-5 * lr_multiplier)],
        outside_value=5e-5 * lr_multiplier)

    #################
    # set optimizer #
    #################
    OptimizerSpec = namedtuple('OptimizerSpec', ['constructor', 'kwargs', 'lr_schedule'])
    optimizer = OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule,
    )

    ########################
    # exploration schedule #
    ########################
    exploration_schedule = utils.PiecewiseSchedule(
        [(0, 1.0),
         (1e6, 0.1),
         (iterations_num / 2, 0.01)],
        outside_value=0.01)

    #################
    # play the game #
    #################
    agent.learn_by_dqn(env=env,
                       q_net=build_cnn,
                       optimizer_spec=optimizer,
                       session=session,
                       exploration=exploration_schedule,
                       replay_buffer_size=1000000,
                       batch_size=32,
                       gamma=0.99,
                       learn_start=50000,
                       learn_freq=4,
                       history_frames_num=4,
                       target_update_freq=10000,
                       grad_norm_clipping=10,
                       stop_criterion=stopping_criterion)

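# Illustration only: a minimal sketch of how a Q-network like build_cnn above is
# typically wired up in TF-1.x. The 84x84x4 input shape and num_actions are
# assumptions (they depend on the Atari env and its wrappers), and q_func stands
# in for build_cnn, which is local to play().
import tensorflow as tf

def sketch_q_network(q_func, num_actions):
    obs_ph = tf.placeholder(tf.float32, shape=[None, 84, 84, 4], name='obs')
    q_values = q_func(obs_ph, num_actions, scope='q_func', reuse=False)  # [batch, num_actions]
    greedy_action = tf.argmax(q_values, axis=1)  # exploitation branch of epsilon-greedy
    return obs_ph, q_values, greedy_action
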