def atari_learn(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    dqn.learn(
        env,
        q_func=atari_model,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        # learning_starts=50000,
        learning_starts=50,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10)
    env.close()

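# The launchers in this file assume schedule helpers (PiecewiseSchedule,
# ConstantSchedule, LinearSchedule) exposing a .value(t) method that returns
# the exploration epsilon or learning rate at step t. A minimal sketch is
# given below, assuming the linear-interpolation convention used by
# homework-style dqn_utils modules; the actual projects may differ in details.


def linear_interpolation(l, r, alpha):
    # Interpolate a fraction alpha of the way from l to r.
    return l + alpha * (r - l)


class ConstantSchedule(object):
    def __init__(self, value):
        # The same value is returned for every timestep.
        self._v = value

    def value(self, t):
        return self._v


class PiecewiseSchedule(object):
    def __init__(self, endpoints, interpolation=linear_interpolation,
                 outside_value=None):
        # endpoints: list of (t, value) pairs with increasing t; values between
        # endpoints are interpolated, and t outside the listed range falls back
        # to outside_value.
        idxes = [e[0] for e in endpoints]
        assert idxes == sorted(idxes)
        self._interpolation = interpolation
        self._outside_value = outside_value
        self._endpoints = endpoints

    def value(self, t):
        for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return self._interpolation(l, r, alpha)
        assert self._outside_value is not None
        return self._outside_value


class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        # Linearly anneal from initial_p to final_p over schedule_timesteps,
        # then hold at final_p.
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
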
def lander_learn(
        env,
        session,
        num_timesteps,
        # YOUR OWN CODE
        seed,
        doubleQ=True,
        exp_name='doubleQ',
        schedule='PiecewiseSchedule',
        rew_file='lander_test.pk1'):
    # optimizer = lander_optimizer()
    # stopping_criterion = lander_stopping_criterion(num_timesteps)
    # exploration_schedule = lander_exploration_schedule(num_timesteps)

    dqn.learn(
        env=env,
        session=session,
        exploration=lander_exploration_schedule(num_timesteps, schedule),
        stopping_criterion=lander_stopping_criterion(num_timesteps),
        double_q=doubleQ,
        # YOUR OWN CODE
        rew_file=rew_file,
        seed=seed,
        env_name='LunarLander-v2',
        exp_name=exp_name,
        **lander_kwargs())
    env.close()

def learn(env, session, args):
    if args.env == 'PongNoFrameskip-v4':
        lr_schedule = ConstantSchedule(1e-4)
        optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                      kwargs=dict(epsilon=1e-4),
                                      lr_schedule=lr_schedule)
        limit = max(int(args.num_steps / 2), 2e6)
        exploration_schedule = PiecewiseSchedule([
            (0, 1.00),
            (1e6, 0.10),
            (limit, 0.01),
        ], outside_value=0.01)
        dqn.learn(env=env,
                  q_func=atari_model,
                  optimizer_spec=optimizer,
                  session=session,
                  exploration=exploration_schedule,
                  replay_buffer_size=1000000,
                  batch_size=32,
                  gamma=0.99,
                  learning_starts=50000,
                  learning_freq=4,
                  frame_history_len=4,
                  target_update_freq=10000,
                  grad_norm_clipping=10,
                  double_q=args.double_q,
                  logdir=args.logdir,
                  max_steps=args.num_steps)
    elif args.env == 'CartPole-v0':
        lr_schedule = ConstantSchedule(5e-4)
        optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                      kwargs=dict(epsilon=1e-4),
                                      lr_schedule=lr_schedule)
        exploration_schedule = PiecewiseSchedule([
            (0, 1.00),
            (5e4, 0.10),
            (1e5, 0.02),
        ], outside_value=0.02)
        dqn.learn(env=env,
                  q_func=cartpole_model,
                  optimizer_spec=optimizer,
                  session=session,
                  exploration=exploration_schedule,
                  replay_buffer_size=10000,
                  batch_size=100,
                  gamma=0.99,
                  learning_starts=1000,
                  learning_freq=4,
                  frame_history_len=1,
                  target_update_freq=500,
                  grad_norm_clipping=10,
                  double_q=args.double_q,
                  logdir=args.logdir,
                  max_steps=args.num_steps,
                  cartpole=True)
    else:
        raise ValueError(args.env)
    env.close()

def atari_learn(env, session, num_timesteps):
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier),
         (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)],
        outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdadeltaOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([(0, 1.0),
                                              (1e6, 0.1),
                                              (num_iterations / 2, 0.01)],
                                             outside_value=0.01)

    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=True)
    env.close()

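# Several launchers in this file pass q_func=atari_model without defining it
# here. A sketch of the convolutional Q-network conventionally used in these
# DQN homeworks follows (the exact architecture in the original repos may
# differ); it maps a stacked-frame image observation to one Q-value per action.
import tensorflow as tf
import tensorflow.contrib.layers as layers


def atari_model(img_in, num_actions, scope, reuse=False):
    with tf.variable_scope(scope, reuse=reuse):
        out = img_in
        with tf.variable_scope("convnet"):
            # The DQN-paper convolutional stack.
            out = layers.convolution2d(out, num_outputs=32, kernel_size=8,
                                       stride=4, activation_fn=tf.nn.relu)
            out = layers.convolution2d(out, num_outputs=64, kernel_size=4,
                                       stride=2, activation_fn=tf.nn.relu)
            out = layers.convolution2d(out, num_outputs=64, kernel_size=3,
                                       stride=1, activation_fn=tf.nn.relu)
        out = layers.flatten(out)
        with tf.variable_scope("action_value"):
            # Fully connected head producing one Q-value per action.
            out = layers.fully_connected(out, num_outputs=512,
                                         activation_fn=tf.nn.relu)
            out = layers.fully_connected(out, num_outputs=num_actions,
                                         activation_fn=None)
        return out
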
def atari_learn(
        env,
        session,
        num_timesteps,
        # YOUR OWN CODE
        seed,
        doubleQ=True,
        exp_name='doubleQ',
        rew_file='ram_test.pk1'):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 0.2),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    dqn.learn(
        env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=1,
        target_update_freq=10000,
        grad_norm_clipping=10,
        # YOUR OWN CODE
        double_q=doubleQ,
        rew_file=rew_file,
        seed=seed,
        env_name='Pong-ram-v0',
        exp_name=exp_name)
    env.close()

def atari_learn(env,
                session,
                num_timesteps,
                double_q,
                explore,
                env_name,
                ex2=ex2,
                coef=coef):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    # the exploration rate gradually decreases over training
    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    # TODO: pay attention to the double_q argument here
    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=double_q,
              rew_file='./pkl/' + env_name + '_' +
                       time.strftime("%d-%m-%Y_%H-%M-%S") + '.pkl',
              explore=explore,
              ex2=ex2,
              coef=coef)
    env.close()

def atari_learn(env, session, num_timesteps, lr_multiplier):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    if not os.path.exists('data'):
        os.makedirs('data')
    logdir = os.path.join('data', 'PongNoFrameskip-v4')
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    dqn.learn(
        env=env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10,
        double_q=False,
        # rew_file='%s_lr_%s.pkl' % (os.path.join(logdir, time.strftime("%d-%m-%Y_%H-%M-%S")), str(lr_multiplier))
        rew_file='%s_lr_%s.pkl' % (os.path.join(logdir, time.strftime("%d-%m-%Y_%H-%M-%S")), 'vanilla')
    )
    env.close()

def atari_learn(env, session, args, num_timesteps):
    logdir = os.path.join('data', args.exp_name)
    # if not(os.path.exists(logdir)):
    #     os.makedirs(logdir)

    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    dqn.learn(
        env=env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=args.gamma,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10,
        double_q=args.double_q,
        logdir=logdir
    )
    env.close()

def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        if t % 10000 == 0:
            print("get_total_steps:" +
                  str(get_wrapper_by_name(env, "Monitor").get_total_steps()) +
                  ", t:" + str(t) +
                  ", num_timesteps:" + str(num_timesteps))
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    exploration_schedule2 = PiecewiseSchedule([
        (0, 1.0),
        (2e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    dqn.learn(
        env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,  # pipaek
        stopping_criterion=stopping_criterion,
        # replay_buffer_size=1000000,
        replay_buffer_size=2000000,  # pipaek
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10)
    env.close()

def cartpole_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    # lr_multiplier = 1.0
    # lr_multiplier = 0.1
    # lr_schedule = PiecewiseSchedule([
    #     (0, 1e-4 * lr_multiplier),
    #     (num_iterations / 2, 1e-5 * lr_multiplier),
    # ], outside_value=5e-5 * lr_multiplier)
    lr_schedule = InverseSchedule(initial_p=0.1, gamma=0.6)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.GradientDescentOptimizer,
        # constructor=tf.train.AdamOptimizer,
        # kwargs=dict(epsilon=1e-4),
        kwargs=dict(),
        # constructor=tf.train.RMSPropOptimizer,
        # kwargs=dict(epsilon=1e-1),
        lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            # (0.2 * num_timesteps, 0.9),
            # (0.5 * num_timesteps, 0.5),
            (0.1 * num_timesteps, 0.1),
        ], outside_value=0.01)

    dqn.learn(
        env,
        q_func=cartpole_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=100000,
        batch_size=256,
        gamma=0.99,
        learning_starts=2000,
        learning_freq=1,
        frame_history_len=4,
        target_update_freq=1000,
        grad_norm_clipping=1000,
    )
    env.close()

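# InverseSchedule, used in the cartpole launcher above, is not one of the
# standard schedule helpers and its definition is not shown in this file.
# The sketch below is purely hypothetical: it assumes a learning rate that
# decays as initial_p / (1 + gamma * t), with the same .value(t) interface
# as the other schedules.


class InverseSchedule(object):
    def __init__(self, initial_p, gamma):
        # initial_p: assumed value at t = 0; gamma: assumed decay rate.
        self.initial_p = initial_p
        self.gamma = gamma

    def value(self, t):
        return self.initial_p / (1.0 + self.gamma * t)
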
def lander_learn(env, session, num_timesteps, seed):
    optimizer = lander_optimizer()
    stopping_criterion = lander_stopping_criterion(num_timesteps)
    exploration_schedule = lander_exploration_schedule(num_timesteps)

    dqn.learn(env=env,
              session=session,
              exploration=lander_exploration_schedule(num_timesteps),
              stopping_criterion=lander_stopping_criterion(num_timesteps),
              double_q=True,
              **lander_kwargs())
    env.close()

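# The lander launchers above and below rely on helpers named lander_optimizer,
# lander_stopping_criterion, lander_exploration_schedule, and lander_kwargs
# that are not shown in this file. A sketch follows, with hyperparameters
# assumed to match the LunarLander settings that appear inline in a later
# launcher here (ConstantSchedule(1e-3), 50000-transition buffer, gamma=1,
# target update every 3000 steps); lander_model is assumed to be defined
# elsewhere, and variants that pass batch_size, gamma, or replay_buffer_size
# explicitly presumably drop those keys from their own lander_kwargs.


def lander_optimizer():
    # Adam with a constant learning rate.
    return dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        lr_schedule=ConstantSchedule(1e-3),
        kwargs={})


def lander_stopping_criterion(num_timesteps):
    def stopping_criterion(env, t):
        # t counts steps of the wrapped env, which can differ from the
        # underlying env's step count.
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
    return stopping_criterion


def lander_exploration_schedule(num_timesteps):
    # Anneal epsilon from 1.0 to 0.02 over the first 10% of training.
    return PiecewiseSchedule(
        [
            (0, 1),
            (num_timesteps * 0.1, 0.02),
        ], outside_value=0.02)


def lander_kwargs():
    return {
        'optimizer_spec': lander_optimizer(),
        'q_func': lander_model,
        'replay_buffer_size': 50000,
        'batch_size': 32,
        'gamma': 1.00,
        'learning_starts': 1000,
        'learning_freq': 1,
        'frame_history_len': 1,
        'target_update_freq': 3000,
        'grad_norm_clipping': 10,
        'lander': True,
    }
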
def arm_learn(env, session, scope_name, num_timesteps, spec_file=None, exp_dir=None):
    # # This is just a rough estimate
    # num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_timesteps / 40, 1e-4 * lr_multiplier),
        (num_timesteps / 8, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(t):
        return t >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (num_timesteps / 20, 0.3),
        (num_timesteps / 10, 0.1),
        (num_timesteps / 2, 0.01),
    ], outside_value=0.01)

    dqn.learn(env,
              q_func=arm_model,
              optimizer_spec=optimizer,
              session=session,
              scope_name=scope_name,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=2000,
              learning_freq=1,
              frame_history_len=1,
              target_update_freq=500,
              grad_norm_clipping=10,
              log_every_n_steps=500,
              spec_file=spec_file,
              exp_dir=exp_dir)

    ep_rew = env.get_episode_rewards()
    ep_len = env.get_episode_lengths()
    return ep_rew, ep_len

def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 0.2),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    # Initialize Logging Dir
    data_path = osp.join(osp.dirname(osp.realpath(__file__)), 'data')
    if not osp.exists(data_path):
        os.makedirs(data_path)
    logdir = 'dqn_' + env.spec.id + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = osp.join(data_path, logdir)

    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=1,
              target_update_freq=10000,
              grad_norm_clipping=10,
              logdir=logdir)
    env.close()

def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    if REW_FILE == 'LinearSchedule':
        exploration_schedule = LinearSchedule(num_iterations,
                                              final_p=0.01,
                                              initial_p=1.0)
    elif REW_FILE == 'ConstantSchedule':
        exploration_schedule = ConstantSchedule(0.05)
    else:
        exploration_schedule = PiecewiseSchedule([
            (0, 1.0),
            (num_iterations / 5, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01)

    dqn.learn(env=env,
              q_func=atari_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=100000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=True,
              rew_file=REW_FILE)
    env.close()

def lander_learn(env, session, discount, num_timesteps, batch_size, double):
    optimizer = lander_optimizer()
    stopping_criterion = lander_stopping_criterion(num_timesteps)
    exploration_schedule = lander_exploration_schedule(num_timesteps)

    dqn.learn(env=env,
              session=session,
              exploration=lander_exploration_schedule(num_timesteps),
              stopping_criterion=lander_stopping_criterion(num_timesteps),
              batch_size=batch_size,
              gamma=discount,
              double_q=double,
              **lander_kwargs())
    env.close()

def atari_learn(env, session, discount, num_timesteps, batch_size, double,
                target_update_freq, **kwargs):
    # [Mehran Shakerinava] change end

    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    dqn.learn(
        env=env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        grad_norm_clipping=10,
        # [Mehran Shakerinava] change begin
        target_update_freq=target_update_freq,
        batch_size=batch_size,
        gamma=discount,
        double_q=double
        # [Mehran Shakerinava] change end
    )
    env.close()

def knapsack_learn(env,
                   session,
                   num_timesteps,
                   lr_multiplier=1.0,
                   target_update_freq=10000,
                   exp_name='Knapsack_DQN',
                   boltzmann_exploration=False):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    dqn.learn(env,
              q_func=knapsack_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=512,
              gamma=1,
              learning_starts=5000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=target_update_freq,
              grad_norm_clipping=10,
              exp_name=exp_name,
              boltzmann_exploration=boltzmann_exploration)
    env.close()

def main():
    args = get_args()

    env = make_atari_env(args.env, args.seed)
    benchmark_env = make_atari_env(args.env, args.seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4)

    exploration_schedule = utils.PiecewiseSchedule(
        [
            (0, 1.0),
            (args.prepopulate, 1.0),
            (args.prepopulate + args.explore_time, args.final_eps),
        ],
        outside_value=args.final_eps,
    )

    if not args.legacy:
        assert args.train_freq == 4  # Training frequency is undefined for DQN(lambda)
        replay_memory = make_replay_memory(args.return_est, args.mem_size,
                                           args.history_len, args.discount,
                                           args.cache_size, args.block_size,
                                           args.priority)
    else:
        assert args.cache_size == 80000  # Cache-related args are undefined for legacy DQN
        assert args.priority == 0.0
        assert args.block_size == 100
        replay_memory = make_legacy_replay_memory(args.return_est, args.mem_size,
                                                  args.history_len, args.discount)

    with utils.make_session(args.seed) as session:
        dqn.learn(
            session,
            env,
            benchmark_env,
            atari_cnn,
            replay_memory,
            optimizer,
            exploration_schedule,
            args.timesteps,
            args.batch_size,
            args.prepopulate,
            args.update_freq,
            train_freq=args.train_freq,
            grad_clip=args.grad_clip,
            log_every_n_steps=1000,
        )
    env.close()

def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 0.2),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    dqn.learn(
        env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=1,
        target_update_freq=10000,
        grad_norm_clipping=10
    )
    env.close()

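# dqn.OptimizerSpec and get_wrapper_by_name are used throughout this file but
# defined elsewhere (conventionally in the dqn/dqn_utils module). A sketch of
# those conventional definitions follows: OptimizerSpec bundles an optimizer
# constructor with its keyword arguments and a learning-rate schedule (some
# PyTorch variants above omit the lr_schedule field), and get_wrapper_by_name
# walks the gym wrapper chain to find a wrapper such as "Monitor" by class name.
from collections import namedtuple

import gym

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"])


def get_wrapper_by_name(env, classname):
    currentenv = env
    while True:
        if classname in currentenv.__class__.__name__:
            return currentenv
        elif isinstance(currentenv, gym.Wrapper):
            # Step down one level in the wrapper chain.
            currentenv = currentenv.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)
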
def atari_learn(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    LEARNING_RATE = 5e-5
    lr_multiplier = 3.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=optim.Adam,
        kwargs=dict(lr=LEARNING_RATE, eps=1e-4)
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    dqn.learn(
        env,
        q_func=atari_model,
        optimizer_spec=optimizer,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10,
        num_target_values=10
    )
    env.close()

def lander_learn(env, session, num_timesteps, seed):
    optimizer = lander_optimizer()
    stopping_criterion = lander_stopping_criterion(num_timesteps)
    exploration_schedule = lander_exploration_schedule(num_timesteps)

    dqn.learn(env=env,
              session=session,
              exploration=lander_exploration_schedule(num_timesteps),
              stopping_criterion=lander_stopping_criterion(num_timesteps),
              double_q=True,
              save_name='lander_ddqn_replay500000_target5000',
              save=True,
              test=False,
              **lander_kwargs())
    env.close()

def atari_learn(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return env.get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            # (0, 1.0),
            # (1e6, 0.1),
            (0, 0.4),
            (5e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01)

    lr_schedule = dict(milestones=[num_iterations / 2], gamma=0.5)

    dqn.learn(
        env=env,
        lr_schedule=lr_schedule,
        load_path='model/step_2400000.pth.tar',
        # load_path=None,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=700000,
        # replay_buffer_size=70000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        # learning_starts=50,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10,
        double_q=True)
    env.close()

def lander_learn(env, session, num_timesteps, seed, double_q, explore):
    optimizer = lander_optimizer()
    stopping_criterion = lander_stopping_criterion(num_timesteps)
    exploration_schedule = lander_exploration_schedule(num_timesteps)

    dqn.learn(
        env=env,
        session=session,
        exploration=lander_exploration_schedule(num_timesteps),
        stopping_criterion=lander_stopping_criterion(num_timesteps),
        # double_q=True,
        double_q=double_q,
        rew_file='./pkl/lander_' + time.strftime("%d-%m-%Y_%H-%M-%S") + '.pkl',
        explore=explore,
        **lander_kwargs())
    env.close()

def atari_learn(env, session, num_timesteps, result_dir):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    lander_optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                         kwargs={},
                                         lr_schedule=ConstantSchedule(1e-3))

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1),
        (num_timesteps * 0.1, 0.02),
    ], outside_value=0.02)

    dqn.learn(
        env=env,
        q_func=lander_model,
        optimizer_spec=lander_optimizer,
        session=session,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=50000,
        batch_size=32,
        gamma=1,
        learning_starts=1000,
        learning_freq=1,
        frame_history_len=1,
        target_update_freq=3000,
        grad_norm_clipping=10,
        lander=True,
        rew_file=osp.join(result_dir, 'episode_rewards.pkl'),
    )
    env.close()

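# The launcher above passes q_func=lander_model, which is defined elsewhere.
# A sketch of the small fully connected Q-network conventionally used for
# LunarLander in these homeworks follows (the original repo's architecture
# may differ): two hidden ReLU layers and a linear Q-value head.
import tensorflow as tf
import tensorflow.contrib.layers as layers


def lander_model(obs, num_actions, scope, reuse=False):
    with tf.variable_scope(scope, reuse=reuse):
        out = obs
        with tf.variable_scope("action_value"):
            out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu)
            out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu)
            # One Q-value per discrete action, no output nonlinearity.
            out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        return out
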
def lander_learn(env, session, num_timesteps, seed):
    # Initialize Logging Dir
    data_path = osp.join(osp.dirname(osp.realpath(__file__)), 'data')
    if not osp.exists(data_path):
        os.makedirs(data_path)
    logdir = 'dqn_' + env.spec.id + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = osp.join(data_path, logdir)

    dqn.learn(env=env,
              session=session,
              exploration=lander_exploration_schedule(num_timesteps),
              stopping_criterion=lander_stopping_criterion(num_timesteps),
              double_q=False,
              logdir=logdir,
              **lander_kwargs())
    env.close()

def lander_learn(env, session, seed, exp_name, num_timesteps, double_q,
                 replay_buffer_size):
    optimizer = lander_optimizer()
    stopping_criterion = lander_stopping_criterion(num_timesteps)
    exploration_schedule = lander_exploration_schedule(num_timesteps)

    dqn.learn(env=env,
              session=session,
              exp_name=exp_name,
              seed=seed,
              exploration=lander_exploration_schedule(num_timesteps),
              stopping_criterion=lander_stopping_criterion(num_timesteps),
              double_q=double_q,
              replay_buffer_size=replay_buffer_size,
              **lander_kwargs())
    env.close()

def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    dqn.learn(
        env,
        q_func=atari_model,               # the Q-network defined above
        optimizer_spec=optimizer,         # named tuple: optimizer constructor, kwargs, lr schedule
        session=session,                  # the tf session
        exploration=exploration_schedule, # epsilon-greedy exploration schedule
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,       # number of transitions kept in the replay buffer
        batch_size=32,                    # minibatch size for each gradient step
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10)
    env.close()

def run(worker_id, monitor, args):
    task_id, env_code, agent = args
    env = gym.make(env_code)
    _, reward_history, _ = dqn.learn(env, agent, monitor=monitor, worker_id=worker_id)
    env.close()
    return reward_history

def atari_learn(env, session, num_timesteps, model, double_q, logdir):
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_timesteps / 10, 1e-4 * lr_multiplier),
        (num_timesteps / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= 4 * num_timesteps

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_timesteps / 2, 0.01),
    ], outside_value=0.01)

    dqn.learn(env=env,
              q_func=globals()["atari_model_" + model],
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_q=double_q,
              rew_file=None,
              logdir=logdir)
    env.close()

def knapsack_learn(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return False

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    dqn.learn(env,
              q_func=knapsack_model,
              nn_size=3,
              n_hidden_units=128,
              exploration=exploration_schedule,
              stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              target_update_freq=10000,
              grad_norm_clipping=10,
              double_DQN=True,
              n_steps_ahead=3)
    env.close()

def game_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4),
                                  lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        pass

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    dqn.learn(env,
              q_func=cnn_model,
              optimizer_spec=optimizer,
              session=session,
              exploration=exploration_schedule,
              stopping_criterion=None,
              replay_buffer_size=1000000,
              batch_size=32,
              gamma=0.99,
              learning_starts=50000,
              learning_freq=4,
              frame_history_len=4,
              target_update_freq=10000,
              grad_norm_clipping=10)
    env.close()

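# A launcher like atari_learn is typically wired up by a small driver. The
# sketch below assumes helper functions get_env (wraps the raw task with
# Monitor and the Atari preprocessing wrappers for a given seed) and
# get_session (creates a configured tf.Session); both are conventional in
# these homework repos but are not shown in this file.
import random

import gym


def main():
    # Create the benchmark task and train on it.
    task = gym.make('PongNoFrameskip-v4')
    seed = random.randint(0, 9999)
    env = get_env(task, seed)   # assumed helper: monitored, wrapped env
    session = get_session()     # assumed helper: configured tf.Session
    atari_learn(env, session, num_timesteps=2e8)


if __name__ == "__main__":
    main()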