def learn(env, session, args):
    if args.env == 'PongNoFrameskip-v4':
        lr_schedule = ConstantSchedule(1e-4)
        optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                      kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)
        limit = max(int(args.num_steps / 2), 2e6)
        exploration_schedule = PiecewiseSchedule(
            [(0, 1.00), (1e6, 0.10), (limit, 0.01)], outside_value=0.01)
        dqn.learn(env=env, q_func=atari_model, optimizer_spec=optimizer, session=session,
                  exploration=exploration_schedule, replay_buffer_size=1000000, batch_size=32,
                  gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4,
                  target_update_freq=10000, grad_norm_clipping=10, double_q=args.double_q,
                  logdir=args.logdir, max_steps=args.num_steps)
    elif args.env == 'CartPole-v0':
        lr_schedule = ConstantSchedule(5e-4)
        optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                      kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)
        exploration_schedule = PiecewiseSchedule(
            [(0, 1.00), (5e4, 0.10), (1e5, 0.02)], outside_value=0.02)
        dqn.learn(env=env, q_func=cartpole_model, optimizer_spec=optimizer, session=session,
                  exploration=exploration_schedule, replay_buffer_size=10000, batch_size=100,
                  gamma=0.99, learning_starts=1000, learning_freq=4, frame_history_len=1,
                  target_update_freq=500, grad_norm_clipping=10, double_q=args.double_q,
                  logdir=args.logdir, max_steps=args.num_steps, cartpole=True)
    else:
        raise ValueError(args.env)
    env.close()

def atari_learn(env, session, num_timesteps):
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdadeltaOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    dqn.learn(env=env, q_func=atari_model, optimizer_spec=optimizer, session=session,
              exploration=exploration_schedule, stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000,
              learning_freq=4, frame_history_len=4, target_update_freq=10000,
              grad_norm_clipping=10, double_q=True)
    env.close()

def lander_optimizer():
    return dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        lr_schedule=ConstantSchedule(1e-3),
        # lr_schedule=ConstantSchedule(0.5e-3),
        # lr_schedule=ConstantSchedule(0.1e-3),
        # lr_schedule=ConstantSchedule(0.01e-3),
        kwargs={})

def lander_optimizer():
    lr_schedule = ConstantSchedule(1e-3)
    lr_lambda = lambda t: lr_schedule.value(t)
    return dqn.OptimizerSpec(
        constructor=torch.optim.Adam,
        lr_lambda=lr_lambda,
        kwargs={}
    )

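# --- Illustrative sketch (not from any of the snippets above) ---
# A minimal, self-contained example of how a PyTorch-style OptimizerSpec carrying an
# lr_lambda, like the second lander_optimizer() above, could be consumed. The
# OptimizerSpec namedtuple, the ConstantSchedule stand-in, and the toy network are
# assumptions for illustration; only torch.optim.Adam and torch.optim.lr_scheduler.LambdaLR
# are real PyTorch APIs, and the real dqn.learn may wire the schedule up differently.
from collections import namedtuple

import torch
import torch.nn as nn

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "lr_lambda", "kwargs"])


class ConstantSchedule(object):
    def __init__(self, value):
        self._value = value

    def value(self, t):
        return self._value  # same learning rate at every timestep t


def lander_optimizer():
    lr_schedule = ConstantSchedule(1e-3)
    return OptimizerSpec(constructor=torch.optim.Adam,
                         lr_lambda=lambda t: lr_schedule.value(t),
                         kwargs={})


q_net = nn.Sequential(nn.Linear(8, 64), nn.ReLU(), nn.Linear(64, 4))  # toy lander-sized net
spec = lander_optimizer()
# Base lr of 1.0 so the LambdaLR factor *is* the effective learning rate at step t.
optimizer = spec.constructor(q_net.parameters(), lr=1.0, **spec.kwargs)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=spec.lr_lambda)
for t in range(3):
    optimizer.step()   # no gradients here; a real loop would call loss.backward() first
    scheduler.step()   # re-evaluates spec.lr_lambda to set the next learning rate
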
def atari_learn(env, session, num_timesteps, double_q, explore, env_name, ex2=ex2, coef=coef):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    # therefore, the exploration gradually decreases
    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    # TODO: pay attention to the double_q arg here
    dqn.learn(env=env, q_func=atari_model, optimizer_spec=optimizer, session=session,
              exploration=exploration_schedule, stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000,
              learning_freq=4, frame_history_len=4, target_update_freq=10000,
              grad_norm_clipping=10, double_q=double_q,
              rew_file='./pkl/' + env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") + '.pkl',
              explore=explore, ex2=ex2, coef=coef)
    env.close()

def atari_learn(
        env,
        session,
        num_timesteps,
        # YOUR OWN CODE
        seed,
        doubleQ=True,
        exp_name='doubleQ',
        rew_file='ram_test.pk1'):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 0.2), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    dqn.learn(
        env, q_func=atari_model, optimizer_spec=optimizer, session=session,
        exploration=exploration_schedule, stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000,
        learning_freq=4, frame_history_len=1, target_update_freq=10000, grad_norm_clipping=10,
        # YOUR OWN CODE
        double_q=doubleQ, rew_file=rew_file, seed=seed, env_name='Pong-ram-v0',
        exp_name=exp_name)
    env.close()

def atari_learn(env, session, num_timesteps, lr_multiplier):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    if not os.path.exists('data'):
        os.makedirs('data')
    logdir = os.path.join('data', 'PongNoFrameskip-v4')
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    dqn.learn(
        env=env, q_func=atari_model, optimizer_spec=optimizer, session=session,
        exploration=exploration_schedule, stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000,
        learning_freq=4, frame_history_len=4, target_update_freq=10000, grad_norm_clipping=10,
        double_q=False,
        # rew_file='%s_lr_%s.pkl' % (os.path.join(logdir, time.strftime("%d-%m-%Y_%H-%M-%S")), str(lr_multiplier))
        rew_file='%s_lr_%s.pkl' % (os.path.join(logdir, time.strftime("%d-%m-%Y_%H-%M-%S")), 'vanilla')
    )
    env.close()

def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        if t % 10000 == 0:
            print("get_total_steps:" + str(get_wrapper_by_name(env, "Monitor").get_total_steps())
                  + ", t:" + str(t) + ", num_timesteps:" + str(num_timesteps))
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    exploration_schedule2 = PiecewiseSchedule(
        [(0, 1.0), (2e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    dqn.learn(
        env, q_func=atari_model, optimizer_spec=optimizer, session=session,
        exploration=exploration_schedule,  # pipaek
        stopping_criterion=stopping_criterion,
        # replay_buffer_size=1000000,
        replay_buffer_size=2000000,  # pipaek
        batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4,
        frame_history_len=4, target_update_freq=10000, grad_norm_clipping=10)
    env.close()

def atari_learn(env, session, args, num_timesteps):
    logdir = os.path.join('data', args.exp_name)
    # if not(os.path.exists(logdir)):
    #     os.makedirs(logdir)
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    dqn.learn(
        env=env, q_func=atari_model, optimizer_spec=optimizer, session=session,
        exploration=exploration_schedule, stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000, batch_size=32, gamma=args.gamma, learning_starts=50000,
        learning_freq=4, frame_history_len=4, target_update_freq=10000, grad_norm_clipping=10,
        double_q=args.double_q, logdir=logdir
    )
    env.close()

def cartpole_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    # lr_multiplier = 1.0
    # lr_multiplier = 0.1
    # lr_schedule = PiecewiseSchedule(
    #     [(0, 1e-4 * lr_multiplier), (num_iterations / 2, 1e-5 * lr_multiplier)],
    #     outside_value=5e-5 * lr_multiplier)
    lr_schedule = InverseSchedule(initial_p=0.1, gamma=0.6)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.GradientDescentOptimizer,
        # constructor=tf.train.AdamOptimizer,
        # kwargs=dict(epsilon=1e-4),
        kwargs=dict(),
        # constructor=tf.train.RMSPropOptimizer,
        # kwargs=dict(epsilon=1e-1),
        lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            # (0.2 * num_timesteps, 0.9),
            # (0.5 * num_timesteps, 0.5),
            (0.1 * num_timesteps, 0.1),
        ], outside_value=0.01)
    dqn.learn(
        env, q_func=cartpole_model, optimizer_spec=optimizer, session=session,
        exploration=exploration_schedule, stopping_criterion=stopping_criterion,
        replay_buffer_size=100000, batch_size=256, gamma=0.99, learning_starts=2000,
        learning_freq=1, frame_history_len=4, target_update_freq=1000,
        grad_norm_clipping=1000,
    )
    env.close()

def arm_learn(env, session, scope_name, num_timesteps, spec_file=None, exp_dir=None):
    # # This is just a rough estimate
    # num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_timesteps / 40, 1e-4 * lr_multiplier),
         (num_timesteps / 8, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(t):
        return t >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (num_timesteps / 20, 0.3), (num_timesteps / 10, 0.1),
         (num_timesteps / 2, 0.01)], outside_value=0.01)
    dqn.learn(env, q_func=arm_model, optimizer_spec=optimizer, session=session,
              scope_name=scope_name, exploration=exploration_schedule,
              stopping_criterion=stopping_criterion, replay_buffer_size=1000000,
              batch_size=32, gamma=0.99, learning_starts=2000, learning_freq=1,
              frame_history_len=1, target_update_freq=500, grad_norm_clipping=10,
              log_every_n_steps=500, spec_file=spec_file, exp_dir=exp_dir)
    ep_rew = env.get_episode_rewards()
    ep_len = env.get_episode_lengths()
    return ep_rew, ep_len

def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 0.2), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    # Initialize Logging Dir
    data_path = osp.join(osp.dirname(osp.realpath(__file__)), 'data')
    if not osp.exists(data_path):
        os.makedirs(data_path)
    logdir = 'dqn_' + env.spec.id + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = osp.join(data_path, logdir)
    dqn.learn(env=env, q_func=atari_model, optimizer_spec=optimizer, session=session,
              exploration=exploration_schedule, stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000,
              learning_freq=4, frame_history_len=1, target_update_freq=10000,
              grad_norm_clipping=10, logdir=logdir)
    env.close()

def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    if REW_FILE == 'LinearSchedule':
        exploration_schedule = LinearSchedule(num_iterations, final_p=0.01, initial_p=1.0)
    elif REW_FILE == 'ConstantSchedule':
        exploration_schedule = ConstantSchedule(0.05)
    else:
        exploration_schedule = PiecewiseSchedule(
            [(0, 1.0), (num_iterations / 5, 0.1), (num_iterations / 2, 0.01)],
            outside_value=0.01)
    dqn.learn(env=env, q_func=atari_model, optimizer_spec=optimizer, session=session,
              exploration=exploration_schedule, stopping_criterion=stopping_criterion,
              replay_buffer_size=100000, batch_size=32, gamma=0.99, learning_starts=50000,
              learning_freq=4, frame_history_len=4, target_update_freq=10000,
              grad_norm_clipping=10, double_q=True, rew_file=REW_FILE)
    env.close()

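# --- Illustrative sketch (not from any of the snippets above) ---
# Minimal stand-ins for the three schedule types switched on above, just to make the
# epsilon annealing concrete. Assumed behavior: each schedule exposes value(t);
# PiecewiseSchedule interpolates linearly between its (t, value) endpoints and returns
# outside_value beyond the last endpoint. The real classes live in the course's
# dqn_utils module and may differ in detail.
class ConstantSchedule(object):
    def __init__(self, value):
        self._v = value

    def value(self, t):
        return self._v


class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        frac = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + frac * (self.final_p - self.initial_p)


class PiecewiseSchedule(object):
    def __init__(self, endpoints, outside_value=None):
        self.endpoints = endpoints
        self.outside_value = outside_value

    def value(self, t):
        for (l_t, l_v), (r_t, r_v) in zip(self.endpoints[:-1], self.endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return l_v + alpha * (r_v - l_v)
        return self.outside_value


num_iterations = 2e6
eps = PiecewiseSchedule([(0, 1.0), (num_iterations / 5, 0.1), (num_iterations / 2, 0.01)],
                        outside_value=0.01)
for t in [0, 1e5, 4e5, 1e6, 3e6]:
    print(t, eps.value(t))  # epsilon anneals 1.0 -> 0.1 -> 0.01, then stays at 0.01
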
def atari_learn(env, session, discount, num_timesteps, batch_size, double,
                target_update_freq, **kwargs):
    # [Mehran Shakerinava] change end
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    dqn.learn(
        env=env, q_func=atari_model, optimizer_spec=optimizer, session=session,
        exploration=exploration_schedule, stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000, learning_starts=50000, learning_freq=4,
        frame_history_len=4, grad_norm_clipping=10,
        # [Mehran Shakerinava] change begin
        target_update_freq=target_update_freq, batch_size=batch_size, gamma=discount,
        double_q=double
        # [Mehran Shakerinava] change end
    )
    env.close()

def knapsack_learn(env, session, num_timesteps, lr_multiplier=1.0, target_update_freq=10000,
                   exp_name='Knapsack_DQN', boltzmann_exploration=False):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    dqn.learn(env, q_func=knapsack_model, optimizer_spec=optimizer, session=session,
              exploration=exploration_schedule, stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000, batch_size=512, gamma=1, learning_starts=5000,
              learning_freq=4, frame_history_len=4, target_update_freq=target_update_freq,
              grad_norm_clipping=10, exp_name=exp_name,
              boltzmann_exploration=boltzmann_exploration)
    env.close()

def atari_learn(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    LEARNING_RATE = 5e-5
    lr_multiplier = 3.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=optim.Adam,
        kwargs=dict(lr=LEARNING_RATE, eps=1e-4)
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    dqn.learn(
        env, q_func=atari_model, optimizer_spec=optimizer, exploration=exploration_schedule,
        stopping_criterion=stopping_criterion, replay_buffer_size=1000000, batch_size=32,
        gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4,
        target_update_freq=10000, grad_norm_clipping=10, num_target_values=10
    )
    env.close()

def atari_learn(env, session, num_timesteps, result_dir):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    lander_optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                         kwargs={}, lr_schedule=ConstantSchedule(1e-3))

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1), (num_timesteps * 0.1, 0.02)], outside_value=0.02)
    dqn.learn(
        env=env, q_func=lander_model, optimizer_spec=lander_optimizer, session=session,
        exploration=exploration_schedule, stopping_criterion=stopping_criterion,
        replay_buffer_size=50000, batch_size=32, gamma=1, learning_starts=1000,
        learning_freq=1, frame_history_len=1, target_update_freq=3000, grad_norm_clipping=10,
        lander=True, rew_file=osp.join(result_dir, 'episode_rewards.pkl'),
    )
    env.close()

def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    dqn.learn(
        env,
        q_func=atari_model,  # just the neural network as defined above
        optimizer_spec=optimizer,  # a named tuple containing the grad alg, lr_schedule etc.
        session=session,  # the tf session
        exploration=exploration_schedule,  # epsilon-greedy schedule
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,  # replay buffer size... size of what?
        batch_size=32,  # gradient-step batch size, I guess
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10)
    env.close()

def atari_learn(env, session, num_timesteps, model, double_q, logdir):
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_timesteps / 10, 1e-4 * lr_multiplier),
         (num_timesteps / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= 4 * num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_timesteps / 2, 0.01)], outside_value=0.01)
    dqn.learn(env=env, q_func=globals()["atari_model_" + model], optimizer_spec=optimizer,
              session=session, exploration=exploration_schedule,
              stopping_criterion=stopping_criterion, replay_buffer_size=1000000, batch_size=32,
              gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4,
              target_update_freq=10000, grad_norm_clipping=10, double_q=double_q,
              rew_file=None, logdir=logdir)
    env.close()

def smb_learn(model_name, env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    cont_train_model(
        model_name, env, optimizer_spec=optimizer, exploration=exploration_schedule,
        stopping_criterion=stopping_criterion, replay_buffer_size=125000, batch_size=32,
        gamma=0.99,  # 0.99
        learning_starts=30000, learning_freq=4, frame_history_len=4,
        target_update_freq=10000, grad_norm_clipping=10)
    env.close()

def knapsack_learn(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return False

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    dqn.learn(env, q_func=knapsack_model, nn_size=3, n_hidden_units=128,
              exploration=exploration_schedule, stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000,
              learning_freq=4, target_update_freq=10000, grad_norm_clipping=10,
              double_DQN=True, n_steps_ahead=3)
    env.close()

def game_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        pass

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    dqn.learn(env, q_func=cnn_model, optimizer_spec=optimizer, session=session,
              exploration=exploration_schedule, stopping_criterion=None,
              replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000,
              learning_freq=4, frame_history_len=4, target_update_freq=10000,
              grad_norm_clipping=10)
    env.close()

def atari_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    # Create action exploration/exploitation policy
    policy = LinearAnnealedPolicy(session=session, env=env, num_iterations=num_iterations)
    dqn.learn(
        env, policy=policy, q_func=atari_model, optimizer_spec=optimizer, session=session,
        # exploration=exploration_schedule,
        stopping_criterion=stopping_criterion, replay_buffer_size=1000000, batch_size=32,
        gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4,
        target_update_freq=10000, grad_norm_clipping=10
    )
    env.close()

def tt_learn(env, session, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        return t >= num_timesteps

    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    dqn.learn(env, q_func=tt_model_dqn, optimizer_spec=optimizer, session=session,
              exploration=exploration_schedule, stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000,
              learning_freq=4, frame_history_len=4, target_update_freq=10000,
              grad_norm_clipping=10, double_net=True)

def atari_learn(env, session, num_timesteps, env_test):
    # TODO: principle of Adam and more parameters
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=FLAGS.lr_schedule)

    # TODO: t input is not used here
    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    # TODO: better hyper parameters here
    if FLAGS.tabular:
        model = tabular_model
    else:
        model = atari_model
    dqn.learn(
        env, q_func=model, optimizer_spec=optimizer, session=session,
        exploration=FLAGS.exploration_schedule, stopping_criterion=stopping_criterion,
        replay_buffer_size=FLAGS.replay_buffer_size, batch_size=FLAGS.batch_size,
        gamma=FLAGS.discount_factor, learning_starts=FLAGS.learning_starts,
        learning_freq=FLAGS.learning_freq, frame_history_len=FLAGS.frame_history_len,
        target_update_freq=FLAGS.target_update_freq, grad_norm_clipping=10,
        env_test=env_test,
    )
    if env is not None:
        env.close()

def atari_learn(env, env_test, session, num_timesteps=2e7, learning_rate=None,
                exploration=None, dqn_config=None):
    '''
    fill the hyperparameters before running dqn
    :param env: ai gym env
    :param session: tensorflow session
    :param num_timesteps: int
    :param learning_rate: piecewise function
    :param exploration: piecewise function
    :param dqn_config: will override parameters above
    :return: none
    '''
    replay_buffer_size = 1000000
    batch_size = 32
    gamma = 0.99
    learning_starts = 50000
    learning_freq = 4
    frame_history_len = 4
    target_update_freq = 10000
    grad_norm_clipping = 10
    eval_obs_array = None
    room_q_interval = 1e5
    epoch_size = 5e3
    config_name = None
    if dqn_config:
        if dqn_config.has_key('num_timesteps'): num_timesteps = dqn_config['num_timesteps']
        if dqn_config.has_key('replay_buffer_size'): replay_buffer_size = dqn_config['replay_buffer_size']
        if dqn_config.has_key('batch_size'): batch_size = dqn_config['batch_size']
        if dqn_config.has_key('gamma'): gamma = dqn_config['gamma']
        if dqn_config.has_key('learning_starts'): learning_starts = dqn_config['learning_starts']
        if dqn_config.has_key('learning_freq'): learning_freq = dqn_config['learning_freq']
        if dqn_config.has_key('frame_history_len'): frame_history_len = dqn_config['frame_history_len']
        if dqn_config.has_key('target_update_freq'): target_update_freq = dqn_config['target_update_freq']
        if dqn_config.has_key('grad_norm_clipping'): grad_norm_clipping = dqn_config['grad_norm_clipping']
        if dqn_config.has_key('learning_rate'): learning_rate = dqn_config['learning_rate']
        if dqn_config.has_key('exploration'): exploration = dqn_config['exploration']
        if dqn_config.has_key('eval_obs_array'): eval_obs_array = dqn_config['eval_obs_array']
        if dqn_config.has_key('room_q_interval'): room_q_interval = dqn_config['room_q_interval']
        if dqn_config.has_key('epoch_size'): epoch_size = dqn_config['epoch_size']
        if dqn_config.has_key('config_name'): config_name = dqn_config['config_name']

    # log_dir = __cur_dir + 'logs/' + config_name + '_' + time + '/'
    cur_time = time.strftime("%m_%d_%y_%H:%M:%S", time.localtime(time.time()))
    log_dir = __cur_dir + 'logs/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    if config_name != None:
        log_dir = log_dir + config_name + '_' + cur_time + '/'
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        dqn_network_dir = log_dir + 'dqn/'
        if not os.path.exists(dqn_network_dir):
            os.makedirs(dqn_network_dir)
        pkl_dir = log_dir + 'pkl/'
        if not os.path.exists(pkl_dir):
            os.makedirs(pkl_dir)
    else:
        log_dir = None
        print("config_name not specified!\ninfo may not be logged in this run.")

    # This is just a rough estimate
    num_iterations = float(num_timesteps) / learning_freq
    if learning_rate != None:
        lr_schedule = learning_rate
    else:
        lr_multiplier = 1.0
        lr_schedule = PiecewiseSchedule(
            [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
             (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    if exploration != None:
        exploration_schedule = exploration
    else:
        exploration_schedule = PiecewiseSchedule(
            [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    dqn.learn(
        env, env_test, q_func=atari_model, optimizer_spec=optimizer, session=session,
        exploration=exploration_schedule, stopping_criterion=stopping_criterion,
        replay_buffer_size=replay_buffer_size, batch_size=batch_size, gamma=gamma,
        learning_starts=learning_starts, learning_freq=learning_freq,
        frame_history_len=frame_history_len, target_update_freq=target_update_freq,
        grad_norm_clipping=grad_norm_clipping, eval_obs_array=eval_obs_array,
        room_q_interval=room_q_interval, epoch_size=epoch_size, log_dir=log_dir
    )
    env.close()
    env_test.close()

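# --- Illustrative sketch (not from any of the snippets above) ---
# A hedged usage example for the dqn_config override mechanism in the atari_learn()
# variant above: the dictionary keys mirror the ones that function checks with has_key,
# but the gym/TF setup and the experiment name are assumptions, not part of the
# original module.
import gym
import tensorflow as tf

dqn_config = {
    'num_timesteps': int(2e6),
    'replay_buffer_size': 100000,
    'batch_size': 32,
    'gamma': 0.99,
    'learning_starts': 10000,
    'target_update_freq': 5000,
    'config_name': 'pong_smoke_test',  # hypothetical experiment name, enables logging dirs
}

env = gym.make('PongNoFrameskip-v4')       # hypothetical environment choice
env_test = gym.make('PongNoFrameskip-v4')
session = tf.Session()
# The function above would then be driven as:
# atari_learn(env, env_test, session, dqn_config=dqn_config)
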
def atari_learn(env, session, num_timesteps, exper_name=None):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    # default exploration schedule
    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    # # no exploration
    # exploration_schedule = PiecewiseSchedule(
    #     [(0, 0), (num_iterations, 0)], outside_value=0)
    # # Only explore in beginning
    # exploration_schedule = PiecewiseSchedule(
    #     [(0, 1.0), (1e6, 0.1), (1e6 + 2, 0), (num_iterations, 0)], outside_value=0)
    # exploration_schedule = PiecewiseSchedule(
    #     [(0, 0.5), (1e6, 0.1), (1e6 + 2, 0), (num_iterations, 0)], outside_value=0)
    # exploration_schedule = PiecewiseSchedule(
    #     [(0, 0.1), (1e6, 0.1), (1e6 + 2, 0), (num_iterations, 0)], outside_value=0)
    dqn.learn(env, q_func=atari_model, optimizer_spec=optimizer, session=session,
              exploration=exploration_schedule, stopping_criterion=stopping_criterion,
              replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000,
              learning_freq=4, frame_history_len=4, target_update_freq=10000,
              grad_norm_clipping=10)
    env.close()

def main():
    num_timesteps = 5000
    num_simulations = 20  # restart the simulation periodically so it can learn from a fresh start
    num_iterations = num_timesteps * num_simulations
    printMyRoute(num_timesteps)
    sumoBinary = r"D:\InstalledProgram\SUMO\bin\sumo-gui.exe"
    # sumoBinary = r"D:\InstalledProgram\SUMO\bin\sumo.exe"
    traci.start([
        sumoBinary, "-c", "conf2.sumocfg", "--tripinfo-output", "Testtripinfo.xml",
        "--no-step-log", "--time-to-teleport", "-1"
    ])
    seed = random.randint(0, 9999)
    print('random seed = %d' % seed)
    session = get_session()
    rew_file = 'testreward.pkl'
    lr_multiplier = 1
    num_hidden = 100
    gamma = 0.95
    learning_freq = 10
    target_update_freq = 100
    explor1 = num_timesteps
    explor2 = num_timesteps * (num_simulations - 1)
    lr_schedule = PiecewiseSchedule(
        [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
         (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)
    exploration_schedule = PiecewiseSchedule(
        [(0, 1.0), (explor1, 0.1), (explor2, 0.01)],
        outside_value=0.  # No exploration towards the end so we can see the true reward
    )
    # dqnlearn(
    dqnlearn(
        num_timesteps=num_timesteps,
        num_hidden=num_hidden,
        optimizer_spec=optimizer,
        session=session,
        exploration=exploration_schedule,
        replay_buffer_size=200000,
        batch_size=32,
        gamma=gamma,
        learning_starts=100,
        learning_freq=learning_freq,
        frame_history_len=1,  # no frame history, we only look at the last data point
        target_update_freq=target_update_freq,
        grad_norm_clipping=10,
        rew_file=rew_file,
        double_q=True  # True
    )

def main():
    num_timesteps = 50000  # it's actually not the number we get; look into that
    num_iterations = float(num_timesteps)
    printMyRoute(num_timesteps)
    # sumoBinary = r"D:\InstalledProgram\SUMO\bin\sumo-gui.exe"
    sumoBinary = r"D:\InstalledProgram\SUMO\bin\sumo.exe"
    # Run training
    f = open('triedParameters.txt', 'a+')
    ntries = 100
    for ntry in range(ntries):
        # hyperparameter random search
        lr_multiplier = 10**random.uniform(-2, 2)  # 1.0
        num_hidden = random.randint(30, 100)
        # horizon should be 1/(1-gamma); we get a reward every 6 seconds,
        # so 0.9 for a 60-second horizon should be good
        gamma = random.uniform(0.8, 0.95)
        learning_freq = random.randint(1, 20)
        target_update_freq = random.randint(10, 200)
        explor1 = random.randint(100, 1000)
        explor2 = int(num_iterations * random.uniform(0.1, 0.9))
        if explor2 < explor1:
            explor2 = (num_iterations + explor1) / 2
        print(ntry, lr_multiplier, num_hidden, gamma, learning_freq, target_update_freq,
              explor1, explor2, file=f)
        traci.start([
            sumoBinary, "-c", "conf2.sumocfg", "--tripinfo-output",
            "tripinfo" + str(ntry) + ".xml", "--no-step-log", "--time-to-teleport", "-1"
        ])
        seed = random.randint(0, 9999)
        print('random seed = %d' % seed)
        session = get_session()
        rew_file = 'reward' + str(ntry) + '.pkl'
        lr_schedule = PiecewiseSchedule(
            [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
             (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
        optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                      kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)
        exploration_schedule = PiecewiseSchedule(
            [(0, 1.0), (explor1, 0.1), (explor2, 0.01)],
            outside_value=0.  # No exploration towards the end so we can see the true reward
        )
        # dqnlearn(
        dqnlearn(
            num_timesteps=num_timesteps,
            num_hidden=num_hidden,
            optimizer_spec=optimizer,
            session=session,
            exploration=exploration_schedule,
            replay_buffer_size=200000,
            batch_size=32,
            gamma=gamma,
            learning_starts=100,
            learning_freq=learning_freq,
            frame_history_len=1,  # no frame history, we only look at the last data point
            target_update_freq=target_update_freq,
            grad_norm_clipping=10,
            rew_file=rew_file,
            double_q=True  # True
        )

def transfer_learn(env, env_test, env_test1, session, num_timesteps=2e7, learning_rate=None,
                   learning_rate_term=None, exploration=None, dqn_config=None):
    '''
    fill the hyperparameters before running dqn
    :param env: ai gym env
    :param session: tensorflow session
    :param num_timesteps: int
    :param learning_rate: piecewise function
    :param exploration: piecewise function
    :param dqn_config: will override parameters above
    :return: none
    '''
    replay_buffer_size = 1000000
    batch_size = 32
    gamma = 0.99
    learning_starts = 50000
    learning_freq = 4
    frame_history_len = 4
    target_update_freq = 10000
    grad_norm_clipping = 10
    eval_obs_array = None
    room_q_interval = 1e5
    epoch_size = 5e4
    config_name = None
    transfer_config = None
    source_dirs = []
    # term_optimizer =
    if dqn_config:
        if dqn_config.has_key('num_timesteps'): num_timesteps = dqn_config['num_timesteps']
        if dqn_config.has_key('replay_buffer_size'): replay_buffer_size = dqn_config['replay_buffer_size']
        if dqn_config.has_key('batch_size'): batch_size = dqn_config['batch_size']
        if dqn_config.has_key('gamma'): gamma = dqn_config['gamma']
        if dqn_config.has_key('learning_starts'): learning_starts = dqn_config['learning_starts']
        if dqn_config.has_key('learning_freq'): learning_freq = dqn_config['learning_freq']
        if dqn_config.has_key('frame_history_len'): frame_history_len = dqn_config['frame_history_len']
        if dqn_config.has_key('target_update_freq'): target_update_freq = dqn_config['target_update_freq']
        if dqn_config.has_key('grad_norm_clipping'): grad_norm_clipping = dqn_config['grad_norm_clipping']
        if dqn_config.has_key('learning_rate'): learning_rate = dqn_config['learning_rate']
        if dqn_config.has_key('exploration'): exploration = dqn_config['exploration']
        if dqn_config.has_key('eval_obs_array'): eval_obs_array = dqn_config['eval_obs_array']
        if dqn_config.has_key('room_q_interval'): room_q_interval = dqn_config['room_q_interval']
        if dqn_config.has_key('epoch_size'): epoch_size = dqn_config['epoch_size']
        if dqn_config.has_key('config_name'): config_name = dqn_config['config_name']
        if dqn_config.has_key('transfer_config'): transfer_config = dqn_config['transfer_config']
    if transfer_config:
        if transfer_config.has_key('source_dirs'): source_dirs = transfer_config['source_dirs']
        if transfer_config.has_key('learning_rate_term'): learning_rate_term = transfer_config['learning_rate_term']
    if len(source_dirs) == 0:
        print('no source policies provided! check your config.')

    # log_dir = __cur_dir + 'logs/' + config_name + '_' + time + '/'
    cur_time = time.strftime("%m_%d_%y_%H:%M:%S", time.localtime(time.time()))
    log_dir = __cur_dir + 'logs/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    if config_name != None:
        log_dir = log_dir + config_name + '_' + cur_time + '/'
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        dqn_network_dir = log_dir + 'dqn/'
        if not os.path.exists(dqn_network_dir):
            os.makedirs(dqn_network_dir)
        pkl_dir = log_dir + 'pkl/'
        if not os.path.exists(pkl_dir):
            os.makedirs(pkl_dir)
        tfb_dir = log_dir + 'tfb/'
        if not os.path.exists(tfb_dir):
            os.makedirs(tfb_dir)
    else:
        log_dir = None
        print("config_name not specified!\ninfo may not be logged in this run.")

    # This is just a rough estimate
    num_iterations = float(num_timesteps) / learning_freq
    if learning_rate != None:
        lr_schedule = learning_rate
    else:
        lr_multiplier = 1.0
        lr_schedule = PiecewiseSchedule(
            [(0, 1e-4 * lr_multiplier), (num_iterations / 10, 1e-4 * lr_multiplier),
             (num_iterations / 2, 5e-5 * lr_multiplier)], outside_value=5e-5 * lr_multiplier)
    lr_schedule_omega = PiecewiseSchedule(
        [(0, 2e-4), (num_iterations / 2, 1e-4), (num_iterations * 3 / 4, 5e-5)],
        outside_value=5e-5)
    if learning_rate_term is not None:
        lr_schedule_term = learning_rate_term
    else:
        lr_schedule_term = PiecewiseSchedule(
            [(0, 2.5e-4), (num_iterations / 10, 1e-4), (num_iterations * 3 / 4, 5e-5)],
            outside_value=5e-5)
    if exploration != None:
        exploration_schedule = exploration
    else:
        exploration_schedule = PiecewiseSchedule(
            [(0, 1.0), (1e6, 0.1), (num_iterations / 2, 0.01)], outside_value=0.01)
    optimizer = dqn.OptimizerSpec(constructor=tf.train.AdamOptimizer,
                                  kwargs=dict(epsilon=1e-4), lr_schedule=lr_schedule)
    optimizer_omega = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),  # not for SGD
        # kwargs=dict(),  # for SGD
        lr_schedule=lr_schedule_omega)
    optimizer_term = dqn.OptimizerSpec(
        constructor=tf.train.GradientDescentOptimizer,
        # kwargs=dict(epsilon=1e-4),  # not for SGD
        kwargs=dict(),
        lr_schedule=lr_schedule_term)

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
        # return False

    # init sources and primitive options
    options = []
    options += [Source(dqn_config, env, tf.train.get_checkpoint_state(d)) for d in source_dirs]
    for action in range(env.action_space.n):
        options.append(PrimitiveOption(action))
    dqn.learn(env, env_test, env_test1, transfer_model, optimizer, optimizer_omega,
              optimizer_term, session=session, options=options,
              exploration=exploration_schedule, stopping_criterion=stopping_criterion,
              replay_buffer_size=replay_buffer_size, batch_size=batch_size, gamma=gamma,
              learning_starts=learning_starts, learning_freq=learning_freq,
              frame_history_len=frame_history_len, target_update_freq=target_update_freq,
              grad_norm_clipping=grad_norm_clipping, eval_obs_array=eval_obs_array,
              room_q_interval=room_q_interval, epoch_size=epoch_size, log_dir=log_dir,
              transfer_config=transfer_config)
    env.close()
    env_test.close()
    env_test1.close()