import tensorflow as tf
from collections import deque

# NOTE: repo-internal helpers (set_up_for_training, prep_env, ReplayBuffer,
# EpsilonGreedyPolicy_eager, gradient_clip_fn, prep_model, prep_obs_processor,
# Double_DQN, train) are assumed to be imported from this repo's modules.


def train_eval(log_dir_name,
               random_seed,
               env_name="CartPole",
               eps_start=1.0,
               eps_end=0.02,
               decay_steps=3000,
               optimizer=tf.keras.optimizers.RMSprop,
               learning_rate=0.00025,
               decay=0.95,
               momentum=0.0,
               epsilon=0.00001,
               centered=True,
               loss_fn=tf.compat.v1.losses.huber_loss,
               grad_clip_flg=None,
               num_frames=10000,
               train_freq=1,
               memory_size=5000,
               hot_start=100,
               sync_freq=1000,
               batch_size=32,
               interval_MAR=10,
               gamma=0.99,
               num_eval_episodes=1,
               eval_interval=1000):
    # init global time-step
    global_timestep = tf.compat.v1.train.create_global_step()

    # instantiate the annealing schedule for the exploration rate (epsilon)
    anneal_ep = tf.compat.v1.train.polynomial_decay(eps_start, global_timestep, decay_steps, eps_end)

    # prep for training: log directories, env, replay/reward buffers, summary writer
    log_dir = set_up_for_training(log_dir_name=log_dir_name, env_name=env_name, seed=random_seed)
    env = prep_env(env_name=env_name, video_path=log_dir["video_path"])
    replay_buffer = ReplayBuffer(memory_size, traj_dir=log_dir["traj_path"])
    reward_buffer = deque(maxlen=interval_MAR)  # moving average over the last `interval_MAR` episode rewards
    summary_writer = tf.compat.v2.summary.create_file_writer(log_dir["summary_path"])

    # for the default RMSprop, the positional args map to
    # (learning_rate, rho, momentum, epsilon, centered)
    agent = Double_DQN(model=prep_model(env_name),
                       policy=EpsilonGreedyPolicy_eager(dim_action=env.action_space.n, epsilon_fn=anneal_ep),
                       optimizer=optimizer(learning_rate, decay, momentum, epsilon, centered),
                       loss_fn=loss_fn,
                       grad_clip_fn=gradient_clip_fn(flag=grad_clip_flg),
                       num_action=env.action_space.n,
                       model_dir=log_dir["model_path"],
                       gamma=gamma,
                       obs_prc_fn=prep_obs_processor(env_name))

    train(global_timestep, agent, env, replay_buffer, reward_buffer, summary_writer,
          num_eval_episodes, num_frames, eval_interval, hot_start, train_freq,
          batch_size, sync_freq, interval_MAR)
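
# --- Usage sketch (not part of the repo): a minimal way to launch the job
# defined above. The log-directory name and seed are illustrative values,
# and eager execution is assumed to be enabled since the loop runs eagerly.
if __name__ == "__main__":
    tf.compat.v1.enable_eager_execution()
    train_eval(log_dir_name="Double_DQN_debug", random_seed=42)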
# PER hyper-parameters: alpha controls how strongly priorities skew sampling;
# beta is the importance-sampling correction exponent, annealed towards 1.0
params.alpha = 0.6
params.beta_start = 0.4
params.beta_end = 1.0
params.prioritized_replay_noise = 1e-6

# init global time-step
global_timestep = tf.train.get_or_create_global_step()

# instantiate annealing funcs for epsilon, learning rate and PER beta
anneal_ep = tf.train.polynomial_decay(params.ep_start, global_timestep, params.decay_steps, params.ep_end)
anneal_lr = tf.train.polynomial_decay(params.lr_start, global_timestep, params.decay_steps, params.lr_end)
beta = tf.train.polynomial_decay(params.beta_start, global_timestep, params.decay_steps, params.beta_end)

# prep for training
policy = EpsilonGreedyPolicy_eager(Epsilon_fn=anneal_ep)
optimizer = tf.train.RMSPropOptimizer(anneal_lr, decay=0.99, momentum=0.0, epsilon=1e-6)
replay_buffer = PrioritizedReplayBuffer(params.memory_size, alpha=params.alpha)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
loss_fn = create_loss_func(params.loss_fn)
grad_clip_fn = gradient_clip_fn(flag=params.grad_clip_flg)

# create a directory for log/model
params = create_log_model_directory(params, get_alg_name())
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)

# choose env and instantiate the agent correspondingly;
# `invoke_agent_env` returns the agent class name as a string, which is
# resolved via eval and instantiated
agent, env = invoke_agent_env(params, get_alg_name())
agent = eval(agent)(Model, optimizer, loss_fn, grad_clip_fn, env.action_space.n, params)
train_DQN_PER(agent, env, policy, replay_buffer, reward_buffer, beta, summary_writer)
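
# --- Sketch (assumption, not repo code): how the annealed `beta` is typically
# consumed inside a PER training step, following the OpenAI-baselines-style
# buffer API (sample returns importance-sampling weights and indices;
# update_priorities writes back |TD error|-based priorities). The names
# `per_update_step` and `agent.update` below are hypothetical.
import numpy as np

def per_update_step(agent, replay_buffer, params, beta):
    # in eager mode tf.train.polynomial_decay returns a callable, hence beta()
    obses, actions, rewards, next_obses, dones, weights, indices = \
        replay_buffer.sample(params.batch_size, beta=beta())
    # the agent is assumed to apply the IS weights and return per-sample TD errors
    td_errors = agent.update(obses, actions, rewards, next_obses, dones, weights)
    # keep priorities strictly positive so every transition stays sampleable
    new_priorities = np.abs(td_errors) + params.prioritized_replay_noise
    replay_buffer.update_priorities(indices, new_priorities)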