def __init__(self, agent, env, args):
    self.agent = agent
    self.env = env
    self.args = args
    self.global_timestep = tf.train.get_or_create_global_step()
    # default episode cap of 27,000 steps is borrowed from Dopamine
    self.args.max_episode_steps = args.max_episode_steps if args.max_episode_steps else 27000
    self.summary_writer = tf.contrib.summary.create_file_writer(args.log_dir)
    # TODO: maybe we don't need to define a session in eager mode.
    self.sess = self._get_session(eager=True)
    self.log = logger(args)

tf.random.set_random_seed(params.seed)
replay_buffer = ReplayBuffer(params.memory_size)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)
random_process = OrnsteinUhlenbeckProcess(size=env.action_space.shape[0],
                                          theta=0.15,
                                          mu=params.mu,
                                          sigma=params.sigma)
agent = DDPG(Actor, Critic, env.action_space.shape[0], random_process, params)
get_ready(agent.params)

global_timestep = tf.compat.v1.train.get_or_create_global_step()
time_buffer = deque(maxlen=agent.params.reward_buffer_ep)
log = logger(agent.params)
traj = list()

with summary_writer.as_default():
    # for logging purposes, the whole training loop runs inside this summary context
    with tf.contrib.summary.always_record_summaries():
        for i in itertools.count():
            state = env.reset()
            total_reward = 0
            self_rewards = 0
            start = time.time()
            agent.random_process.reset_states()
            done = False
            episode_len = 0

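
# A minimal sketch of the Ornstein-Uhlenbeck exploration noise assumed by the
# `OrnsteinUhlenbeckProcess` call above. The class name and constructor arguments
# mirror the usage in this file (and `numpy as np` is assumed to be imported, as
# elsewhere), but the body is illustrative, not necessarily the repository's own.
class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size, self.theta, self.mu, self.sigma, self.dt = size, theta, mu, sigma, dt
        self.reset_states()

    def reset_states(self):
        # restart the process from the long-run mean at the start of every episode
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.size))
        self.x_prev = x
        return x
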
def train_DQN(agent, env, policy, replay_buffer, reward_buffer, params, summary_writer):
    """
    Train the DQN agent defined above.

    :param agent: agent wrapping the main and target Q-networks
    :param env: environment
    :param policy: exploration policy (e.g. epsilon-greedy)
    :param replay_buffer: experience replay buffer
    :param reward_buffer: buffer of recent episode rewards
    :param params: hyper-parameters
    :param summary_writer: TensorBoard summary writer
    :return: per-episode rewards and training losses
    """
    # Create a global step variable
    # global_step = tf.Variable(0, name='global_step', trainable=False)

    # log purpose
    losses, all_rewards, cnt_action = [], [], []
    episode_reward, index_episode = 0, 0
    log = logger(params)

    with tf.Session() as sess:
        # initialise all variables used in the model
        sess.run(tf.global_variables_initializer())
        global_step = sess.run(tf.train.get_or_create_global_step())
        state = env.reset()
        start = time.time()

        for frame_idx in range(1, params.num_frames + 1):
            action = policy.select_action(sess, agent.main_model, state.reshape(params.state_reshape))
            cnt_action.append(action)
            next_state, reward, done, _ = env.step(action)
            replay_buffer.add(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward
            global_step += 1

            if done:
                index_episode += 1
                policy.index_episode = index_episode
                state = env.reset()
                all_rewards.append(episode_reward)

                if frame_idx > params.learning_start and len(replay_buffer) > params.batch_size:
                    states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
                    next_Q = agent.target_model.predict(sess, next_states)
                    # Y = rewards + params.gamma * np.max(next_Q, axis=1)
                    # mask the bootstrapped value on terminal transitions
                    Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
                    loss = agent.main_model.update(sess, states, actions, Y)

                    # logging, then refresh the log-purpose values
                    losses.append(loss)
                    log.logging(frame_idx, params.num_frames, index_episode, time.time() - start,
                                episode_reward, np.mean(loss), policy.current_epsilon(), cnt_action)

                episode_reward = 0
                cnt_action = []
                start = time.time()

                if np.random.rand() > 0.5:
                    # a "soft" update blends part of the main model's weights into the target model
                    # instead of copying them wholesale
                    if params.update_hard_or_soft == "hard":
                        sync_main_target(sess, agent.target_model, agent.main_model)
                    elif params.update_hard_or_soft == "soft":
                        soft_target_model_update(sess, agent.target_model, agent.main_model,
                                                 tau=params.soft_update_tau)

    # test(sess, main_model, env, params)
    return all_rewards, losses

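
# Hedged sketch of the two target-network update helpers used in train_DQN above.
# It assumes each model exposes a `scope` attribute naming its variable scope; that
# attribute, and the helper bodies, are illustrative rather than the repository's
# exact code. A real implementation would also cache the assign ops instead of
# rebuilding them on every call.
def _target_update_ops(target_model, main_model, tau=1.0):
    main_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, main_model.scope),
                       key=lambda v: v.name)
    target_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, target_model.scope),
                         key=lambda v: v.name)
    # tau=1.0 copies the weights outright (hard update); tau<1 blends them (soft update)
    return [t.assign(tau * m + (1.0 - tau) * t) for m, t in zip(main_vars, target_vars)]

def sync_main_target(sess, target_model, main_model):
    sess.run(_target_update_ops(target_model, main_model, tau=1.0))

def soft_target_model_update(sess, target_model, main_model, tau=0.005):
    sess.run(_target_update_ops(target_model, main_model, tau=tau))
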
def train(agent, env, replay_buffer, reward_buffer, summary_writer,
          num_eval_episodes, num_frames, tau, eval_interval, hot_start,
          batch_size, interval_MAR, log_dir, google_colab):
    time_buffer = list()
    log = logger(num_frames=num_frames, interval_MAR=interval_MAR)

    with summary_writer.as_default():
        tf.compat.v2.summary.text(name="Hyper-params",
                                  data=params_to_markdown(gin.operative_config_str()),
                                  step=0)

        for epoch in itertools.count():
            state = env.reset()
            total_reward = 0
            start = time.time()
            agent.random_process.reset_states()
            done = False
            episode_len = 0

            while not done:
                if agent.global_ts.numpy() < hot_start:
                    action = env.action_space.sample()
                else:
                    action = agent.select_action(state)

                # scale for execution in env (in DDPG, every action is clipped between [-1, 1] in agent.predict)
                next_state, reward, done, info = env.step(action * env.action_space.high)
                replay_buffer.add(state, action, reward, next_state, done)

                """
                === Update the models
                """
                if agent.global_ts.numpy() > hot_start:
                    states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
                    loss = agent.update(states, actions, rewards, next_states, dones)
                    soft_target_model_update_eager(agent.target_actor, agent.actor, tau=tau)
                    soft_target_model_update_eager(agent.target_critic, agent.critic, tau=tau)

                agent.global_ts.assign_add(1)
                episode_len += 1
                total_reward += reward
                state = next_state

                # for evaluation purpose
                if agent.global_ts.numpy() % eval_interval == 0:
                    agent.eval_flg = True

            """
            ===== After 1 Episode is Done =====
            """
            # save the updated models
            agent.actor_manager.save()
            agent.critic_manager.save()

            # store the episode related variables
            reward_buffer.append(total_reward)
            time_buffer.append(time.time() - start)

            # logging on Tensorboard
            ts = agent.global_ts.numpy()
            tf.compat.v2.summary.scalar("train/reward", total_reward, step=ts)
            tf.compat.v2.summary.scalar("train/exec_time", time.time() - start, step=ts)
            if ts > hot_start:
                tf.compat.v2.summary.scalar("train/MAR", np.mean(reward_buffer), step=ts)

            # logging
            if ts > hot_start and epoch % interval_MAR == 0:
                log.logging(time_step=ts, exec_time=np.sum(time_buffer),
                            reward_buffer=reward_buffer, epsilon=0)
                time_buffer = list()

            if agent.eval_flg:
                score = eval_Agent(agent, env, log_dir=log_dir, google_colab=google_colab)
                tf.compat.v2.summary.scalar("eval/Score", score, step=ts)
                agent.eval_flg = False

            # check the stopping condition
            if ts >= num_frames:
                print("=== Training is Done ===")
                score = eval_Agent(agent, env, n_trial=num_eval_episodes,
                                   log_dir=log_dir, google_colab=google_colab)
                tf.compat.v2.summary.scalar("eval/Score", score, step=ts)
                env.close()
                break

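
# A possible implementation of the `soft_target_model_update_eager` helper called
# in the DDPG loop above: Polyak-average the main network's weights into the target
# network. It assumes both arguments are tf.keras models; this is a sketch of the
# technique, not necessarily the repository's exact helper.
def soft_target_model_update_eager(target_model, main_model, tau=0.005):
    new_weights = [tau * w_main + (1.0 - tau) * w_target
                   for w_main, w_target in zip(main_model.get_weights(),
                                               target_model.get_weights())]
    target_model.set_weights(new_weights)
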
def train(global_timestep, agent, env, replay_buffer, reward_buffer, summary_writer,
          num_eval_episodes, num_frames, eval_interval, hot_start, train_freq,
          batch_size, sync_freq, interval_MAR):
    time_buffer = list()
    log = logger(num_frames=num_frames, interval_MAR=interval_MAR)

    for epoch in itertools.count():
        state = np.array(env.reset())
        total_reward = 0
        start = time.time()
        cnt_action = list()
        done = False

        while not done:
            action = agent.select_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.array(next_state)
            replay_buffer.add(state, action, reward, next_state, done)

            global_timestep += 1
            agent.timestep = global_timestep
            total_reward += reward
            state = next_state
            cnt_action.append(action)

            # for evaluation purpose
            if global_timestep % eval_interval == 0:
                agent.eval_flg = True

            if (global_timestep > hot_start) and (global_timestep % train_freq == 0):
                states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
                agent.update(states, actions, rewards, next_states, dones)

            # synchronise the target and main models with a hard update
            if (global_timestep > hot_start) and (global_timestep % sync_freq == 0):
                agent.save()
                agent.sync_network()

        """
        ===== After 1 Episode is Done =====
        """
        summary_writer.add_scalar("train/reward", total_reward, global_timestep)
        summary_writer.add_scalar("train/exec_time", time.time() - start, global_timestep)
        if global_timestep > hot_start:
            summary_writer.add_scalar("train/MAR", np.mean(reward_buffer), global_timestep)
        summary_writer.add_histogram("train/taken actions", np.array(cnt_action), global_timestep)

        # store the episode reward
        reward_buffer.append(total_reward)
        time_buffer.append(time.time() - start)

        if global_timestep > hot_start and epoch % interval_MAR == 0:
            log.logging(time_step=global_timestep, exec_time=np.sum(time_buffer),
                        reward_buffer=reward_buffer,
                        epsilon=agent.policy.current_epsilon(global_timestep))
            time_buffer = list()

        if agent.eval_flg:
            # replay_buffer.save()
            score = eval_Agent(agent, env)
            summary_writer.add_scalar("eval/Score", score, global_timestep)
            agent.eval_flg = False

        # check the stopping condition
        if global_timestep >= num_frames:
            print("=== Training is Done ===")
            score = eval_Agent(agent, env, n_trial=num_eval_episodes)
            summary_writer.add_scalar("eval/Score", score, global_timestep)
            env.close()
            break

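
# Hedged sketch of a linearly annealed epsilon schedule compatible with the
# `agent.policy.current_epsilon(global_timestep)` call in the loop above. The
# class and attribute names (start/end/decay_steps) are assumptions for
# illustration, not the repository's API.
class LinearEpsilonSchedule:
    def __init__(self, start=1.0, end=0.02, decay_steps=100_000):
        self.start, self.end, self.decay_steps = start, end, decay_steps

    def current_epsilon(self, timestep):
        # interpolate from `start` down to `end` over `decay_steps` frames, then stay at `end`
        fraction = min(float(timestep) / self.decay_steps, 1.0)
        return self.start + fraction * (self.end - self.start)
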
def train(global_timestep, agent, env, replay_buffer, reward_buffer, summary_writer,
          num_eval_episodes, num_frames, eval_interval, hot_start, train_freq,
          batch_size, sync_freq, interval_MAR, log_dir, google_colab):
    time_buffer = list()
    log = logger(num_frames=num_frames, interval_MAR=interval_MAR)

    with summary_writer.as_default():
        tf.compat.v2.summary.text(name="Hyper-params",
                                  data=params_to_markdown(gin.operative_config_str()),
                                  step=0)

        for epoch in itertools.count():
            state = env.reset()
            total_reward = 0
            start = time.time()
            cnt_action = list()
            done = False

            while not done:
                action = agent.select_action(state)
                next_state, reward, done, info = env.step(action)
                replay_buffer.add(state, action, reward, next_state, done)

                global_timestep.assign_add(1)
                total_reward += reward
                state = next_state
                cnt_action.append(action)

                # for evaluation purpose
                if global_timestep.numpy() % eval_interval == 0:
                    agent.eval_flg = True

                if (global_timestep.numpy() > hot_start) and (global_timestep.numpy() % train_freq == 0):
                    states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
                    agent.update(states, actions, rewards, next_states, dones)

                # synchronise the target and main models with a hard update
                if (global_timestep.numpy() > hot_start) and (global_timestep.numpy() % sync_freq == 0):
                    agent.manager.save()
                    agent.target_model.set_weights(agent.main_model.get_weights())

            """
            ===== After 1 Episode is Done =====
            """
            ts = global_timestep.numpy()
            tf.compat.v2.summary.scalar("train/reward", total_reward, step=ts)
            tf.compat.v2.summary.scalar("train/exec_time", time.time() - start, step=ts)
            if ts > hot_start:
                tf.compat.v2.summary.scalar("train/MAR", np.mean(reward_buffer), step=ts)
            tf.compat.v2.summary.histogram("train/taken actions", cnt_action, step=ts)

            # store the episode reward
            reward_buffer.append(total_reward)
            time_buffer.append(time.time() - start)

            if ts > hot_start and epoch % interval_MAR == 0:
                log.logging(time_step=ts, exec_time=np.sum(time_buffer),
                            reward_buffer=reward_buffer,
                            epsilon=agent.policy.current_epsilon())
                time_buffer = list()

            if agent.eval_flg:
                # replay_buffer.save()
                score = eval_Agent(agent, env, log_dir=log_dir, google_colab=google_colab)
                tf.compat.v2.summary.scalar("eval/Score", score, step=ts)
                agent.eval_flg = False

            # check the stopping condition
            if ts >= num_frames:
                print("=== Training is Done ===")
                score = eval_Agent(agent, env, n_trial=num_eval_episodes,
                                   log_dir=log_dir, google_colab=google_colab)
                tf.compat.v2.summary.scalar("eval/Score", score, step=ts)
                env.close()
                break

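
# A minimal replay buffer exposing the `add` / `sample` / `len` interface that the
# training loops above rely on. This is a sketch under the assumption of uniform
# sampling (the repository's actual buffer may be prioritised); `deque` and
# `numpy as np` are assumed to be imported, as elsewhere in this file.
class SimpleReplayBuffer:
    def __init__(self, capacity):
        self._storage = deque(maxlen=capacity)

    def __len__(self):
        return len(self._storage)

    def add(self, state, action, reward, next_state, done):
        self._storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # sample transitions uniformly at random (with replacement)
        idxes = np.random.randint(0, len(self._storage), size=batch_size)
        states, actions, rewards, next_states, dones = zip(*[self._storage[i] for i in idxes])
        return (np.array(states), np.array(actions), np.array(rewards),
                np.array(next_states), np.array(dones))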