import sys
from collections import deque

import gym
import numpy as np
import tensorflow as tf

# Note: Dueling, ReplayBuffer, Progbar, get_logger, export_plot, and
# ProgressTracker are project-local helpers expected to be defined or
# imported elsewhere in this repo; their module paths are not shown here.


def dl_progress(count, block_size, total_size):
    """Download progress callback; ProgressTracker.progbar holds a shared Progbar."""
    if ProgressTracker.progbar is None:
        if total_size == -1:
            total_size = None
        ProgressTracker.progbar = Progbar(total_size)
    else:
        ProgressTracker.progbar.update(count * block_size)
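# Minimal usage sketch for dl_progress, assuming it is meant as the
# `reporthook` callback for urllib.request.urlretrieve (whose arguments are
# block count, block size, and total size). The URL and filename below are
# placeholders, not values from this project:
#
#   from urllib.request import urlretrieve
#   urlretrieve("http://example.com/data.zip", "data.zip", reporthook=dl_progress)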
class train_Dueling(Dueling):

    def __init__(self, env, config):
        Dueling.__init__(self, env, config)
        self.logger = get_logger(config.log_path)
        self.avg_reward = 0
        self.progress = Progbar(target=self.config.nsteps_train)

    def get_log(self, exp_schedule, lr_schedule, t, loss_eval, max_q_values, rewards):
        if ((t > self.config.learning_start)
                and (t % self.config.log_freq == 0)
                and (t % self.config.learning_freq == 0)):
            self.avg_reward = np.mean(rewards)
            max_q = np.mean(max_q_values)
            exp_schedule.update(t)
            lr_schedule.update(t)
            if len(rewards) > 0:
                self.progress.update(t + 1, values=[("Loss", loss_eval),
                                                    ("Avg_R", self.avg_reward),
                                                    ("Max_R", np.max(rewards)),
                                                    ("eps", exp_schedule.epsilon),
                                                    ("Max_Q", max_q),
                                                    ("lr", lr_schedule.epsilon)])
        elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
            sys.stdout.write("\rLearning has not started yet: {}/{}...".format(
                t, self.config.learning_start))
            sys.stdout.flush()

    def train_step(self, t, replay_buffer, lr):
        loss_eval = 0
        if (t > self.config.learning_start
                and t % self.config.learning_freq == 0):
            s_batch, a_batch, r_batch, sp_batch, done_mask_batch = replay_buffer.sample(
                self.config.batch_size)
            feed = {
                self.s: s_batch,
                self.a: a_batch,
                self.r: r_batch,
                self.sp: sp_batch,
                self.done_mask: done_mask_batch,
                self.lr: lr,
                self.avg_reward_placeholder: self.avg_reward,
            }
            loss_eval, summary, _ = self.sess.run(
                [self.loss, self.all_summary, self.train_op], feed_dict=feed)
            self.file_writer.add_summary(summary, t)

        # periodically copy the online Q-network weights into the target network
        if t % self.config.target_update_freq == 0:
            self.sess.run(self.update_target_op)

        # periodically checkpoint the model
        if t % self.config.saving_freq == 0:
            self.saver.save(self.sess, self.config.model_output2, global_step=t)

        return loss_eval

    def train(self, exp_schedule, lr_schedule):
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        t = last_eval = last_record = 0
        scores_eval = []  # evaluation scores for the final plot
        scores_eval += [self.evaluate()]

        while t < self.config.nsteps_train:
            sum_reward = 0
            state = self.env.reset()
            while True:
                t += 1
                last_eval += 1
                last_record += 1

                # store the current frame and build the stacked-frame Q input
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # pick the greedy action, then let the exploration schedule
                # decide whether to take it or a random one
                action_values = self.sess.run(self.q, feed_dict={self.s: [q_input]})[0]
                best_action = np.argmax(action_values)
                action = exp_schedule.get_action(best_action)

                # track Q-value statistics for logging
                max_q_values.append(max(action_values))
                q_values += list(action_values)

                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                loss_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)
                self.get_log(exp_schedule, lr_schedule, t, loss_eval, max_q_values, rewards)

                sum_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            rewards.append(sum_reward)

            if t > self.config.learning_start:
                if last_eval > self.config.eval_freq:
                    last_eval = 0
                    scores_eval += [self.evaluate()]
                elif self.config.record and (last_record > self.config.record_freq):
                    self.logger.info("Recording...")
                    last_record = 0
                    self.record()

        self.logger.info("*** Training is done.")
        self.saver.save(self.sess, self.config.model_output2, global_step=t)
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)

    def evaluate(self, env=None, num_episodes=None):
        if env is None:
            env = self.env
        if num_episodes is None:
            self.logger.info("Evaluating...")
            num_episodes = self.config.num_episodes_test
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards = []

        for i in range(num_episodes):
            sum_reward = 0
            state = env.reset()
            while True:
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # soft epsilon-greedy policy for evaluation
                action = self.env.action_space.sample()
                if self.config.soft_epsilon < np.random.random():
                    action = np.argmax(
                        self.sess.run(self.q, feed_dict={self.s: [q_input]})[0])

                new_state, reward, done, info = env.step(action)
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                sum_reward += reward
                if done:
                    break

            rewards.append(sum_reward)

        avg_reward = np.mean(rewards)
        if num_episodes > 1:
            self.logger.info("Average reward: {:04.2f}".format(avg_reward))
        return avg_reward

    def record(self):
        # wrap the env in a Monitor that records every episode to video
        record_env = gym.wrappers.Monitor(self.env, self.config.record_path,
                                          video_callable=lambda x: True, resume=True)
        self.evaluate(record_env, 1)

    def run(self, exp_schedule, lr_schedule):
        self.sess = tf.Session()
        self.all_summary = tf.summary.merge_all()
        self.file_writer = tf.summary.FileWriter(self.config.output_path, self.sess.graph)

        init = tf.global_variables_initializer()
        self.sess.run(init)

        # synchronize the target network with the online network before training
        self.sess.run(self.update_target_op)

        self.saver = tf.train.Saver(max_to_keep=2)
        # restore pretrained weights; cnn_saver is expected to be set up by Dueling
        self.cnn_saver.restore(self.sess, self.config.model_output)

        self.train(exp_schedule, lr_schedule)
        if self.config.record:
            self.record()
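# Minimal usage sketch for train_Dueling. The environment name and the
# LinearExploration / LinearSchedule classes below are assumptions about the
# surrounding project; only the schedule methods already used above
# (update(t), get_action(best_action), .epsilon) are relied on:
#
#   env = gym.make(config.env_name)
#   exp_schedule = LinearExploration(env, config.eps_begin, config.eps_end, config.eps_nsteps)
#   lr_schedule = LinearSchedule(config.lr_begin, config.lr_end, config.lr_nsteps)
#   model = train_Dueling(env, config)
#   model.run(exp_schedule, lr_schedule)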