def main():
    # step 1: loading the environment
    env = gym.make("FrozenLake-v0")

    # step 2: creating the Q-table
    state_size = env.observation_space.n
    action_size = env.action_space.n
    q = QTable(state_size, action_size)

    # step 3: creating the epsilon decay
    e = Epsilon(initial_epsilon=1.0, max_epsilon=1.0, min_epsilon=0.01,
                decay_rate=0.005)

    # step 4: Q-table training
    total_episodes = 100000
    max_steps = 100
    q, rewards = train_qtable(env, q, e, total_episodes, max_steps)
    print("Score over time {:.4f}".format(sum(rewards) / total_episodes))
    q.print()

    # Play
    env.reset()
    rewards = []
    for episode in range(1000):
        state = env.reset()
        step = 0
        total_rewards = 0
        for step in range(100):
            action = q.select_action(env, state)
            new_state, reward, done, info = env.step(action)
            total_rewards += reward
            state = new_state
            if done:
                break
        rewards.append(total_rewards)
        if episode % 100 == 0:
            print("******************************************")
            print("EPISODE {}".format(episode))
            print("Number of steps: {}".format(step))
            env.render()
    print("Score over time {:.4f}".format(sum(rewards) / 1000))
    env.close()
def main():
    # Step 1: create the Taxi-v2 environment
    env = gym.make("Taxi-v2")

    # Step 2: create the QTable
    q = QTable(env.observation_space.n, env.action_space.n,
               learning_rate=0.7, gamma=0.99)

    # Step 3: create the Epsilon decay
    e = Epsilon()

    # Step 4: Q-table training
    total_episodes = 100000
    max_steps = 100
    q, rewards = train_qtable(env, q, e, total_episodes, max_steps,
                              verbose=True)
    print("Score over time {:.4f}".format(sum(rewards) / total_episodes))
    q.print()
    env.render()

    # Play
    env.reset()
    rewards = []
    for episode in range(1000):
        state = env.reset()
        step = 0
        total_rewards = 0
        for step in range(100):
            action = q.select_action(env, state)
            new_state, reward, done, info = env.step(action)
            total_rewards += reward
            state = new_state
            if done:
                break
        rewards.append(total_rewards)
        if episode % 100 == 0:
            print("******************************************")
            print("EPISODE {}".format(episode))
            print("Number of steps: {}".format(step))
            env.render()
    print("Score over time {:.4f}".format(sum(rewards) / 1000))
    env.close()
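# Note: the two scripts above rely on `QTable`, `Epsilon`, and `train_qtable`
# helpers defined elsewhere in the project. As a rough reference only, a
# tabular Q-learning loop of the kind `train_qtable` implies could look like
# the sketch below; the `epsilon.value(episode)` accessor and the NumPy-array
# Q-table are assumptions, not the project's actual API.
import numpy as np

def train_qtable_sketch(env, q_values, epsilon, total_episodes, max_steps,
                        learning_rate=0.7, gamma=0.99):
    """Plain tabular Q-learning with epsilon-greedy exploration (sketch)."""
    rewards = []
    for episode in range(total_episodes):
        state = env.reset()
        episode_reward = 0.0
        for _ in range(max_steps):
            # Epsilon-greedy action selection
            if np.random.random() < epsilon.value(episode):
                action = env.action_space.sample()
            else:
                action = int(np.argmax(q_values[state]))
            new_state, reward, done, _ = env.step(action)
            # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
            q_values[state, action] += learning_rate * (
                reward + gamma * np.max(q_values[new_state])
                - q_values[state, action])
            episode_reward += reward
            state = new_state
            if done:
                break
        rewards.append(episode_reward)
    return q_values, rewards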
def __init__(self, n, name, config=None):
    self.n = n
    self.name = str(name)
    self.steps_counter = 0.
    self.set_counter = 0.
    self.achieved_counter = 0.
    self.success_rate = 0
    self.steps_for_achievement = 0
    if config is not None:
        # Take only the last attempts to compute epsilon
        self.last_attempts = collections.deque(
            maxlen=config.goal_attempts_list_len)
    else:
        # Take all the attempts to compute epsilon
        self.last_attempts = []
    self._epsilon = Epsilon()
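# The `last_attempts` buffer suggests that each goal's exploration rate is
# derived from its recent success rate; the actual computation is not shown
# in this section. A minimal sketch consistent with these counters (the
# function name and the `min_epsilon` floor are assumptions) might be:
def update_goal_epsilon_sketch(goal, achieved, min_epsilon=0.05):
    # Record the latest attempt and decay this goal's epsilon towards
    # `min_epsilon` as its recent success rate grows.
    goal.last_attempts.append(1.0 if achieved else 0.0)
    goal.success_rate = sum(goal.last_attempts) / max(1, len(goal.last_attempts))
    return max(min_epsilon, 1.0 - goal.success_rate)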
def parse(testfile):
    """
    Parses given automaton in Timbuk format
    :param testfile: filename
    :return: automaton object
    """
    if not testfile:
        print("No filename was given.")
        exit(1)
    parse_transitions = False
    alpha = set()
    automaton_type = ""
    states = set()
    final = set()
    transitions = {}
    start = set()
    epsilon_free = True
    with open(testfile) as filep:
        for line in filep:
            if parse_transitions and not line.isspace():
                parts = line.split("->")
                """if '"' not in line: start.add(parts[1].strip()) continue"""
                if "(" not in line and ")" not in line:
                    start.add(parts[1].strip())
                    continue
                end_state = parts[1].strip()
                if '"' in line:
                    parts = parts[0].split("\"")
                    predicate = parsePredicate(parts[1], automaton_type)
                else:
                    if line.strip().startswith("("):
                        predicate = Epsilon()
                        epsilon_free = False
                        parts = ["", "", parts[0]]
                    else:
                        symbol = parts[0].split("(")[0].strip()
                        start_st = parts[0].split("(")[1]
                        parts = ["", "", start_st]
                        predicate = parsePredicate(symbol, automaton_type)
                start_state = parts[2].replace("(", "").replace(")", "").strip()
                if predicate not in transitions[start_state]:
                    transitions[start_state][predicate] = [end_state]
                else:
                    if end_state not in transitions[start_state][predicate]:
                        transitions[start_state][predicate].append(end_state)
                continue
            if line.startswith("Ops "):
                for element in line.split(" ")[1:]:
                    if not element.isspace():
                        if ":" in element:
                            parts = element.split(":")
                            if int(parts[1]) > 0 and not parts[0].isspace():
                                alpha.add(parts[0])
                        else:
                            alpha.add(element)
                continue
            if line.startswith("Automaton "):
                if "@" in line:
                    automaton_type = line.split("@")[1].strip()
                if automaton_type == "INFA":
                    from in_notin_parser import parsePredicate
                    from in_notin import InNotin
                    label = InNotin()
                elif automaton_type == "LFA":
                    from letter_parser import parsePredicate
                    from letter import Letter
                    label = Letter()
                elif automaton_type == "INT":
                    from transducer_predicate import parsePredicate
                    from transducer_predicate import TransPred
                    label = TransPred()
                elif automaton_type == "GBA":
                    from letter_parser import parsePredicate
                    from letter import Letter
                    label = Letter()
                else:
                    automaton_type = "LFA"
                    from letter_parser import parsePredicate
                    from letter import Letter
                    label = Letter()
            if line.startswith("States "):
                for element in line.split(" ")[1:]:
                    if not element.isspace():
                        element = element.strip()
                        states.add(element)
                        transitions[element] = {}
                continue
            if line.startswith("Final States"):
                if automaton_type == "GBA":
                    line = line[12:]
                    final = []
                    for element in line.split(";"):
                        new_set = set()
                        for st in element.split(" "):
                            if not st.isspace() and len(st):
                                st = st.strip()
                                new_set.add(st)
                        if len(new_set):
                            final.append(new_set)
                else:
                    for element in line.split(" ")[2:]:
                        if not element.isspace():
                            element = element.strip()
                            final.add(element)
                continue
            if line.startswith("Transitions"):
                parse_transitions = True
    transitions_reduced = transitions.copy()
    for state in transitions:
        if not transitions[state]:
            del transitions_reduced[state]
    transitions = transitions_reduced
    automaton = type_to_class(automaton_type)
    automaton.alphabet = alpha
    automaton.states = states
    automaton.start = start
    automaton.final = final
    automaton.transitions = transitions
    automaton.automaton_type = automaton_type
    automaton.is_deterministic()
    automaton.label = label
    automaton.is_epsilon_free = epsilon_free
    check_result = automaton.check_automaton()
    if check_result != "OK":
        print("ERROR: " + testfile + " : " + check_result)
        exit(-1)
    if automaton.is_epsilon_free and automaton_type != "GBA":
        automaton = automaton.simple_reduce()
    return automaton
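# For orientation, an illustrative input in the Timbuk-like format this parser
# reads (derived from the branches above; the exact predicate syntax depends
# on the automaton type declared after '@'):
#
#   Ops a:1 b:1 x:0
#   Automaton A @LFA
#   States q0 q1
#   Final States q1
#   Transitions
#   x -> q0
#   a(q0) -> q1
#   b(q1) -> q1
#
# Nullary rules such as "x -> q0" declare start states; rules with quoted
# symbols, e.g. "pred"(q0) -> q1, are parsed with the type-specific
# parsePredicate, and a leading "(" marks an epsilon transition.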
class HDQNAgent(base.Agent):
    def __init__(self, config, environment, sess):
        super().__init__(config)
        self.sess = sess
        self.weight_dir = 'weights'
        self.config = config
        #self.ag = self.config.ag
        self.c_ag = self.config.ag.c
        self.mc_ag = self.config.ag.mc
        self.gl = self.config.gl
        self.environment = environment
        self.goals = self.define_goals()
        self.mc_ag.update({"q_output_length": self.goal_size}, add=True)
        self.c_ag.update({"q_output_length": self.environment.action_size}, add=True)
        self.mc_memory = self.create_memory(config=self.mc_ag,
                                            size=self.environment.state_size)
        self.c_memory = self.create_memory(config=self.c_ag,
                                           size=self.environment.state_size +
                                                self.goal_size)
        self.m = Metrics(self.config, self.logs_dir, self.goals)
        #self.config.print()
        self.build_hdqn()
        self.write_configuration()

    def get_goal(self, n):
        return self.goals[n]

    def define_goals(self):
        if self.environment.env_name in CT.MDP_envs:
            self.goal_size = self.environment.state_size
            goals = {}
            for n in range(self.goal_size):
                goal_name = "g" + str(n)
                goal = MDPGoal(n, goal_name, self.c_ag)
                goal.setup_one_hot(self.goal_size)
                goals[goal.n] = goal
        elif self.environment.env_name in CT.SF_envs:
            # Space Fortress
            goal_names = \
                CT.goal_groups[self.environment.env_name][self.config.ag.goal_group]
            goals = generate_SF_goals(
                environment=self.environment,
                goal_names=goal_names,
                config=self.c_ag)
            self.goal_size = len(goals)
        else:
            raise ValueError("No prior goals for " + self.environment.env_name)
        self.goal_probs = np.array([1 / len(goals) for _ in goals])
        return goals

    def set_next_goal(self, obs):
        if self.is_ready_to_learn(prefix='mc'):
            ep = self.mc_epsilon.steps_value(self.c_step)
        else:
            ep = 1
        self.m.update_epsilon(value=ep)
        if random.random() < ep and not self.is_playing():
            n_goal = random.randrange(self.goal_size)
            mc_avg_q = -100
        else:
            feed_dict = {self.mc_s_t: [[obs]]}
            #n_goal = self.mc_q_action.eval({self.mc_s_t: [[obs]]})[0]
            n_goal, mc_avg_q = self.sess.run([self.mc_q_action, self.mc_avg_q],
                                             feed_dict)
            mc_avg_q, n_goal = mc_avg_q[0], n_goal[0]
        self.mc_old_obs = obs
        self.m.mc_goals.append(n_goal)
        self.c_learnt = self.is_knowledge_of_goals_enough()
        goal = self.get_goal(n_goal)
        goal.set_counter += 1
        self.current_goal = goal
        self.current_goal.achieved_inside_frameskip = False
        self.environment.gym.goal_has_changed = True
        return mc_avg_q

    def predict_next_action(self, obs):
        # s_t should have goals and screens concatenated
        ep = self.current_goal.epsilon
        if random.random() < ep and not self.is_playing():
            action = random.randrange(self.environment.action_size)
        else:
            #screens = self.c_history.get()
            action = self.c_q_action.eval(
                {self.c_s_t: [[obs]],
                 self.c_g_t: [self.current_goal.one_hot]})[0]
        # action = int(self.aux(self.c_history.get()[-1]) < self.current_goal.n)
        # print('**', self.aux(self.c_history.get()[-1]), self.current_goal.n, action)
        self.m.c_actions.append(action)
        return action

    def mc_observe(self, goal_n, ext_reward, new_obs, terminal):
        if self.is_playing():
            return
        self.mc_memory.add(self.mc_old_obs, goal_n, ext_reward, new_obs, terminal)

    def c_observe(self, old_obs, action, int_reward, new_obs, terminal):
        if self.is_playing():
            return
        old_state = np.hstack([self.current_goal.one_hot, old_obs])
        next_state = np.hstack([self.current_goal.one_hot, new_obs])
        self.c_memory.add(old_state, action, int_reward, next_state, terminal)
        # Update C
        self.learn_if_ready(prefix='c')
        # Update MC
        self.learn_if_ready(prefix='mc')

    def mc_q_learning_mini_batch(self):
        # Sample batch from memory
        (s_t, goal, ext_reward, s_t_plus_1, terminal), idx_list, p_list, \
            sum_p, count = self.mc_memory.sample()
        target_q_t = self.generate_target_q_t(prefix='mc',
                                              reward=ext_reward,
                                              s_t_plus_1=s_t_plus_1,
                                              terminal=terminal)
        feed_dict = {
            self.mc_target_q_t: target_q_t,
            self.mc_action: goal,
            self.mc_s_t: s_t,
            self.mc_learning_rate_step: self.mc_step,
        }
        if self.mc_ag.pmemory:
            # Prioritized replay memory
            beta = (1 - self.mc_epsilon.steps_value(self.c_step)) + self.mc_epsilon.end
            self.m.mc_beta = beta
            loss_weight = (np.array(p_list) * count / sum_p) ** (-beta)
            feed_dict[self.mc_loss_weight] = loss_weight
        _, q_t, mc_td_error, loss = self.sess.run([self.mc_optim,
                                                   self.mc_q,
                                                   self.mc_td_error,
                                                   self.mc_loss],
                                                  #self.mc_q_summary],
                                                  feed_dict)
        self.m.mc_add_update(loss, q_t.mean(), mc_td_error.mean())

    def c_q_learning_mini_batch(self):
        # Sample batch from memory
        (s_t, action, int_reward, s_t_plus_1, terminal), idx_list, p_list, \
            sum_p, count = self.c_memory.sample()
        # TODO: optimize how goals are stored in memory (and how they are retrieved)
        g_t = np.vstack([g[0] for g in s_t[:, :, :self.goal_size]])
        s_t = s_t[:, :, self.goal_size:]
        g_t_plus_1 = np.vstack([g[0] for g in s_t_plus_1[:, :, :self.goal_size]])
        s_t_plus_1 = s_t_plus_1[:, :, self.goal_size:]
        target_q_t = self.generate_target_q_t(prefix='c',
                                              reward=int_reward,
                                              s_t_plus_1=s_t_plus_1,
                                              terminal=terminal,
                                              g_t_plus_1=g_t_plus_1)
        feed_dict = {
            self.c_target_q_t: target_q_t,
            self.c_action: action,
            self.c_s_t: s_t,
            self.c_g_t: g_t,
            self.c_learning_rate_step: self.c_step,
        }
        if self.c_ag.pmemory:
            if self.is_ready_to_learn(prefix='mc'):
                beta = (1 - self.mc_epsilon.steps_value(self.c_step)) + \
                    self.mc_epsilon.end
            else:
                beta = self.mc_epsilon.end
            self.m.c_beta = beta
            loss_weight = (np.array(p_list) * count / sum_p) ** (-beta)
            feed_dict[self.c_loss_weight] = loss_weight
        _, q_t, c_td_error, loss, loss_aux = self.sess.run([self.c_optim,
                                                            self.c_q,
                                                            self.c_td_error,
                                                            self.c_loss,
                                                            self.c_loss_aux],
                                                           #self.c_q_summary],
                                                           feed_dict)
        self.m.c_add_update(loss, q_t.mean(), c_td_error.mean())

    def play(self):
        self.train()

    def train(self):
        self.start_train_timer()
        # Auxiliary flags (for monitoring)
        self.mc_flag_start_training, self.c_flag_start_training = False, False
        self.c_learnt = False  # Indicates if C already knows how to achieve goals
        # Initial step (only useful when resuming training)
        self.mc_start_step = self.mc_step_op.eval()
        self.c_start_step = self.c_step_op.eval()
        # Total steps for the session
        self.total_steps = self.config.ag.max_step + self.c_start_step
        # Set up epsilon for MC (e-greedy, linear decay)
        self.mc_epsilon = Epsilon()
        self.mc_epsilon.setup(self.mc_ag, self.total_steps)
        old_obs = self.new_episode()
        self.m.start_timer()
        # Initial goal
        self.mc_step = self.mc_start_step
        self.c_step = self.c_start_step
        mc_avg_q = self.set_next_goal(old_obs)
        iterator = self.get_iterator(start_step=self.c_start_step,
                                     total_steps=self.total_steps)
        for self.c_step in iterator:
            # Controller acts
            action = self.predict_next_action(old_obs)
            info = {'goal_name': self.current_goal.name,
                    'is_SF': self.m.is_SF,
                    'display_episode': self.display_episode,
                    'watch': self.gl.watch,
                    'avg_q': mc_avg_q,
                    'goal': self.current_goal}
            new_obs, ext_reward, terminal, info = self.environment.act(
                action=action, info=info)
            self.current_goal.steps_counter += 1
            self.process_info(info)
            self.m.add_act(action, self.environment.gym.one_hot_inverse(new_obs))
            goal_achieved = self.current_goal.is_achieved(new_obs, action, info)
            int_reward = 1. if goal_achieved else 0.
            int_reward -= self.c_ag.intrinsic_time_penalty
            self.c_observe(old_obs, action, int_reward, new_obs,
                           terminal or goal_achieved)
            if self.display_episode:
                self.console_print(old_obs, action, ext_reward, int_reward)
            self.m.increment_rewards(int_reward, ext_reward)
            if terminal or goal_achieved:
                self.current_goal.finished(self.m, goal_achieved)
                # Meta-controller learns
                self.mc_observe(goal_n=self.current_goal.n,
                                ext_reward=self.m.mc_step_reward,
                                new_obs=new_obs,
                                terminal=terminal)
                reward = self.m.mc_step_reward
                self.m.mc_step_reward = 0
                if terminal:
                    if self.display_episode:
                        self.console_print_terminal(reward, new_obs)
                    self.m.close_episode()
                    old_obs = self.new_episode()
                else:
                    old_obs = new_obs.copy()
                """
                self.mc_step is the MC equivalent for self.c_step (global counter)
                and self.m_mc_steps is the MC equivalent for self.c.test_step,
                which is constant in the case of C but not in the case of MC
                """
                self.mc_step += 1
                self.m.mc_steps += 1
                # Meta-controller sets goal
                mc_avg_q = self.set_next_goal(old_obs)
                self.m.mc_goals.append(self.current_goal.n)
            if not terminal:
                old_obs = new_obs.copy()
            if not self.is_testing_time(prefix='c'):
                continue
            self.m.compute_test('c')
            self.m.compute_test('mc')
            self.m.compute_goal_results(self.goals)
            # print("\nName\tEpsilon\tAttempts\tSuccesses\tRate\tR2")
            # for i, goal in self.goals.items():
            #     print("%s\t%.2f\t%d\t%d\t%.2f\t%.2f" % (goal.name.ljust(20),
            #                                             goal.epsilon,
            #                                             goal.set_counter,
            #                                             goal.achieved_counter,
            #                                             goal.success_rate,
            #                                             goal.achieved_counter / goal.set_counter))
            #self.c_learnt = goal_success_rate > self.c_ag.learnt_threshold
            self.m.compute_state_visits()
            if self.m.has_improved():
                self.c_step_assign_op.eval(
                    {self.c_step_input: self.c_step + 1})
                self.mc_step_assign_op.eval(
                    {self.mc_step_input: self.mc_step + 1})
                self.delete_last_checkpoints()
                self.save_model(self.c_step + 1)
                #self.save_model(self.mc_step + 1)
                self.m.update_best_score()
            self.send_some_metrics(prefix='mc')
            self.send_some_metrics(prefix='c')
            summary = self.m.get_summary()
            self.m.filter_summary(summary)
            #self.m.rename_summary(summary)
            self.inject_summary(summary, self.c_step)
            self.write_output()
            # Restart metrics
            self.m.restart()
        self.stop_train_timer()

    def build_meta_controller(self):
        self.mc_w = {}
        self.mc_target_w = {}
        # training meta-controller network
        with tf.variable_scope('mc_prediction'):
            self.mc_s_t = tf.placeholder("float",
                                         [None, 1, self.environment.state_size],
                                         name='mc_s_t')
            # print(self.mc_s_t)
            shape = self.mc_s_t.get_shape().as_list()
            self.mc_s_t_flat = tf.reshape(
                self.mc_s_t, [-1, reduce(lambda x, y: x * y, shape[1:])])
            last_layer = self.mc_s_t_flat
            last_layer, histograms = self.add_dense_layers(
                architecture=self.mc_ag.architecture,
                input_layer=last_layer,
                parameters=self.mc_w,
                name_aux='')
            if self.mc_ag.dueling:
                self.mc_q = self.add_dueling(prefix='mc', input_layer=last_layer)
            else:
                self.mc_q, self.mc_w['q_w'], self.mc_w['q_b'] = utils.linear(
                    last_layer, self.goal_size, name='mc_q')
            self.mc_q_action = tf.argmax(self.mc_q, axis=1)
            q_summary = histograms
            avg_q = tf.reduce_mean(self.mc_q, 0)
            self.mc_avg_q = tf.reduce_max(self.mc_q, axis=1)
            for idx in range(self.mc_ag.q_output_length):
                q_summary.append(tf.summary.histogram('mc_q/%s' % idx, avg_q[idx]))
            self.mc_q_summary = tf.summary.merge(q_summary, 'mc_q_summary')
        # target network
        self.create_target(prefix='mc')
        # Meta Controller optimizer
        self.build_optimizer(prefix='mc')

    def build_controller(self):
        self.c_w = {}
        self.c_target_w = {}
        with tf.variable_scope('c_prediction'):
            #input_size = self.environment.state_size + self.goal_size
            self.c_s_t = tf.placeholder("float",
                                        [None, 1, self.environment.state_size],
                                        name='c_s_t')
            shape = self.c_s_t.get_shape().as_list()
            self.c_s_t_flat = tf.reshape(
                self.c_s_t, [-1, reduce(lambda x, y: x * y, shape[1:])])
            self.c_g_t = tf.placeholder("float",
                                        [None, self.goal_size],
                                        name='c_g_t')
            self.c_gs_t = tf.concat([self.c_g_t, self.c_s_t_flat],
                                    axis=1,
                                    name='c_gs_concat')
            last_layer = self.c_gs_t
            last_layer, histograms = self.add_dense_layers(
                architecture=self.c_ag.architecture,
                input_layer=last_layer,
                parameters=self.c_w,
                name_aux='')
            if self.c_ag.dueling:
                self.c_q = self.add_dueling(prefix='c', input_layer=last_layer)
            else:
                self.c_q, self.c_w['q_w'], self.c_w['q_b'] = \
                    utils.linear(last_layer, self.environment.action_size, name='c_q')
            self.c_q_action = tf.argmax(self.c_q, axis=1)
            q_summary = histograms
            avg_q = tf.reduce_mean(self.c_q, 0)
            for idx in range(self.c_ag.q_output_length):
                q_summary.append(tf.summary.histogram('c_q/%s' % idx, avg_q[idx]))
            self.c_q_summary = tf.summary.merge(q_summary, 'c_q_summary')
        # target network
        self.create_target(prefix='c')
        # Controller optimizer
        self.build_optimizer(prefix='c')

    def build_hdqn(self):
        with tf.variable_scope('c_step'):
            self.c_step_op = tf.Variable(0, trainable=False, name='c_step')
            self.c_step_input = tf.placeholder('int32', None, name='c_step_input')
            self.c_step_assign_op = self.c_step_op.assign(self.c_step_input)
        with tf.variable_scope('mc_step'):
            self.mc_step_op = tf.Variable(0, trainable=False, name='mc_step')
            self.mc_step_input = tf.placeholder('int32', None, name='mc_step_input')
            self.mc_step_assign_op = self.mc_step_op.assign(self.mc_step_input)
        self.build_meta_controller()
        self.build_controller()
        self.setup_summary(self.m.scalar_tags, self.m.histogram_tags)
        tf.global_variables_initializer().run()
        mc_vars = list(self.mc_w.values()) + [self.mc_step_op]
        c_vars = list(self.c_w.values()) + [self.c_step_op]
        self._saver = tf.train.Saver(var_list=mc_vars + c_vars, max_to_keep=30)
        self.load_model()
        self.update_target_q_network(prefix='mc')
        self.update_target_q_network(prefix='c')

    def is_knowledge_of_goals_enough(self):
        if self.c_learnt:
            """
            If the threshold has been surpassed once already,
            then we assume that it is enough
            """
            return True
        for _, goal in self.goals.items():
            if goal.success_rate < self.c_ag.learnt_threshold or \
                    goal.achieved_counter < 100:
                return False
        print("\nEnough goal learning")
        return True
class DQNAgent(Agent):
    def __init__(self, config, environment, sess):
        super().__init__(config)
        self.sess = sess
        self.weight_dir = 'weights'
        self.config = config
        self.ag = self.config.ag
        self.gl = self.config.gl
        self.environment = environment
        self.ag.update({"q_output_length": self.environment.action_size}, add=True)
        # memory_type = PriorityExperienceReplay if self.ag.pmemory else OldReplayMemory
        # self.memory = memory_type(config = self.ag,
        #                           screen_size = self.environment.state_size)
        self.memory = self.create_memory(config=self.ag,
                                         size=self.environment.state_size)
        self.m = Metrics(self.config, self.logs_dir)
        self.build_dqn()
        self.write_configuration()

    def train(self):
        self.start_train_timer()
        self.flag_start_training = False
        self.start_step = self.step_op.eval()
        if self.ag.fresh_start:
            self.start_step = 0
        self.total_steps = self.ag.max_step + self.start_step  # + self.ag.memory_size
        self.epsilon = Epsilon()
        self.epsilon.setup(self.ag, self.total_steps)
        old_obs = self.new_episode()
        self.m.start_timer()
        iterator = self.get_iterator(start_step=self.start_step,
                                     total_steps=self.total_steps)
        for self.step in iterator:
            # 1. predict
            action, avg_q = self.predict_next_action(old_obs)
            # 2. act
            info = {
                'is_SF': self.m.is_SF,
                'display_episode': self.display_episode,
                'avg_q': avg_q,
                'watch': self.gl.watch
            }
            new_obs, reward, terminal, info = self.environment.act(action, info)
            self.process_info(info=info)
            if self.m.is_SF:
                self.m.add_act(action)
            else:
                self.m.add_act(action, self.environment.gym.one_hot_inverse(new_obs))
            if self.display_episode:
                self.console_print(old_obs, action, reward)
            # 3. observe
            self.observe(old_obs, action, reward, new_obs, terminal)
            self.m.increment_external_reward(reward)
            if terminal:
                if self.display_episode:
                    self.console_print_terminal(reward, new_obs)
                self.m.close_episode()
                old_obs = self.new_episode()
            else:
                old_obs = new_obs.copy()
            if not self.is_testing_time(prefix=''):
                continue
            self.m.compute_test(prefix='', update_count=self.m.update_count)
            self.m.compute_state_visits()
            if self.m.has_improved():
                self.step_assign_op.eval({self.step_input: self.step + 1})
                self.save_model(self.step + 1)
                self.m.update_best_score()
            self.send_some_metrics(prefix='')
            summary = self.m.get_summary()
            self.m.filter_summary(summary)
            self.inject_summary(summary, self.step)
            self.write_output()
            self.m.restart()
        self.stop_train_timer()

    def predict_next_action(self, old_obs):
        if self.is_ready_to_learn(prefix=''):
            ep = self.epsilon.steps_value(self.step)
        else:
            ep = 1
        self.m.update_epsilon(value=ep)
        if random.random() < ep and not self.is_playing():
            action = random.randrange(self.environment.action_size)
            avg_q = -100
        else:
            feed_dict = {self.s_t: [[old_obs]]}
            # action = self.q_action.eval(feed_dict)[0]
            action, avg_q = self.sess.run([self.q_action, self.avg_q], feed_dict)
            action, avg_q = action[0], avg_q[0]
        return action, avg_q

    def observe(self, old_screen, action, reward, screen, terminal):
        if self.is_playing():
            return
        self.memory.add(old_screen, action, reward, screen, terminal)
        self.learn_if_ready(prefix='')

    def q_learning_mini_batch(self):
        """
        Samples a batch of experiences from the memory and learns from them
        """
        # Sample
        (s_t, action, reward, s_t_plus_1, terminal), idx_list, p_list, \
            sum_p, count = self.memory.sample()
        # Generate target
        target_q_t = self.generate_target_q_t(prefix='',
                                              reward=reward,
                                              s_t_plus_1=s_t_plus_1,
                                              terminal=terminal)
        # Prepare data
        feed_dict = {
            self.target_q_t: target_q_t,
            self.action: action,
            self.s_t: s_t,
            self.learning_rate_step: self.step,
        }
        if self.ag.pmemory:
            beta = (1 - self.epsilon.steps_value(self.step)) + self.epsilon.end
            self.m.beta = beta
            loss_weight = (np.array(p_list) * count / sum_p) ** (-beta)
            feed_dict[self.loss_weight] = loss_weight
        # Update parameters
        _, q_t, td_error, loss = self.sess.run(
            [self.optim, self.q, self.td_error, self.loss], feed_dict)
        if self.ag.pmemory:
            self.memory.update(idx_list, td_error)
        self.m.total_loss += loss
        self.m.total_q += q_t.mean()
        self.m.update_count += 1
        self.m.td_error += td_error.mean()

    def build_dqn(self):
        self.w = {}
        with tf.variable_scope('step'):
            self.step_op = tf.Variable(0, trainable=False, name='step')
            self.step_input = tf.placeholder('int32', None, name='step_input')
            self.step_assign_op = self.step_op.assign(self.step_input)
        # training network
        with tf.variable_scope('prediction'):
            # tf Graph input
            self.s_t = tf.placeholder(
                "float",
                [None, self.ag.history_length, self.environment.state_size],
                name='s_t')
            shape = self.s_t.get_shape().as_list()
            self.s_t_flat = tf.reshape(
                self.s_t, [-1, reduce(lambda x, y: x * y, shape[1:])])
            last_layer = self.s_t_flat
            last_layer, histograms = self.add_dense_layers(
                architecture=self.ag.architecture,
                input_layer=last_layer,
                parameters=self.w,
                name_aux='')
            if self.ag.dueling:
                self.q = self.add_dueling(prefix='', input_layer=last_layer)
            else:
                self.q, self.w['q_w'], self.w['q_b'] = utils.linear(
                    last_layer, self.environment.action_size, name='q')
            self.avg_q = tf.reduce_max(self.q, axis=1)
            self.q_action = tf.argmax(self.q, axis=1)
        self.create_target(prefix='')
        # optimizer
        self.build_optimizer(prefix='')
        self.setup_summary(self.m.scalar_tags, self.m.histogram_tags)
        tf.global_variables_initializer().run()
        vars_ = list(self.w.values()) + [self.step_op]
        self._saver = tf.train.Saver(vars_, max_to_keep=30)
        self.load_model()
        self.update_target_q_network(prefix='')
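# The agents above rely on an `Epsilon` helper exposing `setup(config,
# total_steps)`, `steps_value(step)` and an `end` attribute; its implementation
# is not shown in this section. The following is a minimal linear-decay sketch
# consistent with how it is called; the config attribute names (`ep_start`,
# `ep_end`, `learn_start`) are assumptions, not the project's actual fields.
class LinearDecayEpsilonSketch:
    def setup(self, config, total_steps):
        self.start = config.ep_start            # e.g. 1.0
        self.end = config.ep_end                # e.g. 0.1
        self.learn_start = config.learn_start   # steps before decay begins
        self.total_steps = total_steps

    def steps_value(self, step):
        # Linearly anneal epsilon from `start` to `end` over the steps that
        # follow `learn_start`, then hold it at the final value.
        progress = max(0.0, step - self.learn_start) / \
            max(1.0, self.total_steps - self.learn_start)
        return max(self.end, self.start - (self.start - self.end) * progress)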