def add_new_action(self, state, goal_state):
    preds = self.pred_func(state)
    goal_diff = make_diff(state, goal_state)
    new_action = L1Action(state, goal_state, preds, dqn_number=self.current_dqn_number)

    # Check if action already exists
    if new_action in self.actions:
        return

    # The action does not exist, so add it
    if self.encoding_func is not None:
        self.cts[new_action] = cpp_cts.CPP_CTS(*self.cts_size)

    for i, (att, att_goal) in goal_diff:
        pia = (preds, i, att)
        goal_pia = (preds, i, att_goal)
        self.actions.add(new_action)
        self.actions_for_pia[pia].append(new_action)

    self.current_dqn_number += 1

    self.populate_imagined_states()
    self.run_vi()
    # self.run_vi(evaluation=True)

    print('Found new action: %s' % (new_action,))
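
# Illustrative sketch, not part of the original source: add_new_action unpacks
# make_diff(state, goal_state) as (index, (current_value, goal_value)) pairs, and
# create_new_state below iterates a state as (index, value) pairs. The body here is
# only a guess consistent with that usage; the real make_diff may differ.
def make_diff(state, goal_state):
    """Return the attributes whose values differ between state and goal_state."""
    goal_values = dict(goal_state)
    return [(i, (val, goal_values[i]))
            for i, val in state
            if i in goal_values and goal_values[i] != val]
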
def __init__(self, abs_size, env, abs_func, pred_func, N=1000, max_VI_iterations=100,
             value_update_freq=1000, VI_delta=0.01, gamma=0.99, rmax=1,
             max_num_abstract_states=10, frame_history=1, restore_file=None,
             error_clip=1, state_encoder=None, bonus_beta=0.05, cts_size=None,
             use_min_psuedo_count=False):
    self.env = env
    self.abs_size = abs_size
    self.abs_func = abs_func
    self.pred_func = pred_func  # takes in abstract state -> outputs predicates dict
    self.rmax = rmax
    self.gamma = gamma
    self.utopia_val = self.rmax / (1 - self.gamma)
    self.transition_table = MovingAverageTable(100, pred_func)
    self.value_iteration = value_iteration.ValueIteration(gamma, max_VI_iterations, VI_delta)

    self.values = dict()
    self.qs = dict()
    self.evaluation_values = dict()
    self.evaluation_qs = dict()
    self.last_evaluation_state = None
    self.last_evaluation_action = None
    self.value_update_counter = 0
    self.value_update_freq = value_update_freq

    restore_network_file = None
    if restore_file is not None:
        restore_network_file = 'oo_net.ckpt'
    self.l0_learner = oo_l0_learner.MultiHeadedDQLearner(
        abs_size, len(self.env.get_actions_for_state(None)), max_num_abstract_states,
        frame_history=frame_history, rmax_learner=self,
        restore_network_file=restore_network_file, encoding_func=state_encoder,
        bonus_beta=bonus_beta, error_clip=error_clip)

    self.actions_for_pia = dict()  # pia = (predicates, key, attribute)
    self.explore_for_pia = dict()  # holds the reference to all the explore actions
    self.actions = set()  # used to check if actions already exist
    self.states = set()
    self.current_dqn_number = 1

    self.cts = dict()
    if cts_size is not None:
        self.global_cts = cpp_cts.CPP_CTS(*cts_size)
    self.encoding_func = state_encoder
    self.bonus_beta = bonus_beta
    self.cts_size = cts_size
    self.using_global_epsilon = False  # state_encoder is not None
    self.use_min_psuedo_count = use_min_psuedo_count

    if restore_file is None:
        self.create_new_state(self.abs_func(self.env.get_current_state()))
    else:
        # Pickled data is binary, so open the dill files in 'rb' mode.
        with open(restore_file + '_transition_table.pickle', 'rb') as f:
            self.transition_table = dill.load(f)
        with open(restore_file + '_states.pickle', 'rb') as f:
            self.states = dill.load(f)
        with open(restore_file + '_actions.pickle', 'rb') as f:
            self.actions = dill.load(f)
        with open(restore_file + '_actions_for_pia.pickle', 'rb') as f:
            self.actions_for_pia = dill.load(f)
        with open(restore_file + '_explore_for_pia.pickle', 'rb') as f:
            self.explore_for_pia = dill.load(f)
        self.populate_imagined_states()
        self.run_vi()
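
# Sketch under assumptions, not from the original listing: the restore branch above
# implies a matching save routine that dills the same five structures alongside the
# 'oo_net.ckpt' network checkpoint. save_l1_state and file_prefix are hypothetical
# names introduced only for illustration.
import dill

def save_l1_state(learner, file_prefix):
    """Persist the tabular L1 data so __init__(restore_file=file_prefix) can reload it."""
    for name, obj in [('transition_table', learner.transition_table),
                      ('states', learner.states),
                      ('actions', learner.actions),
                      ('actions_for_pia', learner.actions_for_pia),
                      ('explore_for_pia', learner.explore_for_pia)]:
        with open('%s_%s.pickle' % (file_prefix, name), 'wb') as f:
            dill.dump(obj, f)
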
def create_new_state(self, state):
    self.states.add(state)
    self.values[state] = self.utopia_val
    self.evaluation_values[state] = 0

    # create explore actions for each attribute:
    preds = self.pred_func(state)
    for i, val in state:
        pia = (preds, i, val)
        if pia not in self.actions_for_pia:
            explore_action = L1ExploreAction(state, i, preds, dqn_number=0)
            # self.current_dqn_number += 1
            if self.encoding_func is not None:
                self.cts[explore_action] = cpp_cts.CPP_CTS(*self.cts_size)
            self.actions_for_pia[pia] = [explore_action]
            self.explore_for_pia[pia] = explore_action
            self.actions.add(explore_action)

    print('Found new state: %s' % (state,))

def run_learning_episode(self, environment, initial_l1_state_vec, goal_l1_state_vec,
                         initial_l1_state, goal_l1_state, abs_func, abs_vec_func,
                         max_episode_steps=100000):
    episode_steps = 0
    total_reward = 0
    episode_finished = False
    new_l1_state = initial_l1_state

    dqn_tuple = (initial_l1_state, goal_l1_state)
    if dqn_tuple not in self.epsilon:
        self.epsilon[dqn_tuple] = 1.0

    key_init = tuple(initial_l1_state_vec)
    if key_init not in self.abs_neighbors:
        self.abs_neighbors[key_init] = set()
        self.abs_neighbors[key_init].add(key_init)
        self.cts[key_init] = cpp_cts.CPP_CTS(11, 12, 3)

    for steps in range(max_episode_steps):
        if environment.is_current_state_terminal():
            break

        state = environment.get_current_state()

        if np.random.uniform(0, 1) < self.epsilon[dqn_tuple]:
            action = np.random.choice(environment.get_actions_for_state(state))
        else:
            action = self.get_action(state, initial_l1_state_vec, goal_l1_state_vec)

        if self.replay_buffer.size() > self.replay_start_size:
            self.epsilon[dqn_tuple] = max(self.epsilon_min,
                                          self.epsilon[dqn_tuple] - self.epsilon_delta)

        state, action, env_reward, next_state, is_terminal = environment.perform_action(action)
        total_reward += env_reward
        new_l1_state = abs_func(state)

        if initial_l1_state != new_l1_state:
            self.abs_neighbors[key_init].add(tuple(abs_vec_func(new_l1_state)))
            episode_finished = True

        # if initial_l1_state != new_l1_state or is_terminal:
        #     reward = 1 if new_l1_state == goal_l1_state else -1
        #     episode_finished = True
        # else:
        #     reward = 0

        enc_s = self.encoding_func(environment)
        # p, p_prime = self.cts[key_init].prob_update(enc_s)
        # n_hat = 0 if p_prime == p else (p * (1 - p_prime)) / (p_prime - p)
        n_hat = max(self.cts[key_init].psuedo_count_for_image(enc_s), 0)
        R_plus = (1 - is_terminal) * (self.beta * np.power(n_hat + 0.01, -0.5))

        self.replay_buffer.append(state[-1], initial_l1_state_vec, abs_vec_func(new_l1_state),
                                  action, R_plus, next_state[-1], enc_s,
                                  episode_finished or is_terminal)

        if (self.replay_buffer.size() > self.replay_start_size) and \
                (self.action_ticker % self.update_freq == 0):
            loss = self.update_q_values()
        if (self.action_ticker - self.replay_start_size) % self.target_copy_freq == 0:
            self.sess.run(self.copy_op)

        self.action_ticker += 1
        episode_steps += 1

        if episode_finished:
            break

    return episode_steps, total_reward, new_l1_state
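
# Illustrative sketch, not part of the original source: the intrinsic reward above is a
# count-based exploration bonus, R_plus = beta * (n_hat + 0.01)^(-1/2), gated off on
# terminal transitions. exploration_bonus is a hypothetical standalone helper showing
# just that formula in numpy.
import numpy as np

def exploration_bonus(n_hat, beta, is_terminal):
    """Pseudo-count bonus added to the reward stored in the replay buffer."""
    n_hat = max(n_hat, 0)  # clamp noisy pseudo-counts from the CTS model at zero
    return (1 - is_terminal) * beta * np.power(n_hat + 0.01, -0.5)

# e.g. exploration_bonus(n_hat=0.0, beta=0.05, is_terminal=False) == 0.5,
# and the bonus decays toward 0 as the pseudo-count for the encoded frame grows.
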
def __init__(self, dqn, num_actions, gamma=0.99, learning_rate=0.00025, replay_start_size=50000,
             epsilon_start=1.0, epsilon_end=0.01, epsilon_steps=1000000, update_freq=4,
             target_copy_freq=30000, replay_memory_size=1000000, frame_history=4, batch_size=32,
             error_clip=1, restore_network_file=None, double=True, use_mmc=True,
             max_mmc_path_length=1000, mmc_beta=0.1, state_encoder=None, bonus_beta=0.05,
             cts_size=None):
    self.dqn = dqn
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
    self.max_mmc_path_length = max_mmc_path_length
    self.mmc_beta = mmc_beta

    inp_shape = [None] + list(self.dqn.get_input_shape()) + [frame_history]
    inp_dtype = self.dqn.get_input_dtype()
    assert type(inp_dtype) is str
    self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_terminated = tf.placeholder(tf.bool, [None])
    self.inp_reward = tf.placeholder(tf.float32, [None])
    self.inp_mmc_reward = tf.placeholder(tf.float32, [None])
    self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.gamma = gamma

    with tf.variable_scope('online'):
        mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [frame_history]
        mask = tf.reshape(self.inp_mask, mask_shape)
        masked_input = self.inp_frames * mask
        self.q_online = self.dqn.construct_q_network(masked_input)
    with tf.variable_scope('target'):
        mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [frame_history]
        sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
        masked_sp_input = self.inp_sp_frames * sp_mask
        self.q_target = self.dqn.construct_q_network(masked_sp_input)

    if double:
        # Double DQN: pick the argmax action with the online network, evaluate it with the target.
        # NOTE: the hard-coded range of 32 ties this indexing to the default batch_size.
        with tf.variable_scope('online', reuse=True):
            self.q_online_prime = self.dqn.construct_q_network(masked_sp_input)
        self.maxQ = tf.gather_nd(
            self.q_target,
            tf.transpose([tf.range(0, 32, dtype=tf.int32),
                          tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)], [1, 0]))
    else:
        self.maxQ = tf.reduce_max(self.q_target, reduction_indices=1)

    self.r = self.inp_reward
    use_backup = tf.cast(tf.logical_not(self.inp_terminated), dtype=tf.float32)
    self.y = self.r + use_backup * gamma * self.maxQ
    self.delta_dqn = tf.reduce_sum(self.inp_actions * self.q_online, reduction_indices=1) - self.y
    self.error_dqn = tf.where(tf.abs(self.delta_dqn) < error_clip,
                              0.5 * tf.square(self.delta_dqn),
                              error_clip * tf.abs(self.delta_dqn))
    if use_mmc:
        # Mixed Monte Carlo: blend the one-step TD error with the Monte Carlo return error.
        self.delta_mmc = tf.reduce_sum(self.inp_actions * self.q_online, axis=1) - self.inp_mmc_reward
        self.error_mmc = tf.where(tf.abs(self.delta_mmc) < error_clip,
                                  0.5 * tf.square(self.delta_mmc),
                                  error_clip * tf.abs(self.delta_mmc))
        # self.delta = (1. - self.mmc_beta) * self.delta_dqn + self.mmc_beta * self.delta_mmc
        self.loss = (1. - self.mmc_beta) * tf.reduce_sum(self.error_dqn) + \
                    self.mmc_beta * tf.reduce_sum(self.error_mmc)
    else:
        self.loss = tf.reduce_sum(self.error_dqn)

    self.g = tf.gradients(self.loss, self.q_online)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=0.95,
                                          centered=True, epsilon=0.01)
    self.train_op = optimizer.minimize(self.loss, var_list=th.get_vars('online'))
    self.copy_op = th.make_copy_op('online', 'target')
    self.saver = tf.train.Saver(var_list=th.get_vars('online'))

    self.use_mmc = use_mmc
    self.replay_buffer = ReplayMemory(self.dqn.get_input_shape(), self.dqn.get_input_dtype(),
                                      replay_memory_size, frame_history)
    if self.use_mmc:
        self.mmc_tracker = MMCPathTracker(self.replay_buffer, self.max_mmc_path_length, self.gamma)

    self.frame_history = frame_history
    self.replay_start_size = replay_start_size
    self.epsilon = epsilon_start
    self.epsilon_min = epsilon_end
    self.epsilon_steps = epsilon_steps
    self.epsilon_delta = (self.epsilon - self.epsilon_min) / self.epsilon_steps
    self.update_freq = update_freq
    self.target_copy_freq = target_copy_freq
    self.action_ticker = 1

    self.num_actions = num_actions
    self.batch_size = batch_size

    self.sess.run(tf.initialize_all_variables())

    if restore_network_file is not None:
        self.saver.restore(self.sess, restore_network_file)
        print('Restored network from file')
    self.sess.run(self.copy_op)

    self.cts_size = cts_size
    self.cts = cpp_cts.CPP_CTS(*cts_size)
    self.encoding_func = state_encoder
    self.bonus_beta = bonus_beta
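
# Illustrative sketch, not part of the original file: the TD error above uses a
# Huber-style clipped error, and with use_mmc the final loss mixes the one-step DQN
# error with the mixed Monte Carlo return error via mmc_beta. clipped_error and
# mixed_loss are hypothetical helpers restating those formulas in plain numpy.
import numpy as np

def clipped_error(delta, error_clip=1.0):
    """0.5 * delta^2 inside the clip region, linear outside (what tf.where builds above)."""
    return np.where(np.abs(delta) < error_clip,
                    0.5 * np.square(delta),
                    error_clip * np.abs(delta))

def mixed_loss(delta_dqn, delta_mmc, mmc_beta=0.1, error_clip=1.0):
    """(1 - mmc_beta) * sum(huber(delta_dqn)) + mmc_beta * sum(huber(delta_mmc))."""
    return ((1.0 - mmc_beta) * clipped_error(delta_dqn, error_clip).sum()
            + mmc_beta * clipped_error(delta_mmc, error_clip).sum())
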