Example #1
    def add_new_action(self, state, goal_state):

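        # Compute the predicates for this abstract state and the attribute-level
        # difference between the current state and the goal state.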
        preds = self.pred_func(state)
        goal_diff = make_diff(state, goal_state)
        new_action = L1Action(state,
                              goal_state,
                              preds,
                              dqn_number=self.current_dqn_number)

        # Check if action already exists
        if new_action in self.actions:
            return

        # The action does not exist, so add it

        if self.encoding_func is not None:
            self.cts[new_action] = cpp_cts.CPP_CTS(*self.cts_size)

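        # Index the new action under every (predicates, attribute index, value) entry
        # that differs between the current state and the goal.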
        for i, (att, att_goal) in goal_diff:
            pia = (preds, i, att)
            goal_pia = (preds, i, att_goal)
            self.actions.add(new_action)
            self.actions_for_pia[pia].append(new_action)

        self.current_dqn_number += 1

        self.populate_imagined_states()
        self.run_vi()
        # self.run_vi(evaluation=True)

        print('Found new action: %s' % (new_action, ))
Example #2
    def __init__(self, abs_size, env, abs_func, pred_func, N=1000, max_VI_iterations=100,
                 value_update_freq=1000, VI_delta=0.01, gamma=0.99, rmax=1,
                 max_num_abstract_states=10, frame_history=1, restore_file=None, error_clip=1,
                 state_encoder=None, bonus_beta=0.05, cts_size=None, use_min_psuedo_count=False):
        self.env = env
        self.abs_size = abs_size
        self.abs_func = abs_func
        self.pred_func = pred_func  # takes in abstract state -> outputs predicates dict
        self.rmax = rmax
        self.gamma = gamma
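        # Optimistic (R-max) initial value: the largest achievable discounted return.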
        self.utopia_val = self.rmax / (1 - self.gamma)
        self.transition_table = MovingAverageTable(100, pred_func)
        self.value_iteration = value_iteration.ValueIteration(gamma, max_VI_iterations, VI_delta)
        self.values = dict()
        self.qs = dict()
        self.evaluation_values = dict()
        self.evaluation_qs = dict()
        self.last_evaluation_state = None
        self.last_evaluation_action = None
        self.value_update_counter = 0
        self.value_update_freq = value_update_freq

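        # When resuming from a checkpoint, also restore the shared multi-headed DQN weights.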
        restore_network_file = None
        if restore_file is not None:
            restore_network_file = 'oo_net.ckpt'
        self.l0_learner = oo_l0_learner.MultiHeadedDQLearner(abs_size, len(self.env.get_actions_for_state(None)),
                                                            max_num_abstract_states, frame_history=frame_history,
                                                            rmax_learner=self, restore_network_file=restore_network_file,
                                                            encoding_func=state_encoder, bonus_beta=bonus_beta,
                                                            error_clip=error_clip)
        self.actions_for_pia = dict()  # pia = (predicates, key, attribute)
        self.explore_for_pia = dict()  # holds the reference to all the explore actions
        self.actions = set()  # used to check if actions already exist
        self.states = set()
        self.current_dqn_number = 1

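        # Optional CTS density models (a global one plus one per action) used to compute
        # pseudo-count exploration bonuses when a state encoder is provided.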
        self.cts = dict()
        if cts_size is not None:
            self.global_cts = cpp_cts.CPP_CTS(*cts_size)
        self.encoding_func = state_encoder
        self.bonus_beta = bonus_beta
        self.cts_size = cts_size
        self.using_global_epsilon = False # state_encoder is not None
        self.use_min_psuedo_count = use_min_psuedo_count

        if restore_file is None:
            self.create_new_state(self.abs_func(self.env.get_current_state()))
        else:
            # dill-pickled tables must be opened in binary mode.
            with open(restore_file + '_transition_table.pickle', 'rb') as f:
                self.transition_table = dill.load(f)
            with open(restore_file + '_states.pickle', 'rb') as f:
                self.states = dill.load(f)
            with open(restore_file + '_actions.pickle', 'rb') as f:
                self.actions = dill.load(f)
            with open(restore_file + '_actions_for_pia.pickle', 'rb') as f:
                self.actions_for_pia = dill.load(f)
            with open(restore_file + '_explore_for_pia.pickle', 'rb') as f:
                self.explore_for_pia = dill.load(f)
            self.populate_imagined_states()
        self.run_vi()
Example #3
    def create_new_state(self, state):
        self.states.add(state)
        self.values[state] = self.utopia_val
        self.evaluation_values[state] = 0

        # create explore actions for each attribute:
        preds = self.pred_func(state)
        for i, val in state:
            pia = (preds, i, val)
            if pia not in self.actions_for_pia:
                explore_action = L1ExploreAction(state, i, preds, dqn_number=0)
                # self.current_dqn_number += 1

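                # Give each explore action its own CTS density model for pseudo-count bonuses.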
                if self.encoding_func is not None:
                    self.cts[explore_action] = cpp_cts.CPP_CTS(*self.cts_size)

                self.actions_for_pia[pia] = [explore_action]
                self.explore_for_pia[pia] = explore_action
                self.actions.add(explore_action)

        print('Found new state: %s' % (state, ))
Example #4
    def run_learning_episode(self, environment, initial_l1_state_vec, goal_l1_state_vec,
                             initial_l1_state, goal_l1_state, abs_func, abs_vec_func,
                             max_episode_steps=100000):
        episode_steps = 0
        total_reward = 0
        episode_finished = False
        new_l1_state = initial_l1_state
        dqn_tuple = (initial_l1_state, goal_l1_state)
        if dqn_tuple not in self.epsilon:
            self.epsilon[dqn_tuple] = 1.0

        key_init = tuple(initial_l1_state_vec)
        if key_init not in self.abs_neighbors:
            self.abs_neighbors[key_init] = set()
            self.abs_neighbors[key_init].add(key_init)

            self.cts[key_init] = cpp_cts.CPP_CTS(11, 12, 3)

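        # Act (epsilon-greedily) until the abstract L1 state changes, the environment
        # terminates, or the step budget is exhausted.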
        for steps in range(max_episode_steps):
            if environment.is_current_state_terminal():
                break

            state = environment.get_current_state()
            if np.random.uniform(0, 1) < self.epsilon[dqn_tuple]:
                action = np.random.choice(environment.get_actions_for_state(state))
            else:
                action = self.get_action(state, initial_l1_state_vec, goal_l1_state_vec)

            if self.replay_buffer.size() > self.replay_start_size:
                self.epsilon[dqn_tuple] = max(self.epsilon_min, self.epsilon[dqn_tuple] - self.epsilon_delta)

            state, action, env_reward, next_state, is_terminal = environment.perform_action(action)
            total_reward += env_reward

            new_l1_state = abs_func(state)
            if initial_l1_state != new_l1_state:
                self.abs_neighbors[key_init].add(tuple(abs_vec_func(new_l1_state)))

                episode_finished = True

            # if initial_l1_state != new_l1_state or is_terminal:
            #     reward = 1 if new_l1_state == goal_l1_state else -1
            #     episode_finished = True
            # else:
            #     reward = 0

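            # Pseudo-count exploration bonus from the CTS model over the encoded frame;
            # terminal transitions receive no bonus.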
            enc_s = self.encoding_func(environment)
            # p, p_prime = self.cts[key_init].prob_update(enc_s)
            # n_hat = 0 if p_prime == p else (p * (1 - p_prime))/(p_prime - p)
            n_hat = max(self.cts[key_init].psuedo_count_for_image(enc_s), 0)
            R_plus = (1 - is_terminal) * (self.beta * np.power(n_hat + 0.01, -0.5))

            self.replay_buffer.append(state[-1], initial_l1_state_vec, abs_vec_func(new_l1_state), action, R_plus, next_state[-1], enc_s, episode_finished or is_terminal)
            if (self.replay_buffer.size() > self.replay_start_size) and (self.action_ticker % self.update_freq == 0):
                loss = self.update_q_values()
            if (self.action_ticker - self.replay_start_size) % self.target_copy_freq == 0:
                self.sess.run(self.copy_op)
            self.action_ticker += 1
            episode_steps += 1

            if episode_finished:
                break

        return episode_steps, total_reward, new_l1_state
Example #5
    def __init__(self,
                 dqn,
                 num_actions,
                 gamma=0.99,
                 learning_rate=0.00025,
                 replay_start_size=50000,
                 epsilon_start=1.0,
                 epsilon_end=0.01,
                 epsilon_steps=1000000,
                 update_freq=4,
                 target_copy_freq=30000,
                 replay_memory_size=1000000,
                 frame_history=4,
                 batch_size=32,
                 error_clip=1,
                 restore_network_file=None,
                 double=True,
                 use_mmc=True,
                 max_mmc_path_length=1000,
                 mmc_beta=0.1,
                 state_encoder=None,
                 bonus_beta=0.05,
                 cts_size=None):
        self.dqn = dqn
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
        self.max_mmc_path_length = max_mmc_path_length
        self.mmc_beta = mmc_beta
        inp_shape = [None] + list(self.dqn.get_input_shape()) + [frame_history]
        inp_dtype = self.dqn.get_input_dtype()
        assert type(inp_dtype) is str
        self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_terminated = tf.placeholder(tf.bool, [None])
        self.inp_reward = tf.placeholder(tf.float32, [None])
        self.inp_mmc_reward = tf.placeholder(tf.float32, [None])
        self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.gamma = gamma
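        # Build online and target Q-networks; input frames are multiplied by a per-frame
        # mask so that unused history frames are zeroed out.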
        with tf.variable_scope('online'):
            mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [
                frame_history
            ]
            mask = tf.reshape(self.inp_mask, mask_shape)
            masked_input = self.inp_frames * mask
            self.q_online = self.dqn.construct_q_network(masked_input)
        with tf.variable_scope('target'):
            mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [
                frame_history
            ]
            sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
            masked_sp_input = self.inp_sp_frames * sp_mask
            self.q_target = self.dqn.construct_q_network(masked_sp_input)

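        # Double DQN: choose the next-state action with the online network and evaluate
        # it with the target network (the gather below assumes a fixed batch size of 32).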
        if double:
            with tf.variable_scope('online', reuse=True):
                self.q_online_prime = self.dqn.construct_q_network(
                    masked_sp_input)
            self.maxQ = tf.gather_nd(
                self.q_target,
                tf.transpose([
                    tf.range(0, 32, dtype=tf.int32),
                    tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)
                ], [1, 0]))
        else:
            self.maxQ = tf.reduce_max(self.q_target, axis=1)

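        # Clipped (Huber-style) one-step TD error; when use_mmc is enabled, a Monte Carlo
        # return error is mixed in with weight mmc_beta.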
        self.r = self.inp_reward
        use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                             dtype=tf.float32)
        self.y = self.r + use_backup * gamma * self.maxQ
        self.delta_dqn = tf.reduce_sum(self.inp_actions * self.q_online,
                                       axis=1) - self.y
        self.error_dqn = tf.where(
            tf.abs(self.delta_dqn) < error_clip,
            0.5 * tf.square(self.delta_dqn),
            error_clip * tf.abs(self.delta_dqn))
        if use_mmc:
            self.delta_mmc = tf.reduce_sum(self.inp_actions * self.q_online,
                                           axis=1) - self.inp_mmc_reward
            self.error_mmc = tf.where(
                tf.abs(self.delta_mmc) < error_clip,
                0.5 * tf.square(self.delta_mmc),
                error_clip * tf.abs(self.delta_mmc))
            # self.delta = (1. - self.mmc_beta) * self.delta_dqn + self.mmc_beta * self.delta_mmc
            self.loss = (1. - self.mmc_beta) * tf.reduce_sum(
                self.error_dqn) + self.mmc_beta * tf.reduce_sum(self.error_mmc)
        else:
            self.loss = tf.reduce_sum(self.error_dqn)
        self.g = tf.gradients(self.loss, self.q_online)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              decay=0.95,
                                              centered=True,
                                              epsilon=0.01)
        self.train_op = optimizer.minimize(self.loss,
                                           var_list=th.get_vars('online'))
        self.copy_op = th.make_copy_op('online', 'target')
        self.saver = tf.train.Saver(var_list=th.get_vars('online'))

        self.use_mmc = use_mmc
        self.replay_buffer = ReplayMemory(self.dqn.get_input_shape(),
                                          self.dqn.get_input_dtype(),
                                          replay_memory_size, frame_history)
        if self.use_mmc:
            self.mmc_tracker = MMCPathTracker(self.replay_buffer,
                                              self.max_mmc_path_length,
                                              self.gamma)

        self.frame_history = frame_history
        self.replay_start_size = replay_start_size
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_steps = epsilon_steps
        self.epsilon_delta = (self.epsilon -
                              self.epsilon_min) / self.epsilon_steps
        self.update_freq = update_freq
        self.target_copy_freq = target_copy_freq
        self.action_ticker = 1

        self.num_actions = num_actions
        self.batch_size = batch_size

        self.sess.run(tf.global_variables_initializer())

        if restore_network_file is not None:
            self.saver.restore(self.sess, restore_network_file)
            print('Restored network from file')
        self.sess.run(self.copy_op)

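        # Note: a CTS model is always constructed here, so cts_size must not be None.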
        self.cts_size = cts_size
        self.cts = cpp_cts.CPP_CTS(*cts_size)
        self.encoding_func = state_encoder
        self.bonus_beta = bonus_beta