def __init__(self, abs_size, num_actions, num_abstract_states, gamma=0.99, learning_rate=0.000002, replay_start_size=500,
                 epsilon_start=1.0, epsilon_end=0.01, epsilon_steps=1000000,
                 update_freq=4, target_copy_freq=30000, replay_memory_size=1000000,
                 frame_history=4, batch_size=32, error_clip=1, restore_network_file=None, double=True):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        self.sess = tf.Session(config=config)
        self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
        inp_shape = [None, 84, 84, frame_history]
        inp_dtype = 'uint8'
        assert type(inp_dtype) is str
        self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_terminated = tf.placeholder(tf.bool, [None])
        self.inp_reward = tf.placeholder(tf.float32, [None])
        self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        #self.inp_q_choices = tf.placeholder(tf.int32, [None])

        self.inp_abs_state_init = tf.placeholder(tf.float32, [None, abs_size])
        self.inp_abs_state_goal = tf.placeholder(tf.float32, [None, abs_size])
        self.abs_neighbors = dict()
        self.gamma = gamma

        with tf.variable_scope('online'):
            mask_shape = [-1, 1, 1, frame_history]
            mask = tf.reshape(self.inp_mask, mask_shape)
            masked_input = self.inp_frames * mask
            self.q_online = construct_dqn_with_embedding_2_layer(masked_input, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)
        with tf.variable_scope('target'):
            mask_shape = [-1, 1, 1, frame_history]
            sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
            masked_sp_input = self.inp_sp_frames * sp_mask
            self.q_target = construct_dqn_with_embedding_2_layer(masked_sp_input, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)

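        # Double DQN: select the argmax action on s' with the online network, then
        # evaluate that action with the target network.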
        if double:
            with tf.variable_scope('online', reuse=True):
                self.q_online_prime = construct_dqn_with_embedding_2_layer(masked_sp_input, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)
                print(self.q_online_prime)
            self.maxQ = tf.gather_nd(self.q_target, tf.transpose(
                [tf.range(0, batch_size, dtype=tf.int32), tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)], [1, 0]))
        else:
            self.maxQ = tf.reduce_max(self.q_target, reduction_indices=1)

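        # One-step TD target y = r + gamma * max_a' Q_target(s', a'), with the backup term
        # dropped on terminal transitions, followed by a Huber-style clipped error.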
        self.r = self.inp_reward
        use_backup = tf.cast(tf.logical_not(self.inp_terminated), dtype=tf.float32)
        self.y = self.r + use_backup * gamma * self.maxQ
        self.delta = tf.reduce_sum(self.inp_actions * self.q_online, axis=1) - self.y
        self.error = tf.where(tf.abs(self.delta) < error_clip, 0.5 * tf.square(self.delta),
                               error_clip * tf.abs(self.delta))
        self.loss = tf.reduce_sum(self.error)
        self.g = tf.gradients(self.loss, self.q_online)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=0.95, centered=True, epsilon=0.01)
        self.train_op = optimizer.minimize(self.loss, var_list=th.get_vars('online'))
        self.copy_op = th.make_copy_op('online', 'target')
        self.saver = tf.train.Saver(var_list=th.get_vars('online'))

        self.replay_buffer = replay_memory.ReplayMemory((84, 84), abs_size, 'uint8', replay_memory_size, frame_history)
        self.frame_history = frame_history
        self.replay_start_size = replay_start_size
        self.epsilon = dict()
        self.epsilon_min = epsilon_end
        self.epsilon_steps = epsilon_steps
        self.epsilon_delta = (epsilon_start - self.epsilon_min) / self.epsilon_steps
        self.update_freq = update_freq
        self.target_copy_freq = target_copy_freq
        self.action_ticker = 1

        self.num_actions = num_actions
        self.batch_size = batch_size

        self.sess.run(tf.initialize_all_variables())

        if restore_network_file is not None:
            self.saver.restore(self.sess, restore_network_file)
            print('Restored network from file')
        self.sess.run(self.copy_op)

        self.cts = dict()
        self.encoding_func = toy_mr_encoder.encode_toy_mr_state
        self.beta = 0.05
Example #2
    def __init__(self,
                 dqn,
                 num_actions,
                 gamma=0.99,
                 learning_rate=0.00025,
                 replay_start_size=50000,
                 epsilon_start=1.0,
                 epsilon_end=0.01,
                 epsilon_steps=1000000,
                 update_freq=4,
                 target_copy_freq=30000,
                 replay_memory_size=1000000,
                 frame_history=4,
                 batch_size=32,
                 error_clip=1,
                 restore_network_file=None,
                 double=True,
                 use_mmc=True,
                 max_mmc_path_length=1000,
                 mmc_beta=0.1,
                 state_encoder=None,
                 bonus_beta=0.05,
                 cts_size=None):
        self.dqn = dqn
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
        self.max_mmc_path_length = max_mmc_path_length
        self.mmc_beta = mmc_beta
        inp_shape = [None] + list(self.dqn.get_input_shape()) + [frame_history]
        inp_dtype = self.dqn.get_input_dtype()
        assert type(inp_dtype) is str
        self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_terminated = tf.placeholder(tf.bool, [None])
        self.inp_reward = tf.placeholder(tf.float32, [None])
        self.inp_mmc_reward = tf.placeholder(tf.float32, [None])
        self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.gamma = gamma
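        # inp_mask is broadcast over the image dimensions and multiplied into the stacked
        # frames, presumably zeroing out history slots that precede the episode start.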
        with tf.variable_scope('online'):
            mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [
                frame_history
            ]
            mask = tf.reshape(self.inp_mask, mask_shape)
            masked_input = self.inp_frames * mask
            self.q_online = self.dqn.construct_q_network(masked_input)
        with tf.variable_scope('target'):
            mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [
                frame_history
            ]
            sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
            masked_sp_input = self.inp_sp_frames * sp_mask
            self.q_target = self.dqn.construct_q_network(masked_sp_input)

        if double:
            with tf.variable_scope('online', reuse=True):
                self.q_online_prime = self.dqn.construct_q_network(
                    masked_sp_input)
            self.maxQ = tf.gather_nd(
                self.q_target,
                tf.transpose([
                    tf.range(0, batch_size, dtype=tf.int32),
                    tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)
                ], [1, 0]))
        else:
            self.maxQ = tf.reduce_max(self.q_target, reduction_indices=1)

        self.r = self.inp_reward
        use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                             dtype=tf.float32)
        self.y = self.r + use_backup * gamma * self.maxQ
        self.delta_dqn = tf.reduce_sum(self.inp_actions * self.q_online,
                                       reduction_indices=1) - self.y
        self.error_dqn = tf.where(
            tf.abs(self.delta_dqn) < error_clip,
            0.5 * tf.square(self.delta_dqn),
            error_clip * tf.abs(self.delta_dqn))
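        # Mixed Monte Carlo (MMC): blend the one-step TD error with an error against the
        # Monte Carlo return (inp_mmc_reward), weighted by (1 - mmc_beta) and mmc_beta.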
        if use_mmc:
            self.delta_mmc = tf.reduce_sum(self.inp_actions * self.q_online,
                                           axis=1) - self.inp_mmc_reward
            self.error_mmc = tf.where(
                tf.abs(self.delta_mmc) < error_clip,
                0.5 * tf.square(self.delta_mmc),
                error_clip * tf.abs(self.delta_mmc))
            # self.delta = (1. - self.mmc_beta) * self.delta_dqn + self.mmc_beta * self.delta_mmc
            self.loss = (1. - self.mmc_beta) * tf.reduce_sum(
                self.error_dqn) + self.mmc_beta * tf.reduce_sum(self.error_mmc)
        else:
            self.loss = tf.reduce_sum(self.error_dqn)
        self.g = tf.gradients(self.loss, self.q_online)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              decay=0.95,
                                              centered=True,
                                              epsilon=0.01)
        self.train_op = optimizer.minimize(self.loss,
                                           var_list=th.get_vars('online'))
        self.copy_op = th.make_copy_op('online', 'target')
        self.saver = tf.train.Saver(var_list=th.get_vars('online'))

        self.use_mmc = use_mmc
        self.replay_buffer = ReplayMemory(self.dqn.get_input_shape(),
                                          self.dqn.get_input_dtype(),
                                          replay_memory_size, frame_history)
        if self.use_mmc:
            self.mmc_tracker = MMCPathTracker(self.replay_buffer,
                                              self.max_mmc_path_length,
                                              self.gamma)

        self.frame_history = frame_history
        self.replay_start_size = replay_start_size
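        # epsilon_delta is the per-step decrement for a linear anneal of epsilon from
        # epsilon_start down to epsilon_end over epsilon_steps steps.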
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_steps = epsilon_steps
        self.epsilon_delta = (self.epsilon -
                              self.epsilon_min) / self.epsilon_steps
        self.update_freq = update_freq
        self.target_copy_freq = target_copy_freq
        self.action_ticker = 1

        self.num_actions = num_actions
        self.batch_size = batch_size

        self.sess.run(tf.initialize_all_variables())

        if restore_network_file is not None:
            self.saver.restore(self.sess, restore_network_file)
            print('Restored network from file')
        self.sess.run(self.copy_op)

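        # CPP_CTS is a CTS density model over encoded states; bonus_beta presumably scales
        # the count-based exploration bonus derived from it elsewhere in the class.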
        self.cts_size = cts_size
        self.cts = cpp_cts.CPP_CTS(*cts_size)
        self.encoding_func = state_encoder
        self.bonus_beta = bonus_beta
Example #3
    def __init__(self,
                 abs_size,
                 num_actions,
                 num_abstract_states,
                 gamma=0.99,
                 learning_rate=0.00025,
                 replay_start_size=500,
                 epsilon_start=1.0,
                 epsilon_end=0.01,
                 epsilon_steps=1000000,
                 update_freq=4,
                 target_copy_freq=30000,
                 replay_memory_size=1000000,
                 frame_history=4,
                 batch_size=32,
                 error_clip=1,
                 restore_network_file=None,
                 double=True,
                 use_mmc=True,
                 max_mmc_path_length=1000,
                 mmc_beta=0.5,
                 max_dqn_number=300,
                 rmax_learner=None):
        self.rmax_learner = rmax_learner
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        self.sess = tf.Session(config=config)
        self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
        self.max_mmc_path_length = max_mmc_path_length
        self.mmc_beta = mmc_beta
        inp_shape = [None, 84, 84, frame_history]
        inp_dtype = 'uint8'
        assert type(inp_dtype) is str
        self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_terminated = tf.placeholder(tf.bool, [None])
        self.inp_reward = tf.placeholder(tf.float32, [None])
        self.inp_mmc_reward = tf.placeholder(tf.float32, [None])
        self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.inp_dqn_numbers = tf.placeholder(tf.int32, [None])
        #self.inp_q_choices = tf.placeholder(tf.int32, [None])

        self.inp_abs_state_init = tf.placeholder(tf.float32, [None, abs_size])
        self.inp_abs_state_goal = tf.placeholder(tf.float32, [None, abs_size])
        self.abs_neighbors = dict()
        self.gamma = gamma
        self.max_dqn_number = max_dqn_number
        #q_constructor = lambda inp: construct_q_network_weights(inp, self.inp_dqn_numbers, max_dqn_number, frame_history, num_actions)
        # q_constructor = lambda inp: construct_small_network_weights(inp, self.inp_dqn_numbers, max_dqn_number, frame_history, num_actions)
        q_constructor = lambda inp: construct_dqn_with_embedding_2_layer(
            inp, self.inp_abs_state_init, self.inp_abs_state_goal,
            frame_history, num_actions)
        # q_constructor = lambda inp: construct_dqn_with_subgoal_embedding(inp, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)
        # q_constructor = lambda inp: construct_meta_dqn_network(inp, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)
        with tf.variable_scope('online'):
            mask_shape = [-1, 1, 1, frame_history]
            mask = tf.reshape(self.inp_mask, mask_shape)
            masked_input = self.inp_frames * mask
            self.q_online = q_constructor(masked_input)
        with tf.variable_scope('target'):
            mask_shape = [-1, 1, 1, frame_history]
            sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
            masked_sp_input = self.inp_sp_frames * sp_mask
            self.q_target = q_constructor(masked_sp_input)

        if double:
            with tf.variable_scope('online', reuse=True):
                self.q_online_prime = q_constructor(masked_sp_input)
                print(self.q_online_prime)
            self.maxQ = tf.gather_nd(
                self.q_target,
                tf.transpose([
                    tf.range(0, batch_size, dtype=tf.int32),
                    tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)
                ], [1, 0]))
        else:
            self.maxQ = tf.reduce_max(self.q_target, axis=1)

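        # Reward clipping: the TD target uses only the sign of the environment reward.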
        self.r = tf.sign(self.inp_reward)
        use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                             dtype=tf.float32)
        self.y = self.r + use_backup * gamma * self.maxQ

        self.delta_dqn = tf.reduce_sum(self.inp_actions * self.q_online,
                                       axis=1) - self.y
        self.error_dqn = tf.where(
            tf.abs(self.delta_dqn) < error_clip,
            0.5 * tf.square(self.delta_dqn),
            error_clip * tf.abs(self.delta_dqn))
        if use_mmc:
            self.delta_mmc = tf.reduce_sum(self.inp_actions * self.q_online,
                                           axis=1) - self.inp_mmc_reward
            self.error_mmc = tf.where(
                tf.abs(self.delta_mmc) < error_clip,
                0.5 * tf.square(self.delta_mmc),
                error_clip * tf.abs(self.delta_mmc))
            # self.delta = (1. - self.mmc_beta) * self.delta_dqn + self.mmc_beta * self.delta_mmc
            self.loss = (1. - self.mmc_beta) * tf.reduce_sum(
                self.error_dqn) + self.mmc_beta * tf.reduce_sum(self.error_mmc)
        else:
            self.loss = tf.reduce_sum(self.error_dqn)
        self.g = tf.gradients(self.loss, self.q_online)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              decay=0.95,
                                              centered=True,
                                              epsilon=0.01)
        self.train_op = optimizer.minimize(self.loss,
                                           var_list=th.get_vars('online'))
        self.copy_op = th.make_copy_op('online', 'target')
        self.saver = tf.train.Saver(var_list=th.get_vars('online'))

        self.use_mmc = use_mmc
        self.replay_buffer = ReplayMemory((84, 84), abs_size, 'uint8',
                                          replay_memory_size, frame_history)
        if self.use_mmc:
            self.mmc_tracker = MMCPathTracker(self.replay_buffer,
                                              self.max_mmc_path_length,
                                              self.gamma)
        self.frame_history = frame_history
        self.replay_start_size = replay_start_size
        self.epsilon = [epsilon_start
                        ] * num_abstract_states * num_abstract_states
        self.epsilon = dict()
        self.epsilon_min = epsilon_end
        self.epsilon_steps = epsilon_steps
        self.epsilon_delta = (epsilon_start -
                              self.epsilon_min) / self.epsilon_steps
        self.update_freq = update_freq
        self.target_copy_freq = target_copy_freq
        self.action_ticker = 1

        self.num_actions = num_actions
        self.batch_size = batch_size

        self.sess.run(tf.initialize_all_variables())

        if restore_network_file is not None:
            self.saver.restore(self.sess, restore_network_file)
            print('Restored network from file')
        self.sess.run(self.copy_op)

        ####################
        ## Keeping track of progress of actions

        self.samples_per_option = 50
        self.state_samples_for_option = dict()
        self.option_action_ticker = dict()
        self.progress_sample_frequency = 1000
Example #4
    with tf.variable_scope('fc2_sigma'):
        fc2 = th.fully_connected(z, 50, tf.nn.elu)
    with tf.variable_scope('dec_sigma'):
        sigma_x = th.fully_connected(fc2, 11*11, lambda x: x)
    return mu_x, sigma_x

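# Module-level VAE training graph: the encoder produces (mu_z, sigma_z), a latent z is
# sampled via the reparameterization trick, and the decoder reconstructs the input image.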
encoding_size = 50
batch_size = 32
inp_image = tf.placeholder(tf.float32, [None, 84, 84, 1])

with tf.variable_scope('encoder'):
    mu_z, sigma_z = make_encoder(inp_image, encoding_size)
z = sigma_z * tf.random_normal([batch_size, encoding_size]) + mu_z
with tf.variable_scope('decoder'):
    mu_x, sigma_x = make_decoder(z)

z_variance = tf.sqrt(tf.reduce_sum(tf.square(mu_z), reduction_indices=1))

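# term1 is a simplified KL-style penalty on the latent mean (it reduces to -0.5 * ||mu_z||^2;
# the log-variance terms of the full KL do not appear here), and term2 is the negative of
# the (halved) squared reconstruction error.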
term1 = (0.5 * tf.reduce_sum(1 - tf.square(mu_z) - 1, reduction_indices=[1]))
k = 84*84
term2 = - tf.reduce_sum(0.5*tf.square(inp_image - mu_x), [1, 2, 3])
loss = -tf.reduce_mean((term1 + term2), reduction_indices=0)
#loss = tf.reduce_mean(tf.square(inp_image - mu_x))
train_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)

saver = tf.train.Saver(var_list=th.get_vars('encoder', 'decoder'))

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
sess.run(tf.initialize_all_variables())
Example #5
    def __init__(self,
                 abs_size,
                 num_actions,
                 num_abstract_states,
                 gamma=0.99,
                 learning_rate=0.00025,
                 replay_start_size=5000,
                 epsilon_start=1.0,
                 epsilon_end=0.01,
                 epsilon_steps=1000000,
                 update_freq=4,
                 target_copy_freq=30000,
                 replay_memory_size=1000000,
                 frame_history=4,
                 batch_size=32,
                 error_clip=1,
                 restore_network_file=None,
                 double=True,
                 use_mmc=True,
                 max_mmc_path_length=1000,
                 mmc_beta=0.1,
                 max_dqn_number=300,
                 rmax_learner=None,
                 encoding_func=None,
                 bonus_beta=0.05):
        self.rmax_learner = rmax_learner
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        self.sess = tf.Session(config=config)
        self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
        self.max_mmc_path_length = max_mmc_path_length
        self.mmc_beta = mmc_beta
        inp_shape = [None, 84, 84, frame_history]
        inp_dtype = 'uint8'
        assert type(inp_dtype) is str
        self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_terminated = tf.placeholder(tf.bool, [None])
        self.inp_reward = tf.placeholder(tf.float32, [None])
        self.inp_mmc_reward = tf.placeholder(tf.float32, [None])
        self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.inp_dqn_numbers = tf.placeholder(tf.int32, [None])
        # self.inp_q_choices = tf.placeholder(tf.int32, [None])

        self.abs_neighbors = dict()
        self.gamma = gamma
        self.max_dqn_number = max_dqn_number

        # q_constructor = lambda inp: construct_q_network_weights(inp, self.inp_dqn_numbers, max_dqn_number, frame_history, num_actions)
        #q_constructor = lambda inp: construct_small_network_weights(inp, self.inp_dqn_numbers, max_dqn_number,
        #                                                            frame_history, num_actions)
        #q_constructor = lambda inp: construct_dqn_with_embedding_2_layer(inp, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)
        q_constructor = lambda inp: construct_q_network_weights_only_final(
            inp, self.inp_dqn_numbers, max_dqn_number, frame_history,
            num_actions)
        # q_constructor = lambda inp: construct_dqn_with_subgoal_embedding(inp, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)
        # q_constructor = lambda inp: construct_meta_dqn_network(inp, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)

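        # This constructor returns two Q heads per network: a base head (q_online) and an
        # exploration head (q_online_explore).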
        with tf.variable_scope('online'):
            mask_shape = [-1, 1, 1, frame_history]
            mask = tf.reshape(self.inp_mask, mask_shape)
            masked_input = self.inp_frames * mask
            self.q_online, self.q_online_explore = q_constructor(masked_input)
        with tf.variable_scope('target'):
            mask_shape = [-1, 1, 1, frame_history]
            sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
            masked_sp_input = self.inp_sp_frames * sp_mask
            self.q_target, self.q_target_explore = q_constructor(
                masked_sp_input)

        if double:
            with tf.variable_scope('online', reuse=True):
                self.q_online_prime, self.q_online_prime_explore = q_constructor(
                    masked_sp_input)
                print(self.q_online_prime)
        else:
            self.q_online_prime = None
            self.q_online_prime_explore = None

        self.loss = construct_q_loss(self.q_online, self.q_target,
                                     self.inp_actions, self.inp_reward,
                                     self.inp_terminated, self.q_online_prime,
                                     batch_size, gamma, error_clip,
                                     self.inp_mmc_reward, mmc_beta)
        # if True:  # If using explore/exploit nets
        #     self.loss_explore = construct_q_loss(self.q_online_explore, self.q_target_explore, self.inp_actions, self.inp_reward_explore,
        #                              self.inp_terminated,
        #                              self.q_online_prime_explore, batch_size, gamma, error_clip, self.inp_mmc_reward_explore,
        #                              mmc_beta)

        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              decay=0.95,
                                              centered=True,
                                              epsilon=0.01)
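        # Compute gradients explicitly so that NaN entries can be zeroed out and the
        # gradients clipped to [-10, 10] before being applied.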
        self.pre_gvs = optimizer.compute_gradients(
            self.loss, var_list=th.get_vars('online'))
        self.pre_gvs = [(tf.where(tf.is_nan(grad), tf.zeros_like(grad),
                                  grad), var) for grad, var in self.pre_gvs]
        self.post_gvs = [(tf.clip_by_value(grad, -10., 10.), var)
                         for grad, var in self.pre_gvs]
        self.train_op = optimizer.apply_gradients(self.post_gvs)
        self.copy_op = th.make_copy_op('online', 'target')
        self.saver = tf.train.Saver(var_list=th.get_vars('online'))

        self.use_mmc = use_mmc
        self.replay_buffer = ReplayMemory((84, 84), abs_size, 'uint8',
                                          replay_memory_size, frame_history)
        if self.use_mmc:
            self.mmc_tracker = MMCPathTrackerExplore(self.replay_buffer,
                                                     self.max_mmc_path_length,
                                                     self.gamma)
        self.frame_history = frame_history
        self.replay_start_size = replay_start_size
        self.epsilon = [epsilon_start
                        ] * num_abstract_states * num_abstract_states
        self.epsilon = dict()
        self.global_epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_steps = epsilon_steps
        self.epsilon_delta = (epsilon_start -
                              self.epsilon_min) / self.epsilon_steps
        self.update_freq = update_freq
        self.target_copy_freq = target_copy_freq
        self.action_ticker = 1

        self.num_actions = num_actions
        self.batch_size = batch_size

        self.check_op = tf.add_check_numerics_ops()
        self.sess.run(tf.initialize_all_variables())

        if restore_network_file is not None:
            self.saver.restore(self.sess, restore_network_file)
            print('Restored network from file')
        self.sess.run(self.copy_op)

        self.encoding_func = encoding_func
        self.bonus_beta = bonus_beta
        self.reward_mult = 1.  # (10 * self.bonus_beta)/(1-gamma)
        self.n_hat_tracker = dict()

        ####################
        ## Keeping track of progress of actions

        self.samples_per_option = 50
        self.state_samples_for_option = dict()
        self.option_action_ticker = dict()
        self.progress_sample_frequency = 1000
Example #6
    def __init__(self,
                 dqn,
                 num_actions,
                 max_vae_loss_buffer_size=10000,
                 variance_max=3.0,
                 gamma=0.99,
                 learning_rate=0.00025,
                 replay_start_size=1000,
                 epsilon_start=0.1,
                 epsilon_end=0.1,
                 epsilon_steps=1000000,
                 update_freq=4,
                 target_copy_freq=30000,
                 replay_memory_size=1000000,
                 frame_history=4,
                 batch_size=32,
                 error_clip=1,
                 restore_network_file=None,
                 double=True):
        self.dqn = dqn
        self.variance_max = variance_max
        self.max_vae_loss_buffer = deque(maxlen=max_vae_loss_buffer_size)
        self.min_vae_loss_buffer = deque(maxlen=max_vae_loss_buffer_size)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
        inp_shape = [None] + list(self.dqn.get_input_shape()) + [frame_history]
        inp_dtype = self.dqn.get_input_dtype()
        assert type(inp_dtype) is str
        self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_terminated = tf.placeholder(tf.bool, [None])
        self.inp_reward = tf.placeholder(tf.float32, [None])
        self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.gamma = gamma
        with tf.variable_scope('online'):
            mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [
                frame_history
            ]
            mask = tf.reshape(self.inp_mask, mask_shape)
            masked_input = self.inp_frames * mask
            self.q_online = self.dqn.construct_q_network(masked_input)
        with tf.variable_scope('target'):
            mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [
                frame_history
            ]
            sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
            masked_sp_input = self.inp_sp_frames * sp_mask
            self.q_target = self.dqn.construct_q_network(masked_sp_input)

        if double:
            with tf.variable_scope('online', reuse=True):
                self.q_online_prime = self.dqn.construct_q_network(
                    masked_sp_input)
            self.maxQ = tf.gather_nd(
                self.q_target,
                tf.transpose([
                    tf.range(0, batch_size, dtype=tf.int32),
                    tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)
                ], [1, 0]))
        else:
            self.maxQ = tf.reduce_max(self.q_target, reduction_indices=1)

        self.r = self.inp_reward
        use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                             dtype=tf.float32)
        self.y = self.r + use_backup * gamma * self.maxQ
        self.delta = tf.reduce_sum(self.inp_actions * self.q_online,
                                   reduction_indices=1) - self.y
        self.error = tf.select(
            tf.abs(self.delta) < error_clip, 0.5 * tf.square(self.delta),
            error_clip * tf.abs(self.delta))
        self.loss = tf.reduce_sum(self.error)
        self.g = tf.gradients(self.loss, self.q_online)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              decay=0.95,
                                              centered=True,
                                              epsilon=0.01)
        self.train_op = optimizer.minimize(self.loss,
                                           var_list=th.get_vars('online'))
        self.copy_op = th.make_copy_op('online', 'target')
        self.saver = tf.train.Saver(var_list=th.get_vars('online'))

        self.replay_buffer = ReplayMemory(self.dqn.get_input_shape(),
                                          self.dqn.get_input_dtype(),
                                          replay_memory_size, frame_history)
        self.frame_history = frame_history
        self.replay_start_size = replay_start_size
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_steps = epsilon_steps
        self.epsilon_delta = (self.epsilon -
                              self.epsilon_min) / self.epsilon_steps
        self.update_freq = update_freq
        self.target_copy_freq = target_copy_freq
        self.action_ticker = 1

        self.num_actions = num_actions
        self.batch_size = batch_size

        self.sess.run(tf.initialize_all_variables())
        vae_network.saver.restore(self.sess, '../vae_net.ckpt')

        if restore_network_file is not None:
            self.saver.restore(self.sess, restore_network_file)
            print('Restored network from file')
        self.sess.run(self.copy_op)
Example #7
    def __init__(self,
                 num_abstract_states,
                 num_actions,
                 gamma=0.9,
                 learning_rate=0.00025,
                 replay_start_size=32,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_steps=10000,
                 replay_memory_size=100,
                 frame_history=1,
                 batch_size=32,
                 error_clip=1,
                 abstraction_function=None,
                 base_network_file=None):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.num_abstract_states = num_abstract_states
        self.num_abstract_actions = num_abstract_states * (
            num_abstract_states - 1)
        self.frame_history = frame_history

        self.abstraction_function = abstraction_function

        self.sess = tf.Session(config=config)
        self.inp_actions = tf.placeholder(tf.float32,
                                          [None, self.num_abstract_actions])
        inp_shape = [None, 84, 84, self.frame_history]
        inp_dtype = 'uint8'
        assert type(inp_dtype) is str
        self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_terminated = tf.placeholder(tf.bool, [None])
        self.inp_reward = tf.placeholder(tf.float32, [None])
        self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        # convert the abstract-state index inputs below to one-hot vectors
        self.inp_sigma = tf.placeholder(tf.uint8, [None])
        self.inp_sigma_onehot = tf.cast(
            tf.sparse_to_dense(
                tf.concat(1, [
                    tf.expand_dims(tf.range(0, batch_size), -1),
                    tf.expand_dims(tf.cast(self.inp_sigma, tf.int32), -1)
                ]), [batch_size, self.num_abstract_states], 1), tf.float32)
        self.inp_sigma_p = tf.placeholder(tf.uint8, [None])
        self.inp_sigma_p_onehot = tf.cast(
            tf.sparse_to_dense(
                tf.concat(1, [
                    tf.expand_dims(tf.range(0, batch_size), -1),
                    tf.expand_dims(tf.cast(self.inp_sigma_p, tf.int32), -1)
                ]), [batch_size, self.num_abstract_states], 1), tf.float32)
        self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.gamma = gamma

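        # actions_for_sigma[i, a] = 1 iff abstract action a is available from abstract state i.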
        self.actions_for_sigma = np.zeros(
            (self.num_abstract_states, self.num_abstract_actions),
            dtype=np.float32)
        for a in range(self.num_abstract_actions):
            i, j = flat_actions_to_state_pairs(a, num_abstract_states)
            self.actions_for_sigma[i, a] = 1

        self.visual_scope = 'visual'
        self.abstraction_scope = 'abstraction'
        with tf.variable_scope(self.visual_scope):
            # mask stuff here
            mask = tf.reshape(self.inp_mask, [-1, 1, 1, 1])
            masked_input = self.inp_frames * mask
            self.visual_output = hook_visual(masked_input, self.frame_history)
        with tf.variable_scope(self.abstraction_scope):
            self.sigma, self.sigma_probs = hook_abstraction(
                self.visual_output,
                self.num_abstract_states,
                batch_size,
                I=self.inp_sigma_onehot)
        with tf.variable_scope(self.abstraction_scope, reuse=True):
            # the one that samples
            self.sigma_query, self.sigma_query_probs = hook_abstraction(
                self.visual_output, self.num_abstract_states, 1)

        with tf.variable_scope(self.visual_scope, reuse=True):
            mask_sp = tf.reshape(self.inp_sp_mask, [-1, 1, 1, 1])
            masked_input_sp = self.inp_sp_frames * mask_sp
            self.visual_output_sp = hook_visual(masked_input_sp,
                                                self.frame_history)
        with tf.variable_scope(self.abstraction_scope, reuse=True):
            self.sigma_p, self.sigma_p_probs = hook_abstraction(
                self.visual_output_sp,
                self.num_abstract_states,
                batch_size,
                I=self.inp_sigma_p_onehot)

        self.possible_action_vector = tf.stop_gradient(
            valid_actions_for_sigma(self.actions_for_sigma, self.sigma,
                                    self.num_abstract_actions))
        with tf.variable_scope('l1_online'):
            self.q_online = hook_l1(self.sigma, self.num_abstract_actions)
        with tf.variable_scope('l1_online', reuse=True):
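            # Additive action mask: 0 for abstract actions valid under sigma_query, -inf for
            # invalid ones; the NaNs produced by -inf * 0 are replaced with zeros via tf.select.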
            self.possible_action_vector_query = -np.inf * (
                1 - valid_actions_for_sigma(self.actions_for_sigma,
                                            self.sigma_query,
                                            self.num_abstract_actions))
            self.possible_action_vector_query = tf.select(
                tf.is_nan(self.possible_action_vector_query),
                tf.zeros_like(self.possible_action_vector_query),
                self.possible_action_vector_query)
            self.q_online_query = self.possible_action_vector_query + hook_l1(
                self.sigma_query, self.num_abstract_actions)
        with tf.variable_scope('l1_online', reuse=True):
            self.possible_action_vector_prime = -np.inf * (
                1 -
                valid_actions_for_sigma(self.actions_for_sigma, self.sigma_p,
                                        self.num_abstract_actions))
            self.possible_action_vector_prime = tf.select(
                tf.is_nan(self.possible_action_vector_prime),
                tf.zeros_like(self.possible_action_vector_prime),
                self.possible_action_vector_prime)
            self.q_target = self.possible_action_vector_prime + hook_l1(
                self.sigma_p, self.num_abstract_actions)

        self.maxQ = tf.reduce_max(self.q_target, reduction_indices=1)

        self.r = tf.sign(self.inp_reward)
        use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                             dtype=tf.float32)
        self.y = tf.stop_gradient(self.r + use_backup * gamma * self.maxQ)
        self.delta = tf.reduce_sum(self.inp_actions * self.q_online,
                                   reduction_indices=1) - self.y
        self.error = tf.select(
            tf.abs(self.delta) < error_clip, 0.5 * tf.square(self.delta),
            error_clip * tf.abs(self.delta))
        self.loss = tf.reduce_sum(self.error)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              decay=0.95,
                                              centered=True,
                                              epsilon=0.01)
        # TODO: add th.get_vars(self.visual_scope)+th.get_vars(self.abstraction_scope)
        if self.abstraction_function is None:
            self.train_op = optimizer.minimize(
                self.loss,
                var_list=th.get_vars('l1_online', self.abstraction_scope,
                                     self.visual_scope))
        else:
            self.train_op = optimizer.minimize(
                self.loss, var_list=th.get_vars('l1_online'))

        self.saver = tf.train.Saver(var_list=th.get_vars(self.visual_scope) +
                                    th.get_vars(self.abstraction_scope) +
                                    th.get_vars('l1_online') +
                                    th.get_vars('online'))

        self.replay_buffer = L1ReplayMemory((84, 84), np.uint8,
                                            replay_memory_size, 1)
        self.frame_history = frame_history
        self.replay_start_size = replay_start_size
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_steps = epsilon_steps
        self.epsilon_delta = (self.epsilon -
                              self.epsilon_min) / self.epsilon_steps
        self.action_ticker = 1

        self.num_actions = num_actions
        self.batch_size = batch_size

        self.l0_learner = L0_Learner(
            self.sess,
            self.abstraction_scope,
            self.visual_scope,
            num_actions,  #self.visual_scope, num_actions,
            self.num_abstract_actions,
            self.num_abstract_states,
            abstraction_function=self.abstraction_function,
            max_episode_steps=20,
            base_network_file=base_network_file)

        self.sess.run(tf.initialize_all_variables())

        if base_network_file is not None:
            self.l0_learner.base_network_saver.restore(self.sess,
                                                       base_network_file)
            print('Restored network from file')
Example #8
    def __init__(self,
                 sess,
                 abstraction_scope,
                 visual_scope,
                 num_actions,
                 num_abstract_actions,
                 num_abstract_states,
                 gamma=0.99,
                 learning_rate=0.00025,
                 replay_start_size=5000,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_steps=1000000,
                 update_freq=4,
                 target_copy_freq=10000,
                 replay_memory_size=1000000,
                 frame_history=1,
                 batch_size=32,
                 error_clip=1,
                 abstraction_function=None,
                 max_episode_steps=-1,
                 base_network_file=None):
        self.sess = sess
        self.num_abstract_actions = num_abstract_actions
        self.num_abstract_states = num_abstract_states
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.gamma = gamma
        self.frame_history = frame_history
        self.replay_buffer = ReplayMemory((84, 84), 'uint8',
                                          replay_memory_size, frame_history)
        self.abstraction_scope = abstraction_scope
        self.abstraction_function = abstraction_function

        self.inp_frames = tf.placeholder(tf.uint8,
                                         [None, 84, 84, self.frame_history])
        self.inp_sp_frames = tf.placeholder(tf.uint8,
                                            [None, 84, 84, self.frame_history])
        self.inp_terminated = tf.placeholder(tf.bool, [None])
        self.inp_reward = tf.placeholder(tf.float32, [None])
        self.inp_mask = tf.placeholder(tf.uint8, [None, frame_history])
        self.inp_sp_mask = tf.placeholder(tf.uint8, [None, frame_history])
        self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
        # onehot vector
        #self.inp_sigma = tf.placeholder(tf.float32, [None, self.num_abstract_states])

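        # Abstract reward matrix R[sigma, sigma', a]: -1 by default, 0 for self-transitions,
        # and +1 when abstract action a reaches its goal abstract state.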
        self.reward_matrix = -np.ones(
            (num_abstract_states, num_abstract_states, num_abstract_actions),
            dtype=np.float32)
        # make self transitions 0
        for i in range(num_abstract_states):
            self.reward_matrix[i, i, :] = 0
        # make goal transitions have reward 1
        for a in range(num_abstract_actions):
            i, j = flat_actions_to_state_pairs(a, num_abstract_states)
            self.reward_matrix[i, j, a] = 1

        self.actions_for_sigma = np.zeros(
            (num_abstract_states, num_abstract_actions), dtype=np.float32)
        for a in range(num_abstract_actions):
            i, j = flat_actions_to_state_pairs(a, num_abstract_states)
            self.actions_for_sigma[i, a] = 1

        # mask stuff here
        mask = tf.reshape(self.inp_mask, [-1, 1, 1, 1])
        masked_input = self.inp_frames * mask

        l0_vis_scope = 'l0_vis'
        with tf.variable_scope(l0_vis_scope):
            self.visual_output_base = hook_visual(masked_input,
                                                  self.frame_history)
            self.visual_output = tf.stop_gradient(self.visual_output_base)

        with tf.variable_scope('online_base'):
            self.q_online_base = hook_base(self.visual_output_base,
                                           self.num_actions)
        with tf.variable_scope('online_1'):
            self.q_online_1 = hook_l0(self.visual_output, 1, self.num_actions)
        with tf.variable_scope('online_2'):
            self.q_online_2 = hook_l0(self.visual_output, 1, self.num_actions)

        self.q_online = tf.concat(1, [self.q_online_1, self.q_online_2])

        mask_sp = tf.reshape(self.inp_sp_mask, [-1, 1, 1, 1])
        masked_input_sp = self.inp_sp_frames * mask_sp

        l0_target_vis_scope = 'l0_target_vis'
        with tf.variable_scope(l0_target_vis_scope):
            self.visual_output_sp = hook_visual(masked_input_sp,
                                                self.frame_history)
        with tf.variable_scope('target_base'):
            self.q_target_base = hook_base(self.visual_output_sp,
                                           self.num_actions)
        with tf.variable_scope('target_1'):
            self.q_target_1 = hook_l0(self.visual_output_sp, 1,
                                      self.num_actions)
        with tf.variable_scope('target_2'):
            self.q_target_2 = hook_l0(self.visual_output_sp, 1,
                                      self.num_actions)

        self.q_target = tf.concat(1, [self.q_target_1, self.q_target_2])

        # with tf.variable_scope(visual_scope, reuse=True):
        #     # mask stuff here
        #     mask = tf.reshape(self.inp_mask, [-1, 1, 1, 1])
        #     masked_input = self.inp_frames * mask
        #     self.visual_output = hook_visual(masked_input, self.frame_history)
        #
        #     mask_sp = tf.reshape(self.inp_sp_mask, [-1, 1, 1, 1])
        #     masked_input_sp = self.inp_sp_frames * mask_sp
        #     self.visual_output_sp = hook_visual(masked_input_sp, self.frame_history)
        #
        # with tf.variable_scope('online'):
        #     self.q_online = hook_l0(self.visual_output, self.num_abstract_actions, self.num_actions)
        # with tf.variable_scope('target'):
        #     self.q_target = hook_l0(self.visual_output_sp, self.num_abstract_actions, self.num_actions)

        # TODO set up double dqn for later experiments.

        # Q matrix is (num_abstract_actions, num_actions), results in vector with max-q for each abstract action.
        self.maxQ = tf.reduce_max(self.q_target, reduction_indices=2)

        with tf.variable_scope(visual_scope, reuse=True):
            self.l1_visual_output = hook_visual(masked_input,
                                                self.frame_history)
            self.l1_visual_output_sp = hook_visual(masked_input_sp,
                                                   self.frame_history)
        with tf.variable_scope(self.abstraction_scope, reuse=True):
            self.sigma = tf.stop_gradient(
                hook_abstraction(self.l1_visual_output, num_abstract_states,
                                 batch_size)[0])
            self.sigma_p = tf.stop_gradient(
                hook_abstraction(self.l1_visual_output_sp, num_abstract_states,
                                 batch_size)[0])
            self.sigma_query, self.sigma_query_probs = hook_abstraction(
                self.l1_visual_output, self.num_abstract_states, 1)

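        # Look up the abstract reward for the observed (sigma, sigma') transition under each
        # abstract action, then force the reward to -1 on terminal transitions.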
        self.r = tf.reduce_sum(
            tf.reshape(self.sigma_p, [-1, 1, num_abstract_states, 1]) * \
            tf.reshape(self.sigma, [-1, num_abstract_states, 1, 1]) * \
            tf.reshape(self.reward_matrix, [1, num_abstract_states, num_abstract_states, num_abstract_actions]),
            reduction_indices=[1, 2])
        # Give a reward of -1 if reached a terminal state
        self.r = (self.r * tf.reshape(tf.cast(tf.logical_not(self.inp_terminated), dtype=tf.float32), [-1, 1])) +\
                 tf.reshape(tf.cast(self.inp_terminated, dtype=tf.float32) * -1, [-1, 1])

        self.use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                                  dtype=tf.float32) * tf.reduce_sum(
                                      self.sigma_p * self.sigma,
                                      reduction_indices=1)
        self.y = tf.stop_gradient(self.r +
                                  tf.reshape(self.use_backup, [-1, 1]) *
                                  gamma * self.maxQ)
        self.delta = tf.reduce_sum(
            tf.reshape(self.inp_actions, [-1, 1, num_actions]) * self.q_online,
            reduction_indices=2) - self.y
        valid_actions_mask = valid_actions_for_sigma(self.actions_for_sigma,
                                                     self.sigma,
                                                     self.num_abstract_actions)
        self.masked_delta = self.delta * valid_actions_mask
        self.error = tf.select(
            tf.abs(self.masked_delta) < error_clip,
            0.5 * tf.square(self.masked_delta),
            error_clip * tf.abs(self.masked_delta))

        # base dqn
        self.maxQ_base = tf.reduce_max(self.q_target_base, reduction_indices=1)
        self.r_base = tf.sign(self.inp_reward)
        use_backup_base = tf.cast(tf.logical_not(self.inp_terminated),
                                  dtype=tf.float32)
        self.y_base = tf.stop_gradient(self.r_base + use_backup_base * gamma *
                                       self.maxQ_base)
        self.delta_base = tf.reduce_sum(self.inp_actions * self.q_online_base,
                                        reduction_indices=1) - self.y_base
        self.error_base = tf.select(
            tf.abs(self.delta_base) < error_clip,
            0.5 * tf.square(self.delta_base),
            error_clip * tf.abs(self.delta_base))

        self.loss = tf.reduce_sum(self.error) + tf.reduce_sum(self.error_base)
        self.g = tf.gradients(self.loss, self.q_online)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              decay=0.95,
                                              centered=True,
                                              epsilon=0.01)
        self.train_op = optimizer.minimize(self.loss,
                                           var_list=th.get_vars(
                                               'online_1', 'online_2',
                                               'online_base', l0_vis_scope))
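        # Sync each online sub-network (both Q heads, the visual trunk, and the base DQN)
        # to its corresponding target copy.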
        self.copy_op = [
            th.make_copy_op('online_1', 'target_1'),
            th.make_copy_op('online_2', 'target_2'),
            th.make_copy_op(l0_vis_scope, l0_target_vis_scope),
            th.make_copy_op('online_base', 'target_base')
        ]

        self.replay_buffer = L1ReplayMemory((84, 84), 'uint8',
                                            replay_memory_size, frame_history)
        self.frame_history = frame_history
        self.replay_start_size = replay_start_size
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_steps = epsilon_steps
        self.epsilon_delta = (self.epsilon -
                              self.epsilon_min) / self.epsilon_steps
        self.update_freq = update_freq
        self.target_copy_freq = target_copy_freq
        self.action_ticker = 1
        self.max_episode_steps = max_episode_steps

        self.num_actions = num_actions
        self.batch_size = batch_size

        self.base_network_saver = tf.train.Saver(
            var_list=th.get_vars('online_base', l0_vis_scope))