Example 1
    def evaluate(self, eval_dl, batch_idx):
        local_data, local_size = eval_dl.get_batch(batch_idx=batch_idx)
        active_input_names = set(self.active_input_tensor_dict.keys()) & set(local_data.keys())
        fd = {self.active_input_tensor_dict[key]: local_data[key] for key in active_input_names}
        local_output_list = self.sess.run(self.output_tensor_list,
                                          feed_dict=fd,
                                          options=self.run_options,
                                          run_metadata=self.run_metadata)

        # Collect all inputs / outputs of this batch into eval_detail_dict (split by each data point)
        local_eval_detail_dict = dict(fd)  # copy, so the original feed_dict is left untouched
        local_eval_detail_dict.update({k: v for k, v in zip(self.output_tensor_names, local_output_list)})
        for tensor_name, batch_val in local_eval_detail_dict.items():
            for val in batch_val:
                self.eval_detail_dict.setdefault(tensor_name, []).append(val)

        self.scan_data += local_size
        self.scan_batch += 1
        self.tb_point += 1
        if self.scan_batch % self.ob_batch_num == 0:
            LogInfo.logs('[%3s][eval-%s-B%d/%d] scanned = %d/%d',
                         self.task_name,
                         eval_dl.mode,
                         self.scan_batch,
                         eval_dl.n_batch,
                         self.scan_data,
                         len(eval_dl))
Example 2
 def save(self, directory):
     import os
     if not os.path.isdir(directory):
         os.mkdir(directory)
     fp = os.path.join(directory, "best_model")
     self.saver.save(self.sess, fp)
     LogInfo.logs("Model saved into %s.", fp)
Example 3
 def _get_translation_weights(self, setting):
     # weight_fp = "/home/kangqi/workspace/PythonProject/runnings/tabel/translation" \
     #             "/try_0/weights.Mlinear-cosine_D100_lr0.0050_reg0.0000"
     if setting == "common":
         weight_fp = "/home/xusheng/TabelProject/data/weight/weights.from_common_words"
     else:
         assert setting in ("500", "1000", "1500", "2000", "2500", "3000")
         weight_fp = ("/home/kangqi/workspace/PythonProject/runnings/tabel/translation"
                      "/try_5/Mlinear-cosine_D100_keep%04d_lr0.0050_reg0.0000/weights" % int(setting))
     with open(weight_fp, 'rb') as fin:
         W_value = np.load(fin)
         b_value = np.load(fin)
     self.sess.run([
         self.weights['w_trans'].assign(W_value),
         self.weights['b_trans'].assign(b_value)
     ])
     LogInfo.logs("[model] pre-trained translation loaded from %s.",
                  weight_fp)
Example 4
    def forward(self,
                path_wd_hidden,
                path_kb_hidden,
                path_len,
                focus_wd_hidden,
                focus_kb_hidden,
                reuse=None):
        LogInfo.begin_track('SkBiRNNModule forward: ')

        with tf.variable_scope('SkBiRNNModule', reuse=reuse):
            if self.data_source == 'kb':
                use_path_hidden = path_kb_hidden
                use_focus_hidden = focus_kb_hidden
            elif self.data_source == 'word':
                use_path_hidden = path_wd_hidden
                use_focus_hidden = focus_wd_hidden
            else:
                use_path_hidden = tf.concat([path_kb_hidden, path_wd_hidden],
                                            axis=-1,
                                            name='use_path_hidden')
                # (batch, path_max_len, dim_item_hidden + dim_kb_hidden)
                use_focus_hidden = tf.concat(
                    [focus_kb_hidden, focus_wd_hidden],
                    axis=-1,
                    name='use_focus_hidden')
                # (batch, dim_item_hidden + dim_kb_hidden)

            use_path_emb_input = tf.concat(
                [tf.expand_dims(use_focus_hidden, axis=1), use_path_hidden],
                axis=1,
                name='use_path_emb_input'
            )  # (batch, path_max_len + 1, dim_use)
            show_tensor(use_path_emb_input)
            use_path_len = path_len + 1
            stamps = self.path_max_len + 1
            birnn_inputs = tf.unstack(use_path_emb_input,
                                      num=stamps,
                                      axis=1,
                                      name='birnn_inputs')
            encoder_output = self.rnn_encoder.encode(
                inputs=birnn_inputs, sequence_length=use_path_len, reuse=reuse)
            rnn_outputs = tf.stack(
                encoder_output.outputs, axis=1,
                name='rnn_outputs')  # (batch, path_max_len + 1, dim_sk_hidden)

            # In BiRNN mode we simply average the RNN outputs, dividing their sum by the actual path length.

            sum_sk_hidden = tf.reduce_sum(
                rnn_outputs, axis=1,
                name='sum_sk_hidden')  # (batch, dim_sk_hidden)
            use_path_len_mat = tf.cast(
                tf.expand_dims(use_path_len, axis=1),
                dtype=tf.float32,
                name='use_path_len_mat')  # (batch, 1) as float32
            sk_hidden = tf.div(sum_sk_hidden,
                               use_path_len_mat,
                               name='sk_hidden')  # (batch, dim_sk_hidden)

        LogInfo.end_track()
        return sk_hidden
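
A quick numpy sanity check of the averaging step above (illustrative only; it assumes the rnn_encoder emits zero vectors at padded time steps, so dividing the summed outputs by the true sequence length yields the mean over valid steps):

import numpy as np

# (batch=1, stamps=3, dim=2); the last stamp is padding and contributes zeros
rnn_outputs = np.array([[[1., 1.], [3., 3.], [0., 0.]]], dtype='float32')
use_path_len = np.array([2], dtype='float32')
sk_hidden = rnn_outputs.sum(axis=1) / use_path_len[:, None]  # (batch, dim)
print(sk_hidden)  # -> [[2. 2.]]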
Example 5
 def evaluate_all(self, eval_dl, detail_fp=None, result_fp=None):
     self.reset_eval_info()
     for batch_idx in range(eval_dl.n_batch):
         self.evaluate(eval_dl=eval_dl, batch_idx=batch_idx)
     ret_f1 = self.post_process(eval_dl=eval_dl, detail_fp=detail_fp, result_fp=result_fp)
     LogInfo.logs('[%3s] %s_F1 = %.6f', self.task_name, eval_dl.mode, ret_f1)      # [ rm] train_F1 = xx
     return ret_f1
Example 6
def start_entity_search(entity_linkings, conflict_matrix, tag_set):
    LogInfo.begin_track('Searching at M/E level ...')
    entity_available_combs = []  # the return value
    el_size = len(entity_linkings)
    gl_size = len(conflict_matrix)
    for mf_idx, main_focus in enumerate(entity_linkings):
        gl_pos = main_focus.gl_pos
        visit_arr = [0] * gl_size
        for conf_idx in conflict_matrix[gl_pos]:
            visit_arr[conf_idx] += 1
        gl_data_indices = [gl_pos]
        tag_elements = []  # create the initial state of search
        mid = main_focus.value
        type_list = get_entity_type(mid)
        for tp_idx, tp in enumerate(type_list):
            state_marker = [
                'M%d/%d-(t%d/%d)' %
                (mf_idx + 1, el_size, tp_idx + 1, len(type_list))
            ]
            tag_elements.append('M:%s' % tp)
            entity_search_dfs(entity_linkings=entity_linkings,
                              conflict_matrix=conflict_matrix,
                              tag_set=tag_set,
                              cur_el_idx=-1,
                              gl_data_indices=gl_data_indices,
                              tag_elements=tag_elements,
                              visit_arr=visit_arr,
                              entity_available_combs=entity_available_combs,
                              state_marker=state_marker)
            del tag_elements[-1]
    LogInfo.end_track()
    return entity_available_combs
Example 7
def init_emb(name, actual_dict, dim_emb, full_dict=None, full_mat=None):
    """
    Given the actual entries and the full embedding info, construct the actual initial embedding matrix
    :param name: word/entity/predicate
    :param actual_dict: the dict storing actual entries <item, idx>
    :param dim_emb: embedding dimension
    :param full_dict: the full dict of entries <item, idx>
    :param full_mat: the full embedding matrix in numpy format
    :return: the actual initial embedding matrix in numpy format
    """
    if full_mat is not None:
        assert dim_emb == full_mat.shape[1]
    actual_size = len(actual_dict)
    ret_emb_matrix = np.random.uniform(
        low=-0.1, high=0.1, size=(actual_size, dim_emb)).astype('float32')
    # random initialization within [-0.1, 0.1]
    if full_dict is None or full_mat is None:
        # no pre-trained embedding (e.g. TransE) available: keep the pure random initialization
        LogInfo.logs('%s: build %s actual init embedding matrix at random.', name, ret_emb_matrix.shape)
        return ret_emb_matrix

    for item, target_row_idx in actual_dict.items():
        if item in full_dict:  # copy the pre-trained row for every shared entry
            original_row_idx = full_dict[item]
            ret_emb_matrix[target_row_idx] = full_mat[original_row_idx]
    LogInfo.logs('%s: build %s actual init embedding matrix from full matrix with shape %s.',
                 name, ret_emb_matrix.shape, full_mat.shape if full_mat is not None else '[None]')
    return ret_emb_matrix
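
A minimal usage sketch of init_emb with hypothetical toy dictionaries (it assumes the function and its LogInfo dependency are importable): rows shared with full_dict are copied from full_mat, while entries missing from it ('brand_new' below) keep the random initialization.

import numpy as np

full_dict = {'person': 0, 'location': 1, 'film': 2}
full_mat = np.random.uniform(-0.1, 0.1, size=(3, 100)).astype('float32')
actual_dict = {'person': 0, 'film': 1, 'brand_new': 2}

emb = init_emb(name='word', actual_dict=actual_dict, dim_emb=100,
               full_dict=full_dict, full_mat=full_mat)
assert emb.shape == (3, 100)
assert np.allclose(emb[0], full_mat[0])  # 'person' is copied from the full matrix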
Example 8
    def build_active_voc(self, wd_emb_util, path_domain_dict):
        # LogInfo.begin_track('Showing path_domain samples:')
        # for k, v in path_domain_dict.items()[:50]:
        #     LogInfo.logs('[%s] --> %s', k, v)
        # LogInfo.end_track()
        word_idx_dict = wd_emb_util.load_word_indices()
        path_size = len(self.path_idx_dict)
        self.pw_max_len = 0
        self.pw_voc_length = np.zeros(shape=(path_size, ), dtype='int32')
        self.pw_voc_domain = np.zeros(shape=(path_size, ), dtype='int32')
        pw_voc_dict = {}  # dict of path word sequence (each word is represented by word index)

        for path_str, idx in self.path_idx_dict.items():
            if idx <= 2:  # PAD, START, UNK
                pw_idx_seq = []
            else:
                path_cate, mid_str = path_str.split('|')
                mid_seq = mid_str.split('\t')
                pw_idx_seq = []
                for mid in mid_seq:
                    p_name = get_item_name(mid)
                    if p_name != '':
                        spt = p_name.split(' ')
                        for wd in spt:
                            wd_idx = word_idx_dict.get(wd, 2)  # UNK if needed
                            pw_idx_seq.append(wd_idx)
                # pw_idx_seq = pw_idx_seq[:self.pw_cutoff]  # truncate if exceeding length limit
            self.pw_voc_length[idx] = len(pw_idx_seq)
            domain_type = path_domain_dict.get(path_str, '')
            if domain_type == '':
                domain_type_idx = 0  # PAD
            else:
                domain_type_idx = self.type_idx_dict.get(domain_type, 2)  # UNK
            self.pw_voc_domain[idx] = domain_type_idx
            pw_voc_dict[idx] = pw_idx_seq
        LogInfo.logs('IN_USE: %s pw_voc_domain constructed.',
                     self.pw_voc_domain.shape)
        LogInfo.logs('IN_USE: %s pw_voc_length constructed.',
                     self.pw_voc_length.shape)
        for pos in (25, 50, 75, 90, 95, 99, 99.9, 100):
            LogInfo.logs('Percentile = %.1f%%: %.6f', pos,
                         np.percentile(self.pw_voc_length, pos))
        self.pw_max_len = np.max(self.pw_voc_length)
        LogInfo.logs('IN_USE: pw_max_len = %d.', self.pw_max_len)

        # for path_str, idx in self.path_idx_dict.items():
        #     local_len = self.pw_voc_length[idx]
        #     if local_len > 7:
        #         LogInfo.logs('Length = %d [%s] --> %s', local_len, path_str, pw_voc_dict[idx])

        assert len(pw_voc_dict) == path_size  # ensure no paths sharing the same index
        self.pw_voc_inputs = np.zeros(shape=(path_size, self.pw_max_len),
                                      dtype='int32')
        for idx, pw_idx_seq in pw_voc_dict.items():
            local_len = len(pw_idx_seq)
            self.pw_voc_inputs[idx, :local_len] = pw_idx_seq
        LogInfo.logs('IN_USE: %s pw_voc_inputs constructed.',
                     self.pw_voc_inputs.shape)
Example 9
    def __init__(
        self,
        wd_emb,
        dim_emb,
        emb_dir='data/compQA/word_emb_in_use',
        # parser_ip='202.120.38.146',
        # parser_port=9601):     # BH: 9601; DS: 8601
    ):
        self.word_dict_fp = '%s/word_emb.indices' % emb_dir
        self.word_emb_mat_fp = '%s/word_emb.%s_%d.npy' % (emb_dir, wd_emb,
                                                          dim_emb)
        self.dim_emb = dim_emb
        self.word_idx_dict = None
        self.word_emb_matrix = None
        self.n_words = None

        self.mid_dict_fp = '%s/mid_emb.indices' % emb_dir
        self.mid_emb_mat_fp = '%s/mid_emb.%s_%d.npy' % (emb_dir, wd_emb,
                                                        dim_emb)
        self.mid_idx_dict = None
        self.mid_emb_matrix = None
        self.n_mids = None

        self.load_word_indices()
        self.load_mid_indices()

        self.dep_name_dict = {}
        with open(emb_dir + '/dep_names.txt', 'r') as br:
            for line in br.readlines():
                dep, name = line.strip().split('\t')
                self.dep_name_dict[dep] = name
        LogInfo.logs('%d dependency names loaded.', len(self.dep_name_dict))
Example 10
    def __init__(self,
                 dataset,
                 mode,
                 q_max_len,
                 sc_max_len,
                 path_max_len,
                 item_max_len,
                 batch_size,
                 sampling_config,
                 dynamic=True,
                 shuffle=True,
                 verbose=0):
        super(QScPairDataLoader, self).__init__(batch_size=batch_size,
                                                mode=mode,
                                                dynamic=dynamic,
                                                shuffle=shuffle)
        self.dataset = dataset
        self.verbose = verbose
        self.sampling_config = sampling_config

        sample_func_name = self.sampling_config['name']
        assert sample_func_name in [
            'generate_pairs_by_gold_f1', 'generate_pairs_by_runtime_score'
        ]
        LogInfo.logs('Negative sampling function: %s', sample_func_name)
        self.neg_sample_func = getattr(self, sample_func_name)
        del self.sampling_config['name']

        self.q_max_len = q_max_len
        self.sc_max_len = sc_max_len
        self.path_max_len = path_max_len
        self.item_max_len = item_max_len

        self.np_data_list = None
Example 11
 def build_path_repr__single(self, pw_emb, pw_len, path_emb, pseq_emb, pseq_len, rnn_encoder):
     """
     :param pw_emb: (ds, path_max_size, pw_max_len, dim_emb)
     :param pw_len: (ds, path_max_size)
     :param path_emb: (ds, path_max_size, dim_emb)
     :param pseq_emb: (ds, path_max_size, pseq_max_len, dim_emb)
     :param pseq_len: (ds, path_max_size)
     :param rnn_encoder:
     """
     LogInfo.logs('build_path_repr: path_usage = [%s].', self.path_usage)
     assert len(self.path_usage) == 2
     pw_repr = self.build_path_repr__pw_side(
         pw_emb=pw_emb, pw_len=pw_len,
         rnn_encoder=rnn_encoder,
         pw_usage=self.path_usage[0]
     )
     pseq_repr = self.build_path_repr__pseq_side(
         path_emb=path_emb, pseq_emb=pseq_emb, pseq_len=pseq_len,
         rnn_encoder=rnn_encoder, pseq_usage=self.path_usage[1]
     )
     if pw_repr is None:
         assert pseq_repr is not None
         final_repr = pseq_repr
     elif pseq_repr is None:
         final_repr = pw_repr
     else:   # summation
         final_repr = pw_repr + pseq_repr
     return final_repr       # (ds, path_max_size, dim_emb or dim_hidden)
Example 12
def load_schema_by_kqnew_protocol(schema_fp, gather_linkings,
                                  sc_len_dist, path_len_dist, sc_max_len, schema_level):
    """
    Read the schema files generated by KQ, using the CompqSchema class defined in kq_schema.py.
    Raw paths are read from the json files and converted into path_list on-the-fly.
    Used after 12/05/2017.
    schema_level: 0/1/2/3 (STRICT/ELEGANT/COHERENT/GENERAL)
    """
    LogInfo.logs('Schema level: %s', schema_level)
    schema_level = schema_level_dict[schema_level]
    super_type_dict = load_super_type_dict()
    candidate_list = []
    path_list_str_set = set([])
    with codecs.open(schema_fp, 'r', 'utf-8') as br:
        lines = br.readlines()
        for ori_idx, line in enumerate(lines):
            sc = CompqSchema.read_schema_from_json(json_line=line, gather_linkings=gather_linkings)
            sc.ori_idx = ori_idx + 1
            sc.construct_path_list()        # create the path_list on-the-fly
            path_list_str = sc.disp()
            """
                From the perspective of candidate searching in eff_candgen:
                since main paths and constraint paths are treated in different directions,
                there is no so-called duplicate schema at all.
                171226: except for duplicate entities in the EL results.
            """
            path_list_str_set.add(path_list_str)
            sc_len_dist.append(len(sc.path_list))
            for path in sc.path_list:
                path_len_dist.append(len(path))
            if len(sc.path_list) <= sc_max_len and schema_classification(sc, super_type_dict) <= schema_level:
                candidate_list.append(sc)
    return candidate_list, path_list_str_set, len(lines)
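
A hedged call sketch, not runnable as-is: the schema file path is hypothetical, gather_linkings stands for the LinkData list loaded from the matching *_links file, and it assumes schema_level_dict maps the level names in the docstring ('STRICT', 'ELEGANT', ...) to the integers 0-3.

sc_len_dist, path_len_dist = [], []
candidate_list, path_list_str_set, n_lines = load_schema_by_kqnew_protocol(
    schema_fp='candgen_output/0-99/42_schema',   # hypothetical location
    gather_linkings=gather_linkings,             # list of LinkData, loaded elsewhere
    sc_len_dist=sc_len_dist, path_len_dist=path_len_dist,
    sc_max_len=3, schema_level='ELEGANT')
LogInfo.logs('%d candidates kept out of %d lines.', len(candidate_list), n_lines)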
Example 13
def create_new_emb_value(full_indices_fp, full_matrix_fp, target_indices_fp,
                         out_target_matrix_fp):
    with open(full_indices_fp, 'r') as br:
        full_word_dict = cPickle.load(br)
    LogInfo.logs('Full size: %d', len(full_word_dict))
    with open(target_indices_fp, 'r') as br:
        target_word_dict = cPickle.load(br)
    LogInfo.logs('Target size: %d', len(target_word_dict))

    full_emb_mat = np.load(full_matrix_fp)
    LogInfo.logs('Full matrix: %s', full_emb_mat.shape)
    full_size, dim_emb = full_emb_mat.shape
    target_size = len(target_word_dict)
    target_emb_mat = np.random.uniform(low=-0.1,
                                       high=0.1,
                                       size=(target_size,
                                             dim_emb)).astype('float32')
    for wd in target_word_dict:
        if wd in ('<START>', '<PAD>', '<UNK>'):
            continue  # leave them alone
        target_idx = target_word_dict[wd]
        if wd in full_word_dict:
            full_idx = full_word_dict[wd]
            target_emb_mat[target_idx] = full_emb_mat[full_idx]
    LogInfo.logs('Ready for saving ...')
    np.save(out_target_matrix_fp, target_emb_mat)
    LogInfo.logs('Saving Done.')
Example 14
def show_overall_detail(sc):
    rich_feats_concat = sc.run_info['rich_feats_concat'].tolist()
    for category, gl_data, pred_seq in sc.raw_paths:
        LogInfo.logs('%s: link = [(#-%d) %s %s], pred_seq = %s', category,
                     gl_data.gl_pos, gl_data.comp, gl_data.value, pred_seq)
    show_str = '  '.join(['%6.3f' % x for x in rich_feats_concat])
    LogInfo.logs('rich_feats_concat = [%s]', show_str)
Example 15
def retrieve_schema(data_dir, q_idx, line_no):
    if line_no == -1:
        return
    div = q_idx / 100
    sub_dir = '%d-%d' % (div * 100, div * 100 + 99)
    sc_fp = '%s/%s/%d_schema' % (data_dir, sub_dir, q_idx)
    link_fp = '%s/%s/%d_links' % (data_dir, sub_dir, q_idx)
    gather_linkings = []
    with codecs.open(link_fp, 'r', 'utf-8') as br:
        for gl_line in br.readlines():
            tup_list = json.loads(gl_line.strip())
            ld_dict = {k: v for k, v in tup_list}
            gather_linkings.append(LinkData(**ld_dict))
    json_line = linecache.getline(sc_fp, lineno=line_no).strip()
    sc = CompqSchema.read_schema_from_json(q_idx=q_idx, json_line=json_line,
                                           gather_linkings=gather_linkings,
                                           use_ans_type_dist=False)
    LogInfo.logs('Answer size = %d', sc.ans_size)
    LogInfo.logs('P / R / F1 = %.3f / %.3f / %.3f', sc.p, sc.r, sc.f1)
    for path_idx, raw_path in enumerate(sc.raw_paths):
        category, gl_data, pred_seq = raw_path
        LogInfo.logs('Path-%d: [%s] [%s] [%s %s (%s)]',
                     path_idx+1, category, gl_data.mention, gl_data.comp, gl_data.value, gl_data.name)
        LogInfo.logs('        %s', pred_seq)
    LogInfo.logs('SPARQL: %s', sc.build_sparql())
Example 16
def load_data_and_reformulate(pydump_fp):
    np_list = load_numpy_input_with_names(pydump_fp)
    # ==== 140419: The np list contains the following items: ==== #
    q_tensor3, el_tensor3, path_tensor4, \
    score_tensor3, mask_matrix, \
    ord_x_matrix, ord_pred_tensor3, ord_op_tensor3, \
    ord_obj_tensor3, ord_mask_matrix = np_list
    # =========================================================== #
    size = q_tensor3.shape[0]
    LogInfo.logs('QA size = %d.', size)

    gold_matrix = score_tensor3[:, :, 2]  # just use F1
    best_matrix = np.zeros(shape=gold_matrix.shape, dtype='float32')
    best_matrix[:, 0] = 1.0
    # we've ranked all schemas, so the first candidate must be the best

    opt_np_list = []
    opt_np_list += [
        q_tensor3, path_tensor4, el_tensor3, gold_matrix, best_matrix,
        mask_matrix
    ]  # corresponding to basic_tf_list
    opt_np_list += [
        ord_x_matrix, ord_pred_tensor3, ord_op_tensor3, ord_obj_tensor3,
        ord_mask_matrix
    ]  # corresponding to ordinal_tf_list
    return opt_np_list
Example 17
def load_raw_names():
    tidx_tp_dict = {}  # <t_idx, type>
    tidx_name_dict = {}  # <t_idx, name>

    for fp, _dict in [('type_names.tsv', tidx_name_dict),
                      ('type_dict.tsv', tidx_tp_dict)]:
        with open(type_res_dir + '/' + fp, 'r') as br:
            for line in br.readlines():
                spt = line.strip().split('\t')
                if len(spt) == 2:
                    idx, item = spt
                    _dict[int(idx)] = item
                else:
                    _dict[int(spt[0])] = ''
            LogInfo.logs('%d items loaded from %s.', len(_dict), fp)

    assert len(tidx_tp_dict) == len(tidx_name_dict)
    size = len(tidx_name_dict)

    type_name_dict = {}  # <type, real name> (type.object.name)
    raw_name_list = []
    for idx in range(1, size + 1):
        tp = tidx_tp_dict[idx]
        name = tidx_name_dict[idx]
        name_from_id = tp[tp.rfind('.') + 1:]
        type_name_dict[tp] = name if name != '' else name_from_id
        if name != '':
            raw_name_list.append((tp, name))
        raw_name_list.append((tp, name_from_id))

    LogInfo.logs('%d <type, raw names> loaded.', len(raw_name_list))
    return type_name_dict, raw_name_list
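
The name-fallback rule above (when a type has no explicit name, use the last dotted component of its id) can be replayed on toy data; this stand-alone sketch mirrors the loop without touching the real tsv files.

toy = {'film.film': 'Film', 'base.schemastaging.nutrition_fact': ''}
type_name_dict = {}
for tp, name in toy.items():
    name_from_id = tp[tp.rfind('.') + 1:]
    type_name_dict[tp] = name if name != '' else name_from_id
print(type_name_dict)
# -> {'film.film': 'Film', 'base.schemastaging.nutrition_fact': 'nutrition_fact'}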
Example 18
 def save_size(self):
     with open(self.size_fp, 'w') as bw:
         bw.write('words\t%d\n' % self.w_size)
         bw.write('entities\t%d\n' % self.e_size)
         bw.write('predicates\t%d\n' % self.p_size)
         bw.write('array_num\t%d\n' % self.array_num)
     LogInfo.logs('W/E/P/ArrNum size saved.')
Example 19
def collect_data(old_data_fp):
    q_links_dict = {}
    q_schema_dict = {}
    for q_idx in range(q_size):
        if q_idx % 100 == 0:
            LogInfo.logs('Current: %d / %d', q_idx, q_size)
        # if q_idx >= 100:
        #     break
        div = q_idx / 100
        sub_dir = '%d-%d' % (div * 100, div * 100 + 99)
        schema_fp = '%s/%s/%d_schema' % (old_data_fp, sub_dir, q_idx)
        link_fp = '%s/%s/%d_links' % (old_data_fp, sub_dir, q_idx)
        gather_linkings = []
        with codecs.open(link_fp, 'r', 'utf-8') as br:
            for line in br.readlines():
                tup_list = json.loads(line.strip())
                ld_dict = {k: v for k, v in tup_list}
                gather_linkings.append(LinkData(**ld_dict))
        strict_sc_list = []
        with codecs.open(schema_fp, 'r', 'utf-8') as br:
            lines = br.readlines()
            for ori_idx, line in enumerate(lines):
                sc = CompqSchema.read_schema_from_json(
                    q_idx,
                    json_line=line,
                    gather_linkings=gather_linkings,
                    use_ans_type_dist=False,
                    placeholder_policy='ActiveOnly')
                sc.ori_idx = ori_idx
                if schema_classification(sc) == 0:  # only pick strict schemas
                    strict_sc_list.append(sc)
        q_links_dict[q_idx] = gather_linkings
        q_schema_dict[q_idx] = strict_sc_list
    return q_links_dict, q_schema_dict
Example 20
 def build_improved(self, score_tf, label_tf, mask_tf):
     grad_tf_list = self.get_gradient_tf_list(score_tf)
     final_loss_tf, sum_lambda_tf = self.get_lambda_tf(
         score_tf, label_tf, mask_tf)
     update_list = self.get_update_list(grad_tf_list, sum_lambda_tf)
     LogInfo.logs('update_list (lambda-based) built.')
     return final_loss_tf, update_list
Example 21
    def load_necessary_entity_predicate_dict(self):
        """
        Scan FB E/T/P names, keeping only the <mid, index> pairs that occur in the candidate pool
        :return: <mid, index> dictionary for both entities (including types) and predicates
        """
        e_set = set([])
        t_set = set([])
        p_set = set([])  # the sets maintaining all the entries observed in the current candidates
        for cand_list in self.q_cand_dict.values():
            for cand in cand_list:
                cand.update_item_set(e_set=e_set, t_set=t_set, p_set=p_set)
        LogInfo.logs('%d E + %d T + %d P collected.', len(e_set), len(t_set), len(p_set))
        self.fb_helper.load_names(e_set=e_set, t_set=t_set, p_set=p_set)

        e_dict = {'': 0}        # give index 0 to represent the empty entity (for padding)
        for item_set in (e_set, t_set):
            for item in item_set:
                e_dict[item] = len(e_dict)
        # e_dict = {e: e_idx for e_idx, e in enumerate(e_set)}
        # e_dict.update({t: t_idx + len(e_dict) for t_idx, t in enumerate(t_set)})

        p_dict = {p: p_idx + 1 for p_idx, p in enumerate(p_set)}
        p_dict[''] = 0      # also give index 0 to represent empty predicate (for padding)
        # p_dict = {p: p_idx for p_idx, p in enumerate(p_set)}

        return e_dict, p_dict
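
A toy replay of the indexing scheme returned by this method (independent of the class; sorted() is added only to make the toy assignment deterministic, since set iteration order is arbitrary):

e_set, t_set = {'m.0abc'}, {'people.person'}
p_set = {'people.person.nationality', 'film.film.directed_by'}

e_dict = {'': 0}  # index 0 = empty entity, used for padding
for item_set in (e_set, t_set):
    for item in sorted(item_set):
        e_dict[item] = len(e_dict)

p_dict = {p: p_idx + 1 for p_idx, p in enumerate(sorted(p_set))}
p_dict[''] = 0    # index 0 = empty predicate, used for padding

assert e_dict == {'': 0, 'm.0abc': 1, 'people.person': 2}
assert p_dict == {'': 0, 'film.film.directed_by': 1, 'people.person.nationality': 2}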
Example 22
 def build(self, score_tf, label_tf, mask_tf):
     pred_tf, gold_tf, useful_pair_tf, final_loss_tf = self.get_loss_tf(
         score_tf, label_tf, mask_tf)
     train_step = tf.train.AdamOptimizer(
         self.learning_rate).minimize(final_loss_tf)
     LogInfo.logs('train_step (normal) built.')
     return final_loss_tf, train_step
Example 23
def analyze_output(data_dir, sort_item):
    """ Check the rank distribution in a global perspective """
    rank_matrix = [[], [], [], []]  # focus on 4 tiers: F1 = 1.0, F1 >= 0.5, F1 >= 0.1, F1 > 0
    fp = '%s/lexicon_validate/srt_output.%s.txt' % (data_dir, sort_item)
    with codecs.open(fp, 'r', 'utf-8') as br:
        for line in br.readlines():
            spt = line.strip().split('\t')
            f1 = float(spt[-2])
            rank = int(spt[-1])
            if rank == -1:
                rank = 2111222333
            if f1 == 1.0:
                rank_matrix[0].append(rank)
            if f1 >= 0.5:
                rank_matrix[1].append(rank)
            if f1 >= 0.1:
                rank_matrix[2].append(rank)
            if f1 >= 1e-6:
                rank_matrix[3].append(rank)
    for ths, rank_list in zip((1.0, 0.5, 0.1, 1e-6), rank_matrix):
        LogInfo.begin_track('Show stat for F1 >= %.6f:', ths)
        rank_list = np.array(rank_list)
        case_size = len(rank_list)
        LogInfo.logs('Total cases = %d.', case_size)
        LogInfo.logs('MRR = %.6f', np.mean(1. / rank_list))
        for pos in (50, 60, 70, 80, 90, 95, 99, 99.9, 100):
            LogInfo.logs('Percentile = %.1f%%: %.6f', pos,
                         np.percentile(rank_list, pos))
        LogInfo.end_track()
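
The MRR / percentile bookkeeping can be checked on a toy rank list (hypothetical ranks; an unanswered question keeps the huge sentinel rank used above, so its reciprocal rank is effectively zero):

import numpy as np

rank_list = np.array([1, 3, 10, 2111222333], dtype='float64')
print('MRR = %.6f' % np.mean(1. / rank_list))            # ~0.358333
print('90%% percentile = %.1f' % np.percentile(rank_list, 90))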
Example 24
    def __init__(self,
                 base='/home/xianyang/aqqu/aqqu',
                 parser_ip='202.120.38.146',
                 parser_port=9601,
                 linking_mode='Raw',
                 q_links_dict=None,
                 lukov_linker=None):
        self.base = base
        self.linking_mode = linking_mode
        self.q_links_dict = q_links_dict  # save S-MART results
        self.lukov_linker = lukov_linker
        assert linking_mode in ('Raw', 'S-MART', 'Lukov')
        if linking_mode == 'Lukov':
            assert self.lukov_linker is not None
        """
            Raw: the raw version, won't read anything from S-MART or our Lukov's implementation
            S-MART: read from S-MART result (only available in WebQ)
            Lukov: read from our lukov_ngram linker data
        """
        LogInfo.logs('Initiating parser ... ')
        self.parser = parser.CoreNLPParser(
            'http://%s:%d/parse' %
            (parser_ip, parser_port))  # just open the parser

        self.is_data_loaded = False
        self.surface_index = None
        self.entity_linker = None
        self.type_linker = None

        self.smart_score_disc = Discretizer(
            split_list=[2, 3, 8, 50, 2000, 12500, 25000, 40000],
            output_mode='list')
        # the split distribution is manually designed by observing S-MART data in both CompQ & WebQ datasets

        self.pop_filter_num = 5
Example 25
    def optimize(self, optm_dl, batch_idx):
        local_data_list, local_indices = optm_dl.get_batch(batch_idx=batch_idx)
        local_size = len(local_indices)
        fd = {
            input_tf: local_data
            for input_tf, local_data in zip(self.input_tensor_list,
                                            local_data_list)
        }

        _, local_loss, local_extra, summary = self.sess.run(
            [self.optm_step, self.loss, self.extra_data, self.optm_summary],
            feed_dict=fd,
            options=self.run_options,
            run_metadata=self.run_metadata)
        local_loss = float(local_loss)
        self.ret_loss = (self.ret_loss * self.scan_data + local_loss *
                         local_size) / (self.scan_data + local_size)
        self.scan_data += local_size
        self.scan_batch += 1
        self.tb_point += 1
        if self.scan_batch % self.ob_batch_num == 0:
            LogInfo.logs(
                '[%3s][optm-%s-B%d/%d] cur_batch_loss = %.6f, avg_loss = %.6f, scanned = %d/%d',
                self.name, optm_dl.mode, self.scan_batch, optm_dl.n_batch,
                local_loss, self.ret_loss, self.scan_data, len(optm_dl))

            # """ For batch=1 debug only!! """
            # q_idx, pos_sc, neg_sc, weight = optm_dl.optm_pair_tup_list[batch_idx]
            # LogInfo.logs('  q_idx = %4d, pos_sc: line = %4d, score = %.6f, rm_f1 = %.6f',
            #              q_idx, pos_sc.ori_idx, local_extra[0], pos_sc.rm_f1)
            # LogInfo.logs('  q_idx = %4d, neg_sc: line = %4d, score = %.6f, rm_f1 = %.6f',
            #              q_idx, neg_sc.ori_idx, local_extra[1], neg_sc.rm_f1)

        if self.summary_writer is not None:
            self.summary_writer.add_summary(summary, self.tb_point)
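
A quick worked example of the running-average update above (hypothetical numbers): if 100 examples have already been scanned with avg_loss 0.50 and the new batch holds 20 examples with batch loss 0.80, the update gives (0.50 * 100 + 0.80 * 20) / (100 + 20) = 66.0 / 120 = 0.55.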
Example 26
def make_combination(gather_linkings, sparql_driver, vb):
    """
    Given the E/T/Tm linkings, return all the possible combination of query structure.
    The only restrict: can't use multiple linkings with overlapped mention.
    ** Used in either WebQ or CompQ, not SimpQ **
    :param gather_linkings: list of named_tuple (detail, category, comparison, display)
    :param sparql_driver: the sparql query engine
    :param vb: verbose
    :return: the dictionary including all necessary information of a schema.
    """
    sz = len(gather_linkings)
    el_size = len(filter(lambda x: x.category == 'Entity', gather_linkings))

    # Step 1: Prepare conflict matrix
    conflict_matrix = []
    for i in range(sz):
        local_conf_list = []
        for j in range(sz):
            if is_overlap(gather_linkings[i].detail,
                          gather_linkings[j].detail):
                local_conf_list.append(j)
            elif gather_linkings[i].category == 'Type' and gather_linkings[j].category == 'Type':
                local_conf_list.append(j)
                """ 180205: We add this restriction for saving time."""
                """ I thought there should be only one type constraint in the schema. """
                """ Don't make the task even more complex. """
        conflict_matrix.append(local_conf_list)

    # Step 2: start combination searching
    LogInfo.begin_track(
        'Starting searching combination (total links = %d, entities = %d):',
        len(gather_linkings), el_size)
    ground_comb_list = []  # [ (comb, path_len, sparql_query_ret) ]
    for path_len in (1, 2):
        for mf_idx, main_focus in enumerate(gather_linkings):
            if main_focus.category != 'Entity':
                continue
            visit_arr = [0] * sz  # indicating how many conflicts at the particular searching state
            state_marker = [
                'Path-%d||F%d/%d' % (path_len, mf_idx + 1, el_size)
            ]
            cur_comb = [(0, mf_idx)]  # indicating the focus entity
            for conf_idx in conflict_matrix[mf_idx]:
                visit_arr[conf_idx] += 1
            search_start(path_len=path_len,
                         gather_linkings=gather_linkings,
                         sparql_driver=sparql_driver,
                         cur_idx=-1,
                         cur_comb=cur_comb,
                         conflict_matrix=conflict_matrix,
                         visit_arr=visit_arr,
                         ground_comb_list=ground_comb_list,
                         state_marker=state_marker,
                         vb=vb)
    LogInfo.end_track()
    return ground_comb_list
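
A toy illustration of the conflict matrix built in Step 1, independent of the real LinkData / is_overlap types: linkings are reduced to (category, start, end) mention spans, and two linkings conflict when their spans overlap or when both are type linkings (the 180205 restriction above).

toy_linkings = [('Entity', 0, 2), ('Type', 1, 3), ('Type', 5, 6)]

def spans_overlap(a, b):
    return a[1] > b[0] and b[1] > a[0]  # half-open token spans [start, end)

conflict_matrix = []
for i, li in enumerate(toy_linkings):
    row = [j for j, lj in enumerate(toy_linkings)
           if spans_overlap(li[1:], lj[1:]) or (li[0] == 'Type' and lj[0] == 'Type')]
    conflict_matrix.append(row)
print(conflict_matrix)  # -> [[0, 1], [0, 1, 2], [1, 2]]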
Example 27
    def __init__(self, schema_dataset, compq_mt_model, q_evals_dict, task_name,
                 mode, shuffle, batch_size):
        # q_evals_dict: <q, [sc]>
        assert isinstance(schema_dataset, SchemaDatasetACL18)
        assert isinstance(compq_mt_model, CompqMultiTaskModel)

        DataLoader.__init__(self,
                            mode=mode,
                            batch_size=batch_size,
                            dynamic=False,
                            shuffle=shuffle)
        """ dynamic=False: once we changed the train/eval data, we just change a new data loader """

        self.schema_dataset = schema_dataset
        self.total_questions = len(q_evals_dict)
        self.eval_sc_tup_list = []  # [(q_idx, sc)], used for tracing the original feed data
        for q_idx, eval_list in q_evals_dict.items():
            for sc in eval_list:
                self.eval_sc_tup_list.append((q_idx, sc))
        total_size = len(self.eval_sc_tup_list)
        self.eval_sc_tup_list.sort(
            key=lambda _tup: _tup[0])  # just sort by q_idx

        wd_emb_util = schema_dataset.wd_emb_util
        input_tensor_names = getattr(compq_mt_model,
                                     '%s_eval_input_names' % task_name)
        global_input_dict = {k: [] for k in input_tensor_names}

        for q_idx, sc in self.eval_sc_tup_list:
            v_len = schema_dataset.v_len_vec[q_idx]
            v_input = schema_dataset.v_input_mat[q_idx]
            clause_input = schema_dataset.clause_input_mat[q_idx]
            sc_np_dict = sc.create_input_np_dict(
                qw_max_len=schema_dataset.q_max_len,
                sc_max_len=schema_dataset.sc_max_len,
                p_max_len=schema_dataset.path_max_len,
                pw_max_len=schema_dataset.pword_max_len,
                type_dist_len=schema_dataset.type_dist_len,
                q_len=v_len,
                word_idx_dict=wd_emb_util.load_word_indices(),
                mid_idx_dict=wd_emb_util.load_mid_indices())
            for k, v in sc_np_dict.items():
                if k in global_input_dict:
                    global_input_dict[k].append(v)
            global_input_dict['v_len'].append(v_len)
            global_input_dict['v_input'].append(v_input)
            global_input_dict['clause_input'].append(clause_input)
        LogInfo.logs('%d schemas saved in dataloader [%s-%s].', total_size,
                     task_name, mode)

        self.np_data_list = []
        for k in input_tensor_names:
            dtype = compq_mt_model.input_tensor_dict[k].dtype
            np_type = 'float32' if dtype == tf.float32 else 'int32'
            np_arr = np.array(global_input_dict[k], dtype=np_type)
            self.np_data_list.append(np_arr)

        self.update_statistics()
Example 28
 def build_update(self, merged_grad_tf_list):
     update_list = []
     for var, merged_grad_tf in zip(tf.global_variables(),
                                    merged_grad_tf_list):
         upd = tf.assign(var, var - self.learning_rate * merged_grad_tf)
         update_list.append(upd)
     LogInfo.logs('update_list compiled, len = %d.', len(update_list))
     return update_list
Example 29
 def save_size(self):
     with open(self.size_fp, 'w') as bw:
         bw.write('words\t%d\n' % self.word_size)
         bw.write('mids\t%d\n' % self.mid_size)
         bw.write('e_feat_len\t%d\n' % self.e_feat_len)
         bw.write('extra_len\t%d\n' % self.extra_len)
         bw.write('array_num\t%d\n' % self.array_num)
     LogInfo.logs('Word/Mid/EntityFeatLen/ExtraLen/ArrNum size saved.')
Example 30
 def prepare_data(self):
     if self.dynamic or self.np_data_list is None or self.indices_list is None:
         # create a brand new data for the model
         self.renew_data_list()
     elif self.shuffle:
         # just change the order
         self.update_statistics()
     LogInfo.logs('data size = %d, num of batch = %d.', len(self), self.n_batch)