def evaluate(self, eval_dl, batch_idx):
    local_data, local_size = eval_dl.get_batch(batch_idx=batch_idx)
    active_input_names = set(self.active_input_tensor_dict.keys()) & set(local_data.keys())
    fd = {self.active_input_tensor_dict[key]: local_data[key] for key in active_input_names}
    local_output_list = self.sess.run(self.output_tensor_list, feed_dict=fd,
                                      options=self.run_options, run_metadata=self.run_metadata)
    # Collect all inputs / outputs of this batch, saving into eval_detail_dict (split by each data point)
    local_eval_detail_dict = fd
    local_eval_detail_dict.update({k: v for k, v in zip(self.output_tensor_names, local_output_list)})
    for tensor_name, batch_val in local_eval_detail_dict.items():
        for val in batch_val:
            self.eval_detail_dict.setdefault(tensor_name, []).append(val)
    self.scan_data += local_size
    self.scan_batch += 1
    self.tb_point += 1
    if self.scan_batch % self.ob_batch_num == 0:
        LogInfo.logs('[%3s][eval-%s-B%d/%d] scanned = %d/%d',
                     self.task_name, eval_dl.mode, self.scan_batch, eval_dl.n_batch,
                     self.scan_data, len(eval_dl))
def save(self, directory):
    import os
    if not os.path.isdir(directory):
        os.mkdir(directory)
    fp = directory + "/best_model"
    self.saver.save(self.sess, fp)
    LogInfo.logs("Model saved into %s.", fp)
def _get_translation_weights(self, setting):
    # weight_fp = "/home/kangqi/workspace/PythonProject/runnings/tabel/translation" \
    #             "/try_0/weights.Mlinear-cosine_D100_lr0.0050_reg0.0000"
    if setting == "common":
        weight_fp = "/home/xusheng/TabelProject/data/weight/weights.from_common_words"
    elif setting == "500":
        weight_fp = "/home/kangqi/workspace/PythonProject/runnings/tabel/translation/try_5" \
                    "/Mlinear-cosine_D100_keep0500_lr0.0050_reg0.0000/weights"
    elif setting == "1000":
        weight_fp = "/home/kangqi/workspace/PythonProject/runnings/tabel/translation/try_5" \
                    "/Mlinear-cosine_D100_keep1000_lr0.0050_reg0.0000/weights"
    elif setting == "1500":
        weight_fp = "/home/kangqi/workspace/PythonProject/runnings/tabel/translation/try_5" \
                    "/Mlinear-cosine_D100_keep1500_lr0.0050_reg0.0000/weights"
    elif setting == "2000":
        weight_fp = "/home/kangqi/workspace/PythonProject/runnings/tabel/translation/try_5" \
                    "/Mlinear-cosine_D100_keep2000_lr0.0050_reg0.0000/weights"
    elif setting == "2500":
        weight_fp = "/home/kangqi/workspace/PythonProject/runnings/tabel/translation/try_5" \
                    "/Mlinear-cosine_D100_keep2500_lr0.0050_reg0.0000/weights"
    elif setting == "3000":
        weight_fp = "/home/kangqi/workspace/PythonProject/runnings/tabel/translation/try_5" \
                    "/Mlinear-cosine_D100_keep3000_lr0.0050_reg0.0000/weights"
    with open(weight_fp, 'rb') as fin:
        W_value = np.load(fin)
        b_value = np.load(fin)
    self.sess.run([
        self.weights['w_trans'].assign(W_value),
        self.weights['b_trans'].assign(b_value)
    ])
    LogInfo.logs("[model] pre-trained translation loaded from %s.", weight_fp)
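# Note on the weight file format (inferred from the loading code above, not verified against the
# writer side): W and b appear to have been written into a single file by two consecutive np.save
# calls on the same handle, so the two np.load calls read them back in the same order, e.g.
#   with open(weight_fp, 'wb') as fout:
#       np.save(fout, W_value)
#       np.save(fout, b_value)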
def forward(self, path_wd_hidden, path_kb_hidden, path_len, focus_wd_hidden, focus_kb_hidden, reuse=None):
    LogInfo.begin_track('SkBiRNNModule forward: ')
    with tf.variable_scope('SkBiRNNModule', reuse=reuse):
        if self.data_source == 'kb':
            use_path_hidden = path_kb_hidden
            use_focus_hidden = focus_kb_hidden
        elif self.data_source == 'word':
            use_path_hidden = path_wd_hidden
            use_focus_hidden = focus_wd_hidden
        else:
            use_path_hidden = tf.concat([path_kb_hidden, path_wd_hidden], axis=-1,
                                        name='use_path_hidden')
            # (batch, path_max_len, dim_item_hidden + dim_kb_hidden)
            use_focus_hidden = tf.concat([focus_kb_hidden, focus_wd_hidden], axis=-1,
                                         name='use_focus_hidden')
            # (batch, dim_item_hidden + dim_kb_hidden)
        use_path_emb_input = tf.concat(
            [tf.expand_dims(use_focus_hidden, axis=1), use_path_hidden],
            axis=1, name='use_path_emb_input')          # (batch, path_max_len + 1, dim_use)
        show_tensor(use_path_emb_input)
        use_path_len = path_len + 1
        stamps = self.path_max_len + 1
        birnn_inputs = tf.unstack(use_path_emb_input, num=stamps, axis=1, name='birnn_inputs')
        encoder_output = self.rnn_encoder.encode(inputs=birnn_inputs,
                                                 sequence_length=use_path_len,
                                                 reuse=reuse)
        rnn_outputs = tf.stack(encoder_output.outputs, axis=1,
                               name='rnn_outputs')      # (batch, path_max_len + 1, dim_sk_hidden)
        # Since we are in the BiRNN mode, we simply take the average over time steps.
        sum_sk_hidden = tf.reduce_sum(rnn_outputs, axis=1,
                                      name='sum_sk_hidden')         # (batch, dim_sk_hidden)
        use_path_len_mat = tf.cast(tf.expand_dims(use_path_len, axis=1), dtype=tf.float32,
                                   name='use_path_len_mat')         # (batch, 1) as float32
        sk_hidden = tf.div(sum_sk_hidden, use_path_len_mat,
                           name='sk_hidden')            # (batch, dim_sk_hidden)
    LogInfo.end_track()
    return sk_hidden
def evaluate_all(self, eval_dl, detail_fp=None, result_fp=None):
    self.reset_eval_info()
    for batch_idx in range(eval_dl.n_batch):
        self.evaluate(eval_dl=eval_dl, batch_idx=batch_idx)
    ret_f1 = self.post_process(eval_dl=eval_dl, detail_fp=detail_fp, result_fp=result_fp)
    LogInfo.logs('[%3s] %s_F1 = %.6f', self.task_name, eval_dl.mode, ret_f1)    # [ rm] train_F1 = xx
    return ret_f1
def start_entity_search(entity_linkings, conflict_matrix, tag_set):
    LogInfo.begin_track('Searching at M/E level ...')
    entity_available_combs = []     # the return value
    el_size = len(entity_linkings)
    gl_size = len(conflict_matrix)
    for mf_idx, main_focus in enumerate(entity_linkings):
        gl_pos = main_focus.gl_pos
        visit_arr = [0] * gl_size
        for conf_idx in conflict_matrix[gl_pos]:
            visit_arr[conf_idx] += 1
        # create the initial state of search
        gl_data_indices = [gl_pos]
        tag_elements = []
        mid = main_focus.value
        type_list = get_entity_type(mid)
        for tp_idx, tp in enumerate(type_list):
            state_marker = ['M%d/%d-(t%d/%d)' % (mf_idx + 1, el_size, tp_idx + 1, len(type_list))]
            tag_elements.append('M:%s' % tp)
            entity_search_dfs(entity_linkings=entity_linkings,
                              conflict_matrix=conflict_matrix,
                              tag_set=tag_set,
                              cur_el_idx=-1,
                              gl_data_indices=gl_data_indices,
                              tag_elements=tag_elements,
                              visit_arr=visit_arr,
                              entity_available_combs=entity_available_combs,
                              state_marker=state_marker)
            del tag_elements[-1]
    LogInfo.end_track()
    return entity_available_combs
def init_emb(name, actual_dict, dim_emb, full_dict=None, full_mat=None):
    """
    Given the actual entries and the full embedding info, construct the actual initial embedding matrix.
    :param name: word/entity/predicate
    :param actual_dict: the dict storing actual entries <item, idx>
    :param dim_emb: embedding dimension
    :param full_dict: the full dict of entries <item, idx>
    :param full_mat: the full embedding matrix in numpy format
    :return: the actual initial embedding matrix in numpy format
    """
    if full_mat is not None:
        assert dim_emb == full_mat.shape[1]
    actual_size = len(actual_dict)
    ret_emb_matrix = np.random.uniform(
        low=-0.1, high=0.1, size=(actual_size, dim_emb)).astype('float32')      # [-0.1, 0.1] as random initialize.
    if full_dict is None or full_mat is None:
        LogInfo.logs('%s: build %s actual init embedding matrix by random.', name, ret_emb_matrix.shape)
        return ret_emb_matrix       # all random initialize
    for item, target_row_idx in actual_dict.items():
        if item in full_dict:       # full_mat is None: we don't use TransE as initial embedding
            original_row_idx = full_dict[item]
            ret_emb_matrix[target_row_idx] = full_mat[original_row_idx]
    LogInfo.logs('%s: build %s actual init embedding matrix from full matrix with shape %s.',
                 name, ret_emb_matrix.shape,
                 full_mat.shape if full_mat is not None else '[None]')
    return ret_emb_matrix
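# A minimal usage sketch for init_emb (the dicts, file name and dimension below are hypothetical,
# not taken from this repo):
#   full_dict = {'president': 0, 'united': 1, 'states': 2}      # full vocabulary <item, idx>
#   full_mat = np.load('word_emb.glove_300.npy')                # hypothetical (3, 300) matrix
#   actual_dict = {'president': 0, 'obama': 1}                  # entries that actually occur
#   emb = init_emb('word', actual_dict, 300, full_dict, full_mat)
#   # row 0 ('president') is copied from full_mat; row 1 ('obama') keeps its random init in [-0.1, 0.1]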
def build_active_voc(self, wd_emb_util, path_domain_dict):
    # LogInfo.begin_track('Showing path_domain samples:')
    # for k, v in path_domain_dict.items()[:50]:
    #     LogInfo.logs('[%s] --> %s', k, v)
    # LogInfo.end_track()
    word_idx_dict = wd_emb_util.load_word_indices()
    path_size = len(self.path_idx_dict)
    self.pw_max_len = 0
    self.pw_voc_length = np.zeros(shape=(path_size,), dtype='int32')
    self.pw_voc_domain = np.zeros(shape=(path_size,), dtype='int32')
    pw_voc_dict = {}        # dict of path word sequences (each word is represented by its word index)
    for path_str, idx in self.path_idx_dict.items():
        if idx <= 2:        # PAD, START, UNK
            pw_idx_seq = []
        else:
            path_cate, mid_str = path_str.split('|')
            mid_seq = mid_str.split('\t')
            pw_idx_seq = []
            for mid in mid_seq:
                p_name = get_item_name(mid)
                if p_name != '':
                    spt = p_name.split(' ')
                    for wd in spt:
                        wd_idx = word_idx_dict.get(wd, 2)       # UNK if needed
                        pw_idx_seq.append(wd_idx)
            # pw_idx_seq = pw_idx_seq[:self.pw_cutoff]          # truncate if exceeding length limit
        self.pw_voc_length[idx] = len(pw_idx_seq)
        domain_type = path_domain_dict.get(path_str, '')
        if domain_type == '':
            domain_type_idx = 0     # PAD
        else:
            domain_type_idx = self.type_idx_dict.get(domain_type, 2)   # UNK
        self.pw_voc_domain[idx] = domain_type_idx
        pw_voc_dict[idx] = pw_idx_seq
    LogInfo.logs('IN_USE: %s pw_voc_domain constructed.', self.pw_voc_domain.shape)
    LogInfo.logs('IN_USE: %s pw_voc_length constructed.', self.pw_voc_length.shape)
    for pos in (25, 50, 75, 90, 95, 99, 99.9, 100):
        LogInfo.logs('Percentile = %.1f%%: %.6f', pos, np.percentile(self.pw_voc_length, pos))
    self.pw_max_len = np.max(self.pw_voc_length)
    LogInfo.logs('IN_USE: pw_max_len = %d.', self.pw_max_len)
    # for path_str, idx in self.path_idx_dict.items():
    #     local_len = self.pw_voc_length[idx]
    #     if local_len > 7:
    #         LogInfo.logs('Length = %d [%s] --> %s', local_len, path_str, pw_voc_dict[idx])
    assert len(pw_voc_dict) == path_size        # ensure no paths sharing the same index
    self.pw_voc_inputs = np.zeros(shape=(path_size, self.pw_max_len), dtype='int32')
    for idx, pw_idx_seq in pw_voc_dict.items():
        local_len = len(pw_idx_seq)
        self.pw_voc_inputs[idx, :local_len] = pw_idx_seq
    LogInfo.logs('IN_USE: %s pw_voc_inputs constructed.', self.pw_voc_inputs.shape)
def __init__(self, wd_emb, dim_emb,
             emb_dir='data/compQA/word_emb_in_use'):
             # parser_ip='202.120.38.146',
             # parser_port=9601):        # BH: 9601; DS: 8601
    self.word_dict_fp = '%s/word_emb.indices' % emb_dir
    self.word_emb_mat_fp = '%s/word_emb.%s_%d.npy' % (emb_dir, wd_emb, dim_emb)
    self.dim_emb = dim_emb
    self.word_idx_dict = None
    self.word_emb_matrix = None
    self.n_words = None
    self.mid_dict_fp = '%s/mid_emb.indices' % emb_dir
    self.mid_emb_mat_fp = '%s/mid_emb.%s_%d.npy' % (emb_dir, wd_emb, dim_emb)
    self.mid_idx_dict = None
    self.mid_emb_matrix = None
    self.n_mids = None
    self.load_word_indices()
    self.load_mid_indices()
    self.dep_name_dict = {}
    with open(emb_dir + '/dep_names.txt', 'r') as br:
        for line in br.readlines():
            dep, name = line.strip().split('\t')
            self.dep_name_dict[dep] = name
    LogInfo.logs('%d dependency name loaded.', len(self.dep_name_dict))
def __init__(self, dataset, mode, q_max_len, sc_max_len, path_max_len, item_max_len,
             batch_size, sampling_config, dynamic=True, shuffle=True, verbose=0):
    super(QScPairDataLoader, self).__init__(batch_size=batch_size, mode=mode,
                                            dynamic=dynamic, shuffle=shuffle)
    self.dataset = dataset
    self.verbose = verbose
    self.sampling_config = sampling_config
    sample_func_name = self.sampling_config['name']
    assert sample_func_name in ['generate_pairs_by_gold_f1', 'generate_pairs_by_runtime_score']
    LogInfo.logs('Negative sampling function: %s', sample_func_name)
    self.neg_sample_func = getattr(self, sample_func_name)
    del self.sampling_config['name']
    self.q_max_len = q_max_len
    self.sc_max_len = sc_max_len
    self.path_max_len = path_max_len
    self.item_max_len = item_max_len
    self.np_data_list = None
def build_path_repr__single(self, pw_emb, pw_len, path_emb, pseq_emb, pseq_len, rnn_encoder):
    """
    :param pw_emb:      (ds, path_max_size, pw_max_len, dim_emb)
    :param pw_len:      (ds, path_max_size)
    :param path_emb:    (ds, path_max_size, dim_emb)
    :param pseq_emb:    (ds, path_max_size, pseq_max_len, dim_emb)
    :param pseq_len:    (ds, path_max_size)
    :param rnn_encoder:
    """
    LogInfo.logs('build_path_repr: path_usage = [%s].', self.path_usage)
    assert len(self.path_usage) == 2
    pw_repr = self.build_path_repr__pw_side(pw_emb=pw_emb, pw_len=pw_len,
                                            rnn_encoder=rnn_encoder,
                                            pw_usage=self.path_usage[0])
    pseq_repr = self.build_path_repr__pseq_side(path_emb=path_emb, pseq_emb=pseq_emb,
                                                pseq_len=pseq_len, rnn_encoder=rnn_encoder,
                                                pseq_usage=self.path_usage[1])
    if pw_repr is None:
        assert pseq_repr is not None
        final_repr = pseq_repr
    elif pseq_repr is None:
        final_repr = pw_repr
    else:       # summation
        final_repr = pw_repr + pseq_repr
    return final_repr       # (ds, path_max_size, dim_emb or dim_hidden)
def load_schema_by_kqnew_protocol(schema_fp, gather_linkings, sc_len_dist, path_len_dist,
                                  sc_max_len, schema_level):
    """
    Read the schema files generated by KQ, using the schema in kq_schema.py.
    We read raw paths from json files, and convert them into path_list on-the-fly.
    Used after 12/05/2017.
    schema_level: 0/1/2/3 (STRICT/ELEGANT/COHERENT/GENERAL)
    """
    LogInfo.logs('Schema level: %s', schema_level)
    schema_level = schema_level_dict[schema_level]
    super_type_dict = load_super_type_dict()
    candidate_list = []
    path_list_str_set = set([])
    with codecs.open(schema_fp, 'r', 'utf-8') as br:
        lines = br.readlines()
        for ori_idx, line in enumerate(lines):
            sc = CompqSchema.read_schema_from_json(json_line=line, gather_linkings=gather_linkings)
            sc.ori_idx = ori_idx + 1
            sc.construct_path_list()        # create the path_list on-the-fly
            path_list_str = sc.disp()
            """
            From the perspective of candidate searching in eff_candgen:
            since we treat main path and constraint path in different direction,
            there's no so-called duplicate schema at all.
            171226: Except for duplicate entities in EL results.
            """
            path_list_str_set.add(path_list_str)
            sc_len_dist.append(len(sc.path_list))
            for path in sc.path_list:
                path_len_dist.append(len(path))
            if len(sc.path_list) <= sc_max_len and schema_classification(sc, super_type_dict) <= schema_level:
                candidate_list.append(sc)
    return candidate_list, path_list_str_set, len(lines)
def create_new_emb_value(full_indices_fp, full_matrix_fp, target_indices_fp, out_target_matrix_fp):
    with open(full_indices_fp, 'r') as br:
        full_word_dict = cPickle.load(br)
        LogInfo.logs('Full size: %d', len(full_word_dict))
    with open(target_indices_fp, 'r') as br:
        target_word_dict = cPickle.load(br)
        LogInfo.logs('Target size: %d', len(target_word_dict))
    full_emb_mat = np.load(full_matrix_fp)
    LogInfo.logs('Full matrix: %s', full_emb_mat.shape)
    full_size, dim_emb = full_emb_mat.shape
    target_size = len(target_word_dict)
    target_emb_mat = np.random.uniform(low=-0.1, high=0.1,
                                       size=(target_size, dim_emb)).astype('float32')
    for wd in target_word_dict:
        if wd in ('<START>', '<PAD>', '<UNK>'):
            continue        # leave them alone
        target_idx = target_word_dict[wd]
        if wd in full_word_dict:
            full_idx = full_word_dict[wd]
            target_emb_mat[target_idx] = full_emb_mat[full_idx]
    LogInfo.logs('Ready for saving ...')
    np.save(out_target_matrix_fp, target_emb_mat)
    LogInfo.logs('Saving Done.')
def show_overall_detail(sc):
    rich_feats_concat = sc.run_info['rich_feats_concat'].tolist()
    for category, gl_data, pred_seq in sc.raw_paths:
        LogInfo.logs('%s: link = [(#-%d) %s %s], pred_seq = %s',
                     category, gl_data.gl_pos, gl_data.comp, gl_data.value, pred_seq)
    show_str = ' '.join(['%6.3f' % x for x in rich_feats_concat])
    LogInfo.logs('rich_feats_concat = [%s]', show_str)
def retrieve_schema(data_dir, q_idx, line_no):
    if line_no == -1:
        return
    div = q_idx / 100
    sub_dir = '%d-%d' % (div * 100, div * 100 + 99)
    sc_fp = '%s/%s/%d_schema' % (data_dir, sub_dir, q_idx)
    link_fp = '%s/%s/%d_links' % (data_dir, sub_dir, q_idx)
    gather_linkings = []
    with codecs.open(link_fp, 'r', 'utf-8') as br:
        for gl_line in br.readlines():
            tup_list = json.loads(gl_line.strip())
            ld_dict = {k: v for k, v in tup_list}
            gather_linkings.append(LinkData(**ld_dict))
    json_line = linecache.getline(sc_fp, lineno=line_no).strip()
    sc = CompqSchema.read_schema_from_json(q_idx=q_idx, json_line=json_line,
                                           gather_linkings=gather_linkings,
                                           use_ans_type_dist=False)
    LogInfo.logs('Answer size = %d', sc.ans_size)
    LogInfo.logs('P / R / F1 = %.3f / %.3f / %.3f', sc.p, sc.r, sc.f1)
    for path_idx, raw_path in enumerate(sc.raw_paths):
        category, gl_data, pred_seq = raw_path
        LogInfo.logs('Path-%d: [%s] [%s] [%s %s (%s)]',
                     path_idx + 1, category, gl_data.mention,
                     gl_data.comp, gl_data.value, gl_data.name)
        LogInfo.logs('        %s', pred_seq)
    LogInfo.logs('SPARQL: %s', sc.build_sparql())
def load_data_and_reformulate(pydump_fp):
    np_list = load_numpy_input_with_names(pydump_fp)
    # ==== 140419: The np list contains the following items: ==== #
    q_tensor3, el_tensor3, path_tensor4, \
        score_tensor3, mask_matrix, \
        ord_x_matrix, ord_pred_tensor3, ord_op_tensor3, \
        ord_obj_tensor3, ord_mask_matrix = np_list
    # =========================================================== #
    size = q_tensor3.shape[0]
    LogInfo.logs('QA size = %d.', size)
    gold_matrix = score_tensor3[:, :, 2]        # just use F1
    best_matrix = np.zeros(shape=gold_matrix.shape, dtype='float32')
    best_matrix[:, 0] = 1.0     # we've ranked all schemas, so the first candidate must be the best
    opt_np_list = []
    opt_np_list += [q_tensor3, path_tensor4, el_tensor3,
                    gold_matrix, best_matrix, mask_matrix]          # corresponding to basic_tf_list
    opt_np_list += [ord_x_matrix, ord_pred_tensor3, ord_op_tensor3,
                    ord_obj_tensor3, ord_mask_matrix]               # corresponding to ordinal_tf_list
    return opt_np_list
def load_raw_names():
    tidx_tp_dict = {}       # <t_idx, type>
    tidx_name_dict = {}     # <t_idx, name>
    for fp, _dict in [('type_names.tsv', tidx_name_dict), ('type_dict.tsv', tidx_tp_dict)]:
        with open(type_res_dir + '/' + fp, 'r') as br:
            for line in br.readlines():
                spt = line.strip().split('\t')
                if len(spt) == 2:
                    idx, item = spt
                    _dict[int(idx)] = item
                else:
                    _dict[int(spt[0])] = ''
        LogInfo.logs('%d items loaded from %s.', len(_dict), fp)
    assert len(tidx_tp_dict) == len(tidx_name_dict)
    size = len(tidx_name_dict)
    type_name_dict = {}     # <type, real name> (type.object.name)
    raw_name_list = []
    for idx in range(1, size + 1):
        tp = tidx_tp_dict[idx]
        name = tidx_name_dict[idx]
        name_from_id = tp[tp.rfind('.') + 1:]
        type_name_dict[tp] = name if name != '' else name_from_id
        if name != '':
            raw_name_list.append((tp, name))
        raw_name_list.append((tp, name_from_id))
    LogInfo.logs('%d <type, raw names> loaded.', len(raw_name_list))
    return type_name_dict, raw_name_list
def save_size(self):
    with open(self.size_fp, 'w') as bw:
        bw.write('words\t%d\n' % self.w_size)
        bw.write('entities\t%d\n' % self.e_size)
        bw.write('predicates\t%d\n' % self.p_size)
        bw.write('array_num\t%d\n' % self.array_num)
    LogInfo.logs('W/E/P/ArrNum size saved.')
def collect_data(old_data_fp):
    q_links_dict = {}
    q_schema_dict = {}
    for q_idx in range(q_size):
        if q_idx % 100 == 0:
            LogInfo.logs('Current: %d / %d', q_idx, q_size)
        # if q_idx >= 100:
        #     break
        div = q_idx / 100
        sub_dir = '%d-%d' % (div * 100, div * 100 + 99)
        schema_fp = '%s/%s/%d_schema' % (old_data_fp, sub_dir, q_idx)
        link_fp = '%s/%s/%d_links' % (old_data_fp, sub_dir, q_idx)
        gather_linkings = []
        with codecs.open(link_fp, 'r', 'utf-8') as br:
            for line in br.readlines():
                tup_list = json.loads(line.strip())
                ld_dict = {k: v for k, v in tup_list}
                gather_linkings.append(LinkData(**ld_dict))
        strict_sc_list = []
        with codecs.open(schema_fp, 'r', 'utf-8') as br:
            lines = br.readlines()
            for ori_idx, line in enumerate(lines):
                sc = CompqSchema.read_schema_from_json(
                    q_idx, json_line=line, gather_linkings=gather_linkings,
                    use_ans_type_dist=False, placeholder_policy='ActiveOnly')
                sc.ori_idx = ori_idx
                if schema_classification(sc) == 0:      # only pick strict schemas
                    strict_sc_list.append(sc)
        q_links_dict[q_idx] = gather_linkings
        q_schema_dict[q_idx] = strict_sc_list
    return q_links_dict, q_schema_dict
def build_improved(self, score_tf, label_tf, mask_tf):
    grad_tf_list = self.get_gradient_tf_list(score_tf)
    final_loss_tf, sum_lambda_tf = self.get_lambda_tf(score_tf, label_tf, mask_tf)
    update_list = self.get_update_list(grad_tf_list, sum_lambda_tf)
    LogInfo.logs('update_list (lambda-based) built.')
    return final_loss_tf, update_list
def load_necessary_entity_predicate_dict(self):
    """
    Scan FB E/T/P names, just keeping <mid, index> pairs which occur in the candidate pool
    :return: <mid, index> dictionary for both entities (including types) and predicates
    """
    e_set = set([])
    t_set = set([])
    p_set = set([])
    # the sets maintaining all the entries observed in the current candidates
    for cand_list in self.q_cand_dict.values():
        for cand in cand_list:
            cand.update_item_set(e_set=e_set, t_set=t_set, p_set=p_set)
    LogInfo.logs('%d E + %d T + %d P collected.', len(e_set), len(t_set), len(p_set))
    self.fb_helper.load_names(e_set=e_set, t_set=t_set, p_set=p_set)
    e_dict = {'': 0}        # give index 0 to represent the empty entity (for padding)
    for item_set in (e_set, t_set):
        for item in item_set:
            e_dict[item] = len(e_dict)
    # e_dict = {e: e_idx for e_idx, e in enumerate(e_set)}
    # e_dict.update({t: t_idx + len(e_dict) for t_idx, t in enumerate(t_set)})
    p_dict = {p: p_idx + 1 for p_idx, p in enumerate(p_set)}
    p_dict[''] = 0          # also give index 0 to represent the empty predicate (for padding)
    # p_dict = {p: p_idx for p_idx, p in enumerate(p_set)}
    return e_dict, p_dict
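# Illustration of the resulting index layout (hypothetical mids, for reading only; the exact row
# order of set iteration is not guaranteed):
#   e_set = {'m.0d05w3'}, t_set = {'film.film'}, p_set = {'film.film.directed_by'}
#   e_dict --> {'': 0, 'm.0d05w3': 1, 'film.film': 2}      # entities and types share one index space
#   p_dict --> {'': 0, 'film.film.directed_by': 1}         # index 0 reserved for padding in both dicts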
def build(self, score_tf, label_tf, mask_tf):
    pred_tf, gold_tf, useful_pair_tf, final_loss_tf = self.get_loss_tf(score_tf, label_tf, mask_tf)
    train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(final_loss_tf)
    LogInfo.logs('train_step (normal) built.')
    return final_loss_tf, train_step
def analyze_output(data_dir, sort_item):
    """ Check the rank distribution in a global perspective """
    rank_matrix = [[], [], [], []]      # focus on 4 tiers: F1 = 1.0, F1 >= 0.5, F1 >= 0.1, F1 > 0
    fp = '%s/lexicon_validate/srt_output.%s.txt' % (data_dir, sort_item)
    with codecs.open(fp, 'r', 'utf-8') as br:
        for line in br.readlines():
            spt = line.strip().split('\t')
            f1 = float(spt[-2])
            rank = int(spt[-1])
            if rank == -1:
                rank = 2111222333
            if f1 == 1.0:
                rank_matrix[0].append(rank)
            if f1 >= 0.5:
                rank_matrix[1].append(rank)
            if f1 >= 0.1:
                rank_matrix[2].append(rank)
            if f1 >= 1e-6:
                rank_matrix[3].append(rank)
    for ths, rank_list in zip((1.0, 0.5, 0.1, 1e-6), rank_matrix):
        LogInfo.begin_track('Show stat for F1 >= %.6f:', ths)
        rank_list = np.array(rank_list)
        case_size = len(rank_list)
        LogInfo.logs('Total cases = %d.', case_size)
        LogInfo.logs('MRR = %.6f', np.mean(1. / rank_list))
        for pos in (50, 60, 70, 80, 90, 95, 99, 99.9, 100):
            LogInfo.logs('Percentile = %.1f%%: %.6f', pos, np.percentile(rank_list, pos))
        LogInfo.end_track()
def __init__(self, base='/home/xianyang/aqqu/aqqu',
             parser_ip='202.120.38.146', parser_port=9601,
             linking_mode='Raw', q_links_dict=None, lukov_linker=None):
    self.base = base
    self.linking_mode = linking_mode
    self.q_links_dict = q_links_dict        # save S-MART results
    self.lukov_linker = lukov_linker
    assert linking_mode in ('Raw', 'S-MART', 'Lukov')
    if linking_mode == 'Lukov':
        assert self.lukov_linker is not None
    """
    Raw: the raw version, won't read anything from S-MART or our Lukov's implementation
    S-MART: read from S-MART result (only available in WebQ)
    Lukov: read from our lukov_ngram linker data
    """
    LogInfo.logs('Initiating parser ... ')
    self.parser = parser.CoreNLPParser('http://%s:%d/parse' % (parser_ip, parser_port))     # just open the parser
    self.is_data_loaded = False
    self.surface_index = None
    self.entity_linker = None
    self.type_linker = None
    self.smart_score_disc = Discretizer(split_list=[2, 3, 8, 50, 2000, 12500, 25000, 40000],
                                        output_mode='list')
    # the split distribution is manually designed by observing S-MART data in both CompQ & WebQ datasets
    self.pop_filter_num = 5
def optimize(self, optm_dl, batch_idx):
    local_data_list, local_indices = optm_dl.get_batch(batch_idx=batch_idx)
    local_size = len(local_indices)
    fd = {input_tf: local_data
          for input_tf, local_data in zip(self.input_tensor_list, local_data_list)}
    _, local_loss, local_extra, summary = self.sess.run(
        [self.optm_step, self.loss, self.extra_data, self.optm_summary],
        feed_dict=fd, options=self.run_options, run_metadata=self.run_metadata)
    local_loss = float(local_loss)
    self.ret_loss = (self.ret_loss * self.scan_data + local_loss * local_size) / (self.scan_data + local_size)
    self.scan_data += local_size
    self.scan_batch += 1
    self.tb_point += 1
    if self.scan_batch % self.ob_batch_num == 0:
        LogInfo.logs('[%3s][optm-%s-B%d/%d] cur_batch_loss = %.6f, avg_loss = %.6f, scanned = %d/%d',
                     self.name, optm_dl.mode, self.scan_batch, optm_dl.n_batch,
                     local_loss, self.ret_loss, self.scan_data, len(optm_dl))
        # """ For batch=1 debug only!! """
        # q_idx, pos_sc, neg_sc, weight = optm_dl.optm_pair_tup_list[batch_idx]
        # LogInfo.logs('    q_idx = %4d, pos_sc: line = %4d, score = %.6f, rm_f1 = %.6f',
        #              q_idx, pos_sc.ori_idx, local_extra[0], pos_sc.rm_f1)
        # LogInfo.logs('    q_idx = %4d, neg_sc: line = %4d, score = %.6f, rm_f1 = %.6f',
        #              q_idx, neg_sc.ori_idx, local_extra[1], neg_sc.rm_f1)
    if self.summary_writer is not None:
        self.summary_writer.add_summary(summary, self.tb_point)
def make_combination(gather_linkings, sparql_driver, vb):
    """
    Given the E/T/Tm linkings, return all the possible combinations of query structures.
    The only restriction: can't use multiple linkings with overlapped mentions.
    ** Used in either WebQ or CompQ, not SimpQ **
    :param gather_linkings: list of named_tuple (detail, category, comparison, display)
    :param sparql_driver: the sparql query engine
    :param vb: verbose
    :return: the dictionary including all necessary information of a schema.
    """
    sz = len(gather_linkings)
    el_size = len(filter(lambda x: x.category == 'Entity', gather_linkings))

    # Step 1: Prepare conflict matrix
    conflict_matrix = []
    for i in range(sz):
        local_conf_list = []
        for j in range(sz):
            if is_overlap(gather_linkings[i].detail, gather_linkings[j].detail):
                local_conf_list.append(j)
            elif gather_linkings[i].category == 'Type' and gather_linkings[j].category == 'Type':
                local_conf_list.append(j)
                """ 180205: We add this restriction for saving time. """
                """ I thought there should be only one type constraint in the schema. """
                """ Don't make the task even more complex. """
        conflict_matrix.append(local_conf_list)

    # Step 2: start combination searching
    LogInfo.begin_track('Starting searching combination (total links = %d, entities = %d):',
                        len(gather_linkings), el_size)
    ground_comb_list = []       # [ (comb, path_len, sparql_query_ret) ]
    for path_len in (1, 2):
        for mf_idx, main_focus in enumerate(gather_linkings):
            if main_focus.category != 'Entity':
                continue
            visit_arr = [0] * sz        # indicating how many conflicts at the particular searching state
            state_marker = ['Path-%d||F%d/%d' % (path_len, mf_idx + 1, el_size)]
            cur_comb = [(0, mf_idx)]    # indicating the focus entity
            for conf_idx in conflict_matrix[mf_idx]:
                visit_arr[conf_idx] += 1
            search_start(path_len=path_len,
                         gather_linkings=gather_linkings,
                         sparql_driver=sparql_driver,
                         cur_idx=-1,
                         cur_comb=cur_comb,
                         conflict_matrix=conflict_matrix,
                         visit_arr=visit_arr,
                         ground_comb_list=ground_comb_list,
                         state_marker=state_marker,
                         vb=vb)
    LogInfo.end_track()
    return ground_comb_list
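# A small illustration of the conflict matrix built above (hypothetical linkings; indices made up):
#   gather_linkings = [E0 "new york" (tokens 2-3), E1 "york" (token 3), T2 "city" (token 5)]
#   conflict_matrix  = [[0, 1], [0, 1], [2]]
# E0 and E1 have overlapping mentions, so they can never be picked into the same combination.
# Assuming is_overlap treats a span as overlapping itself, every linking also conflicts with itself
# (so it cannot be picked twice), and any two Type linkings conflict by the 180205 rule, so at most
# one type constraint enters a combination.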
def __init__(self, schema_dataset, compq_mt_model, q_evals_dict,
             task_name, mode, shuffle, batch_size):
    # q_evals_dict: <q, [sc]>
    assert isinstance(schema_dataset, SchemaDatasetACL18)
    assert isinstance(compq_mt_model, CompqMultiTaskModel)
    DataLoader.__init__(self, mode=mode, batch_size=batch_size, dynamic=False, shuffle=shuffle)
    """ dynamic=False: once we change the train/eval data, we just switch to a new data loader """
    self.schema_dataset = schema_dataset
    self.total_questions = len(q_evals_dict)
    self.eval_sc_tup_list = []      # [(q_idx, sc)], used for tracing the original feed data
    for q_idx, eval_list in q_evals_dict.items():
        for sc in eval_list:
            self.eval_sc_tup_list.append((q_idx, sc))
    total_size = len(self.eval_sc_tup_list)
    self.eval_sc_tup_list.sort(key=lambda _tup: _tup[0])        # just sort by q_idx
    wd_emb_util = schema_dataset.wd_emb_util
    input_tensor_names = getattr(compq_mt_model, '%s_eval_input_names' % task_name)
    global_input_dict = {k: [] for k in input_tensor_names}
    for q_idx, sc in self.eval_sc_tup_list:
        v_len = schema_dataset.v_len_vec[q_idx]
        v_input = schema_dataset.v_input_mat[q_idx]
        clause_input = schema_dataset.clause_input_mat[q_idx]
        sc_np_dict = sc.create_input_np_dict(
            qw_max_len=schema_dataset.q_max_len,
            sc_max_len=schema_dataset.sc_max_len,
            p_max_len=schema_dataset.path_max_len,
            pw_max_len=schema_dataset.pword_max_len,
            type_dist_len=schema_dataset.type_dist_len,
            q_len=v_len,
            word_idx_dict=wd_emb_util.load_word_indices(),
            mid_idx_dict=wd_emb_util.load_mid_indices())
        for k, v in sc_np_dict.items():
            if k in global_input_dict:
                global_input_dict[k].append(v)
        global_input_dict['v_len'].append(v_len)
        global_input_dict['v_input'].append(v_input)
        global_input_dict['clause_input'].append(clause_input)
    LogInfo.logs('%d schemas saved in dataloader [%s-%s].', total_size, task_name, mode)
    self.np_data_list = []
    for k in input_tensor_names:
        dtype = compq_mt_model.input_tensor_dict[k].dtype
        np_type = 'float32' if dtype == tf.float32 else 'int32'
        np_arr = np.array(global_input_dict[k], dtype=np_type)
        self.np_data_list.append(np_arr)
    self.update_statistics()
def build_update(self, merged_grad_tf_list):
    update_list = []
    for var, merged_grad_tf in zip(tf.global_variables(), merged_grad_tf_list):
        upd = tf.assign(var, var - self.learning_rate * merged_grad_tf)
        update_list.append(upd)
    LogInfo.logs('update_list compiled, len = %d.', len(update_list))
    return update_list
def save_size(self):
    with open(self.size_fp, 'w') as bw:
        bw.write('words\t%d\n' % self.word_size)
        bw.write('mids\t%d\n' % self.mid_size)
        bw.write('e_feat_len\t%d\n' % self.e_feat_len)
        bw.write('extra_len\t%d\n' % self.extra_len)
        bw.write('array_num\t%d\n' % self.array_num)
    LogInfo.logs('Word/Mid/EntityFeatLen/ExtraLen/ArrNum size saved.')
def prepare_data(self):
    if self.dynamic or self.np_data_list is None or self.indices_list is None:
        # create a brand new data for the model
        self.renew_data_list()
    elif self.shuffle:
        # just change the order
        self.update_statistics()
    LogInfo.logs('data size = %d, num of batch = %d.', len(self), self.n_batch)