def evaluate_all(self, eval_dl, detail_fp=None, result_fp=None):
    self.reset_eval_info()
    for batch_idx in range(eval_dl.n_batch):
        self.evaluate(eval_dl=eval_dl, batch_idx=batch_idx)
    ret_f1 = self.post_process(eval_dl=eval_dl, detail_fp=detail_fp, result_fp=result_fp)
    LogInfo.logs('[%3s] %s_F1 = %.6f', self.task_name, eval_dl.mode, ret_f1)    # [ rm] train_F1 = xx
    return ret_f1
def type_filtering(el_result, tl_result, sparql_driver, is_type_extend=True, vb=0):
    if vb >= 1:
        LogInfo.begin_track('Type Filtering:')
    relevant_preds = set([])
    for el in el_result:
        mid = el.entity.id
        local_relevant_preds = collect_relevant_predicate(mid, sparql_driver)
        relevant_preds |= local_relevant_preds
    if vb >= 1:
        LogInfo.logs('%d relevant predicates collected.', len(relevant_preds))
    topical_consistent_types = prepare_topical_consistent_types(
        relevant_pred_set=relevant_preds, is_type_extended=is_type_extend, vb=vb)
    filt_tl_result = filter(lambda tl: tl.entity.id in topical_consistent_types, tl_result)
    LogInfo.logs('Type Filter: %d / %d types are kept.', len(filt_tl_result), len(tl_result))
    if vb >= 1:
        LogInfo.end_track()
    return filt_tl_result
def main():
    qa_list = load_webq()
    yih_ret_fp = 'codalab/WebQ/acl2015-msr-stagg/test_predict.txt'
    yih_ret_dict = {}
    with codecs.open(yih_ret_fp, 'r', 'utf-8') as br:
        for line in br.readlines():
            k, v = line.strip().split('\t')
            yih_ret_dict[k] = v
    LogInfo.logs('Yih result collected.')
    exp_tup_list = [
        ('180514_strict/all__full__180508_K03_Fhalf__depSimulate/'
         'NFix-20__wUpd_RH_qwOnly_compact__b32__fbpFalse', '180508_K03_Fhalf', 10),
        ('180516_strict/all__full__180508_K03_Fhalf__Lemmatize/'
         'NFix-20__wUpd_RH_qwOnly_compact__b32__fbpFalse', '180508_K03_Fhalf', 15),
        ('180516_strict/all__full__180508_K03_Fhalf__Lemmatize/'
         'NFix-20__wUpd_BH_qwOnly_compact__b32__fbpFalse', '180508_K03_Fhalf', 12)
    ]
    for exp_suf, data_suf, best_epoch in exp_tup_list:
        exp_dir = 'runnings/WebQ/' + exp_suf
        data_dir = 'runnings/candgen_WebQ/' + data_suf
        LogInfo.begin_track('Dealing with [%s], epoch = %03d:', exp_suf, best_epoch)
        work(exp_dir=exp_dir, data_dir=data_dir, best_epoch=best_epoch,
             qa_list=qa_list, yih_ret_dict=yih_ret_dict)
        LogInfo.end_track()
def show_overall_detail(sc):
    rich_feats_concat = sc.run_info['rich_feats_concat'].tolist()
    for category, gl_data, pred_seq in sc.raw_paths:
        LogInfo.logs('%s: link = [(#-%d) %s %s], pred_seq = %s',
                     category, gl_data.gl_pos, gl_data.comp, gl_data.value, pred_seq)
    show_str = ' '.join(['%6.3f' % x for x in rich_feats_concat])
    LogInfo.logs('rich_feats_concat = [%s]', show_str)
def __init__(self, base='/home/xianyang/aqqu/aqqu',
             parser_ip='202.120.38.146', parser_port=9601,
             linking_mode='Raw', q_links_dict=None, lukov_linker=None):
    self.base = base
    self.linking_mode = linking_mode
    self.q_links_dict = q_links_dict        # save S-MART results
    self.lukov_linker = lukov_linker
    assert linking_mode in ('Raw', 'S-MART', 'Lukov')
    if linking_mode == 'Lukov':
        assert self.lukov_linker is not None
    """
    Raw: the raw version, won't read anything from S-MART or our Lukov's implementation
    S-MART: read from S-MART result (only available in WebQ)
    Lukov: read from our lukov_ngram linker data
    """
    LogInfo.logs('Initiating parser ... ')
    self.parser = parser.CoreNLPParser('http://%s:%d/parse' % (parser_ip, parser_port))     # just open the parser
    self.is_data_loaded = False
    self.surface_index = None
    self.entity_linker = None
    self.type_linker = None
    self.smart_score_disc = Discretizer(split_list=[2, 3, 8, 50, 2000, 12500, 25000, 40000],
                                        output_mode='list')
    # the split distribution is manually designed by observing S-MART data in both CompQ & WebQ datasets
    self.pop_filter_num = 5
def save_size(self):
    with open(self.size_fp, 'w') as bw:
        bw.write('words\t%d\n' % self.w_size)
        bw.write('entities\t%d\n' % self.e_size)
        bw.write('predicates\t%d\n' % self.p_size)
        bw.write('array_num\t%d\n' % self.array_num)
    LogInfo.logs('W/E/P/ArrNum size saved.')
def optimize(self, optm_dl, batch_idx):
    local_data_list, local_indices = optm_dl.get_batch(batch_idx=batch_idx)
    local_size = len(local_indices)
    fd = {input_tf: local_data
          for input_tf, local_data in zip(self.input_tensor_list, local_data_list)}
    _, local_loss, local_extra, summary = self.sess.run(
        [self.optm_step, self.loss, self.extra_data, self.optm_summary],
        feed_dict=fd, options=self.run_options, run_metadata=self.run_metadata)
    local_loss = float(local_loss)
    self.ret_loss = (self.ret_loss * self.scan_data + local_loss * local_size) / (self.scan_data + local_size)
    self.scan_data += local_size
    self.scan_batch += 1
    self.tb_point += 1
    if self.scan_batch % self.ob_batch_num == 0:
        LogInfo.logs('[%3s][optm-%s-B%d/%d] cur_batch_loss = %.6f, avg_loss = %.6f, scanned = %d/%d',
                     self.name, optm_dl.mode, self.scan_batch, optm_dl.n_batch,
                     local_loss, self.ret_loss, self.scan_data, len(optm_dl))
    # """ For batch=1 debug only!! """
    # q_idx, pos_sc, neg_sc, weight = optm_dl.optm_pair_tup_list[batch_idx]
    # LogInfo.logs('  q_idx = %4d, pos_sc: line = %4d, score = %.6f, rm_f1 = %.6f',
    #              q_idx, pos_sc.ori_idx, local_extra[0], pos_sc.rm_f1)
    # LogInfo.logs('  q_idx = %4d, neg_sc: line = %4d, score = %.6f, rm_f1 = %.6f',
    #              q_idx, neg_sc.ori_idx, local_extra[1], neg_sc.rm_f1)
    if self.summary_writer is not None:
        self.summary_writer.add_summary(summary, self.tb_point)
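# A minimal, self-contained sketch (not part of the trainer above) illustrating the running-average
# update used for `ret_loss`: folding each batch loss in with weight `local_size` reproduces the
# plain mean over all scanned examples. The batch numbers below are made up.
def _demo_running_avg():
    batch_losses = [(0.9, 32), (0.7, 32), (0.5, 16)]    # hypothetical (batch_loss, batch_size) pairs
    avg, scanned = 0.0, 0
    for loss, size in batch_losses:
        avg = (avg * scanned + loss * size) / (scanned + size)
        scanned += size
    flat_mean = sum(loss * size for loss, size in batch_losses) / float(scanned)
    assert abs(avg - flat_mean) < 1e-9
    return avg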
def __init__(self, wd_emb, dim_emb,
             emb_dir='data/compQA/word_emb_in_use',
             # parser_ip='202.120.38.146',
             # parser_port=9601):     # BH: 9601; DS: 8601
             ):
    self.word_dict_fp = '%s/word_emb.indices' % emb_dir
    self.word_emb_mat_fp = '%s/word_emb.%s_%d.npy' % (emb_dir, wd_emb, dim_emb)
    self.dim_emb = dim_emb
    self.word_idx_dict = None
    self.word_emb_matrix = None
    self.n_words = None
    self.mid_dict_fp = '%s/mid_emb.indices' % emb_dir
    self.mid_emb_mat_fp = '%s/mid_emb.%s_%d.npy' % (emb_dir, wd_emb, dim_emb)
    self.mid_idx_dict = None
    self.mid_emb_matrix = None
    self.n_mids = None
    self.load_word_indices()
    self.load_mid_indices()
    self.dep_name_dict = {}
    with open(emb_dir + '/dep_names.txt', 'r') as br:
        for line in br.readlines():
            dep, name = line.strip().split('\t')
            self.dep_name_dict[dep] = name
    LogInfo.logs('%d dependency names loaded.', len(self.dep_name_dict))
def build_active_voc(self, wd_emb_util, path_domain_dict):
    # LogInfo.begin_track('Showing path_domain samples:')
    # for k, v in path_domain_dict.items()[:50]:
    #     LogInfo.logs('[%s] --> %s', k, v)
    # LogInfo.end_track()
    word_idx_dict = wd_emb_util.load_word_indices()
    path_size = len(self.path_idx_dict)
    self.pw_max_len = 0
    self.pw_voc_length = np.zeros(shape=(path_size,), dtype='int32')
    self.pw_voc_domain = np.zeros(shape=(path_size,), dtype='int32')
    pw_voc_dict = {}        # dict of path word sequence (each word is represented by word index)
    for path_str, idx in self.path_idx_dict.items():
        if idx <= 2:        # PAD, START, UNK
            pw_idx_seq = []
        else:
            path_cate, mid_str = path_str.split('|')
            mid_seq = mid_str.split('\t')
            pw_idx_seq = []
            for mid in mid_seq:
                p_name = get_item_name(mid)
                if p_name != '':
                    spt = p_name.split(' ')
                    for wd in spt:
                        wd_idx = word_idx_dict.get(wd, 2)       # UNK if needed
                        pw_idx_seq.append(wd_idx)
            # pw_idx_seq = pw_idx_seq[:self.pw_cutoff]      # truncate if exceeding length limit
        self.pw_voc_length[idx] = len(pw_idx_seq)
        domain_type = path_domain_dict.get(path_str, '')
        if domain_type == '':
            domain_type_idx = 0     # PAD
        else:
            domain_type_idx = self.type_idx_dict.get(domain_type, 2)   # UNK
        self.pw_voc_domain[idx] = domain_type_idx
        pw_voc_dict[idx] = pw_idx_seq
    LogInfo.logs('IN_USE: %s pw_voc_domain constructed.', self.pw_voc_domain.shape)
    LogInfo.logs('IN_USE: %s pw_voc_length constructed.', self.pw_voc_length.shape)
    for pos in (25, 50, 75, 90, 95, 99, 99.9, 100):
        LogInfo.logs('Percentile = %.1f%%: %.6f', pos, np.percentile(self.pw_voc_length, pos))
    self.pw_max_len = np.max(self.pw_voc_length)
    LogInfo.logs('IN_USE: pw_max_len = %d.', self.pw_max_len)
    # for path_str, idx in self.path_idx_dict.items():
    #     local_len = self.pw_voc_length[idx]
    #     if local_len > 7:
    #         LogInfo.logs('Length = %d [%s] --> %s', local_len, path_str, pw_voc_dict[idx])
    assert len(pw_voc_dict) == path_size        # ensure no paths sharing the same index
    self.pw_voc_inputs = np.zeros(shape=(path_size, self.pw_max_len), dtype='int32')
    for idx, pw_idx_seq in pw_voc_dict.items():
        local_len = len(pw_idx_seq)
        self.pw_voc_inputs[idx, :local_len] = pw_idx_seq
    LogInfo.logs('IN_USE: %s pw_voc_inputs constructed.', self.pw_voc_inputs.shape)
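# A rough sketch (with made-up names and dictionaries) of how one path string from `path_idx_dict`
# is turned into a word-index sequence above: split the mid sequence, look up each predicate's
# surface name, then map every word through `word_idx_dict` (index 2 = UNK). `get_item_name` is
# replaced by a hypothetical stand-in here.
def _demo_path_to_word_indices():
    word_idx_dict = {'film': 3, 'starring': 4, 'actor': 5}
    fake_names = {'film.film.starring': 'film starring',
                  'film.performance.actor': 'performance actor'}
    get_item_name = lambda mid: fake_names.get(mid, '')     # stand-in for the real name lookup
    path_str = 'Main|film.film.starring\tfilm.performance.actor'
    _, mid_str = path_str.split('|')
    pw_idx_seq = []
    for mid in mid_str.split('\t'):
        p_name = get_item_name(mid)
        if p_name != '':
            pw_idx_seq.extend(word_idx_dict.get(wd, 2) for wd in p_name.split(' '))
    return pw_idx_seq       # [3, 4, 2, 5]: 'performance' falls back to UNK (=2)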
def build_path_repr__single(self, pw_emb, pw_len, path_emb, pseq_emb, pseq_len, rnn_encoder):
    """
    :param pw_emb:   (ds, path_max_size, pw_max_len, dim_emb)
    :param pw_len:   (ds, path_max_size)
    :param path_emb: (ds, path_max_size, dim_emb)
    :param pseq_emb: (ds, path_max_size, pseq_max_len, dim_emb)
    :param pseq_len: (ds, path_max_size)
    :param rnn_encoder:
    """
    LogInfo.logs('build_path_repr: path_usage = [%s].', self.path_usage)
    assert len(self.path_usage) == 2
    pw_repr = self.build_path_repr__pw_side(pw_emb=pw_emb, pw_len=pw_len,
                                            rnn_encoder=rnn_encoder,
                                            pw_usage=self.path_usage[0])
    pseq_repr = self.build_path_repr__pseq_side(path_emb=path_emb, pseq_emb=pseq_emb,
                                                pseq_len=pseq_len, rnn_encoder=rnn_encoder,
                                                pseq_usage=self.path_usage[1])
    if pw_repr is None:
        assert pseq_repr is not None
        final_repr = pseq_repr
    elif pseq_repr is None:
        final_repr = pw_repr
    else:       # summation
        final_repr = pw_repr + pseq_repr
    return final_repr       # (ds, path_max_size, dim_emb or dim_hidden)
def forward(self, item_wd_embedding, item_len, reuse=None):
    LogInfo.begin_track('ItemBiRNNModule forward: ')
    with tf.variable_scope('ItemBiRNNModule', reuse=reuse):
        # stamps = item_wd_embedding.get_shape().as_list()[1]
        stamps = self.item_max_len
        show_tensor(item_wd_embedding)
        birnn_inputs = tf.unstack(item_wd_embedding, num=stamps, axis=1, name='birnn_inputs')
        # rnn_input: a list of stamps elements: (batch, n_emb)
        encoder_output = self.rnn_encoder.encode(inputs=birnn_inputs,
                                                 sequence_length=item_len,
                                                 reuse=reuse)
        birnn_outputs = tf.stack(encoder_output.outputs, axis=1,
                                 name='birnn_outputs')      # (data_size, q_len, n_hidden_emb)
        LogInfo.logs('birnn_output = %s', birnn_outputs.get_shape().as_list())
        sum_wd_hidden = tf.reduce_sum(birnn_outputs, axis=1)        # (data_size, n_hidden_emb)
        item_len_mat = tf.cast(tf.expand_dims(item_len, axis=1),
                               dtype=tf.float32)                    # (data_size, 1) as float
        item_wd_hidden = tf.div(sum_wd_hidden,
                                tf.maximum(item_len_mat, 1),        # avoid dividing by 0
                                name='item_wd_hidden')              # (data_size, n_hidden_emb)
        LogInfo.logs('item_wd_hidden = %s', item_wd_hidden.get_shape().as_list())
    LogInfo.end_track()
    return item_wd_hidden
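# A numpy-only sketch (independent of the TF module above) of the same length-normalized pooling:
# sum the per-step hidden states and divide by max(length, 1). It assumes padded positions hold
# all-zero vectors, so they contribute nothing to the sum.
def _demo_mean_pool(birnn_outputs, item_len):
    import numpy as np      # local import just to keep this sketch self-contained
    # birnn_outputs: (batch, max_len, hidden); item_len: (batch,) true lengths
    sum_hidden = birnn_outputs.sum(axis=1)                          # (batch, hidden)
    denom = np.maximum(item_len, 1).astype('float32')[:, None]      # (batch, 1), avoid /0
    return sum_hidden / denom                                       # (batch, hidden)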
def __init__(self, use_sparql_cache=True, data_mode='Ordinal', sc_mode='Skeleton',
             root_path='/home/kangqi/workspace/PythonProject',
             cache_dir='runnings/compQA/cache'):
    LogInfo.begin_track('Initializing InputGenerator ... ')
    assert data_mode in ('Ordinal', 'ComplexQuestions')
    assert sc_mode in ('Skeleton', 'Sk+Ordinal')
    self.data_mode = data_mode
    self.sc_mode = sc_mode
    if self.data_mode == 'Ordinal':
        self.qa_data = load_complex_questions_ordinal_only()
        self.train_qa_list, self.test_qa_list = self.qa_data
    elif self.data_mode == 'ComplexQuestions':
        self.qa_data = load_complex_questions()
        self.train_qa_list, self.test_qa_list = self.qa_data
    else:
        LogInfo.logs('Unknown data mode: %s', self.data_mode)
    self.cand_gen = CandidateGenerator(use_sparql_cache=use_sparql_cache)
    self.loss_calc = LossCalculator(driver=self.cand_gen.driver)
    # qa_schema_score_cache_fp = '%s/%s/qa_schema_score_%s_cache' % (root_path, cache_dir, sc_mode)
    # self.score_cache = DictCache(qa_schema_score_cache_fp)
    LogInfo.end_track()
def construct_gather_linkings(el_result, tl_result, tml_result, tml_comp_result):
    # Put all E/T/Tm linkings together.
    gather_linkings = []
    for el in el_result:
        assert hasattr(el, 'link_feat')
        disp = 'E: [%d, %d) %s (%s) %.6f' % (el.tokens[0].index, el.tokens[-1].index + 1,
                                             el.entity.id.encode('utf-8'),
                                             el.name.encode('utf-8'), el.surface_score)
        gather_linkings.append(LinkData(el, 'Entity', '==', disp, el.link_feat))
    for tl in tl_result:
        disp = 'T: [%d, %d) %s (%s) %.6f' % (tl.tokens[0].index, tl.tokens[-1].index + 1,
                                             tl.entity.id.encode('utf-8'),
                                             tl.name.encode('utf-8'), tl.surface_score)
        gather_linkings.append(LinkData(tl, 'Type', '==', disp, []))
    for tml, comp in zip(tml_result, tml_comp_result):
        disp = 'Tm: [%d, %d) %s %s %.6f' % (tml.tokens[0].index, tml.tokens[-1].index + 1,
                                            comp, tml.entity.sparql_name().encode('utf-8'),
                                            tml.surface_score)
        gather_linkings.append(LinkData(tml, 'Time', comp, disp, []))
    sz = len(gather_linkings)
    LogInfo.begin_track('%d E + %d T + %d Tm = %d links.',
                        len(el_result), len(tl_result), len(tml_result), sz)
    for link_data in gather_linkings:
        LogInfo.logs(link_data.display)
    LogInfo.end_track()
    return gather_linkings
def load_annotations_bio(word_dict, q_max_len):
    """ Read annotation, convert to B/I/O format, and store into numpy arrays """
    LogInfo.begin_track('Load SimpQ-mention annotation from [%s]:', anno_fp)
    raw_tup_list = []       # [(v, v_len, tag)]
    with codecs.open(anno_fp, 'r', 'utf-8') as br:
        for line_idx, line in enumerate(br.readlines()):
            spt = line.strip().split('\t')
            q_idx, st, ed = [int(x) for x in spt[:3]]
            jac = float(spt[3])
            if jac != 1.0:
                continue        # only pick the most accurate sentences
            tok_list = spt[-1].lower().split(' ')
            v_len = len(tok_list)
            v = [word_dict[tok] for tok in tok_list]    # TODO: make sure all words exist
            tag = [2] * st + [0] + [1] * (ed - st - 1) + [2] * (v_len - ed)     # 0: B, 1: I, 2: O
            # if line_idx < 10:
            #     LogInfo.begin_track('Check case-%d: ', line_idx)
            #     LogInfo.logs('tok_list: %s', tok_list)
            #     LogInfo.logs('v: %s', v)
            #     LogInfo.logs('tag: %s', tag)
            #     LogInfo.end_track()
            assert len(tag) == len(v)
            raw_tup_list.append((v, v_len, tag))
    q_size = len(raw_tup_list)
    v_len_list = [tup[1] for tup in raw_tup_list]
    LogInfo.logs('%d high-quality annotations loaded.', q_size)
    LogInfo.logs('maximum length = %d (%.6f on avg)', np.max(v_len_list), np.mean(v_len_list))
    for pos in (25, 50, 75, 90, 95, 99, 99.9):
        LogInfo.logs('Percentile = %.1f%%: %.6f', pos, np.percentile(v_len_list, pos))
    filt_tup_list = filter(lambda _tup: _tup[1] <= q_max_len, raw_tup_list)
    LogInfo.logs('%d / %d sentences kept after filtering by [q_max_len=%d].',
                 len(filt_tup_list), q_size, q_max_len)
    # idx = 0
    for v, _, tag in filt_tup_list:     # pad word indices and tags to q_max_len
        v += [0] * (q_max_len - len(v))
        tag += [2] * (q_max_len - len(tag))
        # if idx < 10:
        #     LogInfo.begin_track('Check formed case-%d ', idx)
        #     LogInfo.logs('v: %s', v)
        #     LogInfo.logs('tag: %s', tag)
        #     LogInfo.end_track()
        # idx += 1
    v_list, v_len_list, tag_list = [[tup[i] for tup in filt_tup_list] for i in range(3)]
    np_data_list = [
        np.array(v_list, dtype='int32'),        # (ds, q_max_len)
        np.array(v_len_list, dtype='int32'),    # (ds, )
        np.array(tag_list, dtype='int32')       # (ds, q_max_len), one B/I/O tag per token
    ]
    for idx, np_data in enumerate(np_data_list):
        LogInfo.logs('np-%d: %s', idx, np_data.shape)
    LogInfo.end_track()
    return np_data_list
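# Tiny worked example of the tag layout built above: for a mention span [st, ed) inside a sentence
# of v_len tokens, position st gets B (=0), positions st+1..ed-1 get I (=1), and the rest get O (=2).
# The numbers are made up, purely for illustration.
def _demo_bio_tags():
    st, ed, v_len = 2, 4, 6     # e.g. the mention covers tokens 2 and 3 of a 6-token question
    tag = [2] * st + [0] + [1] * (ed - st - 1) + [2] * (v_len - ed)
    assert tag == [2, 2, 0, 1, 2, 2]
    return tag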
def load_schema_by_kqnew_protocol(schema_fp, gather_linkings, sc_len_dist, path_len_dist,
                                  sc_max_len, schema_level):
    """
    Read the schema files generated by KQ, using the schema in kq_schema.py.
    We read raw paths from json files and convert them into path_list on-the-fly.
    Used after 12/05/2017.
    schema_level: 0/1/2/3 (STRICT/ELEGANT/COHERENT/GENERAL)
    """
    LogInfo.logs('Schema level: %s', schema_level)
    schema_level = schema_level_dict[schema_level]
    super_type_dict = load_super_type_dict()
    candidate_list = []
    path_list_str_set = set([])
    with codecs.open(schema_fp, 'r', 'utf-8') as br:
        lines = br.readlines()
        for ori_idx, line in enumerate(lines):
            sc = CompqSchema.read_schema_from_json(json_line=line, gather_linkings=gather_linkings)
            sc.ori_idx = ori_idx + 1
            sc.construct_path_list()        # create the path_list on-the-fly
            path_list_str = sc.disp()
            """
            From the perspective of candidate searching in eff_candgen:
            since we treat main path and constraint path in different directions,
            there's no so-called duplicate schema at all.
            171226: Except for duplicate entities in EL results.
            """
            path_list_str_set.add(path_list_str)
            sc_len_dist.append(len(sc.path_list))
            for path in sc.path_list:
                path_len_dist.append(len(path))
            if len(sc.path_list) <= sc_max_len and schema_classification(sc, super_type_dict) <= schema_level:
                candidate_list.append(sc)
    return candidate_list, path_list_str_set, len(lines)
def load_smart_cands(self):
    if self.smart_q_cand_dict is not None:      # already loaded
        return
    if not os.path.isfile(self.dump_fp):        # no dump, read from txt
        self.load_smart_schemas_from_txt()
    else:
        LogInfo.begin_track('Loading smart_candidates from [%s] ...', self.dump_fp)
        with open(self.dump_fp, 'rb') as br:
            LogInfo.begin_track('Loading smart_q_cand_dict ... ')
            self.smart_q_cand_dict = cPickle.load(br)
            LogInfo.logs('Candidates for %d questions loaded.', len(self.smart_q_cand_dict))
            cand_size_dist = np.array([len(v) for v in self.smart_q_cand_dict.values()])
            LogInfo.logs('Total schemas = %d, avg = %.6f.',
                         np.sum(cand_size_dist), np.mean(cand_size_dist))
            for pos in (25, 50, 75, 90, 95, 99, 99.9, 100):
                LogInfo.logs('Percentile = %.1f%%: %.6f', pos, np.percentile(cand_size_dist, pos))
            LogInfo.end_track()
            self.path_idx_dict = cPickle.load(br)
            self.entity_idx_dict = cPickle.load(br)
            self.type_idx_dict = cPickle.load(br)
            LogInfo.logs('Active E/T/Path dict loaded.')
            self.pw_voc_inputs = cPickle.load(br)           # (path_voc, pw_max_len)
            self.pw_voc_length = cPickle.load(br)           # (path_voc,)
            self.pw_voc_domain = cPickle.load(br)           # (path_voc,)
            self.entity_type_matrix = cPickle.load(br)      # (entity_voc, type_voc)
            self.pw_max_len = self.pw_voc_inputs.shape[1]
            LogInfo.logs('path word & entity_type lookup tables loaded.')
        self.q_idx_list = sorted(self.smart_q_cand_dict.keys())
        LogInfo.end_track()     # end of loading
    self.meta_stat()            # show meta statistics
def load_necessary_entity_predicate_dict(self):
    """
    Scan FB E/T/P names, just keeping <mid, index> pairs which occur in the candidate pool
    :return: <mid, index> dictionary for both entities (including types) and predicates
    """
    e_set = set([])
    t_set = set([])
    p_set = set([])     # the sets maintaining all the entries observed in the current candidates
    for cand_list in self.q_cand_dict.values():
        for cand in cand_list:
            cand.update_item_set(e_set=e_set, t_set=t_set, p_set=p_set)
    LogInfo.logs('%d E + %d T + %d P collected.', len(e_set), len(t_set), len(p_set))
    self.fb_helper.load_names(e_set=e_set, t_set=t_set, p_set=p_set)
    e_dict = {'': 0}    # give index 0 to represent the empty entity (for padding)
    for item_set in (e_set, t_set):
        for item in item_set:
            e_dict[item] = len(e_dict)
    # e_dict = {e: e_idx for e_idx, e in enumerate(e_set)}
    # e_dict.update({t: t_idx + len(e_dict) for t_idx, t in enumerate(t_set)})
    p_dict = {p: p_idx + 1 for p_idx, p in enumerate(p_set)}
    p_dict[''] = 0      # also give index 0 to represent the empty predicate (for padding)
    # p_dict = {p: p_idx for p_idx, p in enumerate(p_set)}
    return e_dict, p_dict
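# A small sketch of the resulting index layout: index 0 is reserved for the empty entity/predicate
# (padding), entities and types share one dictionary, and predicates get their own. The mids and
# type names below are made up for illustration.
def _demo_index_layout():
    e_set, t_set, p_set = {'m.0abc', 'm.0xyz'}, {'people.person'}, {'people.person.nationality'}
    e_dict = {'': 0}
    for item_set in (e_set, t_set):
        for item in item_set:
            e_dict[item] = len(e_dict)
    p_dict = {p: p_idx + 1 for p_idx, p in enumerate(p_set)}
    p_dict[''] = 0
    assert e_dict[''] == 0 and p_dict[''] == 0
    assert len(e_dict) == 4 and len(p_dict) == 2
    return e_dict, p_dict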
def __init__(self, dataset, mode, q_max_len, sc_max_len, path_max_len, item_max_len,
             batch_size, sampling_config, dynamic=True, shuffle=True, verbose=0):
    super(QScPairDataLoader, self).__init__(batch_size=batch_size, mode=mode,
                                            dynamic=dynamic, shuffle=shuffle)
    self.dataset = dataset
    self.verbose = verbose
    self.sampling_config = sampling_config
    sample_func_name = self.sampling_config['name']
    assert sample_func_name in ['generate_pairs_by_gold_f1', 'generate_pairs_by_runtime_score']
    LogInfo.logs('Negative sampling function: %s', sample_func_name)
    self.neg_sample_func = getattr(self, sample_func_name)
    del self.sampling_config['name']
    self.q_max_len = q_max_len
    self.sc_max_len = sc_max_len
    self.path_max_len = path_max_len
    self.item_max_len = item_max_len
    self.np_data_list = None
def load_cands(self):
    if len(self.np_data_list) > 0 and \
            self.q_cand_dict is not None and \
            self.q_words_dict is not None:
        return
    if not os.path.isfile(self.dump_fp):
        self.prepare_all_data()
        return
    LogInfo.begin_track('Loading candidates & np_data from [%s] ...', self.dump_fp)
    with open(self.dump_fp, 'rb') as br:
        self.q_list = cPickle.load(br)
        LogInfo.logs('q_list loaded for %d questions.', len(self.q_list))
        self.q_words_dict = cPickle.load(br)
        LogInfo.logs('q_words_dict loaded for %d questions.', len(self.q_words_dict))
        self.q_cand_dict = cPickle.load(br)
        LogInfo.logs('q_cand_dict loaded.')
        cand_size_dist = np.array([len(v) for v in self.q_cand_dict.values()])
        LogInfo.begin_track('Show candidate size distribution:')
        for pos in (25, 50, 75, 90, 95, 99, 99.9, 100):
            LogInfo.logs('Percentile = %.1f%%: %.6f', pos, np.percentile(cand_size_dist, pos))
        LogInfo.end_track()
        for data_idx in range(self.array_num):
            np_data = np.load(br)
            self.np_data_list.append(np_data)
            LogInfo.logs('np-data-%d loaded: %s', data_idx, np_data.shape)
    LogInfo.end_track()
def batch_schema_f1_query(self, q_id, states, level):
    """
    Perform an F1 query for each schema in the states.
    :param q_id: WebQ-xxx / CompQ-xxx
    :param states: [(schema, visit_arr)]
    :param level: coarse / typed / timed / ordinal
    :return: filtered states where each schema returns at least one answer.
    """
    Tt.start('%s_F1' % level)
    LogInfo.begin_track('Calculating F1 for %d %s schemas:', len(states), level)
    for idx, (sc, _) in enumerate(states):
        if idx % 100 == 0:
            LogInfo.logs('Current: %d / %d', idx, len(states))
        sparql_str = sc.build_sparql(simple_time_match=self.simple_time_match)
        tm_comp, tm_value, ord_comp, ord_rank, agg = sc.build_aux_for_sparql()
        allow_forever = self.allow_forever if tm_comp != 'None' else ''     # won't specify forever if no time constraints
        q_sc_key = '|'.join([q_id, sparql_str, tm_comp, tm_value, allow_forever,
                             ord_comp, ord_rank, agg])
        if self.vb >= 2:
            LogInfo.begin_track('Checking schema %d / %d:', idx, len(states))
            LogInfo.logs(sc.disp_raw_path())
            LogInfo.logs('var_types: %s', sc.var_types)
            LogInfo.logs(sparql_str)
        Tt.start('query_q_sc_stat')
        sc.ans_size, sc.p, sc.r, sc.f1 = self.query_srv.query_q_sc_stat(q_sc_key)
        Tt.record('query_q_sc_stat')
        if self.vb >= 2:
            LogInfo.logs('Answers = %d, P = %.6f, R = %.6f, F1 = %.6f',
                         sc.ans_size, sc.p, sc.r, sc.f1)
            LogInfo.end_track()
    filt_states = filter(lambda _tup: _tup[0].ans_size > 0, states)
    LogInfo.end_track('%d / %d %s schemas kept with ans_size > 0.',
                      len(filt_states), len(states), level)
    Tt.record('%s_F1' % level)
    return filt_states
def load_data_and_reformulate(pydump_fp):
    np_list = load_numpy_input_with_names(pydump_fp)
    # ==== 140419: The np list contains the following items: ==== #
    q_tensor3, el_tensor3, path_tensor4, \
        score_tensor3, mask_matrix, \
        ord_x_matrix, ord_pred_tensor3, ord_op_tensor3, \
        ord_obj_tensor3, ord_mask_matrix = np_list
    # =========================================================== #
    size = q_tensor3.shape[0]
    LogInfo.logs('QA size = %d.', size)
    gold_matrix = score_tensor3[:, :, 2]    # just use F1
    best_matrix = np.zeros(shape=gold_matrix.shape, dtype='float32')
    best_matrix[:, 0] = 1.0     # we've ranked all schemas, so the first candidate must be the best
    opt_np_list = []
    opt_np_list += [q_tensor3, path_tensor4, el_tensor3,
                    gold_matrix, best_matrix, mask_matrix]      # corresponding to basic_tf_list
    opt_np_list += [ord_x_matrix, ord_pred_tensor3, ord_op_tensor3,
                    ord_obj_tensor3, ord_mask_matrix]           # corresponding to ordinal_tf_list
    return opt_np_list
def collect_data(old_data_fp):
    q_links_dict = {}
    q_schema_dict = {}
    for q_idx in range(q_size):
        if q_idx % 100 == 0:
            LogInfo.logs('Current: %d / %d', q_idx, q_size)
        # if q_idx >= 100:
        #     break
        div = q_idx / 100
        sub_dir = '%d-%d' % (div * 100, div * 100 + 99)
        schema_fp = '%s/%s/%d_schema' % (old_data_fp, sub_dir, q_idx)
        link_fp = '%s/%s/%d_links' % (old_data_fp, sub_dir, q_idx)
        gather_linkings = []
        with codecs.open(link_fp, 'r', 'utf-8') as br:
            for line in br.readlines():
                tup_list = json.loads(line.strip())
                ld_dict = {k: v for k, v in tup_list}
                gather_linkings.append(LinkData(**ld_dict))
        strict_sc_list = []
        with codecs.open(schema_fp, 'r', 'utf-8') as br:
            lines = br.readlines()
            for ori_idx, line in enumerate(lines):
                sc = CompqSchema.read_schema_from_json(
                    q_idx, json_line=line, gather_linkings=gather_linkings,
                    use_ans_type_dist=False, placeholder_policy='ActiveOnly')
                sc.ori_idx = ori_idx
                if schema_classification(sc) == 0:      # only pick strict schemas
                    strict_sc_list.append(sc)
        q_links_dict[q_idx] = gather_linkings
        q_schema_dict[q_idx] = strict_sc_list
    return q_links_dict, q_schema_dict
def retrieve_schema(data_dir, q_idx, line_no):
    if line_no == -1:
        return
    div = q_idx / 100
    sub_dir = '%d-%d' % (div * 100, div * 100 + 99)
    sc_fp = '%s/%s/%d_schema' % (data_dir, sub_dir, q_idx)
    link_fp = '%s/%s/%d_links' % (data_dir, sub_dir, q_idx)
    gather_linkings = []
    with codecs.open(link_fp, 'r', 'utf-8') as br:
        for gl_line in br.readlines():
            tup_list = json.loads(gl_line.strip())
            ld_dict = {k: v for k, v in tup_list}
            gather_linkings.append(LinkData(**ld_dict))
    json_line = linecache.getline(sc_fp, lineno=line_no).strip()
    sc = CompqSchema.read_schema_from_json(q_idx=q_idx, json_line=json_line,
                                           gather_linkings=gather_linkings,
                                           use_ans_type_dist=False)
    LogInfo.logs('Answer size = %d', sc.ans_size)
    LogInfo.logs('P / R / F1 = %.3f / %.3f / %.3f', sc.p, sc.r, sc.f1)
    for path_idx, raw_path in enumerate(sc.raw_paths):
        category, gl_data, pred_seq = raw_path
        LogInfo.logs('Path-%d: [%s] [%s] [%s %s (%s)]',
                     path_idx + 1, category, gl_data.mention,
                     gl_data.comp, gl_data.value, gl_data.name)
        LogInfo.logs('        %s', pred_seq)
    LogInfo.logs('SPARQL: %s', sc.build_sparql())
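# Small illustration of how a question index maps to its sub-directory bucket of 100 questions,
# as used in collect_data / retrieve_schema above. The original code relies on Python 2 integer
# division with '/'; '//' is used here so the sketch behaves the same under Python 3.
def _demo_sub_dir():
    q_idx = 237
    div = q_idx // 100
    sub_dir = '%d-%d' % (div * 100, div * 100 + 99)
    assert sub_dir == '200-299'
    return sub_dir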
def pick_one_search(spec_linkings, conflict_matrix, tag_set, av_combs, spec):
    """ Works for T/Tm/Ord: since only one of them can be selected, there is no need for DFS. """
    assert spec in ('T', 'Tm', 'Ord')
    LogInfo.begin_track('Searching at %s level ...', spec)
    spec_available_combs = []
    for gl_data_indices, tag_elements, visit_arr in av_combs:
        for gl_data in spec_linkings:
            gl_pos = gl_data.gl_pos
            if visit_arr[gl_pos] != 0:      # cannot be visited due to conflict
                continue
            new_visit_arr = list(visit_arr)     # new state after applying types
            for conf_idx in conflict_matrix[gl_pos]:
                new_visit_arr[conf_idx] += 1
            if spec in ('Tm', 'Ord'):
                tag_elem = spec
            else:
                tag_elem = 'T:%s' % gl_data.value
            new_gl_data_indices = list(gl_data_indices) + [gl_pos]
            new_tag_elements = list(tag_elements) + [tag_elem]
            tag = '|'.join(new_tag_elements)
            if tag in tag_set:
                if vb >= 1:
                    LogInfo.logs(tag)
                spec_available_combs.append(
                    (new_gl_data_indices, new_tag_elements, new_visit_arr))
    LogInfo.end_track()
    return spec_available_combs
def load_raw_names():
    tidx_tp_dict = {}       # <t_idx, type>
    tidx_name_dict = {}     # <t_idx, name>
    for fp, _dict in [('type_names.tsv', tidx_name_dict), ('type_dict.tsv', tidx_tp_dict)]:
        with open(type_res_dir + '/' + fp, 'r') as br:
            for line in br.readlines():
                spt = line.strip().split('\t')
                if len(spt) == 2:
                    idx, item = spt
                    _dict[int(idx)] = item
                else:
                    _dict[int(spt[0])] = ''
        LogInfo.logs('%d items loaded from %s.', len(_dict), fp)
    assert len(tidx_tp_dict) == len(tidx_name_dict)
    size = len(tidx_name_dict)
    type_name_dict = {}     # <type, real name> (type.object.name)
    raw_name_list = []
    for idx in range(1, size + 1):
        tp = tidx_tp_dict[idx]
        name = tidx_name_dict[idx]
        name_from_id = tp[tp.rfind('.') + 1:]
        type_name_dict[tp] = name if name != '' else name_from_id
        if name != '':
            raw_name_list.append((tp, name))
        raw_name_list.append((tp, name_from_id))
    LogInfo.logs('%d <type, raw name> pairs loaded.', len(raw_name_list))
    return type_name_dict, raw_name_list
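# Quick illustration of the fallback name used above when a type has no type.object.name:
# everything after the last '.' of the type id serves as a crude surface form. The type id
# below is just an example.
def _demo_name_from_id():
    tp = 'film.film_character'
    name_from_id = tp[tp.rfind('.') + 1:]
    assert name_from_id == 'film_character'
    return name_from_id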
def build(self, score_tf, label_tf, mask_tf):
    pred_tf, gold_tf, useful_pair_tf, final_loss_tf = self.get_loss_tf(score_tf, label_tf, mask_tf)
    train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(final_loss_tf)
    LogInfo.logs('train_step (normal) built.')
    return final_loss_tf, train_step
def get_gradient_tf_list(self, score_tf):
    LogInfo.begin_track('LambdaRank generating gradients ... ')
    grad_tf_list = []       # the return value
    scan = 0
    for var in tf.global_variables():
        scan += 1
        LogInfo.begin_track('Variable %d / %d %s: ',
                            scan, len(tf.global_variables()), var.get_shape().as_list())
        per_row_grad_tf_list = []
        for row_idx in range(self.batch_size):
            LogInfo.begin_track('row_idx = %d / %d: ', row_idx + 1, self.batch_size)
            local_grad_tf_list = []
            for item_idx in range(self.list_len):
                if (item_idx + 1) % 50 == 0:
                    LogInfo.logs('item_idx = %d / %d', item_idx + 1, self.list_len)
                local_grad_tf = tf.gradients(score_tf[row_idx, item_idx], var)[0]   # ("var_shape", )
                local_grad_tf_list.append(local_grad_tf)
            per_row_grad_tf = tf.stack(local_grad_tf_list, axis=0)
            per_row_grad_tf_list.append(per_row_grad_tf)
            # per_row_grad_tf: (list_len, "var_shape")
            LogInfo.end_track()
        grad_tf = tf.stack(per_row_grad_tf_list, axis=0)
        grad_tf_list.append(grad_tf)
        LogInfo.logs('grad_tf: %s', grad_tf.get_shape().as_list())
        # grad_tf: (batch_size, list_len, "var_shape")
        LogInfo.end_track()
    return grad_tf_list
def build_improved(self, score_tf, label_tf, mask_tf):
    grad_tf_list = self.get_gradient_tf_list(score_tf)
    final_loss_tf, sum_lambda_tf = self.get_lambda_tf(score_tf, label_tf, mask_tf)
    update_list = self.get_update_list(grad_tf_list, sum_lambda_tf)
    LogInfo.logs('update_list (lambda-based) built.')
    return final_loss_tf, update_list
def init_emb(name, actual_dict, dim_emb, full_dict=None, full_mat=None):
    """
    Given the actual entries and the full embedding info,
    construct the actual initial embedding matrix.
    :param name: word/entity/predicate
    :param actual_dict: the dict storing actual entries <item, idx>
    :param dim_emb: embedding dimension
    :param full_dict: the full dict of entries <item, idx>
    :param full_mat: the full embedding matrix in numpy format
    :return: the actual initial embedding matrix in numpy format
    """
    if full_mat is not None:
        assert dim_emb == full_mat.shape[1]
    actual_size = len(actual_dict)
    ret_emb_matrix = np.random.uniform(low=-0.1, high=0.1,
                                       size=(actual_size, dim_emb)).astype('float32')
    # [-0.1, 0.1] as random initialize.
    if full_dict is None or full_mat is None:
        LogInfo.logs('%s: build %s actual init embedding matrix by random.',
                     name, ret_emb_matrix.shape)
        return ret_emb_matrix       # all random initialize
    for item, target_row_idx in actual_dict.items():
        if item in full_dict:       # full_mat is None: we don't use TransE as initial embedding
            original_row_idx = full_dict[item]
            ret_emb_matrix[target_row_idx] = full_mat[original_row_idx]
    LogInfo.logs('%s: build %s actual init embedding matrix from full matrix with shape %s.',
                 name, ret_emb_matrix.shape,
                 full_mat.shape if full_mat is not None else '[None]')
    return ret_emb_matrix
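# A minimal usage sketch of init_emb with toy data: rows present in `full_dict` are copied from
# `full_mat`, everything else keeps its random initialization. All entries below are made up.
def _demo_init_emb():
    full_dict = {'cat': 0, 'dog': 1}
    full_mat = np.array([[0.1, 0.2], [0.3, 0.4]], dtype='float32')
    actual_dict = {'cat': 0, 'fish': 1}     # 'fish' is unseen, so its row stays random
    mat = init_emb(name='word', actual_dict=actual_dict, dim_emb=2,
                   full_dict=full_dict, full_mat=full_mat)
    assert mat.shape == (2, 2)
    assert np.allclose(mat[0], full_mat[0])     # 'cat' copied from the full matrix
    return mat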
def evaluate(self, eval_dl, batch_idx):
    local_data, local_size = eval_dl.get_batch(batch_idx=batch_idx)
    active_input_names = set(self.active_input_tensor_dict.keys()) & set(local_data.keys())
    fd = {self.active_input_tensor_dict[key]: local_data[key] for key in active_input_names}
    local_output_list = self.sess.run(self.output_tensor_list, feed_dict=fd,
                                      options=self.run_options, run_metadata=self.run_metadata)
    local_eval_detail_dict = fd
    local_eval_detail_dict.update({k: v for k, v in zip(self.output_tensor_names, local_output_list)})
    # Collect all inputs / outputs of this batch, saving into eval_detail_dict (split by each data point)
    for tensor_name, batch_val in local_eval_detail_dict.items():
        for val in batch_val:
            self.eval_detail_dict.setdefault(tensor_name, []).append(val)
    self.scan_data += local_size
    self.scan_batch += 1
    self.tb_point += 1
    if self.scan_batch % self.ob_batch_num == 0:
        LogInfo.logs('[%3s][eval-%s-B%d/%d] scanned = %d/%d',
                     self.task_name, eval_dl.mode, self.scan_batch, eval_dl.n_batch,
                     self.scan_data, len(eval_dl))