def generate_extra_feat(self, sc):
    ans_size_disc = Discretizer([2, 3, 5, 10, 50], output_mode='list')  # 5+1 buckets
    # Layout: E/T/Tm/Ord sizes, E/T/Tm/Ord indicators, 2_hop, with_med, ans_size_discrete
    # 4 + 4 + 2 + 6 = 16
    extra_feat_list = []
    constr_size_dict = {}
    main_pred_seq = []
    for category, _, pred_seq in sc.raw_paths:
        if category == 'Main':
            main_pred_seq = pred_seq
        constr_size_dict[category] = 1 + constr_size_dict.get(category, 0)
    for cate in ('Entity', 'Type', 'Time', 'Ordinal'):
        extra_feat_list.append(constr_size_dict.get(cate, 0))  # how many constraint paths of this category
    for cate in ('Entity', 'Type', 'Time', 'Ordinal'):
        extra_feat_list.append(min(1, constr_size_dict.get(cate, 0)))  # whether such a constraint exists
    is_two_hop = 1 if sc.hops == 2 else 0
    with_med = 1 if is_mediator_as_expect(main_pred_seq[0]) else 0
    extra_feat_list += [is_two_hop, with_med]
    extra_feat_list += ans_size_disc.convert(sc.ans_size)
    assert len(extra_feat_list) == self.extra_feat_size
    return np.array(extra_feat_list, dtype='float32')
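# A worked sketch of the 16-dim layout above (hypothetical schema, for illustration
# only): a 2-hop schema whose raw_paths hold one 'Main' path through a mediator plus
# one 'Type' constraint, with ans_size = 7, yields
#   constraint sizes       [0, 1, 0, 0]        (Entity / Type / Time / Ordinal)
#   constraint indicators  [0, 1, 0, 0]
#   [is_two_hop, with_med] [1, 1]
#   ans_size buckets       [0, 0, 0, 1, 0, 0]  (5 <= 7 < 10)
# i.e. 4 + 4 + 2 + 6 = 16 features, matching the assert on extra_feat_size.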
def __init__(self, lexicon):
    self.lexicon = lexicon
    self.year_re = re.compile(r'^[1-2][0-9][0-9][0-9]$')
    self.punc_str = u"?!:',."
    self.trivial_set = {'the', 'a', 'an', 'of', 'on', 'at', 'by'}  # Lukov et al., Sec 2.2.1
    self.log_wiki_pop_disc = Discretizer([1, 2, 3, 4, 6], output_mode='list', name='log_wiki_pop')
    self.log_fb_pop_disc = Discretizer([3, 4, 6, 8], output_mode='list', name='log_fb_pop')
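# Note on the two discretizers above: both expect popularity scores already in log
# scale (an assumption from the "log_*" naming; the log base is applied by the
# caller and is not fixed here). With split points [1, 2, 3, 4, 6], log_wiki_pop_disc
# emits a 6-dim one-hot list; e.g. a log-popularity of 3.2 falls into the [3, 4)
# bucket, giving [0, 0, 0, 1, 0, 0]. log_fb_pop_disc behaves the same with 5 buckets.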
def work(exp_dir, data_dir, best_epoch, qa_list, yih_ret_dict):
    log_fp = '%s/yih_compare_%03d.txt' % (exp_dir, best_epoch)
    pick_sc_dict = {q_idx: (-1, 0.) for q_idx in range(3778, 5810)}
    ret_fp = '%s/result/full.t.%03d' % (exp_dir, best_epoch)
    with open(ret_fp, 'r') as br:
        for line in br.readlines():
            spt = line.strip().split('\t')
            q_idx = int(spt[0])
            line_no = int(spt[1])
            ours_f1 = float(spt[2])
            pick_sc_dict[q_idx] = (line_no, ours_f1)
    disc = Discretizer([-0.99, -0.50, -0.25, -0.01, 0.01, 0.25, 0.50, 0.99])
    delta_tup_list = []
    avg_yih_f1 = 0.
    avg_ours_f1 = 0.
    for q_idx in range(3778, 5810):
        qa = qa_list[q_idx]
        q = qa['utterance']
        gold_answer_list = qa['targetValue']
        yih_answer_list = json.loads(yih_ret_dict[q])
        _, _, yih_f1 = compute_f1(goldList=gold_answer_list, predictedList=yih_answer_list)
        ours_f1 = pick_sc_dict[q_idx][1]
        avg_yih_f1 += yih_f1
        avg_ours_f1 += ours_f1
        delta = ours_f1 - yih_f1
        disc.convert(delta)
        delta_tup_list.append((q_idx, delta))
    avg_yih_f1 /= 2032  # 2032 = 5810 - 3778, the size of the WebQ test split
    avg_ours_f1 /= 2032
    delta_tup_list.sort(key=lambda _tup: _tup[1])
    LogInfo.logs('%d questions delta sorted.', len(delta_tup_list))
    total_size = len(delta_tup_list)
    worse_size = len(filter(lambda _tup: _tup[1] < 0., delta_tup_list))
    better_size = len(filter(lambda _tup: _tup[1] > 0., delta_tup_list))
    equal_size = total_size - worse_size - better_size
    bw = codecs.open(log_fp, 'w', 'utf-8')
    LogInfo.redirect(bw)
    LogInfo.logs('Avg_Yih_F1 = %.6f, Avg_Ours_F1 = %.6f', avg_yih_f1, avg_ours_f1)
    LogInfo.logs(' Worse cases = %d (%.2f%%)', worse_size, 100. * worse_size / total_size)
    LogInfo.logs(' Equal cases = %d (%.2f%%)', equal_size, 100. * equal_size / total_size)
    LogInfo.logs('Better cases = %d (%.2f%%)', better_size, 100. * better_size / total_size)
    disc.show_distribution()
    LogInfo.logs()
    for q_idx, _ in delta_tup_list:
        qa = qa_list[q_idx]
        line_no, ours_f1 = pick_sc_dict[q_idx]
        q = qa['utterance']
        yih_answer_list = json.loads(yih_ret_dict[q])
        if line_no == -1:
            continue
        single_question(q_idx=q_idx, qa=qa, data_dir=data_dir, line_no=line_no,
                        yih_answer_list=yih_answer_list, ours_f1=ours_f1)
    LogInfo.stop_redirect()
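# compute_f1 above is imported from elsewhere; the sketch below only documents the
# set-style precision/recall/F1 semantics this script relies on. _compute_f1_sketch
# is a hypothetical stand-in for illustration, not the actual implementation.
def _compute_f1_sketch(gold_list, predicted_list):
    if len(gold_list) == 0 or len(predicted_list) == 0:
        return 0., 0., 0.
    hit = len(set(gold_list) & set(predicted_list))  # answers shared by both sides
    precision = 1. * hit / len(predicted_list)
    recall = 1. * hit / len(gold_list)
    if precision + recall == 0.:
        return precision, recall, 0.
    f1 = 2. * precision * recall / (precision + recall)
    return precision, recall, f1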
class LinkingWrapper:

    def __init__(self, base='/home/xianyang/aqqu/aqqu',
                 parser_ip='202.120.38.146', parser_port=9601,
                 linking_mode='Raw', q_links_dict=None, lukov_linker=None):
        """
        Raw: the raw version, won't read anything from S-MART or our Lukov's implementation
        S-MART: read from S-MART result (only available in WebQ)
        Lukov: read from our lukov_ngram linker data
        """
        self.base = base
        self.linking_mode = linking_mode
        self.q_links_dict = q_links_dict  # save S-MART results
        self.lukov_linker = lukov_linker
        assert linking_mode in ('Raw', 'S-MART', 'Lukov')
        if linking_mode == 'Lukov':
            assert self.lukov_linker is not None
        LogInfo.logs('Initiating parser ... ')
        self.parser = parser.CoreNLPParser('http://%s:%d/parse' % (parser_ip, parser_port))  # just open the parser
        self.is_data_loaded = False
        self.surface_index = None
        self.entity_linker = None
        self.type_linker = None
        self.smart_score_disc = Discretizer(
            split_list=[2, 3, 8, 50, 2000, 12500, 25000, 40000], output_mode='list')
        # the split distribution is manually designed by observing S-MART data
        # in both CompQ & WebQ datasets
        self.pop_filter_num = 5
        # Only used in LukovLinker: for each span,
        # we just keep the top entities sorted by popularity

    def load_data(self):
        if self.is_data_loaded:
            return
        LogInfo.begin_track('EL-Wrapper initializing ... ')
        LogInfo.logs('Initiating index ...')
        self.surface_index = surface_index_memory.EntitySurfaceIndexMemory(
            self.base + '/data/entity-list',
            self.base + '/data/entity-surface-map',
            self.base + '/data/entity-index')
        LogInfo.logs('Initiating entity_linker')
        self.entity_linker = entity_linker.EntityLinker(self.surface_index, 7)
        LogInfo.logs('Initiating type_linker [KQ]')
        self.type_linker = type_linker.TypeLinker()
        LogInfo.end_track('Initialized.')
        self.is_data_loaded = True

    # Key function: return tokens, entities, types and times
    def link(self, q_idx, sentence):
        parse_result = self.parser.parse(sentence)
        tokens = parse_result.tokens
        linking_mode = self.linking_mode
        el_result = []
        tl_result = []
        tml_result = []
        if linking_mode in ('Raw', 'S-MART'):
            self.load_data()
            raw_result = self.entity_linker.identify_entities_in_tokens(tokens)  # entity & time
            for item in raw_result:
                if isinstance(item.entity, entity_linker.KBEntity):
                    el_result.append(item)
                elif isinstance(item.entity, entity_linker.DateValue):
                    tml_result.append(item)
            if linking_mode == 'S-MART':
                # won't use the previous results, but just read S-MART data
                el_result = []
                smart_list = self.q_links_dict.get(q_idx, [])
                for smart_item in smart_list:  # enumerate each candidate in S-MART result
                    use_tokens = []  # determine the tokens covered by the current EL result
                    start = smart_item.st_pos
                    end = smart_item.st_pos + smart_item.length
                    cur_pos = 0
                    for t in tokens:
                        if start <= cur_pos < end:
                            use_tokens.append(t)
                        cur_pos += len(t.token) + 1
                    obj_entity = KBEntity(name=smart_item.e_name,
                                          identifier=smart_item.mid,
                                          score=0, aliases=None)
                    perfect = (smart_item.e_name.lower().replace('_', ' ') ==
                               smart_item.surface_form.lower())  # Chicago_Ohio --> chicago ohio
                    el_item = IdentifiedEntity(tokens=use_tokens,
                                               name=smart_item.e_name,
                                               entity=obj_entity, score=0,
                                               surface_score=smart_item.score,
                                               perfect_match=perfect)
                    link_feat = self.smart_score_disc.convert(score=smart_item.score)
                    setattr(el_item, 'link_feat', link_feat)
                    el_result.append(el_item)
            tl_result = self.type_linker.identiy_types_in_tokens(tokens)  # type link
        elif linking_mode == 'Lukov':
            # All the Entity/Type/Time linking will be performed by the Lukov linker
            el_result, tl_result, tml_result = self.lukov_linker.link_single_question(tokens)
            # if self.q_links_dict is None:
            #     self.q_links_dict = load_lukov_link_result(link_fp=self.aux_fp)
            # el_result = []
            # tl_result = []
            # tml_result = []
            # lukov_link_list = self.q_links_dict.get(q_idx, [])
            # group_link_dict = {}  # <st_ed, links>
            # """ separate link results into several groups by [st, ed) """
            # for link_tup in lukov_link_list:
            #     st = link_tup.start
            #     ed = st + link_tup.length
            #     key = '%s_%s' % (st, ed)
            #     group_link_dict.setdefault(key, []).append(link_tup)
            # """ judge tagging: at least one NN(P) or JJ occurs in the span """
            # postag_available_groups = []  # store all st-ed pairs satisfying the pos-tag limitation
            # for st_ed in group_link_dict.keys():
            #     st, ed = [int(x) for x in st_ed.split('_')]
            #     flag = False
            #     for idx in range(st, ed):
            #         postag = tokens[idx].pos
            #         if postag.startswith('NN') or postag.startswith('JJ'):
            #             flag = True
            #             break
            #     if flag:
            #         postag_available_groups.append((st, ed))
            # """ longest match filtering """
            # longest_match_groups = []
            # sz = len(postag_available_groups)
            # for i in range(sz):
            #     st_i, ed_i = postag_available_groups[i]
            #     filter_flag = False
            #     for j in range(sz):
            #         if i == j:
            #             continue
            #         st_j, ed_j = postag_available_groups[j]
            #         if st_j <= st_i and ed_j >= ed_i:  # [st_i, ed_i) \in [st_j, ed_j)
            #             filter_flag = True  # found a longer span, filter the current one
            #             break
            #     if not filter_flag:
            #         longest_match_groups.append((st_i, ed_i))
            # """ Popularity filtering at each position """
            # for st, ed in longest_match_groups:
            #     key = '%s_%s' % (st, ed)
            #     links = group_link_dict[key]
            #     links.sort(key=lambda tup: tup.popularity, reverse=True)  # E/T/Tm
            #     for link_tup in links[: self.pop_filter_num]:
            #         LogInfo.logs('[%s] [%d, %d): %s (%s)',
            #                      link_tup.category, link_tup.start,
            #                      link_tup.start + link_tup.length,
            #                      link_tup.mid, link_tup.name.encode('utf-8'))
            #         if link_tup.category in ('Entity', 'Type'):
            #             obj_item = KBEntity(name=link_tup.name,
            #                                 identifier=link_tup.mid,
            #                                 score=link_tup.score,
            #                                 aliases=None)
            #             perfect = (link_tup.name.lower() == link_tup.surface.lower())
            #             el_item = IdentifiedEntity(tokens=tokens[st: ed],
            #                                        name=link_tup.name,
            #                                        entity=obj_item, score=0.,
            #                                        surface_score=link_tup.score,
            #                                        perfect_match=perfect)
            #             if link_tup.category == 'Entity':
            #                 el_result.append(el_item)
            #             else:
            #                 tl_result.append(el_item)
            #         else:  # Time obj
            #             tmv = DateValue(name=link_tup.name, date=link_tup.mid)
            #             # either name or date is the year surface
            #             tml_item = IdentifiedEntity(tokens=tokens[st: ed],
            #                                         name=link_tup.name,
            #                                         entity=tmv, score=0.,
            #                                         surface_score=link_tup.score,
            #                                         perfect_match=True)
            #             tml_result.append(tml_item)
        return tokens, el_result, tl_result, tml_result

    def parse(self, sentence):
        return self.parser.parse(sentence)

    # contains the time identification
    def entity_identify_with_parse(self, tokens):
        self.load_data()
        return self.entity_linker.identify_entities_in_tokens(tokens)

    def time_identify_with_parse(self, tokens):
        self.load_data()
        return self.entity_linker.identify_dates(tokens)

    # ==== Used in SimpleQuestions: given an entity, return its mention ==== #
    def link_with_ground_truth(self, sentence, focus_name, focus_mid):
        """
        ** ONLY USED IN SIMPLEQUESTIONS SCENARIO **
        Given the focus entity name, return the most likely mention span.
        The best span would:
        1. exactly match the entity name
        2. be the longest substring of the entity name
        We allow the mention to start with a useless "the".
        :param sentence: the question surface
        :param focus_name: the focus name
        :param focus_mid: the corresponding mid
        :return: the identified entities (but there should be only one)
        """
        tokens = self.parser.parse(sentence).tokens
        q_word_list = [tok.token.lower() for tok in tokens]
        focus_word_list = ['']  # the default list, just an empty string
        if focus_name != '':
            focus_tokens = self.parser.parse(focus_name).tokens
            focus_word_list = [tok.token.lower() for tok in focus_tokens]
        n = len(q_word_list)
        m = len(focus_word_list)
        st = ed = -1
        best_match_words = 0.
        best_match_chars = 0.
        for i in range(n):
            if best_match_words == m:
                break  # already found exact match
            for j in range(i + 1, n + 1):
                if best_match_words == m:
                    break  # already found exact match
                span = q_word_list[i:j]
                if self.is_contained(span, focus_word_list):
                    match_words = len(span)
                    match_chars = len(''.join(span))
                    if match_words < best_match_words:
                        continue
                    if match_words == best_match_words and match_chars < best_match_chars:
                        continue
                    # now update the interval
                    st = i
                    ed = j - 1  # closed interval
                    best_match_words = match_words
                    best_match_chars = match_chars
        if st > 0 and q_word_list[st - 1] == 'the':
            st -= 1
        obj_entity = KBEntity(name=focus_name, identifier=focus_mid, score=0, aliases=None)
        el_item = IdentifiedEntity(tokens=tokens[st:ed + 1],
                                   name=focus_name, entity=obj_entity, score=0,
                                   surface_score=1. * best_match_words / m,
                                   perfect_match=best_match_words == m)
        LogInfo.logs('Q surface: %s', q_word_list)
        LogInfo.logs('Focus surface: %s', focus_word_list)
        LogInfo.logs('EL result: [%d, %d] "%s" --> %s',
                     st, ed, ' '.join(q_word_list[st:ed + 1]).encode('utf-8'),
                     focus_name.encode('utf-8'))
        if st == -1 or ed == -1:
            LogInfo.logs('Warning: no suitable span found.')
        el_result = [el_item]
        tl_result = []
        tml_result = []
        return tokens, el_result, tl_result, tml_result

    @staticmethod
    def is_contained(span, target_word_list):
        """ Check whether the span is a contiguous sub word sequence of the target word list """
        len_diff = len(target_word_list) - len(span)
        if len_diff < 0:
            return False
        for st in range(len_diff + 1):
            flag = True
            for i in range(len(span)):
                if span[i] != target_word_list[st + i]:
                    flag = False
                    break
            if flag:
                return True
        return False
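# A minimal sanity check for the pure helper above, using hypothetical token lists;
# no parser or KB data is needed. This is an illustrative sketch, not part of the
# original module.
if __name__ == '__main__':
    assert LinkingWrapper.is_contained(['barack', 'obama'], ['barack', 'obama', 'jr'])
    assert not LinkingWrapper.is_contained(['obama', 'barack'], ['barack', 'obama', 'jr'])
    LogInfo.logs('is_contained sanity checks passed.')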
    get_dt_preds_given_type, get_ord_preds_given_type, \
    is_mediator_as_expect, inverse_predicate, get_end_dt_pred, load_sup_sub_types

from kangqi.util.discretizer import Discretizer
# from kangqi.util.LogUtil import LogInfo

tml_comp_dict = {
    '==': u'm.__in__',
    '<': u'm.__before__',
    '>': u'm.__after__',
    '>=': u'm.__since__'
}  # convert time comparison into a virtual mid
ordinal_dict = {'max': u'm.__max__', 'min': u'm.__min__'}

ans_size_disc = Discretizer([2, 3, 5, 10, 50], output_mode='list')  # 5+1
# ans < 2
# 2 <= ans < 3
# 3 <= ans < 5
# 5 <= ans < 10
# 10 <= ans < 50
# ans >= 50

RawPath = namedtuple('RawPath', ['path_cate', 'focus', 'pred_seq'])


class CompqSchema(object):

    def __init__(self):
        self.q_idx = None
        self.gather_linkings = None  # all related linkings of this question (either used or not used)
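        # Illustration (inferred from usage in generate_extra_feat, stated here as an
        # assumption): each element of a schema's raw_paths is a RawPath such as
        #   RawPath(path_cate='Main', focus=<linked entity>, pred_seq=[u'film.film.director']),
        # where path_cate is one of 'Main' / 'Entity' / 'Type' / 'Time' / 'Ordinal',
        # and time constraints splice the virtual mids above (e.g. u'm.__before__')
        # into pred_seq so that comparisons behave like ordinary predicates downstream.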
import os
import json
import math
import codecs
import cPickle
import shutil

from ...candgen.smart import load_webq_linking_data
from ...eff_candgen.combinator import LinkData

from kangqi.util.LogUtil import LogInfo
from kangqi.util.discretizer import Discretizer

log_score_disc = Discretizer(split_list=[0, 2, 4, 6, 8, 10, 12])  # 7+1
ratio_disc = Discretizer(split_list=[0.001, 0.01, 0.1, 0.2, 0.5])  # 5+1
feat_len = log_score_disc.len + ratio_disc.len


def build_feature_vector(score, max_score):
    log_score = math.log(score)
    ratio = 1. * score / max_score
    log_score_vec = log_score_disc.convert(score=log_score).tolist()
    ratio_vec = ratio_disc.convert(score=ratio).tolist()
    return log_score_vec + ratio_vec


def single_question(schema_fp, ans_fp, links_fp, smart_item_list):
    if os.path.isfile(schema_fp + '.ori') and os.path.isfile(links_fp + '.ori'):
        LogInfo.logs('Skip, already done.')
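# Worked example for build_feature_vector (values follow from the split lists above):
# score=100, max_score=1000 gives log_score = ln(100) ~= 4.6, which falls in the
# [4, 6) bucket of log_score_disc (8 buckets), and ratio = 0.1, which falls in the
# [0.1, 0.2) bucket of ratio_disc (6 buckets); the returned list therefore has
# feat_len = 8 + 6 = 14 entries, exactly two of them set to 1.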
""" Author: Kangqi Luo Date: 180118 Goal: Generate Structural Data """ import numpy as np from .dataset_schema_reader import schema_classification from kangqi.util.discretizer import Discretizer from kangqi.util.LogUtil import LogInfo ans_size_disc = Discretizer([2, 3, 5, 10, 50]) # 5+1 # ans < 2 # 2 <= ans < 3 # 3 <= ans < 5 # 5 <= ans < 10 # 10 <= ans < 50 # ans >= 50 def build_structural_data(all_cands_tup_list): """ Given all the candidate schemas, build structural-based data """ cand_size = len(all_cands_tup_list) LogInfo.begin_track('Build Structural Data for %d candidates:', cand_size) data_list = [] # store all extra list of candidates for data_idx, q_idx, sc in all_cands_tup_list: if data_idx % 50000 == 0:
def work(data_name, exp_dir_1, data_dir_1, exp_dir_2, data_dir_2, out_detail_fp, out_anno_fp):
    qa_list = load_compq()
    detail_fp_1 = exp_dir_1 + '/detail/full.t.best'
    detail_fp_2 = exp_dir_2 + '/detail/full.t.best'
    qidx_meta_dict_1 = read_ours(detail_fp_1)
    qidx_meta_dict_2 = read_ours(detail_fp_2)
    bw_detail = codecs.open(out_detail_fp, 'w', 'utf-8')
    bw_anno = codecs.open(out_anno_fp, 'w', 'utf-8')
    LogInfo.redirect(bw_detail)
    for bw in (bw_detail, bw_anno):
        bw.write('detail_fp_1: [%s] --> [%s]\n' % (data_dir_1, detail_fp_1))
        bw.write('detail_fp_2: [%s] --> [%s]\n\n' % (data_dir_2, detail_fp_2))
    missing_list = []
    first_only_list = []
    second_only_list = []
    compare_list = []
    if data_name == 'WebQ':
        range_list = range(3778, 5810)
    else:
        assert data_name == 'CompQ'
        range_list = range(1300, 2100)
    for q_idx in range_list:
        if q_idx not in qidx_meta_dict_1 and q_idx not in qidx_meta_dict_2:
            missing_list.append(q_idx)
        elif q_idx not in qidx_meta_dict_2:
            first_only_list.append(q_idx)
        elif q_idx not in qidx_meta_dict_1:
            second_only_list.append(q_idx)
        else:
            compare_list.append(q_idx)
    LogInfo.logs('Missing questions: %s', missing_list)
    LogInfo.logs('First only questions: %s', first_only_list)
    LogInfo.logs('Second only questions: %s\n', second_only_list)
    time_f1_list = [[], []]
    nontime_f1_list = [[], []]
    mark_counter = {}
    disc = Discretizer(split_list=[-0.5, -0.1, -0.000001, 0.000001, 0.1, 0.5])
    compare_list.sort(key=lambda x: qidx_meta_dict_1[x]['f1'] - qidx_meta_dict_2[x]['f1'])
    for q_idx in compare_list:
        info_dict_1 = qidx_meta_dict_1[q_idx]
        info_dict_2 = qidx_meta_dict_2[q_idx]
        f1_1 = info_dict_1['f1']
        f1_2 = info_dict_2['f1']
        delta = f1_1 - f1_2
        disc.convert(delta)
        qa = qa_list[q_idx]
        LogInfo.logs('============================\n')
        LogInfo.begin_track('Q-%04d: [%s]', q_idx, qa['utterance'])
        LogInfo.logs('f1_1 = %.6f, f1_2 = %.6f, delta = %.6f', f1_1, f1_2, delta)
        upb_list = []
        for d_idx, (data_dir, info_dict) in enumerate([(data_dir_1, info_dict_1),
                                                       (data_dir_2, info_dict_2)]):
            LogInfo.begin_track('Schema-%d, line = %d', d_idx, info_dict['line_no'])
            upb = retrieve_schema(data_dir, q_idx, info_dict['line_no'])
            upb_list.append(upb)
            LogInfo.end_track()
        LogInfo.end_track()
        LogInfo.logs('')
        bw_anno.write('Q-%04d: [%s]\n' % (q_idx, qa['utterance']))
        bw_anno.write('f1_1 = %.6f, f1_2 = %.6f, delta = %.6f\n' % (f1_1, f1_2, delta))
        if abs(delta) >= 0.5:
            hml = 'H'
        elif abs(delta) >= 0.1:
            hml = 'M'
        elif abs(delta) >= 1e-6:
            hml = 'L'
        else:
            hml = '0'
        if delta >= 1e-6:
            sgn = '+'
        elif delta <= -1e-6:
            sgn = '-'
        else:
            sgn = ''
        bw_anno.write('# Change: [%s%s]\n' % (sgn, hml))
        has_time = 'N'
        for tok in qa['tokens']:
            if re.match('^[1-2][0-9][0-9][0-9]$', tok.token[:4]):
                has_time = 'Y'
                break
        if has_time == 'Y':
            time_f1_list[0].append(f1_1)
            time_f1_list[1].append(f1_2)
        else:
            nontime_f1_list[0].append(f1_1)
            nontime_f1_list[1].append(f1_2)
        bw_anno.write('# Time: [%s]\n' % has_time)
        upb1, upb2 = upb_list
        if upb1 - upb2 <= -1e-6:
            upb_mark = 'Less'
        elif upb1 - upb2 >= 1e-6:
            upb_mark = 'Greater'
        else:
            upb_mark = 'Equal'
        bw_anno.write('# Upb: [%s] (%.3f --> %.3f)\n' % (upb_mark, upb1, upb2))
        overall = '%s%s_%s_%s' % (sgn, hml, has_time, upb_mark)
        mark_counter[overall] = 1 + mark_counter.get(overall, 0)
        bw_anno.write('# Overall: [%s]\n' % overall)
        bw_anno.write('\n\n')
    disc.show_distribution()
    LogInfo.logs('')
    for has_time in ('Y', 'N'):
        LogInfo.logs('Related to DateTime: [%s]', has_time)
        LogInfo.logs(' \tLess\tEqual\tGreater')
        for hml in ('-H', '-M', '-L', '0', '+L', '+M', '+H'):
            line = '%4s' % hml
            for upb_mark in ('Less', 'Equal', 'Greater'):
                overall = '%s_%s_%s' % (hml, has_time, upb_mark)
                count = mark_counter.get(overall, 0)
                line += '\t%4d' % count
                # LogInfo.logs('[%s]: %d (%.2f%%)', overall, count, 100. * count / 800)
            LogInfo.logs(line)
        LogInfo.logs('')
    LogInfo.logs('DateTime-related F1: %.6f v.s. %.6f, size = %d',
                 np.mean(time_f1_list[0]), np.mean(time_f1_list[1]), len(time_f1_list[0]))
    LogInfo.logs('DateTime-not-related F1: %.6f v.s. %.6f, size = %d',
                 np.mean(nontime_f1_list[0]), np.mean(nontime_f1_list[1]), len(nontime_f1_list[0]))
    LogInfo.stop_redirect()
    bw_detail.close()
    bw_anno.close()
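# Helpers used above but defined elsewhere in this script (semantics inferred from
# usage, stated here as assumptions): read_ours(detail_fp) returns a dict
# {q_idx: {'f1': float, 'line_no': int}} describing each question's best-ranked
# schema, and retrieve_schema(data_dir, q_idx, line_no) prints the schema detail
# and returns the F1 upper bound ("Upb") among that question's candidates.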