def retrieve_query_doc(self):
    """Retrieve candidate documents for every query via Lucene search.

    For each query in ``self.queries``:
      * search the Lucene index for up to 3000 candidate document paths;
      * keep only documents mentioning the query name at least 4 times
        (after stripping spaces/linebreaks);
      * clean each kept document and build a character-offset mapping
        from the cleaned text back to the original text.

    Populates (side effects):
      self.query_docs[q.id]        -> {doc_id: original document text}
      self.cleaned_docs[doc_id]    -> cleaned document text
      self.doc_mapping_table[doc_id] -> OrderedDict mapping cleaned-text
                                        index -> original-text index
    """
    for q in self.queries:
        q_results = dict()
        # Py2-style trailing comma: suppress the newline (progress output).
        print(q.name + '...'),
        found_doc_path = search(q.name, self.lucene_searcher,
                                self.lucene_analyzer, 3000)
        for doc_path in found_doc_path:
            doc_id = doc_path.split('/')[-1].strip()
            # Fix: close the file handle deterministically (the original
            # leaked it via io.open(...).read()).
            with io.open(doc_path, 'r', -1, 'utf-8') as doc_file:
                doc_content = doc_file.read()
            # Skip documents that mention the query fewer than 4 times
            # (whitespace removed so names split across lines still match).
            if doc_content.replace(' ', '').replace('\n', '').count(q.name) < 4:
                continue
            q_results[doc_id] = doc_content

            # clean doc: remove noise, tags like datetime/headline/dateline,
            # then whitespace and linebreaks.
            cleaned_doc = remove_doc_noise(doc_content)
            cleaned_doc = remove_xml_tag(cleaned_doc)
            cleaned_doc = remove_space_linebreak(cleaned_doc)

            # Offset mapping already computed for this doc (it may appear
            # under several queries) -- don't redo the work.
            if doc_id in self.doc_mapping_table:
                continue

            # Create offset mapping table between cleaned doc and origin
            # doc. Cleaning only DELETES characters, so a single forward
            # scan over the original aligns each cleaned char with its
            # source position. NOTE(review): raises IndexError if cleaning
            # ever introduces a char absent from the original -- confirm
            # the cleaning helpers are deletion-only.
            offset_mapping_table = OrderedDict()
            cleaned_doc_index = 0
            origin_doc_index = 0
            for char in cleaned_doc:
                while True:
                    if char != doc_content[origin_doc_index]:
                        origin_doc_index += 1
                    else:
                        offset_mapping_table[cleaned_doc_index] = origin_doc_index
                        cleaned_doc_index += 1
                        origin_doc_index += 1
                        break

            # Sanity check: every mapped position must match exactly.
            for index in offset_mapping_table.keys():
                assert cleaned_doc[index] == doc_content[offset_mapping_table[index]]

            self.doc_mapping_table[doc_id] = offset_mapping_table
            self.cleaned_docs[doc_id] = cleaned_doc
        self.query_docs[q.id] = q_results
    print('Done')
def general_inference(query_system_answer):
    """Post-process slot-filling answers against the source documents.

    Two passes over ``query_system_answer.output`` (dict: slot type ->
    list of answer lines):

    1. Drop any answer whose slot filler is immediately adjacent to
       u'\u8bb0\u8005' ("reporter") inside a wide-provenance span --
       such fillers are journalist bylines, not real slot values.
    2. For non-date slots, re-anchor each slot-filler provenance offset
       by locating the wide-provenance text in the source document and
       the filler within it; offsets are only updated when the corrected
       span reproduces the filler exactly.

    Relies on module globals ``version`` (selects which LDC corpus to
    load) and ``load_sf_src_doc`` / ``remove_space_linebreak``.
    Returns the (mutated) ``query_system_answer``.
    """
    # load sf source document
    # NOTE(review): if version is neither 'ldc' nor 'eval', src_doc is
    # unbound and the code below raises NameError -- confirm callers
    # only use these two values.
    if version == 'ldc':
        src_doc = load_sf_src_doc(
            '../../data/LDC/LDC2014E123_TAC_KBP_2014_Chinese_Regular_Slot_Filling_Training_Data/'
            'data/source_doc/')
    elif version == 'eval':
        src_doc = load_sf_src_doc(
            '../../data/LDC/KBP_2015_Chinese_Regular_Slot_Filling_Evaluation_Data/source_doc/'
        )

    # ================== remove slot filler related to 记者 ===================== #
    for slot_type in query_system_answer.output.keys():
        line_outputs = query_system_answer.output[slot_type]
        corrected_line_outputs = []
        for l in line_outputs:
            # Lines with no filler (e.g. NIL answers) are kept untouched.
            if not l.slot_filler:
                corrected_line_outputs.append(l)
                continue
            void_l = False
            for w_p in l.wide_provenance:
                # Fix: use end + 1 so the slice includes the final
                # character -- `end` is treated as inclusive everywhere
                # else in this function; the original dropped the last
                # char here and could miss a trailing 记者 pattern.
                w_p_text = src_doc[w_p.doc_id][w_p.beg:w_p.end + 1]
                if (u'记者' + l.slot_filler) in w_p_text or (
                        l.slot_filler + u'记者') in w_p_text:
                    void_l = True
                    break
            if void_l is False:
                corrected_line_outputs.append(l)
        query_system_answer.output[slot_type] = corrected_line_outputs

    # ================== fix slot filler offset ===================== #
    for slot_type in query_system_answer.output.keys():
        # Date fillers are normalized strings, not document substrings;
        # their offsets cannot be re-anchored by text search.
        if 'date' in slot_type:
            continue
        line_outputs = query_system_answer.output[slot_type]
        for l in line_outputs:
            if not l.slot_filler:
                continue
            for i in xrange(len(l.slot_filler_prov)):
                try:
                    s_p = l.slot_filler_prov[i]
                    s_p_text = src_doc[s_p.doc_id][s_p.beg:s_p.end + 1]
                    # Offsets already correct -- nothing to fix.
                    if l.slot_filler == remove_space_linebreak(s_p_text):
                        continue
                    # Re-derive the filler offsets from the wide
                    # provenance: locate the wide span in the document,
                    # then the filler inside that span.
                    w_p = l.wide_provenance[i]
                    w_p_text = src_doc[w_p.doc_id][w_p.beg:w_p.end + 1]
                    correct_w_p_index = src_doc[w_p.doc_id].find(w_p_text)
                    s_p_inner_index = w_p_text.find(l.slot_filler)
                    correct_s_p_beg = correct_w_p_index + s_p_inner_index
                    correct_s_p_end = correct_s_p_beg + len(
                        l.slot_filler) - 1
                    corrected_s_p_text = src_doc[
                        s_p.doc_id][correct_s_p_beg:correct_s_p_end + 1]
                    # Only commit offsets that reproduce the filler
                    # exactly (find() misses make this False).
                    if corrected_s_p_text != l.slot_filler:
                        continue
                    l.slot_filler_prov[i].beg = correct_s_p_beg
                    l.slot_filler_prov[i].end = correct_s_p_end
                except IndexError:
                    # Mismatched provenance lists (more filler spans than
                    # wide spans) -- skip this span, best effort.
                    continue

    return query_system_answer
# NOTE(review): this is an exact token-for-token duplicate of the
# general_inference defined earlier in this file; being defined later, it
# shadows that one. Likely a copy/merge artifact -- confirm which copy is
# intended and delete the other.
def general_inference(query_system_answer):
    """Post-process slot-filling answers: drop 记者 (reporter) bylines and
    re-anchor slot-filler provenance offsets against the source documents.

    Relies on module globals ``version``, ``load_sf_src_doc`` and
    ``remove_space_linebreak``. Returns the mutated ``query_system_answer``.
    """
    # load sf source document
    if version == 'ldc':
        src_doc = load_sf_src_doc('../../data/LDC/LDC2014E123_TAC_KBP_2014_Chinese_Regular_Slot_Filling_Training_Data/'
                                  'data/source_doc/')
    elif version == 'eval':
        src_doc = load_sf_src_doc('../../data/LDC/KBP_2015_Chinese_Regular_Slot_Filling_Evaluation_Data/source_doc/')

    # ================== remove slot filler related to 记者 ===================== #
    for slot_type in query_system_answer.output.keys():
        line_outputs = query_system_answer.output[slot_type]
        corrected_line_outputs = []
        for l in line_outputs:
            # Keep lines with no filler (e.g. NIL answers) untouched.
            if not l.slot_filler:
                corrected_line_outputs.append(l)
                continue
            void_l = False
            for w_p in l.wide_provenance:
                # NOTE(review): slice is [beg:end] here but [beg:end+1]
                # in the pass below -- the last char of the wide span is
                # excluded from this check; confirm which convention the
                # provenance offsets use.
                w_p_text = src_doc[w_p.doc_id][w_p.beg:w_p.end]
                if (u'记者' + l.slot_filler) in w_p_text or (l.slot_filler + u'记者') in w_p_text:
                    void_l = True
                    break
            if void_l is False:
                corrected_line_outputs.append(l)
        query_system_answer.output[slot_type] = corrected_line_outputs

    # ================== fix slot filler offset ===================== #
    for slot_type in query_system_answer.output.keys():
        # Date fillers are normalized strings, not document substrings.
        if 'date' in slot_type:
            continue
        line_outputs = query_system_answer.output[slot_type]
        for l in line_outputs:
            if not l.slot_filler:
                continue
            for i in xrange(len(l.slot_filler_prov)):
                try:
                    s_p = l.slot_filler_prov[i]
                    s_p_text = src_doc[s_p.doc_id][s_p.beg:s_p.end+1]
                    # Offsets already correct -- nothing to fix.
                    if l.slot_filler == remove_space_linebreak(s_p_text):
                        continue
                    # Re-derive the filler offsets via the wide provenance.
                    w_p = l.wide_provenance[i]
                    w_p_text = src_doc[w_p.doc_id][w_p.beg:w_p.end+1]
                    correct_w_p_index = src_doc[w_p.doc_id].find(w_p_text)
                    s_p_inner_index = w_p_text.find(l.slot_filler)
                    correct_s_p_beg = correct_w_p_index + s_p_inner_index
                    correct_s_p_end = correct_s_p_beg + len(l.slot_filler) - 1
                    corrected_s_p_text = src_doc[s_p.doc_id][correct_s_p_beg:correct_s_p_end+1]
                    # Only commit offsets that reproduce the filler exactly.
                    if corrected_s_p_text != l.slot_filler:
                        continue
                    l.slot_filler_prov[i].beg = correct_s_p_beg
                    l.slot_filler_prov[i].end = correct_s_p_end
                except IndexError:
                    # Mismatched provenance lists -- skip, best effort.
                    continue

    return query_system_answer