def retrieve_query_doc(self):
    """Collect the source documents of every query and map clean offsets to raw offsets.

    For each query in ``self.queries``:

    * ``search`` is called with the query name, ``self.lucene_searcher``,
      ``self.lucene_analyzer`` and 3000 (presumably the maximum number of
      hits -- confirm against ``search``); it yields document paths.
    * A document is kept only if the query name occurs at least 4 times in
      its text once spaces and newlines are removed. Kept documents are
      stored as ``{doc_id: raw_text}`` in ``self.query_docs[q.id]``, where
      ``doc_id`` is the last ``/``-separated component of the path.
    * A kept document without an entry in ``self.doc_mapping_table`` is
      cleaned; the cleaned text is stored in ``self.cleaned_docs[doc_id]``
      and an ``OrderedDict`` mapping every cleaned-text index to the index
      of the same character in the raw text is stored in
      ``self.doc_mapping_table[doc_id]``.

    Raises:
        IndexError: if a character of the cleaned text cannot be found in
            the remaining part of the raw text.
        AssertionError: if the finished mapping table does not line up.
    """
    for q in self.queries:
        q_results = dict()
        # NOTE(review): the trailing comma looks like the Python 2 idiom that
        # keeps the cursor on this line so 'Done' is printed next to the name.
        print(q.name + '...'),
        found_doc_path = search(q.name, self.lucene_searcher, self.lucene_analyzer, 3000)
        for doc_path in found_doc_path:
            doc_id = doc_path.split('/')[-1].strip()
            # Close the handle right away; a query can touch thousands of files.
            with io.open(doc_path, 'r', -1, 'utf-8') as doc_file:
                doc_content = doc_file.read()
            if doc_content.replace(' ', '').replace('\n', '').count(q.name) < 4:
                continue
            q_results[doc_id] = doc_content

            # clean doc
            cleaned_doc = remove_doc_noise(doc_content)  # remove tags like datetime, headline, dateline, etc.
            cleaned_doc = remove_xml_tag(cleaned_doc)
            cleaned_doc = remove_space_linebreak(cleaned_doc)

            # The mapping table of a document only has to be built once.
            if doc_id in self.doc_mapping_table:
                continue

            # Walk both texts in step: every cleaned character is matched with
            # its next occurrence in the raw text, skipping what cleaning removed.
            offset_mapping_table = OrderedDict()
            origin_doc_index = 0
            for cleaned_doc_index, char in enumerate(cleaned_doc):
                while char != doc_content[origin_doc_index]:
                    origin_doc_index += 1
                offset_mapping_table[cleaned_doc_index] = origin_doc_index
                origin_doc_index += 1

            # check correctness of offset mapping table
            for cleaned_index, origin_index in offset_mapping_table.items():
                assert cleaned_doc[cleaned_index] == doc_content[origin_index]

            self.doc_mapping_table[doc_id] = offset_mapping_table
            self.cleaned_docs[doc_id] = cleaned_doc

        self.query_docs[q.id] = q_results
        print('Done')
def stateorprovince(self, slot_type, evidence_slot_type):
    """Infer a province answer from the query's city answer.

    The city is the last entry of
    ``self.query_answer.output[evidence_slot_type]`` with a non-empty
    ``slot_filler``. Its name is passed through ``jianfan.ftoj`` (presumably
    traditional-to-simplified conversion -- confirm), stripped of the
    characters for district / county / city, and looked up in
    ``self.china_province_city``. That attribute is iterated as dicts with
    ``'type'``, ``'name'`` and ``'sub'`` keys: for ``type == 0`` the names of
    the direct ``'sub'`` entries are compared, otherwise the names one level
    further down (``sub[i]['sub']``).

    Args:
        slot_type: key of the answer list this method extends.
        evidence_slot_type: key of the answer list holding the city answers.

    Returns:
        ``self.query_answer.output[slot_type]`` itself when nothing can be
        inferred (no city answer, no province lists the city, the search
        returns nothing, or no returned document contains the text
        ``province + city``). Otherwise a new list: the current answers plus
        one ``LineOutput`` with the province as slot filler and a confidence
        score of 1. Provenance end offsets are inclusive.

    Side effects on success: the evidence document text is stored in
    ``self.sf_object.query_docs[doc_id]`` and ``inference`` is set to True on
    the first wide provenance of the city answer.

    NOTE(review): this file contains another definition of this method with
    the same name; if both sit in one class body, the later one is the one
    that takes effect.
    NOTE(review): ``country`` in this file searches with
    ``self.sf_object.lucene_searcher`` / ``lucene_analyzer`` while this
    method uses ``self.searcher`` / ``self.analyzer`` -- confirm both exist.
    """
    current_output = self.query_answer.output[slot_type]

    # find query's city answer (the last one with a slot filler wins).
    city = None
    for line_output in self.query_answer.output[evidence_slot_type]:
        if line_output.slot_filler:
            city = line_output
    if city is None:
        return current_output

    # infer province by city
    city_slot_filler = jianfan.ftoj(city.slot_filler)
    for suffix in [u'区', u'县', u'市']:
        city_slot_filler = city_slot_filler.replace(suffix, '')

    province = ''
    for entry in self.china_province_city:
        if entry['type'] == 0:
            candidates = entry['sub']
        else:
            candidates = [item for sub in entry['sub'] for item in sub['sub']]
        if city_slot_filler in [item['name'] for item in candidates]:
            province = entry['name']
            break

    # if inference fails, return original answer
    if not province:
        return current_output

    # search provenance
    wide_text = province + city_slot_filler
    found_doc_path = search(wide_text, self.searcher, self.analyzer, 50)
    if not found_doc_path:
        return current_output

    # Take the first hit that really contains the literal text. A hit without
    # it makes str.find return -1, which would produce negative offsets.
    doc_id = None
    doc = None
    wp_beg = -1
    for evidence_doc_path in found_doc_path:
        with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
            doc = doc_file.read()
        wp_beg = doc.find(wide_text)
        if wp_beg != -1:
            doc_id = evidence_doc_path.split('/')[-1].strip()
            break
    if wp_beg == -1:
        return current_output

    # add additional doc to source_doc for visualization
    self.sf_object.query_docs[doc_id] = doc

    wp_end = wp_beg + len(wide_text) - 1
    # The matched text starts with the province, so both spans share a start.
    sp_beg = wp_beg
    sp_end = sp_beg + len(province) - 1

    inferred = LineOutput()
    inferred.slot_type = slot_type
    inferred.run_id = self.query_answer.run_id

    wide_prov = Provenance()
    wide_prov.doc_id = doc_id
    wide_prov.beg = wp_beg
    wide_prov.end = wp_end
    wide_prov.text = wide_text
    inferred.wide_provenance = [wide_prov]
    # Flag the city's provenance as the basis of an inference and carry it along.
    city.wide_provenance[0].inference = True
    inferred.wide_provenance += city.wide_provenance

    inferred.slot_filler = province

    filler_prov = Provenance()
    filler_prov.doc_id = doc_id
    filler_prov.beg = sp_beg
    filler_prov.end = sp_end
    filler_prov.text = province
    inferred.slot_filler_prov = [filler_prov]

    inferred.confidence_score = 1
    return current_output + [inferred]
def country(self, slot_type, evidence_slot_type):
    """Infer a country answer from the query's province answer.

    The province is the last entry of
    ``self.query_answer.output[evidence_slot_type]`` with a non-empty
    ``slot_filler``. Its name is passed through ``jianfan.ftoj`` (presumably
    traditional-to-simplified conversion -- confirm) and looked up in
    ``self.world_coutry_province``, which is iterated by key and whose values
    are tested with ``in``; the first key whose value contains the province
    is taken as the country.

    Args:
        slot_type: key of the answer list this method extends.
        evidence_slot_type: key of the answer list holding the province answers.

    Returns:
        ``self.query_answer.output[slot_type]`` itself when nothing can be
        inferred (no province answer, no country lists the province, the
        search returns nothing, or no returned document contains the text
        ``country + province``). Otherwise a new list: the current answers
        plus one ``LineOutput`` with the country as slot filler and a
        confidence score of 1. When the converted province name contains
        u'台湾', the province answer itself is appended as well. Provenance
        end offsets are inclusive.

    Side effects on success: the evidence document text is stored in
    ``self.sf_object.query_docs[doc_id]`` and ``inference`` is set to True on
    the first wide provenance of the province answer.

    NOTE(review): this file contains another definition of this method with
    the same name; if both sit in one class body, the later one is the one
    that takes effect.
    """
    current_output = self.query_answer.output[slot_type]

    # find query's province answer (the last one with a slot filler wins).
    province = None
    for line_output in self.query_answer.output[evidence_slot_type]:
        if line_output.slot_filler:
            province = line_output
    if province is None:
        return current_output

    # infer country by province
    state_slot_filler = jianfan.ftoj(province.slot_filler)
    country_name = ''
    for candidate in self.world_coutry_province:
        if state_slot_filler in self.world_coutry_province[candidate]:
            country_name = candidate
            break

    # if inference fails, return original answer
    if not country_name:
        return current_output

    # search provenance
    wide_text = country_name + state_slot_filler
    found_doc_path = search(wide_text, self.sf_object.lucene_searcher,
                            self.sf_object.lucene_analyzer, 50)
    if not found_doc_path:
        return current_output

    # Take the first hit that really contains the literal text. A hit without
    # it makes str.find return -1, which would produce negative offsets.
    doc_id = None
    doc = None
    wp_beg = -1
    for evidence_doc_path in found_doc_path:
        with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
            doc = doc_file.read()
        wp_beg = doc.find(wide_text)
        if wp_beg != -1:
            doc_id = evidence_doc_path.split('/')[-1].strip()
            break
    if wp_beg == -1:
        return current_output

    # add additional doc to source_doc for visualization
    self.sf_object.query_docs[doc_id] = doc

    wp_end = wp_beg + len(wide_text) - 1
    # The matched text starts with the country, so both spans share a start.
    sp_beg = wp_beg
    sp_end = sp_beg + len(country_name) - 1

    inferred = LineOutput()
    inferred.slot_type = slot_type
    inferred.run_id = self.query_answer.run_id

    wide_prov = Provenance()
    wide_prov.doc_id = doc_id
    wide_prov.beg = wp_beg
    wide_prov.end = wp_end
    wide_prov.text = wide_text
    inferred.wide_provenance = [wide_prov]
    # Flag the province's provenance as the basis of an inference and carry it along.
    province.wide_provenance[0].inference = True
    inferred.wide_provenance += province.wide_provenance

    inferred.slot_filler = country_name

    filler_prov = Provenance()
    filler_prov.doc_id = doc_id
    filler_prov.beg = sp_beg
    filler_prov.end = sp_end
    filler_prov.text = country_name
    inferred.slot_filler_prov = [filler_prov]

    inferred.confidence_score = 1

    # if the province is Taiwan, the province answer is added as a country answer too
    if u'台湾' in state_slot_filler:
        return current_output + [inferred, province]
    return current_output + [inferred]
def stateorprovince(self, slot_type, evidence_slot_type):
    """Infer a province answer from the query's city answer.

    The city is the last entry of
    ``self.query_answer.output[evidence_slot_type]`` with a non-empty
    ``slot_filler``. Its name is passed through ``jianfan.ftoj`` (presumably
    traditional-to-simplified conversion -- confirm), stripped of the
    characters for district / county / city, and looked up in
    ``self.china_province_city``. That attribute is iterated as dicts with
    ``'type'``, ``'name'`` and ``'sub'`` keys: for ``type == 0`` the names of
    the direct ``'sub'`` entries are compared, otherwise the names one level
    further down (``sub[i]['sub']``).

    Args:
        slot_type: key of the answer list this method extends.
        evidence_slot_type: key of the answer list holding the city answers.

    Returns:
        ``self.query_answer.output[slot_type]`` itself when nothing can be
        inferred (no city answer, no province lists the city, the search
        returns nothing, or no returned document contains the text
        ``province + city``). Otherwise a new list: the current answers plus
        one ``LineOutput`` with the province as slot filler and a confidence
        score of 1. Provenance end offsets are inclusive.

    Side effects on success: the evidence document text is stored in
    ``self.sf_object.query_docs[doc_id]`` and ``inference`` is set to True on
    the first wide provenance of the city answer.

    NOTE(review): this file contains another definition of this method with
    the same name; if both sit in one class body, the later one is the one
    that takes effect.
    NOTE(review): ``country`` in this file searches with
    ``self.sf_object.lucene_searcher`` / ``lucene_analyzer`` while this
    method uses ``self.searcher`` / ``self.analyzer`` -- confirm both exist.
    """
    current_output = self.query_answer.output[slot_type]

    # find query's city answer (the last one with a slot filler wins).
    city = None
    for line_output in self.query_answer.output[evidence_slot_type]:
        if line_output.slot_filler:
            city = line_output
    if city is None:
        return current_output

    # infer province by city
    city_slot_filler = jianfan.ftoj(city.slot_filler)
    for suffix in [u'区', u'县', u'市']:
        city_slot_filler = city_slot_filler.replace(suffix, '')

    province = ''
    for entry in self.china_province_city:
        if entry['type'] == 0:
            candidates = entry['sub']
        else:
            candidates = [item for sub in entry['sub'] for item in sub['sub']]
        if city_slot_filler in [item['name'] for item in candidates]:
            province = entry['name']
            break

    # if inference fails, return original answer
    if not province:
        return current_output

    # search provenance
    wide_text = province + city_slot_filler
    found_doc_path = search(wide_text, self.searcher, self.analyzer, 50)
    if not found_doc_path:
        return current_output

    # Take the first hit that really contains the literal text. A hit without
    # it makes str.find return -1, which would produce negative offsets.
    doc_id = None
    doc = None
    wp_beg = -1
    for evidence_doc_path in found_doc_path:
        with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
            doc = doc_file.read()
        wp_beg = doc.find(wide_text)
        if wp_beg != -1:
            doc_id = evidence_doc_path.split('/')[-1].strip()
            break
    if wp_beg == -1:
        return current_output

    # add additional doc to source_doc for visualization
    self.sf_object.query_docs[doc_id] = doc

    wp_end = wp_beg + len(wide_text) - 1
    # The matched text starts with the province, so both spans share a start.
    sp_beg = wp_beg
    sp_end = sp_beg + len(province) - 1

    inferred = LineOutput()
    inferred.slot_type = slot_type
    inferred.run_id = self.query_answer.run_id

    wide_prov = Provenance()
    wide_prov.doc_id = doc_id
    wide_prov.beg = wp_beg
    wide_prov.end = wp_end
    wide_prov.text = wide_text
    inferred.wide_provenance = [wide_prov]
    # Flag the city's provenance as the basis of an inference and carry it along.
    city.wide_provenance[0].inference = True
    inferred.wide_provenance += city.wide_provenance

    inferred.slot_filler = province

    filler_prov = Provenance()
    filler_prov.doc_id = doc_id
    filler_prov.beg = sp_beg
    filler_prov.end = sp_end
    filler_prov.text = province
    inferred.slot_filler_prov = [filler_prov]

    inferred.confidence_score = 1
    return current_output + [inferred]
def country(self, slot_type, evidence_slot_type):
    """Infer a country answer from the query's province answer.

    The province is the last entry of
    ``self.query_answer.output[evidence_slot_type]`` with a non-empty
    ``slot_filler``. Its name is passed through ``jianfan.ftoj`` (presumably
    traditional-to-simplified conversion -- confirm) and looked up in
    ``self.world_coutry_province``, which is iterated by key and whose values
    are tested with ``in``; the first key whose value contains the province
    is taken as the country.

    Args:
        slot_type: key of the answer list this method extends.
        evidence_slot_type: key of the answer list holding the province answers.

    Returns:
        ``self.query_answer.output[slot_type]`` itself when nothing can be
        inferred (no province answer, no country lists the province, the
        search returns nothing, or no returned document contains the text
        ``country + province``). Otherwise a new list: the current answers
        plus one ``LineOutput`` with the country as slot filler and a
        confidence score of 1. When the converted province name contains
        u'台湾', the province answer itself is appended as well. Provenance
        end offsets are inclusive.

    Side effects on success: the evidence document text is stored in
    ``self.sf_object.query_docs[doc_id]`` and ``inference`` is set to True on
    the first wide provenance of the province answer.

    NOTE(review): this file contains another definition of this method with
    the same name; if both sit in one class body, the later one is the one
    that takes effect.
    """
    current_output = self.query_answer.output[slot_type]

    # find query's province answer (the last one with a slot filler wins).
    province = None
    for line_output in self.query_answer.output[evidence_slot_type]:
        if line_output.slot_filler:
            province = line_output
    if province is None:
        return current_output

    # infer country by province
    state_slot_filler = jianfan.ftoj(province.slot_filler)
    country_name = ''
    for candidate in self.world_coutry_province:
        if state_slot_filler in self.world_coutry_province[candidate]:
            country_name = candidate
            break

    # if inference fails, return original answer
    if not country_name:
        return current_output

    # search provenance
    wide_text = country_name + state_slot_filler
    found_doc_path = search(wide_text, self.sf_object.lucene_searcher,
                            self.sf_object.lucene_analyzer, 50)
    if not found_doc_path:
        return current_output

    # Take the first hit that really contains the literal text. A hit without
    # it makes str.find return -1, which would produce negative offsets.
    doc_id = None
    doc = None
    wp_beg = -1
    for evidence_doc_path in found_doc_path:
        with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
            doc = doc_file.read()
        wp_beg = doc.find(wide_text)
        if wp_beg != -1:
            doc_id = evidence_doc_path.split('/')[-1].strip()
            break
    if wp_beg == -1:
        return current_output

    # add additional doc to source_doc for visualization
    self.sf_object.query_docs[doc_id] = doc

    wp_end = wp_beg + len(wide_text) - 1
    # The matched text starts with the country, so both spans share a start.
    sp_beg = wp_beg
    sp_end = sp_beg + len(country_name) - 1

    inferred = LineOutput()
    inferred.slot_type = slot_type
    inferred.run_id = self.query_answer.run_id

    wide_prov = Provenance()
    wide_prov.doc_id = doc_id
    wide_prov.beg = wp_beg
    wide_prov.end = wp_end
    wide_prov.text = wide_text
    inferred.wide_provenance = [wide_prov]
    # Flag the province's provenance as the basis of an inference and carry it along.
    province.wide_provenance[0].inference = True
    inferred.wide_provenance += province.wide_provenance

    inferred.slot_filler = country_name

    filler_prov = Provenance()
    filler_prov.doc_id = doc_id
    filler_prov.beg = sp_beg
    filler_prov.end = sp_end
    filler_prov.text = country_name
    inferred.slot_filler_prov = [filler_prov]

    inferred.confidence_score = 1

    # if the province is Taiwan, the province answer is added as a country answer too
    if u'台湾' in state_slot_filler:
        return current_output + [inferred, province]
    return current_output + [inferred]