def create_line_output(self, e, slot_filler, slot_filler_index, slot_type, combined_slot_filler=False, confidence_score=1):
    """Build a LineOutput (answer line) for one extracted slot filler.

    Creates two provenance records: a "wide" one covering the whole evidence
    sentence, and a narrow one covering just the slot filler, both remapped
    from cleaned-document character offsets to original-document offsets via
    self.sf_object.doc_mapping_table.

    :param e: evidence object; provides doc_id, parse_result, sent_text, trigger
    :param slot_filler: the answer string
    :param slot_filler_index: 1-based token index of the filler in the
        dependency graph (index 0 is the artificial root)
    :param slot_type: slot type label copied onto the output line
    :param combined_slot_filler: if True, the filler spans several tokens and
        its offsets are located by substring search in the sentence text
        instead of by token index
    :param confidence_score: confidence copied onto the output line (default 1)
    :return: a populated LineOutput
    """
    doc_id = e.doc_id
    parse_result = e.parse_result
    l = LineOutput()
    l.slot_type = slot_type
    # Character offset of this sentence within the cleaned document text;
    # used to turn sentence-relative offsets into document-relative ones.
    # NOTE(review): find() returns -1 if the sentence text is absent from the
    # cleaned doc — presumably that never happens here; verify upstream.
    evidence_offset_beg = self.sf_object.cleaned_docs[doc_id].find(''.join(
        parse_result['text']))
    # Wide provenance: the whole evidence sentence.
    w_p = Provenance()
    w_p.doc_id = doc_id
    cleaned_doc_beg = int(
        parse_result['words'][0][1]['CharacterOffsetBegin'])
    cleaned_doc_end = int(
        parse_result['words'][-1][1]['CharacterOffsetEnd']) - 1
    # If token offsets start at 0 they are sentence-relative, so shift them
    # by the sentence's position in the document.
    if cleaned_doc_beg == 0:
        cleaned_doc_beg += evidence_offset_beg
        cleaned_doc_end += evidence_offset_beg
    # Remap cleaned-document offsets to original-document offsets.
    w_p.beg = self.sf_object.doc_mapping_table[doc_id][cleaned_doc_beg]
    w_p.end = self.sf_object.doc_mapping_table[doc_id][cleaned_doc_end]
    w_p.text = e.sent_text
    w_p.trigger = e.trigger
    l.wide_provenance = [w_p]
    l.slot_filler = slot_filler
    # Narrow provenance: just the slot filler itself.
    sf_p = Provenance()
    sf_p.doc_id = doc_id
    if combined_slot_filler:
        # Multi-token filler: locate it by substring search in the sentence.
        cleaned_doc_beg = ''.join(parse_result['text']).find(slot_filler)
        cleaned_doc_end = cleaned_doc_beg + len(slot_filler) - 1
    else:
        # Node indices in the dependency graph need -1 because index 0 is
        # the artificial root node, so token k lives at words[k - 1].
        cleaned_doc_beg = int(
            parse_result['words'][slot_filler_index - 1][1]['CharacterOffsetBegin'])
        cleaned_doc_end = int(
            parse_result['words'][slot_filler_index - 1][1]['CharacterOffsetEnd']) - 1
    # Same sentence-relative -> document-relative shift as above; the
    # combined case is always sentence-relative because it came from find().
    if int(parse_result['words'][0][1]
           ['CharacterOffsetBegin']) == 0 or combined_slot_filler:
        cleaned_doc_beg += evidence_offset_beg
        cleaned_doc_end += evidence_offset_beg
    sf_p.beg = self.sf_object.doc_mapping_table[doc_id][cleaned_doc_beg]
    sf_p.end = self.sf_object.doc_mapping_table[doc_id][cleaned_doc_end]
    sf_p.text = slot_filler
    l.slot_filler_prov = [sf_p]
    l.confidence_score = confidence_score
    return l
def create_line_output(self, e, slot_filler, slot_filler_index, slot_type, combined_slot_filler=False, confidence_score=1):
    """Assemble a LineOutput answer line for one extracted slot filler.

    Produces a wide provenance (the whole evidence sentence) and a narrow
    provenance (the filler span), both translated from cleaned-document
    offsets to original-document offsets through the doc mapping table.

    :param e: evidence object carrying doc_id, parse_result, sent_text, trigger
    :param slot_filler: the answer string
    :param slot_filler_index: 1-based token index in the dependency graph
        (index 0 is the root), used when the filler is a single token
    :param slot_type: slot type label for the output line
    :param combined_slot_filler: when True, locate the multi-token filler by
        substring search in the sentence text rather than by token index
    :param confidence_score: confidence attached to the line (default 1)
    :return: a populated LineOutput
    """
    doc_id = e.doc_id
    parse_result = e.parse_result
    words = parse_result['words']
    sentence_text = ''.join(parse_result['text'])
    mapping = self.sf_object.doc_mapping_table[doc_id]
    # Where this sentence starts inside the cleaned document.
    evidence_offset_beg = self.sf_object.cleaned_docs[doc_id].find(sentence_text)

    out = LineOutput()
    out.slot_type = slot_type

    # Wide provenance: span of the whole evidence sentence.
    sent_beg = int(words[0][1]['CharacterOffsetBegin'])
    sent_end = int(words[-1][1]['CharacterOffsetEnd']) - 1
    # Offsets that start at 0 are sentence-relative; shift to doc-relative.
    if sent_beg == 0:
        sent_beg += evidence_offset_beg
        sent_end += evidence_offset_beg
    wide = Provenance()
    wide.doc_id = doc_id
    wide.beg = mapping[sent_beg]
    wide.end = mapping[sent_end]
    wide.text = e.sent_text
    wide.trigger = e.trigger
    out.wide_provenance = [wide]

    out.slot_filler = slot_filler

    # Narrow provenance: span of just the slot filler.
    if combined_slot_filler:
        filler_beg = sentence_text.find(slot_filler)
        filler_end = filler_beg + len(slot_filler) - 1
    else:
        # Dependency-graph node indices start at 1 (0 is the root), so the
        # token for node k sits at words[k - 1].
        token_attrs = words[slot_filler_index - 1][1]
        filler_beg = int(token_attrs['CharacterOffsetBegin'])
        filler_end = int(token_attrs['CharacterOffsetEnd']) - 1
    # Combined spans came from find() and are always sentence-relative;
    # token spans are sentence-relative only when the sentence starts at 0.
    if int(words[0][1]['CharacterOffsetBegin']) == 0 or combined_slot_filler:
        filler_beg += evidence_offset_beg
        filler_end += evidence_offset_beg
    narrow = Provenance()
    narrow.doc_id = doc_id
    narrow.beg = mapping[filler_beg]
    narrow.end = mapping[filler_end]
    narrow.text = slot_filler
    out.slot_filler_prov = [narrow]

    out.confidence_score = confidence_score
    return out
def stateorprovince(self, slot_type, evidence_slot_type):
    """Infer a Chinese state/province answer from an already-found city answer.

    Looks the city up in the province/city gazetteer, then searches the
    corpus for a document containing "<province><city>" to use as provenance.

    :param slot_type: slot type of the answer being inferred (province)
    :param evidence_slot_type: slot type holding the city evidence
    :return: the existing output list, plus one inferred LineOutput when the
        inference and provenance search both succeed
    """
    current_output = self.query_answer.output[slot_type]
    city = None
    # Find the query's city answer (the last non-empty one wins, as before).
    for line_output in self.query_answer.output[evidence_slot_type]:
        if line_output.slot_filler:
            city = line_output
    if city is None:
        return current_output
    # Infer the province from the city via the gazetteer.
    province = ''
    evidence = None  # LineOutput backing the inference
    city_slot_filler = city.slot_filler
    city_slot_filler = jianfan.ftoj(city_slot_filler)  # traditional -> simplified
    # Strip administrative suffixes (district / county / city) before lookup.
    for r in [u'区', u'县', u'市']:
        city_slot_filler = city_slot_filler.replace(r, '')
    for p in self.china_province_city:
        if province:
            break
        if p['type'] == 0:
            # Municipality-style entry: subdivisions sit directly under it.
            if city_slot_filler in [item['name'] for item in p['sub']]:
                province = p['name']
                evidence = city
                break
        else:
            # Regular province: cities are one level down.
            for c in p['sub']:
                if city_slot_filler in [item['name'] for item in c['sub']]:
                    province = p['name']
                    evidence = city
                    break
    # If inference fails, return the original answer unchanged.
    if not province:
        return current_output
    # Search for a document mentioning the combined phrase as provenance.
    found_doc_path = search(province + city_slot_filler, self.searcher,
                            self.analyzer, 50)
    if not found_doc_path:
        return current_output
    evidence_doc_path = found_doc_path[0]
    # Add the extra doc to source docs for visualization.
    doc_id = evidence_doc_path.split('/')[-1].strip()
    # Use a context manager so the file handle is closed (the original
    # io.open(...).read() leaked the handle).
    with io.open(evidence_doc_path, 'r', -1, 'utf-8') as f:
        doc = f.read()
    self.sf_object.query_docs[doc_id] = doc
    wp_beg = doc.find(province + city_slot_filler)
    if wp_beg == -1:
        # The search hit matched on tokens but the exact phrase is absent;
        # offsets would be garbage, so keep the original answer.
        return current_output
    wp_end = wp_beg + len(province + city_slot_filler) - 1
    sp_beg = wp_beg + doc[wp_beg:wp_end + 1].find(province)
    sp_end = sp_beg + len(province) - 1
    l = LineOutput()
    l.slot_type = slot_type
    l.run_id = self.query_answer.run_id
    p = Provenance()
    p.doc_id = doc_id
    p.beg = wp_beg
    p.end = wp_end
    p.text = province + city_slot_filler
    l.wide_provenance = [p]
    # Mark the city evidence as inference support and attach it too.
    evidence.wide_provenance[0].inference = True
    l.wide_provenance += evidence.wide_provenance
    l.slot_filler = province
    p = Provenance()
    p.doc_id = doc_id
    p.beg = sp_beg
    p.end = sp_end
    p.text = province
    l.slot_filler_prov = [p]
    l.confidence_score = 1
    return current_output + [l]
def country(self, slot_type, evidence_slot_type):
    """Infer a country answer from an already-found state/province answer.

    Looks the province up in the world country/province table, then searches
    the corpus for a document containing "<country><province>" as provenance.

    :param slot_type: slot type of the answer being inferred (country)
    :param evidence_slot_type: slot type holding the province evidence
    :return: the existing output list, plus the inferred LineOutput(s) when
        the inference and provenance search both succeed
    """
    current_output = self.query_answer.output[slot_type]
    province = None
    # Find the query's province answer (the last non-empty one wins).
    for line_output in self.query_answer.output[evidence_slot_type]:
        if line_output.slot_filler:
            province = line_output
    if province is None:
        return current_output
    # Infer the country from the province.
    country = ''
    evidence = None  # LineOutput backing the inference
    state_slot_filler = jianfan.ftoj(province.slot_filler)  # traditional -> simplified
    # NOTE: attribute name keeps the historical spelling "coutry" used elsewhere.
    for c in self.world_coutry_province:
        if state_slot_filler in self.world_coutry_province[c]:
            country = c
            evidence = province
            break
    # If inference fails, return the original answer unchanged.
    if not country:
        return current_output
    # Search for a document mentioning the combined phrase as provenance.
    found_doc_path = search(country + state_slot_filler,
                            self.sf_object.lucene_searcher,
                            self.sf_object.lucene_analyzer, 50)
    if not found_doc_path:
        return current_output
    evidence_doc_path = found_doc_path[0]
    # Add the extra doc to source docs for visualization.
    doc_id = evidence_doc_path.split('/')[-1].strip()
    # Use a context manager so the file handle is closed (the original
    # io.open(...).read() leaked the handle).
    with io.open(evidence_doc_path, 'r', -1, 'utf-8') as f:
        doc = f.read()
    self.sf_object.query_docs[doc_id] = doc
    wp_beg = doc.find(country + state_slot_filler)
    if wp_beg == -1:
        # The search hit matched on tokens but the exact phrase is absent;
        # offsets would be garbage, so keep the original answer.
        return current_output
    wp_end = wp_beg + len(country + state_slot_filler) - 1
    sp_beg = wp_beg + doc[wp_beg:wp_end + 1].find(country)
    sp_end = sp_beg + len(country) - 1
    l = LineOutput()
    l.slot_type = slot_type
    l.run_id = self.query_answer.run_id
    p = Provenance()
    p.doc_id = doc_id
    p.beg = wp_beg
    p.end = wp_end
    p.text = country + state_slot_filler
    l.wide_provenance = [p]
    # Mark the province evidence as inference support and attach it too.
    evidence.wide_provenance[0].inference = True
    l.wide_provenance += evidence.wide_provenance
    l.slot_filler = country
    p = Provenance()
    p.doc_id = doc_id
    p.beg = sp_beg
    p.end = sp_end
    p.text = country
    l.slot_filler_prov = [p]
    l.confidence_score = 1
    # If the province is Taiwan, the country output should also include it.
    if u'台湾' in jianfan.ftoj(province.slot_filler):
        return current_output + [l, province]
    return current_output + [l]
def stateorprovince(self, slot_type, evidence_slot_type):
    """Infer a Chinese state/province answer from an already-found city answer.

    Looks the city up in the province/city gazetteer, then searches the
    corpus for a document containing "<province><city>" to use as provenance.

    :param slot_type: slot type of the answer being inferred (province)
    :param evidence_slot_type: slot type holding the city evidence
    :return: the existing output list, plus one inferred LineOutput when the
        inference and provenance search both succeed
    """
    current_output = self.query_answer.output[slot_type]
    city = None
    # Find the query's city answer (the last non-empty one wins, as before).
    for line_output in self.query_answer.output[evidence_slot_type]:
        if line_output.slot_filler:
            city = line_output
    if city is None:
        return current_output
    # Infer the province from the city via the gazetteer.
    province = ''
    evidence = None  # LineOutput backing the inference
    city_slot_filler = city.slot_filler
    city_slot_filler = jianfan.ftoj(city_slot_filler)  # traditional -> simplified
    # Strip administrative suffixes (district / county / city) before lookup.
    for r in [u'区', u'县', u'市']:
        city_slot_filler = city_slot_filler.replace(r, '')
    for p in self.china_province_city:
        if province:
            break
        if p['type'] == 0:
            # Municipality-style entry: subdivisions sit directly under it.
            if city_slot_filler in [item['name'] for item in p['sub']]:
                province = p['name']
                evidence = city
                break
        else:
            # Regular province: cities are one level down.
            for c in p['sub']:
                if city_slot_filler in [item['name'] for item in c['sub']]:
                    province = p['name']
                    evidence = city
                    break
    # If inference fails, return the original answer unchanged.
    if not province:
        return current_output
    # Search for a document mentioning the combined phrase as provenance.
    found_doc_path = search(province + city_slot_filler, self.searcher,
                            self.analyzer, 50)
    if not found_doc_path:
        return current_output
    evidence_doc_path = found_doc_path[0]
    # Add the extra doc to source docs for visualization.
    doc_id = evidence_doc_path.split('/')[-1].strip()
    # Use a context manager so the file handle is closed (the original
    # io.open(...).read() leaked the handle).
    with io.open(evidence_doc_path, 'r', -1, 'utf-8') as f:
        doc = f.read()
    self.sf_object.query_docs[doc_id] = doc
    wp_beg = doc.find(province + city_slot_filler)
    if wp_beg == -1:
        # The search hit matched on tokens but the exact phrase is absent;
        # offsets would be garbage, so keep the original answer.
        return current_output
    wp_end = wp_beg + len(province + city_slot_filler) - 1
    sp_beg = wp_beg + doc[wp_beg:wp_end + 1].find(province)
    sp_end = sp_beg + len(province) - 1
    l = LineOutput()
    l.slot_type = slot_type
    l.run_id = self.query_answer.run_id
    p = Provenance()
    p.doc_id = doc_id
    p.beg = wp_beg
    p.end = wp_end
    p.text = province + city_slot_filler
    l.wide_provenance = [p]
    # Mark the city evidence as inference support and attach it too.
    evidence.wide_provenance[0].inference = True
    l.wide_provenance += evidence.wide_provenance
    l.slot_filler = province
    p = Provenance()
    p.doc_id = doc_id
    p.beg = sp_beg
    p.end = sp_end
    p.text = province
    l.slot_filler_prov = [p]
    l.confidence_score = 1
    return current_output + [l]
def country(self, slot_type, evidence_slot_type):
    """Infer a country answer from an already-found state/province answer.

    Looks the province up in the world country/province table, then searches
    the corpus for a document containing "<country><province>" as provenance.

    :param slot_type: slot type of the answer being inferred (country)
    :param evidence_slot_type: slot type holding the province evidence
    :return: the existing output list, plus the inferred LineOutput(s) when
        the inference and provenance search both succeed
    """
    current_output = self.query_answer.output[slot_type]
    province = None
    # Find the query's province answer (the last non-empty one wins).
    for line_output in self.query_answer.output[evidence_slot_type]:
        if line_output.slot_filler:
            province = line_output
    if province is None:
        return current_output
    # Infer the country from the province.
    country = ''
    evidence = None  # LineOutput backing the inference
    state_slot_filler = jianfan.ftoj(province.slot_filler)  # traditional -> simplified
    # NOTE: attribute name keeps the historical spelling "coutry" used elsewhere.
    for c in self.world_coutry_province:
        if state_slot_filler in self.world_coutry_province[c]:
            country = c
            evidence = province
            break
    # If inference fails, return the original answer unchanged.
    if not country:
        return current_output
    # Search for a document mentioning the combined phrase as provenance.
    found_doc_path = search(country + state_slot_filler,
                            self.sf_object.lucene_searcher,
                            self.sf_object.lucene_analyzer, 50)
    if not found_doc_path:
        return current_output
    evidence_doc_path = found_doc_path[0]
    # Add the extra doc to source docs for visualization.
    doc_id = evidence_doc_path.split('/')[-1].strip()
    # Use a context manager so the file handle is closed (the original
    # io.open(...).read() leaked the handle).
    with io.open(evidence_doc_path, 'r', -1, 'utf-8') as f:
        doc = f.read()
    self.sf_object.query_docs[doc_id] = doc
    wp_beg = doc.find(country + state_slot_filler)
    if wp_beg == -1:
        # The search hit matched on tokens but the exact phrase is absent;
        # offsets would be garbage, so keep the original answer.
        return current_output
    wp_end = wp_beg + len(country + state_slot_filler) - 1
    sp_beg = wp_beg + doc[wp_beg:wp_end + 1].find(country)
    sp_end = sp_beg + len(country) - 1
    l = LineOutput()
    l.slot_type = slot_type
    l.run_id = self.query_answer.run_id
    p = Provenance()
    p.doc_id = doc_id
    p.beg = wp_beg
    p.end = wp_end
    p.text = country + state_slot_filler
    l.wide_provenance = [p]
    # Mark the province evidence as inference support and attach it too.
    evidence.wide_provenance[0].inference = True
    l.wide_provenance += evidence.wide_provenance
    l.slot_filler = country
    p = Provenance()
    p.doc_id = doc_id
    p.beg = sp_beg
    p.end = sp_end
    p.text = country
    l.slot_filler_prov = [p]
    l.confidence_score = 1
    # If the province is Taiwan, the country output should also include it.
    if u'台湾' in jianfan.ftoj(province.slot_filler):
        return current_output + [l, province]
    return current_output + [l]