def stateorprovince(self, slot_type, evidence_slot_type):
        """Infer a state/province answer from the query's city answer.

        The last answer of ``evidence_slot_type`` that has a non-empty slot
        filler is used as the city.  Its filler is passed through
        ``jianfan.ftoj`` and stripped of the district/county/city suffix
        characters, then looked up in ``self.china_province_city``.  When a
        province is found, the retrieved documents are scanned for one that
        literally contains ``province + city`` so that the new answer gets
        valid character offsets.

        Args:
            slot_type: key in ``self.query_answer.output`` of the slot to fill.
            evidence_slot_type: key of the slot whose answers supply the city.

        Returns:
            The existing answers for ``slot_type`` unchanged when there is no
            city answer, no province can be inferred, the search returns
            nothing, or no retrieved document contains the phrase.  Otherwise a
            new list: the existing answers plus one inferred ``LineOutput``.

        Side effects:
            On success the supporting document is stored in
            ``self.sf_object.query_docs`` and the first wide provenance of the
            city answer is flagged with ``inference = True``.
        """
        current_output = self.query_answer.output[slot_type]

        # The last answer with a non-empty slot filler is taken as the city.
        city = None
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                city = line_output
        if city is None:
            return current_output

        # Normalise the city name before the table lookup.
        city_slot_filler = jianfan.ftoj(city.slot_filler)
        for suffix in (u'区', u'县', u'市'):
            city_slot_filler = city_slot_filler.replace(suffix, '')

        # Infer the province from the city.
        province = ''
        for entry in self.china_province_city:
            if province:
                break
            if entry['type'] == 0:
                # Entries of type 0 list their named places directly in 'sub'.
                if city_slot_filler in [item['name'] for item in entry['sub']]:
                    province = entry['name']
                    break
            else:
                # Other entries nest the named places one level deeper.
                for sub_entry in entry['sub']:
                    names = [item['name'] for item in sub_entry['sub']]
                    if city_slot_filler in names:
                        province = entry['name']
                        break

        # If inference fails, return the original answer.
        if not province:
            return current_output

        # Search for provenance.
        phrase = province + city_slot_filler
        found_doc_path = search(phrase, self.searcher, self.analyzer, 50)
        if not found_doc_path:
            return current_output

        # Use the first retrieved document that literally contains the phrase.
        # A search hit does not guarantee a contiguous match, and str.find()
        # returning -1 would otherwise yield negative provenance offsets.
        doc_id = None
        doc = None
        wp_beg = -1
        for evidence_doc_path in found_doc_path:
            with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
                candidate = doc_file.read()
            wp_beg = candidate.find(phrase)
            if wp_beg != -1:
                doc_id = evidence_doc_path.split('/')[-1].strip()
                doc = candidate
                break
        if wp_beg == -1:
            return current_output

        # Add the additional doc to the source docs for visualization.
        self.sf_object.query_docs[doc_id] = doc

        wp_end = wp_beg + len(phrase) - 1
        # The phrase starts with the province, so both spans share their start.
        sp_beg = wp_beg
        sp_end = sp_beg + len(province) - 1

        l = LineOutput()
        l.slot_type = slot_type
        l.run_id = self.query_answer.run_id

        wide_prov = Provenance()
        wide_prov.doc_id = doc_id
        wide_prov.beg = wp_beg
        wide_prov.end = wp_end
        wide_prov.text = phrase
        l.wide_provenance = [wide_prov]
        # The city answer is the evidence for this inference.
        city.wide_provenance[0].inference = True
        l.wide_provenance += city.wide_provenance

        l.slot_filler = province

        filler_prov = Provenance()
        filler_prov.doc_id = doc_id
        filler_prov.beg = sp_beg
        filler_prov.end = sp_end
        filler_prov.text = province
        l.slot_filler_prov = [filler_prov]

        l.confidence_score = 1

        return current_output + [l]
    def country(self, slot_type, evidence_slot_type):
        """Infer a country answer from the query's state/province answer.

        The last answer of ``evidence_slot_type`` that has a non-empty slot
        filler is used as the province.  Its filler is passed through
        ``jianfan.ftoj`` and looked up in ``self.world_coutry_province``; the
        first key whose value contains it becomes the country.  The retrieved
        documents are then scanned for one that literally contains
        ``country + province`` so that the new answer gets valid character
        offsets.

        Args:
            slot_type: key in ``self.query_answer.output`` of the slot to fill.
            evidence_slot_type: key of the slot whose answers supply the
                province.

        Returns:
            The existing answers for ``slot_type`` unchanged when there is no
            province answer, no country can be inferred, the search returns
            nothing, or no retrieved document contains the phrase.  Otherwise a
            new list: the existing answers plus the inferred ``LineOutput``,
            and additionally the province answer itself when the converted
            province filler contains u'台湾'.

        Side effects:
            On success the supporting document is stored in
            ``self.sf_object.query_docs`` and the first wide provenance of the
            province answer is flagged with ``inference = True``.
        """
        current_output = self.query_answer.output[slot_type]

        # The last answer with a non-empty slot filler is taken as the province.
        province = None
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                province = line_output
        if province is None:
            return current_output

        # Infer the country from the province.
        state_slot_filler = jianfan.ftoj(province.slot_filler)
        country_name = ''
        for candidate in self.world_coutry_province:
            if state_slot_filler in self.world_coutry_province[candidate]:
                country_name = candidate
                break

        # If inference fails, return the original answer.
        if not country_name:
            return current_output

        # Search for provenance.
        phrase = country_name + state_slot_filler
        found_doc_path = search(phrase,
                                self.sf_object.lucene_searcher,
                                self.sf_object.lucene_analyzer, 50)
        if not found_doc_path:
            return current_output

        # Use the first retrieved document that literally contains the phrase.
        # A search hit does not guarantee a contiguous match, and str.find()
        # returning -1 would otherwise yield negative provenance offsets.
        doc_id = None
        doc = None
        wp_beg = -1
        for evidence_doc_path in found_doc_path:
            with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
                candidate_doc = doc_file.read()
            wp_beg = candidate_doc.find(phrase)
            if wp_beg != -1:
                doc_id = evidence_doc_path.split('/')[-1].strip()
                doc = candidate_doc
                break
        if wp_beg == -1:
            return current_output

        # Add the additional doc to the source docs for visualization.
        self.sf_object.query_docs[doc_id] = doc

        wp_end = wp_beg + len(phrase) - 1
        # The phrase starts with the country, so both spans share their start.
        sp_beg = wp_beg
        sp_end = sp_beg + len(country_name) - 1

        l = LineOutput()
        l.slot_type = slot_type
        l.run_id = self.query_answer.run_id

        wide_prov = Provenance()
        wide_prov.doc_id = doc_id
        wide_prov.beg = wp_beg
        wide_prov.end = wp_end
        wide_prov.text = phrase
        l.wide_provenance = [wide_prov]
        # The province answer is the evidence for this inference.
        province.wide_provenance[0].inference = True
        l.wide_provenance += province.wide_provenance

        l.slot_filler = country_name

        filler_prov = Provenance()
        filler_prov.doc_id = doc_id
        filler_prov.beg = sp_beg
        filler_prov.end = sp_end
        filler_prov.text = country_name
        l.slot_filler_prov = [filler_prov]

        l.confidence_score = 1

        # If the province is 台湾, the country slot also receives the province
        # answer itself.
        if u'台湾' in state_slot_filler:
            return current_output + [l, province]

        return current_output + [l]
Beispiel #3
0
    def create_line_output(self, e, slot_filler, slot_filler_index, slot_type, combined_slot_filler=False, confidence_score=1):
        """Build a LineOutput for ``slot_filler`` backed by the evidence ``e``.

        Two provenances are produced: a wide one spanning the whole parsed
        sentence (first word to last word) and a narrow one spanning only the
        slot filler.  Offsets are computed in the cleaned document and then
        translated through ``self.sf_object.doc_mapping_table[doc_id]``
        (presumably cleaned-doc offset -> source-doc offset; confirm against
        the code that builds the table).

        Args:
            e: evidence object providing ``doc_id``, ``parse_result``,
                ``sent_text`` and ``trigger``.
            slot_filler: text of the answer.
            slot_filler_index: 1-based index of the filler word in
                ``parse_result['words']``; ignored when
                ``combined_slot_filler`` is true.
            slot_type: slot type stored on the result.
            combined_slot_filler: when true, the filler is located by
                searching the joined sentence text instead of by word index.
            confidence_score: confidence stored on the result.

        Returns:
            The populated ``LineOutput``.
        """
        doc_id = e.doc_id
        parse_result = e.parse_result
        words = parse_result['words']
        sentence_text = ''.join(parse_result['text'])

        line = LineOutput()
        line.slot_type = slot_type

        # Position of the sentence inside the cleaned document.
        sentence_offset = self.sf_object.cleaned_docs[doc_id].find(sentence_text)
        offset_table = self.sf_object.doc_mapping_table[doc_id]

        # --- wide provenance: the whole sentence -------------------------
        first_word_beg = int(words[0][1]['CharacterOffsetBegin'])
        # A first-word offset of 0 is treated as "offsets are relative to the
        # sentence", in which case they are shifted by the sentence position.
        offsets_are_relative = first_word_beg == 0

        span_beg = first_word_beg
        span_end = int(words[-1][1]['CharacterOffsetEnd']) - 1
        if offsets_are_relative:
            span_beg += sentence_offset
            span_end += sentence_offset

        wide = Provenance()
        wide.doc_id = doc_id
        wide.beg = offset_table[span_beg]
        wide.end = offset_table[span_end]
        wide.text = e.sent_text
        wide.trigger = e.trigger
        line.wide_provenance = [wide]

        line.slot_filler = slot_filler

        # --- narrow provenance: the slot filler only ---------------------
        if combined_slot_filler:
            span_beg = sentence_text.find(slot_filler)
            span_end = span_beg + len(slot_filler) - 1
        else:
            # Dependency-graph node indices start at the root (index 0), so
            # the word list is addressed with index - 1.
            filler_word = words[slot_filler_index - 1][1]
            span_beg = int(filler_word['CharacterOffsetBegin'])
            span_end = int(filler_word['CharacterOffsetEnd']) - 1
        if offsets_are_relative or combined_slot_filler:
            span_beg += sentence_offset
            span_end += sentence_offset

        narrow = Provenance()
        narrow.doc_id = doc_id
        narrow.beg = offset_table[span_beg]
        narrow.end = offset_table[span_end]
        narrow.text = slot_filler
        line.slot_filler_prov = [narrow]

        line.confidence_score = confidence_score

        return line
Beispiel #4
0
    def stateorprovince(self, slot_type, evidence_slot_type):
        """Infer a state/province answer from the query's city answer.

        The last answer of ``evidence_slot_type`` that has a non-empty slot
        filler is used as the city.  Its filler is passed through
        ``jianfan.ftoj`` and stripped of the district/county/city suffix
        characters, then looked up in ``self.china_province_city``.  When a
        province is found, the retrieved documents are scanned for one that
        literally contains ``province + city`` so that the new answer gets
        valid character offsets.

        Args:
            slot_type: key in ``self.query_answer.output`` of the slot to fill.
            evidence_slot_type: key of the slot whose answers supply the city.

        Returns:
            The existing answers for ``slot_type`` unchanged when there is no
            city answer, no province can be inferred, the search returns
            nothing, or no retrieved document contains the phrase.  Otherwise a
            new list: the existing answers plus one inferred ``LineOutput``.

        Side effects:
            On success the supporting document is stored in
            ``self.sf_object.query_docs`` and the first wide provenance of the
            city answer is flagged with ``inference = True``.
        """
        current_output = self.query_answer.output[slot_type]

        # The last answer with a non-empty slot filler is taken as the city.
        city = None
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                city = line_output
        if city is None:
            return current_output

        # Normalise the city name before the table lookup.
        city_slot_filler = jianfan.ftoj(city.slot_filler)
        for suffix in (u'区', u'县', u'市'):
            city_slot_filler = city_slot_filler.replace(suffix, '')

        # Infer the province from the city.
        province = ''
        for entry in self.china_province_city:
            if province:
                break
            if entry['type'] == 0:
                # Entries of type 0 list their named places directly in 'sub'.
                if city_slot_filler in [item['name'] for item in entry['sub']]:
                    province = entry['name']
                    break
            else:
                # Other entries nest the named places one level deeper.
                for sub_entry in entry['sub']:
                    names = [item['name'] for item in sub_entry['sub']]
                    if city_slot_filler in names:
                        province = entry['name']
                        break

        # If inference fails, return the original answer.
        if not province:
            return current_output

        # Search for provenance.
        phrase = province + city_slot_filler
        found_doc_path = search(phrase, self.searcher, self.analyzer, 50)
        if not found_doc_path:
            return current_output

        # Use the first retrieved document that literally contains the phrase.
        # A search hit does not guarantee a contiguous match, and str.find()
        # returning -1 would otherwise yield negative provenance offsets.
        doc_id = None
        doc = None
        wp_beg = -1
        for evidence_doc_path in found_doc_path:
            with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
                candidate = doc_file.read()
            wp_beg = candidate.find(phrase)
            if wp_beg != -1:
                doc_id = evidence_doc_path.split('/')[-1].strip()
                doc = candidate
                break
        if wp_beg == -1:
            return current_output

        # Add the additional doc to the source docs for visualization.
        self.sf_object.query_docs[doc_id] = doc

        wp_end = wp_beg + len(phrase) - 1
        # The phrase starts with the province, so both spans share their start.
        sp_beg = wp_beg
        sp_end = sp_beg + len(province) - 1

        l = LineOutput()
        l.slot_type = slot_type
        l.run_id = self.query_answer.run_id

        wide_prov = Provenance()
        wide_prov.doc_id = doc_id
        wide_prov.beg = wp_beg
        wide_prov.end = wp_end
        wide_prov.text = phrase
        l.wide_provenance = [wide_prov]
        # The city answer is the evidence for this inference.
        city.wide_provenance[0].inference = True
        l.wide_provenance += city.wide_provenance

        l.slot_filler = province

        filler_prov = Provenance()
        filler_prov.doc_id = doc_id
        filler_prov.beg = sp_beg
        filler_prov.end = sp_end
        filler_prov.text = province
        l.slot_filler_prov = [filler_prov]

        l.confidence_score = 1

        return current_output + [l]
Beispiel #5
0
    def country(self, slot_type, evidence_slot_type):
        """Infer a country answer from the query's state/province answer.

        The last answer of ``evidence_slot_type`` that has a non-empty slot
        filler is used as the province.  Its filler is passed through
        ``jianfan.ftoj`` and looked up in ``self.world_coutry_province``; the
        first key whose value contains it becomes the country.  The retrieved
        documents are then scanned for one that literally contains
        ``country + province`` so that the new answer gets valid character
        offsets.

        Args:
            slot_type: key in ``self.query_answer.output`` of the slot to fill.
            evidence_slot_type: key of the slot whose answers supply the
                province.

        Returns:
            The existing answers for ``slot_type`` unchanged when there is no
            province answer, no country can be inferred, the search returns
            nothing, or no retrieved document contains the phrase.  Otherwise a
            new list: the existing answers plus the inferred ``LineOutput``,
            and additionally the province answer itself when the converted
            province filler contains u'台湾'.

        Side effects:
            On success the supporting document is stored in
            ``self.sf_object.query_docs`` and the first wide provenance of the
            province answer is flagged with ``inference = True``.
        """
        current_output = self.query_answer.output[slot_type]

        # The last answer with a non-empty slot filler is taken as the province.
        province = None
        for line_output in self.query_answer.output[evidence_slot_type]:
            if line_output.slot_filler:
                province = line_output
        if province is None:
            return current_output

        # Infer the country from the province.
        state_slot_filler = jianfan.ftoj(province.slot_filler)
        country_name = ''
        for candidate in self.world_coutry_province:
            if state_slot_filler in self.world_coutry_province[candidate]:
                country_name = candidate
                break

        # If inference fails, return the original answer.
        if not country_name:
            return current_output

        # Search for provenance.
        phrase = country_name + state_slot_filler
        found_doc_path = search(phrase,
                                self.sf_object.lucene_searcher,
                                self.sf_object.lucene_analyzer, 50)
        if not found_doc_path:
            return current_output

        # Use the first retrieved document that literally contains the phrase.
        # A search hit does not guarantee a contiguous match, and str.find()
        # returning -1 would otherwise yield negative provenance offsets.
        doc_id = None
        doc = None
        wp_beg = -1
        for evidence_doc_path in found_doc_path:
            with io.open(evidence_doc_path, 'r', -1, 'utf-8') as doc_file:
                candidate_doc = doc_file.read()
            wp_beg = candidate_doc.find(phrase)
            if wp_beg != -1:
                doc_id = evidence_doc_path.split('/')[-1].strip()
                doc = candidate_doc
                break
        if wp_beg == -1:
            return current_output

        # Add the additional doc to the source docs for visualization.
        self.sf_object.query_docs[doc_id] = doc

        wp_end = wp_beg + len(phrase) - 1
        # The phrase starts with the country, so both spans share their start.
        sp_beg = wp_beg
        sp_end = sp_beg + len(country_name) - 1

        l = LineOutput()
        l.slot_type = slot_type
        l.run_id = self.query_answer.run_id

        wide_prov = Provenance()
        wide_prov.doc_id = doc_id
        wide_prov.beg = wp_beg
        wide_prov.end = wp_end
        wide_prov.text = phrase
        l.wide_provenance = [wide_prov]
        # The province answer is the evidence for this inference.
        province.wide_provenance[0].inference = True
        l.wide_provenance += province.wide_provenance

        l.slot_filler = country_name

        filler_prov = Provenance()
        filler_prov.doc_id = doc_id
        filler_prov.beg = sp_beg
        filler_prov.end = sp_end
        filler_prov.text = country_name
        l.slot_filler_prov = [filler_prov]

        l.confidence_score = 1

        # If the province is 台湾, the country slot also receives the province
        # answer itself.
        if u'台湾' in state_slot_filler:
            return current_output + [l, province]

        return current_output + [l]
Beispiel #6
0
    def create_line_output(self,
                           e,
                           slot_filler,
                           slot_filler_index,
                           slot_type,
                           combined_slot_filler=False,
                           confidence_score=1):
        """Build a LineOutput for ``slot_filler`` backed by the evidence ``e``.

        Two provenances are produced: a wide one spanning the whole parsed
        sentence (first word to last word) and a narrow one spanning only the
        slot filler.  Offsets are computed in the cleaned document and then
        translated through ``self.sf_object.doc_mapping_table[doc_id]``
        (presumably cleaned-doc offset -> source-doc offset; confirm against
        the code that builds the table).

        Args:
            e: evidence object providing ``doc_id``, ``parse_result``,
                ``sent_text`` and ``trigger``.
            slot_filler: text of the answer.
            slot_filler_index: 1-based index of the filler word in
                ``parse_result['words']``; ignored when
                ``combined_slot_filler`` is true.
            slot_type: slot type stored on the result.
            combined_slot_filler: when true, the filler is located by
                searching the joined sentence text instead of by word index.
            confidence_score: confidence stored on the result.

        Returns:
            The populated ``LineOutput``.
        """
        doc_id = e.doc_id
        parse_result = e.parse_result
        words = parse_result['words']
        sentence_text = ''.join(parse_result['text'])

        line = LineOutput()
        line.slot_type = slot_type

        # Position of the sentence inside the cleaned document.
        sentence_offset = self.sf_object.cleaned_docs[doc_id].find(
            sentence_text)
        offset_table = self.sf_object.doc_mapping_table[doc_id]

        # --- wide provenance: the whole sentence -------------------------
        first_word_beg = int(words[0][1]['CharacterOffsetBegin'])
        # A first-word offset of 0 is treated as "offsets are relative to the
        # sentence", in which case they are shifted by the sentence position.
        offsets_are_relative = first_word_beg == 0

        span_beg = first_word_beg
        span_end = int(words[-1][1]['CharacterOffsetEnd']) - 1
        if offsets_are_relative:
            span_beg += sentence_offset
            span_end += sentence_offset

        wide = Provenance()
        wide.doc_id = doc_id
        wide.beg = offset_table[span_beg]
        wide.end = offset_table[span_end]
        wide.text = e.sent_text
        wide.trigger = e.trigger
        line.wide_provenance = [wide]

        line.slot_filler = slot_filler

        # --- narrow provenance: the slot filler only ---------------------
        if combined_slot_filler:
            span_beg = sentence_text.find(slot_filler)
            span_end = span_beg + len(slot_filler) - 1
        else:
            # Dependency-graph node indices start at the root (index 0), so
            # the word list is addressed with index - 1.
            filler_word = words[slot_filler_index - 1][1]
            span_beg = int(filler_word['CharacterOffsetBegin'])
            span_end = int(filler_word['CharacterOffsetEnd']) - 1
        if offsets_are_relative or combined_slot_filler:
            span_beg += sentence_offset
            span_end += sentence_offset

        narrow = Provenance()
        narrow.doc_id = doc_id
        narrow.beg = offset_table[span_beg]
        narrow.end = offset_table[span_end]
        narrow.text = slot_filler
        line.slot_filler_prov = [narrow]

        line.confidence_score = confidence_score

        return line