Example #1
    def retrieve_query_doc(self):
        for q in self.queries:
            q_results = dict()
            print(q.name + '...'),  # Python 2 print statement; the trailing comma suppresses the newline
            found_doc_path = search(q.name, self.lucene_searcher, self.lucene_analyzer, 3000)
            for doc_path in found_doc_path:
                doc_id = doc_path.split('/')[-1].strip()
                with io.open(doc_path, 'r', -1, 'utf-8') as f:
                    doc_content = f.read()
                # require at least four whitespace-insensitive mentions of the query name
                if doc_content.replace(' ', '').replace('\n', '').count(q.name) < 4:
                    continue
                q_results[doc_id] = doc_content

                # clean doc
                cleaned_doc = remove_doc_noise(doc_content)  # remove tags like datetime, headline, dateline, etc.
                cleaned_doc = remove_xml_tag(cleaned_doc)
                cleaned_doc = remove_space_linebreak(cleaned_doc)

                # create offset mapping table between the cleaned doc and the original doc
                if doc_id in self.doc_mapping_table:
                    continue
                offset_mapping_table = OrderedDict()
                cleaned_doc_index = 0
                origin_doc_index = 0
                # two-pointer scan: advance through the original doc until each
                # cleaned character matches, recording its original offset
                for char in cleaned_doc:
                    while True:
                        if char != doc_content[origin_doc_index]:
                            origin_doc_index += 1
                        else:
                            offset_mapping_table[cleaned_doc_index] = origin_doc_index
                            cleaned_doc_index += 1
                            origin_doc_index += 1
                            break
                # check correctness of offset mapping table
                for clean_i, orig_i in offset_mapping_table.items():
                    assert cleaned_doc[clean_i] == doc_content[orig_i]

                self.doc_mapping_table[doc_id] = offset_mapping_table
                self.cleaned_docs[doc_id] = cleaned_doc

            self.query_docs[q.id] = q_results

            print('Done')
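The offset-mapping loop above is the core of this example: it walks the cleaned and original documents in lockstep so that spans found in the cleaned text can be projected back onto original-document offsets. A minimal, self-contained sketch of the same two-pointer idea (the function name and sample strings are illustrative, not from the original):

    from collections import OrderedDict

    def build_offset_mapping(original, cleaned):
        """Map each index in `cleaned` back to the index of the same
        character in `original`, assuming `cleaned` is `original` with
        some characters removed but none reordered."""
        mapping = OrderedDict()
        orig_i = 0
        for clean_i, char in enumerate(cleaned):
            # advance through the original until the characters line up
            while original[orig_i] != char:
                orig_i += 1
            mapping[clean_i] = orig_i
            orig_i += 1
        return mapping

    # usage: every mapped character must agree across the two strings
    original = 'a b\nc'
    cleaned = 'abc'
    table = build_offset_mapping(original, cleaned)
    assert all(cleaned[i] == original[j] for i, j in table.items())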
Example #2
    def general_inference(query_system_answer):
        # load sf source document; `version` is taken from the enclosing scope
        if version == 'ldc':
            src_doc = load_sf_src_doc(
                '../../data/LDC/LDC2014E123_TAC_KBP_2014_Chinese_Regular_Slot_Filling_Training_Data/'
                'data/source_doc/')
        elif version == 'eval':
            src_doc = load_sf_src_doc(
                '../../data/LDC/KBP_2015_Chinese_Regular_Slot_Filling_Evaluation_Data/source_doc/'
            )
        else:
            raise ValueError('unexpected version: %s' % version)

        # ============ remove slot fillers adjacent to 记者 ('reporter') ============ #
        for slot_type in query_system_answer.output.keys():
            line_outputs = query_system_answer.output[slot_type]

            corrected_line_outputs = []
            for l in line_outputs:
                if not l.slot_filler:
                    corrected_line_outputs.append(l)
                    continue
                void_l = False
                for w_p in l.wide_provenance:
                    # offsets are inclusive, matching the `end + 1` slices used below
                    w_p_text = src_doc[w_p.doc_id][w_p.beg:w_p.end + 1]
                    if (u'记者' + l.slot_filler) in w_p_text or (
                            l.slot_filler + u'记者') in w_p_text:
                        void_l = True
                        break
                if not void_l:
                    corrected_line_outputs.append(l)

            query_system_answer.output[slot_type] = corrected_line_outputs

        # ================== fix slot filler offset ===================== #
        for slot_type in query_system_answer.output.keys():
            if 'date' in slot_type:
                continue

            line_outputs = query_system_answer.output[slot_type]

            for l in line_outputs:
                if not l.slot_filler:
                    continue
                for i in xrange(len(l.slot_filler_prov)):
                    try:
                        s_p = l.slot_filler_prov[i]
                        s_p_text = src_doc[s_p.doc_id][s_p.beg:s_p.end + 1]
                        if l.slot_filler == remove_space_linebreak(s_p_text):
                            continue
                        w_p = l.wide_provenance[i]
                        w_p_text = src_doc[w_p.doc_id][w_p.beg:w_p.end + 1]

                        # NOTE: find() locates the first occurrence of the window
                        # text, which may differ from w_p.beg if the text repeats
                        correct_w_p_index = src_doc[w_p.doc_id].find(w_p_text)
                        s_p_inner_index = w_p_text.find(l.slot_filler)
                        correct_s_p_beg = correct_w_p_index + s_p_inner_index
                        correct_s_p_end = correct_s_p_beg + len(
                            l.slot_filler) - 1

                        corrected_s_p_text = src_doc[
                            s_p.doc_id][correct_s_p_beg:correct_s_p_end + 1]
                        if corrected_s_p_text != l.slot_filler:
                            continue

                        l.slot_filler_prov[i].beg = correct_s_p_beg
                        l.slot_filler_prov[i].end = correct_s_p_end
                    except IndexError:
                        continue

        return query_system_answer
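The offset-repair step above re-derives a slot filler's document offsets from its wide-provenance window rather than trusting the offsets the system emitted. A minimal standalone sketch under the same inclusive-offset convention (the function name and sample text are illustrative, not from the original):

    def fix_filler_offsets(doc_text, filler, wide_beg, wide_end):
        """Recompute inclusive [beg, end] offsets of `filler` inside
        `doc_text`, anchored on the wide-provenance window; returns
        None when the filler cannot be located."""
        window = doc_text[wide_beg:wide_end + 1]
        inner = window.find(filler)
        if inner == -1:
            return None
        # like the code above, this trusts the first occurrence of the
        # window text in the document
        beg = doc_text.find(window) + inner
        end = beg + len(filler) - 1
        if doc_text[beg:end + 1] != filler:  # sanity check
            return None
        return beg, end

    doc = u'本报记者报道，张三在北京出生。'
    print(fix_filler_offsets(doc, u'北京', 7, 14))  # -> (10, 11)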