Code Example #1
def corefers_with_title_entity(s):
    # Heuristic coreference check: a mention `s` is taken to corefer with the
    # article's title entity if, after stop-word removal, it is no longer than
    # the title and shares at least one token with it.
    # `wiki_article_title_set` comes from the enclosing scope (Code Example #2).
    s_set = get_stopwordless_token_set(s)
    is_shorter = len(s_set) <= len(wiki_article_title_set)
    has_overlap = len(
        s_set.intersection(wiki_article_title_set)) / len(
            wiki_article_title_set) > 0
    return is_shorter and has_overlap
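Below is a minimal, self-contained sketch of how this heuristic behaves. The stop-word list and the `get_stopwordless_token_set` stub are illustrative stand-ins for the project's actual tokenization, and the title "Barack Obama" is an arbitrary example; in the real code, `wiki_article_title_set` is set per article inside `extract_data` (Code Example #2).

STOPWORDS = {"the", "of", "a", "an", "in"}  # illustrative stand-in stop-word list

def get_stopwordless_token_set(text):
    # Stand-in tokenizer: lowercase, whitespace-split, stop words removed.
    return {tok.lower() for tok in text.split() if tok.lower() not in STOPWORDS}

wiki_article_title_set = get_stopwordless_token_set("Barack Obama")

def corefers_with_title_entity(s):
    s_set = get_stopwordless_token_set(s)
    is_shorter = len(s_set) <= len(wiki_article_title_set)
    has_overlap = len(s_set & wiki_article_title_set) / len(wiki_article_title_set) > 0
    return is_shorter and has_overlap

print(corefers_with_title_entity("Obama"))            # True: subset of the title tokens
print(corefers_with_title_entity("President Obama"))  # True: same size, shares "obama"
print(corefers_with_title_entity("Michelle"))         # False: no token overlap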
Code Example #2
    # Note: this method belongs to a larger dataset-creation class; the module
    # is assumed to import io, os, json, base64, pickle, pandas and
    # collections.Counter, and `self` is assumed to provide a tokenizer, a
    # keyword processor, redirect and popularity tables, and options in `opts`.
    def extract_data(self, file_name):

        # Length of the wikiextractor output prefix; stripping it later keeps
        # only the file's path relative to that directory.
        len_prefix = len(
            f"data/versions/{self.opts.data_version_name}/wikiextractor_out/{self.opts.wiki_lang_version}/"
        )

        local_mention_counter = Counter()
        out_file_names = list()
        local_entities_found_in_article = list()

        with io.open(file_name) as f:

            # Each line of a wikiextractor output file holds one article,
            # serialized as a JSON object.
            for i, wiki_article in enumerate(f.readlines()):

                wiki_article = json.loads(wiki_article)

                # Leftover debugging aids: `debug` only gates the commented-out
                # diagnostics below, and the prints report progress.
                debug = wiki_article["id"] == "28490"
                print(wiki_article["title"])
                print(file_name)
                # Stop-word-free token set of the article title, used by the
                # coreference heuristic below.
                wiki_article_title_set = get_stopwordless_token_set(
                    wiki_article["title"])

                # Same coreference heuristic as in Code Example #1, bound to
                # this article's title token set.
                def corefers_with_title_entity(s):
                    s_set = get_stopwordless_token_set(s)
                    is_shorter = len(s_set) <= len(wiki_article_title_set)
                    has_overlap = len(
                        s_set.intersection(wiki_article_title_set)) / len(
                            wiki_article_title_set) > 0
                    return is_shorter and has_overlap

                # Normalize the title and resolve it through the redirect
                # table to obtain the article's canonical entity name.
                wiki_article_normalized_wiki_entity = normalize_wiki_entity(
                    wiki_article["title"], replace_ws=True)
                wiki_article_title_entity = self.redirects_en.get(
                    wiki_article_normalized_wiki_entity,
                    wiki_article_normalized_wiki_entity,
                )

                local_entities_found_counter = Counter()

                start_offset_dict = dict()

                # The article's internal links come as a base64-encoded pickle
                # mapping (start, end) character offsets to
                # (mention, wiki_page_name); sort them by start offset so they
                # can be consumed in document order.
                links_offsets = sorted(
                    pickle.loads(
                        base64.b64decode(wiki_article["internal_links"].encode(
                            "utf-8"))).items(),
                    key=lambda x: x[0][0],
                )

                keywords_found = self.keyword_processor.extract_keywords(
                    wiki_article["text"], span_info=True)

                title_seen = False
                category_seen = False

                # print('-DOCSTART- ({} {})\n'.format(wiki_article['id'], wiki_article['title']))

                wiki_text_toks = list()
                wiki_text_toks_len = 0

                links_seen = 0
                kw_seen = 0
                inside_link = False
                entity = None
                inside_kw = False
                after_punct = True
                title_double_newline_seen = 0
                last_char = None

                reconstructed_wiki_text = ""
                current_snippet = ""

                # Skip articles without any annotated internal links.
                if len(links_offsets) == 0:
                    continue

                # Initialize the state flags in case a link or keyword match
                # starts at character offset 0.
                if links_offsets[links_seen][0][0] == (
                        len(reconstructed_wiki_text) + len(current_snippet)):
                    inside_link = True

                if len(keywords_found) > 0 and (
                        not inside_link and
                    (keywords_found[kw_seen][1] ==
                     (len(reconstructed_wiki_text) + len(current_snippet)))):
                    inside_kw = True

                # if debug: print('inside_kw', inside_kw, 'inside_link', inside_link)
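
                # Main pass: walk the article text character by character,
                # accumulating characters in `current_snippet` and flushing it
                # to `wiki_text_toks` whenever the current position reaches the
                # start or end offset of an annotated link or keyword match.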

                for char_idx, char in enumerate(list(wiki_article["text"])):

                    # Once a Category: link has been seen, stop scanning the
                    # article body.
                    if category_seen:
                        break

                    # Treat a bare newline as a sentence boundary by converting
                    # it to a period, unless the text already ended in one.
                    if char == "\n" and last_char != "." and not after_punct:
                        char = "."

                    if char == "." or last_char == ".":
                        after_punct = True
                    else:
                        after_punct = False

                    current_snippet = current_snippet + char

                    # if debug: print('wiki_text_toks', wiki_text_toks)

                    #
                    # check if *beginning* of annotated link or keyword link
                    #

                    if links_seen < len(links_offsets) and (
                            links_offsets[links_seen][0][0]
                            == (len(reconstructed_wiki_text) +
                                len(current_snippet))):

                        # clean up current snippet
                        if len(current_snippet) > 0:
                            current_snippet_tokenized = self.tokenizer.tokenize(
                                current_snippet)
                            wiki_text_toks.extend(
                                zip(
                                    current_snippet_tokenized,
                                    ["O" for _ in current_snippet_tokenized],
                                    ["-" for _ in current_snippet_tokenized],
                                ))
                            reconstructed_wiki_text += current_snippet
                            current_snippet = ""

                        # check whether the link target resolves (via redirects)
                        # to a KB-known popular entity; only then is the span
                        # treated as an annotated link
                        normalized_wiki_entity = normalize_wiki_entity(
                            links_offsets[links_seen][1][1], replace_ws=True)
                        entity = self.redirects_en.get(normalized_wiki_entity,
                                                       normalized_wiki_entity)
                        if entity in self.popular_entity_counter_dict:
                            inside_link = True
                        inside_kw = False

                    if kw_seen < len(keywords_found) and (
                            not inside_link and
                        (keywords_found[kw_seen][1]
                         == (len(reconstructed_wiki_text) +
                             len(current_snippet)))):

                        # clean up current snippet
                        if len(current_snippet) > 0:
                            current_snippet_tokenized = self.tokenizer.tokenize(
                                current_snippet)
                            wiki_text_toks.extend(
                                zip(
                                    current_snippet_tokenized,
                                    ["O" for _ in current_snippet_tokenized],
                                    ["-" for _ in current_snippet_tokenized],
                                ))
                            reconstructed_wiki_text += current_snippet
                            current_snippet = ""

                        inside_kw = True

                    #
                    # check if *end* of annotated link
                    #

                    if links_seen < len(links_offsets) and (
                            links_offsets[links_seen][0][1]
                            == (len(reconstructed_wiki_text) +
                                len(current_snippet))):

                        # ignore if it's a Category link
                        if (char_idx < len(wiki_article["text"]) - 1
                                and "Category:" in current_snippet +
                                wiki_article["text"][char_idx + 1]):
                            category_seen = True
                            continue

                        # normalized_wiki_entity = normalize_wiki_entity(links_offsets[links_seen][1][1], replace_ws=True)
                        # entity = redirects_en.get(normalized_wiki_entity, normalized_wiki_entity)

                        if inside_link:
                            current_snippet_tokenized = self.tokenizer.tokenize(
                                current_snippet)
                            wiki_text_toks.extend(
                                zip(
                                    current_snippet_tokenized,
                                    [
                                        entity
                                        for _ in current_snippet_tokenized
                                    ],
                                    [
                                        current_snippet
                                        for _ in current_snippet_tokenized
                                    ],
                                ))
                            local_entities_found_counter[entity] += 1
                            local_mention_counter[current_snippet] += 1
                            reconstructed_wiki_text += current_snippet
                            current_snippet = ""

                        links_seen += 1
                        inside_link = False
                        entity = None

                    #
                    # check if *end* of a keyword match and, if so, decide how
                    # to label the matched span
                    #

                    if kw_seen < len(keywords_found) and (
                            keywords_found[kw_seen][2]
                            == (len(reconstructed_wiki_text) +
                                len(current_snippet))):

                        # ignore if it's a Category link
                        if (char_idx < len(wiki_article["text"]) - 1
                                and "Category:" in current_snippet +
                                wiki_article["text"][char_idx + 1]):
                            category_seen = True
                            continue

                        if inside_kw:
                            current_snippet_tokenized = self.tokenizer.tokenize(
                                current_snippet)
                            # Case 1: the mention is known, the article's own
                            # entity is among its candidate entities, and it
                            # corefers with the title -> label it with the
                            # title entity.
                            if (current_snippet
                                    in self.mention_entity_counter_popular_entities
                                    and wiki_article_normalized_wiki_entity in dict(
                                        self.mention_entity_counter_popular_entities[
                                            current_snippet])
                                    and corefers_with_title_entity(current_snippet)):

                                local_mention_counter[current_snippet] += 1
                                wiki_text_toks.extend(
                                    zip(
                                        current_snippet_tokenized,
                                        [
                                            wiki_article_title_entity
                                            for _ in current_snippet_tokenized
                                        ],
                                        [
                                            current_snippet
                                            for _ in current_snippet_tokenized
                                        ],
                                    ))
                            # Case 2: the mention is known, but it cannot be
                            # tied to the title entity here -> label it UNK.
                            elif current_snippet in self.mention_entity_counter_popular_entities:
                                local_mention_counter[current_snippet] += 1
                                wiki_text_toks.extend(
                                    zip(
                                        current_snippet_tokenized,
                                        [
                                            "UNK"
                                            for _ in current_snippet_tokenized
                                        ],
                                        [
                                            current_snippet
                                            for _ in current_snippet_tokenized
                                        ],
                                    ))
                            # Case 3: unknown mention -> leave the tokens
                            # unlabeled ("O").
                            else:
                                wiki_text_toks.extend(
                                    zip(
                                        current_snippet_tokenized,
                                        [
                                            "O"
                                            for _ in current_snippet_tokenized
                                        ],
                                        [
                                            "-"
                                            for _ in current_snippet_tokenized
                                        ],
                                    ))
                            reconstructed_wiki_text += current_snippet
                            current_snippet = ""

                            inside_kw = False
                            inside_link = False
                            entity = None

                        kw_seen += 1

                    last_char = char

                # Flush whatever text remains after the last link or keyword
                # match.
                current_snippet_tokenized = self.tokenizer.tokenize(
                    current_snippet)
                wiki_text_toks.extend(
                    zip(
                        current_snippet_tokenized,
                        ["O" for _ in current_snippet_tokenized],
                        ["-" for _ in current_snippet_tokenized],
                    ))
                reconstructed_wiki_text += current_snippet

                # Write one TSV per article, mirroring the wikiextractor
                # directory layout under wiki_training/raw/tmp/.
                out_file_path = os.path.dirname(
                    f"data/versions/{self.opts.data_version_name}/wiki_training/raw/tmp/{file_name[len_prefix:]}"
                )
                if not os.path.exists(out_file_path):
                    os.makedirs(out_file_path, exist_ok=True)
                out_file_name = f"{out_file_path}/{wiki_article['id']}.tsv"
                pandas.DataFrame(wiki_text_toks).to_csv(out_file_name,
                                                        sep="\t",
                                                        header=None)
                out_file_names.append(out_file_name)
                local_entities_found_in_article.append(
                    (out_file_name, local_entities_found_counter))
        return out_file_names, local_mention_counter, local_entities_found_in_article
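For orientation, here is a rough sketch of one per-article TSV that this method writes; the tokens and entity names are invented for illustration. The file is written without a header row, and the leading column is the pandas index. The method itself returns the list of written file names, a Counter over the mention strings it labeled, and, per file, a Counter of entities found via annotated links.

# Hypothetical rows of data/versions/<version>/wiki_training/raw/tmp/.../<article_id>.tsv
# (tab-separated: index, token, entity label, surface mention):
#
#   0    Berlin     Berlin     Berlin
#   1    is         O          -
#   2    the        O          -
#   3    capital    O          -
#   4    of         O          -
#   5    Germany    Germany    Germany
#
# Entity labels are "O" for plain text, an entity name for annotated links and
# title-coreferent keyword matches, or "UNK" for known mentions that could not
# be disambiguated.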