import base64
import io
import json
import os
import pickle
from collections import Counter

import pandas
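# NOTE: get_stopwordless_token_set and normalize_wiki_entity are assumed to be
# defined elsewhere in this repo. The sketches below are minimal placeholders
# inferred from how extract_data uses them, not the actual implementations.

# Assumption: the real stopword list is presumably much larger (e.g. NLTK's).
_STOPWORDS = {"the", "a", "an", "and", "or", "of", "in", "on", "to", "for"}


def get_stopwordless_token_set(s):
    """Lowercase, whitespace-tokenize, and drop stopwords (sketch)."""
    return {tok for tok in s.lower().split() if tok not in _STOPWORDS}


def normalize_wiki_entity(s, replace_ws=False):
    """Normalize a wiki title; optionally use underscores for spaces (sketch)."""
    s = s.strip()
    if replace_ws:
        s = s.replace(" ", "_")
    return s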
def extract_data(self, file_name):
    len_prefix = len(
        f"data/versions/{self.opts.data_version_name}/wikiextractor_out/{self.opts.wiki_lang_version}/"
    )
    local_mention_counter = Counter()
    out_file_names = list()
    local_entities_found_in_article = list()
    with io.open(file_name) as f:
        for i, wiki_article in enumerate(f.readlines()):
            wiki_article = json.loads(wiki_article)
            wiki_article_title_set = get_stopwordless_token_set(
                wiki_article["title"])

            def corefers_with_title_entity(s):
                # A mention corefers with the title entity if it is no longer
                # than the title (ignoring stopwords) and shares at least one
                # token with it.
                s_set = get_stopwordless_token_set(s)
                is_shorter = len(s_set) <= len(wiki_article_title_set)
                has_overlap = len(
                    s_set.intersection(wiki_article_title_set)) / len(
                        wiki_article_title_set) > 0
                return is_shorter and has_overlap

            wiki_article_normalized_wiki_entity = normalize_wiki_entity(
                wiki_article["title"], replace_ws=True)
            wiki_article_title_entity = self.redirects_en.get(
                wiki_article_normalized_wiki_entity,
                wiki_article_normalized_wiki_entity,
            )
            local_entities_found_counter = Counter()

            # Annotated internal links, decoded and sorted by start offset.
            links_offsets = sorted(
                pickle.loads(
                    base64.b64decode(
                        wiki_article["internal_links"].encode("utf-8"))
                ).items(),
                key=lambda x: x[0][0],
            )
            keywords_found = self.keyword_processor.extract_keywords(
                wiki_article["text"], span_info=True)

            category_seen = False
            wiki_text_toks = list()
            links_seen = 0
            kw_seen = 0
            inside_link = False
            entity = None
            inside_kw = False
            after_punct = True
            last_char = None
            reconstructed_wiki_text = ""
            current_snippet = ""
            if len(links_offsets) == 0:
                continue
            # Handle a link or keyword match that starts at offset 0.
            if links_offsets[links_seen][0][0] == (
                    len(reconstructed_wiki_text) + len(current_snippet)):
                inside_link = True
            if len(keywords_found) > 0 and (
                    not inside_link
                    and (keywords_found[kw_seen][1] ==
                         (len(reconstructed_wiki_text) +
                          len(current_snippet)))):
                inside_kw = True

            for char_idx, char in enumerate(list(wiki_article["text"])):
                if category_seen:
                    break
                # Treat a newline as sentence-final punctuation unless the
                # preceding character already ended the sentence.
                if char == "\n" and last_char != "." and not after_punct:
                    char = "."
                if char == "." or last_char == ".":
                    after_punct = True
                else:
                    after_punct = False
                current_snippet = current_snippet + char
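                # Invariant: len(reconstructed_wiki_text) + len(current_snippet)
                # equals the number of characters consumed so far (the
                # newline -> "." substitution preserves length), so the
                # character offsets stored with links and keyword matches can
                # be compared against it directly.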
                #
                # Check if *beginning* of annotated link or keyword link.
                #
                if links_seen < len(links_offsets) and (
                        links_offsets[links_seen][0][0] ==
                        (len(reconstructed_wiki_text) + len(current_snippet))):
                    # Flush the current snippet as unlabeled ("O") tokens.
                    if len(current_snippet) > 0:
                        current_snippet_tokenized = self.tokenizer.tokenize(
                            current_snippet)
                        wiki_text_toks.extend(
                            zip(
                                current_snippet_tokenized,
                                ["O" for _ in current_snippet_tokenized],
                                ["-" for _ in current_snippet_tokenized],
                            ))
                        reconstructed_wiki_text += current_snippet
                        current_snippet = ""
                    # Check if the link target is an entity known to the KB.
                    normalized_wiki_entity = normalize_wiki_entity(
                        links_offsets[links_seen][1][1], replace_ws=True)
                    entity = self.redirects_en.get(normalized_wiki_entity,
                                                   normalized_wiki_entity)
                    if entity in self.popular_entity_counter_dict:
                        inside_link = True
                        inside_kw = False
                if kw_seen < len(keywords_found) and (
                        not inside_link
                        and (keywords_found[kw_seen][1] ==
                             (len(reconstructed_wiki_text) +
                              len(current_snippet)))):
                    # Flush the current snippet as unlabeled ("O") tokens.
                    if len(current_snippet) > 0:
                        current_snippet_tokenized = self.tokenizer.tokenize(
                            current_snippet)
                        wiki_text_toks.extend(
                            zip(
                                current_snippet_tokenized,
                                ["O" for _ in current_snippet_tokenized],
                                ["-" for _ in current_snippet_tokenized],
                            ))
                        reconstructed_wiki_text += current_snippet
                        current_snippet = ""
                    inside_kw = True

                #
                # Check if *end* of annotated link.
                #
                if links_seen < len(links_offsets) and (
                        links_offsets[links_seen][0][1] ==
                        (len(reconstructed_wiki_text) + len(current_snippet))):
                    # Skip the rest of the article once a Category link starts.
                    if (char_idx < len(wiki_article["text"]) - 1
                            and "Category:" in current_snippet +
                            wiki_article["text"][char_idx + 1]):
                        category_seen = True
                        continue
                    if inside_link:
                        current_snippet_tokenized = self.tokenizer.tokenize(
                            current_snippet)
                        wiki_text_toks.extend(
                            zip(
                                current_snippet_tokenized,
                                [entity for _ in current_snippet_tokenized],
                                [
                                    current_snippet
                                    for _ in current_snippet_tokenized
                                ],
                            ))
                        local_entities_found_counter[entity] += 1
                        local_mention_counter[current_snippet] += 1
                        reconstructed_wiki_text += current_snippet
                        current_snippet = ""
                    # Advance even if the link was not accepted as a KB
                    # entity, otherwise later links would never be processed.
                    links_seen += 1
                    inside_link = False
                    entity = None
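                # Keyword matches (from self.keyword_processor) provide weak
                # labels: a match that corefers with the article's own title
                # entity is linked to that entity, a known mention string with
                # an ambiguous target is labeled UNK, and anything else stays
                # unlabeled ("O").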
                #
                # Check if *end* of keyword link and if any keyword matches
                # have been seen.
                #
                if kw_seen < len(keywords_found) and (
                        keywords_found[kw_seen][2] ==
                        (len(reconstructed_wiki_text) + len(current_snippet))):
                    # Skip the rest of the article once a Category link starts.
                    if (char_idx < len(wiki_article["text"]) - 1
                            and "Category:" in current_snippet +
                            wiki_article["text"][char_idx + 1]):
                        category_seen = True
                        continue
                    if inside_kw:
                        current_snippet_tokenized = self.tokenizer.tokenize(
                            current_snippet)
                        if (current_snippet in
                                self.mention_entity_counter_popular_entities
                                and wiki_article_normalized_wiki_entity in dict(
                                    self.mention_entity_counter_popular_entities[
                                        current_snippet])
                                and corefers_with_title_entity(
                                    current_snippet)):
                            local_mention_counter[current_snippet] += 1
                            wiki_text_toks.extend(
                                zip(
                                    current_snippet_tokenized,
                                    [
                                        wiki_article_title_entity
                                        for _ in current_snippet_tokenized
                                    ],
                                    [
                                        current_snippet
                                        for _ in current_snippet_tokenized
                                    ],
                                ))
                        elif (current_snippet in
                              self.mention_entity_counter_popular_entities):
                            local_mention_counter[current_snippet] += 1
                            wiki_text_toks.extend(
                                zip(
                                    current_snippet_tokenized,
                                    ["UNK" for _ in current_snippet_tokenized],
                                    [
                                        current_snippet
                                        for _ in current_snippet_tokenized
                                    ],
                                ))
                        else:
                            wiki_text_toks.extend(
                                zip(
                                    current_snippet_tokenized,
                                    ["O" for _ in current_snippet_tokenized],
                                    ["-" for _ in current_snippet_tokenized],
                                ))
                        reconstructed_wiki_text += current_snippet
                        current_snippet = ""
                        inside_kw = False
                        inside_link = False
                        entity = None
                    # Advance even if no keyword span was open, otherwise
                    # later matches would never be processed.
                    kw_seen += 1
                last_char = char

            # Flush whatever remains after the last link/keyword boundary.
            current_snippet_tokenized = self.tokenizer.tokenize(
                current_snippet)
            wiki_text_toks.extend(
                zip(
                    current_snippet_tokenized,
                    ["O" for _ in current_snippet_tokenized],
                    ["-" for _ in current_snippet_tokenized],
                ))
            reconstructed_wiki_text += current_snippet

            # Write one token-per-row TSV per article:
            # token <TAB> entity label <TAB> surface mention.
            out_file_path = os.path.dirname(
                f"data/versions/{self.opts.data_version_name}/wiki_training/raw/tmp/{file_name[len_prefix:]}"
            )
            if not os.path.exists(out_file_path):
                os.makedirs(out_file_path, exist_ok=True)
            out_file_name = f"{out_file_path}/{wiki_article['id']}.tsv"
            pandas.DataFrame(wiki_text_toks).to_csv(out_file_name,
                                                    sep="\t",
                                                    header=False)
            out_file_names.append(out_file_name)
            local_entities_found_in_article.append(
                (out_file_name, local_entities_found_counter))
    return out_file_names, local_mention_counter, local_entities_found_in_article
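# Hypothetical usage sketch; the enclosing class name and the input path below
# are assumptions, not part of this file. extract_data expects `self` to
# provide: opts (with data_version_name and wiki_lang_version), redirects_en
# (dict of redirect -> canonical title), keyword_processor (a flashtext-style
# processor supporting span_info=True), tokenizer, popular_entity_counter_dict,
# and mention_entity_counter_popular_entities.
#
#   extractor = WikiTrainingDataExtractor(opts)  # hypothetical class name
#   out_files, mention_counts, per_article = extractor.extract_data(
#       "data/versions/v1/wikiextractor_out/en/AA/wiki_00")  # hypothetical path
#
# Each emitted TSV lists one token per row with its entity label
# ("O", "UNK", or a KB entity id) and the surface mention ("-" outside spans).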