def load_trie(language_path):
    """Load the anchor trie and its index arrays from *language_path*.

    Returns a 3-tuple ``(values, counts, trie)``:
      * ``values`` — OffsetArray mapping trie indices to candidate indices,
      * ``counts`` — OffsetArray of matching counts (shares ``values``' offsets),
      * ``trie`` — the marisa anchor trie itself.
    """
    values = OffsetArray.load(join(language_path, "trie_index2indices"))
    # Counts are stored flat; they reuse the offsets of the values array.
    counts = OffsetArray(
        np.load(join(language_path, "trie_index2indices_counts.npy")),
        values.offsets)
    anchor_trie = marisa_trie.Trie().load(join(language_path, "trie.marisa"))
    return values, counts, anchor_trie
def fix_and_parse_tags(config, collection, size):
    """Load linking resources, then tag up to *size* Wikipedia docs.

    Parameters
    ----------
    config : object with ``language_path``, ``wikidata``, ``redirections``,
        ``wiki``, ``min_count`` and ``min_percent`` attributes.
    collection : type collection providing ``load_blacklist`` and
        ``reset_cache``.
    size : number of documents passed through to ``load_wikipedia_docs``.

    Returns
    -------
    list of per-document tag sequences; only documents where at least one
    token received a non-None tag are kept.
    """
    # Anchor-string -> candidate-entity index arrays for this language.
    trie_index2indices = OffsetArray.load(
        join(config.language_path, "trie_index2indices"), compress=True)
    # Occurrence counts share the offsets of the values array.
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)
    # Transition arrays are optional on disk; fall back to None when absent.
    if exists(
            join(config.language_path,
                 "trie_index2indices_transition_values.npy")):
        trie_index2indices_transitions = OffsetArray(
            np.load(
                join(config.language_path,
                     "trie_index2indices_transition_values.npy")),
            np.load(
                join(config.language_path,
                     "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None
    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    prefix = get_prefix(config)
    redirections = load_redirections(config.redirections)
    docs = load_wikipedia_docs(config.wiki, size)
    # Interactive retry loop: if the blacklist JSON is malformed, prompt the
    # operator to fix it on disk and press enter (or quit) before retrying.
    while True:
        try:
            collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
        except (ValueError, ) as e:
            print("issue reading blacklist, please fix.")
            print(str(e))
            enter_or_quit()
            continue
        break
    print("Load first_names")
    with open(join(PROJECT_DIR, "data", "first_names.txt"), "rt") as fin:
        first_names = set(fin.read().splitlines())
    all_tags = []
    for doc in get_progress_bar('fixing links', item='article')(docs):
        # NOTE(review): fix_destination is resolved from module scope — it is
        # not defined in this chunk; confirm it exists at module level.
        tags = obtain_tags(
            doc,
            wiki_trie=wiki_trie,
            anchor_trie=anchor_trie,
            trie_index2indices=trie_index2indices,
            trie_index2indices_counts=trie_index2indices_counts,
            trie_index2indices_transitions=trie_index2indices_transitions,
            redirections=redirections,
            prefix=prefix,
            first_names=first_names,
            collection=collection,
            fix_destination=fix_destination,
            min_count=config.min_count,
            min_percent=config.min_percent)
        # Keep only documents where at least one token was actually tagged.
        if any(x is not None for _, x in tags):
            all_tags.append(tags)
    # Drop any per-run memoization held by the collection before returning.
    collection.reset_cache()
    return all_tags
def _load_link_resources(config):
    """Load the tries and index arrays used to map anchors to entities.

    Returns ``(anchor_trie, wiki_trie, trie_index2indices,
    trie_index2indices_counts, trie_index2indices_transitions)``; the
    transitions array is None when the transition files are absent on disk.
    """
    trie_index2indices = OffsetArray.load(
        join(config.language_path, "trie_index2indices"), compress=True)
    # Counts share the offsets of the values array.
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)
    # Transition arrays are optional; fall back to None when missing.
    if exists(join(config.language_path,
                   "trie_index2indices_transition_values.npy")):
        trie_index2indices_transitions = OffsetArray(
            np.load(join(config.language_path,
                         "trie_index2indices_transition_values.npy")),
            np.load(join(config.language_path,
                         "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None
    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    return (anchor_trie, wiki_trie, trie_index2indices,
            trie_index2indices_counts, trie_index2indices_transitions)


def main():
    """Sample articles from a Wikipedia dump and write word/QID annotations.

    Writes a tab-separated file (``args.out``) with one token per line:
    tagged tokens carry their QIDs plus the article QID, untagged tokens are
    written bare, and paragraphs are separated by blank lines. Stops after
    ``config.sample_size`` successfully converted articles.
    """
    args = parse_args()
    config = load_config(args.config,
                         ["wiki", "language_path", "wikidata", "redirections"],
                         defaults={
                             "num_names_to_load": 0,
                             "prefix": None,
                             "sample_size": 100
                         },
                         relative_to=args.relative_to)
    prefix = config.prefix or induce_wikipedia_prefix(config.wiki)
    collection = TypeCollection(config.wikidata, num_names_to_load=0)
    collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
    (anchor_trie, wiki_trie, trie_index2indices, trie_index2indices_counts,
     trie_index2indices_transitions) = _load_link_resources(config)
    redirections = load_redirections(config.redirections)
    seen = 0
    # The `with` block flushes and closes fout on every exit path (normal or
    # exception), so the original's extra try/finally with fout.flush() and
    # fout.close() was redundant and has been removed.
    with open(args.out, "wt") as fout:
        for i, (article_name,
                article) in tqdm(enumerate(iterate_articles(config.wiki))):
            # HACK: article index 5409 is skipped unconditionally —
            # presumably a known-bad article in this dump; confirm before
            # removing this guard.
            if i == 5409:
                continue
            fixed_article, article_qid = convert(
                article_name,
                article,
                collection=collection,
                anchor_trie=anchor_trie,
                wiki_trie=wiki_trie,
                trie_index2indices=trie_index2indices,
                trie_index2indices_counts=trie_index2indices_counts,
                trie_index2indices_transitions=trie_index2indices_transitions,
                redirections=redirections,
                prefix=prefix)
            # convert signals "could not process this article" with False.
            if fixed_article is False:
                continue
            for paragraph in fixed_article:
                for word, qids in paragraph:
                    if len(qids) > 0:
                        fout.write(word.rstrip() + "\t" +
                                   "\t".join(qids + [article_qid]) + "\n")
                    else:
                        fout.write(word.rstrip() + "\n")
                # Blank line terminates each paragraph.
                fout.write("\n")
            seen += 1
            if seen >= config.sample_size:
                break