import time

from os import makedirs
from os.path import join, exists

import marisa_trie
import numpy as np
from tqdm import tqdm

# OffsetArray, TypeCollection, SCRIPT_DIR, PROJECT_DIR and the remaining
# helpers used below (parse_args, load_config, get_prefix, load_redirections,
# load_wikipedia_docs, get_progress_bar, enter_or_quit, obtain_tags,
# fix_destination, induce_wikipedia_prefix, iterate_articles, convert,
# initialize_globals, get_trie_properties, fix, filter_trie,
# remap_trie_offset_array, true_exists, ...) are project-local utilities
# imported from elsewhere in this repository.


def load_trie(language_path):
    """Load the anchor trie plus its per-anchor candidate and count arrays."""
    trie_index2indices_values = OffsetArray.load(
        join(language_path, "trie_index2indices"))
    trie_index2indices_counts = OffsetArray(
        np.load(join(language_path, "trie_index2indices_counts.npy")),
        trie_index2indices_values.offsets)
    trie = marisa_trie.Trie().load(join(language_path, "trie.marisa"))
    return trie_index2indices_values, trie_index2indices_counts, trie
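
# Usage sketch for load_trie (illustrative only): the marisa trie maps an
# anchor string to an integer id, and the two OffsetArrays map that id to
# the candidate entity ids and their link counts. That integer indexing on
# OffsetArray returns the per-key slice is assumed from this project's
# utils; the function name below is hypothetical.
def demo_anchor_lookup(language_path, anchor):
    values, counts, trie = load_trie(language_path)
    if anchor not in trie:
        return []
    idx = trie[anchor]  # anchor string -> trie index
    # candidate entity ids paired with how often each was the link target:
    return list(zip(values[idx], counts[idx]))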
def fix_and_parse_tags(config, collection, size):
    trie_index2indices = OffsetArray.load(
        join(config.language_path, "trie_index2indices"), compress=True)
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)
    if exists(join(config.language_path,
                   "trie_index2indices_transition_values.npy")):
        trie_index2indices_transitions = OffsetArray(
            np.load(join(config.language_path,
                         "trie_index2indices_transition_values.npy")),
            np.load(join(config.language_path,
                         "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None
    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    prefix = get_prefix(config)
    redirections = load_redirections(config.redirections)
    docs = load_wikipedia_docs(config.wiki, size)
    # retry until the blacklist parses cleanly or the user quits:
    while True:
        try:
            collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
        except ValueError as e:
            print("issue reading blacklist, please fix.")
            print(str(e))
            enter_or_quit()
            continue
        break
    print("Load first_names")
    with open(join(PROJECT_DIR, "data", "first_names.txt"), "rt") as fin:
        first_names = set(fin.read().splitlines())
    all_tags = []
    for doc in get_progress_bar('fixing links', item='article')(docs):
        tags = obtain_tags(
            doc,
            wiki_trie=wiki_trie,
            anchor_trie=anchor_trie,
            trie_index2indices=trie_index2indices,
            trie_index2indices_counts=trie_index2indices_counts,
            trie_index2indices_transitions=trie_index2indices_transitions,
            redirections=redirections,
            prefix=prefix,
            first_names=first_names,
            collection=collection,
            fix_destination=fix_destination,
            min_count=config.min_count,
            min_percent=config.min_percent)
        # keep only articles where at least one token received a tag:
        if any(x is not None for _, x in tags):
            all_tags.append(tags)
    collection.reset_cache()
    return all_tags
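
# The return value is one list of (token, tag) pairs per retained article,
# as implied by the `any(x is not None for _, x in tags)` filter above.
# A quick coverage check under that assumed pair structure (function name
# hypothetical):
def tag_coverage(all_tags):
    """Fraction of tokens that received a non-None tag."""
    total = tagged = 0
    for tags in all_tags:
        for _, tag in tags:
            total += 1
            if tag is not None:
                tagged += 1
    return tagged / max(total, 1)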
def main():
    args = parse_args()
    config = load_config(
        args.config,
        ["wiki", "language_path", "wikidata", "redirections"],
        defaults={
            "num_names_to_load": 0,
            "prefix": None,
            "sample_size": 100
        },
        relative_to=args.relative_to)
    prefix = config.prefix or induce_wikipedia_prefix(config.wiki)
    collection = TypeCollection(config.wikidata, num_names_to_load=0)
    collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
    trie_index2indices = OffsetArray.load(
        join(config.language_path, "trie_index2indices"), compress=True)
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)
    if exists(join(config.language_path,
                   "trie_index2indices_transition_values.npy")):
        trie_index2indices_transitions = OffsetArray(
            np.load(join(config.language_path,
                         "trie_index2indices_transition_values.npy")),
            np.load(join(config.language_path,
                         "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None
    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    redirections = load_redirections(config.redirections)
    seen = 0
    with open(args.out, "wt") as fout:
        try:
            for i, (article_name, article) in tqdm(
                    enumerate(iterate_articles(config.wiki))):
                # skip one problematic article (magic index kept from the
                # original script):
                if i == 5409:
                    continue
                fixed_article, article_qid = convert(
                    article_name,
                    article,
                    collection=collection,
                    anchor_trie=anchor_trie,
                    wiki_trie=wiki_trie,
                    trie_index2indices=trie_index2indices,
                    trie_index2indices_counts=trie_index2indices_counts,
                    trie_index2indices_transitions=trie_index2indices_transitions,
                    redirections=redirections,
                    prefix=prefix)
                if fixed_article is False:
                    continue
                for paragraph in fixed_article:
                    for word, qids in paragraph:
                        if len(qids) > 0:
                            fout.write(word.rstrip() + "\t" +
                                       "\t".join(qids + [article_qid]) + "\n")
                        else:
                            fout.write(word.rstrip() + "\n")
                    # blank line between paragraphs:
                    fout.write("\n")
                seen += 1
                if seen >= config.sample_size:
                    break
        finally:
            # the with-block closes the file; just make sure it is flushed:
            fout.flush()
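
# The file written above is a CoNLL-style TSV: one token per line, tagged
# tokens carry tab-separated QIDs with the article's own QID appended last,
# and blank lines separate paragraphs. A minimal reader sketch mirroring
# that writer (function name hypothetical):
def read_linked_tsv(path):
    """Yield one paragraph at a time as a list of (token, qids) pairs."""
    paragraph = []
    with open(path, "rt") as fin:
        for line in fin:
            line = line.rstrip("\n")
            if not line:  # blank line = paragraph boundary
                if paragraph:
                    yield paragraph
                    paragraph = []
                continue
            token, *qids = line.split("\t")
            paragraph.append((token, qids))
    if paragraph:
        yield paragraph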
def main():
    args = parse_args()
    if args.new_language_path == args.language_path:
        raise ValueError("new_language_path and language_path must be "
                         "different: cannot generate a fixed trie in "
                         "the same directory as the original trie.")
    c = TypeCollection(args.wikidata, num_names_to_load=0)
    c.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))
    original_values = np.load(
        join(args.language_path, "trie_index2indices_values.npy"))
    original_offsets = np.load(
        join(args.language_path, "trie_index2indices_offsets.npy"))
    original_counts = np.load(
        join(args.language_path, "trie_index2indices_counts.npy"))
    original_trie_path = join(args.language_path, 'trie.marisa')
    trie = marisa_trie.Trie().load(original_trie_path)
    initialize_globals(c)
    t0 = time.time()
    old_location_shift = None
    values, offsets, counts = original_values, original_offsets, original_counts
    for step in range(args.steps):
        anchor_length = get_trie_properties(trie, offsets, values)
        (offsets, values, counts), location_shift = fix(
            collection=c,
            offsets=offsets,
            values=values,
            counts=counts,
            anchor_length=anchor_length,
            num_category_link=8)
        if old_location_shift is not None:
            # see where newly shifted values are now pointing
            # to (extra indirection level):
            location_shift = location_shift[old_location_shift]
            location_shift[old_location_shift == -1] = -1
        old_location_shift = location_shift
        pre_reduced_values = values[location_shift]
        pre_reduced_values[location_shift == -1] = -1
        num_changes = int((pre_reduced_values != original_values).sum())
        change_volume = int(
            original_counts[pre_reduced_values != original_values].sum())
        print("step %d with %d changes, %d total links" %
              (step, num_changes, change_volume))
    pre_reduced_values = values[location_shift]
    pre_reduced_values[location_shift == -1] = -1
    t1 = time.time()
    num_changes = int((pre_reduced_values != original_values).sum())
    print("Done with link fixing in %.3fs, with %d changes." %
          (t1 - t0, num_changes))
    # show some remappings:
    np.random.seed(1234)
    num_samples = 10
    samples = np.random.choice(
        np.where(
            np.logical_and(
                np.logical_and(pre_reduced_values != original_values,
                               pre_reduced_values != -1),
                original_values != -1))[0],
        size=num_samples, replace=False)
    print("Sample fixes:")
    for index in samples:
        print("    %r (%d) -> %r (%d)" % (
            c.get_name(int(original_values[index])),
            int(original_values[index]),
            c.get_name(int(pre_reduced_values[index])),
            int(pre_reduced_values[index])))
    print("")
    samples = np.random.choice(
        np.where(OffsetArray(values, offsets).edges() == 0)[0],
        size=num_samples, replace=False)
    print("Sample deletions:")
    for index in samples:
        print("    %r" % (trie.restore_key(int(index)),))
    # prune out anchors where there are no more linked items:
    print("Removing empty anchors from trie...")
    t0 = time.time()
    non_empty_offsets = np.where(OffsetArray(values, offsets).edges() != 0)[0]
    fixed_trie = filter_trie(trie, non_empty_offsets)
    contexts_found = true_exists(
        join(args.language_path, "trie_index2contexts_values.npy"))
    if contexts_found:
        contexts_values = np.load(
            join(args.language_path, "trie_index2contexts_values.npy"))
        contexts_offsets = np.load(
            join(args.language_path, "trie_index2contexts_offsets.npy"))
        contexts_counts = np.load(
            join(args.language_path, "trie_index2contexts_counts.npy"))
    to_port = [(offsets, values, counts),
               (original_offsets, pre_reduced_values, original_values)]
    if contexts_found:
        to_port.append((contexts_offsets, contexts_values, contexts_counts))
    ported = remap_trie_offset_array(trie, fixed_trie, to_port)
    offsets, values, counts = ported[0]
    original_offsets, pre_reduced_values, original_values = ported[1]
    t1 = time.time()
    print("Removed %d empty anchors from trie in %.3fs" % (
        len(trie) - len(fixed_trie), t1 - t0))
    print("Saving...")
    makedirs(args.new_language_path, exist_ok=True)
    np.save(join(args.new_language_path, "trie_index2indices_values.npy"),
            values)
    np.save(join(args.new_language_path, "trie_index2indices_offsets.npy"),
            offsets)
    np.save(join(args.new_language_path, "trie_index2indices_counts.npy"),
            counts)
    if contexts_found:
        contexts_offsets, contexts_values, contexts_counts = ported[2]
        np.save(join(args.new_language_path,
                     "trie_index2contexts_values.npy"),
                contexts_values)
        np.save(join(args.new_language_path,
                     "trie_index2contexts_offsets.npy"),
                contexts_offsets)
        np.save(join(args.new_language_path,
                     "trie_index2contexts_counts.npy"),
                contexts_counts)
    new_trie_path = join(args.new_language_path, 'trie.marisa')
    fixed_trie.save(new_trie_path)
    # store the old->new value mapping so downstream tools can translate
    # links produced against the original trie:
    transition = np.vstack([original_values, pre_reduced_values]).T
    np.save(join(args.new_language_path,
                 "trie_index2indices_transition_values.npy"),
            transition)
    np.save(join(args.new_language_path,
                 "trie_index2indices_transition_offsets.npy"),
            original_offsets)
    print("Done.")
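
# filter_trie and remap_trie_offset_array are project-local. Conceptually,
# pruning rebuilds the trie from the surviving keys and then re-gathers every
# offset array against the new key ids, because marisa_trie assigns ids in
# its own internal order. A sketch of that idea, not the project's actual
# implementation:
def filter_trie_sketch(trie, keep_indices):
    """Rebuild a marisa trie from surviving keys; return it plus an
    old-index -> new-index mapping for re-gathering offset arrays."""
    kept_keys = [trie.restore_key(int(i)) for i in keep_indices]
    fixed_trie = marisa_trie.Trie(kept_keys)
    old2new = {int(old): fixed_trie[key]
               for old, key in zip(keep_indices, kept_keys)}
    return fixed_trie, old2new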