コード例 #1
0
def get_prefix(config):
    """Return the prefix to use for ``config``.

    A truthy ``config.prefix`` is returned unchanged; otherwise the prefix
    is obtained by calling ``induce_wikipedia_prefix`` on ``config.wiki``.
    """
    explicit_prefix = config.prefix
    if explicit_prefix:
        return explicit_prefix
    return induce_wikipedia_prefix(config.wiki)
コード例 #2
0
def _write_article(fout, fixed_article, article_qid):
    """Write one converted article to ``fout``.

    Each ``(word, qids)`` pair becomes one line: the right-stripped word,
    followed (only when ``qids`` is non-empty) by the tab-separated ``qids``
    and then ``article_qid``. A blank line is written after every paragraph.
    """
    for paragraph in fixed_article:
        for word, qids in paragraph:
            if len(qids) > 0:
                fout.write(word.rstrip() + "\t" +
                           "\t".join(qids + [article_qid]) + "\n")
            else:
                fout.write(word.rstrip() + "\n")
        fout.write("\n")


def main():
    """Convert a sample of wiki articles and write them to ``args.out``.

    Loads the configuration, the type collection (with its blacklist), the
    trie index arrays, the anchor/title tries and the redirections, then
    iterates over the articles of ``config.wiki``. Every article that
    ``convert`` accepts is written out via ``_write_article``; processing
    stops once ``config.sample_size`` articles have been written.
    """
    args = parse_args()
    config = load_config(args.config,
                         ["wiki", "language_path", "wikidata", "redirections"],
                         defaults={
                             "num_names_to_load": 0,
                             "prefix": None,
                             "sample_size": 100
                         },
                         relative_to=args.relative_to)
    prefix = config.prefix or induce_wikipedia_prefix(config.wiki)

    collection = TypeCollection(config.wikidata, num_names_to_load=0)
    collection.load_blacklist(join(SCRIPT_DIR, "blacklist.json"))

    trie_index2indices = OffsetArray.load(join(config.language_path,
                                               "trie_index2indices"),
                                          compress=True)
    # The counts array reuses the offsets of trie_index2indices.
    trie_index2indices_counts = OffsetArray(
        np.load(join(config.language_path, "trie_index2indices_counts.npy")),
        trie_index2indices.offsets)

    # Transition data is optional: it is only loaded when the values file
    # is present on disk.
    transition_values_path = join(config.language_path,
                                  "trie_index2indices_transition_values.npy")
    if exists(transition_values_path):
        trie_index2indices_transitions = OffsetArray(
            np.load(transition_values_path),
            np.load(
                join(config.language_path,
                     "trie_index2indices_transition_offsets.npy")),
        )
    else:
        trie_index2indices_transitions = None

    anchor_trie = marisa_trie.Trie().load(
        join(config.language_path, "trie.marisa"))
    wiki_trie = marisa_trie.RecordTrie('i').load(
        join(config.wikidata, "wikititle2wikidata.marisa"))
    redirections = load_redirections(config.redirections)

    # NOTE(review): the article at this enumeration index is always skipped.
    # Presumably it is a known-problematic article in one specific dump --
    # TODO confirm whether this is still needed.
    skipped_article_index = 5409

    seen = 0
    # Explicit UTF-8 so that non-ASCII article text does not depend on the
    # platform's default encoding. The ``with`` block flushes and closes the
    # file on every exit path, including exceptions.
    with open(args.out, "wt", encoding="utf-8") as fout:
        for i, (article_name,
                article) in tqdm(enumerate(iterate_articles(config.wiki))):
            if i == skipped_article_index:
                continue
            fixed_article, article_qid = convert(
                article_name,
                article,
                collection=collection,
                anchor_trie=anchor_trie,
                wiki_trie=wiki_trie,
                trie_index2indices=trie_index2indices,
                trie_index2indices_counts=trie_index2indices_counts,
                trie_index2indices_transitions=trie_index2indices_transitions,
                redirections=redirections,
                prefix=prefix)
            # ``convert`` signals an unusable article with ``False``.
            if fixed_article is False:
                continue
            _write_article(fout, fixed_article, article_qid)
            seen += 1
            if seen >= config.sample_size:
                break