Example 1
import pickle

import numpy as np


def prep_dataset():
    # WIKI_PATH, CONTEXT_CAPACITY, WikiDataLoader, Vocabulary and Tokenizer
    # are assumed to be defined or imported elsewhere in the project
    wiki_path = WIKI_PATH

    if CONTEXT_CAPACITY % 2 != 0:
        raise ValueError("Context length should be even")

    # each window holds CONTEXT_CAPACITY tokens plus one extra token
    context_window = CONTEXT_CAPACITY + 1

    print("Loading...", end="")
    wiki = WikiDataLoader(wiki_path)
    voc = Vocabulary()
    tok = Tokenizer()
    print("done")

    wiki_doc = wiki.next_doc()
    wikiprep = open("WikiPrepData.txt", "w")  # one context window per line

    i = 0
    while wiki_doc:
        doc = tok(wiki_doc)
        voc.add(doc)  # grow the vocabulary with this document's tokens

        # map the document's tokens to integer ids
        sample = np.array(voc.text2ids(doc))

        # build all sliding windows at once via broadcasting:
        # row r of indexer holds the indices r, r+1, ..., r+context_window-1,
        # with start positions 0 .. len(sample) - context_window inclusive
        indexer = np.arange(context_window)[None, :] + np.arange(
            len(sample) - context_window + 1)[:, None]

        smpl = sample[indexer]

        # write each window as one space-separated line of token ids
        for row in smpl:
            wikiprep.write(" ".join(str(val) for val in row) + "\n")

        i += 1
        if i == 2000:  # stop after 2000 documents
            break
        wiki_doc = wiki.next_doc()

    wikiprep.close()

    # persist the vocabulary so it can be reloaded at training time
    with open("WikiPrepVoc.pkl", "wb") as f:
        pickle.dump(voc, f)
    print("Vocabulary ready")