import pickle

import numpy as np

# WIKI_PATH, CONTEXT_CAPACITY, WikiDataLoader, Vocabulary and Tokenizer are
# assumed to be defined elsewhere in the project.


def prep_dataset():
    wiki_path = WIKI_PATH
    if CONTEXT_CAPACITY % 2 != 0:
        raise ValueError("Context length should be even")
    # Each training row holds CONTEXT_CAPACITY context tokens plus one target.
    context_window = CONTEXT_CAPACITY + 1

    print("Loading...", end="")
    wiki = WikiDataLoader(wiki_path)
    voc = Vocabulary()
    tok = Tokenizer()
    print("done")

    wiki_doc = wiki.next_doc()
    i = 0
    with open("WikiPrepData.txt", "w") as wikiprep:
        while wiki_doc:
            doc = tok(wiki_doc)
            voc.add(doc)
            sample = np.array(voc.text2ids(doc))
            # Broadcasting builds one row of indices per window position, so
            # sample[indexer] is a matrix of overlapping context windows.
            indexer = np.arange(context_window)[None, :] + \
                np.arange(len(sample) - context_window)[:, None]
            smpl = sample[indexer]
            for row in smpl:
                for val in row:
                    wikiprep.write("%d " % val)
                wikiprep.write("\n")
            i += 1
            if i == 2000:  # cap the number of processed documents
                break
            wiki_doc = wiki.next_doc()

    with open("WikiPrepVoc.pkl", "wb") as f:
        pickle.dump(voc, f)
    print("Vocabulary ready")
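
# --- Illustration (assumption: not part of the original pipeline) ---
# A minimal, self-contained sketch of the broadcasting trick used in
# prep_dataset() to slice a token-id sequence into overlapping windows.
# The values here are illustrative stand-ins, not real dataset tokens.
def _demo_window_indexer():
    sample = np.arange(10)  # hypothetical token ids 0..9
    context_window = 4
    # A row of in-window offsets plus a column of window start positions
    # broadcasts to one row of indices per window, each shifted by one token.
    indexer = np.arange(context_window)[None, :] + \
        np.arange(len(sample) - context_window)[:, None]
    print(sample[indexer])
    # [[0 1 2 3]
    #  [1 2 3 4]
    #  ...
    #  [5 6 7 8]]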