def load_batch(names, load=comp(np.load, "trial/data/grams/{}.npy".format)):
    # load the `.npy` arrays named by `names` and pack them into one padded batch
    names = names.astype(str)
    x = vpack(map(load, names), complex('(nan+nanj)'), 1, 1)
    # x = vpack(map(comp(load, path), names), complex('(nan+nanj)'), 1, 1)
    # zero out the first step of every instance
    x[:, 0] = 0j
    # complex to real
    x = c2r(x)
    _, t, d = x.shape
    assert t <= len_cap
    assert d == dim_tgt
    return x
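# `comp`, `vpack`, `c2r`, `len_cap`, and `dim_tgt` come from the surrounding
# module and are not shown here. Below is a minimal sketch of one plausible
# complex-to-real conversion (real and imaginary parts concatenated along the
# feature axis); it is an illustrative stand-in for `c2r`, not the project's
# own implementation.
def c2r_sketch(x):
    # (batch, time, dim) complex -> (batch, time, 2 * dim) float
    return np.concatenate((x.real, x.imag), axis=-1)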
def encode(vocab, sents, length=None, dtype=np.int32):
    """-> array dtype

    encodes `sents : seq str` with `vocab : SentencePieceProcessor`.
    returns a rank 2 array whose second axis is padded to `length` or
    the maximum length.

    """
    sents = list(map(vocab.encode_as_ids, sents))
    if length is None:
        length = max(map(len, sents))
    return vpack(sents, (len(sents), length), vocab.eos_id(), dtype)
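# `vpack` comes from `util_np` and is not shown here. A minimal sketch of the
# behaviour the call above appears to rely on (pad variable-length sequences
# into a dense array of the given shape, filling the tail of each row with
# `fill`); this is an illustrative stand-in, not the project's own `vpack`.
def vpack_sketch(seqs, shape, fill, dtype):
    arr = np.full(shape, fill, dtype=dtype)
    for row, seq in zip(arr, seqs):
        row[:len(seq)] = seq
    return arr

# hypothetical usage of `encode` with a real SentencePieceProcessor
# (the model path is an assumption):
# import sentencepiece
# vocab = sentencepiece.SentencePieceProcessor()
# vocab.load("vocab.model")
# ids = encode(vocab, ["a sentence", "another"])  # -> (2, max_len) int32 array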
def batch(size=T.batch_train, path=P.train, vocab=vocab, seed=A.seed, kudo=A.sample, max_len=T.max_len):
    pac = lambda arrs: vpack(arrs, (size, max(map(len, arrs))), eos, np.int32)
    enc = encode_capped_sample_pair if kudo else encode_capped
    raw = tuple(load_txt(path))
    eos = vocab.eos_id()
    bat = []
    for i in sample(len(raw), seed):
        if size == len(bat):
            if kudo:
                src, tgt = map(pac, zip(*bat))
            else:
                src = tgt = pac(bat)
            yield src, tgt
            bat = []
        bat.append(enc(vocab, raw[i], cap=max_len))
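# `sample` comes from the surrounding utilities and is not shown here. The
# generator above iterates over it without ever breaking out, so it is
# presumably an endless stream of pseudo-random indices below `n`; a hedged
# sketch of such a sampler, not the project's own implementation:
def sample_sketch(n, seed):
    rng = np.random.RandomState(seed)
    while True:
        for i in rng.permutation(n):
            yield i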
#!/usr/bin/env python3

path = "trial/data"

from os.path import join
from util import partial, PointedIndex
from util_io import load, chartab, encode
from util_np import np, vpack

src = list(load(join(path, "train_src")))
tgt = list(load(join(path, "train_tgt")))

idx_src = PointedIndex(chartab(src))
idx_tgt = PointedIndex(chartab(tgt))
enc_src = partial(encode, idx_src)
enc_tgt = partial(encode, idx_tgt)
assert 1 == idx_src("\n") == idx_tgt("\n")

pack = lambda txt: vpack(map(partial(np.array, dtype=np.uint8), txt), fill=1)

np.save(join(path, "index_src"), idx_src.vec)
np.save(join(path, "index_tgt"), idx_tgt.vec)
np.save(join(path, "train_src"), pack(map(enc_src, src)))
np.save(join(path, "train_tgt"), pack(map(enc_tgt, tgt)))
np.save(join(path, "valid_src"), pack(map(enc_src, load(join(path, "valid_src")))))
np.save(join(path, "valid_tgt"), pack(map(enc_tgt, load(join(path, "valid_tgt")))))
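# `PointedIndex` comes from `util` and is not defined in this file. From the
# way it is used above (built from a character table, called on a character to
# get its integer code, exposing the table as `.vec`), a minimal sketch might
# look like the class below; it is an assumption for illustration, not the
# project's own class.
class PointedIndexSketch:
    def __init__(self, chars):
        self.vec = chars
        self.idx = {c: i for i, c in enumerate(chars)}
    def __call__(self, char, default=0):
        # unknown characters fall back to the pointed (0th) entry
        return self.idx.get(char, default)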
def infer_avg(sent, samples=128):
    bat = [sp.encode_capped_sample(vocab, sent) for _ in range(samples)]
    bat = vpack(bat, (len(bat), max(map(len, bat))), vocab.eos_id(), np.int32)
    z = model.z.eval({model.src: bat})
    return np.mean(z, axis=0)
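# hypothetical usage: averaging over sentencepiece samples gives one latent
# vector per sentence, which can then be compared directly, for instance by
# cosine similarity (`sent_a` and `sent_b` are placeholder strings):
# z_a = infer_avg(sent_a)
# z_b = infer_avg(sent_b)
# cos = np.dot(z_a, z_b) / (np.linalg.norm(z_a) * np.linalg.norm(z_b))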
# load the sentencepiece model
vocab = sp.load_spm(path_vocab)

# load the model
model = vAe('infer')

# restore the session
sess = tf.InteractiveSession()
tf.train.Saver().restore(sess, path_ckpt)

################################
# deterministic representation #
################################

# encode text with the sentencepiece model
data = list(map(partial(sp.encode_capped, vocab), text))
data = vpack(data, (len(data), max(map(len, data))), vocab.eos_id(), np.int32)

# calculate z for the test data in batches
inpt = [model.z.eval({model.src: data[i:j]}) for i, j in partition(len(data), 128)]
inpt = np.concatenate(inpt, axis=0)
np.save(path_emb, inpt)

#######################################################
# averaged representation with sentencepiece sampling #
#######################################################

def infer_avg(sent, samples=128):
    bat = [sp.encode_capped_sample(vocab, sent) for _ in range(samples)]
    bat = vpack(bat, (len(bat), max(map(len, bat))), vocab.eos_id(), np.int32)
    z = model.z.eval({model.src: bat})
    return np.mean(z, axis=0)
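# `partition` is not shown here; the list comprehension above treats it as a
# generator of (start, stop) pairs that cover `len(data)` in chunks of 128.
# A minimal sketch under that assumption, not the project's own helper:
def partition_sketch(total, step):
    for i in range(0, total, step):
        yield i, min(i + step, total)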
##################
# prep and split #
##################

# train one sentencepiece model for each language
vocab = tuple(
    spm(pform(P.data, "vocab_{}".format(lang)), pform(P.raw, lang), C.dim_voc, C.bos, C.eos, C.unk)
    for lang in langs)

# remove long sentences
short = []
for sents in zip(*(load_txt(pform(P.raw, lang)) for lang in langs)):
    sents = [v.encode_as_ids(s) for v, s in zip(vocab, sents)]
    if all(len(sent) <= C.cap for sent in sents):
        short.append(sents)

np.random.seed(C.seed)
np.random.shuffle(short)

# pack instances into arrays
corpora = tuple(
    vpack(corp, (len(corp), C.cap), C.eos, np.uint16)
    for corp in zip(*short))
del short

# split and save
for lang, voc, corp in zip(langs, vocab, corpora):
    save_txt(pform(P.data, "eval_{}.txt".format(lang)), decode(voc, corp[:4096]))
    np.save(pform(P.data, "valid_{}.npy".format(lang)), corp[4096:5120])
    np.save(pform(P.data, "train_{}.npy".format(lang)), corp[5120:])
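# `decode` is not shown here; judging from its use with `save_txt`, it turns
# each padded row of ids back into a line of text with the corresponding
# sentencepiece model. A hedged sketch under that assumption, not the
# project's own helper:
def decode_sketch(voc, corp):
    for row in corp:
        yield voc.decode_ids([int(i) for i in row if i != voc.eos_id()])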
####################
# filter and split #
####################

train_src = []
train_tgt = []
valid_src = []
valid_tgt = []
valid_raw = []
for src, tgt in src_tgt:
    s = vocab_src.encode_as_ids(src)
    t = vocab_tgt.encode_as_ids(tgt)
    if 0 < len(s) <= C.cap and 0 < len(t) <= C.cap:
        if len(valid_raw) < C.total_valid:
            valid_src.append(s)
            valid_tgt.append(t)
            valid_raw.append(tgt)
        else:
            train_src.append(src)
            train_tgt.append(tgt)

#############
# save data #
#############

save_txt(pform(P.data, "train_src.txt"), train_src)
save_txt(pform(P.data, "train_tgt.txt"), train_tgt)
save_txt(pform(P.data, "valid_tgt.txt"), valid_raw)
np.save(
    pform(P.data, "valid_tgt.npy"),
    vpack(valid_tgt, (C.total_valid, C.cap), C.eos, np.uint16))
np.save(
    pform(P.data, "valid_src.npy"),
    vpack(valid_src, (C.total_valid, C.cap), C.eos, np.uint16))
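# hypothetical sanity check after the script has run: both saved validation
# arrays should have shape (C.total_valid, C.cap) and be padded with C.eos.
# va_src = np.load(pform(P.data, "valid_src.npy"))
# va_tgt = np.load(pform(P.data, "valid_tgt.npy"))
# assert va_src.shape == va_tgt.shape == (C.total_valid, C.cap)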
#!/usr/bin/env python3

from util import comp, partial, PointedIndex
from util_io import path, load_meta, load
from util_np import np, vpack

names, texts = load_meta()

chars = {char for text in texts for char in text}
chars.remove("\n")
chars.remove(" ")
index = PointedIndex(" \n" + "".join(sorted(chars)))
texts = vpack(
    map(comp(partial(np.fromiter, dtype=np.uint8), partial(map, index)), texts),
    index("\n"))

np.save("trial/data/index", index.vec)
np.save("trial/data/texts", texts)
np.save("trial/data/names", names)

for name in names:
    np.save("trial/data/grams/" + name, load(path(name)))
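# hypothetical round-trip check, assuming `index.vec` is the code-to-character
# table (the inverse of calling `index` on a character) and that rows of
# `texts` are padded with the code for "\n":
# line = "".join(index.vec[i] for i in texts[0]).rstrip("\n")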
posts = tuple(
    clean(post[3])  # extract the cleaned raw texts
    for filename in sorted(os.listdir(path_raw))  # each json: posts, annotations, metadata
    for post in load_json(pform(path_raw, filename))[0])  # each post: id, side (unused), author, raw text, annotations, parent post id, category (unused), timestamp

# removes empty posts
posts = tuple(post for post in posts if 0 < len(post))

# saves raw texts
save_txt(path_txt, posts)

# train a sentencepiece model
spm(name=path_vocab, path=path_txt)

# load the trained sentencepiece model
vocab = load_spm(path_vocab + ".model")

# length control
posts = [encode_capped(vocab, post, cap=512) for post in posts]
save_txt(path_train, map(vocab.decode_ids, posts))

# validation data
posts = tuple(map(clean, load_txt(path_val)))
posts = [encode_capped(vocab, post, cap=512) for post in posts]
np.save(path_valid, vpack(posts, (len(posts), 512), vocab.eos_id(), np.int32))
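# hypothetical sanity check after the script has run: every validation post is
# capped at 512 sentencepiece ids and right-padded with the eos id.
# valid = np.load(path_valid)
# assert valid.shape[1] == 512 and valid.dtype == np.int32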