Example #1
def load_batch(names, load=comp(np.load, "trial/data/grams/{}.npy".format)):
    # load the gram arrays named in `names` and pack them into one complex
    # batch, padding the shorter ones with nan+nanj
    names = names.astype(str)  # np.str is deprecated (removed in numpy 1.24)
    x = vpack(map(load, names), complex("nan+nanj"), 1, 1)
    # zero the first step along the time axis
    x[:, 0] = 0j
    # split the complex values into real and imaginary channels
    x = c2r(x)
    _, t, d = x.shape
    assert t <= len_cap
    assert d == dim_tgt
    return x
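Every example on this page is a call site of `vpack` from `util_np`, whose definition is not shown. As a reading aid, here is a minimal sketch of the four-argument form `vpack(seqs, shape, fill, dtype)` used in Examples 2, 3 and 5 through 8 (Examples 1, 4 and 9 call it with other signatures, so treat this as an assumption, not the project's code):

import numpy as np

def vpack(seqs, shape, fill, dtype):
    # pack variable-length sequences into one fixed-shape array,
    # padding the unused tail of each row with `fill`
    arr = np.full(shape, fill, dtype=dtype)
    for row, seq in zip(arr, seqs):
        row[:len(seq)] = seq
    return arr

# vpack([[1, 2], [3]], (2, 3), 0, np.int32)
# -> [[1, 2, 0],
#     [3, 0, 0]]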
Example #2
def encode(vocab, sents, length=None, dtype=np.int32):
    """-> array dtype

    encodes `sents : seq str` with `vocab : SentencePieceProcessor`.
    returns a rank 2 array whose second axis is padded to `length` or
    the maximum length.

    """
    sents = list(map(vocab.encode_as_ids, sents))
    if length is None: length = max(map(len, sents))
    return vpack(sents, (len(sents), length), vocab.eos_id(), dtype)
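A hypothetical call, assuming a trained SentencePieceProcessor `vocab` and short input sentences (the names and numbers here are illustrative):

ids = encode(vocab, ["hello world", "hi"], length=8)
assert ids.shape == (2, 8)  # short rows are right-padded with vocab.eos_id()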
Example #3
def batch(size=T.batch_train,
          path=P.train,
          vocab=vocab,
          seed=A.seed,
          kudo=A.sample,
          max_len=T.max_len):
    """yields (src, tgt) batches of ids, eos-padded to the longest row."""
    eos = vocab.eos_id()
    pac = lambda arrs: vpack(arrs, (size, max(map(len, arrs))), eos, np.int32)
    enc = encode_capped_sample_pair if kudo else encode_capped
    raw = tuple(load_txt(path))
    bat = []
    for i in sample(len(raw), seed):
        if size == len(bat):
            if kudo:
                src, tgt = map(pac, zip(*bat))
            else:
                src = tgt = pac(bat)
            yield src, tgt
            bat = []
        bat.append(enc(vocab, raw[i], cap=max_len))
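A hypothetical way to drive this generator (the batch size is illustrative); note that with `kudo` off, `src` and `tgt` are the same array, i.e. an auto-encoding setup:

batches = batch(size=2)
src, tgt = next(batches)  # two (2, longest-row) int32 arrays, eos-padded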
Example #4
#!/usr/bin/env python3

path = "trial/data"

from os.path import join
from util import partial, PointedIndex
from util_io import load, chartab, encode
from util_np import np, vpack

src = list(load(join(path, "train_src")))
tgt = list(load(join(path, "train_tgt")))

idx_src = PointedIndex(chartab(src))
idx_tgt = PointedIndex(chartab(tgt))
enc_src = partial(encode, idx_src)
enc_tgt = partial(encode, idx_tgt)

assert 1 == idx_src("\n") == idx_tgt("\n")
pack = lambda txt: vpack(map(partial(np.array, dtype=np.uint8), txt), fill=1)

np.save(join(path, "index_src"), idx_src.vec)
np.save(join(path, "index_tgt"), idx_tgt.vec)
np.save(join(path, "train_src"), pack(map(enc_src, src)))
np.save(join(path, "train_tgt"), pack(map(enc_tgt, tgt)))
np.save(join(path, "valid_src"),
        pack(map(enc_src, load(join(path, "valid_src")))))
np.save(join(path, "valid_tgt"),
        pack(map(enc_tgt, load(join(path, "valid_tgt")))))
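`PointedIndex` comes from `util` and is not shown on this page. A minimal sketch consistent with its uses here and in Example 9, offered as an assumption: a character-to-code index whose `vec` attribute stores the index-to-character table, with position 0 pointed at as the default.

class PointedIndex:
    def __init__(self, vec):
        self.vec = vec  # position -> character
        self.idx = {c: i for i, c in enumerate(vec)}  # character -> position
    def __call__(self, char):
        # unknown characters fall back to the pointed 0th entry
        return self.idx.get(char, 0)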
Example #5
def infer_avg(sent, samples=128):
    bat = [sp.encode_capped_sample(vocab, sent) for _ in range(samples)]
    bat = vpack(bat, (len(bat), max(map(len, bat))), vocab.eos_id(), np.int32)
    z = model.z.eval({model.src: bat})
    return np.mean(z, axis=0)
Example #6
# load sentencepiece model
vocab = sp.load_spm(path_vocab)

# load the model
model = vAe('infer')
# restore the session
sess = tf.InteractiveSession()
tf.train.Saver().restore(sess, path_ckpt)

################################
# deterministic representation #
################################

# encode text with sentence piece model
data = list(map(partial(sp.encode_capped, vocab), text))
data = vpack(data, (len(data), max(map(len, data))), vocab.eos_id(), np.int32)

# calculate z for the test data in batches
inpt = [model.z.eval({model.src: data[i:j]}) for i, j in partition(len(data), 128)]
inpt = np.concatenate(inpt, axis=0)

np.save(path_emb, inpt)

#######################################################
# averaged representation with sentencepiece sampling #
#######################################################

def infer_avg(sent, samples=128):
    bat = [sp.encode_capped_sample(vocab, sent) for _ in range(samples)]
    bat = vpack(bat, (len(bat), max(map(len, bat))), vocab.eos_id(), np.int32)
    z = model.z.eval({model.src: bat})
    return np.mean(z, axis=0)
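A hypothetical continuation mirroring the deterministic branch above (the "_avg" suffix is illustrative):

avg = np.stack([infer_avg(sent) for sent in text])
np.save(path_emb + "_avg", avg)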
Example #7
##################
# prep and split #
##################

# train one sentencepiece model for each language
vocab = tuple(
    spm(pform(P.data, "vocab_{}".format(lang)), pform(P.raw, lang), C.dim_voc,
        C.bos, C.eos, C.unk) for lang in langs)

# remove long sentences
short = []
for sents in zip(*(load_txt(pform(P.raw, lang)) for lang in langs)):
    sents = [v.encode_as_ids(s) for v, s in zip(vocab, sents)]
    if all(len(sent) <= C.cap for sent in sents):
        short.append(sents)

np.random.seed(C.seed)
np.random.shuffle(short)

# pack instances into arrays
corpora = tuple(
    vpack(corp, (len(corp), C.cap), C.eos, np.uint16) for corp in zip(*short))
del short

# split and save
for lang, voc, corp in zip(langs, vocab, corpora):
    save_txt(pform(P.data, "eval_{}.txt".format(lang)),
             decode(voc, corp[:4096]))
    np.save(pform(P.data, "valid_{}.npy".format(lang)), corp[4096:5120])
    np.save(pform(P.data, "train_{}.npy".format(lang)), corp[5120:])
Example #8
File: data.py Project: ysmiraak/eti
####################
# filter and split #
####################

train_src = []
train_tgt = []
valid_src = []
valid_tgt = []
valid_raw = []
for src, tgt in src_tgt:
    s = vocab_src.encode_as_ids(src)
    t = vocab_tgt.encode_as_ids(tgt)
    if 0 < len(s) <= C.cap and 0 < len(t) <= C.cap:
        if len(valid_raw) < C.total_valid:
            valid_src.append(s)
            valid_tgt.append(t)
            valid_raw.append(tgt)
        else:
            train_src.append(src)
            train_tgt.append(tgt)

#############
# save data #
#############

save_txt(pform(P.data, "train_src.txt"), train_src)
save_txt(pform(P.data, "train_tgt.txt"), train_tgt)
save_txt(pform(P.data, "valid_tgt.txt"), valid_raw)
np.save(pform(P.data, "valid_tgt.npy"), vpack(valid_tgt, (C.total_valid, C.cap), C.eos, np.uint16))
np.save(pform(P.data, "valid_src.npy"), vpack(valid_src, (C.total_valid, C.cap), C.eos, np.uint16))
Example #9
#!/usr/bin/env python3

from util import comp, partial, PointedIndex
from util_io import path, load_meta, load
from util_np import np, vpack

names, texts = load_meta()

chars = {char for text in texts for char in text}
chars.remove("\n")
chars.remove(" ")
index = PointedIndex(" \n" + "".join(sorted(chars)))
texts = vpack(
    map(comp(partial(np.fromiter, dtype=np.uint8), partial(map, index)),
        texts), index("\n"))

np.save("trial/data/index", index.vec)
np.save("trial/data/texts", texts)
np.save("trial/data/names", names)

for name in names:
    np.save("trial/data/grams/" + name, load(path(name)))
Example #10
posts = tuple(
    clean(post[3])
    # extract the cleaned raw texts
    for filename in sorted(os.listdir(path_raw))
    # each json: posts, annotations, metadata
    for post in load_json(pform(path_raw, filename))[0]
    # each post: id, side(unused), author, raw text, annotations, parent post id, category (unused), timestamp
)

# remove empty posts
posts = tuple(post for post in posts if 0 < len(post))

# save raw texts
save_txt(path_txt, posts)

# train a sentence piece model
spm(name=path_vocab, path=path_txt)

# load the trained sentence piece model
vocab = load_spm(path_vocab + ".model")

# length control
posts = [encode_capped(vocab, post, cap=512) for post in posts]
save_txt(path_train, map(vocab.decode_ids, posts))

# validation data
posts = tuple(map(clean, load_txt(path_val)))
posts = [encode_capped(vocab, post, cap=512) for post in posts]
np.save(path_valid, vpack(posts, (len(posts), 512), vocab.eos_id(), np.int32))
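`encode_capped` (also used in Examples 3 and 6) is not shown on this page; a minimal sketch consistent with its call sites, offered as an assumption:

def encode_capped(vocab, sent, cap=512):
    # encode to subword ids, keeping at most `cap` pieces
    return vocab.encode_as_ids(sent)[:cap]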