Example #1
def batch(size=T.batch_train,
          path=P.train,
          vocab=vocab,
          seed=A.seed,
          kudo=A.sample,
          max_len=T.max_len):
    """Yield (src, tgt) batches of id arrays, padded with eos to the longest row."""
    eos = vocab.eos_id()
    pac = lambda arrs: vpack(arrs, (size, max(map(len, arrs))), eos, np.int32)
    enc = encode_capped_sample_pair if kudo else encode_capped
    raw = tuple(load_txt(path))
    bat = []
    for i in sample(len(raw), seed):
        if size == len(bat):
            if kudo:
                src, tgt = map(pac, zip(*bat))
            else:
                src = tgt = pac(bat)
            yield src, tgt
            bat = []
        bat.append(enc(vocab, raw[i], cap=max_len))
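Every example on this page leans on the project-specific helpers load_txt and save_txt from util_io, which are not shown here. A minimal sketch of what line-based helpers with this calling pattern usually look like (an assumption inferred from the call sites, not the project's actual code):

def load_txt(path, encoding="utf-8"):
    """Yield the lines of a text file with their trailing newlines stripped."""
    with open(path, encoding=encoding) as file:
        for line in file:
            yield line.rstrip("\n")

def save_txt(path, lines, encoding="utf-8"):
    """Write an iterable of strings to a text file, one string per line."""
    with open(path, "w", encoding=encoding) as file:
        for line in lines:
            print(line, file=file)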
Example #2
import os
import numpy as np
from util_io import load_txt, save_txt

############################################
### Processes and saves the Test Dataset ###
############################################

datadir = '../data/reason/reason'

file2fold = {}
for filename in os.listdir("{}/folds".format(datadir)):
    topic, fold = filename.split("-")
    # each fold file lists the documents belonging to that fold
    for line in load_txt("{}/folds/{}".format(datadir, filename)):
        file2fold[line.split()[0]] = fold

folds, labels, arguments = [], [], []
for topic in 'abortion', 'gayRights', 'marijuana', 'obama':
    dirname = "{}/{}".format(datadir, topic)
    for filename in sorted(os.listdir(dirname)):
        fold = int(file2fold[filename.split(".")[0]]) - 1
        text = list(
            load_txt(os.path.join(dirname, filename), encoding="Windows-1252"))
        for idx, sentence in enumerate(text):
            if sentence[:7] == "Label##":
                stance, reason = sentence[7:].lower().split("-")
                if "other" == reason: continue  #exclude OTHER class
                label = "{}-{}-{}".format(topic, stance, reason)
                count = 1
                try:
Example #3
#!/usr/bin/env python3

from util_cw import chars, CharWright
from util_io import load_txt, save_txt
from util_np import np

path_src = "../data/europarl-v7.de-en.de"
path_tgt = "../data/europarl-v7.de-en.en"
max_char = 256

#############
# load data #
#############

src_tgt = []
for src, tgt in zip(load_txt(path_src), load_txt(path_tgt)):
    src = src.strip()
    tgt = tgt.strip()
    if 3 <= len(src) <= max_char and 3 <= len(tgt) <= max_char:
        src_tgt.append((src, tgt))

np.random.seed(0)
np.random.shuffle(src_tgt)

src, tgt = zip(*src_tgt)
del src_tgt

#############
# save data #
#############
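The excerpt breaks off under the "save data" banner. Given the headers above, the missing step would write the two sides of the shuffled pairs back out with save_txt; a hypothetical continuation (the output paths are illustrative, not from the original script):

# assumed continuation: persist the filtered, shuffled pairs
save_txt("../data/train_src.txt", src)
save_txt("../data/train_tgt.txt", tgt)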
Example #4
from itertools import islice
from util_cw import CharWright
from util_io import load_txt, save_txt
from util_np import np, partition
from util_tf import tf
sess = tf.InteractiveSession()

# load model (`model`, `infer`, `trim_str`, `ckpt`, and `mode` are defined
# elsewhere in the project; this excerpt only shows the inference part)
cws = CharWright.load("../data/cws.pkl")
cwt = CharWright.load("../data/cwt.pkl")
m = model('infer', cws.dwh(), cwt.dwh())
saver = tf.train.Saver()
saver.restore(sess, "../ckpt/{}".format(ckpt))

# the first 4096 instances are used for validation
src = np.array(list(islice(load_txt("../data/src.txt"), 4096)))
tgt = np.array(list(islice(load_txt("../data/tgt.txt"), 4096)))
val = np.array(sorted(range(len(src)), key=lambda i: len(src[i])))
src = src[val]
tgt = tgt[val]


def translate(src, mode):
    for i, j in partition(len(src), 256):
        src_idx, len_src = cws(src[i:j], ret_img=False, ret_idx=True)
        pred, pidx = infer(mode, m, sess, cwt, src_idx, len_src)
        yield from trim_str(pidx, cwt)


save_txt("../tmp/prd", translate(src, mode))
save_txt("../tmp/tgt", tgt)
Example #5
from collections import defaultdict
from tqdm import tqdm
from util import partial
from util_io import pform, load_txt, save_txt
from util_np import np, vpack
from util_sp import spm, load_spm, decode

langs = 'en', 'nl', 'de', 'da', 'sv'

#######################
# align all 5 corpora #
#######################

# load all corpora
corp2pairs = {
    corp: tuple((s, t) for s, t in zip(
        map(str.strip,
            load_txt(pform(P.raw, "europarl-v7.{}-en.{}".format(corp, corp)))),
        map(str.strip,
            load_txt(pform(P.raw, "europarl-v7.{}-en.en".format(corp)))))
                if 0 < len(s) and 0 < len(t))
    for corp in langs[1:]
}

# partition into equivalence classes
sent2class = defaultdict(set)
for corp, pairs in corp2pairs.items():
    for s, t in tqdm(pairs, ncols=70):
        s = s, corp
        t = t, 'en'
        c = set.union(sent2class[s], sent2class[t])
        c.add(s)
        c.add(t)
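The excerpt cuts off before the merged class c is stored anywhere. A minimal sketch of the usual write-back step, assuming every member sentence should end up pointing at the same merged set (assumed continuation, not the original code):

        # point every member of the merged class at the same set object
        for x in c:
            sent2class[x] = c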
Example #6
import os
import numpy as np
from util_io import load_txt, save_txt, clean

topics = tuple("abortion gayRights marijuana obama".split())

top2folds = {
    top: tuple(
        set(
            load_txt("../data/reason/stance/folds/{}_folds/Fold-{}".format(
                top, fold))) for fold in range(1, 6))
    for top in topics
}

dataset = []
for top, folds in top2folds.items():
    path = "../data/reason/stance/{}".format(top)
    for fold, names in enumerate(folds):
        for name in names:
            data = list(load_txt("{}/{}.data".format(path, name)))
            assert len(data) == 1
            data = data[0]
            meta = dict(
                line.split("=")
                for line in load_txt("{}/{}.meta".format(path, name)))
            try:
                stn = int(meta['Stance'])
            except ValueError:
                print(top, name)
                continue
Example #7
    trial = 'cgc'
    ckpt = None

    from tqdm import tqdm
    from util_cw import CharWright
    from util_io import load_txt
    from util_np import np, batch_sample, partition
    from util_tf import tf, pipe

    tf.set_random_seed(0)
    sess = tf.InteractiveSession()

    cws = CharWright.load("../data/cws.pkl")
    cwt = CharWright.load("../data/cwt.pkl")
    src = np.array(list(load_txt("../data/src.txt")))
    tgt = np.array(list(load_txt("../data/tgt.txt")))
    src_valid, src_train = src[:4096], src[4096:]
    tgt_valid, tgt_train = tgt[:4096], tgt[4096:]
    val = np.array(
        sorted(range(len(tgt_valid)),
               key=lambda i: max(len(src_valid[i]), len(tgt_valid[i]))))
    src_valid = src_valid[val]
    tgt_valid = tgt_valid[val]

    def feed(src, tgt, cws=cws, cwt=cwt):
        src_idx, len_src = cws(src, ret_img=False, ret_idx=True)
        tgt_img, tgt_idx, len_tgt = cwt(tgt, ret_img=True, ret_idx=True)
        return src_idx, len_src, tgt_img, tgt_idx, len_tgt

    def batch(src=src_train, tgt=tgt_train, size=128, seed=0):
Example #8
File: data.py Project: ysmiraak/eti
path_src = pform(P.raw, "europarl-v7.de-en.de")
path_tgt = pform(P.raw, "europarl-v7.de-en.en")

###############
# build vocab #
###############

vocab_src = spm(pform(P.data, "vocab_src"), path_src, C.dim_src, C.bos, C.eos, C.unk)
vocab_tgt = spm(pform(P.data, "vocab_tgt"), path_tgt, C.dim_tgt, C.bos, C.eos, C.unk)

#############
# load data #
#############

src_tgt = list(zip(load_txt(path_src), load_txt(path_tgt)))
np.random.seed(C.seed)
np.random.shuffle(src_tgt)

####################
# filter and split #
####################

train_src = []
train_tgt = []
valid_src = []
valid_tgt = []
valid_raw = []
for src, tgt in src_tgt:
    s = vocab_src.encode_as_ids(src)
    t = vocab_tgt.encode_as_ids(tgt)
Example #9
        #print("{}\n{}\n{}\n{}\n".format(idx,_[0],_[1],_[2])) #print: sentence1, path, sentence2


path_vocab = "../trial/data/vocab.model"
path_txt = "../data/test_data.txt"
path_ckpt = "../trial/ckpt/kudo18"
path_use_dim = "../data/useful_dimension.npy"

# load and restore model
vae = vAe('infer')
sess = tf.InteractiveSession()
tf.train.Saver().restore(sess, path_ckpt)

# load vocab and text
vocab = sp.load_spm(path_vocab)
text = list(load_txt(path_txt))

# pick 2 random sentences to explore
np.random.seed(23)
sen_idx = np.random.randint(0, len(text), 2)  # randint excludes the upper bound, so indices stay in range
sentences = [text[idx] for idx in sen_idx]
print("sentence 1: {}\nsentence 2: {}".format(sentences[0], sentences[1]))

# encode sentences with sentence piece model
data = sp.encode(vocab, sentences)

### full high dimensional space
z = vae.z.eval({vae.tgt: data})
analyze(z)

### only the dimensions that turned out useful for our task
Example #10
import csv

def load_ibm_claim(path):
    rows = csv.reader(load_txt(path))
    next(rows)  # skip the header row
    for row in rows:
        yield row[3]  # the claim text sits in the fourth column
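A hypothetical way to use the generator, mirroring what Example #11 does with it (the path below is illustrative):

save_txt("../data/claims.txt", load_ibm_claim("../data/ibm/q_mc_train.csv"))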
Example #11
    rows = csv.reader(load_txt(path))
    next(rows)
    for row in rows:
        yield row[3]

def load_all():
    for split in "q_mc_heldout.csv", "q_mc_test.csv", "q_mc_train.csv", "test_set.csv":
        yield from load_ibm_claim("{}/{}".format(path_csv, split))

# extract all sentences
save_txt(path_txt, load_all())

# train a sentence piece model
spm(name=path_vocab, path=path_txt)

# load the trained sentence piece model
vocab = load_spm(path_vocab + ".model")

# load and shuffle sentences
sents = list(load_txt(path_txt))
np.random.seed(0)
np.random.shuffle(sents)

# train valid split
valid = sents[:valid_size]
train = sents[valid_size:]

# save train and valid data
save_txt(path_train, train)
np.save(path_valid, encode(vocab, valid))
Example #12
from itertools import permutations
from trial import config as C, paths as P, train as T
from util import partial, comp, select
from util_io import pform, load_txt, save_txt
from util_np import np, partition, batch_sample
from util_sp import load_spm, encode, decode
from util_tf import tf, pipe
tf.set_random_seed(C.seed)

C.trial = "m3_"
C.ckpt = 3

langs = 'en', 'nl', 'de', 'da', 'sv'
vocab = tuple(
    load_spm(pform(P.data, "vocab_{}.model".format(lang))) for lang in langs)
sents = tuple(
    encode(voc, load_txt(pform(P.data, "eval_{}.txt".format(lang))))
    for lang, voc in zip(langs, vocab))

index = tuple(permutations(range(5), 2))
model = Model.new(**select(C, *Model._new))
model = tuple(model.data(i, j).infer() for i, j in index)

sess = tf.InteractiveSession()
saver = tf.train.Saver()


def trans(sents, model, vocab):
    for preds in batch_run(sess, model, model.pred, sents,
                           batch=C.batch_infer):
        yield from decode(vocab, preds)
Example #13
posts = tuple(
    clean(post[3])
    # extract the cleaned raw texts
    for filename in sorted(os.listdir(path_raw))
    # each json: posts, annotations, metadata
    for post in load_json(pform(path_raw, filename))[0]
    # each post: id, side(unused), author, raw text, annotations, parent post id, category (unused), timestamp
)

# removes empty posts
posts = tuple(post for post in posts if 0 < len(post))

# saves raw texts
save_txt(path_txt, posts)

# train a sentence piece model
spm(name=path_vocab, path=path_txt)

# load the trained sentence piece model
vocab = load_spm(path_vocab + ".model")

# length control
posts = [encode_capped(vocab, post, cap=512) for post in posts]
save_txt(path_train, map(vocab.decode_ids, posts))

# validation data
posts = tuple(map(clean, load_txt(path_val)))
posts = [encode_capped(vocab, post, cap=512) for post in posts]
np.save(path_valid, vpack(posts, (len(posts), 512), vocab.eos_id(), np.int32))
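vpack from util_np shows up here and in Example #1 but is never defined on this page; its call sites pass a list of id sequences, a target shape, a fill id, and a dtype. A minimal sketch of a packer with that behaviour (an assumption, not the project's actual code):

import numpy as np

def vpack(arrays, shape, fill, dtype):
    """Pack variable-length sequences into a `shape` array padded with `fill`."""
    packed = np.full(shape, fill, dtype=dtype)
    for row, arr in zip(packed, arrays):
        row[:len(arr)] = arr
    return packed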