def batch(size=T.batch_train, path=P.train, vocab=vocab, seed=A.seed, kudo=A.sample, max_len=T.max_len):
    eos = vocab.eos_id()
    pac = lambda arrs: vpack(arrs, (size, max(map(len, arrs))), eos, np.int32)
    enc = encode_capped_sample_pair if kudo else encode_capped
    raw = tuple(load_txt(path))
    bat = []
    for i in sample(len(raw), seed):
        if size == len(bat):
            if kudo:
                src, tgt = map(pac, zip(*bat))
            else:
                src = tgt = pac(bat)
            yield src, tgt
            bat = []
        bat.append(enc(vocab, raw[i], cap=max_len))
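# usage sketch (illustration only, not part of the original script): each step of
# the generator yields a (src, tgt) pair of int32 arrays padded with the eos id,
# shaped (size, longest-sequence-in-batch); with kudo sampling src and tgt differ,
# otherwise they are the same array. Assumes the trial config objects T, P, A and
# the `vocab` used in the defaults above.
#
#     src, tgt = next(batch(size=2))
#     assert src.dtype == np.int32 and len(src) == 2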
import os
import numpy as np
from util_io import load_txt, save_txt

############################################
### Processes and saves the Test Dataset ###
############################################

datadir = '../data/reason/reason'

# map each document name to its fold
file2fold = {}
for filename in os.listdir("{}/folds".format(datadir)):
    topic, fold = filename.split("-")
    for line in load_txt("{}/folds/{}".format(datadir, filename)):
        file2fold[line.split()[0]] = fold

folds, labels, arguments = [], [], []
for topic in 'abortion', 'gayRights', 'marijuana', 'obama':
    dirname = "{}/{}".format(datadir, topic)
    for filename in sorted(os.listdir(dirname)):
        fold = int(file2fold[filename.split(".")[0]]) - 1
        text = list(load_txt(os.path.join(dirname, filename), encoding="Windows-1252"))
        for idx, sentence in enumerate(text):
            if sentence[:7] == "Label##":
                stance, reason = sentence[7:].lower().split("-")
                if "other" == reason: continue  # exclude OTHER class
                label = "{}-{}-{}".format(topic, stance, reason)
                count = 1
                try:
#!/usr/bin/env python3
from util_cw import chars, CharWright
from util_io import load_txt, save_txt
from util_np import np

path_src = "../data/europarl-v7.de-en.de"
path_tgt = "../data/europarl-v7.de-en.en"
max_char = 256

#############
# load data #
#############

src_tgt = []
for src, tgt in zip(load_txt(path_src), load_txt(path_tgt)):
    src = src.strip()
    tgt = tgt.strip()
    if 3 <= len(src) <= max_char and 3 <= len(tgt) <= max_char:
        src_tgt.append((src, tgt))

np.random.seed(0)
np.random.shuffle(src_tgt)
src, tgt = zip(*src_tgt)
del src_tgt

#############
# save data #
#############
from itertools import islice
from util_cw import CharWright
from util_io import load_txt, save_txt
from util_np import np, partition
from util_tf import tf

sess = tf.InteractiveSession()

# load model
cws = CharWright.load("../data/cws.pkl")
cwt = CharWright.load("../data/cwt.pkl")
m = model('infer', cws.dwh(), cwt.dwh())
saver = tf.train.Saver()
saver.restore(sess, "../ckpt/{}".format(ckpt))

# the first 4096 instances are used for validation
src = np.array(list(islice(load_txt("../data/src.txt"), 4096)))
tgt = np.array(list(islice(load_txt("../data/tgt.txt"), 4096)))
val = np.array(sorted(range(len(src)), key=lambda i: len(src[i])))
src = src[val]
tgt = tgt[val]

def translate(src, mode):
    for i, j in partition(len(src), 256):
        src_idx, len_src = cws(src[i:j], ret_img=False, ret_idx=True)
        pred, pidx = infer(mode, m, sess, cwt, src_idx, len_src)
        yield from trim_str(pidx, cwt)

save_txt("../tmp/prd", translate(src, mode))
save_txt("../tmp/tgt", tgt)
from collections import defaultdict
from tqdm import tqdm
from util import partial
from util_io import pform, load_txt, save_txt
from util_np import np, vpack
from util_sp import spm, load_spm, decode

langs = 'en', 'nl', 'de', 'da', 'sv'

#######################
# align all 5 corpora #
#######################

# load all corpora
corp2pairs = {
    corp: tuple(
        (s, t) for s, t in zip(
            map(str.strip, load_txt(pform(P.raw, "europarl-v7.{}-en.{}".format(corp, corp)))),
            map(str.strip, load_txt(pform(P.raw, "europarl-v7.{}-en.en".format(corp)))))
        if 0 < len(s) and 0 < len(t))
    for corp in langs[1:]}

# partition into equivalence classes
sent2class = defaultdict(set)
for corp, pairs in corp2pairs.items():
    for s, t in tqdm(pairs, ncols=70):
        s = s, corp
        t = t, 'en'
        c = set.union(sent2class[s], sent2class[t])
        c.add(s)
        c.add(t)
import os
import numpy as np
from util_io import load_txt, save_txt, clean

topics = tuple("abortion gayRights marijuana obama".split())

top2folds = {
    top: tuple(
        set(load_txt("../data/reason/stance/folds/{}_folds/Fold-{}".format(top, fold)))
        for fold in range(1, 6))
    for top in topics}

dataset = []
for top, folds in top2folds.items():
    path = "../data/reason/stance/{}".format(top)
    for fold, names in enumerate(folds):
        for name in names:
            data = list(load_txt("{}/{}.data".format(path, name)))
            assert len(data) == 1
            data = data[0]
            meta = dict(
                line.split("=")
                for line in load_txt("{}/{}.meta".format(path, name)))
            try:
                stn = int(meta['Stance'])
            except ValueError:
                print(top, name)
                continue
trial = 'cgc'
ckpt = None

from tqdm import tqdm
from util_cw import CharWright
from util_io import load_txt
from util_np import np, batch_sample, partition
from util_tf import tf, pipe

tf.set_random_seed(0)
sess = tf.InteractiveSession()

cws = CharWright.load("../data/cws.pkl")
cwt = CharWright.load("../data/cwt.pkl")

src = np.array(list(load_txt("../data/src.txt")))
tgt = np.array(list(load_txt("../data/tgt.txt")))
src_valid, src_train = src[:4096], src[4096:]
tgt_valid, tgt_train = tgt[:4096], tgt[4096:]
val = np.array(
    sorted(range(len(tgt_valid)),
           key=lambda i: max(len(src_valid[i]), len(tgt_valid[i]))))
src_valid = src_valid[val]
tgt_valid = tgt_valid[val]

def feed(src, tgt, cws=cws, cwt=cwt):
    src_idx, len_src = cws(src, ret_img=False, ret_idx=True)
    tgt_img, tgt_idx, len_tgt = cwt(tgt, ret_img=True, ret_idx=True)
    return src_idx, len_src, tgt_img, tgt_idx, len_tgt

def batch(src=src_train, tgt=tgt_train, size=128, seed=0):
path_src = pform(P.raw, "europarl-v7.de-en.de")
path_tgt = pform(P.raw, "europarl-v7.de-en.en")

###############
# build vocab #
###############

vocab_src = spm(pform(P.data, "vocab_src"), path_src, C.dim_src, C.bos, C.eos, C.unk)
vocab_tgt = spm(pform(P.data, "vocab_tgt"), path_tgt, C.dim_tgt, C.bos, C.eos, C.unk)

#############
# load data #
#############

src_tgt = list(zip(load_txt(path_src), load_txt(path_tgt)))
np.random.seed(C.seed)
np.random.shuffle(src_tgt)

####################
# filter and split #
####################

train_src = []
train_tgt = []
valid_src = []
valid_tgt = []
valid_raw = []
for src, tgt in src_tgt:
    s = vocab_src.encode_as_ids(src)
    t = vocab_tgt.encode_as_ids(tgt)
#print("{}\n{}\n{}\n{}\n".format(idx,_[0],_[1],_[2])) #print: sentence1, path, sentence2 path_vocab = "../trial/data/vocab.model" path_txt = "../data/test_data.txt" path_ckpt = "../trial/ckpt/kudo18" path_use_dim = "../data/useful_dimension.npy" # load and restore model vae = vAe('infer') sess = tf.InteractiveSession() tf.train.Saver().restore(sess, path_ckpt) # load vocab and text vocab = sp.load_spm(path_vocab) text = list(load_txt(path_txt)) #pick 2 random sentences to explore np.random.seed(23) sen_idx = np.random.random_integers(0, len(text), 2) sentences = [text[idx] for idx in sen_idx] print("sentence 1: {}\nsentence 2: {}".format(sentences[0], sentences[1])) # encode sentences with sentence piece model data = sp.encode(vocab, sentences) ### full high dimensional space z = vae.z.eval({vae.tgt: data}) analyze(z) ### only the dimensions that turned out usefull for our task
def load_ibm_claim(path):
    rows = csv.reader(load_txt(path))
    next(rows)
    for row in rows:
        yield row[3]

def load_all():
    for split in "q_mc_heldout.csv", "q_mc_test.csv", "q_mc_train.csv", "test_set.csv":
        yield from load_ibm_claim("{}/{}".format(path_csv, split))

# extract all sentences
save_txt(path_txt, load_all())

# train a sentence piece model
spm(name=path_vocab, path=path_txt)

# load the trained sentence piece model
vocab = load_spm(path_vocab + ".model")

# load and shuffle sentences
sents = list(load_txt(path_txt))
np.random.seed(0)
np.random.shuffle(sents)

# train valid split
valid = sents[:valid_size]
train = sents[valid_size:]

# save train and valid data
save_txt(path_train, train)
np.save(path_valid, encode(vocab, valid))
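# sanity-check sketch (illustration only; assumes the csv files live under path_csv
# as referenced above, each with a header row and the claim text in its fourth
# column, i.e. row[3]):
#
#     print(next(load_ibm_claim("{}/{}".format(path_csv, "q_mc_train.csv"))))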
from itertools import permutations
from trial import config as C, paths as P, train as T
from util import partial, comp, select
from util_io import pform, load_txt, save_txt
from util_np import np, partition, batch_sample
from util_sp import load_spm, encode, decode
from util_tf import tf, pipe

tf.set_random_seed(C.seed)

C.trial = "m3_"
C.ckpt = 3

langs = 'en', 'nl', 'de', 'da', 'sv'
vocab = tuple(
    load_spm(pform(P.data, "vocab_{}.model".format(lang)))
    for lang in langs)
sents = tuple(
    encode(voc, load_txt(pform(P.data, "eval_{}.txt".format(lang))))
    for lang, voc in zip(langs, vocab))

index = tuple(permutations(range(5), 2))
model = Model.new(**select(C, *Model._new))
model = tuple(model.data(i, j).infer() for i, j in index)

sess = tf.InteractiveSession()
saver = tf.train.Saver()

def trans(sents, model, vocab):
    for preds in batch_run(sess, model, model.pred, sents, batch=C.batch_infer):
        yield from decode(vocab, preds)
posts = tuple(
    clean(post[3])  # extract the cleaned raw texts
    for filename in sorted(os.listdir(path_raw))
    # each json: posts, annotations, metadata
    for post in load_json(pform(path_raw, filename))[0]
    # each post: id, side (unused), author, raw text, annotations, parent post id, category (unused), timestamp
)

# removes empty posts
posts = tuple(post for post in posts if 0 < len(post))

# saves raw texts
save_txt(path_txt, posts)

# train a sentence piece model
spm(name=path_vocab, path=path_txt)

# load the trained sentence piece model
vocab = load_spm(path_vocab + ".model")

# length control
posts = [encode_capped(vocab, post, cap=512) for post in posts]
save_txt(path_train, map(vocab.decode_ids, posts))

# validation data
posts = tuple(map(clean, load_txt(path_val)))
posts = [encode_capped(vocab, post, cap=512) for post in posts]
np.save(path_valid, vpack(posts, (len(posts), 512), vocab.eos_id(), np.int32))
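# note (assumption about the helpers above): encode_capped truncates each encoded
# post to at most 512 ids, and vpack pads the ragged list into a dense
# (len(posts), 512) int32 array filled with the eos id, which is what np.save
# writes to path_valid.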