from itertools import islice
from util_io import load_txt, save_txt
from util_np import np, partition
from util_tf import tf

# NOTE: CharWright, model, infer, trim_str, and the ckpt/mode settings come
# from the project's model and configuration code; their imports are not
# shown in this snippet.

sess = tf.InteractiveSession()

# load model
cws = CharWright.load("../data/cws.pkl")
cwt = CharWright.load("../data/cwt.pkl")
m = model('infer', cws.dwh(), cwt.dwh())
saver = tf.train.Saver()
saver.restore(sess, "../ckpt/{}".format(ckpt))

# the first 4096 instances are used for validation
src = np.array(list(islice(load_txt("../data/src.txt"), 4096)))
tgt = np.array(list(islice(load_txt("../data/tgt.txt"), 4096)))
# sort the validation instances by source length for efficient batching
val = np.array(sorted(range(len(src)), key=lambda i: len(src[i])))
src = src[val]
tgt = tgt[val]

def translate(src, mode):
    for i, j in partition(len(src), 256):
        src_idx, len_src = cws(src[i:j], ret_img=False, ret_idx=True)
        pred, pidx = infer(mode, m, sess, cwt, src_idx, len_src)
        yield from trim_str(pidx, cwt)

save_txt("../tmp/prd", translate(src, mode))
save_txt("../tmp/tgt", tgt)
# sacrebleu -tok intl -b -i ../tmp/prd ../tmp/tgt
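# A minimal sketch of how `partition` from util_np is presumably meant to
# behave (an assumption based on its use above, not the project's actual
# implementation): yield (start, stop) index pairs covering `total` items in
# consecutive batches of at most `size`, so that `src[i:j]` slices one batch.
def partition_sketch(total, size):
    for start in range(0, total, size):
        yield start, min(start + size, total)

# e.g. list(partition_sketch(10, 4)) == [(0, 4), (4, 8), (8, 10)]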
# fragment: this block runs inside a loop over the labeled lines of `text`
# (presumably `for idx, sentence in enumerate(text):`), with `topic` and
# `fold` set by the enclosing loops
if sentence[:7] == "Label##":
    stance, reason = sentence[7:].lower().split("-")
    if "other" == reason:
        continue  # exclude the OTHER class
    label = "{}-{}-{}".format(topic, stance, reason)
    count = 1
    try:
        while text[idx + count][:6] == "Line##":
            folds.append(fold)
            labels.append(label)
            arguments.append(text[idx + count][6:])
            count += 1
    except IndexError:
        continue

# save the data
save_txt("../data/test_data.txt", arguments)
np.save("../data/test_labels.npy", np.asarray(labels))
np.save("../data/test_folds.npy", np.asarray(folds))

#######
# NEW #
#######

import os
import re
import numpy as np
from util_io import load_txt, save_txt, clean

datadir = '../data/reason/reason'
posts, labels, topics = [], [], []
for topic in 'abortion', 'gayRights', 'marijuana', 'obama':
classes = {id(cls): cls for cls in sent2class.values()}
del sent2class

aligned = []
for sent_lang in tqdm(classes.values(), ncols=70):
    lang2sents = defaultdict(list)
    for sent, lang in sent_lang:
        lang2sents[lang].append(sent)
    # keep only classes with exactly one sentence per language
    if len(langs) == len(lang2sents) and all(1 == len(sents) for sents in lang2sents.values()):
        aligned.append(tuple(lang2sents[lang][0] for lang in langs))
aligned.sort()
del classes

# save aligned corpora
for lang, sents in zip(langs, zip(*aligned)):
    save_txt(pform(P.raw, lang), sents)
del aligned

##################
# prep and split #
##################

# train one sentencepiece model for each language
vocab = tuple(
    spm(pform(P.data, "vocab_{}".format(lang)), pform(P.raw, lang), C.dim_voc, C.bos, C.eos, C.unk)
    for lang in langs)

# remove long sentences
short = []
for sents in zip(*(load_txt(pform(P.raw, lang)) for lang in langs)):
    sents = [v.encode_as_ids(s) for v, s in zip(vocab, sents)]
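# The project's `spm` helper is called above with a model name, a text file,
# a vocabulary size, and the special token ids. A hedged sketch of what such
# a wrapper around the sentencepiece library could look like (the real
# util_sp.spm may differ in options and return value):
import sentencepiece as sp_lib

def spm_sketch(name, path, size, bos, eos, unk):
    # train a sentencepiece model on the raw text file ...
    sp_lib.SentencePieceTrainer.train(
        input=path, model_prefix=name, vocab_size=size,
        bos_id=bos, eos_id=eos, unk_id=unk)
    # ... and return the trained model, ready for encode_as_ids
    return sp_lib.SentencePieceProcessor(model_file=name + ".model")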
#############
# load data #
#############

src_tgt = []
for src, tgt in zip(load_txt(path_src), load_txt(path_tgt)):
    src = src.strip()
    tgt = tgt.strip()
    if 3 <= len(src) <= max_char and 3 <= len(tgt) <= max_char:
        src_tgt.append((src, tgt))

np.random.seed(0)
np.random.shuffle(src_tgt)
src, tgt = zip(*src_tgt)
del src_tgt

#############
# save data #
#############

cws = CharWright.new(chars(src))
cwt = CharWright.new(chars(tgt))
cws.save("../data/cws.pkl")
cwt.save("../data/cwt.pkl")
save_txt("../data/src.txt", src)
save_txt("../data/tgt.txt", tgt)
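# `chars` above presumably collects the character inventory of a corpus so
# that CharWright can be built over it. A rough sketch under that assumption
# (the actual helper may also track character frequencies):
def chars_sketch(sents):
    charset = set()
    for sent in sents:
        charset.update(sent)
    return "".join(sorted(charset))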
####################
# filter and split #
####################

train_src = []
train_tgt = []
valid_src = []
valid_tgt = []
valid_raw = []
for src, tgt in src_tgt:
    s = vocab_src.encode_as_ids(src)
    t = vocab_tgt.encode_as_ids(tgt)
    if 0 < len(s) <= C.cap and 0 < len(t) <= C.cap:
        if len(valid_raw) < C.total_valid:
            valid_src.append(s)
            valid_tgt.append(t)
            valid_raw.append(tgt)
        else:
            train_src.append(src)
            train_tgt.append(tgt)

#############
# save data #
#############

save_txt(pform(P.data, "train_src.txt"), train_src)
save_txt(pform(P.data, "train_tgt.txt"), train_tgt)
save_txt(pform(P.data, "valid_tgt.txt"), valid_raw)
np.save(
    pform(P.data, "valid_tgt.npy"),
    vpack(valid_tgt, (C.total_valid, C.cap), C.eos, np.uint16))
np.save(
    pform(P.data, "valid_src.npy"),
    vpack(valid_src, (C.total_valid, C.cap), C.eos, np.uint16))
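# `vpack` from util_np is used above to turn the variable-length id sequences
# into fixed-shape arrays. A hedged sketch of that packing step (an assumed
# behavior inferred from the call sites, not the actual implementation):
import numpy as np

def vpack_sketch(seqs, shape, fill, dtype):
    # allocate the full array with the padding value, then copy each sequence
    arr = np.full(shape, fill, dtype=dtype)
    for row, seq in zip(arr, seqs):
        row[:len(seq)] = seq
    return arr

# vpack_sketch([[3, 7], [5]], (2, 4), fill=1, dtype=np.uint16)
# -> array([[3, 7, 1, 1],
#           [5, 1, 1, 1]], dtype=uint16)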
from util_io import load_txt, save_txt
from util_np import np, partition, vpack
from util_tf import tf
import pandas as pd
import util_sp as sp

# NOTE: vAe and decode come from the project's model code; their imports are
# not shown in this snippet.

# load data
df = pd.read_csv(path_csv)
emb = np.load(path_emb)
emb_sp = np.load(path_emb_sp)

# load the sentencepiece model
vocab = sp.load_spm(path_vocab)

# load the model and restore the session
model = vAe('infer')
sess = tf.InteractiveSession()
tf.train.Saver().restore(sess, path_ckpt)

###########################
# generate from centroids #
###########################

for col in "euc euc_sp cos cos_sp".split():
    cluster = df["cls_{}".format(col)].values
    # mean embedding of each cluster
    centroids = np.stack(
        [np.mean(emb[cluster == c], axis=0) for c in range(cluster.max() + 1)])
    y = decode(sess, model, centroids, steps=512)
    save_txt("../trial/centroids_{}".format(col), sp.decode(vocab, y))
import csv
from util_io import load_txt, save_txt
from util_np import np, vpack
from util_sp import load_spm, spm, encode

def load_ibm_claim(path):
    rows = csv.reader(load_txt(path))
    next(rows)  # skip the header row
    for row in rows:
        yield row[3]

def load_all():
    for split in "q_mc_heldout.csv", "q_mc_test.csv", "q_mc_train.csv", "test_set.csv":
        yield from load_ibm_claim("{}/{}".format(path_csv, split))

# extract all sentences
save_txt(path_txt, load_all())

# train a sentence piece model
spm(name=path_vocab, path=path_txt)

# load the trained sentence piece model
vocab = load_spm(path_vocab + ".model")

# load and shuffle sentences
sents = list(load_txt(path_txt))
np.random.seed(0)
np.random.shuffle(sents)

# train-valid split
valid = sents[:valid_size]
train = sents[valid_size:]
C.trial = "m3_"
C.ckpt = 3

langs = 'en', 'nl', 'de', 'da', 'sv'

vocab = tuple(
    load_spm(pform(P.data, "vocab_{}.model".format(lang)))
    for lang in langs)

sents = tuple(
    encode(voc, load_txt(pform(P.data, "eval_{}.txt".format(lang))))
    for lang, voc in zip(langs, vocab))

# all ordered (source, target) pairs over the 5 languages
index = tuple(permutations(range(5), 2))

model = Model.new(**select(C, *Model._new))
model = tuple(model.data(i, j).infer() for i, j in index)

sess = tf.InteractiveSession()
saver = tf.train.Saver()

def trans(sents, model, vocab):
    for preds in batch_run(sess, model, model.pred, sents, batch=C.batch_infer):
        yield from decode(vocab, preds)

saver.restore(sess, pform(P.ckpt, C.trial, C.ckpt))

for (i, j), m in zip(index, model):
    print(langs[j], "<-", langs[i])
    save_txt(pform(P.pred, C.trial, langs[j], "_", langs[i]), trans(sents[i], m, vocab[j]))
import os

posts = tuple(
    clean(post[3])  # extract the cleaned raw texts
    for filename in sorted(os.listdir(path_raw))  # each json: posts, annotations, metadata
    for post in load_json(pform(path_raw, filename))[0]  # each post: id, side (unused), author, raw text, annotations, parent post id, category (unused), timestamp
)

# remove empty posts
posts = tuple(post for post in posts if 0 < len(post))

# save raw texts
save_txt(path_txt, posts)

# train a sentence piece model
spm(name=path_vocab, path=path_txt)

# load the trained sentence piece model
vocab = load_spm(path_vocab + ".model")

# length control
posts = [encode_capped(vocab, post, cap=512) for post in posts]
save_txt(path_train, map(vocab.decode_ids, posts))

# validation data
posts = tuple(map(clean, load_txt(path_val)))
posts = [encode_capped(vocab, post, cap=512) for post in posts]
np.save(path_valid, vpack(posts, (len(posts), 512), vocab.eos_id(), np.int32))
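# A rough sketch of the length control applied above, assuming `encode_capped`
# encodes a post with the sentencepiece model and keeps at most `cap` ids
# (the actual util_sp helper may cut at a sentence boundary instead):
def encode_capped_sketch(vocab, text, cap=512):
    ids = vocab.encode_as_ids(text)
    return ids[:cap]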