import json

# read_transfile and read_trans_prompts come from this repo's utils; sp and
# en_sp are module-level SentencePiece processors (target and source side).


def preprocess(split):
    """ preprocess data for filtering model """
    # Read all candidate translations for this split.
    origfile = "all_cands_%s.txt" % split
    with open(origfile) as f:
        orig_prompts = read_transfile(f.readlines(), weighted=False)
    with open(origfile) as f:
        src_lines = read_trans_prompts(f.readlines())
    src_lines = {a: b for (a, b) in src_lines}

    data = []
    for (key, x) in orig_prompts.items():
        candidates = []
        for k in x.keys():
            score = 0
            sent = sp.EncodeAsPieces(k)
            candidates.append((sent, score))
        candidates = sorted(candidates, key=lambda c: c[1], reverse=True)
        candidates = [{"id": i, "tokens": c[0], "score": c[1]}
                      for (i, c) in enumerate(candidates)]
        data.append({"src": key, "cand": candidates})
    cand = data

    # Read the gold translations for this split.
    origfile = "%s_split.gold.txt" % split
    with open(origfile) as f:
        orig_prompts = read_transfile(f.readlines(), weighted=True)

    data = []
    for (key, x) in orig_prompts.items():
        candidates = []
        for k in x.keys():
            score = 0
            sent = sp.EncodeAsPieces(k)
            candidates.append((sent, score))
        candidates = sorted(candidates, key=lambda c: c[1], reverse=True)
        candidates = [{"id": i, "tokens": c[0], "score": c[1]}
                      for (i, c) in enumerate(candidates)]
        data.append({"src": key, "cand": candidates})
    gold = data

    print(len(gold))
    gold = {x["src"]: x["cand"] for x in gold}
    cand = {x["src"]: x["cand"] for x in cand}

    # Label each candidate 1 if it also appears in the gold set, else 0.
    data = []
    for (k, v) in gold.items():
        x = cand[k]
        real = set([" ".join(y["tokens"]) for y in v])
        num = len(x)
        cnt = 0
        for item in x:
            if " ".join(item["tokens"]) in real:
                item["score"] = 1
                cnt += 1
        src_tokens = en_sp.EncodeAsPieces(src_lines[k])
        data.append({"src": src_tokens, "cand": x})

    with open("./data/%s.ja.rank.aug.json" % split, "w") as f:
        json.dump(data, f)
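# A minimal usage sketch for preprocess (hypothetical setup: the source-side
# model name and the choice of splits are assumptions; 'subword.trg.model'
# appears elsewhere in this repo):
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("subword.trg.model")     # target-side subword model
en_sp = spm.SentencePieceProcessor()
en_sp.Load("subword.src.model")  # source-side subword model (assumed name)

if __name__ == "__main__":
    for split in ("train", "dev"):
        # expects all_cands_<split>.txt and <split>_split.gold.txt in the cwd
        preprocess(split)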
def get_data(fname: str, srcfname: str, tgtfname: str, prefix: str) -> None:
    """
    This converts data in the shared task format into standard machine
    translation format (one sentence per line, languages in separate files).
    For training data, it combines the prompt with all accepted translations.
    For dev or test data, it combines the prompt only with the most popular
    translation.
    """
    with open(fname) as f:
        lines = f.readlines()
        d = read_transfile(lines, strip_punc=False, weighted=True)
        id_text = dict(read_trans_prompts(lines))

    with open(srcfname, "w") as src, open(tgtfname, "w") as tgt:
        for idstring in d.keys():
            # prompt is combination of id and text.
            prompt = id_text[idstring]
            ats = d[idstring]
            # make sure that the first element is the largest.
            ats = sorted(ats.items(), key=lambda p: p[1], reverse=True)

            if prefix == "train":
                # write all pairs.
                for p in ats:
                    print(prompt, file=src)
                    print(p[0], file=tgt)
            else:
                # write just the first pair (evaluate only on the first line).
                top_ranked_text = ats[0][0]
                print(prompt, file=src)
                print(top_ranked_text, file=tgt)
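# A minimal usage sketch (hypothetical file names, chosen to match the
# "<split>_split.gold.txt" pattern used elsewhere in this repo):
get_data("train_split.gold.txt", "train.src", "train.trg", prefix="train")
get_data("dev_split.gold.txt", "dev.src", "dev.trg", prefix="dev")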
import operator
from datetime import datetime

# read_transfile, lee_transfile, range_BLEU, discrepancy_score, and eprint
# are helpers from this repo.


def main():
    now = datetime.now().time()
    print(now)

    # Score each prompt's predicted translations against the top-weighted
    # gold translation.
    with open('sample.txt') as F:
        PRED = read_transfile(F.readlines())
    with open('reference.txt') as G:
        GOLD = read_transfile(G.readlines(), weighted=True)

    avg_BLEUs = {}
    for K in GOLD.keys():
        # use the highest-weighted gold translation as the single reference
        ref = max(GOLD[K].items(), key=operator.itemgetter(1))[0]
        trans_set = PRED[K]
        avg_BLEU = round(range_BLEU(trans_set, ref, 'subword.trg.model'),
                         ndigits=4)
        avg_BLEUs[K] = avg_BLEU

    # Compute a per-prompt discrepancy score over the predicted translations.
    with open('sample.txt') as f:
        pred = lee_transfile(f.readlines())
    dp_score_list = {}
    for k, d in pred.items():
        # count duplicate translations
        trans_dict = {}
        for item in list(d):
            trans_dict[item] = trans_dict.get(item, 0) + 1
        eprint(len(trans_dict))
        dp_score = round(discrepancy_score(trans_dict, 'subword.trg.model'),
                         ndigits=4)
        dp_score_list[k] = dp_score

    now = datetime.now().time()
    print(now)
    for K in GOLD.keys():
        DP = dp_score_list[K]
        AB = avg_BLEUs[K]
        print(str(DP) + '\t' + str(AB))
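# For orientation, a sketch of the data shape the loops above assume (inferred
# from how the dicts are indexed; not taken from read_transfile itself):
# read_transfile(lines, weighted=True) -> {prompt_id: {translation: weight}}
import operator

GOLD_example = {"prompt01": {"das ist gut": 0.7, "es ist gut": 0.3}}
ref = max(GOLD_example["prompt01"].items(), key=operator.itemgetter(1))[0]
print(ref)  # -> "das ist gut"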
def main():
    with open(args.predfile) as f:
        print("reading pred")
        pred = read_transfile(f.readlines())

    # one discrepancy score per prompt
    dp_score_list = []
    for k, d in pred.items():
        trans_set = list(d.keys())
        print(trans_set)
        dp_score = round(discrepancy_score(trans_set), ndigits=4)
        dp_score_list.append(dp_score)

    for score in dp_score_list:
        print(score)
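# main() above reads a module-level `args`; a minimal wiring sketch (the
# --predfile flag matches the code above, the rest is an assumption):
import argparse
from utils import read_transfile

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--predfile", help="pred file", required=True)
    args = parser.parse_args()
    main()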
import random
import sys
from collections import defaultdict

# read_trans_prompts is assumed to live alongside read_transfile in utils.
from utils import read_transfile, read_trans_prompts


def split_data(langs: str, duo_data_dir: str, output_dir: str):
    langs = langs.split()
    data = defaultdict(dict)
    all_prompts = {}
    shared_prompts = {}
    prompts = defaultdict(dict)

    # Build list of prompts shared across languages
    for lang in langs:
        gold_path = f"{duo_data_dir}/en_{lang}/train.en_{lang}.2020-01-13.gold.txt"
        trainset = read_transfile(open(gold_path).readlines(),
                                  weighted=True,
                                  strip_punc=False)
        lang_prompts = read_trans_prompts(open(gold_path).readlines())
        for key, prompt in lang_prompts:
            prompts[lang][key] = prompt
        for prompt, translations in trainset.items():
            data[lang][prompt] = translations
            if prompt in all_prompts:
                shared_prompts[prompt] = 1
            all_prompts[prompt] = 1

    for lang in langs:
        not_shared = list(
            filter(lambda x: x not in shared_prompts, data[lang].keys()))
        print(f"{lang}: {len(data[lang].keys())} not shared: {len(not_shared)}")

    # Build test sets from prompts not shared across languages
    for lang in langs:
        test_split = random.sample(
            list(filter(lambda x: x not in shared_prompts,
                        data[lang].keys())), 500)
        print(f"Writing files for {lang}", file=sys.stderr)

        def write_split(path, prompt_list):
            # pipe-delimited shared task format: a prompt line followed by
            # one "translation|weight" line per translation, then a blank line
            with open(path, "w") as out:
                for prompt in prompt_list:
                    print("|".join([prompt, prompts[lang][prompt]]), file=out)
                    for translation, value in data[lang][prompt].items():
                        print(translation, value, sep="|", file=out)
                    print(file=out)

        write_split(f"{output_dir}/en_{lang}_split.test", test_split)
        # further splitting into 3 sub test splits, of sizes 100, 100, 300
        write_split(f"{output_dir}/en_{lang}_split.test0", test_split[:100])
        write_split(f"{output_dir}/en_{lang}_split.test1", test_split[100:200])
        write_split(f"{output_dir}/en_{lang}_split.test2", test_split[200:])
        write_split(f"{output_dir}/en_{lang}_split.train",
                    [p for p in data[lang].keys() if p not in test_split])
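# A minimal usage sketch (the language list appears elsewhere in this repo;
# the directory locations and seed are assumptions):
import random

random.seed(14)  # hypothetical seed, for reproducible splits
split_data("hu ja ko pt vi",
           duo_data_dir="staple-2020-train",
           output_dir="splits")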
from collections import defaultdict

langs = "hu ja ko pt vi".split()
data = defaultdict(dict)
all_prompts = {}
shared_prompts = {}
prompts = defaultdict(dict)

# data_path="/Users/hudakhayrallah/Downloads/staple-2020-train/"
data_path = ""

# Build list of prompts shared across languages
for lang in langs:
    trainset = read_transfile(
        open(f"{data_path}en_{lang}/train.gold").readlines(),
        weighted=True,
        strip_punc=False)
    lang_prompts = read_trans_prompts(
        open(f"{data_path}en_{lang}/train.gold").readlines())
    for key, prompt in lang_prompts:
        prompts[lang][key] = prompt
    for prompt, translations in trainset.items():
        data[lang][prompt] = translations
        if prompt in all_prompts:
            shared_prompts[prompt] = 1
        all_prompts[prompt] = 1

for lang in langs:
    not_shared = list(
        filter(lambda x: x not in shared_prompts, data[lang].keys()))
    print(f"{lang}: {len(data[lang].keys())} not shared: {len(not_shared)}")
import argparse

from utils import read_transfile

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--goldfile", help="gold file", required=True)
    parser.add_argument("--predfile1", help="pred file 1", required=True)
    parser.add_argument("--predfile2", help="pred file 2", required=True)
    args = parser.parse_args()

    with open(args.goldfile) as f:
        print("reading gold")
        gold = read_transfile(f.readlines(), weighted=True)

    with open(args.predfile1) as f:
        print("reading pred1")
        pred1 = read_transfile(f.readlines())

    with open(args.predfile2) as f:
        print("reading pred2")
        pred2 = read_transfile(f.readlines())

    # Choose a prompt
    keys = list(gold.keys())
    key = keys[0]

    # Get the translations for that prompt
    gold_set = set(gold[key].keys())
    pred1_keys = set(pred1[key].keys())
    pred2_keys = set(pred2[key].keys())

    print(f'Pred1 {len(gold_set.intersection(pred1_keys))} / {len(pred1_keys)} correct')
    print(f'Pred2 {len(gold_set.intersection(pred2_keys))} / {len(pred2_keys)} correct')
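# A possible extension (a sketch, not part of the original script): average
# the per-prompt overlap with the gold set across all prompts instead of
# inspecting only the first one.
def avg_overlap(gold, pred):
    fracs = []
    for key, gold_translations in gold.items():
        gold_set = set(gold_translations.keys())
        pred_keys = set(pred[key].keys())
        if pred_keys:
            fracs.append(len(gold_set & pred_keys) / len(pred_keys))
    return sum(fracs) / len(fracs)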