Ejemplo n.º 1
0
def _encode_candidates(prompts):
    """Turn a ``{prompt_id: {translation: ...}}`` mapping into candidate records.

    Every translation is tokenized with the module-level ``sp`` and stored as
    a dict with keys ``"id"``, ``"tokens"`` and ``"score"``. Scores start at 0
    and ids follow the iteration order of the translations.
    """
    # The original code sorted the candidates by score before numbering them.
    # All scores are 0 at this point and Python's sort is stable, so the sort
    # never changed the order and is omitted here.
    return {
        key: [{
            "id": i,
            "tokens": sp.EncodeAsPieces(text),
            "score": 0
        } for (i, text) in enumerate(translations.keys())]
        for (key, translations) in prompts.items()
    }


def preprocess(split):
    """Preprocess data for the filtering model.

    Reads the candidate file ``all_cands_<split>.txt`` and the gold file
    ``<split>_split.gold.txt``, tokenizes every translation with ``sp``, and
    marks a candidate with score 1 when its token sequence also appears among
    the gold translations of the same prompt (all other candidates keep
    score 0). The prompt text is tokenized with ``en_sp``.

    The result, a list of ``{"src": <prompt tokens>, "cand": <candidates>}``
    entries (one per gold prompt), is written as JSON to
    ``./data/<split>.ja.rank.aug.json``.

    Args:
        split: Name of the data split; interpolated into the file names above.

    Raises:
        KeyError: If a gold prompt id has no entry in the candidate file.
    """
    # Read the candidate file once and hand the same lines to both parsers.
    with open("all_cands_%s.txt" % split) as f:
        cand_lines = f.readlines()
    cand = _encode_candidates(read_transfile(cand_lines, weighted=False))
    src_lines = {a: b for (a, b) in read_trans_prompts(cand_lines)}

    with open("%s_split.gold.txt" % split) as f:
        gold = _encode_candidates(read_transfile(f.readlines(), weighted=True))
    print(len(gold))

    data = []
    for (k, v) in gold.items():
        x = cand[k]
        # Compare on the joined token string so membership is a set lookup.
        real = {" ".join(y["tokens"]) for y in v}
        for item in x:
            if " ".join(item["tokens"]) in real:
                item["score"] = 1
        src_tokens = en_sp.EncodeAsPieces(src_lines[k])
        data.append({"src": src_tokens, "cand": x})
    with open("./data/%s.ja.rank.aug.json" % split, "w") as f:
        json.dump(data, f)
def get_data(fname: str, srcfname: str, tgtfname: str, prefix: str) -> None:
    """
    Convert a shared-task file into parallel one-sentence-per-line files.

    The prompt text goes to ``srcfname`` and the translation to ``tgtfname``,
    line by line. When ``prefix`` is ``"train"`` every accepted translation of
    a prompt is emitted (the prompt is repeated for each one); for any other
    prefix only the highest-weighted translation is emitted.
    """
    with open(fname) as infile:
        raw_lines = infile.readlines()

    weighted_by_id = read_transfile(raw_lines, strip_punc=False, weighted=True)
    text_by_id = dict(read_trans_prompts(raw_lines))
    emit_all = prefix == "train"

    with open(srcfname, "w") as src_out, open(tgtfname, "w") as tgt_out:
        for prompt_id in weighted_by_id.keys():
            source_text = text_by_id[prompt_id]

            # Highest weight first, so index 0 is the most popular translation.
            ranked = sorted(
                weighted_by_id[prompt_id].items(),
                key=lambda pair: pair[1],
                reverse=True,
            )

            if emit_all:
                targets = [pair[0] for pair in ranked]
            else:
                # Evaluation uses only the top-ranked translation.
                targets = [ranked[0][0]]

            for target_text in targets:
                print(source_text, file=src_out)
                print(target_text, file=tgt_out)
def main():
    """Print a discrepancy score and a BLEU score for every gold prompt.

    Reads predictions from ``sample.txt`` and weighted references from
    ``reference.txt``. For each gold prompt the highest-weighted reference is
    passed together with the predicted translations to ``range_BLEU``; the
    occurrence counts of the translations returned by ``lee_transfile`` are
    passed to ``discrepancy_score``. Both scores are rounded to 4 digits and
    printed as ``<discrepancy>\\t<BLEU>``, one line per gold prompt. The
    current time is printed before and after the computation.

    Raises:
        KeyError: If a gold prompt id is missing from the predictions.
    """
    now = datetime.now().time()
    print(now)

    # Read each input once and close it right away; the same sample lines
    # feed both parsers below.
    with open('sample.txt') as sample_file:
        sample_lines = sample_file.readlines()
    PRED = read_transfile(sample_lines)
    with open('reference.txt') as reference_file:
        GOLD = read_transfile(reference_file.readlines(), weighted=True)

    avg_BLEUs = {}
    for K in GOLD.keys():
        # Score against the single highest-weighted reference translation.
        ref = max(GOLD[K].items(), key=operator.itemgetter(1))[0]
        trans_set = PRED[K]
        avg_BLEUs[K] = round(
            range_BLEU(trans_set, ref, 'subword.trg.model'), ndigits=4)

    pred = lee_transfile(sample_lines)

    dp_score_list = {}
    for k, d in pred.items():
        # Count how often each translation occurs for this prompt.
        trans_dict = {}
        for item in d:
            trans_dict[item] = trans_dict.get(item, 0) + 1
        eprint(len(trans_dict))
        dp_score_list[k] = round(
            discrepancy_score(trans_dict, 'subword.trg.model'), ndigits=4)

    now = datetime.now().time()
    print(now)
    for K in GOLD.keys():
        DP = dp_score_list[K]
        AB = avg_BLEUs[K]
        print(str(DP) + '\t' + str(AB))
def main():
    """Print the discrepancy score of every prompt in the prediction file.

    The file named by ``args.predfile`` is parsed with ``read_transfile``.
    For each prompt the list of predicted translations is printed and scored
    with ``discrepancy_score``; after all prompts are processed, the scores
    (rounded to 4 digits) are printed one per line.
    """
    with open(args.predfile) as pred_file:
        print("reading pred")
        predictions = read_transfile(pred_file.readlines())

    rounded_scores = []
    for translations in predictions.values():
        candidates = [*translations.keys()]
        print(candidates)
        raw_score = discrepancy_score(candidates)
        rounded_scores.append(round(raw_score, ndigits=4))

    for value in rounded_scores:
        print(value)
Ejemplo n.º 5
0
def _write_prompt_blocks(path, prompt_ids, prompt_text, translations):
    """Write one block per prompt id to *path*.

    A block is an ``id|text`` header line, one ``translation|weight`` line per
    translation of that prompt, and a terminating blank line.
    """
    with open(path, "w") as out:
        for prompt in prompt_ids:
            print("|".join([prompt, prompt_text[prompt]]), file=out)
            for translation, value in translations[prompt].items():
                print(translation, value, sep="|", file=out)
            print(file=out)


def split_data(langs: str, duo_data_dir: str, output_dir: str):
    """Split each language's gold training file into train and test files.

    For every language code in ``langs`` the file
    ``<duo_data_dir>/en_<lang>/train.en_<lang>.2020-01-13.gold.txt`` is read.
    A prompt id counts as "shared" when it occurs in more than one language.
    Per language, 500 prompts are sampled at random from the prompts that are
    NOT shared and written to ``<output_dir>/en_<lang>_split.test``; the same
    sample is also written in three parts (``.test0``: first 100, ``.test1``:
    next 100, ``.test2``: the rest). Every other prompt of the language goes
    to ``<output_dir>/en_<lang>_split.train``.

    Args:
        langs: Whitespace-separated language codes.
        duo_data_dir: Directory containing the ``en_<lang>`` data folders.
        output_dir: Directory the split files are written to.

    Raises:
        ValueError: From ``random.sample`` when a language has fewer than 500
            unshared prompts.
    """
    lang_list = langs.split()

    data = defaultdict(dict)
    all_prompts = set()
    shared_prompts = set()
    prompts = defaultdict(dict)

    # Build list of prompts shared across languages
    for lang in lang_list:
        gold_path = (
            f"{duo_data_dir}/en_{lang}/train.en_{lang}.2020-01-13.gold.txt")
        # Read the file once and close it; both parsers get the same lines.
        with open(gold_path) as gold_file:
            gold_lines = gold_file.readlines()
        trainset = read_transfile(gold_lines,
                                  weighted=True,
                                  strip_punc=False)
        for key, prompt in read_trans_prompts(gold_lines):
            prompts[lang][key] = prompt

        for prompt, translations in trainset.items():
            data[lang][prompt] = translations
            if prompt in all_prompts:
                shared_prompts.add(prompt)
            all_prompts.add(prompt)

    not_shared_by_lang = {}
    for lang in lang_list:
        not_shared = [p for p in data[lang].keys() if p not in shared_prompts]
        not_shared_by_lang[lang] = not_shared
        print(
            f"{lang}: {len(data[lang].keys())} not shared: {len(not_shared)}")

    # Build test sets from the prompts that are not shared across languages
    for lang in lang_list:
        test_split = random.sample(not_shared_by_lang[lang], 500)

        print(f"Writing files for {lang}", file=sys.stderr)
        _write_prompt_blocks(f"{output_dir}/en_{lang}_split.test", test_split,
                             prompts[lang], data[lang])

        # further splitting in to 3 sub test splits. of sizes 100, 100, 300
        sub_splits = (test_split[:100], test_split[100:200], test_split[200:])
        for index, sub_split in enumerate(sub_splits):
            _write_prompt_blocks(f"{output_dir}/en_{lang}_split.test{index}",
                                 sub_split, prompts[lang], data[lang])

        # Everything not held out for testing is training data. The original
        # loop printed a stale ``value`` left over from the test loops here;
        # going through the shared writer emits each translation's own weight.
        held_out = set(test_split)
        train_split = [p for p in data[lang].keys() if p not in held_out]
        _write_prompt_blocks(f"{output_dir}/en_{lang}_split.train",
                             train_split, prompts[lang], data[lang])
from collections import defaultdict

# Language codes whose training data is scanned below.
langs = "hu ja ko pt vi".split()

# data[lang][prompt_id]    -> value returned by read_transfile for that prompt
# prompts[lang][prompt_id] -> prompt text from read_trans_prompts
# all_prompts              -> every prompt id seen so far (used as a set)
# shared_prompts           -> prompt ids seen in more than one language
data = defaultdict(dict)
all_prompts = {}
shared_prompts = {}
prompts = defaultdict(dict)

#data_path="/Users/hudakhayrallah/Downloads/staple-2020-train/"
data_path = ""

# Build list of prompts shared across languages
for lang in langs:
    # Read the training file once and close it; both parsers get the same
    # lines instead of each opening the file without closing it.
    with open(f"{data_path}en_{lang}/train.gold") as train_file:
        train_lines = train_file.readlines()
    trainset = read_transfile(
        train_lines,
        weighted=True,
        strip_punc=False)
    lang_prompts = read_trans_prompts(train_lines)
    for key, prompt in lang_prompts:
        prompts[lang][key] = prompt

    for prompt, translations in trainset.items():
        data[lang][prompt] = translations
        # A second sighting of a prompt id marks it as shared.
        if prompt in all_prompts:
            shared_prompts[prompt] = 1
        all_prompts[prompt] = 1

for lang in langs:
    not_shared = list(
Ejemplo n.º 7
0
import argparse
from utils import read_transfile

# Script entry point: compare two prediction files against a gold file for a
# single prompt.
if __name__ == "__main__":
    # All three file paths are required command-line options.
    parser = argparse.ArgumentParser()
    parser.add_argument("--goldfile", help="gold file", required=True)
    parser.add_argument("--predfile1", help="pred file 1", required=True)
    parser.add_argument("--predfile2", help="pred file 2", required=True)
    args = parser.parse_args()

    # The gold file is parsed with weighted=True; the prediction files use
    # read_transfile's default arguments.
    with open(args.goldfile) as f:
        print("reading gold")
        gold = read_transfile(f.readlines(), weighted=True)

    with open(args.predfile1) as f:
        print("reading pred1")
        pred1 = read_transfile(f.readlines())

    with open(args.predfile2) as f:
        print("reading pred2")
        pred2 = read_transfile(f.readlines())

    # Choose a prompt: the first key of the gold data.
    keys = list(gold.keys())
    key = keys[0]

    # Get the translations for that prompt
    gold_set = set(gold[key].keys())
    pred1_keys = set(pred1[key].keys())
    # NOTE(review): pred2_keys is not used in the visible part of this script;
    # the file may continue past this point -- confirm.
    pred2_keys = set(pred2[key].keys())
    # Report how many of pred1's translations for this prompt are also in gold.
    print(f'Pred1 {len(gold_set.intersection(pred1_keys))} / {len(pred1_keys)} correct')