Ejemplo n.º 1
0
def test_decode():
    """Round-trip random text through the ``yttm encode`` / ``yttm decode`` CLI.

    A random line is written to a scratch file, encoded to ids, decoded
    back to text, and the first decoded line (minus its last character) is
    compared with the original input.
    """
    generate_artifacts()
    # split() + " ".join() collapses runs of spaces and strips both ends.
    text_in = " ".join("".join([random.choice("abcd ") for _ in range(50)]).split())

    with open("decode_text_in.txt", "w") as fout:
        fout.write(text_in)

    cmd_args = ["yttm", "encode", f"--model={BASE_MODEL_FILE}", "--output_type=id"]
    # Context managers close the redirected handles as soon as the child
    # exits instead of leaving them to the garbage collector.
    with open("decode_text_in.txt", "r") as fin, open("decode_id.txt", "w") as fout:
        run(cmd_args, stdin=fin, stdout=fout, check=True)

    cmd_args = ["yttm", "decode", f"--model={BASE_MODEL_FILE}"]
    with open("decode_id.txt", "r") as fin, open("decode_text_out.txt", "w") as fout:
        run(cmd_args, stdin=fin, stdout=fout, check=True)

    with open("decode_text_out.txt", "r") as fin:
        text_out = fin.readline()

    # Scratch files are removed before asserting so a failed comparison
    # does not leave them behind.
    os.remove("decode_text_in.txt")
    os.remove("decode_text_out.txt")
    os.remove("decode_id.txt")

    # The slice drops the final character of the line (presumably the
    # trailing newline written by the decoder).
    assert text_in == text_out[:-1]
Ejemplo n.º 2
0
def test_encode_decode():
    """Retrain the base model with explicit BOS/EOS ids and check round-trips.

    Verifies that decoding plain ids, and decoding BOS/EOS-wrapped ids with
    those two ids ignored, both reproduce the (newline-prefixed) input.
    """
    generate_artifacts()
    os.remove(BASE_MODEL_FILE)

    yttm.BPE.train(
        data=TRAIN_FILE,
        vocab_size=16000,
        model=BASE_MODEL_FILE,
        bos_id=BOS_ID,
        eos_id=EOS_ID,
    )
    tokenizer = yttm.BPE(BASE_MODEL_FILE)

    # Random characters with whitespace runs collapsed to single spaces.
    raw_chars = "".join([random.choice("abcd ") for _ in range(50)])
    sentences = [" ".join(raw_chars.split())]

    plain_ids = tokenizer.encode(sentences, yttm.OutputType.ID)
    # It is necessary to add first empty line, since everything in BPE starts from a new line
    sentences[0] = "\n" + sentences[0]
    assert sentences == tokenizer.decode(plain_ids)

    wrapped_ids = tokenizer.encode(sentences, yttm.OutputType.ID, bos=True, eos=True)
    assert sentences == tokenizer.decode(wrapped_ids, ignore_ids=[BOS_ID, EOS_ID])

    decoded_plain = tokenizer.decode(plain_ids, ignore_ids=[])
    decoded_wrapped = tokenizer.decode(wrapped_ids, ignore_ids=[BOS_ID, EOS_ID])
    assert decoded_plain == decoded_wrapped
Ejemplo n.º 3
0
def test_renaming():
    """Check the leading output of ``yttm encode`` for the renamed-id model.

    Runs the encoder twice over ``TEST_FILE`` (once with ``--bos``, once
    with ``--eos --reverse``) and passes the captured output to
    ``file_starts_with`` together with the expected leading text.
    """
    generate_artifacts()
    cmd_args = [
        "yttm",
        "encode",
        f"--model={RENAME_ID_MODEL_FILE}",
        "--output_type=id",
        "--bos",
        "--n_threads=1",
    ]
    # Context managers make sure the redirected handles are closed once the
    # child process has finished, rather than leaking them.
    with open(TEST_FILE, "r") as fin, open("log.txt", "w") as fout:
        run(cmd_args, stdin=fin, stdout=fout, check=True)
    file_starts_with("log.txt", "29")

    cmd_args = [
        "yttm",
        "encode",
        f"--model={RENAME_ID_MODEL_FILE}",
        "--output_type=id",
        "--eos",
        "--reverse",
        "--n_threads=1",
    ]
    with open(TEST_FILE, "r") as fin, open("log.txt", "w") as fout:
        run(cmd_args, stdin=fin, stdout=fout, check=True)
    file_starts_with("log.txt", "1148")
    os.remove("log.txt")
Ejemplo n.º 4
0
def test_bos_eos_reverse():
    """Check the leading output of ``yttm encode`` with BOS / reversed-EOS flags.

    Each scenario encodes ``TEST_FILE`` with the base model on a single
    thread, captures stdout in ``log.txt`` and hands the file plus the
    expected leading text to ``file_starts_with``.
    """
    generate_artifacts()

    # (output type, extra CLI flags, expected leading text of the output)
    scenarios = [
        ("subword", ["--bos"], "<BOS>"),
        ("subword", ["--reverse", "--eos"], "<EOS>"),
        ("id", ["--bos"], "2"),
        ("id", ["--reverse", "--eos"], "3"),
    ]

    for output_type, extra_flags, expected_prefix in scenarios:
        cmd_args = [
            "yttm",
            "encode",
            f"--model={BASE_MODEL_FILE}",
            f"--output_type={output_type}",
            "--n_threads=1",
            *extra_flags,
        ]
        # Close both redirected handles as soon as the child exits so no
        # file descriptors are leaked between scenarios.
        with open(TEST_FILE, "r") as fin, open("log.txt", "w") as fout:
            run(cmd_args, stdin=fin, stdout=fout, check=True)
        file_starts_with("log.txt", expected_prefix)

    os.remove("log.txt")
Ejemplo n.º 5
0
def test_encode_decode():
    """Retrain the base model and check that decode(encode(text)) == text."""
    generate_artifacts()
    os.remove(BASE_MODEL_FILE)
    yttm.BPE.train(
        data=TRAIN_FILE,
        vocab_size=16000,
        model=BASE_MODEL_FILE,
    )
    tokenizer = yttm.BPE(BASE_MODEL_FILE)

    # Random characters with whitespace runs collapsed to single spaces.
    raw_chars = "".join([random.choice("abcd ") for _ in range(50)])
    sentences = [" ".join(raw_chars.split())]

    encoded = tokenizer.encode(sentences, yttm.OutputType.ID)
    assert sentences == tokenizer.decode(encoded)
Ejemplo n.º 6
0
def test_multithreading():
    """Run ``yttm encode`` over ``TEST_FILE`` with ``--n_threads=10``.

    The test only requires the command to exit successfully
    (``check=True``); the output written to ``log.txt`` is not inspected.
    """
    generate_artifacts()
    cmd_args = [
        "yttm",
        "encode",
        f"--model={BASE_MODEL_FILE}",
        "--output_type=subword",
        "--n_threads=10",
    ]
    # Context managers close the redirected handles once the child exits
    # instead of leaking them.
    with open(TEST_FILE, "r") as fin, open("log.txt", "w") as fout:
        run(cmd_args, stdin=fin, stdout=fout, check=True)
Ejemplo n.º 7
0
def test_interactive_mode():
    """Pipe ``interactor.py`` into ``yttm encode --stream`` for both output types.

    Each shell pipeline must exit with status 0; ``log.txt`` is removed
    after both pipelines have succeeded.
    """
    generate_artifacts()

    id_cmd = f"python interactor.py | yttm encode --stream --model={BASE_MODEL_FILE} --output_type=id > log.txt"
    subword_cmd = f"python interactor.py | yttm encode --stream --model={BASE_MODEL_FILE} --output_type=subword > log.txt"
    pipelines = (
        ("interactive helper running id ...", id_cmd),
        ("interactive helper running subword ...", subword_cmd),
    )

    for banner, shell_cmd in pipelines:
        print(banner)
        exit_status = os.system(shell_cmd)
        assert exit_status == 0

    os.remove("log.txt")
Ejemplo n.º 8
0
def test_vocabulary_consistency():
    """Retrain the base model and check its vocabulary accessors agree.

    The reported vocabulary size must match the length of the vocabulary,
    the vocabulary must contain no duplicates, and the subword <-> id
    mappings must be inverse to each other at every position.
    """
    generate_artifacts()
    os.remove(BASE_MODEL_FILE)
    yttm.BPE.train(
        data=TRAIN_FILE,
        vocab_size=16000,
        model=BASE_MODEL_FILE,
    )
    tokenizer = yttm.BPE(BASE_MODEL_FILE)

    # Size matches both the raw list length and the de-duplicated length.
    assert tokenizer.vocab_size() == len(tokenizer.vocab())
    assert tokenizer.vocab_size() == len(set(tokenizer.vocab()))

    for token_id, token in enumerate(tokenizer.vocab()):
        assert token_id == tokenizer.subword_to_id(token)
        assert token == tokenizer.id_to_subword(token_id)
Ejemplo n.º 9
0
def test_encode_decode():
    """Retrain the base model with explicit BOS/EOS ids and check round-trips.

    Verifies that decoding plain ids, and decoding BOS/EOS-wrapped ids with
    those two ids ignored, both reproduce the input text.
    """
    generate_artifacts()
    os.remove(BASE_MODEL_FILE)

    yttm.BPE.train(
        data=TRAIN_FILE,
        vocab_size=16000,
        model=BASE_MODEL_FILE,
        bos_id=BOS_ID,
        eos_id=EOS_ID,
    )
    tokenizer = yttm.BPE(BASE_MODEL_FILE)

    # Random characters with whitespace runs collapsed to single spaces.
    raw_chars = "".join([random.choice("abcd ") for _ in range(50)])
    sentences = [" ".join(raw_chars.split())]

    plain_ids = tokenizer.encode(sentences, yttm.OutputType.ID)
    assert sentences == tokenizer.decode(plain_ids)

    wrapped_ids = tokenizer.encode(sentences, yttm.OutputType.ID, bos=True, eos=True)
    assert sentences == tokenizer.decode(wrapped_ids, ignore_ids=[BOS_ID, EOS_ID])

    decoded_plain = tokenizer.decode(plain_ids, ignore_ids=[])
    decoded_wrapped = tokenizer.decode(wrapped_ids, ignore_ids=[BOS_ID, EOS_ID])
    assert decoded_plain == decoded_wrapped
Ejemplo n.º 10
0
def test_renaming_unknown():
    """Encode the single character ``z`` with the renamed-id model.

    The input is written to a scratch file, encoded with ``--reverse`` on
    one thread, and the captured output is handed to ``file_starts_with``
    with the expected leading text ``"2922"`` (presumably the id assigned
    to the unknown token in that model; confirm against the fixture).
    """
    generate_artifacts()
    with open("local_test.txt", "w") as fout:
        fout.write("z")

    cmd_args = [
        "yttm",
        "encode",
        f"--model={RENAME_ID_MODEL_FILE}",
        "--output_type=id",
        "--reverse",
        "--n_threads=1",
    ]
    # Context managers close the redirected handles once the child exits
    # instead of leaking them.
    with open("local_test.txt", "r") as fin, open("log.txt", "w") as fout:
        run(cmd_args, stdin=fin, stdout=fout, check=True)

    file_starts_with("log.txt", "2922")
    os.remove("local_test.txt")
    os.remove("log.txt")
Ejemplo n.º 11
0
def test_vocab():
    """Run ``yttm vocab`` on the base model, plain and with ``--verbose``.

    Both invocations only need to exit successfully (``check=True``).
    """
    generate_artifacts()
    for extra_flags in ([], ["--verbose"]):
        cmd_args = ["yttm", "vocab", f"--model={BASE_MODEL_FILE}"]
        cmd_args.extend(extra_flags)
        run(cmd_args, check=True)
0
def test_decode():
    """Round-trip random text through the ``yttm encode`` / ``yttm decode`` CLI.

    Two round-trips are checked: a plain one, and one where the text is
    encoded with ``--bos --eos`` and decoded with those ids ignored. In
    both cases the second line of the decoder output (minus its last
    character) must equal the original text. Scratch files are removed
    even when a step fails.
    """
    generate_artifacts()
    # split() + " ".join() collapses runs of spaces and strips both ends.
    text_in = " ".join("".join([random.choice("abcd ")
                                for _ in range(50)]).split())

    scratch_files = ("decode_text_in.txt", "decode_text_out.txt", "decode_id.txt")

    def pipe(cmd_args, src, dst):
        """Run *cmd_args* with stdin read from *src* and stdout written to *dst*."""
        # Both handles are closed as soon as the child process exits.
        with open(src, "r") as fin, open(dst, "w") as fout:
            run(cmd_args, stdin=fin, stdout=fout, check=True)

    def second_line(path):
        """Return the second line of the file at *path*."""
        with open(path, "r") as fin:
            # It is necessary to skip the first line, since everything in BPE starts from a new line
            fin.readline()
            return fin.readline()

    try:
        with open("decode_text_in.txt", "w") as fout:
            fout.write(text_in)

        pipe(
            ["yttm", "encode", f"--model={BASE_MODEL_FILE}", "--output_type=id"],
            "decode_text_in.txt",
            "decode_id.txt",
        )
        pipe(
            ["yttm", "decode", f"--model={BASE_MODEL_FILE}"],
            "decode_id.txt",
            "decode_text_out.txt",
        )

        text_out = second_line("decode_text_out.txt")
        # The slice drops the final character of the line (presumably the
        # trailing newline written by the decoder).
        assert text_in == text_out[:-1]

        pipe(
            [
                "yttm",
                "encode",
                f"--model={BASE_MODEL_FILE}",
                "--output_type=id",
                "--bos",
                "--eos",
            ],
            "decode_text_in.txt",
            "decode_id.txt",
        )
        pipe(
            [
                "yttm",
                "decode",
                f"--model={BASE_MODEL_FILE}",
                f"--ignore_ids={BOS_ID},{EOS_ID}",
            ],
            "decode_id.txt",
            "decode_text_out.txt",
        )

        text_out = second_line("decode_text_out.txt")
        assert text_in == text_out[:-1]
    finally:
        # Clean up whatever was created, without masking the original
        # failure if a step died before a file existed.
        for path in scratch_files:
            if os.path.exists(path):
                os.remove(path)