Example #1
0
def test_subword_regularization():
    """Compare default and ``training=False`` tokenization on two tokenizers.

    The random seed is fixed to 42 first. A SentencePiece-backed tokenizer
    (built with ``sp_nbest_size`` and ``sp_alpha``) and a BPE-backed tokenizer
    (built with ``bpe_dropout``) are each called twice on the same word: once
    with default arguments and once with ``training=False``. Each result is
    checked against a hard-coded token list.

    NOTE(review): the expected lists for the default calls presumably depend
    on the seed and on the order of calls below -- keep the order unchanged.
    """
    pyonmttok.set_random_seed(42)

    # SentencePiece model with n-best sampling parameters.
    sp_model = os.path.join(_DATA_DIR, "sp-models", "wmtende.model")
    sp_tokenizer = pyonmttok.Tokenizer(
        "none", sp_model_path=sp_model, sp_nbest_size=10, sp_alpha=0.1
    )

    sp_default_tokens = sp_tokenizer.tokenize("appealing")[0]
    assert sp_default_tokens == ["▁app", "e", "al", "ing"]

    sp_inference_tokens = sp_tokenizer.tokenize("appealing", training=False)[0]
    assert sp_inference_tokens == ["▁appealing"]

    # BPE model with dropout.
    bpe_model = os.path.join(_DATA_DIR, "bpe-models", "testcode.v0.1")
    bpe_tokenizer = pyonmttok.Tokenizer(
        "conservative", bpe_model_path=bpe_model, bpe_dropout=0.3
    )

    bpe_default_tokens = bpe_tokenizer.tokenize("improvement")[0]
    assert bpe_default_tokens == ["i", "m", "pr", "ove", "m", "e", "n", "t"]

    bpe_inference_tokens = bpe_tokenizer.tokenize("improvement", training=False)[0]
    assert bpe_inference_tokens == ["impr", "ovemen", "t"]
Example #2
0
 def _set_seed(self, seed):
     """Forward ``seed`` to ``pyonmttok.set_random_seed``.

     The import is done inside the method, so ``pyonmttok`` is only
     required when this method is actually called.
     """
     from pyonmttok import set_random_seed

     set_random_seed(seed)