import pytest

from spacy.lang.ja import Japanese
from spacy.util import make_tempdir


def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
    nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
    nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})

    # The default tokenizer tokenizes like split mode A.
    assert len(ja_tokenizer(text)) == len_a
    assert len(nlp_a(text)) == len_a
    assert len(nlp_b(text)) == len_b
    assert len(nlp_c(text)) == len_c
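

# Added sketch, not from the original suite: SudachiPy's documented example
# 選挙管理委員会 splits as 選挙/管理/委員/会 (A), 選挙/管理/委員会 (B), and
# 選挙管理委員会 (C). Exact token counts depend on the installed Sudachi
# dictionary, so only the mode-C behavior (the compound stays a single token)
# is asserted here.
def test_ja_tokenizer_split_mode_c_keeps_compound():
    nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
    assert len(nlp_c("選挙管理委員会")) == 1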


def test_ja_tokenizer_sub_tokens(
    ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
):
    nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
    nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})

    assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
    assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
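

# The tests above rely on a `ja_tokenizer` fixture (and on parametrized
# `text`/expected-value arguments) normally supplied by conftest.py. A minimal
# sketch of such a fixture, assuming only the spaCy/SudachiPy APIs already used
# above; the real conftest definition may differ:
@pytest.fixture(scope="session")
def ja_tokenizer():
    pytest.importorskip("sudachipy")  # skip the Japanese tests without SudachiPy
    return Japanese().tokenizer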


def test_ja_tokenizer_serialize(ja_tokenizer):
    # Round-trip the tokenizer via bytes
    tokenizer_bytes = ja_tokenizer.to_bytes()
    nlp = Japanese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
    assert nlp.tokenizer.split_mode is None

    # ... and via disk
    with make_tempdir() as d:
        file_path = d / "tokenizer"
        ja_tokenizer.to_disk(file_path)
        nlp = Japanese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
        assert nlp.tokenizer.split_mode is None

    # split mode is (de)serialized correctly
    nlp = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    nlp_r = Japanese()
    nlp_bytes = nlp.to_bytes()
    nlp_r.from_bytes(nlp_bytes)
    assert nlp_bytes == nlp_r.to_bytes()
    assert nlp_r.tokenizer.split_mode == "B"

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp_r = Japanese()
        nlp_r.from_disk(d)
        assert nlp_bytes == nlp_r.to_bytes()
        assert nlp_r.tokenizer.split_mode == "B"
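

# Added sketch, assuming the spaCy v3 config API already used above: the split
# mode set via from_config should also survive a round-trip through nlp.config,
# mirroring the bytes/disk round-trips in test_ja_tokenizer_serialize.
def test_ja_tokenizer_split_mode_config_roundtrip():
    nlp = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    assert nlp.tokenizer.split_mode == "B"
    nlp_r = Japanese.from_config(nlp.config)
    assert nlp_r.tokenizer.split_mode == "B"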