def test_tokenizer_convert_url(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    # A pure-ASCII URL is kept as a single token.
    words = tokenizer.texts_to_words("http://192.168.1.10/index.html")
    self.assertEqual(["http://192.168.1.10/index.html"], words)
    self.assertEqual("http://192.168.1.10/index.html",
                     tokenizer.words_to_texts(words))

    words_en = tokenizer._texts_to_words_en(
        "http://192.168.1.10/index.html")
    self.assertEqual(["http://192.168.1.10/index.html"], words_en)
    self.assertEqual("http://192.168.1.10/index.html",
                     tokenizer.words_to_texts(words_en))

    # The Japanese tokenizer splits the URL into morphemes; joining them
    # back re-inserts a space between adjacent ASCII tokens.
    words_jp = tokenizer._texts_to_words_jp(
        "http://192.168.1.10/index.html")
    self.assertEqual([
        "http", "://", "192", ".", "168", ".", "1", ".", "10", "/",
        "index", ".", "html"
    ], words_jp)
    self.assertEqual("http :// 192 . 168 . 1 . 10 / index . html",
                     tokenizer.words_to_texts(words_jp))

    # Mixed Japanese/ASCII input: the URL survives as one token.
    words_mix = tokenizer.texts_to_words(
        "URLはhttp://192.168.1.10/index.html")
    self.assertEqual(["URL", "は", "http://192.168.1.10/index.html"],
                     words_mix)
    self.assertEqual("URLはhttp://192.168.1.10/index.html",
                     tokenizer.words_to_texts(words_mix))
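def test_concatenation_rule_character_class(self):
    # Hypothetical sanity check added for illustration (not part of the
    # original suite): the class [ -~] in the rule strings above is the
    # printable ASCII range (U+0020 space through U+007E tilde), so the
    # rules match English words and symbols but not Japanese text.
    import re
    self.assertTrue(re.fullmatch(r'.*[ -~]', 'URL'))
    self.assertIsNone(re.fullmatch(r'.*[ -~]', 'こんにちは'))
    self.assertTrue(re.fullmatch(r'[ -~].*', 'http://192.168.1.10'))
    self.assertIsNone(re.fullmatch(r'[ -~].*', 'は'))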
def test_tokenizer_words_to_texts_en_with_symbol(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    self.assertEqual(
        "http :// 192.168.1.10 / index.html",
        tokenizer.words_to_texts(
            ["http", "://", "192.168.1.10", "/", "index.html"]))
    # Whitespace-only tokens collapse into a single separator.
    self.assertEqual(
        "Hello world",
        tokenizer.words_to_texts(["Hello", " ", " ", "world"]))
    self.assertEqual(
        "Hello . i don ' t know",
        tokenizer.words_to_texts(
            ["Hello", ".", "i", "don", "'", "t", "know"]))
    self.assertEqual(
        "Hello _ 1 friend_ 1",
        tokenizer.words_to_texts(["Hello", "_", "1", "friend_", "1"]))
    self.assertEqual(
        "Hello < my friend >",
        tokenizer.words_to_texts(["Hello", "<", "my", "friend", ">"]))
    self.assertEqual(
        'Hello " my friend "',
        tokenizer.words_to_texts(["Hello", '"', "my", "friend", '"']))
    self.assertEqual(
        "Hello ` my friend `",
        tokenizer.words_to_texts(["Hello", "`", "my", "friend", "`"]))
def test_tokenizer_words_to_texts_mix(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    self.assertEqual(
        "こんにちは10日はHappy dayですね",
        tokenizer.words_to_texts(
            ["こんにちは", "10", "日", "は", "Happy", "day", "です", "ね"]))
    self.assertEqual(
        "=こんにちは10日はHappy dayですね=",
        tokenizer.words_to_texts(
            ["=", "こんにちは", "10", "日", "は", "Happy", "day", "です", "ね",
             "="]))
    self.assertEqual(
        "pen lightはありますか",
        tokenizer.words_to_texts(["pen", "light", "は", "あり", "ます", "か"]))
def test_tokenizer_words_to_texts_jp(self):
    # Uses the tokenizer's default concatenation rules; no rules are
    # configured here.
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)

    # An empty token acts as an explicit separator.
    self.assertEqual(
        "こんにちは 良い天気ですね",
        tokenizer.words_to_texts(["こんにちは", "", "良い", "天気", "です", "ね"]))
    self.assertEqual(
        "こんにちは<良い天気>ですね",
        tokenizer.words_to_texts(
            ["こんにちは", "<", "良い", "天気", ">", "です", "ね"]))
    self.assertEqual(
        "<こんにちは良い天気ですね>",
        tokenizer.words_to_texts(
            ["<", "こんにちは", "良い", "天気", "です", "ね", ">"]))
def test_tokenizer_words_to_texts_with_quote(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    # Quotes tokenized separately are spaced like any other ASCII token;
    # quotes attached to their word are left as-is.
    self.assertEqual(
        'Hello " very good " World',
        tokenizer.words_to_texts(
            ["Hello", '"', "very", "good", '"', "World"]))
    self.assertEqual(
        'Hello "very good" World',
        tokenizer.words_to_texts(["Hello", '"very', 'good"', "World"]))
    self.assertEqual(
        'こんにちは"良い天気"ですね',
        tokenizer.words_to_texts(["こんにちは", '"', "良い天気", '"', "です", "ね"]))
    self.assertEqual(
        'こんにちは"良い天気"ですね',
        tokenizer.words_to_texts(["こんにちは", '"良い天気"', "です", "ね"]))
def test_tokenizer_words_to_texts_url(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    self.assertEqual(
        "http :// 192.168.1.10 / index.html",
        tokenizer.words_to_texts(
            ["http", "://", "192.168.1.10", "/", "index.html"]))
def test_tokenizer_words_to_texts_with_text_en_json_jp(self):
    JSON_CHILD_IN = '\uF010'
    JSON_CHILD_OUT = '\uF011'
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    words1 = ["Hello", JSON_CHILD_IN, "データ", JSON_CHILD_OUT, "you"]
    self.assertEqual("Hello\uF010データ\uF011you",
                     tokenizer.words_to_texts(words1))
    words2 = ["Hello", '"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"',
              "you"]
    self.assertEqual('Hello "\uF010データ\uF011" you',
                     tokenizer.words_to_texts(words2))
    words3 = ["Hello", JSON_CHILD_IN, "データ", JSON_CHILD_OUT]
    self.assertEqual('Hello\uF010データ\uF011',
                     tokenizer.words_to_texts(words3))
    words4 = ["Hello", '"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"']
    self.assertEqual('Hello "\uF010データ\uF011"',
                     tokenizer.words_to_texts(words4))
    words5 = [JSON_CHILD_IN, "データ", JSON_CHILD_OUT, "you"]
    self.assertEqual('\uF010データ\uF011you',
                     tokenizer.words_to_texts(words5))
    words6 = ['"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"', "you"]
    self.assertEqual('"\uF010データ\uF011" you',
                     tokenizer.words_to_texts(words6))
def test_tokenizer_words_to_texts_json_tag(self):
    JSON_CHILD_IN = '\uF010'
    JSON_CHILD_OUT = '\uF011'
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    # English words inside the markers keep their spaces; Japanese words
    # and Japanese/English boundaries concatenate tightly.
    words1 = [JSON_CHILD_IN, "json", "data", JSON_CHILD_OUT]
    self.assertEqual("\uF010json data\uF011",
                     tokenizer.words_to_texts(words1))
    words2 = [JSON_CHILD_IN, "データ", "設定", JSON_CHILD_OUT]
    self.assertEqual("\uF010データ設定\uF011",
                     tokenizer.words_to_texts(words2))
    words3 = [JSON_CHILD_IN, "json", "設定", JSON_CHILD_OUT]
    self.assertEqual("\uF010json設定\uF011",
                     tokenizer.words_to_texts(words3))
    words4 = [JSON_CHILD_IN, "データ", "json", JSON_CHILD_OUT]
    self.assertEqual("\uF010データjson\uF011",
                     tokenizer.words_to_texts(words4))
    words5 = [JSON_CHILD_IN, "json", "設定", "data", JSON_CHILD_OUT]
    self.assertEqual("\uF010json設定data\uF011",
                     tokenizer.words_to_texts(words5))
    words6 = [JSON_CHILD_IN, "データ", "json", "設定", JSON_CHILD_OUT]
    self.assertEqual("\uF010データjson設定\uF011",
                     tokenizer.words_to_texts(words6))
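def test_json_markers_are_outside_ascii_range(self):
    # Hypothetical illustration (not part of the original suite): the JSON
    # child markers U+F010/U+F011 are Private Use Area code points, so they
    # fall outside [ -~] and never satisfy either concatenation rule. That
    # is why the markers attach tightly to their neighbours in the tests
    # above instead of being surrounded by spaces.
    import re
    for marker in ('\uF010', '\uF011'):
        self.assertIsNone(re.fullmatch(r'.*[ -~]', marker))
        self.assertIsNone(re.fullmatch(r'[ -~].*', marker))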
def test_tokenizer_normal_texts(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    # texts_to_words: splitting.
    self.assertEqual([], tokenizer.texts_to_words(""))
    self.assertEqual(["Hello"], tokenizer.texts_to_words("Hello"))
    self.assertEqual(["Hello", "World"],
                     tokenizer.texts_to_words("Hello World"))
    self.assertEqual(["Hello", "World"],
                     tokenizer.texts_to_words(" Hello World "))
    self.assertEqual(["こんにちは"], tokenizer.texts_to_words("こんにちは"))
    self.assertEqual(["こんにちは", "良い", "天気", "です", "ね"],
                     tokenizer.texts_to_words("こんにちは良い天気ですね"))
    self.assertEqual(["こんにちは", "良い", "天気", "です", "ね"],
                     tokenizer.texts_to_words(" こんにちは 良い天気ですね "))

    # words_to_texts: joining.
    self.assertEqual("", tokenizer.words_to_texts([]))
    self.assertEqual("Hello", tokenizer.words_to_texts(["Hello"]))
    self.assertEqual("Hello World",
                     tokenizer.words_to_texts(["Hello", "World"]))
    self.assertEqual("Hello World",
                     tokenizer.words_to_texts(["Hello", "", "World"]))
    self.assertEqual("Hello World",
                     tokenizer.words_to_texts([" Hello ", " World "]))
    self.assertEqual("こんにちは", tokenizer.words_to_texts(["こんにちは"]))
    self.assertEqual(
        "こんにちは 良い天気ですね",
        tokenizer.words_to_texts(["こんにちは", "", "良い", "天気", "です", "ね"]))
    # Surrounding whitespace is stripped but still forces a separator.
    self.assertEqual(
        "こんにちは 良い 天気 です ね",
        tokenizer.words_to_texts(
            [" こんにちは ", " 良い ", " 天気 ", " です ", " ね "]))
def test_tokenizer_words_to_texts_with_json_jp(self):
    JSON_CHILD_IN = '\uF010'
    JSON_CHILD_OUT = '\uF011'
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    words0 = [JSON_CHILD_IN, "データ", JSON_CHILD_OUT]
    self.assertEqual("\uF010データ\uF011", tokenizer.words_to_texts(words0))
    words1 = ["こんにちは", JSON_CHILD_IN, "データ", JSON_CHILD_OUT, "です", "ね"]
    self.assertEqual("こんにちは\uF010データ\uF011ですね",
                     tokenizer.words_to_texts(words1))
    words2 = ["こんにちは", '"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"',
              "です", "ね"]
    self.assertEqual('こんにちは"\uF010データ\uF011"ですね',
                     tokenizer.words_to_texts(words2))
    words3 = ["こんにちは", JSON_CHILD_IN, "データ", JSON_CHILD_OUT]
    self.assertEqual('こんにちは\uF010データ\uF011',
                     tokenizer.words_to_texts(words3))
    words4 = ["こんにちは", '"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"']
    self.assertEqual('こんにちは"\uF010データ\uF011"',
                     tokenizer.words_to_texts(words4))
    words5 = [JSON_CHILD_IN, "データ", JSON_CHILD_OUT, "です", "ね"]
    self.assertEqual('\uF010データ\uF011ですね',
                     tokenizer.words_to_texts(words5))
    words6 = ['"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"', "です", "ね"]
    self.assertEqual('"\uF010データ\uF011"ですね',
                     tokenizer.words_to_texts(words6))
def test_tokenizer_words_to_texts_en(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    self.assertEqual("Hello World",
                     tokenizer.words_to_texts(["Hello", "World"]))
    self.assertEqual("Hello World",
                     tokenizer.words_to_texts(["Hello", "", "World"]))
    self.assertEqual("Hello 1 World",
                     tokenizer.words_to_texts(["Hello", "1", "World"]))
    self.assertEqual(
        "Hello < 1 > World",
        tokenizer.words_to_texts(["Hello", "<", "1", ">", "World"]))
    self.assertEqual("Hello1 1World",
                     tokenizer.words_to_texts(["Hello1", "1World"]))
    self.assertEqual(
        "= Hello1 World =",
        tokenizer.words_to_texts(["=", "Hello1", "World", "="]))
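# The expectations throughout this suite are consistent with the following
# reference sketch of the join rule (an assumption inferred from the
# observable behaviour, not the actual TokenizerJP implementation): insert
# a single space between two words only when the previous word matches the
# "before" rule and the next word matches the "after" rule, i.e. where
# printable ASCII meets printable ASCII; empty tokens and surrounding
# whitespace force a separator.

def _reference_join(words, before=r'.*[ -~]', after=r'[ -~].*'):
    # Hypothetical helper for illustration only.
    import re
    pieces = []
    prev = None
    pending_space = False
    for raw in words:
        word = raw.strip()
        if not word:
            pending_space = True   # empty token: explicit separator
            continue
        if raw[:1].isspace():
            pending_space = True   # leading whitespace: explicit separator
        if prev is not None and (
                pending_space
                or (re.fullmatch(before, prev)
                    and re.fullmatch(after, word))):
            pieces.append(" ")
        pieces.append(word)
        prev = word
        pending_space = raw[-1:].isspace()  # trailing whitespace carries over
    return "".join(pieces)

# For example, _reference_join(["URL", "は", "http://x"]) gives "URLはhttp://x"
# and _reference_join(["Hello", "World"]) gives "Hello World", matching the
# words_to_texts expectations above.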