Example #1
    def test_tokenizer_words_to_texts_en_with_symbol(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')
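
        # [ -~] is the printable ASCII range, so these rules appear to insert a
        # space only when the previous word ends with, and the next word starts
        # with, a printable ASCII character (inferred from the expected strings
        # below).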

        self.assertEqual(
            "http :// 192.168.1.10 / index.html",
            tokenizer.words_to_texts(
                ["http", "://", "192.168.1.10", "/", "index.html"]))

        self.assertEqual(
            "Hello world",
            tokenizer.words_to_texts(["Hello", " ", " ", "world"]))
        self.assertEqual(
            "Hello . i don ' t know",
            tokenizer.words_to_texts(
                ["Hello", ".", "i", "don", "'", "t", "know"]))
        self.assertEqual(
            "Hello _ 1 friend_ 1",
            tokenizer.words_to_texts(["Hello", "_", "1", "friend_", "1"]))
        self.assertEqual(
            "Hello < my friend >",
            tokenizer.words_to_texts(["Hello", "<", "my", "friend", ">"]))
        self.assertEqual(
            'Hello " my friend "',
            tokenizer.words_to_texts(["Hello", '"', "my", "friend", '"']))
        self.assertEqual(
            'Hello ` my friend `',
            tokenizer.words_to_texts(["Hello", "`", "my", "friend", "`"]))
Example #2
    def test_tokenizer_convert_url(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')
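
        # texts_to_words and the English-only pass keep the URL as a single
        # token, while the Japanese pass (_texts_to_words_jp) splits it at the
        # ASCII punctuation; this behaviour is inferred from the expected
        # values below.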

        words = tokenizer.texts_to_words("http://192.168.1.10/index.html")
        self.assertEqual(["http://192.168.1.10/index.html"], words)
        self.assertEqual("http://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words))

        words_en = tokenizer._texts_to_words_en(
            "http://192.168.1.10/index.html")
        self.assertEqual(["http://192.168.1.10/index.html"], words_en)
        self.assertEqual("http://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words_en))

        words_jp = tokenizer._texts_to_words_jp(
            "http://192.168.1.10/index.html")
        self.assertEqual([
            "http", "://", "192", ".", "168", ".", "1", ".", "10", "/",
            "index", ".", "html"
        ], words_jp)
        self.assertEqual("http :// 192 . 168 . 1 . 10 / index . html",
                         tokenizer.words_to_texts(words_jp))

        words_mix = tokenizer.texts_to_words(
            "URLはhttp://192.168.1.10/index.html")
        self.assertEqual(["URL", "は", "http://192.168.1.10/index.html"],
                         words_mix)
        self.assertEqual("URLはhttp://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words_mix))
Example #3
    def test_tokenizer_words_to_texts_with_text_en_json_jp(self):
        JSON_CHILD_IN = '\uF010'
        JSON_CHILD_OUT = '\uF011'
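        # U+F010/U+F011 are private-use characters that appear to bracket an
        # embedded JSON child fragment; the expected strings below keep the
        # fragment glued to its neighbours, with no extra spaces around the
        # markers.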

        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        words1 = ["Hello", JSON_CHILD_IN, "データ", JSON_CHILD_OUT, "you"]
        self.assertEqual("Hello\uF010データ\uF011you",
                         tokenizer.words_to_texts(words1))

        words2 = [
            "Hello", '"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"', "you"
        ]
        self.assertEqual('Hello "\uF010データ\uF011" you',
                         tokenizer.words_to_texts(words2))

        words3 = ["Hello", JSON_CHILD_IN, "データ", JSON_CHILD_OUT]
        self.assertEqual('Hello\uF010データ\uF011',
                         tokenizer.words_to_texts(words3))

        words4 = ["Hello", '"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"']
        self.assertEqual('Hello "\uF010データ\uF011"',
                         tokenizer.words_to_texts(words4))

        words5 = [JSON_CHILD_IN, "データ", JSON_CHILD_OUT, "you"]
        self.assertEqual('\uF010データ\uF011you',
                         tokenizer.words_to_texts(words5))

        words6 = ['"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"', "you"]
        self.assertEqual('"\uF010データ\uF011" you',
                         tokenizer.words_to_texts(words6))
Example #4
    def test_tokenizer_words_to_texts_json_tag(self):
        JSON_CHILD_IN = '\uF010'
        JSON_CHILD_OUT = '\uF011'

        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        words1 = [JSON_CHILD_IN, "json", "data", JSON_CHILD_OUT]
        self.assertEqual("\uF010json data\uF011",
                         tokenizer.words_to_texts(words1))

        words2 = [JSON_CHILD_IN, "データ", "設定", JSON_CHILD_OUT]
        self.assertEqual("\uF010データ設定\uF011",
                         tokenizer.words_to_texts(words2))

        words3 = [JSON_CHILD_IN, "json", "設定", JSON_CHILD_OUT]
        self.assertEqual("\uF010json設定\uF011",
                         tokenizer.words_to_texts(words3))

        words4 = [JSON_CHILD_IN, "データ", "json", JSON_CHILD_OUT]
        self.assertEqual("\uF010データjson\uF011",
                         tokenizer.words_to_texts(words4))

        words5 = [JSON_CHILD_IN, "json", "設定", "data", JSON_CHILD_OUT]
        self.assertEqual("\uF010json設定data\uF011",
                         tokenizer.words_to_texts(words5))

        words6 = [JSON_CHILD_IN, "データ", "json", "設定", JSON_CHILD_OUT]
        self.assertEqual("\uF010データjson設定\uF011",
                         tokenizer.words_to_texts(words6))
Example #5
    def test_tokenizer_words_to_texts_url(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        self.assertEqual(
            "http :// 192.168.1.10 / index.html",
            tokenizer.words_to_texts(
                ["http", "://", "192.168.1.10", "/", "index.html"]))
Example #6
    def test_tokenizer_words_from_current_pos_mix(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        with self.assertRaises(Exception):
            tokenizer.words_from_current_pos(None, 0)
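
        # Negative positions appear to count back from the end of the word
        # list and clamp at the start once they run past it (-3 and -4 both
        # return the full string below).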

        words = ["Yes", "か", "No"]
        self.assertEqual("YesかNo", tokenizer.words_from_current_pos(words, 0))
        self.assertEqual("かNo", tokenizer.words_from_current_pos(words, 1))
        self.assertEqual("No", tokenizer.words_from_current_pos(words, 2))
        self.assertEqual("", tokenizer.words_from_current_pos(words, 3))
        self.assertEqual("No", tokenizer.words_from_current_pos(words, -1))
        self.assertEqual("かNo", tokenizer.words_from_current_pos(words, -2))
        self.assertEqual("YesかNo", tokenizer.words_from_current_pos(words, -3))
        self.assertEqual("YesかNo", tokenizer.words_from_current_pos(words, -4))
Example #7
    def test_tokenizer_words_to_texts_mix(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        self.assertEqual(
            "こんにちは10日はHappy dayですね",
            tokenizer.words_to_texts(
                ["こんにちは", "10", "日", "は", "Happy", "day", "です", "ね"]))
        self.assertEqual(
            "=こんにちは10日はHappy dayですね=",
            tokenizer.words_to_texts(
                ["=", "こんにちは", "10", "日", "は", "Happy", "day", "です", "ね",
                 "="]))
        self.assertEqual(
            "pen lightはありますか",
            tokenizer.words_to_texts(['pen', 'light', 'は', 'あり', 'ます', 'か']))
Example #8
    def test_tokenizer_words_to_texts_with_text_jp_json_en(self):
        JSON_CHILD_IN = '\uF010'
        JSON_CHILD_OUT = '\uF011'

        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        words1 = [
            "こんにちは", JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, "です", "ね"
        ]
        self.assertEqual("こんにちは\uF010json-data\uF011ですね",
                         tokenizer.words_to_texts(words1))

        words2 = [
            "こんにちは", '"', JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, '"',
            "です", "ね"
        ]
        self.assertEqual('こんにちは"\uF010json-data\uF011"ですね',
                         tokenizer.words_to_texts(words2))

        words3 = ["こんにちは", JSON_CHILD_IN, "json-data", JSON_CHILD_OUT]
        self.assertEqual('こんにちは\uF010json-data\uF011',
                         tokenizer.words_to_texts(words3))

        words4 = [
            "こんにちは", '"', JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, '"'
        ]
        self.assertEqual('こんにちは"\uF010json-data\uF011"',
                         tokenizer.words_to_texts(words4))

        words5 = [JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, "です", "ね"]
        self.assertEqual('\uF010json-data\uF011ですね',
                         tokenizer.words_to_texts(words5))

        words6 = [
            '"', JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, '"', "です", "ね"
        ]
        self.assertEqual('"\uF010json-data\uF011"ですね',
                         tokenizer.words_to_texts(words6))
Example #9
    def test_tokenizer_words_to_texts_with_quote(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        self.assertEqual(
            'Hello " very good " World',
            tokenizer.words_to_texts(
                ["Hello", '"', "very", "good", '"', "World"]))
        self.assertEqual(
            'Hello "very good" World',
            tokenizer.words_to_texts(["Hello", '"very', 'good"', "World"]))
        self.assertEqual(
            'こんにちは"良い天気"ですね',
            tokenizer.words_to_texts(["こんにちは", '"', "良い天気", '"', "です", "ね"]))
        self.assertEqual(
            'こんにちは"良い天気"ですね',
            tokenizer.words_to_texts(["こんにちは", '"良い天気"', "です", "ね"]))
Example #10
    def test_tokenizer_words_to_texts_en(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        self.assertEqual("Hello World",
                         tokenizer.words_to_texts(["Hello", "World"]))
        self.assertEqual("Hello World",
                         tokenizer.words_to_texts(["Hello", "", "World"]))
        self.assertEqual("Hello 1 World",
                         tokenizer.words_to_texts(["Hello", "1", "World"]))
        self.assertEqual(
            "Hello < 1 > World",
            tokenizer.words_to_texts(["Hello", "<", "1", ">", "World"]))
        self.assertEqual("Hello1 1World",
                         tokenizer.words_to_texts(["Hello1", "1World"]))
        self.assertEqual(
            "= Hello1 World =",
            tokenizer.words_to_texts(["=", "Hello1", "World", "="]))
Example #11
    def test_tokenizer_normal_texts(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(["Hello"], tokenizer.texts_to_words("Hello"))
        self.assertEqual(["Hello", "World"],
                         tokenizer.texts_to_words("Hello World"))
        self.assertEqual(["Hello", "World"],
                         tokenizer.texts_to_words(" Hello   World "))
        self.assertEqual(["こんにちは"], tokenizer.texts_to_words("こんにちは"))
        self.assertEqual(["こんにちは", "良い", "天気", "です", "ね"],
                         tokenizer.texts_to_words("こんにちは良い天気ですね"))
        self.assertEqual(["こんにちは", "良い", "天気", "です", "ね"],
                         tokenizer.texts_to_words(" こんにちは 良い天気ですね "))

        self.assertEqual("", tokenizer.words_to_texts([]))
        self.assertEqual("Hello", tokenizer.words_to_texts(["Hello"]))
        self.assertEqual("Hello World",
                         tokenizer.words_to_texts(["Hello", "World"]))
        self.assertEqual("Hello World",
                         tokenizer.words_to_texts(["Hello", "", "World"]))
        self.assertEqual("Hello World",
                         tokenizer.words_to_texts([" Hello ", " World "]))
        self.assertEqual("Hello", tokenizer.words_to_texts(["Hello"]))
        self.assertEqual("こんにちは", tokenizer.words_to_texts(["こんにちは"]))
        self.assertEqual(
            "こんにちは 良い天気ですね",
            tokenizer.words_to_texts(["こんにちは", "", "良い", "天気", "です", "ね"]))
        self.assertEqual(
            "こんにちは 良い 天気 です ね",
            tokenizer.words_to_texts(
                [" こんにちは ", " 良い ", " 天気 ", " です ", " ね "]))
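
Each example above is a test method; a minimal sketch of the surrounding scaffolding is shown below. The import path for TokenizerJP is an assumption, since the examples do not show which module defines it.

import unittest

# Assumed import path; adjust to wherever TokenizerJP actually lives.
from tokenizer_jp import TokenizerJP


class TokenizerJPTest(unittest.TestCase):

    def test_tokenizer_words_to_texts_en(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        # Printable-ASCII concatenation rules shared by all of the examples.
        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        self.assertEqual("Hello World",
                         tokenizer.words_to_texts(["Hello", "World"]))


if __name__ == '__main__':
    unittest.main()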