Example #1
    def test_tokenizer_convert_url(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

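        # texts_to_words keeps the URL as a single token; words_to_texts
        # reproduces the original string unchanged.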
        words = tokenizer.texts_to_words("http://192.168.1.10/index.html")
        self.assertEqual(["http://192.168.1.10/index.html"], words)
        self.assertEqual("http://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words))

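        # The English-only path also treats the whole URL as one word.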
        words_en = tokenizer._texts_to_words_en(
            "http://192.168.1.10/index.html")
        self.assertEqual(["http://192.168.1.10/index.html"], words_en)
        self.assertEqual("http://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words_en))

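        # The Japanese path splits the URL into morphological pieces, and
        # joining them back inserts a space between each piece.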
        words_jp = tokenizer._texts_to_words_jp(
            "http://192.168.1.10/index.html")
        self.assertEqual([
            "http", "://", "192", ".", "168", ".", "1", ".", "10", "/",
            "index", ".", "html"
        ], words_jp)
        self.assertEqual("http :// 192 . 168 . 1 . 10 / index . html",
                         tokenizer.words_to_texts(words_jp))

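        # Mixed Japanese/ASCII input: the URL stays intact as one token and
        # the round trip restores the original text without extra spaces.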
        words_mix = tokenizer.texts_to_words(
            "URLはhttp://192.168.1.10/index.html")
        self.assertEqual(["URL", "は", "http://192.168.1.10/index.html"],
                         words_mix)
        self.assertEqual("URLはhttp://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words_mix))
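
A note on the two configuration rules used above (and again in Example #9): the character class [ -~] is the printable ASCII range, from space (0x20) to tilde (0x7E). When fully matched, '.*[ -~]' requires the last character of the text to be printable ASCII, and '[ -~].*' requires the first character to be printable ASCII. The snippet below is a minimal standalone sketch of what those patterns match using plain re; it does not show how TokenizerJP applies them internally.

import re

# '[ -~]' is the printable ASCII range, from space (0x20) to tilde (0x7E).
before_rule = re.compile('.*[ -~]')  # fullmatch: last character is printable ASCII
after_rule = re.compile('[ -~].*')   # match: first character is printable ASCII

print(bool(before_rule.fullmatch("http")))      # True:  ends with ASCII 'p'
print(bool(before_rule.fullmatch("こんにちは")))  # False: contains no ASCII characters
print(bool(after_rule.match("192.168.1.10")))   # True:  starts with ASCII '1'
print(bool(after_rule.match("は日本語")))         # False: starts with 'は'
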
Example #2
    def test_tokenizer_texts_to_words_en(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(
            ["Hello,", "he", "is", "Mr.A", "(No", "name)"],
            tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
Example #3
    def test_tokenizer_texts_to_words_en_with_punctation(self):
        punctations = ';\'",!()[]:’”;、。!()「」'
        tokenizer = TokenizerJP(punctuation_chars=punctations)
        self.assertIsNotNone(tokenizer)

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(
            ["Hello", "he", "is", "Mr.A", "No", "name"],
            tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
Example #4
    def test_tokenizer_template_texts_to_words_en(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
        tokenizer.is_template = True

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(
            ["Hello, he is Mr.A (No name)"],
            tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
Example #5
    def test_tokenizer_template_texts_to_words_mix(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
        tokenizer.is_template = True

        self.assertEqual(["こんにちはhappyですか"],
                         tokenizer.texts_to_words("こんにちはhappyですか"))
        self.assertEqual(["こんにちは happy ですか"],
                         tokenizer.texts_to_words("こんにちは happy ですか"))
        self.assertEqual(["こんにちは(happy)ですか"],
                         tokenizer.texts_to_words("こんにちは(happy)ですか"))
Example #6
    def test_tokenizer_texts_to_words_mix_with_punctation(self):
        punctations = ';\'",!()[]:’”;、。!()「」'
        tokenizer = TokenizerJP(punctuation_chars=punctations)
        self.assertIsNotNone(tokenizer)

        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちはhappyですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは happy ですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは(happy)ですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは「happy]ですか"))
        self.assertEqual(["こんにちは", "happy", "unhappy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは happy, unhappy ですか"))
Example #7
    def test_tokenizer_template_texts_to_words_jp(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
        tokenizer.is_template = True

        self.assertEqual(["こんにちは「良い天気」ですね"],
                         tokenizer.texts_to_words("こんにちは「良い天気」ですね"))
Example #8
    def test_tokenizer_texts_to_words_mix(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        self.assertEqual(["こんにちは", "happy"],
                         tokenizer.texts_to_words("こんにちはhappy"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちはhappyですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは happy ですか"))
        self.assertEqual(["こんにちは", "(happy)", "です", "か"],
                         tokenizer.texts_to_words("こんにちは(happy)ですか"))

        self.assertEqual(["Hello", "ハッピー"],
                         tokenizer.texts_to_words("Hello ハッピー"))
        self.assertEqual(["Hello", "ハッピー", "です", "か"],
                         tokenizer.texts_to_words("Helloハッピーですか"))
        self.assertEqual(["Hello", "(", "ハッピー", ")", "です", "か"],
                         tokenizer.texts_to_words("Hello (ハッピー)ですか"))
        self.assertEqual(["Hello", "ハッピー", "you"],
                         tokenizer.texts_to_words("Helloハッピーyou"))
        self.assertEqual(["Hello", "ハッピー", "you"],
                         tokenizer.texts_to_words("Hello ハッピー you"))
Example #9
    def test_tokenizer_normal_texts(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(["Hello"], tokenizer.texts_to_words("Hello"))
        self.assertEqual(["Hello", "World"],
                         tokenizer.texts_to_words("Hello World"))
        self.assertEqual(["Hello", "World"],
                         tokenizer.texts_to_words(" Hello   World "))
        self.assertEqual(["こんにちは"], tokenizer.texts_to_words("こんにちは"))
        self.assertEqual(["こんにちは", "良い", "天気", "です", "ね"],
                         tokenizer.texts_to_words("こんにちは良い天気ですね"))
        self.assertEqual(["こんにちは", "良い", "天気", "です", "ね"],
                         tokenizer.texts_to_words(" こんにちは 良い天気ですね "))

        self.assertEqual("", tokenizer.words_to_texts([]))
        self.assertEqual("Hello", tokenizer.words_to_texts(["Hello"]))
        self.assertEqual("Hello World",
                         tokenizer.words_to_texts(["Hello", "World"]))
        self.assertEqual("Hello World",
                         tokenizer.words_to_texts(["Hello", "", "World"]))
        self.assertEqual("Hello World",
                         tokenizer.words_to_texts([" Hello ", " World "]))
        self.assertEqual("Hello", tokenizer.words_to_texts(["Hello"]))
        self.assertEqual("こんにちは", tokenizer.words_to_texts(["こんにちは"]))
        self.assertEqual(
            "こんにちは 良い天気ですね",
            tokenizer.words_to_texts(["こんにちは", "", "良い", "天気", "です", "ね"]))
        self.assertEqual(
            "こんにちは 良い 天気 です ね",
            tokenizer.words_to_texts(
                [" こんにちは ", " 良い ", " 天気 ", " です ", " ね "]))
Example #10
    def test_tokenizer_texts_to_words_jp(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        self.assertEqual(["こんにちは", "「", "良い", "天気", "」", "です", "ね"],
                         tokenizer.texts_to_words("こんにちは「良い天気」ですね"))
Example #11
    def test_tokenizer_texts_to_words_jp_with_punctation(self):
        punctations = ';\'",!()[]:’”;、。!()「」'
        tokenizer = TokenizerJP(punctuation_chars=punctations)

        self.assertEqual(["こんにちは", "良い", "天気", "です", "ね"],
                         tokenizer.texts_to_words("こんにちは「良い天気」ですね"))
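
The methods above are excerpts from a unittest test class; the listing omits the imports and class definition around them. Below is a minimal sketch of that scaffolding, with any of the example methods pasted into the class body. The import path for TokenizerJP is a guess, not taken from the listing, and should be adjusted to wherever the class is defined in your project.

import unittest

# Hypothetical import path - replace with the actual module that defines TokenizerJP.
from programy.dialog.tokenizer.tokenizer_jp import TokenizerJP


class TokenizerJPTests(unittest.TestCase):

    def test_tokenizer_texts_to_words_jp(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        self.assertEqual(["こんにちは", "「", "良い", "天気", "」", "です", "ね"],
                         tokenizer.texts_to_words("こんにちは「良い天気」ですね"))


if __name__ == "__main__":
    unittest.main()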