Example #1
0
 def setUp(self):
     self.tokenizer = MegaHALTokenizer()
Example #2
0
class testMegaHALTokenizer(unittest.TestCase):
    """Tests for MegaHALTokenizer's split/join behavior.

    MegaHAL-style tokenization upcases words, keeps punctuation and
    whitespace as separate tokens, and guarantees every token list
    ends with a sentence terminator.
    """

    def setUp(self):
        """Create a fresh tokenizer before each test."""
        self.tokenizer = MegaHALTokenizer()

    def testSplitEmpty(self):
        """Empty input yields no tokens (no implicit terminator)."""
        self.assertEqual(len(self.tokenizer.split("")), 0)

    def testSplitSentence(self):
        """Words are upcased; the trailing period is its own token."""
        words = self.tokenizer.split("hi.")
        self.assertEqual(words, ["HI", "."])

    def testSplitComma(self):
        """Punctuation keeps its following space; a period is appended."""
        words = self.tokenizer.split("hi, cobe")
        self.assertEqual(words, ["HI", ", ", "COBE", "."])

    def testSplitImplicitStop(self):
        """Input without a terminator gets a period appended."""
        words = self.tokenizer.split("hi")
        self.assertEqual(words, ["HI", "."])

    def testSplitUrl(self):
        """URLs are broken into alpha runs and punctuation runs."""
        words = self.tokenizer.split("http://www.google.com/")
        self.assertEqual(
            words, ["HTTP", "://", "WWW", ".", "GOOGLE", ".", "COM", "/."])

    def testSplitNonUnicode(self):
        # split() only accepts text; bytes input must raise TypeError.
        # (Plain "foo" is already unicode in Python 3, so bytes are
        # needed to actually exercise the type check.)
        self.assertRaises(TypeError, self.tokenizer.split, b"foo")

    def testSplitApostrophe(self):
        """Apostrophes inside words don't split the word."""
        words = self.tokenizer.split("hal's brain")
        self.assertEqual(words, ["HAL'S", " ", "BRAIN", "."])

        words = self.tokenizer.split("',','")
        self.assertEqual(words, ["'", ",", "'", ",", "'", "."])

    def testSplitAlphaAndNumeric(self):
        """Letter runs and digit runs become separate tokens."""
        words = self.tokenizer.split("hal9000, test blah 12312")
        self.assertEqual(
            words,
            ["HAL", "9000", ", ", "TEST", " ", "BLAH", " ", "12312", "."])

        words = self.tokenizer.split("hal9000's test")
        self.assertEqual(words, ["HAL", "9000", "'S", " ", "TEST", "."])

    def testCapitalize(self):
        """join() capitalizes the first word of each sentence."""
        words = self.tokenizer.split("this is a test")
        self.assertEqual("This is a test.", self.tokenizer.join(words))

        words = self.tokenizer.split("A.B. Hal test test. will test")
        self.assertEqual("A.b. Hal test test. Will test.",
                         self.tokenizer.join(words))

        words = self.tokenizer.split("2nd place test")
        self.assertEqual("2Nd place test.", self.tokenizer.join(words))
Example #3
0
class testMegaHALTokenizer(unittest.TestCase):
    """Tests for MegaHALTokenizer's split/join behavior.

    Fixes applied: removed a duplicate testSplitApostrophe definition
    (the second silently shadowed the first, so only one ever ran) and
    replaced the deprecated assertEquals alias with assertEqual.
    """

    def setUp(self):
        """Create a fresh tokenizer before each test."""
        self.tokenizer = MegaHALTokenizer()

    def testSplitEmpty(self):
        """Empty input yields no tokens (no implicit terminator)."""
        self.assertEqual(len(self.tokenizer.split(u"")), 0)

    def testSplitSentence(self):
        """Words are upcased; the trailing period is its own token."""
        words = self.tokenizer.split(u"hi.")
        self.assertEqual(words, ["HI", "."])

    def testSplitComma(self):
        """Punctuation keeps its following space; a period is appended."""
        words = self.tokenizer.split(u"hi, cobe")
        self.assertEqual(words, ["HI", ", ", "COBE", "."])

    def testSplitImplicitStop(self):
        """Input without a terminator gets a period appended."""
        words = self.tokenizer.split(u"hi")
        self.assertEqual(words, ["HI", "."])

    def testSplitUrl(self):
        """URLs are broken into alpha runs and punctuation runs."""
        words = self.tokenizer.split(u"http://www.google.com/")
        self.assertEqual(words, ["HTTP", "://", "WWW", ".", "GOOGLE", ".", "COM", "/."])

    def testSplitNonUnicode(self):
        # split() only accepts text; bytes input must raise TypeError.
        # b"foo" is a byte string on both Python 2 and 3, so the check
        # is exercised on either version.
        self.assertRaises(TypeError, self.tokenizer.split, b"foo")

    def testSplitApostrophe(self):
        """Apostrophes inside words don't split the word."""
        words = self.tokenizer.split(u"hal's brain")
        self.assertEqual(words, ["HAL'S", " ", "BRAIN", "."])

        words = self.tokenizer.split(u"',','")
        self.assertEqual(words, ["'", ",", "'", ",", "'", "."])

    def testSplitAlphaAndNumeric(self):
        """Letter runs and digit runs become separate tokens."""
        words = self.tokenizer.split(u"hal9000, test blah 12312")
        self.assertEqual(words, ["HAL", "9000", ", ", "TEST", " ", "BLAH", " ", "12312", "."])

        words = self.tokenizer.split(u"hal9000's test")
        self.assertEqual(words, ["HAL", "9000", "'S", " ", "TEST", "."])

    def testCapitalize(self):
        """join() capitalizes the first word of each sentence."""
        words = self.tokenizer.split(u"this is a test")
        self.assertEqual(u"This is a test.", self.tokenizer.join(words))

        words = self.tokenizer.split(u"A.B. Hal test test. will test")
        self.assertEqual(u"A.b. Hal test test. Will test.",
                         self.tokenizer.join(words))

        words = self.tokenizer.split(u"2nd place test")
        self.assertEqual(u"2Nd place test.", self.tokenizer.join(words))
Example #4
0
 def setUp(self):
     self.tokenizer = MegaHALTokenizer()
Example #5
0
class TestMegaHALTokenizer(unittest.TestCase):
    """Tests for MegaHALTokenizer's split/join behavior.

    Uses assertEqual throughout (assertEquals is a deprecated alias
    that raises DeprecationWarning and was removed in Python 3.12).
    """

    def setUp(self):
        """Create a fresh tokenizer before each test."""
        self.tokenizer = MegaHALTokenizer()

    def test_split_empty(self):
        """Empty input yields no tokens (no implicit terminator)."""
        self.assertEqual(len(self.tokenizer.split(u"")), 0)

    def test_split_sentence(self):
        """Words are upcased; the trailing period is its own token."""
        words = self.tokenizer.split(u"hi.")
        self.assertEqual(words, ["HI", "."])

    def test_split_comma(self):
        """Punctuation keeps its following space; a period is appended."""
        words = self.tokenizer.split(u"hi, cobe")
        self.assertEqual(words, ["HI", ", ", "COBE", "."])

    def test_split_implicit_stop(self):
        """Input without a terminator gets a period appended."""
        words = self.tokenizer.split(u"hi")
        self.assertEqual(words, ["HI", "."])

    def test_split_url(self):
        """URLs are broken into alpha runs and punctuation runs."""
        words = self.tokenizer.split(u"http://www.google.com/")
        self.assertEqual(words, ["HTTP", "://", "WWW", ".", "GOOGLE",
                                 ".", "COM", "/."])

    def test_split_non_unicode(self):
        # split() only accepts text; bytes input must raise TypeError.
        # b"foo" is a byte string on both Python 2 and 3, so the check
        # is exercised on either version.
        self.assertRaises(TypeError, self.tokenizer.split, b"foo")

    def test_split_apostrophe(self):
        """Apostrophes inside words don't split the word."""
        words = self.tokenizer.split(u"hal's brain")
        self.assertEqual(words, ["HAL'S", " ", "BRAIN", "."])

        words = self.tokenizer.split(u"',','")
        self.assertEqual(words, ["'", ",", "'", ",", "'", "."])

    def test_split_alpha_and_numeric(self):
        """Letter runs and digit runs become separate tokens."""
        words = self.tokenizer.split(u"hal9000, test blah 12312")
        self.assertEqual(words, ["HAL", "9000", ", ", "TEST", " ",
                                 "BLAH", " ", "12312", "."])

        words = self.tokenizer.split(u"hal9000's test")
        self.assertEqual(words, ["HAL", "9000", "'S", " ", "TEST", "."])

    def test_capitalize(self):
        """join() capitalizes the first word of each sentence."""
        words = self.tokenizer.split(u"this is a test")
        self.assertEqual(u"This is a test.", self.tokenizer.join(words))

        words = self.tokenizer.split(u"A.B. Hal test test. will test")
        self.assertEqual(u"A.b. Hal test test. Will test.",
                         self.tokenizer.join(words))

        words = self.tokenizer.split(u"2nd place test")
        self.assertEqual(u"2Nd place test.", self.tokenizer.join(words))