# NOTE(review): stray top-level setUp() — it takes `self` but sits outside any
# class, and an identical setUp is defined inside each test class below.
# Presumably paste/merge residue; confirm and delete rather than keep.
def setUp(self): self.tokenizer = MegaHALTokenizer()
class testMegaHALTokenizer(unittest.TestCase):
    """Exercise MegaHALTokenizer.split() tokenization and join() casing."""

    def setUp(self):
        self.tokenizer = MegaHALTokenizer()

    def testSplitEmpty(self):
        # Empty input produces no tokens at all.
        self.assertEqual(len(self.tokenizer.split("")), 0)

    def testSplitSentence(self):
        # Words are upcased; the sentence-final period is its own token.
        self.assertEqual(self.tokenizer.split("hi."), ["HI", "."])

    def testSplitComma(self):
        # A comma keeps its trailing space as part of the token.
        self.assertEqual(self.tokenizer.split("hi, cobe"),
                         ["HI", ", ", "COBE", "."])

    def testSplitImplicitStop(self):
        # A terminating "." is appended when the input lacks one.
        self.assertEqual(self.tokenizer.split("hi"), ["HI", "."])

    def testSplitUrl(self):
        self.assertEqual(
            self.tokenizer.split("http://www.google.com/"),
            ["HTTP", "://", "WWW", ".", "GOOGLE", ".", "COM", "/."])

    def testSplitNonUnicode(self):
        # NOTE(review): expects split() to reject "foo" with TypeError, but
        # under Python 3 "foo" is an ordinary str — this looks like a
        # bytes-vs-unicode check inherited from Python 2; confirm against
        # MegaHALTokenizer.split's actual contract.
        self.assertRaises(TypeError, self.tokenizer.split, "foo")

    def testSplitApostrophe(self):
        # An apostrophe inside a word stays part of the word token...
        self.assertEqual(self.tokenizer.split("hal's brain"),
                         ["HAL'S", " ", "BRAIN", "."])
        # ...while standalone quotes and commas tokenize individually.
        self.assertEqual(self.tokenizer.split("',','"),
                         ["'", ",", "'", ",", "'", "."])

    def testSplitAlphaAndNumeric(self):
        # Letter/digit boundaries split into separate tokens.
        self.assertEqual(
            self.tokenizer.split("hal9000, test blah 12312"),
            ["HAL", "9000", ", ", "TEST", " ", "BLAH", " ", "12312", "."])
        self.assertEqual(self.tokenizer.split("hal9000's test"),
                         ["HAL", "9000", "'S", " ", "TEST", "."])

    def testCapitalize(self):
        # join() renders tokens back as sentence-cased text.
        self.assertEqual(
            "This is a test.",
            self.tokenizer.join(self.tokenizer.split("this is a test")))
        self.assertEqual(
            "A.b. Hal test test. Will test.",
            self.tokenizer.join(
                self.tokenizer.split("A.B. Hal test test. will test")))
        self.assertEqual(
            "2Nd place test.",
            self.tokenizer.join(self.tokenizer.split("2nd place test")))
class testMegaHALTokenizer(unittest.TestCase):
    """Tests for MegaHALTokenizer.split() and join().

    Fixes over the previous revision:
      * the deprecated ``assertEquals`` alias is replaced with ``assertEqual``;
      * ``testSplitApostrophe`` was defined twice, so the second (smaller)
        definition shadowed the first and silently dropped the ``"',','"``
        assertions — the duplicates are merged into one method.
    """

    def setUp(self):
        self.tokenizer = MegaHALTokenizer()

    def testSplitEmpty(self):
        # Empty input produces no tokens.
        self.assertEqual(len(self.tokenizer.split(u"")), 0)

    def testSplitSentence(self):
        # Words are upcased; the final period is its own token.
        words = self.tokenizer.split(u"hi.")
        self.assertEqual(words, ["HI", "."])

    def testSplitComma(self):
        # A comma keeps its trailing space as part of the token.
        words = self.tokenizer.split(u"hi, cobe")
        self.assertEqual(words, ["HI", ", ", "COBE", "."])

    def testSplitImplicitStop(self):
        # A terminating "." is appended when the input lacks one.
        words = self.tokenizer.split(u"hi")
        self.assertEqual(words, ["HI", "."])

    def testSplitUrl(self):
        words = self.tokenizer.split(u"http://www.google.com/")
        self.assertEqual(
            words, ["HTTP", "://", "WWW", ".", "GOOGLE", ".", "COM", "/."])

    def testSplitNonUnicode(self):
        # split() accepts only unicode text; a byte string must raise.
        self.assertRaises(TypeError, self.tokenizer.split, "foo")

    def testSplitApostrophe(self):
        # Merged from the two previous duplicate definitions: an apostrophe
        # inside a word stays in the word token, standalone quotes/commas
        # tokenize individually.
        words = self.tokenizer.split(u"hal's brain")
        self.assertEqual(words, ["HAL'S", " ", "BRAIN", "."])
        words = self.tokenizer.split(u"',','")
        self.assertEqual(words, ["'", ",", "'", ",", "'", "."])

    def testSplitAlphaAndNumeric(self):
        # Letter/digit boundaries split into separate tokens.
        words = self.tokenizer.split(u"hal9000, test blah 12312")
        self.assertEqual(
            words,
            ["HAL", "9000", ", ", "TEST", " ", "BLAH", " ", "12312", "."])
        words = self.tokenizer.split(u"hal9000's test")
        self.assertEqual(words, ["HAL", "9000", "'S", " ", "TEST", "."])

    def testCapitalize(self):
        # join() renders tokens back as sentence-cased text.
        words = self.tokenizer.split(u"this is a test")
        self.assertEqual(u"This is a test.", self.tokenizer.join(words))
        words = self.tokenizer.split(u"A.B. Hal test test. will test")
        self.assertEqual(u"A.b. Hal test test. Will test.",
                         self.tokenizer.join(words))
        words = self.tokenizer.split(u"2nd place test")
        self.assertEqual(u"2Nd place test.", self.tokenizer.join(words))
class TestMegaHALTokenizer(unittest.TestCase):
    """Tests for MegaHALTokenizer.split() and join().

    Fix: the deprecated ``assertEquals`` alias (removed in Python 3.12) is
    replaced with ``assertEqual`` throughout; behavior is otherwise unchanged.
    """

    def setUp(self):
        self.tokenizer = MegaHALTokenizer()

    def test_split_empty(self):
        # Empty input produces no tokens.
        self.assertEqual(len(self.tokenizer.split(u"")), 0)

    def test_split_sentence(self):
        # Words are upcased; the final period is its own token.
        words = self.tokenizer.split(u"hi.")
        self.assertEqual(words, ["HI", "."])

    def test_split_comma(self):
        # A comma keeps its trailing space as part of the token.
        words = self.tokenizer.split(u"hi, cobe")
        self.assertEqual(words, ["HI", ", ", "COBE", "."])

    def test_split_implicit_stop(self):
        # A terminating "." is appended when the input lacks one.
        words = self.tokenizer.split(u"hi")
        self.assertEqual(words, ["HI", "."])

    def test_split_url(self):
        words = self.tokenizer.split(u"http://www.google.com/")
        self.assertEqual(
            words, ["HTTP", "://", "WWW", ".", "GOOGLE", ".", "COM", "/."])

    def test_split_non_unicode(self):
        # split() accepts only unicode text; a byte string must raise.
        self.assertRaises(TypeError, self.tokenizer.split, "foo")

    def test_split_apostrophe(self):
        # An apostrophe inside a word stays in the word token; standalone
        # quotes and commas tokenize individually.
        words = self.tokenizer.split(u"hal's brain")
        self.assertEqual(words, ["HAL'S", " ", "BRAIN", "."])
        words = self.tokenizer.split(u"',','")
        self.assertEqual(words, ["'", ",", "'", ",", "'", "."])

    def test_split_alpha_and_numeric(self):
        # Letter/digit boundaries split into separate tokens.
        words = self.tokenizer.split(u"hal9000, test blah 12312")
        self.assertEqual(
            words,
            ["HAL", "9000", ", ", "TEST", " ", "BLAH", " ", "12312", "."])
        words = self.tokenizer.split(u"hal9000's test")
        self.assertEqual(words, ["HAL", "9000", "'S", " ", "TEST", "."])

    def test_capitalize(self):
        # join() renders tokens back as sentence-cased text.
        words = self.tokenizer.split(u"this is a test")
        self.assertEqual(u"This is a test.", self.tokenizer.join(words))
        words = self.tokenizer.split(u"A.B. Hal test test. will test")
        self.assertEqual(u"A.b. Hal test test. Will test.",
                         self.tokenizer.join(words))
        words = self.tokenizer.split(u"2nd place test")
        self.assertEqual(u"2Nd place test.", self.tokenizer.join(words))