import unittest

from cobe.tokenizers import CobeTokenizer


class TestCobeTokenizer(unittest.TestCase):
    def setUp(self):
        self.tokenizer = CobeTokenizer()

    def test_split_empty(self):
        self.assertEqual(len(self.tokenizer.split(u"")), 0)

    def test_split_sentence(self):
        words = self.tokenizer.split(u"hi.")
        self.assertEqual(words, ["hi", "."])

    def test_split_comma(self):
        words = self.tokenizer.split(u"hi, cobe")
        self.assertEqual(words, ["hi", ",", " ", "cobe"])

    def test_split_dash(self):
        words = self.tokenizer.split(u"hi - cobe")
        self.assertEqual(words, ["hi", " ", "-", " ", "cobe"])

    def test_split_multiple_spaces_with_dash(self):
        # runs of whitespace around the dash collapse to single space tokens
        words = self.tokenizer.split(u"hi  -  cobe")
        self.assertEqual(words, ["hi", " ", "-", " ", "cobe"])

    def test_split_leading_dash(self):
        words = self.tokenizer.split(u"-foo")
        self.assertEqual(words, ["-foo"])

    def test_split_leading_space(self):
        words = self.tokenizer.split(u" foo")
        self.assertEqual(words, ["foo"])

        words = self.tokenizer.split(u"  foo")
        self.assertEqual(words, ["foo"])

    def test_split_trailing_space(self):
        words = self.tokenizer.split(u"foo ")
        self.assertEqual(words, ["foo"])

        words = self.tokenizer.split(u"foo  ")
        self.assertEqual(words, ["foo"])

    def test_split_smiles(self):
        words = self.tokenizer.split(u":)")
        self.assertEqual(words, [":)"])

        words = self.tokenizer.split(u";)")
        self.assertEqual(words, [";)"])

        # not smiles, but frowns are kept as single tokens too
        words = self.tokenizer.split(u":(")
        self.assertEqual(words, [":("])

        words = self.tokenizer.split(u";(")
        self.assertEqual(words, [";("])

    def test_split_url(self):
        words = self.tokenizer.split(u"http://www.google.com/")
        self.assertEqual(words, ["http://www.google.com/"])

        words = self.tokenizer.split(u"https://www.google.com/")
        self.assertEqual(words, ["https://www.google.com/"])

        # odd protocols
        words = self.tokenizer.split(u"cobe://www.google.com/")
        self.assertEqual(words, ["cobe://www.google.com/"])

        words = self.tokenizer.split(u"cobe:www.google.com/")
        self.assertEqual(words, ["cobe:www.google.com/"])

        words = self.tokenizer.split(u":foo")
        self.assertEqual(words, [":", "foo"])

    def test_split_multiple_spaces(self):
        words = self.tokenizer.split(u"this  is  a  test")
        self.assertEqual(words, ["this", " ", "is", " ", "a", " ", "test"])

    def test_split_very_sad_frown(self):
        words = self.tokenizer.split(u"testing : (")
        self.assertEqual(words, ["testing", " ", ": ("])

        words = self.tokenizer.split(u"testing :  (")
        self.assertEqual(words, ["testing", " ", ": ("])

        words = self.tokenizer.split(u"testing : ( foo")
        self.assertEqual(words, ["testing", " ", ": (", " ", "foo"])

    def test_split_hyphenated_word(self):
        words = self.tokenizer.split(u"test-ing")
        self.assertEqual(words, ["test-ing"])

        words = self.tokenizer.split(u":-)")
        self.assertEqual(words, [":-)"])

        words = self.tokenizer.split(u"test-ing :-) 1-2-3")
        self.assertEqual(words, ["test-ing", " ", ":-)", " ", "1-2-3"])

    def test_split_apostrophes(self):
        words = self.tokenizer.split(u"don't :'(")
        self.assertEqual(words, ["don't", " ", ":'("])

    def test_split_non_unicode(self):
        # a raw bytestring (non-unicode str on Python 2) is rejected
        self.assertRaises(TypeError, self.tokenizer.split, "foo")

    def test_join(self):
        self.assertEqual("foo bar baz",
                         self.tokenizer.join(["foo", " ", "bar", " ", "baz"]))


if __name__ == "__main__":
    unittest.main()
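# A quick sketch of the round-trip behavior these tests pin down, for manual
# poking at a REPL. It assumes cobe.tokenizers is importable as above; the
# outputs shown are inferred from the assertions in this module (whitespace
# is preserved as tokens so that join() can reassemble the original text):
#
#     >>> from cobe.tokenizers import CobeTokenizer
#     >>> tokenizer = CobeTokenizer()
#     >>> tokenizer.split(u"hi, cobe")
#     ['hi', ',', ' ', 'cobe']
#     >>> tokenizer.join(tokenizer.split(u"hi, cobe"))
#     'hi, cobe'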