Example 1
def setUp(self):
    self.tokenizer = CobeTokenizer()
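
This fragment shows only the test fixture. A minimal self-contained sketch of the surrounding module, assuming the import path cobe.tokenizers from the cobe project (the class and method names here are illustrative, not from the original file):

import unittest

from cobe.tokenizers import CobeTokenizer  # assumed import path


class CobeTokenizerTest(unittest.TestCase):  # illustrative name
    def setUp(self):
        # one fresh tokenizer per test method
        self.tokenizer = CobeTokenizer()

    def test_split_keeps_separators(self):
        # split() emits punctuation and spaces as their own tokens,
        # so join() rebuilds the normalized text by concatenation
        tokens = self.tokenizer.split(u"hi, cobe")
        self.assertEqual(tokens, ["hi", ",", " ", "cobe"])
        self.assertEqual(self.tokenizer.join(tokens), u"hi, cobe")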
Example 2
class testCobeTokenizer(unittest.TestCase):
    def setUp(self):
        self.tokenizer = CobeTokenizer()

    def testSplitEmpty(self):
        self.assertEqual(len(self.tokenizer.split("")), 0)

    def testSplitSentence(self):
        words = self.tokenizer.split("hi.")
        self.assertEqual(words, ["hi", "."])

    def testSplitComma(self):
        words = self.tokenizer.split("hi, cobe")
        self.assertEqual(words, ["hi", ",", " ", "cobe"])

    def testSplitDash(self):
        words = self.tokenizer.split("hi - cobe")
        self.assertEqual(words, ["hi", " ", "-", " ", "cobe"])

    def testSplitMultipleSpacesWithDash(self):
        words = self.tokenizer.split("hi  -  cobe")
        self.assertEqual(words, ["hi", " ", "-", " ", "cobe"])

    def testSplitLeadingDash(self):
        words = self.tokenizer.split("-foo")
        self.assertEqual(words, ["-foo"])

    def testSplitLeadingSpace(self):
        words = self.tokenizer.split(" foo")
        self.assertEqual(words, ["foo"])

        words = self.tokenizer.split("  foo")
        self.assertEqual(words, ["foo"])

    def testSplitTrailingSpace(self):
        words = self.tokenizer.split("foo ")
        self.assertEqual(words, ["foo"])

        words = self.tokenizer.split("foo  ")
        self.assertEqual(words, ["foo"])

    def testSplitSmiles(self):
        words = self.tokenizer.split(":)")
        self.assertEqual(words, [":)"])

        words = self.tokenizer.split(";)")
        self.assertEqual(words, [";)"])

        # not smiles
        words = self.tokenizer.split(":(")
        self.assertEqual(words, [":("])

        words = self.tokenizer.split(";(")
        self.assertEqual(words, [";("])

    def testSplitUrl(self):
        words = self.tokenizer.split("http://www.google.com/")
        self.assertEqual(words, ["http://www.google.com/"])

        words = self.tokenizer.split("https://www.google.com/")
        self.assertEqual(words, ["https://www.google.com/"])

        # odd protocols
        words = self.tokenizer.split("cobe://www.google.com/")
        self.assertEqual(words, ["cobe://www.google.com/"])

        words = self.tokenizer.split("cobe:www.google.com/")
        self.assertEqual(words, ["cobe:www.google.com/"])

        words = self.tokenizer.split(":foo")
        self.assertEqual(words, [":", "foo"])

    def testSplitMultipleSpaces(self):
        words = self.tokenizer.split("this is  a test")
        self.assertEqual(words, ["this", " ", "is", " ", "a", " ", "test"])

    def testSplitVerySadFrown(self):
        words = self.tokenizer.split("testing :    (")
        self.assertEqual(words, ["testing", " ", ":    ("])

        words = self.tokenizer.split("testing          :    (")
        self.assertEqual(words, ["testing", " ", ":    ("])

        words = self.tokenizer.split("testing          :    (  foo")
        self.assertEqual(words, ["testing", " ", ":    (", " ", "foo"])

    def testSplitHyphenatedWord(self):
        words = self.tokenizer.split("test-ing")
        self.assertEqual(words, ["test-ing"])

        words = self.tokenizer.split(":-)")
        self.assertEqual(words, [":-)"])

        words = self.tokenizer.split("test-ing :-) 1-2-3")
        self.assertEqual(words, ["test-ing", " ", ":-)", " ", "1-2-3"])

    def testSplitApostrophes(self):
        words = self.tokenizer.split("don't :'(")
        self.assertEqual(words, ["don't", " ", ":'("])

    def testSplitNonUnicode(self):
        # the plain str inputs above tokenize fine, so the input this
        # version rejects must be bytes
        self.assertRaises(TypeError, self.tokenizer.split, b"foo")

    def testJoin(self):
        self.assertEqual("foo bar baz",
                         self.tokenizer.join(["foo", " ", "bar", " ", "baz"]))
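
The class above is complete apart from its module preamble; with the imports from the sketch under Example 1 in place, the standard unittest entry point runs it directly:

if __name__ == "__main__":
    unittest.main()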
Example 3
class testCobeTokenizer(unittest.TestCase):
    def setUp(self):
        self.tokenizer = CobeTokenizer()

    def testSplitEmpty(self):
        self.assertEquals(len(self.tokenizer.split(u"")), 0)

    def testSplitSentence(self):
        words = self.tokenizer.split(u"hi.")
        self.assertEquals(words, ["hi", "."])

    def testSplitComma(self):
        words = self.tokenizer.split(u"hi, cobe")
        self.assertEquals(words, ["hi", ",", " ", "cobe"])

    def testSplitDash(self):
        words = self.tokenizer.split(u"hi - cobe")
        self.assertEquals(words, ["hi", " ", "-", " ", "cobe"])

    def testSplitMultipleSpacesWithDash(self):
        words = self.tokenizer.split(u"hi  -  cobe")
        self.assertEquals(words, ["hi", " ", "-", " ", "cobe"])

    def testSplitLeadingDash(self):
        words = self.tokenizer.split(u"-foo")
        self.assertEquals(words, ["-foo"])

    def testSplitSmiles(self):
        words = self.tokenizer.split(u":)")
        self.assertEquals(words, [":)"])

        words = self.tokenizer.split(u";)")
        self.assertEquals(words, [";)"])

        # not smiles
        words = self.tokenizer.split(u":(")
        self.assertEquals(words, [":("])

        words = self.tokenizer.split(u";(")
        self.assertEquals(words, [";("])

    def testSplitUrl(self):
        words = self.tokenizer.split(u"http://www.google.com/")
        self.assertEquals(words, ["http://www.google.com/"])

        words = self.tokenizer.split(u"https://www.google.com/")
        self.assertEquals(words, ["https://www.google.com/"])

        # odd protocols
        words = self.tokenizer.split(u"cobe://www.google.com/")
        self.assertEquals(words, ["cobe://www.google.com/"])

        words = self.tokenizer.split(u"cobe:www.google.com/")
        self.assertEquals(words, ["cobe:www.google.com/"])

        words = self.tokenizer.split(u":foo")
        self.assertEquals(words, [":", "foo"])

    def testSplitMultipleSpaces(self):
        words = self.tokenizer.split(u"this is  a test")
        self.assertEquals(words, ["this", " ", "is", " ", "a", " ", "test"])

    def testSplitVerySadFrown(self):
        words = self.tokenizer.split(u"testing :    (")
        self.assertEquals(words, ["testing", " ", ":    ("])

        words = self.tokenizer.split(u"testing          :    (")
        self.assertEquals(words, ["testing", " ", ":    ("])

        words = self.tokenizer.split(u"testing          :    (  ")
        self.assertEquals(words, ["testing", " ", ":    (", " "])

    def testSplitHyphenatedWord(self):
        words = self.tokenizer.split(u"test-ing")
        self.assertEquals(words, ["test-ing"])

        words = self.tokenizer.split(u":-)")
        self.assertEquals(words, [":-)"])

        words = self.tokenizer.split(u"test-ing :-) 1-2-3")
        self.assertEquals(words, ["test-ing", " ", ":-)", " ", "1-2-3"])

    def testSplitApostrophes(self):
        words = self.tokenizer.split(u"don't :'(")
        self.assertEquals(words, ["don't", " ", ":'("])

    def testSplitNonUnicode(self):
        self.assertRaises(TypeError, self.tokenizer.split, "foo")

    def testJoin(self):
        self.assertEquals("foo bar baz",
                          self.tokenizer.join(["foo", " ", "bar", " ", "baz"]))
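
This variant targets Python 2, where str and unicode are distinct types: the u"" literals keep every input unicode, and testSplitNonUnicode passes a plain byte string to trigger the TypeError. assertEquals is a legacy alias of assertEqual with identical behavior, deprecated in modern unittest. A minimal sketch of the str/unicode distinction, assuming the same import path as above and a Python 2 interpreter:

from cobe.tokenizers import CobeTokenizer  # assumed import path

tokenizer = CobeTokenizer()

# unicode input tokenizes normally
print(tokenizer.split(u"hi."))  # ["hi", "."]

# a Python 2 byte string is not text and is rejected
try:
    tokenizer.split("foo")
except TypeError:
    print("non-unicode input raises TypeError")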
Example 4
class TestCobeTokenizer(unittest.TestCase):
    def setUp(self):
        self.tokenizer = CobeTokenizer()

    def test_split_empty(self):
        self.assertEquals(len(self.tokenizer.split(u"")), 0)

    def test_split_sentence(self):
        words = self.tokenizer.split(u"hi.")
        self.assertEquals(words, ["hi", "."])

    def test_split_comma(self):
        words = self.tokenizer.split(u"hi, cobe")
        self.assertEquals(words, ["hi", ",", " ", "cobe"])

    def test_split_dash(self):
        words = self.tokenizer.split(u"hi - cobe")
        self.assertEquals(words, ["hi", " ", "-", " ", "cobe"])

    def test_split_multiple_spaces_with_dash(self):
        words = self.tokenizer.split(u"hi  -  cobe")
        self.assertEquals(words, ["hi", " ", "-", " ", "cobe"])

    def test_split_leading_dash(self):
        words = self.tokenizer.split(u"-foo")
        self.assertEquals(words, ["-foo"])

    def test_split_leading_space(self):
        words = self.tokenizer.split(u" foo")
        self.assertEquals(words, ["foo"])

        words = self.tokenizer.split(u"  foo")
        self.assertEquals(words, ["foo"])

    def test_split_trailing_space(self):
        words = self.tokenizer.split(u"foo ")
        self.assertEquals(words, ["foo"])

        words = self.tokenizer.split(u"foo  ")
        self.assertEquals(words, ["foo"])

    def test_split_smiles(self):
        words = self.tokenizer.split(u":)")
        self.assertEquals(words, [":)"])

        words = self.tokenizer.split(u";)")
        self.assertEquals(words, [";)"])

        # not smiles
        words = self.tokenizer.split(u":(")
        self.assertEquals(words, [":("])

        words = self.tokenizer.split(u";(")
        self.assertEquals(words, [";("])

    def test_split_url(self):
        words = self.tokenizer.split(u"http://www.google.com/")
        self.assertEquals(words, ["http://www.google.com/"])

        words = self.tokenizer.split(u"https://www.google.com/")
        self.assertEquals(words, ["https://www.google.com/"])

        # odd protocols
        words = self.tokenizer.split(u"cobe://www.google.com/")
        self.assertEquals(words, ["cobe://www.google.com/"])

        words = self.tokenizer.split(u"cobe:www.google.com/")
        self.assertEquals(words, ["cobe:www.google.com/"])

        words = self.tokenizer.split(u":foo")
        self.assertEquals(words, [":", "foo"])

    def test_split_multiple_spaces(self):
        words = self.tokenizer.split(u"this is  a test")
        self.assertEquals(words, ["this", " ", "is", " ", "a", " ", "test"])

    def test_split_very_sad_frown(self):
        words = self.tokenizer.split(u"testing :    (")
        self.assertEquals(words, ["testing", " ", ":    ("])

        words = self.tokenizer.split(u"testing          :    (")
        self.assertEquals(words, ["testing", " ", ":    ("])

        words = self.tokenizer.split(u"testing          :    (  foo")
        self.assertEquals(words, ["testing", " ", ":    (", " ", "foo"])

    def test_split_hyphenated_word(self):
        words = self.tokenizer.split(u"test-ing")
        self.assertEquals(words, ["test-ing"])

        words = self.tokenizer.split(u":-)")
        self.assertEquals(words, [":-)"])

        words = self.tokenizer.split(u"test-ing :-) 1-2-3")
        self.assertEquals(words, ["test-ing", " ", ":-)", " ", "1-2-3"])

    def test_split_apostrophes(self):
        words = self.tokenizer.split(u"don't :'(")
        self.assertEquals(words, ["don't", " ", ":'("])

    def test_split_non_unicode(self):
        self.assertRaises(TypeError, self.tokenizer.split, "foo")

    def test_join(self):
        self.assertEquals("foo bar baz",
                          self.tokenizer.join(["foo", " ", "bar", " ", "baz"]))
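
Condensing the behavior all of the suites above agree on into one runnable reference (same assumed import path):

from cobe.tokenizers import CobeTokenizer  # assumed import path

tokenizer = CobeTokenizer()

# URLs survive as single tokens, even under unusual schemes
assert tokenizer.split(u"cobe://www.google.com/") == [u"cobe://www.google.com/"]

# smileys stay whole; a bare ":" before a word does not
assert tokenizer.split(u":-)") == [u":-)"]
assert tokenizer.split(u":foo") == [u":", u"foo"]

# runs of spaces collapse to a single space token
assert tokenizer.split(u"this is  a test") == [u"this", u" ", u"is", u" ", u"a", u" ", u"test"]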