Code example #1
File: tests.py Project: gloob/pyenchant
 def test_bug2785373(self):
     """Testcases for bug #2785373"""
     input = "So, one dey when I wes 17, I left."
     for _ in tokenize_en(input):
         pass
     input = raw_unicode("So, one dey when I wes 17, I left.")
     for _ in tokenize_en(input):
         pass
Code example #2
def test_bug2785373():
    """Testcases for bug #2785373"""
    input = "So, one dey when I wes 17, I left."
    for _ in tokenize_en(input):
        pass
    input = "So, one dey when I wes 17, I left."
    for _ in tokenize_en(input):
        pass
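
Note: none of these snippets show their imports. They assume tokenize_en is pyenchant's English tokenizer, bound by an import along the lines of the sketch below (the import path and the sample sentence are illustrative assumptions, not taken from the projects above). Iterating the tokenizer yields (word, offset) tuples, which is the shape the expected-output lists in the later examples compare against.

from enchant.tokenize.en import tokenize as tokenize_en

for word, offset in tokenize_en("This sentence gets split into words."):
    # Each item is a (word, offset) pair; offset indexes into the input string.
    print(word, offset)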
Code example #3
File: tests.py Project: gloob/pyenchant
 def test_bug1591450(self):
     """Check for tokenization regressions identified in bug #1591450."""
     input = """Testing <i>markup</i> and {y:i}so-forth...leading dots and trail--- well, you get-the-point. Also check numbers: 999 1,000 12:00 .45. Done?"""
     output = [
         ("Testing", 0),
         ("i", 9),
         ("markup", 11),
         ("i", 19),
         ("and", 22),
         ("y", 27),
         ("i", 29),
         ("so", 31),
         ("forth", 34),
         ("leading", 42),
         ("dots", 50),
         ("and", 55),
         ("trail", 59),
         ("well", 68),
         ("you", 74),
         ("get", 78),
         ("the", 82),
         ("point", 86),
         ("Also", 93),
         ("check", 98),
         ("numbers", 104),
         ("Done", 134),
     ]
     for (itmO, itmV) in zip(output, tokenize_en(input)):
         self.assertEqual(itmO, itmV)
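
The offset in each pair points into the original input string, which is the invariant the unicode and UTF-8 examples below verify with startswith and slicing. A small stand-alone sketch of that property, reusing the tokenize_en import from the note above (check_offsets is a hypothetical helper, not part of the test suite):

def check_offsets(text):
    # Every yielded offset marks the word's first character in the input,
    # so slicing the input at that offset recovers the word itself.
    for word, offset in tokenize_en(text):
        assert text[offset:offset + len(word)] == word

check_offsets("Testing <i>markup</i> and so-forth... Done?")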
Code example #4
def test_bug1591450():
    """Check for tokenization regressions identified in bug #1591450."""
    input = """Testing <i>markup</i> and {y:i}so-forth...leading dots and trail--- well, you get-the-point. Also check numbers: 999 1,000 12:00 .45. Done?"""
    output = [
        ("Testing", 0),
        ("i", 9),
        ("markup", 11),
        ("i", 19),
        ("and", 22),
        ("y", 27),
        ("i", 29),
        ("so", 31),
        ("forth", 34),
        ("leading", 42),
        ("dots", 50),
        ("and", 55),
        ("trail", 59),
        ("well", 68),
        ("you", 74),
        ("get", 78),
        ("the", 82),
        ("point", 86),
        ("Also", 93),
        ("check", 98),
        ("numbers", 104),
        ("Done", 134),
    ]
    for (itmO, itmV) in zip(output, tokenize_en(input)):
        assert itmO == itmV
Code example #5
File: tests.py Project: CatCookie/DomainSearch
 def test_unicodeCombining(self):
     """Test tokenization with unicode combining symbols."""
     input = raw_unicode(r"Ik ben gei\u0308nteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet e\u0301e\u0301n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
     output = input.split(" ")
     output[8] = output[8][0:-1]
     for (itmO,itmV) in zip(output,tokenize_en(input)):
         self.assertEqual(itmO,itmV[0])
         self.assertTrue(input[itmV[1]:].startswith(itmO))
Code example #6
def test_unicodeBasic():
    """Test tokenization of a basic unicode string."""
    input = "Ik ben geïnteresseerd in de coördinatie van mijn knieën, maar kan niet één à twee enquêtes vinden die recht doet aan mijn carrière op Curaçao"
    output = input.split(" ")
    output[8] = output[8][0:-1]
    for (itmO, itmV) in zip(output, tokenize_en(input)):
        assert itmO == itmV[0]
        assert input[itmV[1]:].startswith(itmO)
Code example #7
File: tests.py Project: CatCookie/DomainSearch
 def test_unicodeBasic(self):
     """Test tokenization of a basic unicode string."""
     input = raw_unicode(r"Ik ben ge\u00EFnteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet \u00E9\u00E9n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
     output = input.split(" ")
     output[8] = output[8][0:-1]
     for (itmO,itmV) in zip(output,tokenize_en(input)):
         self.assertEqual(itmO,itmV[0])
         self.assertTrue(input[itmV[1]:].startswith(itmO))
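
Note: raw_unicode is a helper defined in pyenchant's own test code, not a standard-library function, and its definition is not shown in these snippets. A minimal stand-in with the behaviour the tests rely on (decoding backslash escape sequences in an ASCII-only string) could look like the sketch below; the upstream implementation may differ, particularly in how it handles Python 2.

def raw_unicode(raw):
    # Interpret backslash escapes such as r"\u00EF" or r"\xe4" and
    # return the decoded unicode text.
    return raw.encode("ascii").decode("unicode-escape")

assert raw_unicode(r"co\u00F6rdinatie") == "coördinatie"
assert raw_unicode(r"T\xe4m\xe4") == "Tämä"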
Code example #8
File: tests.py Project: CatCookie/DomainSearch
 def test_utf8_bytes_at_end(self):
     """Test tokenization of UTF8-encoded bytes at end of word."""
     # Python3 doesn't support bytestrings, don't run this test
     if str is unicode:
         return
     input = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
     output = input.split(" ")
     output[1] = output[1][0:-1]
     for (itmO,itmV) in zip(output,tokenize_en(input)):
         self.assertEqual(itmO,itmV[0])
Code example #9
File: tests.py Project: CatCookie/DomainSearch
    def test_finnish_text(self):
        """Test tokenizing some Finnish text.

        This really should work since there are no special rules to apply,
        just lots of non-ascii characters.
        """
        inputT = raw_unicode('T\\xe4m\\xe4 on kappale. Eip\\xe4 ole kovin 2 nen, mutta tarkoitus on n\\xe4ytt\\xe4\\xe4 miten sanastaja \\ntoimii useiden-erilaisten sanarypp\\xe4iden kimpussa.\\nPit\\xe4\\xe4p\\xe4 viel\\xe4 \'tarkistaa\' sanat jotka "lainausmerkeiss\\xe4". Heittomerkki ja vaa\'an.\\nUlkomaisia sanoja s\\xfcss, spa\\xdf.')
        outputT = [
            (raw_unicode('T\\xe4m\\xe4'),0), (raw_unicode('on'),5), (raw_unicode('kappale'),8),
            (raw_unicode('Eip\\xe4'),17), (raw_unicode('ole'),22), (raw_unicode('kovin'),26),
            (raw_unicode('nen'),34), (raw_unicode('mutta'),39), (raw_unicode('tarkoitus'),45),
            (raw_unicode('on'),55), (raw_unicode('n\\xe4ytt\\xe4\\xe4'),58), (raw_unicode('miten'),66),
            (raw_unicode('sanastaja'),72), (raw_unicode('toimii'),83), (raw_unicode('useiden'),90),
            (raw_unicode('erilaisten'),98), (raw_unicode('sanarypp\\xe4iden'),109), (raw_unicode('kimpussa'),123),
            (raw_unicode('Pit\\xe4\\xe4p\\xe4'),133), (raw_unicode('viel\\xe4'),141), (raw_unicode('tarkistaa'),148),
            (raw_unicode('sanat'),159), (raw_unicode('jotka'),165), (raw_unicode('lainausmerkeiss\\xe4'),172),
            (raw_unicode('Heittomerkki'),191), (raw_unicode('ja'),204), (raw_unicode("vaa'an"),207),
            (raw_unicode('Ulkomaisia'),215), (raw_unicode('sanoja'),226), (raw_unicode('s\\xfcss'),233),
            (raw_unicode('spa\\xdf'),239),
        ]
        for (itmO,itmV) in zip(outputT,tokenize_en(inputT)):
            self.assertEqual(itmO,itmV)
Code example #10
File: tests.py Project: CatCookie/DomainSearch
 def test_utf8_bytes(self):
     """Test tokenization of UTF8-encoded bytes (bug #2500184)."""
     # Python3 doesn't support bytestrings, don't run this test
     if str is unicode:
         return
     input = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
     output = input.split(" ")
     output[1] = output[1][0:-1]
     for (itmO,itmV) in zip(output,tokenize_en(input)):
         self.assertEqual(itmO,itmV[0])
         self.assertTrue(input[itmV[1]:].startswith(itmO))
Code example #11
File: tests.py Project: CatCookie/DomainSearch
 def test_utf8_bytes_in_an_array(self):
     """Test tokenization of UTF8-encoded bytes stored in an array."""
     # Python3 doesn't support bytestrings, don't run this test
     if str is unicode:
         return
     input = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
     output = input.split(" ")
     output[1] = output[1][0:-1]
     input = array.array('c',input)
     output = [array.array('c',w) for w in output]
     for (itmO,itmV) in zip(output,tokenize_en(array.array('c',input))):
         self.assertEqual(itmO,itmV[0])
         self.assertEqual(input[itmV[1]:itmV[1]+len(itmV[0])],itmO)
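
Note: the "if str is unicode: return" guard in the UTF-8 byte tests is a Python 2/3 switch. It assumes the test module aliases unicode to str on Python 3, so on Python 3 the test returns immediately; byte-string input and the 'c' array typecode used above only exist on Python 2. A pytest-style port would express the same skip roughly as follows (the decorator and the stubbed test body are illustrative, not taken from the projects above):

import sys

import pytest

@pytest.mark.skipif(sys.version_info >= (3,),
                    reason="bytestring tokenization requires Python 2 ('c' arrays)")
def test_utf8_bytes_in_an_array():
    """Test tokenization of UTF8-encoded bytes stored in an array."""
    ...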
Code example #12
File: tests.py Project: bopopescu/DomainSearch
    def test_tokenize_en(self):
        """Simple regression test for English tokenization."""
        input = """This is a paragraph.  It's not very special, but it's designed
2 show how the splitter works with many-different combos
of words. Also need to "test" the handling of 'quoted' words."""
        output = [("This", 0), ("is", 5), ("a", 8), ("paragraph", 10),
                  ("It's", 22), ("not", 27), ("very", 31), ("special", 36),
                  ("but", 45), ("it's", 49), ("designed", 54), ("show", 65),
                  ("how", 70), ("the", 74), ("splitter", 78), ("works", 87),
                  ("with", 93), ("many", 98), ("different", 103),
                  ("combos", 113), ("of", 120), ("words", 123), ("Also", 130),
                  ("need", 135), ("to", 140), ("test", 144), ("the", 150),
                  ("handling", 154), ("of", 163), ("quoted", 167),
                  ("words", 175)]
        for (itmO, itmV) in zip(output, tokenize_en(input)):
            self.assertEqual(itmO, itmV)
Code example #13
def test_finnish_text():
    """Test tokenizing some Finnish text.

    This really should work since there are no special rules to apply,
    just lots of non-ascii characters.
    """
    text = textwrap.dedent("""\
        Tämä on kappale. Eipä ole kovin 2 nen, mutta tarkoitus on näyttää miten sanastaja
         toimii useiden-erilaisten sanaryppäiden kimpussa.
        Pitääpä vielä 'tarkistaa' sanat jotka "lainausmerkeissä". Heittomerkki ja vaa'an.
        Ulkomaisia sanoja süss, spaß.
    """)
    expected_tokens = [
        ("Tämä", 0),
        ("on", 5),
        ("kappale", 8),
        ("Eipä", 17),
        ("ole", 22),
        ("kovin", 26),
        ("nen", 34),
        ("mutta", 39),
        ("tarkoitus", 45),
        ("on", 55),
        ("näyttää", 58),
        ("miten", 66),
        ("sanastaja", 72),
        ("toimii", 83),
        ("useiden", 90),
        ("erilaisten", 98),
        ("sanaryppäiden", 109),
        ("kimpussa", 123),
        ("Pitääpä", 133),
        ("vielä", 141),
        ("tarkistaa", 148),
        ("sanat", 159),
        ("jotka", 165),
        ("lainausmerkeissä", 172),
        ("Heittomerkki", 191),
        ("ja", 204),
        ("vaa'an", 207),
        ("Ulkomaisia", 215),
        ("sanoja", 226),
        ("süss", 233),
        ("spaß", 239),
    ]
    assert list(tokenize_en(text)) == expected_tokens
Code example #14
File: tests.py Project: CatCookie/DomainSearch
    def test_tokenize_en(self):
        """Simple regression test for English tokenization."""
        input = """This is a paragraph.  It's not very special, but it's designed
2 show how the splitter works with many-different combos
of words. Also need to "test" the handling of 'quoted' words."""
        output = [
                  ("This",0),("is",5),("a",8),("paragraph",10),("It's",22),
                  ("not",27),("very",31),("special",36),("but",45),("it's",49),
                  ("designed",54),("show",65),("how",70),("the",74),
                  ("splitter",78),("works",87),("with",93),("many",98),
                  ("different",103),("combos",113),("of",120),("words",123),
                  ("Also",130),("need",135),
                  ("to",140),("test",144),("the",150),("handling",154),
                  ("of",163),("quoted",167),("words",175)
                 ]
        for (itmO,itmV) in zip(output,tokenize_en(input)):
            self.assertEqual(itmO,itmV)
Code example #15
File: tests.py Project: gloob/pyenchant
    def test_finnish_text(self):
        """Test tokenizing some Finnish text.

        This really should work since there are no special rules to apply,
        just lots of non-ascii characters.
        """
        inputT = raw_unicode(
            "T\\xe4m\\xe4 on kappale. Eip\\xe4 ole kovin 2 nen, mutta tarkoitus on n\\xe4ytt\\xe4\\xe4 miten sanastaja \\ntoimii useiden-erilaisten sanarypp\\xe4iden kimpussa.\\nPit\\xe4\\xe4p\\xe4 viel\\xe4 'tarkistaa' sanat jotka \"lainausmerkeiss\\xe4\". Heittomerkki ja vaa'an.\\nUlkomaisia sanoja s\\xfcss, spa\\xdf."
        )
        outputT = [
            (raw_unicode("T\\xe4m\\xe4"), 0),
            (raw_unicode("on"), 5),
            (raw_unicode("kappale"), 8),
            (raw_unicode("Eip\\xe4"), 17),
            (raw_unicode("ole"), 22),
            (raw_unicode("kovin"), 26),
            (raw_unicode("nen"), 34),
            (raw_unicode("mutta"), 39),
            (raw_unicode("tarkoitus"), 45),
            (raw_unicode("on"), 55),
            (raw_unicode("n\\xe4ytt\\xe4\\xe4"), 58),
            (raw_unicode("miten"), 66),
            (raw_unicode("sanastaja"), 72),
            (raw_unicode("toimii"), 83),
            (raw_unicode("useiden"), 90),
            (raw_unicode("erilaisten"), 98),
            (raw_unicode("sanarypp\\xe4iden"), 109),
            (raw_unicode("kimpussa"), 123),
            (raw_unicode("Pit\\xe4\\xe4p\\xe4"), 133),
            (raw_unicode("viel\\xe4"), 141),
            (raw_unicode("tarkistaa"), 148),
            (raw_unicode("sanat"), 159),
            (raw_unicode("jotka"), 165),
            (raw_unicode("lainausmerkeiss\\xe4"), 172),
            (raw_unicode("Heittomerkki"), 191),
            (raw_unicode("ja"), 204),
            (raw_unicode("vaa'an"), 207),
            (raw_unicode("Ulkomaisia"), 215),
            (raw_unicode("sanoja"), 226),
            (raw_unicode("s\\xfcss"), 233),
            (raw_unicode("spa\\xdf"), 239),
        ]
        for (itmO, itmV) in zip(outputT, tokenize_en(inputT)):
            self.assertEqual(itmO, itmV)
Code example #16
File: test_tokenize.py Project: godays/pyenchantRU
def test_typographic_apostrophe():
    """"Typographic apostrophes should be word separators in English."""
    text = "They\u2019re here"
    expected_tokens = [("They", 0), ("re", 5), ("here", 8)]
    assert list(tokenize_en(text)) == expected_tokens
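
If the behaviour shown here and in the test_tokenize_en examples above both holds, the two apostrophe forms tokenize differently: the ASCII apostrophe stays inside the word ("It's" appears as a single token in the test_tokenize_en expected output), while the typographic apostrophe U+2019 splits it. A quick illustrative check, not taken from any of the projects above:

assert [w for w, _ in tokenize_en("it's here")] == ["it's", "here"]
assert [w for w, _ in tokenize_en("it\u2019s here")] == ["it", "s", "here"]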