Beispiel #1
0
    def test_ngram(self):
        """Check the n-grams reported by ``NGram.get`` as characters are added.

        Characters are pushed one at a time with ``add_char``; after each
        push ``get(1)``, ``get(2)`` and ``get(3)`` are compared with the
        expected window. The expected values show that characters are read
        back in a normalized form (for example U+06CC comes back as U+064A)
        and that ``get`` returns None when no n-gram of the requested size
        is available.
        """
        ngram = NGram()

        # Fresh instance: no size yields an n-gram, including the sizes
        # 0 and 4 that lie outside the 1..3 range used below.
        for size in range(5):
            self.assertIsNone(ngram.get(size))

        # A lone blank produces no n-gram of any size.
        ngram.add_char(' ')
        self.assertIsNone(ngram.get(1))
        self.assertIsNone(ngram.get(2))
        self.assertIsNone(ngram.get(3))

        # First letter: the bigram includes the preceding blank, and there
        # is not yet enough context for a trigram.
        ngram.add_char('A')
        self.assertEqual(ngram.get(1), 'A')
        self.assertEqual(ngram.get(2), ' A')
        self.assertIsNone(ngram.get(3))

        # U+06CC is expected back as U+064A.
        ngram.add_char(six.u('\u06cc'))
        self.assertEqual(ngram.get(1), six.u('\u064a'))
        self.assertEqual(ngram.get(2), six.u('A\u064a'))
        self.assertEqual(ngram.get(3), six.u(' A\u064a'))

        # U+1EA0 is expected back as U+1EC3.
        ngram.add_char(six.u('\u1ea0'))
        self.assertEqual(ngram.get(1), six.u('\u1ec3'))
        self.assertEqual(ngram.get(2), six.u('\u064a\u1ec3'))
        self.assertEqual(ngram.get(3), six.u('A\u064a\u1ec3'))

        # U+3044 is expected back as U+3042.
        ngram.add_char(six.u('\u3044'))
        self.assertEqual(ngram.get(1), six.u('\u3042'))
        self.assertEqual(ngram.get(2), six.u('\u1ec3\u3042'))
        self.assertEqual(ngram.get(3), six.u('\u064a\u1ec3\u3042'))

        # U+30A4 is expected back as U+30A2.
        ngram.add_char(six.u('\u30a4'))
        self.assertEqual(ngram.get(1), six.u('\u30a2'))
        self.assertEqual(ngram.get(2), six.u('\u3042\u30a2'))
        self.assertEqual(ngram.get(3), six.u('\u1ec3\u3042\u30a2'))

        # U+3106 is expected back as U+3105.
        ngram.add_char(six.u('\u3106'))
        self.assertEqual(ngram.get(1), six.u('\u3105'))
        self.assertEqual(ngram.get(2), six.u('\u30a2\u3105'))
        self.assertEqual(ngram.get(3), six.u('\u3042\u30a2\u3105'))

        # U+AC01 is expected back as U+AC00.
        ngram.add_char(six.u('\uac01'))
        self.assertEqual(ngram.get(1), six.u('\uac00'))
        self.assertEqual(ngram.get(2), six.u('\u3105\uac00'))
        self.assertEqual(ngram.get(3), six.u('\u30a2\u3105\uac00'))

        # U+2010 is expected to act like a blank: there is no unigram and
        # the longer n-grams end with ' '.
        ngram.add_char(six.u('\u2010'))
        self.assertIsNone(ngram.get(1))
        self.assertEqual(ngram.get(2), six.u('\uac00 '))
        self.assertEqual(ngram.get(3), six.u('\u3105\uac00 '))

        # A letter right after that blank: the expected trigram is None,
        # i.e. the window does not reach back across the blank.
        ngram.add_char('a')
        self.assertEqual(ngram.get(1), 'a')
        self.assertEqual(ngram.get(2), ' a')
        self.assertIsNone(ngram.get(3))
Beispiel #2
0
 def test_normalize_with_cjk_kanji(self):
     """Check ``NGram.normalize`` on a sample of CJK ideographs.

     Each case pairs an input character with the character the test
     expects back. Most inputs are expected unchanged; U+4E03 is expected
     to become U+4E01, and U+4E24 / U+4E25 are expected to become U+4E13.
     """
     # (input, expected) pairs, checked in the listed order.
     cases = (
         (six.u('\u4E00'), six.u('\u4E00')),
         (six.u('\u4E01'), six.u('\u4E01')),
         (six.u('\u4E02'), six.u('\u4E02')),
         (six.u('\u4E03'), six.u('\u4E01')),
         (six.u('\u4E04'), six.u('\u4E04')),
         (six.u('\u4E05'), six.u('\u4E05')),
         (six.u('\u4E06'), six.u('\u4E06')),
         (six.u('\u4E07'), six.u('\u4E07')),
         (six.u('\u4E08'), six.u('\u4E08')),
         (six.u('\u4E09'), six.u('\u4E09')),
         (six.u('\u4E10'), six.u('\u4E10')),
         (six.u('\u4E11'), six.u('\u4E11')),
         (six.u('\u4E12'), six.u('\u4E12')),
         (six.u('\u4E13'), six.u('\u4E13')),
         (six.u('\u4E14'), six.u('\u4E14')),
         (six.u('\u4E15'), six.u('\u4E15')),
         (six.u('\u4E1e'), six.u('\u4E1e')),
         (six.u('\u4E1f'), six.u('\u4E1f')),
         (six.u('\u4E20'), six.u('\u4E20')),
         (six.u('\u4E21'), six.u('\u4E21')),
         (six.u('\u4E22'), six.u('\u4E22')),
         (six.u('\u4E23'), six.u('\u4E23')),
         (six.u('\u4E24'), six.u('\u4E13')),
         (six.u('\u4E25'), six.u('\u4E13')),
         (six.u('\u4E30'), six.u('\u4E30')),
     )
     for raw, expected in cases:
         self.assertEqual(NGram.normalize(raw), expected)
Beispiel #3
0
 def test_normalize_for_romanian(self):
     """Check ``NGram.normalize`` on the Romanian s/t variants.

     U+015F and U+0163 are expected unchanged, while U+0219 and U+021B
     are expected to come back as U+015F and U+0163 respectively.
     """
     # (input, expected) pairs, checked in the listed order.
     cases = (
         (six.u('\u015f'), six.u('\u015f')),
         (six.u('\u0163'), six.u('\u0163')),
         (six.u('\u0219'), six.u('\u015f')),
         (six.u('\u021b'), six.u('\u0163')),
     )
     for raw, expected in cases:
         self.assertEqual(NGram.normalize(raw), expected)
Beispiel #4
0
    def test_ngram3(self):
        """Check ``NGram.get`` around a digit placed between two letters.

        The expected values show the digit '1' being handled like a blank:
        after it there is no unigram and the longer n-grams end with ' ',
        and the letter that follows has no trigram available.
        """
        ngram = NGram()

        # First letter on a fresh instance: the bigram is expected to carry
        # a leading blank, and no trigram is available yet.
        ngram.add_char('A')
        self.assertEqual(ngram.get(1), 'A')
        self.assertEqual(ngram.get(2), ' A')
        self.assertIsNone(ngram.get(3))

        # The digit is expected to show up as ' ' in the window, so there
        # is no unigram for it.
        ngram.add_char('1')
        self.assertIsNone(ngram.get(1))
        self.assertEqual(ngram.get(2), 'A ')
        self.assertEqual(ngram.get(3), ' A ')

        # The next letter: the expected trigram is None, i.e. the window
        # does not reach back across the blank.
        ngram.add_char('B')
        self.assertEqual(ngram.get(1), 'B')
        self.assertEqual(ngram.get(2), ' B')
        self.assertIsNone(ngram.get(3))
Beispiel #5
0
    def test_normalize_vietnamese(self):
        """Check ``NGram.normalize_vi`` on base-letter + tone-mark pairs.

        Every case pairs an input string with the string the test expects
        back. Plain text and an already-composed letter are expected
        unchanged; each two-code-point sequence (a base letter followed by
        one of U+0300, U+0301, U+0303, U+0309 or U+0323) is expected to
        come back as a single precomposed character.
        """
        # (input, expected) pairs, checked in the listed order.
        cases = (
            # Inputs expected to pass through unchanged.
            (six.u(''), ''),
            (six.u('ABC'), 'ABC'),
            (six.u('012'), '012'),
            (six.u('\u00c0'), six.u('\u00c0')),

            # Base letter + U+0300 (combining grave accent).
            (six.u('\u0041\u0300'), six.u('\u00C0')),
            (six.u('\u0045\u0300'), six.u('\u00C8')),
            (six.u('\u0049\u0300'), six.u('\u00CC')),
            (six.u('\u004F\u0300'), six.u('\u00D2')),
            (six.u('\u0055\u0300'), six.u('\u00D9')),
            (six.u('\u0059\u0300'), six.u('\u1EF2')),
            (six.u('\u0061\u0300'), six.u('\u00E0')),
            (six.u('\u0065\u0300'), six.u('\u00E8')),
            (six.u('\u0069\u0300'), six.u('\u00EC')),
            (six.u('\u006F\u0300'), six.u('\u00F2')),
            (six.u('\u0075\u0300'), six.u('\u00F9')),
            (six.u('\u0079\u0300'), six.u('\u1EF3')),
            (six.u('\u00C2\u0300'), six.u('\u1EA6')),
            (six.u('\u00CA\u0300'), six.u('\u1EC0')),
            (six.u('\u00D4\u0300'), six.u('\u1ED2')),
            (six.u('\u00E2\u0300'), six.u('\u1EA7')),
            (six.u('\u00EA\u0300'), six.u('\u1EC1')),
            (six.u('\u00F4\u0300'), six.u('\u1ED3')),
            (six.u('\u0102\u0300'), six.u('\u1EB0')),
            (six.u('\u0103\u0300'), six.u('\u1EB1')),
            (six.u('\u01A0\u0300'), six.u('\u1EDC')),
            (six.u('\u01A1\u0300'), six.u('\u1EDD')),
            (six.u('\u01AF\u0300'), six.u('\u1EEA')),
            (six.u('\u01B0\u0300'), six.u('\u1EEB')),

            # Base letter + U+0301 (combining acute accent).
            (six.u('\u0041\u0301'), six.u('\u00C1')),
            (six.u('\u0045\u0301'), six.u('\u00C9')),
            (six.u('\u0049\u0301'), six.u('\u00CD')),
            (six.u('\u004F\u0301'), six.u('\u00D3')),
            (six.u('\u0055\u0301'), six.u('\u00DA')),
            (six.u('\u0059\u0301'), six.u('\u00DD')),
            (six.u('\u0061\u0301'), six.u('\u00E1')),
            (six.u('\u0065\u0301'), six.u('\u00E9')),
            (six.u('\u0069\u0301'), six.u('\u00ED')),
            (six.u('\u006F\u0301'), six.u('\u00F3')),
            (six.u('\u0075\u0301'), six.u('\u00FA')),
            (six.u('\u0079\u0301'), six.u('\u00FD')),
            (six.u('\u00C2\u0301'), six.u('\u1EA4')),
            (six.u('\u00CA\u0301'), six.u('\u1EBE')),
            (six.u('\u00D4\u0301'), six.u('\u1ED0')),
            (six.u('\u00E2\u0301'), six.u('\u1EA5')),
            (six.u('\u00EA\u0301'), six.u('\u1EBF')),
            (six.u('\u00F4\u0301'), six.u('\u1ED1')),
            (six.u('\u0102\u0301'), six.u('\u1EAE')),
            (six.u('\u0103\u0301'), six.u('\u1EAF')),
            (six.u('\u01A0\u0301'), six.u('\u1EDA')),
            (six.u('\u01A1\u0301'), six.u('\u1EDB')),
            (six.u('\u01AF\u0301'), six.u('\u1EE8')),
            (six.u('\u01B0\u0301'), six.u('\u1EE9')),

            # Base letter + U+0303 (combining tilde).
            (six.u('\u0041\u0303'), six.u('\u00C3')),
            (six.u('\u0045\u0303'), six.u('\u1EBC')),
            (six.u('\u0049\u0303'), six.u('\u0128')),
            (six.u('\u004F\u0303'), six.u('\u00D5')),
            (six.u('\u0055\u0303'), six.u('\u0168')),
            (six.u('\u0059\u0303'), six.u('\u1EF8')),
            (six.u('\u0061\u0303'), six.u('\u00E3')),
            (six.u('\u0065\u0303'), six.u('\u1EBD')),
            (six.u('\u0069\u0303'), six.u('\u0129')),
            (six.u('\u006F\u0303'), six.u('\u00F5')),
            (six.u('\u0075\u0303'), six.u('\u0169')),
            (six.u('\u0079\u0303'), six.u('\u1EF9')),
            (six.u('\u00C2\u0303'), six.u('\u1EAA')),
            (six.u('\u00CA\u0303'), six.u('\u1EC4')),
            (six.u('\u00D4\u0303'), six.u('\u1ED6')),
            (six.u('\u00E2\u0303'), six.u('\u1EAB')),
            (six.u('\u00EA\u0303'), six.u('\u1EC5')),
            (six.u('\u00F4\u0303'), six.u('\u1ED7')),
            (six.u('\u0102\u0303'), six.u('\u1EB4')),
            (six.u('\u0103\u0303'), six.u('\u1EB5')),
            (six.u('\u01A0\u0303'), six.u('\u1EE0')),
            (six.u('\u01A1\u0303'), six.u('\u1EE1')),
            (six.u('\u01AF\u0303'), six.u('\u1EEE')),
            (six.u('\u01B0\u0303'), six.u('\u1EEF')),

            # Base letter + U+0309 (combining hook above).
            (six.u('\u0041\u0309'), six.u('\u1EA2')),
            (six.u('\u0045\u0309'), six.u('\u1EBA')),
            (six.u('\u0049\u0309'), six.u('\u1EC8')),
            (six.u('\u004F\u0309'), six.u('\u1ECE')),
            (six.u('\u0055\u0309'), six.u('\u1EE6')),
            (six.u('\u0059\u0309'), six.u('\u1EF6')),
            (six.u('\u0061\u0309'), six.u('\u1EA3')),
            (six.u('\u0065\u0309'), six.u('\u1EBB')),
            (six.u('\u0069\u0309'), six.u('\u1EC9')),
            (six.u('\u006F\u0309'), six.u('\u1ECF')),
            (six.u('\u0075\u0309'), six.u('\u1EE7')),
            (six.u('\u0079\u0309'), six.u('\u1EF7')),
            (six.u('\u00C2\u0309'), six.u('\u1EA8')),
            (six.u('\u00CA\u0309'), six.u('\u1EC2')),
            (six.u('\u00D4\u0309'), six.u('\u1ED4')),
            (six.u('\u00E2\u0309'), six.u('\u1EA9')),
            (six.u('\u00EA\u0309'), six.u('\u1EC3')),
            (six.u('\u00F4\u0309'), six.u('\u1ED5')),
            (six.u('\u0102\u0309'), six.u('\u1EB2')),
            (six.u('\u0103\u0309'), six.u('\u1EB3')),
            (six.u('\u01A0\u0309'), six.u('\u1EDE')),
            (six.u('\u01A1\u0309'), six.u('\u1EDF')),
            (six.u('\u01AF\u0309'), six.u('\u1EEC')),
            (six.u('\u01B0\u0309'), six.u('\u1EED')),

            # Base letter + U+0323 (combining dot below).
            (six.u('\u0041\u0323'), six.u('\u1EA0')),
            (six.u('\u0045\u0323'), six.u('\u1EB8')),
            (six.u('\u0049\u0323'), six.u('\u1ECA')),
            (six.u('\u004F\u0323'), six.u('\u1ECC')),
            (six.u('\u0055\u0323'), six.u('\u1EE4')),
            (six.u('\u0059\u0323'), six.u('\u1EF4')),
            (six.u('\u0061\u0323'), six.u('\u1EA1')),
            (six.u('\u0065\u0323'), six.u('\u1EB9')),
            (six.u('\u0069\u0323'), six.u('\u1ECB')),
            (six.u('\u006F\u0323'), six.u('\u1ECD')),
            (six.u('\u0075\u0323'), six.u('\u1EE5')),
            (six.u('\u0079\u0323'), six.u('\u1EF5')),
            (six.u('\u00C2\u0323'), six.u('\u1EAC')),
            (six.u('\u00CA\u0323'), six.u('\u1EC6')),
            (six.u('\u00D4\u0323'), six.u('\u1ED8')),
            (six.u('\u00E2\u0323'), six.u('\u1EAD')),
            (six.u('\u00EA\u0323'), six.u('\u1EC7')),
            (six.u('\u00F4\u0323'), six.u('\u1ED9')),
            (six.u('\u0102\u0323'), six.u('\u1EB6')),
            (six.u('\u0103\u0323'), six.u('\u1EB7')),
            (six.u('\u01A0\u0323'), six.u('\u1EE2')),
            (six.u('\u01A1\u0323'), six.u('\u1EE3')),
            (six.u('\u01AF\u0323'), six.u('\u1EF0')),
            (six.u('\u01B0\u0323'), six.u('\u1EF1')),
        )
        for raw, expected in cases:
            self.assertEqual(NGram.normalize_vi(raw), expected)
Beispiel #6
0
    def test_ngram(self):
        """Walk an NGram through mixed-script input and verify ``get(n)``.

        One character is added per step with ``add_char``, and the 1-, 2-
        and 3-grams returned by ``get`` are compared with the expected
        window. The expectations encode that characters are read back in a
        normalized form (e.g. U+3044 comes back as U+3042) and that ``get``
        returns None whenever no n-gram of the requested size exists.
        """
        ngram = NGram()

        # Nothing added yet: every size, including 0 and 4, gives None.
        for size in range(5):
            self.assertIsNone(ngram.get(size))

        # A single blank alone yields no n-gram.
        ngram.add_char(' ')
        self.assertIsNone(ngram.get(1))
        self.assertIsNone(ngram.get(2))
        self.assertIsNone(ngram.get(3))

        # First letter: the bigram contains the blank before it; there is
        # no trigram yet.
        ngram.add_char('A')
        self.assertEqual(ngram.get(1), 'A')
        self.assertEqual(ngram.get(2), ' A')
        self.assertIsNone(ngram.get(3))

        # U+06CC is expected back as U+064A.
        ngram.add_char(six.u('\u06cc'))
        self.assertEqual(ngram.get(1), six.u('\u064a'))
        self.assertEqual(ngram.get(2), six.u('A\u064a'))
        self.assertEqual(ngram.get(3), six.u(' A\u064a'))

        # U+1EA0 is expected back as U+1EC3.
        ngram.add_char(six.u('\u1ea0'))
        self.assertEqual(ngram.get(1), six.u('\u1ec3'))
        self.assertEqual(ngram.get(2), six.u('\u064a\u1ec3'))
        self.assertEqual(ngram.get(3), six.u('A\u064a\u1ec3'))

        # U+3044 is expected back as U+3042.
        ngram.add_char(six.u('\u3044'))
        self.assertEqual(ngram.get(1), six.u('\u3042'))
        self.assertEqual(ngram.get(2), six.u('\u1ec3\u3042'))
        self.assertEqual(ngram.get(3), six.u('\u064a\u1ec3\u3042'))

        # U+30A4 is expected back as U+30A2.
        ngram.add_char(six.u('\u30a4'))
        self.assertEqual(ngram.get(1), six.u('\u30a2'))
        self.assertEqual(ngram.get(2), six.u('\u3042\u30a2'))
        self.assertEqual(ngram.get(3), six.u('\u1ec3\u3042\u30a2'))

        # U+3106 is expected back as U+3105.
        ngram.add_char(six.u('\u3106'))
        self.assertEqual(ngram.get(1), six.u('\u3105'))
        self.assertEqual(ngram.get(2), six.u('\u30a2\u3105'))
        self.assertEqual(ngram.get(3), six.u('\u3042\u30a2\u3105'))

        # U+AC01 is expected back as U+AC00.
        ngram.add_char(six.u('\uac01'))
        self.assertEqual(ngram.get(1), six.u('\uac00'))
        self.assertEqual(ngram.get(2), six.u('\u3105\uac00'))
        self.assertEqual(ngram.get(3), six.u('\u30a2\u3105\uac00'))

        # U+2010 is expected to behave like a blank: no unigram, and the
        # bigram/trigram end with ' '.
        ngram.add_char(six.u('\u2010'))
        self.assertIsNone(ngram.get(1))
        self.assertEqual(ngram.get(2), six.u('\uac00 '))
        self.assertEqual(ngram.get(3), six.u('\u3105\uac00 '))

        # Letter following that blank: no trigram is expected, so the
        # window does not extend back past the blank.
        ngram.add_char('a')
        self.assertEqual(ngram.get(1), 'a')
        self.assertEqual(ngram.get(2), ' a')
        self.assertIsNone(ngram.get(3))
Beispiel #7
0
 def test_normalize_with_latin(self):
     """Check ``NGram.normalize`` on code points U+0000 through U+00A1.

     The expected values show the letters A-Z and a-z coming back
     unchanged, the other sampled code points up to U+007F and U+00A0
     coming back as a blank, and U+0080 and U+00A1 coming back unchanged.
     """
     blank = ' '
     # (input, expected) pairs, checked in the listed order.
     cases = (
         (six.u('\u0000'), blank),
         (six.u('\u0009'), blank),
         (six.u('\u0020'), blank),
         (six.u('\u0030'), blank),
         (six.u('\u0040'), blank),
         (six.u('\u0041'), six.u('\u0041')),
         (six.u('\u005a'), six.u('\u005a')),
         (six.u('\u005b'), blank),
         (six.u('\u0060'), blank),
         (six.u('\u0061'), six.u('\u0061')),
         (six.u('\u007a'), six.u('\u007a')),
         (six.u('\u007b'), blank),
         (six.u('\u007f'), blank),
         (six.u('\u0080'), six.u('\u0080')),
         (six.u('\u00a0'), blank),
         (six.u('\u00a1'), six.u('\u00a1')),
     )
     for raw, expected in cases:
         self.assertEqual(NGram.normalize(raw), expected)
Beispiel #8
0
 def test_normalize_for_romanian(self):
     """Verify how ``NGram.normalize`` treats Romanian s/t code points.

     The expectations are: U+015F and U+0163 come back as themselves,
     U+0219 comes back as U+015F, and U+021B comes back as U+0163.
     """
     # Each entry is (input character, expected result), in check order.
     expectations = [
         (six.u('\u015f'), six.u('\u015f')),
         (six.u('\u0163'), six.u('\u0163')),
         (six.u('\u0219'), six.u('\u015f')),
         (six.u('\u021b'), six.u('\u0163')),
     ]
     for char, wanted in expectations:
         self.assertEqual(NGram.normalize(char), wanted)
Beispiel #9
0
 def test_normalize_with_cjk_kanji(self):
     """Verify ``NGram.normalize`` on ideographs in the U+4E00..U+4E30 range.

     Most sampled characters are expected to come back as themselves.
     The exceptions encoded here are U+4E03 (expected U+4E01) and
     U+4E24 / U+4E25 (both expected U+4E13).
     """
     # Each entry is (input character, expected result), in check order.
     expectations = [
         (six.u('\u4E00'), six.u('\u4E00')),
         (six.u('\u4E01'), six.u('\u4E01')),
         (six.u('\u4E02'), six.u('\u4E02')),
         (six.u('\u4E03'), six.u('\u4E01')),
         (six.u('\u4E04'), six.u('\u4E04')),
         (six.u('\u4E05'), six.u('\u4E05')),
         (six.u('\u4E06'), six.u('\u4E06')),
         (six.u('\u4E07'), six.u('\u4E07')),
         (six.u('\u4E08'), six.u('\u4E08')),
         (six.u('\u4E09'), six.u('\u4E09')),
         (six.u('\u4E10'), six.u('\u4E10')),
         (six.u('\u4E11'), six.u('\u4E11')),
         (six.u('\u4E12'), six.u('\u4E12')),
         (six.u('\u4E13'), six.u('\u4E13')),
         (six.u('\u4E14'), six.u('\u4E14')),
         (six.u('\u4E15'), six.u('\u4E15')),
         (six.u('\u4E1e'), six.u('\u4E1e')),
         (six.u('\u4E1f'), six.u('\u4E1f')),
         (six.u('\u4E20'), six.u('\u4E20')),
         (six.u('\u4E21'), six.u('\u4E21')),
         (six.u('\u4E22'), six.u('\u4E22')),
         (six.u('\u4E23'), six.u('\u4E23')),
         (six.u('\u4E24'), six.u('\u4E13')),
         (six.u('\u4E25'), six.u('\u4E13')),
         (six.u('\u4E30'), six.u('\u4E30')),
     ]
     for char, wanted in expectations:
         self.assertEqual(NGram.normalize(char), wanted)
Beispiel #10
0
    def test_normalize_vietnamese(self):
        self.assertEqual(NGram.normalize_vi(six.u('')), '')
        self.assertEqual(NGram.normalize_vi(six.u('ABC')), 'ABC')
        self.assertEqual(NGram.normalize_vi(six.u('012')), '012')
        self.assertEqual(NGram.normalize_vi(six.u('\u00c0')), six.u('\u00c0'))

        self.assertEqual(NGram.normalize_vi(six.u('\u0041\u0300')), six.u('\u00C0'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0045\u0300')), six.u('\u00C8'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0049\u0300')), six.u('\u00CC'))
        self.assertEqual(NGram.normalize_vi(six.u('\u004F\u0300')), six.u('\u00D2'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0055\u0300')), six.u('\u00D9'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0059\u0300')), six.u('\u1EF2'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0061\u0300')), six.u('\u00E0'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0065\u0300')), six.u('\u00E8'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0069\u0300')), six.u('\u00EC'))
        self.assertEqual(NGram.normalize_vi(six.u('\u006F\u0300')), six.u('\u00F2'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0075\u0300')), six.u('\u00F9'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0079\u0300')), six.u('\u1EF3'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00C2\u0300')), six.u('\u1EA6'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00CA\u0300')), six.u('\u1EC0'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00D4\u0300')), six.u('\u1ED2'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00E2\u0300')), six.u('\u1EA7'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00EA\u0300')), six.u('\u1EC1'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00F4\u0300')), six.u('\u1ED3'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0102\u0300')), six.u('\u1EB0'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0103\u0300')), six.u('\u1EB1'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01A0\u0300')), six.u('\u1EDC'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01A1\u0300')), six.u('\u1EDD'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01AF\u0300')), six.u('\u1EEA'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01B0\u0300')), six.u('\u1EEB'))

        self.assertEqual(NGram.normalize_vi(six.u('\u0041\u0301')), six.u('\u00C1'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0045\u0301')), six.u('\u00C9'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0049\u0301')), six.u('\u00CD'))
        self.assertEqual(NGram.normalize_vi(six.u('\u004F\u0301')), six.u('\u00D3'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0055\u0301')), six.u('\u00DA'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0059\u0301')), six.u('\u00DD'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0061\u0301')), six.u('\u00E1'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0065\u0301')), six.u('\u00E9'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0069\u0301')), six.u('\u00ED'))
        self.assertEqual(NGram.normalize_vi(six.u('\u006F\u0301')), six.u('\u00F3'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0075\u0301')), six.u('\u00FA'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0079\u0301')), six.u('\u00FD'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00C2\u0301')), six.u('\u1EA4'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00CA\u0301')), six.u('\u1EBE'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00D4\u0301')), six.u('\u1ED0'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00E2\u0301')), six.u('\u1EA5'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00EA\u0301')), six.u('\u1EBF'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00F4\u0301')), six.u('\u1ED1'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0102\u0301')), six.u('\u1EAE'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0103\u0301')), six.u('\u1EAF'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01A0\u0301')), six.u('\u1EDA'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01A1\u0301')), six.u('\u1EDB'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01AF\u0301')), six.u('\u1EE8'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01B0\u0301')), six.u('\u1EE9'))

        self.assertEqual(NGram.normalize_vi(six.u('\u0041\u0303')), six.u('\u00C3'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0045\u0303')), six.u('\u1EBC'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0049\u0303')), six.u('\u0128'))
        self.assertEqual(NGram.normalize_vi(six.u('\u004F\u0303')), six.u('\u00D5'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0055\u0303')), six.u('\u0168'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0059\u0303')), six.u('\u1EF8'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0061\u0303')), six.u('\u00E3'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0065\u0303')), six.u('\u1EBD'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0069\u0303')), six.u('\u0129'))
        self.assertEqual(NGram.normalize_vi(six.u('\u006F\u0303')), six.u('\u00F5'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0075\u0303')), six.u('\u0169'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0079\u0303')), six.u('\u1EF9'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00C2\u0303')), six.u('\u1EAA'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00CA\u0303')), six.u('\u1EC4'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00D4\u0303')), six.u('\u1ED6'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00E2\u0303')), six.u('\u1EAB'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00EA\u0303')), six.u('\u1EC5'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00F4\u0303')), six.u('\u1ED7'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0102\u0303')), six.u('\u1EB4'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0103\u0303')), six.u('\u1EB5'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01A0\u0303')), six.u('\u1EE0'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01A1\u0303')), six.u('\u1EE1'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01AF\u0303')), six.u('\u1EEE'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01B0\u0303')), six.u('\u1EEF'))

        self.assertEqual(NGram.normalize_vi(six.u('\u0041\u0309')), six.u('\u1EA2'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0045\u0309')), six.u('\u1EBA'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0049\u0309')), six.u('\u1EC8'))
        self.assertEqual(NGram.normalize_vi(six.u('\u004F\u0309')), six.u('\u1ECE'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0055\u0309')), six.u('\u1EE6'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0059\u0309')), six.u('\u1EF6'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0061\u0309')), six.u('\u1EA3'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0065\u0309')), six.u('\u1EBB'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0069\u0309')), six.u('\u1EC9'))
        self.assertEqual(NGram.normalize_vi(six.u('\u006F\u0309')), six.u('\u1ECF'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0075\u0309')), six.u('\u1EE7'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0079\u0309')), six.u('\u1EF7'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00C2\u0309')), six.u('\u1EA8'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00CA\u0309')), six.u('\u1EC2'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00D4\u0309')), six.u('\u1ED4'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00E2\u0309')), six.u('\u1EA9'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00EA\u0309')), six.u('\u1EC3'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00F4\u0309')), six.u('\u1ED5'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0102\u0309')), six.u('\u1EB2'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0103\u0309')), six.u('\u1EB3'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01A0\u0309')), six.u('\u1EDE'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01A1\u0309')), six.u('\u1EDF'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01AF\u0309')), six.u('\u1EEC'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01B0\u0309')), six.u('\u1EED'))

        self.assertEqual(NGram.normalize_vi(six.u('\u0041\u0323')), six.u('\u1EA0'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0045\u0323')), six.u('\u1EB8'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0049\u0323')), six.u('\u1ECA'))
        self.assertEqual(NGram.normalize_vi(six.u('\u004F\u0323')), six.u('\u1ECC'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0055\u0323')), six.u('\u1EE4'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0059\u0323')), six.u('\u1EF4'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0061\u0323')), six.u('\u1EA1'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0065\u0323')), six.u('\u1EB9'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0069\u0323')), six.u('\u1ECB'))
        self.assertEqual(NGram.normalize_vi(six.u('\u006F\u0323')), six.u('\u1ECD'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0075\u0323')), six.u('\u1EE5'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0079\u0323')), six.u('\u1EF5'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00C2\u0323')), six.u('\u1EAC'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00CA\u0323')), six.u('\u1EC6'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00D4\u0323')), six.u('\u1ED8'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00E2\u0323')), six.u('\u1EAD'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00EA\u0323')), six.u('\u1EC7'))
        self.assertEqual(NGram.normalize_vi(six.u('\u00F4\u0323')), six.u('\u1ED9'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0102\u0323')), six.u('\u1EB6'))
        self.assertEqual(NGram.normalize_vi(six.u('\u0103\u0323')), six.u('\u1EB7'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01A0\u0323')), six.u('\u1EE2'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01A1\u0323')), six.u('\u1EE3'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01AF\u0323')), six.u('\u1EF0'))
        self.assertEqual(NGram.normalize_vi(six.u('\u01B0\u0323')), six.u('\u1EF1'))
Beispiel #11
0
    def test_ngram3(self):
        """Check the n-gram window around a digit placed between two letters.

        Feeds 'A', '1', 'B' into a fresh NGram and, after each character,
        checks the 1-, 2- and 3-gram reported by ``get``. The expectations
        encode that the digit shows up as a space in the window and that
        the letter following it starts a new window.
        """
        ngram = NGram()

        # First letter: the bigram is expected to carry a leading space,
        # and no trigram is available yet.
        ngram.add_char('A')
        self.assertEqual(ngram.get(1), 'A')
        self.assertEqual(ngram.get(2), ' A')
        self.assertIsNone(ngram.get(3))

        # Digit: expected to appear as a trailing space, so there is no
        # unigram while the bigram and trigram end in ' '.
        ngram.add_char('1')
        self.assertIsNone(ngram.get(1))
        self.assertEqual(ngram.get(2), 'A ')
        self.assertEqual(ngram.get(3), ' A ')

        # Next letter: the earlier 'A' must no longer be visible; the
        # window is expected to look like it did after the first letter.
        ngram.add_char('B')
        self.assertEqual(ngram.get(1), 'B')
        self.assertEqual(ngram.get(2), ' B')
        self.assertIsNone(ngram.get(3))
Beispiel #12
0
 def test_normalize_with_latin(self):
     """Check NGram.normalize on sample code points from U+0000 to U+00A1.

     Each table row is an escaped code point plus a flag: True when the
     test expects the character back unchanged, False when it expects a
     single space instead.
     """
     samples = (
         ('\u0000', False),
         ('\u0009', False),
         ('\u0020', False),
         ('\u0030', False),
         ('\u0040', False),
         ('\u0041', True),
         ('\u005a', True),
         ('\u005b', False),
         ('\u0060', False),
         ('\u0061', True),
         ('\u007a', True),
         ('\u007b', False),
         ('\u007f', False),
         ('\u0080', True),
         ('\u00a0', False),
         ('\u00a1', True),
     )
     for escaped, unchanged in samples:
         char = six.u(escaped)
         wanted = char if unchanged else ' '
         self.assertEqual(NGram.normalize(char), wanted)