def test_ngram(self):
    """Feed characters one at a time and check the 1-, 2- and 3-grams.

    Every ``add_char`` call is followed by assertions on ``get(1)``,
    ``get(2)`` and ``get(3)``.  ``None`` is the expected result whenever
    no n-gram of the requested length is available.
    """
    ngram = NGram()

    # A fresh instance yields nothing, whatever length is requested.
    for size in range(5):
        self.assertIsNone(ngram.get(size))

    # A lone space does not produce any n-gram.
    ngram.add_char(' ')
    self.assertIsNone(ngram.get(1))
    self.assertIsNone(ngram.get(2))
    self.assertIsNone(ngram.get(3))

    # First letter: the bigram carries the leading space, no trigram yet.
    ngram.add_char('A')
    self.assertEqual(ngram.get(1), 'A')
    self.assertEqual(ngram.get(2), ' A')
    self.assertIsNone(ngram.get(3))

    # Arabic script: U+06CC is expected to be reported as U+064A.
    ngram.add_char(six.u('\u06cc'))
    self.assertEqual(ngram.get(1), six.u('\u064a'))
    self.assertEqual(ngram.get(2), six.u('A\u064a'))
    self.assertEqual(ngram.get(3), six.u(' A\u064a'))

    # Latin Extended Additional: U+1EA0 is expected to be reported as U+1EC3.
    ngram.add_char(six.u('\u1ea0'))
    self.assertEqual(ngram.get(1), six.u('\u1ec3'))
    self.assertEqual(ngram.get(2), six.u('\u064a\u1ec3'))
    self.assertEqual(ngram.get(3), six.u('A\u064a\u1ec3'))

    # Hiragana: U+3044 is expected to be reported as U+3042.
    ngram.add_char(six.u('\u3044'))
    self.assertEqual(ngram.get(1), six.u('\u3042'))
    self.assertEqual(ngram.get(2), six.u('\u1ec3\u3042'))
    self.assertEqual(ngram.get(3), six.u('\u064a\u1ec3\u3042'))

    # Katakana: U+30A4 is expected to be reported as U+30A2.
    ngram.add_char(six.u('\u30a4'))
    self.assertEqual(ngram.get(1), six.u('\u30a2'))
    self.assertEqual(ngram.get(2), six.u('\u3042\u30a2'))
    self.assertEqual(ngram.get(3), six.u('\u1ec3\u3042\u30a2'))

    # Bopomofo: U+3106 is expected to be reported as U+3105.
    ngram.add_char(six.u('\u3106'))
    self.assertEqual(ngram.get(1), six.u('\u3105'))
    self.assertEqual(ngram.get(2), six.u('\u30a2\u3105'))
    self.assertEqual(ngram.get(3), six.u('\u3042\u30a2\u3105'))

    # Hangul: U+AC01 is expected to be reported as U+AC00.
    ngram.add_char(six.u('\uac01'))
    self.assertEqual(ngram.get(1), six.u('\uac00'))
    self.assertEqual(ngram.get(2), six.u('\u3105\uac00'))
    self.assertEqual(ngram.get(3), six.u('\u30a2\u3105\uac00'))

    # U+2010 (hyphen) is expected to behave like a space: no unigram, and
    # the longer n-grams end with ' '.
    ngram.add_char(six.u('\u2010'))
    self.assertIsNone(ngram.get(1))
    self.assertEqual(ngram.get(2), six.u('\uac00 '))
    self.assertEqual(ngram.get(3), six.u('\u3105\uac00 '))

    # A letter right after that separator: bigram ' a', no trigram.
    ngram.add_char('a')
    self.assertEqual(ngram.get(1), 'a')
    self.assertEqual(ngram.get(2), ' a')
    self.assertIsNone(ngram.get(3))
def test_normalize_with_cjk_kanji(self):
    """Check ``NGram.normalize`` on a sample of CJK ideographs.

    Each listed code point is expected to normalize to itself, except
    U+4E03 (expected U+4E01) and U+4E24 / U+4E25 (both expected U+4E13).
    """
    cases = (
        (six.u('\u4E00'), six.u('\u4E00')),
        (six.u('\u4E01'), six.u('\u4E01')),
        (six.u('\u4E02'), six.u('\u4E02')),
        (six.u('\u4E03'), six.u('\u4E01')),
        (six.u('\u4E04'), six.u('\u4E04')),
        (six.u('\u4E05'), six.u('\u4E05')),
        (six.u('\u4E06'), six.u('\u4E06')),
        (six.u('\u4E07'), six.u('\u4E07')),
        (six.u('\u4E08'), six.u('\u4E08')),
        (six.u('\u4E09'), six.u('\u4E09')),
        (six.u('\u4E10'), six.u('\u4E10')),
        (six.u('\u4E11'), six.u('\u4E11')),
        (six.u('\u4E12'), six.u('\u4E12')),
        (six.u('\u4E13'), six.u('\u4E13')),
        (six.u('\u4E14'), six.u('\u4E14')),
        (six.u('\u4E15'), six.u('\u4E15')),
        (six.u('\u4E1e'), six.u('\u4E1e')),
        (six.u('\u4E1f'), six.u('\u4E1f')),
        (six.u('\u4E20'), six.u('\u4E20')),
        (six.u('\u4E21'), six.u('\u4E21')),
        (six.u('\u4E22'), six.u('\u4E22')),
        (six.u('\u4E23'), six.u('\u4E23')),
        (six.u('\u4E24'), six.u('\u4E13')),
        (six.u('\u4E25'), six.u('\u4E13')),
        (six.u('\u4E30'), six.u('\u4E30')),
    )
    for kanji, expected in cases:
        self.assertEqual(NGram.normalize(kanji), expected)
def test_normalize_for_romanian(self):
    """Check ``NGram.normalize`` on Romanian s/t with a mark below.

    U+015F and U+0163 are expected unchanged, while U+0219 and U+021B
    are expected to come back as U+015F and U+0163 respectively.
    """
    cases = (
        (six.u('\u015f'), six.u('\u015f')),
        (six.u('\u0163'), six.u('\u0163')),
        (six.u('\u0219'), six.u('\u015f')),
        (six.u('\u021b'), six.u('\u0163')),
    )
    for letter, expected in cases:
        self.assertEqual(NGram.normalize(letter), expected)
def test_ngram3(self):
    """Check the n-grams around a digit placed between two letters.

    The sequence fed in is ``'A'``, ``'1'``, ``'B'``; after each character
    the 1-, 2- and 3-grams are compared with the expected values, with
    ``None`` expected where no n-gram of that length is available.
    """
    ngram = NGram()

    # First letter: bigram carries a leading space, no trigram yet.
    ngram.add_char('A')
    self.assertEqual(ngram.get(1), 'A')
    self.assertEqual(ngram.get(2), ' A')
    self.assertIsNone(ngram.get(3))

    # The digit is expected to show up as a space: no unigram, and the
    # longer n-grams end with ' '.
    ngram.add_char('1')
    self.assertIsNone(ngram.get(1))
    self.assertEqual(ngram.get(2), 'A ')
    self.assertEqual(ngram.get(3), ' A ')

    # A letter right after the digit: bigram ' B', and no trigram.
    ngram.add_char('B')
    self.assertEqual(ngram.get(1), 'B')
    self.assertEqual(ngram.get(2), ' B')
    self.assertIsNone(ngram.get(3))
def test_normalize_vietnamese(self):
    """Check ``NGram.normalize_vi`` on base letters plus combining tones.

    Text without a combining tone mark is expected back unchanged.  Each
    of the 24 base letters below, followed by one of the combining marks
    U+0300 (grave), U+0301 (acute), U+0303 (tilde), U+0309 (hook above)
    or U+0323 (dot below), is expected to become the single precomposed
    character listed for that tone.
    """
    # Inputs that contain no combining tone mark.
    self.assertEqual(NGram.normalize_vi(six.u('')), '')
    self.assertEqual(NGram.normalize_vi(six.u('ABC')), 'ABC')
    self.assertEqual(NGram.normalize_vi(six.u('012')), '012')
    self.assertEqual(NGram.normalize_vi(six.u('\u00c0')), six.u('\u00c0'))

    # Base letters, in the order used by every expectation row below.
    bases = (
        '\u0041', '\u0045', '\u0049', '\u004F', '\u0055', '\u0059',
        '\u0061', '\u0065', '\u0069', '\u006F', '\u0075', '\u0079',
        '\u00C2', '\u00CA', '\u00D4', '\u00E2', '\u00EA', '\u00F4',
        '\u0102', '\u0103', '\u01A0', '\u01A1', '\u01AF', '\u01B0',
    )

    # (combining mark, expected precomposed characters aligned with bases)
    composed_by_tone = (
        ('\u0300', (
            '\u00C0', '\u00C8', '\u00CC', '\u00D2', '\u00D9', '\u1EF2',
            '\u00E0', '\u00E8', '\u00EC', '\u00F2', '\u00F9', '\u1EF3',
            '\u1EA6', '\u1EC0', '\u1ED2', '\u1EA7', '\u1EC1', '\u1ED3',
            '\u1EB0', '\u1EB1', '\u1EDC', '\u1EDD', '\u1EEA', '\u1EEB',
        )),
        ('\u0301', (
            '\u00C1', '\u00C9', '\u00CD', '\u00D3', '\u00DA', '\u00DD',
            '\u00E1', '\u00E9', '\u00ED', '\u00F3', '\u00FA', '\u00FD',
            '\u1EA4', '\u1EBE', '\u1ED0', '\u1EA5', '\u1EBF', '\u1ED1',
            '\u1EAE', '\u1EAF', '\u1EDA', '\u1EDB', '\u1EE8', '\u1EE9',
        )),
        ('\u0303', (
            '\u00C3', '\u1EBC', '\u0128', '\u00D5', '\u0168', '\u1EF8',
            '\u00E3', '\u1EBD', '\u0129', '\u00F5', '\u0169', '\u1EF9',
            '\u1EAA', '\u1EC4', '\u1ED6', '\u1EAB', '\u1EC5', '\u1ED7',
            '\u1EB4', '\u1EB5', '\u1EE0', '\u1EE1', '\u1EEE', '\u1EEF',
        )),
        ('\u0309', (
            '\u1EA2', '\u1EBA', '\u1EC8', '\u1ECE', '\u1EE6', '\u1EF6',
            '\u1EA3', '\u1EBB', '\u1EC9', '\u1ECF', '\u1EE7', '\u1EF7',
            '\u1EA8', '\u1EC2', '\u1ED4', '\u1EA9', '\u1EC3', '\u1ED5',
            '\u1EB2', '\u1EB3', '\u1EDE', '\u1EDF', '\u1EEC', '\u1EED',
        )),
        ('\u0323', (
            '\u1EA0', '\u1EB8', '\u1ECA', '\u1ECC', '\u1EE4', '\u1EF4',
            '\u1EA1', '\u1EB9', '\u1ECB', '\u1ECD', '\u1EE5', '\u1EF5',
            '\u1EAC', '\u1EC6', '\u1ED8', '\u1EAD', '\u1EC7', '\u1ED9',
            '\u1EB6', '\u1EB7', '\u1EE2', '\u1EE3', '\u1EF0', '\u1EF1',
        )),
    )

    for tone, composed in composed_by_tone:
        for base, expected in zip(bases, composed):
            # Join the raw literals first so six.u receives the same text
            # a single two-escape literal would contain.
            decomposed = six.u(base + tone)
            self.assertEqual(NGram.normalize_vi(decomposed), six.u(expected))
def test_normalize_with_latin(self):
    """Check ``NGram.normalize`` on code points from U+0000 to U+00A1.

    In this sample the ASCII letters (U+0041, U+005A, U+0061, U+007A) as
    well as U+0080 and U+00A1 are expected unchanged; every other listed
    code point is expected to become a single space.
    """
    blank = ' '
    cases = (
        (six.u('\u0000'), blank),
        (six.u('\u0009'), blank),
        (six.u('\u0020'), blank),
        (six.u('\u0030'), blank),
        (six.u('\u0040'), blank),
        (six.u('\u0041'), six.u('\u0041')),
        (six.u('\u005a'), six.u('\u005a')),
        (six.u('\u005b'), blank),
        (six.u('\u0060'), blank),
        (six.u('\u0061'), six.u('\u0061')),
        (six.u('\u007a'), six.u('\u007a')),
        (six.u('\u007b'), blank),
        (six.u('\u007f'), blank),
        (six.u('\u0080'), six.u('\u0080')),
        (six.u('\u00a0'), blank),
        (six.u('\u00a1'), six.u('\u00a1')),
    )
    for char, expected in cases:
        self.assertEqual(NGram.normalize(char), expected)