def get_scripts(text):
    """Return the scripts used in text and the common-script chars seen.

    A handful of control/whitespace characters are skipped entirely.

    Args:
        text: UTF-8 encoded bytes, or an already-decoded text string.

    Returns:
        A tuple (scripts, zyyy_chars).  scripts is the set of script codes
        reported by unicode_data.script() for the characters of text, leaving
        out 'Zyyy' (common) and 'Zinh' (inherited).  zyyy_chars is the set of
        'Zyyy' characters encountered: those below U+00FE are stored as
        one-character strings, all others as integer code points.
    """
    # ignore these chars, we assume they are ok in any script
    exclusions = {0x00, 0x0A, 0x0D, 0x20, 0xA0, 0xFEFF}
    zyyy_chars = set()
    scripts = set()

    # Decode only when handed raw bytes, so the function works with both
    # encoded input and text that has already been decoded.
    if isinstance(text, (bytes, bytearray)):
        ustr = text.decode('utf8')
    else:
        ustr = text

    for cp in ustr:
        code = ord(cp)
        if code in exclusions:
            continue
        script = unicode_data.script(cp)
        if script == 'Zyyy':  # common/undetermined
            # Compare numerically: a string comparison against '\u00fe'
            # depends on whether that literal is a byte or a text string.
            zyyy_chars.add(cp if code < 0xFE else code)
        elif script != 'Zinh':  # inherited
            scripts.add(script)
    return scripts, zyyy_chars
def get_script_histogram(utext):
    """Build a per-script tally of the characters in utext.

    utext is a unicode string.  The result maps each script code to a
    two-item list [count, chars]: how many characters of that script were
    seen, and the set of distinct characters among them.  A few common
    whitespace/control characters and all inherited ('Zinh') characters are
    left out of the tally.
    """
    skipped = {0x00, 0x0A, 0x0D, 0x20, 0xA0, 0xFEFF}
    histogram = {}
    for ch in utext:
        if ord(ch) in skipped:
            continue

        script = unicode_data.script(ch)
        if script == 'Zinh':
            continue

        entry = histogram.get(script)
        if entry is None:
            # First character seen for this script.
            histogram[script] = [1, {ch}]
        else:
            entry[0] += 1
            entry[1].add(ch)
    return histogram
def test_script(self):
    """Check script() against known code point / script code pairs."""
    expectations = (
        ('Latn', 0xA794),  # code point expected to report Latin
        ('Zzzz', 0xE006),  # code point expected to report 'Zzzz'
    )
    for expected_script, code_point in expectations:
        self.assertEqual(expected_script, unicode_data.script(code_point))