Exemple #1
0
def get_scripts(text):
  """Return the scripts used in text, plus its common-script characters.

  text is UTF-8 encoded bytes; an already-decoded unicode string is also
  accepted and is used as is.

  Returns a tuple (scripts, zyyy_chars).  scripts is the set of script
  codes reported by unicode_data.script for the characters of text,
  leaving out 'Zyyy' (common) and 'Zinh' (inherited).  zyyy_chars holds
  the 'Zyyy' characters: the character itself when its code point is
  below U+00FE, otherwise the integer code point.  The characters listed
  in exclusions are skipped entirely."""
  # ignore these chars, we assume they are ok in any script
  exclusions = {0x00, 0x0A, 0x0D, 0x20, 0xA0, 0xFEFF}
  zyyy_chars = set()
  scripts = set()
  # Decode via the bytes method rather than the Python 2-only unicode()
  # builtin, so the function also runs on Python 3.
  if isinstance(text, (bytes, bytearray)):
    ustr = text.decode('utf8')
  else:
    ustr = text
  for cp in ustr:
    cp_val = ord(cp)
    if cp_val in exclusions:
      continue
    script = unicode_data.script(cp)
    if script == 'Zyyy':  # common/undetermined
      # Compare code points, not strings: a '\u00fe' byte-string literal
      # is not an escape on Python 2, which made the old string
      # comparison use the wrong threshold.
      zyyy_chars.add(cp if cp_val < 0xFE else cp_val)
    elif script != 'Zinh':  # inherited
      scripts.add(script)
  return scripts, zyyy_chars
Exemple #2
0
def get_script_histogram(utext):
    """Build a per-script tally of the characters in utext.

    utext is a unicode string.  The result maps each script code returned
    by unicode_data.script to a two-item list [count, chars], where count
    is the number of occurrences and chars is the set of distinct
    characters seen for that script.  A fixed set of common whitespace and
    control characters, and characters whose script is 'Zinh' (inherited),
    are not counted.
    """
    skipped = {0x00, 0x0A, 0x0D, 0x20, 0xA0, 0xFEFF}
    histogram = {}
    for ch in utext:
        if ord(ch) in skipped:
            continue
        script = unicode_data.script(ch)
        if script == 'Zinh':
            continue
        # Create the [count, chars] entry on first sight, then update it.
        entry = histogram.setdefault(script, [0, set()])
        entry[0] += 1
        entry[1].add(ch)
    return histogram
def get_script_histogram(utext):
  """Count the characters of utext by script.

  utext is a unicode string.  Returns a dict keyed by the script code
  that unicode_data.script reports; each value is a list of the form
  [count, chars] holding the occurrence count and the set of distinct
  characters for that script.  Characters in the exclusion set below and
  characters whose script is 'Zinh' (inherited) are left out."""
  ignored = {0x00, 0x0A, 0x0D, 0x20, 0xA0, 0xFEFF}
  by_script = {}
  for char in utext:
    if ord(char) in ignored:
      continue
    script = unicode_data.script(char)
    if script == 'Zinh':
      continue
    bucket = by_script.get(script)
    if bucket is None:
      # First character seen for this script.
      by_script[script] = [1, {char}]
      continue
    bucket[0] += 1
    bucket[1].add(char)
  return by_script
 def test_script(self):
     """Check unicode_data.script() against known code points.

     Expects 'Latn' for U+A794 and 'Zzzz' for U+E006.
     """
     expectations = (
         ('Latn', 0xA794),
         ('Zzzz', 0xE006),
     )
     for expected_script, codepoint in expectations:
         self.assertEqual(expected_script, unicode_data.script(codepoint))