Ejemplo n.º 1
0
def read_fontconfig_orth(path):
    """Parse a fontconfig *.orth file into a character set plus references.

    Each line is cut at the first '#' and then at the first tab before it
    is interpreted.  What remains is either empty (skipped), an
    ``include <file>`` directive (resolved relative to the directory of
    ``path`` and read recursively), a single hexadecimal code point, or a
    ``start-end`` range of hexadecimal code points (inclusive).

    Args:
        path: file path of the *.orth file.

    Returns:
        A tuple ``(icu.UnicodeSet, references)``.  ``references`` starts
        with the cgit URL built from the file's basename, followed by every
        URL that extract_urls() finds in the lines of this file (URLs of
        included files are not propagated).

    Raises:
        ValueError: if an entry is neither a single code point nor a
            two-ended range (or contains non-hexadecimal text).
    """
    charset = icu.UnicodeSet()
    references = [
        'https://cgit.freedesktop.org/fontconfig/tree/fc-lang/' +
        os.path.basename(path)
    ]
    with codecs.open(path, 'r', 'utf-8') as orth_file:
        for raw_line in orth_file:
            # URLs are harvested from the whole line, comments included.
            references.extend(extract_urls(raw_line))
            entry = raw_line.split('#')[0].strip().split('\t')[0].strip()
            if not entry:
                continue
            if entry.startswith('include '):
                included = os.path.join(
                    os.path.dirname(path), entry.split()[1])
                charset.addAll(read_fontconfig_orth(included)[0])
                continue
            bounds = [int(part, 16)
                      for part in entry.split('-') if part.strip()]
            if len(bounds) not in (1, 2):
                raise ValueError(path)
            # A single code point is just a range whose ends coincide.
            for codepoint in range(bounds[0], bounds[-1] + 1):
                charset.add(normalize_fontconfig_char(unichr(codepoint)))
    return (charset.compact(), references)
Ejemplo n.º 2
0
def ScriptSymbols(script, include_script_code=False):
    """Yield ``(symbol_name, label)`` for each character of ``script``.

    The character set is everything carrying the script's Unicode property
    plus U+200C and U+200D.  ``label`` is whatever CharToCodepoint()
    returns for the character.  A label listed in EXCEPTIONS uses the
    symbol name stored there; otherwise the name is derived from the
    Unicode character name: the upper-cased script name and a leading
    'ZERO WIDTH' are removed, DIGITS substitutions are applied, STOPWORDS
    are dropped, and the remaining words are joined with '_' in lower case.
    Characters without a Unicode name are reported on utf8.stderr and
    skipped.

    Args:
        script: object providing getName() and getShortName().
        include_script_code: when true, each symbol name is prefixed with
            ``<short script name>:``.
    """
    long_name = script.getName()
    short_name = script.getShortName()
    script_chars = icu.UnicodeSet(r'[\p{%s}\u200C\u200D]' % long_name)
    script_name = long_name.replace('_', ' ')
    utf8.stderr.write('Found %d characters specific to %s (%s)\n' %
                      (len(script_chars), script_name, short_name))
    prefix = script_name.upper()
    for char in script_chars:
        label = CharToCodepoint(char)
        if label not in EXCEPTIONS:
            name = CharName(char)
            if not name:
                utf8.stderr.write('Warning: No Unicode name for %04X\n' %
                                  label)
                continue
            for leading in (prefix, 'ZERO WIDTH'):
                name = RemovePrefix(name, leading)
            assert name
            for old, new in DIGITS.items():
                name = name.replace(old, new)
            kept_words = [
                word for word in name.split() if word not in STOPWORDS
            ]
            symbol_name = '_'.join(kept_words).lower()
            assert symbol_name, ('Empty symbol name for %04X (%s)' %
                                 (label, name))
            # A leading hyphen in the derived name is swapped for an
            # apostrophe before the dependent-form marker is considered.
            if symbol_name.startswith('-'):
                symbol_name = "'%s" % symbol_name[1:]
            # Names containing any DEPENDENT substring get a '-' prefix.
            if any(marker in name for marker in DEPENDENT):
                symbol_name = '-%s' % symbol_name
        else:
            symbol_name = EXCEPTIONS[label]
        if include_script_code:
            symbol_name = '%s:%s' % (short_name, symbol_name)
        yield symbol_name, label
Ejemplo n.º 3
0
def make_phoneme_set(s):
    """Build an icu.UnicodeSet from a whitespace-separated phoneme list.

    The set pattern always starts with three fixed members (an escaped
    U+0020 space, the stress mark and '.').  Every token of ``s`` is then
    appended: one-character tokens verbatim, longer tokens wrapped in
    ``{...}`` so that the UnicodeSet pattern treats them as strings.
    """
    members = [u'\\u0020', "ˈ", '.']
    members.extend(
        token if len(token) == 1 else '{%s}' % token
        for token in s.split())
    phonemes = icu.UnicodeSet()
    phonemes.applyPattern('[%s]' % ' '.join(members))
    return phonemes
Ejemplo n.º 4
0
def makePhonemeSet(s):
    """Return an icu.UnicodeSet holding the phonemes listed in ``s``.

    ``s`` is split on whitespace.  One-character tokens go into the set
    pattern verbatim; longer tokens are wrapped in ``{...}`` so that the
    UnicodeSet pattern treats them as multi-character strings.
    """
    members = [
        token if len(token) == 1 else '{%s}' % token
        for token in s.split()
    ]
    phonemes = icu.UnicodeSet()
    phonemes.applyPattern('[%s]' % ' '.join(members))
    return phonemes
Ejemplo n.º 5
0
def get_cldr_exemplars(lang, exemplars):
    """Merge the main, auxiliary and index CLDR exemplar sets for ``lang``.

    Args:
        lang: language identifier, forwarded to
            get_cldr_exemplars_by_type().
        exemplars: exemplar data to search, forwarded to
            get_cldr_exemplars_by_type().  Earlier versions ignored this
            argument and read the module-level ``cldr_exemplars`` instead.

    Returns:
        A tuple ``(icu.UnicodeSet, sources)`` where ``sources`` is the set
        of source identifiers reported for every exemplar type that
        contributed characters.  When there is no main exemplar set the
        result is ``(None, set())``.
    """
    main, src = get_cldr_exemplars_by_type(lang, 'main', exemplars)
    if not main:
        return None, set()
    # Copy the main set so that merging does not mutate the caller's data.
    result = icu.UnicodeSet(main)
    sources = {src}
    for extype in ('auxiliary', 'index'):
        extra, src = get_cldr_exemplars_by_type(lang, extype, exemplars)
        if extra:
            result.addAll(extra)
            sources.add(src)
    return result, sources
Ejemplo n.º 6
0
def format_unicodeset(uset):
    """Render ``uset`` as a readable UnicodeSet pattern.

    Ranges spanning fewer than four code points are written out as
    individual (escaped) characters; longer ranges are written as
    ``start-end``.  Whenever the pretty form cannot be produced safely,
    the result falls back to ``uset.toPattern()``: that happens when a
    range boundary is not a single-character string, or when re-parsing
    the pretty form does not give back the same pattern.
    """
    canonical = uset.toPattern()
    pieces = []
    for index in range(uset.getRangeCount()):
        first = uset.getRangeStart(index)
        last = uset.getRangeEnd(index)
        if not (len(first) == 1 and len(last) == 1):
            return canonical
        low, high = ord(first), ord(last)
        if high - low >= 3:
            pieces.append('%s-%s' % (escape_for_unicodeset(
                unichr(low)), escape_for_unicodeset(unichr(high))))
        else:
            for codepoint in range(low, high + 1):
                pieces.append(escape_for_unicodeset(unichr(codepoint)))
    pretty = '[%s]' % ' '.join(pieces)
    # Make sure the pretty-printing did not change the set's semantics.
    if icu.UnicodeSet(pretty).toPattern() == canonical:
        return pretty
    return canonical
Ejemplo n.º 7
0
def read_cldr_file(filepath):
    """Read the locale identifier and exemplar sets from a CLDR LDML file.

    The identifier is assembled from the ``<identity>`` element as
    ``language[_script][_territory][_variant...]``, with variants in
    sorted order.

    Args:
        filepath: path of the LDML file; must end in '.xml'.

    Returns:
        A tuple ``(lang, exemplars)`` where ``exemplars`` maps each
        exemplarCharacters type ('main' when the attribute is absent) to
        an icu.UnicodeSet built from the element's text.

    Raises:
        ValueError: if ``<identity>`` contains a child element other than
            version, language, script, territory or variant.
    """
    assert filepath.endswith('.xml'), filepath
    ldml = etree.parse(filepath).getroot()
    lang = ldml.find('./identity/language').attrib['type']
    for subtag in ('script', 'territory'):
        element = ldml.find('./identity/' + subtag)
        if element is not None:
            lang = lang + '_' + element.attrib['type']
    variants = sorted(
        t.attrib['type'] for t in ldml.iterfind('./identity/variant'))
    # sorted() always returns a list, so test for emptiness, not for None.
    if variants:
        lang = '_'.join([lang] + variants)
    tags = set(t.tag for t in ldml.iterfind('./identity/*'))
    if not tags.issubset(
        {'version', 'language', 'script', 'territory', 'variant'}):
        raise ValueError('unexpected identity elements in %s' % filepath)
    exemplars = {}
    for ex in ldml.iterfind('./characters/exemplarCharacters'):
        extype = ex.attrib.get('type', 'main')
        exemplars[extype] = icu.UnicodeSet(''.join(ex.itertext()))
    return lang, exemplars
Ejemplo n.º 8
0

def makePhonemeSet(s):
    """Return an icu.UnicodeSet holding the phonemes listed in ``s``.

    ``s`` is split on whitespace.  A token of length one is added to the
    set pattern as-is; any longer token is enclosed in ``{...}`` so that
    the UnicodeSet pattern reads it as one multi-character string.
    """
    def as_pattern_item(token):
        # Brace only the multi-character tokens.
        return token if len(token) == 1 else '{%s}' % token

    items = [as_pattern_item(token) for token in s.split()]
    phoneme_set = icu.UnicodeSet()
    phoneme_set.applyPattern('[%s]' % ' '.join(items))
    return phoneme_set


# Grapheme inventory: every character matched by the UnicodeSet pattern
# [:Armn:] (the Armenian script property).
ARMENIAN_GRAPHEMES = icu.UnicodeSet()
ARMENIAN_GRAPHEMES.applyPattern('[:Armn:]')

# Phoneme inventory, one whitespace-separated token per phoneme.
# makePhonemeSet() splits the text on whitespace, so the line breaks and
# blank lines below are purely visual grouping.
# NOTE(review): rows look grouped by manner of articulation, followed by
# vowels - confirm with the author before relying on that.
ARMENIAN_PHONEMES = makePhonemeSet("""

    m n
    p pʰ t tʰ k kʰ b d g 
    t͡s t͡sʰ t͡ʃ t͡ʃʰ d͡z d͡ʒ
    f v s z ʃ ʒ x ɣ h
    l j r ɾ

    i u
    ɛ ə o
    a

""")
Ejemplo n.º 9
0
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import codecs
import icu

from cldr_util import makePhonemeSet, match, check, regtest

# Grapheme inventory: characters in the Sinhala script ([:Sinh:]) together
# with characters of general category Cf ([:Cf:], format characters).
GRAPHEMES = icu.UnicodeSet()
GRAPHEMES.applyPattern('[[:Sinh:] [:Cf:]]')

# Phoneme inventory, one token per phoneme; the final '.' is itself a token.
# NOTE(review): makePhonemeSet comes from cldr_util and is not visible here;
# presumably it splits on whitespace, which would make the row layout below
# purely cosmetic - confirm.
PHONEMES = makePhonemeSet("""

    m n ɲ ŋ
    p b ᵐb ⁿd ʈ ɖ ⁿɖ k g ᵑg
    s ʃ
    t͡ʃ  d͡ʒ
    f h
    r
    ʋ l j
    w

    i iː   u uː
    e eː ə o oː
    æː æ   a aː
    .

""")

# NOTE(review): check() and regtest() are cldr_util helpers not shown in
# this file; presumably they validate the si-si_FONIPA data against the
# two inventories above - confirm their exact contract in cldr_util.
check('si-si_FONIPA.txt', GRAPHEMES, PHONEMES)
regtest('si-si_FONIPA', GRAPHEMES, PHONEMES)
Ejemplo n.º 10
0
def write_deltas(deltas, out):
    """Write one Markdown section per language in ``deltas`` to ``out``.

    Args:
        deltas: dict mapping a language code to a
            ``(chars, refs, cldr_sources)`` triple.
        out: object with a ``write()`` method.

    Languages are emitted in sorted order.  Each section has a heading
    with the code and its ICU display name, a fenced block holding
    format_unicodeset(chars), an optional bullet linking the CLDR sources
    (sorted), and one bullet per entry of ``refs``.
    """
    link_template = '[%s](http://www.unicode.org/repos/cldr/trunk/%s)'
    for lang in sorted(deltas):
        chars, refs, cldr_sources = deltas[lang]
        display_name = icu.Locale(lang).getDisplayName()
        out.write('\n\n### %s: %s\n\n' % (lang, display_name))
        out.write('```\n%s\n```\n\n' % (format_unicodeset(chars)))
        if cldr_sources:
            links = [
                link_template % (src, src) for src in sorted(cldr_sources)
            ]
            out.write('* CLDR: %s\n' % ' '.join(links))
        for ref in refs:
            out.write('* %s\n' % ref)


if __name__ == '__main__':
    empty_uset = icu.UnicodeSet()
    fully_missing, fully_missing_manual_cleanup_needed = {}, {}
    chars_missing, ok, bogus = {}, {}, {}
    likely_subtags = read_likely_subtags()
    language_aliases = read_language_aliases()
    cldr_exemplars = read_cldr_exemplars()
    fontconfig_exemplars = read_fontconfig_exemplars()
    for fclang, (fcset, fcrefs) in sorted(fontconfig_exemplars.items()):
        lang = language_aliases.get(fclang, fclang)
        if lang == 'ps_PK': lang = 'ps'
        if lang == 'pap_AN': lang = 'pap_Latn'
        if lang in {'pap_AW'}: continue
        likely = likely_subtags.get(lang, 'und')
        if lang not in cldr_exemplars and not lang.startswith('zh_'):
            pattern = fcset.toPattern()
            lang = '_'.join((lang.split('_')[0], guess_script(pattern)))