コード例 #1
0
ファイル: common.py プロジェクト: tiansworld/gramps
    class AlphabeticIndex(icuAlphabeticIndex):
        """
        ICU AlphabeticIndex built for the collation of the given locale.

        On top of the labels ICU creates for that locale, bucket labels are
        added for the script of every language listed by
        ``glocale.get_language_dict()``.
        """
        def __init__(self, rlocale):
            self.iculocale = Locale(rlocale.collation)
            super().__init__(self.iculocale)

            # The undocumented default limit is 99 buckets, while Latin +
            # Greek + Cyrillic + Hebrew + Arabic + Tamil + Hiragana +
            # CJK Unified already need about 206 of them.
            self.maxLabelCount = 500  # pylint: disable=invalid-name

            # Labels are added for scripts other than the one of the output
            # being generated, so that script counts as seen from the start.
            self.iculocale.addLikelySubtags()
            seen_scripts = {self.iculocale.getDisplayScript()}

            for language_code in glocale.get_language_dict().values():
                candidate = Locale(language_code)
                candidate.addLikelySubtags()
                script_name = candidate.getDisplayScript()
                if script_name in seen_scripts:
                    continue
                seen_scripts.add(script_name)
                super().addLabels(candidate)
コード例 #2
0
def gen_khm_words(text: str) -> str:
    """Yield the consecutive pieces of ``text`` between ICU word boundaries.

    The boundaries come from a word BreakIterator created for the "km"
    locale; every yielded item is the slice of ``text`` between two
    successive boundaries.
    """
    breaker = BreakIterator.createWordInstance(Locale("km"))
    breaker.setText(text)
    seg_start = breaker.first()
    for seg_end in breaker:
        segment = text[seg_start:seg_end]
        seg_start = seg_end
        yield segment
コード例 #3
0
ファイル: views.py プロジェクト: aaronhelton/unlod
def index(request):
  """
  Render a paginated listing of the resources of one aspect, sorted by label.

  The ``aspect`` query parameter selects an entry of ROUTABLES. A missing or
  empty value means 'Collection'; an unknown value falls back to the
  'Collection' URI. Resources that are also typed UNBIST.PlaceName are
  left out. The ``page`` query parameter selects the page (20 results per
  page); a value that is not an integer shows the first page.
  """
  preferred_language = translation.get_language()
  aspect = request.GET.get('aspect') or 'Collection'

  try:
    aspect_uri = ROUTABLES[aspect]
  except KeyError:
    aspect_uri = ROUTABLES['Collection']

  # Built once: it does not depend on the resource being inspected.
  place_name = Resource(graph, UNBIST.PlaceName)
  this_results = []
  for res in graph.subjects(RDF.type, aspect_uri):
    if place_name in list(Resource(graph, res)[RDF.type]):
      continue
    res_label = get_preferred_label(res, preferred_language)
    this_results.append({'uri': res, 'pref_label': res_label})
  # NOTE: plain ordering of the labels, not locale-aware collation.
  sorted_results = sorted(this_results, key=lambda tup: tup['pref_label'])

  p = Paginator(sorted_results, 20, request=request)
  # PageNotAnInteger comes from Paginator.page(), not from reading the
  # query parameter, so this is the call that has to be guarded.
  try:
    paginated_results = p.page(request.GET.get('page', 1))
  except PageNotAnInteger:
    paginated_results = p.page(1)

  return render(request, 'thesaurus/index.html', {'results': paginated_results, 'target': 'instances', 'aspect':aspect })
コード例 #4
0
    def widget(cls, field, value, collation=None, **attributes):
        """
        Generates a SELECT tag, including OPTIONs (only 1 option allowed)

        The options are taken from the ``options()`` method of the field's
        first validator. When ``collation`` is given it is used as an ICU
        locale identifier and the options are ordered by their label
        according to that locale's collation.

        Raises SyntaxError when the field has no validator or its first
        validator has no ``options`` method.

        see also: `FormWidget.widget`
        """
        default = dict(value=value)
        attr = cls._attributes(field, default, **attributes)
        requires = field.requires
        if not isinstance(requires, (list, tuple)):
            requires = [requires]
        # An empty validator list used to fall through with ``options``
        # unbound; report it like a validator that offers no options.
        if not requires or not hasattr(requires[0], 'options'):
            raise SyntaxError('widget cannot determine options of %s' %
                              field)
        options = requires[0].options()

        if collation:
            coll = Collator.createInstance(Locale(collation))
            # Sort keys order like Collator.compare and, unlike the ``cmp``
            # argument of sorted(), are accepted by Python 2 and 3 alike.
            options = sorted(options, key=lambda opt: coll.getSortKey(opt[1]))

        opts = [OPTION(v, _value=k) for (k, v) in options]
        return SELECT(*opts, **attr)
コード例 #5
0
ファイル: pyicu.py プロジェクト: eveem/pythainlp
def _gen_words(text: str) -> str:
    """Yield the consecutive pieces of ``text`` between ICU word boundaries.

    The boundaries come from a word BreakIterator created for the "th"
    locale; every yielded item is the slice of ``text`` between two
    successive boundaries.
    """
    breaker = BreakIterator.createWordInstance(Locale("th"))
    breaker.setText(text)
    seg_start = breaker.first()
    for seg_end in breaker:
        segment = text[seg_start:seg_end]
        seg_start = seg_end
        yield segment
コード例 #6
0
    def __init__(self, *args, **kwargs):
        '''Initialize a unicode dictionary.

        At most one positional argument is accepted: a mapping or an iterable
        of (key, value) pairs whose entries are inserted.  Keyword arguments
        do not supply items; ``locale``, ``comparison_level`` and
        ``case_sensitive`` configure the comparison.  When the positional
        argument is an instance of this class, its settings are used for
        every option that is not passed explicitly.
        '''
        if len(args) > 1:
            raise TypeError('expected at most 1 arguments, got %d' % len(args))

        source = args[0] if args else None
        if args and isinstance(source, self.__class__):
            # Copying another instance: an explicit keyword wins, otherwise
            # the setting of the source instance is taken over.
            if 'locale' in kwargs:
                locale = kwargs.pop('locale')
            else:
                locale = source.locale
            if 'comparison_level' in kwargs:
                comparison_level = kwargs.pop('comparison_level')
            else:
                comparison_level = source.comparison_level
            if 'case_sensitive' in kwargs:
                case_sensitive = kwargs.pop('case_sensitive')
            else:
                case_sensitive = source.case_sensitive
        else:
            locale = kwargs.pop('locale','en_US')
            requested_level = kwargs.pop('comparison_level',0)
            # the strength is limited to the range 0..3 here
            comparison_level = max(0,min(3,requested_level))
            case_sensitive = kwargs.pop('case_sensitive', False)

        if case_sensitive:
            case_level = UCollAttributeValue.ON
        else:
            case_level = UCollAttributeValue.OFF
        self.__locale = Locale(locale)
        self.__collator = Collator.createInstance(self.__locale)
        self.__collator.setStrength(comparison_level)
        self.__collator.setAttribute(UCollAttribute.CASE_LEVEL, case_level)

        if args:
            pairs = list(source.items()) if isinstance(source,Mapping) else source
            for key,val in pairs:
                self.__setitem__(key,val)
コード例 #7
0
ファイル: dict2.py プロジェクト: enabling-languages/dinka
def sorted_(data, l, series=False, i=False):
    """Return a copy of dict ``data`` ordered by ICU collation.

    The collator is created for the language tag ``l``.  Items are ordered
    by the collation sort key of ``item[i]``; with the default ``i=False``
    that is index 0, i.e. the dict key.  ``series`` is not used here.  For
    ``data`` that is not a dict, None is returned.
    """
    collator = Collator.createInstance(Locale.forLanguageTag(l))
    if not isinstance(data, dict):
        return None

    def collation_key(item):
        return collator.getSortKey(item[i])

    return dict(sorted(data.items(), key=collation_key))
コード例 #8
0
def _localize_timezones(locale: babel.Locale) -> LocalizedTimezone:
    """Localize every known timezone for ``locale``.

    Each (zone, aliases) entry returned by _read_timezone_ids_and_aliases()
    is passed to _localize_timezone() together with ``locale`` and one
    shared ICU collator; the results are returned as a list.
    """
    zones_and_aliases = _read_timezone_ids_and_aliases()
    # locale.language: 'en' or 'en_US'
    icu_locale = Locale.createFromName(locale.language)
    collator = Collator.createInstance(icu_locale)
    localized = []
    for zone, aliases in zones_and_aliases.items():
        localized.append(_localize_timezone(zone, aliases, locale, collator))
    return localized
コード例 #9
0
ファイル: __init__.py プロジェクト: beavyHQ/flask-icu
 def default_locale(self):
     """Return the configured default locale as an `icu.Locale` object.

     The value is read from the ``ICU_DEFAULT_LOCALE`` entry of the
     application configuration; 'en' is used when that entry is None.
     """
     configured = self.app.config['ICU_DEFAULT_LOCALE']
     return Locale('en' if configured is None else configured)
コード例 #10
0
def sortkey_length(strength, word):
    """Return the length of the ICU sort key of ``word``.

    The key is produced by a collator created for ``Locale('')`` with the
    given ``strength`` and ALTERNATE_HANDLING set to SHIFTED.  The trailing
    \\x00 byte of the key is not counted.
    """
    collator = Collator.createInstance(Locale(''))
    collator.setStrength(strength)
    collator.setAttribute(UCollAttribute.ALTERNATE_HANDLING,
                          UCollAttributeValue.SHIFTED)
    # the key ends with a \x00 byte that is not part of the length
    return len(collator.getSortKey(word)) - 1
コード例 #11
0
    def __init__(self, locale="en"):
        """Create an ICU sentence BreakIterator for ``locale``.

        The ICU locale is stored in ``self.locale`` and the iterator in
        ``self.breaker``.
        """
        from icu import Locale, BreakIterator

        # For some languages ICU ships lists of common abbreviations that can
        # be used to ignore the false sentence boundaries they would cause
        # (http://userguide.icu-project.org/boundaryanalysis); the
        # "@ss=standard" keyword is appended for those languages.
        has_suppressions = locale in {"en", "de", "es", "it", "pt"}
        locale_id = (locale + "@ss=standard") if has_suppressions else locale
        self.locale = Locale(locale_id)
        self.breaker = BreakIterator.createSentenceInstance(self.locale)
コード例 #12
0
ファイル: alphabeticindex.py プロジェクト: tiansworld/gramps
 def primary_difference(prev_key, new_key, rlocale=glocale):
     """
     Tell whether two keys differ at the primary collation strength.

     The PyICU collator is created for ``rlocale.collation``, so that a
     report generated for another language uses the collation sequence of
     that language.
     """
     collator = Collator.createInstance(Locale(rlocale.collation))
     collator.setStrength(Collator.PRIMARY)
     return bool(collator.compare(prev_key, new_key))
コード例 #13
0
 def _compute_char_brkpoints(self):
     """
     Store the extended grapheme cluster boundaries of ``self.unsegmented``.

     ``self.char_brkpoints`` is set to 0 followed by every boundary reported
     by an ICU character BreakIterator created for the root locale.
     """
     boundary_iter = BreakIterator.createCharacterInstance(Locale.getRoot())
     boundary_iter.setText(self.unsegmented)
     self.char_brkpoints = [0]
     self.char_brkpoints.extend(boundary_iter)
コード例 #14
0
def sortkey(strength, maxlength=None):
    """Return a function that maps a string to its ICU sort key.

    The collator is created for ``Locale('')`` with the given ``strength``
    and ALTERNATE_HANDLING set to SHIFTED.  When ``maxlength`` is given, the
    returned function truncates every key to that many leading items.
    """
    collator = Collator.createInstance(Locale(''))
    collator.setStrength(strength)
    collator.setAttribute(UCollAttribute.ALTERNATE_HANDLING,
                          UCollAttributeValue.SHIFTED)
    if maxlength is not None:
        def truncated_key(x):
            return collator.getSortKey(x)[:maxlength]
        return truncated_key
    return collator.getSortKey
コード例 #15
0
class UnicodeStrFactory(object):
    """Callable that builds ``unicode_str_base`` subclasses bound to itself.

    The factory holds two ICU collators for its locale: one configured with
    the requested strength and case level, and a base one with strength 0 and
    case level off, which coll_len() uses.
    """

    def __init__(self,locale="EN_US",comparison_level=0,case_sensitive=False):
        # the strength is limited to the range 0..3
        comparison_level = max(0,min(3,comparison_level))

        self.__locale = Locale(locale)
        main_coll = Collator.createInstance(self.__locale)
        main_coll.setStrength(comparison_level)
        if case_sensitive:
            main_coll.setAttribute(UCollAttribute.CASE_LEVEL, UCollAttributeValue.ON)
        else:
            main_coll.setAttribute(UCollAttribute.CASE_LEVEL, UCollAttributeValue.OFF)
        self.__collator = main_coll

        if comparison_level == 0 and case_sensitive == False:
            # the main collator already has the base configuration: share it
            base_coll = main_coll
        else:
            base_coll = Collator.createInstance(self.__locale)
            base_coll.setStrength(0)
            base_coll.setAttribute(UCollAttribute.CASE_LEVEL, UCollAttributeValue.OFF)
        self.__base_coll = base_coll

    @property
    def locale(self):
        """Name of the ICU locale of this factory."""
        icu_locale = self.__locale
        return icu_locale.getName()

    @property
    def comparison_level(self):
        """Strength of the main collator."""
        main_coll = self.__collator
        return main_coll.getStrength()

    @property
    def case_sensitive(self):
        """True when the case level of the main collator is switched on."""
        case_level = self.__collator.getAttribute(UCollAttribute.CASE_LEVEL)
        return case_level == UCollAttributeValue.ON

    @property
    def collator(self):
        """The main collator."""
        return self.__collator

    def coll_len(self,string):
        """Length of the base-collator sort key of ``string``, minus one."""
        sort_key = self.__base_coll.getSortKey(string)
        return len(sort_key)-1

    # sentinel telling "no encoding passed" apart from any real value
    __marker = object()

    def __call__(self,obj,encoding=__marker, errors='strict'):
        """Build a string object from ``obj`` with a class bound to this factory.

        ``encoding`` and ``errors`` are forwarded only when an encoding is
        passed.
        """
        class unicode_str(unicode_str_base):
            _factory = self

        if encoding == self.__marker:
            return unicode_str(obj)
        return unicode_str(obj,encoding=encoding,errors=errors)

    def __reduce__(self):
        """Pickle support: constructor arguments plus the additional state.

        Attributes that a default-constructed factory also has are left out
        of the state dict.
        """
        default_names = vars(self.__class__())
        extra_state = dict(
            (name, value) for name, value in vars(self).items()
            if name not in default_names)
        ctor_args = (self.locale,self.comparison_level,self.case_sensitive)
        return (self.__class__, ctor_args, extra_state)

    def key_for_caching(self,word):
        """Return ``key_for_caching()`` of the string object built from ``word``."""
        wrapped = self(word)
        return wrapped.key_for_caching()
コード例 #16
0
ファイル: L10n.py プロジェクト: gempacked/fpdb
def get_installed_translations():
    """Return the installed fpdb translations with their display names.

    Returns a pair of dicts.  The first maps every language code (de, es,
    ...) for which an "fpdb" gettext catalogue is found under the "locale"
    directory to a display name; the second does the same for the
    language/country codes (de_DE, de_AT, ...).  Display names come from ICU
    when PyICU can be used; otherwise every code maps to itself.
    """
    import locale
    import gettext

    la_codes = set()  # a set, so that every language is listed once
    la_co_codes = []

    # .values() exists on Python 2 and 3, unlike .iteritems()
    for la_co in locale.windows_locale.values():
        if gettext.find("fpdb", localedir="locale", languages=[la_co]):
            # the part before the first "_", or the whole code without one
            la_codes.add(la_co.split("_", 1)[0])
            la_co_codes.append(la_co)

    try:
        from icu import Locale

        la_dict = {code: Locale.getDisplayName(Locale(code))
                   for code in la_codes}
        la_co_dict = {code: Locale.getDisplayName(Locale(code))
                      for code in la_co_codes}
    except Exception:
        # PyICU is missing or failed: fall back to the bare codes
        la_dict = {code: code for code in la_codes}
        la_co_dict = {code: code for code in la_co_codes}

    return la_dict, la_co_dict
コード例 #17
0
ファイル: l10n.py プロジェクト: gkkulik/kontext
def sort(iterable, loc, key=None, reverse=False):
    """
    Creates new sorted list from passed list (or any iterable data) according to the passed locale.

    arguments:
    iterable -- iterable object (typically a list or a tuple)
    loc -- locale identifier (e.g. cs_CZ.UTF-8, en_US,...)
    key -- access to sorted value
    reverse -- whether the result should be in reversed order (default is False)
    """
    collator = Collator.createInstance(Locale(loc))
    # ICU sort keys order like Collator.compare; sorting by them avoids the
    # ``cmp`` argument of sorted(), which Python 3 no longer accepts.
    if key is None:
        sort_key = collator.getSortKey
    else:
        def sort_key(item):
            return collator.getSortKey(key(item))
    return sorted(iterable, key=sort_key, reverse=reverse)
コード例 #18
0
def icu_format_message(locale_id: str,
                       message: str,
                       arguments: _MessageArguments = {}) -> str:
    """Format an ICU-style message with the given arguments.

    Variable substitution, plurals, selects and nested messages are
    available.  ``arguments`` must be a dict; its keys are passed as the
    argument names and its values, wrapped in ``Formattable``, as the
    argument values.

    Raises `ICUError` in case of incorrectly formatted message.
    """
    formatter = MessageFormat(message, Locale.createFromName(locale_id))
    names = list(arguments.keys())
    values = [Formattable(value) for value in arguments.values()]
    return formatter.format(names, values)
コード例 #19
0
ファイル: utils.py プロジェクト: jsbien/tolejniczak-pdfautils
def divideIntoWords(txt, locale):
    """Return the list of word boundaries ICU finds in ``txt``.

    The boundaries are produced by a word BreakIterator created for the
    locale named ``locale``, in the order the iterator reports them.
    """
    loc = Locale.createFromName(locale)
    bi = BreakIterator.createWordInstance(loc)
    bi.setText(txt)
    # Consume the iterator through the iteration protocol instead of calling
    # .next() by hand, which is the Python 2 spelling of that protocol.
    return list(bi)
コード例 #20
0
def get_installed_translations():
    """Return the installed fpdb translations with their display names.

    Returns a pair of dicts.  The first maps every language code (de, es,
    ...) for which an "fpdb" gettext catalogue is found under the "locale"
    directory to a display name; the second does the same for the
    language/country codes (de_DE, de_AT, ...).  Display names come from ICU
    when PyICU can be used; otherwise every code maps to itself.
    """
    import locale
    import gettext

    la_codes = set()  # a set, so that every language is listed once
    la_co_codes = []

    # .values() exists on Python 2 and 3, unlike .iteritems()
    for la_co in locale.windows_locale.values():
        if gettext.find("fpdb", localedir="locale", languages=[la_co]):
            # the part before the first "_", or the whole code without one
            la_codes.add(la_co.split("_", 1)[0])
            la_co_codes.append(la_co)

    try:
        from icu import Locale

        la_dict = {code: Locale.getDisplayName(Locale(code))
                   for code in la_codes}
        la_co_dict = {code: Locale.getDisplayName(Locale(code))
                      for code in la_co_codes}
    except Exception:
        # PyICU is missing or failed: fall back to the bare codes
        la_dict = {code: code for code in la_codes}
        la_co_dict = {code: code for code in la_co_codes}

    return la_dict, la_co_dict
コード例 #21
0
def sort_for_script(cp_list, script):
    """Return ``cp_list`` sorted by the collation of the script's language.

    The language comes from lang_for_script(script); when there is none, a
    message is printed and ``cp_list`` is returned as is.  With ICU
    available the order is that of an ICU collator for "<lang>_<script>",
    otherwise that of locale.strcoll.
    """
    # cmp_to_key keeps the comparison functions usable now that sorted()
    # has no ``cmp`` argument on Python 3; it also exists on Python 2.7.
    from functools import cmp_to_key

    lang = lang_for_script(script)
    if not lang:
        print('cannot sort for script, no lang for %s' % script)
        return cp_list
    if _HAVE_ICU:
        from icu import Locale, Collator
        loc = Locale(lang + '_' + script)
        col = Collator.createInstance(loc)
        compare = col.compare
    else:
        import locale
        compare = locale.strcoll
    return sorted(cp_list, key=cmp_to_key(compare))
コード例 #22
0
def main():
    """Run the ICU break iterator sample on a fixed two-sentence string.

    Sentence boundaries are printed forward and backward; word boundaries
    are printed forward, followed by the first element, the last element and
    the word at character position 10.  Both iterators use the US locale.
    """
    # Every print below takes one pre-built string, so the output is the
    # same under the Python 2 print statement and the Python 3 function.
    print("ICU Break Iterator Sample Program")
    print("C++ Break Iteration in Python")

    stringToExamine = u"Aaa bbb ccc. Ddd eee fff."
    print("Examining:  " + stringToExamine)

    # print each sentence in forward and reverse order
    boundary = BreakIterator.createSentenceInstance(Locale.getUS())
    boundary.setText(stringToExamine)

    print("")
    print("Sentence Boundaries... ")
    print("----- forward: -----------")
    printEachForward(boundary)
    print("----- backward: ----------")
    printEachBackward(boundary)

    # print each word in order
    print("")
    print("Word Boundaries...")
    boundary = BreakIterator.createWordInstance(Locale.getUS())
    boundary.setText(stringToExamine)
    print("----- forward: -----------")
    printEachForward(boundary)
    # print first element
    print("----- first: -------------")
    printFirst(boundary)
    # print last element
    print("----- last: --------------")
    printLast(boundary)
    # print word at charpos 10
    print("----- at pos 10: ---------")
    printAt(boundary, 10)

    print("")
    print("End C++ Break Iteration in Python")
class Language(object):
  """A language described by an ICU locale, a confidence and a byte count."""

  def __init__(self, choice):
    # choice unpacks to four items; the first one (a basic name) is not kept
    _, code, confidence, bytesize = choice
    self.locale = Locale(code)
    self.confidence = float(confidence)
    self.read_bytes = int(bytesize)

  @property
  def name(self):
    """Display language of the locale."""
    icu_locale = self.locale
    return icu_locale.getDisplayLanguage()

  @property
  def code(self):
    """Name of the locale."""
    icu_locale = self.locale
    return icu_locale.getName()

  def __str__(self):
    template = ("name: {:<12}code: {:<9}confidence: {:>5.1f} "
                "read bytes:{:>6}")
    return template.format(self.name, self.code,
                           self.confidence, self.read_bytes)

  @staticmethod
  def from_code(code):
    """Build a Language for ``code`` with confidence 100 and 0 read bytes."""
    choice = ("", code, 100, 0)
    return Language(choice)
コード例 #24
0
    def __init__(self,locale="EN_US",comparison_level=0,case_sensitive=False):
        """Set up the locale, the main collator and the base collator.

        The main collator gets the requested strength (limited to 0..3) and
        case level.  The base collator has strength 0 and case level off; it
        is the main collator itself when that one is configured this way.
        """
        comparison_level = max(0,min(3,comparison_level))

        self.__locale = Locale(locale)
        main_coll = Collator.createInstance(self.__locale)
        main_coll.setStrength(comparison_level)
        if case_sensitive:
            main_coll.setAttribute(UCollAttribute.CASE_LEVEL, UCollAttributeValue.ON)
        else:
            main_coll.setAttribute(UCollAttribute.CASE_LEVEL, UCollAttributeValue.OFF)
        self.__collator = main_coll

        if comparison_level == 0 and case_sensitive == False:
            # the main collator already has the base configuration: share it
            base_coll = main_coll
        else:
            base_coll = Collator.createInstance(self.__locale)
            base_coll.setStrength(0)
            base_coll.setAttribute(UCollAttribute.CASE_LEVEL, UCollAttributeValue.OFF)
        self.__base_coll = base_coll
コード例 #25
0
def worker(path, outdir, with_sorting=True):
    """Extract the distinct words of a text file and write them to ``outdir``.

    ``path`` names a YAML file; the text is read from the file with the same
    name and a ".txt" extension.  The lower-cased text is split on runs of
    non-word characters and digits, and the distinct pieces are written one
    per line to "extracted-words-for-<label>.txt", where the label is the
    ``label`` entry of the YAML file.  With ``with_sorting`` the words are
    ordered by an ICU collator for "pl_PL.UTF-8".

    Returns ``path``.
    """
    # raw string: \W and \d are regex escapes, not string escapes
    separator = re.compile(r"[\W\d]+")
    filepath = path.replace(".yml", ".txt")
    with open(filepath) as file:
        text = file.read().lower().rstrip()
    words = set(separator.split(text))
    with open(path) as file:
        meta = yaml.safe_load(file)
    if with_sorting:
        # the collator is only needed when sorting was requested
        collator = Collator.createInstance(Locale("pl_PL.UTF-8"))
        words = sorted(words, key=collator.getSortKey)
    # opened only once the content is ready, so a failure above does not
    # leave an empty output file behind
    with open(f"{outdir}/extracted-words-for-{meta['label']}.txt",
              "w") as file:
        file.write("\n".join(words))
    return path
コード例 #26
0
 def _compute_icu_segmented(self):
     """
     Compute the ICU word segmentation of the unsegmented line.

     ``self.unsegmented`` must have been computed already.
     ``self.icu_word_brkpoints`` is set to 0 followed by every boundary
     reported by an ICU word BreakIterator for the root locale, and
     ``self.icu_segmented`` to the pieces between successive boundaries,
     each one preceded and followed by "|".
     """
     boundary_iter = BreakIterator.createWordInstance(Locale.getRoot())
     boundary_iter.setText(self.unsegmented)
     self.icu_word_brkpoints = [0]
     self.icu_word_brkpoints.extend(boundary_iter)

     bounds = self.icu_word_brkpoints
     pieces = [self.unsegmented[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
     self.icu_segmented = "|" + "".join(piece + "|" for piece in pieces)
コード例 #27
0
ファイル: forms.py プロジェクト: msoftware/weblate
def sort_choices(choices):
    '''
    Sorts choices alphabetically by their second item.

    With ICU available the order is that of a collator for the language
    returned by get_language(); otherwise the items are compared directly.
    '''
    # Key functions replace the ``cmp`` builtin and the ``cmp`` argument of
    # sorted(), neither of which exists on Python 3.
    if HAS_ICU:
        collator = Collator.createInstance(Locale(get_language()))

        def sort_key(tup):
            return collator.getSortKey(tup[1])
    else:
        def sort_key(tup):
            return tup[1]

    # Actually sort values
    return sorted(choices, key=sort_key)
コード例 #28
0
def coverage(font, threshold=10):
    """Report which ICU locales the character map of ``font`` covers.

    For every available locale whose exemplar characters overlap the
    characters of ``font.getBestCmap()``, the locale is expanded with its
    likely subtags.  When no exemplar character is missing, its display
    script and display language are recorded; when at most ``threshold``
    characters are missing, they are recorded under the display language.

    Returns the tuple (scripts, languages, partial).
    """
    cmap = {chr(codepoint) for codepoint in font.getBestCmap()}

    languages = set()
    scripts = set()
    partial = {}

    for locale_id in Locale.getAvailableLocales():
        exemplars = set("".join(LocaleData(locale_id).getExemplarSet()))
        if cmap.isdisjoint(exemplars):
            continue
        expanded = Locale(locale_id)
        expanded.addLikelySubtags()
        missing = exemplars - cmap
        if not missing:
            scripts.add(expanded.getDisplayScript())
            languages.add(expanded.getDisplayLanguage())
        elif len(missing) <= threshold:
            partial[expanded.getDisplayLanguage()] = missing

    return scripts, languages, partial
コード例 #29
0
    def character_tokenize(self, word):
        """ Returns the tokenization in character level.

        The pieces are the slices of ``word`` between the successive
        boundaries reported by an ICU character BreakIterator.

        Arguments:
            word {string} -- word to be tokenized in character level.

        Returns:
            [list] -- list of characters.
        """
        breaker = BreakIterator.createCharacterInstance(Locale())
        breaker.setText(word)
        # slicing starts at offset 0, followed by every reported boundary
        bounds = [0]
        bounds.extend(breaker)
        return [word[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
コード例 #30
0
def icu_format_html_message(
    locale_id: str,
    message: str,
    arguments: _MessageArguments = {},
    tags: _TagMapping = {},
) -> str:
    """Format an ICU-style HTML message with the given arguments.

    Variable substitution, plurals, selects and nested messages are
    available, and HTML tag placeholders are replaced through
    ``restore_tags(message, tags)``.  Argument values that are ``str`` are
    passed through ``escape`` before formatting; other values are used
    unchanged.

    Raises `ICUError` in case of incorrectly formatted message.
    """
    pattern = restore_tags(message, tags)
    formatter = MessageFormat(pattern, Locale.createFromName(locale_id))
    names = list(arguments.keys())
    values = []
    for value in arguments.values():
        if isinstance(value, str):
            value = escape(value)
        values.append(Formattable(value))
    return formatter.format(names, values)
コード例 #31
0
ファイル: page_normalizer.py プロジェクト: impactcentre/pol
	def endElement(self, name):
		"""Forward the end of an element downstream.

		For a "Unicode" element the collected text is first split at ICU word
		boundaries, every piece is passed through processToken(), and the
		concatenated result is sent downstream as character data.
		"""
		if name == u"Unicode":
			self.__isUni = False
			# NOTE(review): "utf-8" names an encoding rather than a locale -
			# confirm which locale is intended here.
			loc = Locale.createFromName("utf-8")
			bi = BreakIterator.createWordInstance(loc)
			bi.setText(self.__uniText)
			tokens = []
			prev = 0
			# Iterate the BreakIterator directly instead of calling .next()
			# by hand, which is the Python 2 spelling of the protocol.
			for ind in bi:
				tokens.append(self.__uniText[prev:ind])
				prev = ind
			# join once instead of growing the string token by token
			text = u"".join(processToken(t) for t in tokens)
			self.__downstream.characters(text)
		self.__downstream.endElement(name)
コード例 #32
0
ファイル: page_normalizer.py プロジェクト: jsbien/pol
 def endElement(self, name):
     """Forward the end of an element downstream.

     For a "Unicode" element the collected text is first split at ICU word
     boundaries, every piece is passed through processToken(), and the
     concatenated result is sent downstream as character data.
     """
     if name == u"Unicode":
         self.__isUni = False
         # NOTE(review): "utf-8" names an encoding rather than a locale -
         # confirm which locale is intended here.
         loc = Locale.createFromName("utf-8")
         bi = BreakIterator.createWordInstance(loc)
         bi.setText(self.__uniText)
         tokens = []
         prev = 0
         # Iterate the BreakIterator directly instead of calling .next()
         # by hand, which is the Python 2 spelling of the protocol.
         for ind in bi:
             tokens.append(self.__uniText[prev:ind])
             prev = ind
         # join once instead of growing the string token by token
         text = u"".join(processToken(t) for t in tokens)
         self.__downstream.characters(text)
     self.__downstream.endElement(name)
コード例 #33
0
ファイル: __init__.py プロジェクト: beavyHQ/flask-icu
def get_locale():
    """Return the `icu.Locale` to use for the current request.

    Returns `None` if used outside of a request.  The locale is taken from
    the request context when already stored there.  Otherwise it is the
    result of the extension's locale selector function wrapped in a
    `Locale`, or the extension's default locale when there is no selector
    or the selector returns `None`; the outcome is stored on the context.
    """
    ctx = _request_ctx_stack.top
    if ctx is None:
        return None

    cached = getattr(ctx, 'icu_locale', None)
    if cached is not None:
        return cached

    icu = ctx.app.extensions['icu']
    selected = None
    if icu.locale_selector_func is not None:
        selected = icu.locale_selector_func()
    resolved = icu.default_locale if selected is None else Locale(selected)
    ctx.icu_locale = resolved
    return resolved
コード例 #34
0
ファイル: l10n.py プロジェクト: mzimandl/kontext
def sort(iterable, loc, key=None, reverse=False):
    """
    Build a new list with the items of the passed iterable sorted according to the passed locale.

    arguments:
    iterable -- iterable object (typically a list or a tuple)
    loc -- locale identifier (e.g. cs_CZ.UTF-8, en_US,...)
    key -- access to sorted value
    reverse -- whether the result should be in reversed order (default is False)

    raises LocalizationError when loc is empty
    """
    if not loc:
        raise LocalizationError(
            'cannot sort string due to missing locale information (probably a configuration issue)')
    collator = Collator.createInstance(Locale(loc))
    if key is None:
        compare = collator.compare
    else:
        # compare the extracted values rather than the items themselves
        def compare(left, right):
            return collator.compare(key(left), key(right))
    return sorted(iterable, key=cmp_to_key(compare), reverse=reverse)
コード例 #35
0
ファイル: check.py プロジェクト: claudegel/scheduler-card
def cross_validate(english_value,
                   other_language_value,
                   other_language,
                   key_name=None):
    """Compare a value of the English data with its counterpart.

    ``other_language`` is a path whose last component, without the ".js"
    part, names the other language.  A missing counterpart is reported with
    a printed warning, values of different types raise an Exception, and
    dicts are checked recursively entry by entry.
    """
    this_lang = other_language.split("/")[-1].split(".js")[0].replace("-", "_")
    this_lang = Locale(this_lang).getDisplayName(english_lang)
    if other_language_value is None:
        # key_name is interpolated rather than concatenated: it is None for
        # a call without a key name, and str + None raises TypeError
        print(
            "🟡 In" + Style.BRIGHT + Fore.YELLOW,
            f"{this_lang}" + Style.RESET_ALL,
            f"there is no value for {Fore.YELLOW}{key_name}{Fore.WHITE}.",
        )
    elif type(english_value) != type(other_language_value):
        raise Exception(
            f"The type of the English value ({english_value}) and the type of "
            +
            f"{this_lang}'s value ({other_language_value}) are different for key {key_name}."
        )
    elif isinstance(english_value, dict):
        for name, item in english_value.items():
            cross_validate(item, other_language_value.get(name),
                           other_language, name)
コード例 #36
0
ファイル: i18n.py プロジェクト: MarcSchmitzer/pylf
def make_collator(request):
    """Return an ICU collator for the locale named by ``request.locale_name``."""
    return Collator.createInstance(Locale.createFromName(request.locale_name))
コード例 #37
0
ファイル: gen_locales.py プロジェクト: pussbb/sxdevsite
# -*- coding: utf-8 -*-
"""

"""
import os
import json

from icu import Locale

# directory containing this script; locales.json is written next to it
BASE_PATH = os.path.dirname(os.path.abspath(__file__))

# one entry per locale available in ICU: its name and its display name
# rendered in the locale itself
locales = []
for locale in Locale.getAvailableLocales().values():
    locales.append({'locale': locale.getName(),
                    'name': locale.getDisplayName(locale)})

# the context manager closes the file, so the data is flushed to disk
with open(os.path.join(BASE_PATH, 'locales.json'), 'w') as locales_file:
    json.dump(locales, locales_file)
コード例 #38
0
ファイル: base.py プロジェクト: aboSamoor/polyglot
 def __init__(self, choice):
   """Store the ICU locale, confidence and byte count taken from ``choice``.

   ``choice`` unpacks to four items: a basic name (not kept), a locale
   code, a confidence and a byte size.
   """
   _, code, confidence, bytesize = choice
   self.locale = Locale(code)
   self.confidence = float(confidence)
   self.read_bytes = int(bytesize)
コード例 #39
0
class unicode_set(set):
    '''Set that supports unicode comparison as defined by ICU (UCA).

    String members are indexed by the sort key produced by an ICU collator,
    so strings the collator treats as equal share one slot.  Members that
    are not strings are indexed by themselves.

    It uses a dict as the underlying storage instead of the built-in set
    despite the performance difference, since a mapping from key to the
    original value has to be kept anyway.
    '''

    # Types whose instances are indexed through the collator.  ``basestring``
    # only exists on Python 2; fall back to ``str`` everywhere else.
    try:
        __string_types = (basestring,)  # noqa: F821
    except NameError:
        __string_types = (str,)

    def __init__(self, *args, **kwargs):
        '''Initialize a unicode set.

        At most one positional argument (an iterable of initial members) is
        accepted.  The keyword arguments ``locale``, ``comparison_level``
        and ``case_sensitive`` set the comparison details.  When the
        positional argument is itself a unicode_set, settings that are not
        given explicitly are copied from it.
        '''
        if len(args) > 1:
            raise TypeError('expected at most 1 arguments, got %d' % len(args))

        if len(args) == 1 and isinstance(args[0], self.__class__):
            source = args[0]
            locale = kwargs.pop('locale') if 'locale' in kwargs \
                else source.locale
            comparison_level = kwargs.pop('comparison_level') \
                if 'comparison_level' in kwargs else source.comparison_level
            case_sensitive = kwargs.pop('case_sensitive') \
                if 'case_sensitive' in kwargs else source.case_sensitive
        else:
            locale = kwargs.pop('locale', 'en_US')
            # Clamp the requested level into the range 0..3.
            comparison_level = max(0, min(3, kwargs.pop('comparison_level', 0)))
            case_sensitive = kwargs.pop('case_sensitive', False)
        self.__locale = Locale(locale)
        self.__collator = Collator.createInstance(self.__locale)
        self.__collator.setStrength(comparison_level)
        self.__collator.setAttribute(
            UCollAttribute.CASE_LEVEL,
            UCollAttributeValue.ON if case_sensitive else UCollAttributeValue.OFF)
        self.__values = {}  # set implementation: key -> original value
        if len(args) == 1:
            for val in args[0]:
                self.add(val)

    @property
    def locale(self):
        '''Name of the locale used by the collator.'''
        return self.__locale.getName()

    @property
    def comparison_level(self):
        '''Strength currently configured on the collator.'''
        return self.__collator.getStrength()

    @property
    def case_sensitive(self):
        '''True when the collator's CASE_LEVEL attribute is ON.'''
        return self.__collator.getAttribute(UCollAttribute.CASE_LEVEL) == UCollAttributeValue.ON

    def __in_key(self, key):
        '''Return the storage key: the sort key for strings, else the value.'''
        if isinstance(key, self.__string_types):
            return self.__collator.getSortKey(key)
        return key

    def __in_equality(self, other):
        '''Tell whether ``other`` uses the same comparison settings.'''
        return self.locale == other.locale and \
            self.comparison_level == other.comparison_level and \
            self.case_sensitive == other.case_sensitive

    def __check_operand(self, other):
        '''Raise TypeError unless ``other`` is a compatible unicode_set.'''
        if not isinstance(other, self.__class__):
            raise TypeError("can only compare to a unicode_set")

        if not self.__in_equality(other):
            raise TypeError("can only compare to a unicode_set with the same caracteristic")

    def add(self, val):
        '''Add an element to a set.

        If an element with the same key is already present, the stored
        value is replaced by ``val``.
        '''
        self.__values[self.__in_key(val)] = val

    def clear(self):
        '''Remove all elements from this set.
        '''
        self.__values.clear()

    def copy(self):
        '''Return a shallow copy of a set.
        '''
        return self.__class__(self)

    def difference(self, *args):
        '''Return the difference of two or more sets as a new set.

        (i.e. all elements that are in this set but not the others.)
        '''
        ret = self.__class__(self)
        ret.difference_update(*args)

        return ret

    def difference_update(self, *args):
        '''Remove all elements of the other sets from this set.

        Calling it without arguments is a no-op.
        '''
        for arg in args:
            if isinstance(arg, self.__class__) and self.__in_equality(arg):
                # Same settings: the keys already stored can be reused.  The
                # snapshot keeps ``s.difference_update(s)`` safe.
                doomed = list(arg.__values)
            else:
                doomed = [self.__in_key(item) for item in arg]
            for key in doomed:
                self.__values.pop(key, None)

    def discard(self, val):
        '''Remove an element from a set if it is a member.

        If the element is not a member, do nothing.
        '''
        self.__values.pop(self.__in_key(val), None)

    def intersection(self, *args):
        '''Return the intersection of two or more sets as a new set.

        (i.e. elements that are common to all of the sets.)
        '''
        ret = self.__class__(self)
        ret.intersection_update(*args)

        return ret

    def intersection_update(self, *args):
        '''Update a set with the intersection of itself and the others.

        Calling it without arguments is a no-op.
        '''
        for arg in args:
            if not (isinstance(arg, self.__class__) and self.__in_equality(arg)):
                # Re-index the other collection with this set's settings.
                arg = self.__class__(
                    arg,
                    locale=self.locale,
                    case_sensitive=self.case_sensitive,
                    comparison_level=self.comparison_level)
            # Iterate over a snapshot: entries are deleted inside the loop.
            for key, val in list(self.__values.items()):
                if val not in arg:
                    del self.__values[key]

    def isdisjoint(self, other):
        '''Return True if two sets have a null intersection.
        '''
        return len(self.intersection(other)) == 0

    def issubset(self, other):
        '''Report whether another set contains this set.
        '''
        return self.__class__(
            other,
            locale=self.locale,
            case_sensitive=self.case_sensitive,
            comparison_level=self.comparison_level).issuperset(self)

    def issuperset(self, other):
        '''Report whether this set contains another set.

        ``other`` is traversed only once, so one-shot iterators work.
        '''
        return all(item in self for item in other)

    def pop(self):
        '''Remove and return an arbitrary set element.
        Raises KeyError if the set is empty.
        '''
        return self.__values.popitem()[1]

    def remove(self, val):
        '''Remove an element from a set; it must be a member.

        If the element is not a member, raise a KeyError.
        '''
        del self.__values[self.__in_key(val)]

    def symmetric_difference(self, other):
        '''Return the symmetric difference of two sets as a new set.

        (i.e. all elements that are in exactly one of the sets.)
        '''
        if not isinstance(other, self.__class__):
            # ``other`` is read twice below; materialise one-shot iterables.
            other = list(other)
        ret = self.__class__(self)
        ret.update(other)
        ret.difference_update(self.intersection(other))

        return ret

    def symmetric_difference_update(self, other):
        '''Update a set with the symmetric difference of itself and another.
        '''
        if not isinstance(other, self.__class__):
            # ``other`` is read twice below; materialise one-shot iterables.
            other = list(other)
        bck = self.__class__(self)
        self.update(other)
        self.difference_update(bck.intersection(other))

    def union(self, *others):
        '''Return the union of sets as a new set.

        (i.e. all elements that are in either set.)
        '''
        ret = self.__class__(self)
        ret.update(*others)

        return ret

    def update(self, *others):
        '''Update a set with the union of itself and others.
        '''
        for other in others:
            if isinstance(other, self.__class__) and self.__in_equality(other):
                self.__values.update(other.__values)
            else:
                self.__values.update({self.__in_key(i): i for i in other})

    def __and__(self, other):
        '''x.__and__(y) <==> x&y
        '''
        self.__check_operand(other)
        return self.intersection(other)

    def __contains__(self, obj):
        '''x.__contains__(y) <==> y in x.
        '''
        return self.__in_key(obj) in self.__values

    def __eq__(self, other):
        '''x.__eq__(y) <==> x==y
        '''
        return isinstance(other, self.__class__) and self.__in_equality(other) \
            and set(self.__values.keys()) == set(other.__values.keys())

    def __ge__(self, other):
        '''x.__ge__(y) <==> x>=y
        '''
        self.__check_operand(other)
        return self.issuperset(other)

    def __gt__(self, other):
        '''x.__gt__(y) <==> x>y
        '''
        self.__check_operand(other)
        return self.issuperset(other) and self != other

    def __iand__(self, other):
        '''x.__iand__(y) <==> x&=y
        '''
        self.__check_operand(other)
        self.intersection_update(other)
        return self

    def __ior__(self, other):
        '''x.__ior__(y) <==> x|=y
        '''
        self.__check_operand(other)
        self.update(other)
        return self

    def __isub__(self, other):
        '''x.__isub__(y) <==> x-=y
        '''
        self.__check_operand(other)
        self.difference_update(other)
        return self

    def __iter__(self):
        '''x.__iter__() <==> iter(x)

        Iterates over the stored original values, not the internal keys.
        '''
        return iter(self.__values.values())

    def __ixor__(self, other):
        '''x.__ixor__(y) <==> x^=y
        '''
        self.__check_operand(other)
        self.symmetric_difference_update(other)
        return self

    def __le__(self, other):
        '''x.__le__(y) <==> x<=y
        '''
        self.__check_operand(other)
        return self.issubset(other)

    def __len__(self):
        '''x.__len__() <==> len(x)
        '''
        return len(self.__values)

    def __lt__(self, other):
        '''x.__lt__(y) <==> x<y
        '''
        self.__check_operand(other)
        return self.issubset(other) and self != other

    def __ne__(self, other):
        '''x.__ne__(y) <==> x!=y
        '''
        return not self == other

    def __or__(self, other):
        '''x.__or__(y) <==> x|y
        '''
        self.__check_operand(other)
        return self.union(other)

    def __rand__(self, other):
        '''x.__rand__(y) <==> y&x
        '''
        self.__check_operand(other)
        return other & self

    def __repr__(self, _repr_running={}):
        '''x.__repr__() <==> repr(x)

        ``_repr_running`` is intentionally a shared mutable default: it
        records the (object id, thread id) pairs currently being rendered
        so that a recursive call prints '...' instead of looping.
        '''
        call_key = id(self), _get_ident()
        if call_key in _repr_running:
            return '...'
        _repr_running[call_key] = 1
        try:
            if not self:
                return '%s()' % (self.__class__.__name__,)
            # list() keeps the output a plain list on Python 3 as well.
            return '%s(%r)' % (self.__class__.__name__,
                               list(self.__values.values()))
        finally:
            del _repr_running[call_key]

    def __ror__(self, other):
        '''x.__ror__(y) <==> y|x
        '''
        self.__check_operand(other)
        return other | self

    def __rsub__(self, other):
        '''x.__rsub__(y) <==> y-x
        '''
        self.__check_operand(other)
        return other - self

    def __rxor__(self, other):
        '''x.__rxor__(y) <==> y^x
        '''
        self.__check_operand(other)
        return other ^ self

    def __sizeof__(self):
        '''S.__sizeof__() -> size reported by the underlying storage dict
        '''
        return self.__values.__sizeof__()

    def __sub__(self, other):
        '''x.__sub__(y) <==> x-y
        '''
        self.__check_operand(other)
        return self.difference(other)

    def __xor__(self, other):
        '''x.__xor__(y) <==> x^y
        '''
        self.__check_operand(other)
        return self.symmetric_difference(other)

    def __reduce__(self):
        '''Pickle support: rebuild through ``unicode_set_from_data``.

        The instance attributes that a freshly built unicode_set also has
        are dropped; the comparison settings are passed along instead.
        '''
        inst_dict = vars(self).copy()
        for k in vars(unicode_set()):
            inst_dict.pop(k, None)
        inst_dict.update({
            'locale': self.locale,
            'comparison_level': self.comparison_level,
            'case_sensitive': self.case_sensitive
            })
        # A real list is picklable on Python 3; a dict view is not.
        return (unicode_set_from_data,
                ([list(self.__values.values())], inst_dict))
コード例 #40
0
ファイル: wearnowlocale.py プロジェクト: bmcage/wearnow
    def __init__(self, localedir=None, lang=None, domain=None, languages=None):
        """
        Init a WearNowLocale. Run __init_first_instance() to set up the
        environment if this is the first run. Return __first_instance
        otherwise if called without arguments.

        :param localedir: directory holding the translations; used only
                          when the path exists.
        :param lang: value stored in ``self.lang``.
        :param domain: translation domain; defaults to 'wearnow'.
        :param languages: colon-separated language list; each entry is
                          filtered through check_available_translations().
        """
        global _hdlr
        #initialized is special, used only for the "first instance",
        #and created by __new__(). It's used to prevent re-__init__ing
        #__first_instance when __new__() returns its pointer.
        if hasattr(self, 'initialized') and self.initialized:
            return
        _first = self._WearNowLocale__first_instance
        self.localedir = None
        # Everything breaks without localedir, so get that set up
        # first.  Warnings are logged in _init_first_instance or
        # _init_secondary_locale if this comes up empty.
        if localedir and os.path.exists(os.path.abspath(localedir)):
            self.localedir = localedir
        elif (_first and hasattr(_first, 'localedir') and _first.localedir and
              os.path.exists(os.path.abspath(_first.localedir))):
            self.localedir = _first.localedir
        else:
            # LOG.warning, not the deprecated LOG.warn alias, to match the
            # other log calls in this method.
            LOG.warning('Missing or invalid localedir %s; no translations will be available.', repr(localedir))

        self.lang = lang
        self.localedomain = domain or 'wearnow'
        if languages:
            self.language = [x for x in [self.check_available_translations(l)
                                         for l in languages.split(":")]
                             if x]
        else:
            self.language = None

        if self == _first:
            self._WearNowLocale__init_first_instance()
        else:
            self._init_secondary_locale()

        self.icu_locales = {}
        self.collator = None
        if HAVE_ICU:
            self.icu_locales["default"] = Locale.createFromName(self.lang)
            if self.collation and self.collation != self.lang:
                self.icu_locales["collation"] = Locale.createFromName(self.collation)
            else:
                self.icu_locales["collation"] = self.icu_locales["default"]
            try:
                self.collator = Collator.createInstance(self.icu_locales["collation"])
            except ICUError as err:
                LOG.warning("Unable to create collator: %s", str(err))
                self.collator = None

        try:
            self.translation = self._get_translation(self.localedomain,
                                                     self.localedir,
                                                     self.language)
        except ValueError:
            # self.language may still be None here; joining it directly
            # would replace the warning with a TypeError.
            LOG.warning("Unable to find translation for languages in %s, using US English", ':'.join(self.language or ()))
            self.translation = WearNowNullTranslations()
            self.translation._language = "en"

        if _hdlr:
            LOG.removeHandler(_hdlr)
            _hdlr = None
        self._dd = self._dp = None
        #Guards against running twice on the first instance.
        self.initialized = True
コード例 #41
0
ファイル: wearnowlocale.py プロジェクト: bmcage/wearnow
# LOG.setLevel(logging.DEBUG)
# Prefer the modern ``icu`` module name and fall back to the legacy
# ``PyICU`` name; HAVE_ICU records whether either import succeeded.
try:
    from icu import Locale, Collator
    HAVE_ICU = True
except ImportError:
    try:
        from PyICU import Locale, Collator
        HAVE_ICU = True
    except ImportError as err:
        # Bind the flag on the failure path too, so the ``if HAVE_ICU``
        # check below can never hit an undefined name.
        HAVE_ICU = False
        # No logger, save the warning message for later.
        _icu_err = ("ICU not loaded because %s. Localization will be impaired. "
                    "Use your package manager to install PyICU" % str(err))

# Locales known to ICU, or None when ICU is unavailable.
ICU_LOCALES = None
if HAVE_ICU:
    ICU_LOCALES = Locale.getAvailableLocales()

# Map of languages for converting to Microsoft locales and naming
# locales for display to the user.  It's important to add to this list
# when a new translation is added.  Note the dummy _(): That's just to
# get xgettext to include the string in wearnow.pot; actual translation
# is done in _get_language_string() below.
# (The wearnow officially-supported language list is ALL_LINGUAS in setup.py)
_ = lambda x: x
_LOCALE_NAMES = {
    'ar': ('Arabic_Saudi Arabia', '1256', _("Arabic")),
    'bg': ('Bulgrian_Bulgaria', '1251', _("Bulgarian")),
    'br': (None, None, _("Breton")), #Windows has no translation for Breton
    'ca': ('Catalan_Spain', '1252', _("Catalan")),
    'cs': ('Czech_Czech Republic', '1250', _("Czech")),
    'da': ('Danish_Denmark', '1252', _("Danish")),
コード例 #42
0
ファイル: unicode_29.py プロジェクト: myaser/DAPOS
 def __init__(self):
     """Create the ICU word break iterator for the 'ar' locale."""
     arabic_locale = Locale.createFromName('ar')
     self.BreakIterator = BreakIterator.createWordInstance(arabic_locale)
コード例 #43
0
class unicode_dict(dict):
    '''Dictionary that supports unicode comparison as defined by ICU (UCA).

    String keys are indexed by the sort key produced by an ICU collator;
    other keys are indexed by themselves.  Each slot of the underlying dict
    holds an ``(original_key, value)`` pair so the original key can be
    returned when iterating.
    '''

    def __init__(self, *args, **kwargs):
        '''Initialize a unicode dictionary.

        At most one positional argument (a mapping or an iterable of
        key/value pairs) is accepted.  The keyword arguments ``locale``,
        ``comparison_level`` and ``case_sensitive`` set the comparison
        details.  When the positional argument is itself a unicode_dict,
        settings that are not given explicitly are copied from it.
        '''
        if len(args) > 1:
            raise TypeError('expected at most 1 arguments, got %d' % len(args))

        if len(args) == 1 and isinstance(args[0], self.__class__):
            source = args[0]
            locale = kwargs.pop('locale') if 'locale' in kwargs \
                else source.locale
            comparison_level = kwargs.pop('comparison_level') \
                if 'comparison_level' in kwargs else source.comparison_level
            case_sensitive = kwargs.pop('case_sensitive') \
                if 'case_sensitive' in kwargs else source.case_sensitive
        else:
            locale = kwargs.pop('locale', 'en_US')
            # Clamp the requested level into the range 0..3.
            comparison_level = max(0, min(3, kwargs.pop('comparison_level', 0)))
            case_sensitive = kwargs.pop('case_sensitive', False)
        self.__locale = Locale(locale)
        self.__collator = Collator.createInstance(self.__locale)
        self.__collator.setStrength(comparison_level)
        self.__collator.setAttribute(
            UCollAttribute.CASE_LEVEL,
            UCollAttributeValue.ON if case_sensitive else UCollAttributeValue.OFF)
        if len(args) == 1:
            if isinstance(args[0], Mapping):
                vals = list(args[0].items())
            else:
                vals = args[0]
            for key, val in vals:
                self.__setitem__(key, val)

    @property
    def locale(self):
        '''Name of the locale used by the collator.'''
        return self.__locale.getName()

    @property
    def comparison_level(self):
        '''Strength currently configured on the collator.'''
        return self.__collator.getStrength()

    @property
    def case_sensitive(self):
        '''True when the collator's CASE_LEVEL attribute is ON.'''
        return self.__collator.getAttribute(UCollAttribute.CASE_LEVEL) == UCollAttributeValue.ON

    def __in_key(self, key):
        '''Return the storage key: the sort key for strings, else the key.'''
        return self.__collator.getSortKey(key) if isinstance(key, str) else key

    def __setitem__(self, key, value):
        '''Store ``value`` under ``key``, keeping the original key alongside.'''
        super(unicode_dict, self).__setitem__(self.__in_key(key), (key, value))

    def __getitem__(self, key):
        '''Return the value for ``key``; KeyError carries the caller's key.'''
        try:
            return super(unicode_dict, self).__getitem__(self.__in_key(key))[1]
        except KeyError:
            raise KeyError(key)

    def get(self, key, default=None):
        '''Return the value for ``key``, or ``default`` when it is missing.'''
        try:
            return super(unicode_dict, self).__getitem__(self.__in_key(key))[1]
        except KeyError:
            return default

    def __delitem__(self, key):
        '''Delete ``key``; KeyError carries the caller's key.'''
        try:
            super(unicode_dict, self).__delitem__(self.__in_key(key))
        except KeyError:
            raise KeyError(key)

    def __iter__(self):
        '''Iterate over the original keys.'''
        for original_key, _ in super(unicode_dict, self).values():
            yield original_key

    def __contains__(self, key):
        '''Tell whether ``key`` is present.'''
        return super(unicode_dict, self).__contains__(self.__in_key(key))

    def clear(self):
        '''Remove every entry.'''
        super(unicode_dict, self).clear()

    def keys(self):
        '''Return the original keys as a list.'''
        return list(self)

    def values(self):
        '''Return the values as a list.'''
        return [value for _, value in super(unicode_dict, self).values()]

    def items(self):
        '''Return the ``(original_key, value)`` pairs as a list.'''
        return list(super(unicode_dict, self).values())

    def iterkeys(self):
        '''Iterate over the original keys.'''
        return iter(self)

    def itervalues(self):
        '''Iterate over the values.'''
        for _, value in super(unicode_dict, self).values():
            yield value

    def iteritems(self):
        '''Iterate over the ``(original_key, value)`` pairs.'''
        for pair in super(unicode_dict, self).values():
            yield pair

    def update(self, *args, **kwargs):
        '''Merge a mapping or iterable of pairs, then the keyword arguments.

        Like ``dict.update`` the positional argument is optional.
        '''
        if len(args) > 1:
            raise TypeError('expected at most 1 arguments, got %d' % len(args))

        if args:
            if isinstance(args[0], Mapping):
                vals = list(args[0].items())
            else:
                vals = args[0]
            for key, val in vals:
                self.__setitem__(key, val)

        # Iterate over the items: iterating the dict itself yields only keys.
        for key, val in kwargs.items():
            self.__setitem__(key, val)

    # Sentinel telling "no default supplied" apart from ``default=None``.
    __marker = object()

    def pop(self, key, default=__marker):
        '''Remove ``key`` and return its value.

        Returns ``default`` when the key is missing and a default was
        given; raises KeyError otherwise.
        '''
        if key in self:
            value = self[key]
            del self[key]
            return value
        if default is self.__marker:
            raise KeyError(key)
        return default

    def setdefault(self, key, default=None):
        '''Return the value for ``key``, inserting ``default`` if missing.'''
        if key in self:
            return self[key]
        self[key] = default
        return default

    def popitem(self):
        '''Remove and return an ``(original_key, value)`` pair.'''
        _, pair = super(unicode_dict, self).popitem()
        return pair

    def __repr__(self, _repr_running={}):
        '''Return ``unicode_dict({...})``.

        ``_repr_running`` is intentionally a shared mutable default: it
        records the (object id, thread id) pairs currently being rendered
        so that a recursive call prints '...' instead of looping.
        '''
        call_key = id(self), _get_ident()
        if call_key in _repr_running:
            return '...'
        _repr_running[call_key] = 1
        try:
            if not self:
                return '%s()' % (self.__class__.__name__,)
            return '%s(%r)' % (self.__class__.__name__, dict(list(self.items())))
        finally:
            del _repr_running[call_key]

    def __reduce__(self):
        '''Pickle support: rebuild through ``unicode_dict_from_data``.

        The instance attributes that a freshly built unicode_dict also has
        are dropped; the comparison settings are passed along instead.
        '''
        items = list(self.items())
        inst_dict = vars(self).copy()
        for k in vars(unicode_dict()):
            inst_dict.pop(k, None)
        inst_dict.update({
            'locale': self.locale,
            'comparison_level': self.comparison_level,
            'case_sensitive': self.case_sensitive
            })
        return (unicode_dict_from_data, ([items], inst_dict))

    def copy(self):
        '''Return a shallow copy with the same comparison settings.'''
        return self.__class__(self)

    def __eq__(self, other):
        ''' Two unicode_dict are equal only if have all keys equal and the matching val is equal
        unicode_dict are equal only with themselves

        Keys are matched with the dictionaries' own (identical) comparison
        settings rather than through a default-configured collation.
        '''
        if not isinstance(other, self.__class__):
            return False
        return self.locale == other.locale and self.comparison_level == other.comparison_level and \
            self.case_sensitive == other.case_sensitive and \
            len(self) == len(other) and \
            all(k in other and self[k] == other[k] for k in self)

    def __ne__(self, other):
        '''Inverse of ``__eq__``.'''
        return not self == other