コード例 #1
0
def make_alphabetic(hits, processname, sortnames=False, lang="sv"):
    """
    Group processed hits by first letter and sort everything alphabetically.

    Every object in ``hits`` is passed to ``processname(hit, results)``, which
    should append zero or more ``(first_letter, result)`` pairs to ``results``.
    ``first_letter`` is the letter the entry is filed under, and ``result`` is
    what the html-template wants, e.g. a pair of ``(name, no_hits)``.

    Args:
        hits: Iterable of objects to process.
        processname: Callable ``(hit, results)`` appending to ``results``.
        sortnames: If true, ``result[0]`` is sorted as a name: a leading
            particle from ``VONAV_LIST`` is dropped, inner spaces are replaced
            by "z", and ``result[1]`` is appended as a secondary key.
        lang: ``"en"`` selects English collation and files Ö under O and
            Ä/Å under A; any other value uses Swedish collation.

    Returns:
        A list of ``(first_letter, results)`` tuples in collation order; each
        ``results`` list is sorted as well.
    """
    results = []
    for hit in hits:
        processname(hit, results)

    # Letters that are always filed under another letter.
    letter_aliases = {"Ø": "Ö", "Æ": "Ä", "Ü": "Y"}
    # Extra folding for English listings. An exact-match table is used so that
    # an empty first letter is not silently filed under "A".
    english_aliases = {"Ö": "O", "Ä": "A", "Å": "A"}

    # Split the result into start letters
    letter_results = {}
    for first_letter, result in results:
        first_letter = letter_aliases.get(first_letter, first_letter)
        if lang == "en":
            first_letter = english_aliases.get(first_letter, first_letter)
        letter_results.setdefault(first_letter, []).append(result)

    locale_name = "en_EN.UTF-8" if lang == "en" else "sv_SE.UTF-8"
    collator = icu.Collator.createInstance(icu.Locale(locale_name))

    if sortnames:
        # Compiled once here rather than once per sort-key computation.
        vonaf_pattern = re.compile(r"^(%s) " % "|".join(VONAV_LIST))

        def sort_key(item):
            lastname = vonaf_pattern.sub("", item[0]).replace(" ", "z")
            return collator.getSortKey(lastname + " " + item[1])
    else:
        def sort_key(item):
            return collator.getSortKey(item[0])

    for items in letter_results.values():
        items.sort(key=sort_key)

    # Sort result dictionary alphabetically into list
    return sorted(letter_results.items(),
                  key=lambda pair: collator.getSortKey(pair[0]))
コード例 #2
0
def make_alphabetic(hits, processname, sortnames=False, lang="sv"):
    """ Collect processed hits, bucket them by first letter and sort them.

        ``processname(hit, results)`` is called for every hit and should
        append zero or more ``(first_letter, result)`` pairs to the list it
        is given. ``first_letter`` is the letter to file the entry under, and
        ``result`` is what the html-template wants, e.g. ``(name, no_hits)``.

        Returns a list of ``(first_letter, results)`` tuples in collation
        order (English for ``lang == "en"``, Swedish otherwise); each
        ``results`` list is sorted too.
    """
    def name_sort_text(entry):
        # Drop a leading "von "/"af " and replace the remaining spaces by "z"
        # before appending the secondary key.
        stripped = re.sub(r"(^von )|(^af )", r"", entry[0])
        return stripped.replace(" ", "z") + " " + entry[1]

    collected = []
    for hit in hits:
        processname(hit, collected)

    english = lang == "en"

    # Split the result into start letters
    buckets = {}
    for letter, entry in collected:
        if letter == u'Ø':
            letter = u'Ö'
        elif letter == u'Æ':
            letter = u'Ä'
        elif letter == u'Ü':
            letter = u'Y'
        if english:
            if letter == u"Ö":
                letter = u"O"
            elif letter in u"ÄÅ":
                letter = u"A"
        buckets.setdefault(letter, []).append(entry)

    # Sort result dictionary alphabetically into list
    locale_name = 'en_EN.UTF-8' if english else 'sv_SE.UTF-8'
    collator = icu.Collator.createInstance(icu.Locale(locale_name))

    if sortnames:
        def entry_key(entry):
            return collator.getSortKey(name_sort_text(entry))
    else:
        def entry_key(entry):
            return collator.getSortKey(entry[0])

    for entries in buckets.values():
        entries.sort(key=entry_key)

    def bucket_key(bucket):
        return collator.getSortKey(bucket[0])

    return sorted(buckets.items(), key=bucket_key)
コード例 #3
0
    def __binaryFind(self, what):
        """Return the index of the first hint that collates after ``what``.

        The hints are probed with a binary search using a Polish-locale
        collator on ``anyHint(hint)``. If no hint collates after ``what`` the
        result is ``len(self.__hints)``; for an empty hint list it is 0.

        NOTE(review): the search presumes ``self.__hints`` is already ordered
        by the same collation -- confirm against the code that fills it.
        """
        log.log("HintRegisterBrowser.__binaryFind", [what], 0)
        collator = icu.Collator.createInstance(icu.Locale('pl_PL.UTF-8'))

        def sorts_after(index):
            # True when the hint at ``index`` collates strictly after ``what``.
            return collator.compare(anyHint(self.__hints[index]), what) > 0

        def search(left, right):
            # ``left`` and ``right`` are inclusive bounds of the searched range.
            if left == right:
                return left if sorts_after(left) else left + 1
            if left + 1 == right:
                if sorts_after(left):
                    return left
                return right if sorts_after(right) else right + 1
            center = left + (right - left) // 2
            if sorts_after(center):
                return search(left, center - 1)
            return search(center + 1, right)

        count = len(self.__hints)
        # With no hints there is nothing to probe (the recursive search would
        # index an empty list), so the position is 0.
        res = search(0, count - 1) if count else 0
        log.log("HintRegisterBrowser.__binaryFind return", [res], 1)
        return res
コード例 #4
0
 def _testMissorderedTags(self):
     """Record every tag whose values are not in French collation order.

     Each offending tag name is appended to ``self.missorderedTag`` and counted
     in ``self.missorderedTagsCounter``. If the counter is positive afterwards,
     one MISS_ORDERED_TAGS error is registered.
     """
     # For accented char sorting
     collator = icu.Collator.createInstance(icu.Locale('fr_FR.UTF-8'))

     def misordered(names, reference):
         # True when the collated order of ``names`` differs from ``reference``.
         return sorted(removeSpecialCharFromArray(names),
                       key=collator.getSortKey) != removeSpecialCharFromArray(
                           reference)

     def flag(tag):
         self.missorderedTag.append(tag)
         self.missorderedTagsCounter += 1

     if misordered(self.track.artists, self.track.artists):
         flag('Artists')
     # Without a remix, the artists are also checked against the file name.
     if self.track.remix == '' and misordered(
             self.track.artists, self.track.fileNameList[4].split(', ')):
         flag('Artists')
     if misordered(self.track.performers, self.track.performers):
         flag('Performers')
     if misordered(self.track.feat, self.track.feat):
         flag('Featuring')
     if misordered(self.track.remix, self.track.remix):
         flag('Remixer')

     if self.missorderedTagsCounter > 0:
         self.errorCounter += 1
         self.errors.append(ErrorEnum.MISS_ORDERED_TAGS)
コード例 #5
0
	def __init__(self, *args, **kwargs):
		"""Initialise the base RegisterBrowser and this class's private state."""
		RegisterBrowser.__init__(self, *args, **kwargs)
		# Collator for the Polish locale, created once per instance.
		self.__collator = icu.Collator.createInstance(icu.Locale('pl_PL.UTF-8'))
		self.__binaryType = None
		self.__localVeto = False
		self.__limit = 100
		self.__dBController = None
コード例 #6
0
    def character_tokenize(self, word):
        """ Returns the tokenization in character level.

        The word is split at the boundaries reported by ICU's character
        break iterator for the default locale.

        Arguments:
            word {string} -- word to be tokenized in character level.

        Returns:
            [list] -- list of characters.

        Raises:
            ImportError -- if PyICU is not installed.
        """
        try:
            import icu
        except ImportError:
            # Fail here with a clear message; continuing would only hit an
            # unbound ``icu`` name on the next statement.
            raise ImportError("please install PyICU")

        boundaries = icu.BreakIterator.createCharacterInstance(icu.Locale())
        boundaries.setText(word)
        # NOTE(review): ICU reports offsets in UTF-16 code units; presumably
        # fine for BMP-only text -- confirm for input with astral characters.
        chars = []
        start = 0
        for end in boundaries:
            chars.append(word[start:end])
            start = end

        return chars
コード例 #7
0
def jwOnSortedFunction(s1,
                       s2,
                       collator=icu.Collator.createInstance(
                           icu.Locale('de_DE.UTF-8'))):
    """Return the Jaro-Winkler distance of the two strings after sorting.

    The characters of each string are put into the collator's order before
    the distance is computed. The default collator is built once, when the
    function is defined.
    """
    def collated(text):
        # Characters of ``text`` rearranged into collation order.
        return ''.join(sorted(text, key=collator.getSortKey))

    first = collated(s1)
    second = collated(s2)
    return jw_distance.get_jaro_distance(first, second, winkler=True)
コード例 #8
0
ファイル: pyicu.py プロジェクト: phattharachon/sentiment_api
def gen_words(text):
    """Yield consecutive pieces of ``text`` cut at ICU word boundaries.

    The boundaries come from a word break iterator for the "th" locale.
    """
    breaker = icu.BreakIterator.createWordInstance(icu.Locale("th"))
    breaker.setText(text)
    begin = breaker.first()
    for boundary in breaker:
        piece = text[begin:boundary]
        begin = boundary
        yield piece
コード例 #9
0
 def findWord(self, key):
     """Look up ``key``, falling back to a following entry.

     ``self.__exact`` is reset to False and only set when an exact entry is
     found (at level 0 directly, otherwise as reported by the sub-dictionary).
     When there is no exact entry, the keys are scanned in their stored order
     and the first one that collates after the searched value (Polish locale)
     supplies the result.

     NOTE(review): the fallback presumes ``self.__keys`` is kept in collation
     order -- confirm against the code that maintains it.

     Returns the found value, or None when nothing qualifies.
     """
     self.__exact = False
     if self._level == 0:
         res = self._dict.get(key)
         if res is not None:
             self.__exact = True
             return res
         # The collator is only needed for the fallback scan, so it is not
         # built when the exact lookup succeeds.
         collator = icu.Collator.createInstance(icu.Locale('pl_PL.UTF-8'))
         for k in self.__keys:
             if collator.compare(k, key) > 0:
                 return self._dict.get(k)
         return None

     h = self._at(key, 0)
     t = key[1:] if h != u'' else u''
     subdict = self._dict.get(h)
     if subdict is not None:
         res = subdict.findWord(t)
         if res is not None:
             self.__exact = subdict.exact()
             return res
     collator = icu.Collator.createInstance(icu.Locale('pl_PL.UTF-8'))
     for k in self.__keys:
         if collator.compare(k, h) > 0:
             # Ask each later sub-dictionary for its first entry.
             res = self._dict.get(k).findWord(u'')
             if res is not None:
                 return res
     return None
コード例 #10
0
 def updateSheetOrder(self):
     """Write the group's user names, sorted for German, into column B.

     The names of ``self.fetch_group.users`` are sorted with a German-locale
     collator and written from cell B2 downwards on the first worksheet of
     ``self.sheet_order``. The remaining cells of the fetched range are
     blanked, and the range is sent back in one batch update.

     Any exception is reported through ``self.sendMsg`` to
     ``self.fetch_error`` instead of being raised.
     """
     try:
         users = [str(user.name) for user in self.fetch_group.users]
         sheet = self.client.open(self.sheet_order).sheet1
         collator = icu.Collator.createInstance(icu.Locale('de_DE.UTF-8'))
         users.sort(key=collator.getSortKey)
         print(users)
         cell_list = sheet.range('B2:B' + str(len(users) + 100))
         # Names fill the range from the top; once they run out the
         # remaining cells are cleared.
         names = iter(users)
         for cell in cell_list:
             cell.value = next(names, '')
         # Update in batch
         sheet.update_cells(cell_list)
     except Exception as e:
         # str(e) is required: concatenating the exception object itself
         # raises TypeError and would hide the original error.
         self.sendMsg(self.fetch_error,
                      msg='update order error: ' + str(e),
                      rich=False,
                      typing=False)
コード例 #11
0
def replace_gerund():
    """Rewrite the two lexicon files with auxiliary-verb suffixes removed.

    For "positive_n2" and "negative_n2" in the lexicon folder, every line is
    stripped, the " et..."/" yap..." patterns followed by a "#" tag are
    removed, and "#n" is appended to entries left without a "#". The entries
    are de-duplicated, sorted with a Turkish-locale collator, and written to
    "<name>_n4.txt" in the same folder.
    """
    import icu

    bounfolder = "/home/dicle/git/serdoo-servis2/django_docker/learning/_lexicons/tr_sentiment_boun"
    names = ["positive_n2", "negative_n2"]
    # One collator serves both files.
    collator = icu.Collator.createInstance(icu.Locale('tr_TR.UTF-8'))

    for n in names:
        # ``with`` closes the handles, which the bare open() calls never did.
        with open(os.path.join(bounfolder, n + ".txt"), "r") as infile:
            boun = [w.strip() for w in infile]

        _newlist = [
            re.sub(r"\set(tir)?(me)?\#[nvpb]", "", w).strip() for w in boun
        ]
        _newlist = [
            re.sub(r"\syap(tır)?(ma)?\#[nvpb]", "", w).strip() for w in _newlist
        ]

        # Entries that lost (or never had) a tag get "#n"; the set removes
        # duplicates.
        newlist = list({w if "#" in w else w + "#n" for w in _newlist})
        newlist.sort(key=collator.getSortKey)

        with open(os.path.join(bounfolder, n + "_n4.txt"), "w") as outfile:
            outfile.write("\n".join(newlist))
コード例 #12
0
def gen_words(text):
    """Generate the segments of ``text`` between successive word boundaries.

    Boundaries are produced by an ICU word break iterator for locale "th".
    """
    word_breaks = icu.BreakIterator.createWordInstance(icu.Locale("th"))
    word_breaks.setText(text)
    previous = word_breaks.first()
    for current in word_breaks:
        segment = text[previous:current]
        previous = current
        yield segment
コード例 #13
0
def main(song_dir: Path) -> None:
    """
    Generate the list-of-inputs file for a song directory.

    Every '*.tex' file in ``song_dir`` must contain a SongTitle macro, from
    which the sort key 'title - artist' is built. The inputs are sorted by
    that key using the 'sk_SK' collation and written to the song directory
    name with the suffix '.autogenerated.tex'.

    Raises:
        ValueError: if a '*.tex' file has no recognisable SongTitle macro.
    """
    title_pattern = re.compile(
        r".*\\SongTitle(\[[^\[]+\])?\{(?P<title>[^\}]+)\}\{(?P<artist>[^\}]+)\}",
        re.DOTALL | re.UNICODE,
    )

    # Get sort key and file names.
    songs = []
    for path in song_dir.glob("*.tex"):
        found = title_pattern.match(path.read_text())
        if found is None:
            raise ValueError(f"{path} does not seem to be a song file")
        sort_text = found.group("title") + " - " + found.group("artist")
        songs.append((sort_text, path))

    # Sort by key.
    collator = icu.Collator.createInstance(icu.Locale('sk_SK.UTF-8'))
    songs.sort(key=lambda pair: collator.getSortKey(pair[0]))

    # Write the file with inputs.
    lines = ["% THIS FILE IS AUTOGENERATED.\n", "% DO NOT EDIT!\n"]
    lines.extend(f"\\input{{{path}}}\n" for _, path in songs)
    with song_dir.with_suffix(".autogenerated.tex").open("w") as out:
        out.writelines(lines)
コード例 #14
0
ファイル: make-manuf.py プロジェクト: zbyclar/wireshark
def shorten(manuf):
    '''Convert a long manufacturer name to abbreviated and short names

    The short name is the title-cased name with punctuation, common company
    words and all whitespace removed, truncated to 8 characters (grapheme
    clusters when ICU is available, code points otherwise).

    Returns the short name alone when it equals the whitespace-normalized
    input ignoring case; otherwise "<short>\\t<long>", where the long name is
    the normalized input, title-cased if it was all upper case.
    '''
    # Normalize whitespace.
    manuf = ' '.join(manuf.split())
    orig_manuf = manuf
    # Add exactly one space on each end.
    # XXX This appears to be for the re.sub below.
    manuf = u' {} '.format(manuf)
    # Convert to consistent case
    manuf = manuf.title()
    # Remove any punctuation
    # XXX Use string.punctuation? Note that it includes '-' and '*'.
    manuf = re.sub(u"[',.()]", ' ', manuf)
    # & isn't needed when Standalone
    manuf = manuf.replace(" & ", " ")
    # Remove any "the", "inc", "plc" ...
    manuf = re.sub(
        r'\W(the|incorporated|inc|plc|systems|corporation|corp|s/a|a/s|ab|ag|kg|gmbh|company|co|limited|ltd|holding|spa)(?= )',
        '',
        manuf,
        flags=re.IGNORECASE)
    # Remove all spaces
    manuf = re.sub(r'\s+', '', manuf)

    # Truncate names to a reasonable length, say, 8 characters. If
    # the string contains UTF-8, this may be substantially more than
    # 8 bytes. It might also be less than 8 visible characters. Plain
    # Python slices Unicode strings by code point, which is better
    # than raw bytes but not as good as grapheme clusters. PyICU
    # supports grapheme clusters. https://bugs.python.org/issue30717
    #
    # In our case plain Python truncates 'Savroni̇k Elektroni̇k'
    # to 'Savroni̇', which is 7 visible characters, 8 code points,
    # and 9 bytes.

    # Truncate by code points
    trunc_len = 8

    if have_icu:
        # Truncate by grapheme clusters
        bi_ci = icu.BreakIterator.createCharacterInstance(icu.Locale('en_US'))
        bi_ci.setText(manuf)
        bounds = list(bi_ci)[:trunc_len]
        # A name that is empty after cleaning has no boundaries at all;
        # keep the code-point limit instead of indexing an empty list.
        if bounds:
            trunc_len = bounds[-1]

    manuf = manuf[:trunc_len]

    if manuf.lower() == orig_manuf.lower():
        # Original manufacturer name was short and simple.
        return manuf

    mixed_manuf = orig_manuf
    # At least one entry has whitespace in front of a period.
    mixed_manuf = re.sub(r'\s+\.', '.', mixed_manuf)
    #If company is all caps, convert to mixed case (so it doesn't look like we're screaming the company name)
    if mixed_manuf.upper() == mixed_manuf:
        mixed_manuf = mixed_manuf.title()

    return u'{}\t{}'.format(manuf, mixed_manuf)
コード例 #15
0
ファイル: utils.py プロジェクト: k-akinpe1u/com.sagebrew.web
def cleanup_title(value):
    """Return ``value`` title-cased by ICU with company acronyms restored.

    One leading and one trailing quote character are dropped, remaining
    double quotes are removed, HTML entities are unescaped, and the lowered
    text is title-cased with ICU's en_US title break iterator. Afterwards
    each ``(proper, titled)`` pair in ``settings.COMPANY_ACRONYMS`` is used to
    put the proper spelling back where the titled form appears.

    Empty input and input consisting of a single quote are handled without
    raising.
    """
    # Need to use this rather than .title() because .title()
    # does not handle things like "Wouldn't" properly. It
    # converts it to "Wouldn'T" rather than keeping the T
    # lowercase
    quotes = ('"', "'")
    # startswith/endswith are safe on empty strings, unlike value[0].
    if value.startswith(quotes):
        value = value[1:]
    if value.endswith(quotes):
        value = value[:-1]
    value = value.replace('"', "").strip()
    value = HTMLParser.HTMLParser().unescape(value.lower())
    en_us_locale = icu.Locale('en_US')
    break_iter = icu.BreakIterator.createTitleInstance(en_us_locale)
    temp_title = icu.UnicodeString(value)
    title = unicode(temp_title.toTitle(break_iter, en_us_locale))
    word_enders = [" ", ",", ".", ";", ":", '"', "'", "-"]
    for acronym in settings.COMPANY_ACRONYMS:
        proper, titled = acronym[0], acronym[1]
        if '.com' in proper:
            # .com often comes at the end of a title so we don't want to add
            # the trailing space check
            title = title.replace(titled, proper)
        else:
            # endswith() avoids the false positive of comparing rfind()'s -1
            # with a negative length difference.
            if title.endswith(titled):
                title = "%s%s" % (title[:len(title) - len(titled)], proper)
            for ender in word_enders:
                title = title.replace("%s%s" % (titled, ender),
                                      "%s%s" % (proper, ender))
    return title
コード例 #16
0
ファイル: pyicu.py プロジェクト: somjeat/pythainlp
def segment(txt):
    """Take a ''str'' and return a ''list'' obtained from word segmentation by ICU.

    Newlines are removed first. Each ICU token is then split further by
    inserting ',' markers between characters according to ``isThai`` and
    splitting on ','. Tokens equal to a single space are dropped.
    """
    bd = icu.BreakIterator.createWordInstance(icu.Locale(
        "th"))  # start the ICU word-breaking command, selecting the Thai language
    txt = txt.replace('\n', '')
    bd.setText(txt)  # perform the word segmentation
    breaks = list(bd)
    # Pair each boundary with the previous one (starting at 0) to slice tokens.
    result = [txt[x[0]:x[1]] for x in zip([0] + breaks, breaks)]
    result1 = []
    for data in result:
        data1 = list(data)
        data2 = []
        # Rebuild the token character by character; a character gets a ','
        # prefix when its isThai() result differs from that of the compared
        # earlier entry, so the later split(',') cuts there.
        # NOTE(review): data1.index(txt1) is the FIRST position of the
        # character in the token, which need not be the current position, and
        # data2 entries may already carry a ',' prefix -- confirm intended.
        for txt1 in data1:
            if isThai(txt1) == True:
                if len(data2) == 0:
                    data2.append(txt1)
                else:
                    if isThai(data2[data1.index(txt1) - 1]) == True:
                        data2.append(txt1)
                    else:
                        data2.append(',' + txt1)
            else:
                if len(data2) == 0:
                    data2.append(txt1)
                else:
                    if isThai(data2[data1.index(txt1) - 1]) == True:
                        data2.append(',' + txt1)
                    else:
                        data2.append(txt1)
        data1 = ''.join(data2)
        # A literal ',' in the text is split on as well.
        result1 += data1.split(',')
    return [x for x in result1 if x != ' ']
コード例 #17
0
def print_lexicon(lex):
    """Print the lexicon header, then every entry in 'si' collation order.

    Each entry is printed UTF-8 encoded.
    """
    header = (
        '# SPDX-License-Identifier: Unicode-DFS-2016',
        '# Columns: Form; Pronunciation',
    )
    for header_line in header:
        print(header_line)
    print()
    sort_key = icu.Collator.createInstance(icu.Locale('si')).getSortKey
    ordered = sorted(lex, key=sort_key)
    for entry in ordered:
        print(entry.encode('utf-8'))
コード例 #18
0
def write_additions(deltas, out):
    """Write one markdown section with an LDML snippet per language.

    ``deltas`` maps a language tag to ``(chars, refs, cldr_sources)``. For
    each tag, in sorted order, a heading with the locale's display name is
    written, followed by a fenced XML block built from
    CLDR_EXEMPLAR_XML_START and, when there are references, one
    CLDR_EXEMPLAR_XML_REFERENCE entry per reference (numbered R1, R2, ...).
    """
    rtl_scripts = ('Arab', 'Thaa', 'Nkoo', 'Syrc')
    for lang, (chars, refs, _cldr_sources) in sorted(deltas.items()):
        locale = icu.Locale(lang)
        out.write('\n\n### %s: %s\n\n' % (lang, locale.getDisplayName()))

        ref_ids = ['R%d' % number for number in range(1, len(refs) + 1)]
        if ref_ids:
            references = ' references="%s"' % ' '.join(ref_ids)
        else:
            references = ''

        if locale.getScript() in rtl_scripts:
            character_order = 'right-to-left'
        else:
            character_order = 'left-to-right'

        out.write('```xml\n')
        header_fields = {
            'language': locale.getLanguage(),
            'script': locale.getScript(),
            'characterOrder': character_order,
            'lineOrder': 'top-to-bottom',
            'characters': xmlescape(format_unicodeset(chars)),
            'references': xmlescape(references),
        }
        out.write(CLDR_EXEMPLAR_XML_START % header_fields)

        if refs:
            out.write('\t<references>\n')
            for ref_id, ref in zip(ref_ids, refs):
                reference_fields = {
                    'type': ref_id,
                    'uri': xmlescape(ref),
                    'text': xmlescape(get_reference_description(ref)),
                }
                out.write(CLDR_EXEMPLAR_XML_REFERENCE % reference_fields)
            out.write('\t</references>\n')
        out.write('</ldml>\n```\n\n')
コード例 #19
0
def compare(a, b, hint):
    """Three-way comparison of ``a`` and ``b`` relative to ``hint``.

    The value sharing the longer common prefix with ``hint`` compares as
    greater. On equal prefix lengths the result is the Polish-locale
    collation comparison of ``b`` against ``a``.
    """
    shared_a = len(commonprefix(a, hint))
    shared_b = len(commonprefix(b, hint))
    if shared_a != shared_b:
        return 1 if shared_a > shared_b else -1
    collator = icu.Collator.createInstance(icu.Locale('pl_PL.UTF-8'))
    return collator.compare(b, a)
コード例 #20
0
def exact_matching():
    """Return the lemma names common to 'spa' and 'por', sorted for es_ES.

    Lemmas for which ``discartable`` is true are left out.
    """
    def usable_lemmas(lang):
        return {
            lemma
            for lemma in wn.all_lemma_names(lang=lang)
            if not discartable(lemma)
        }

    shared = usable_lemmas('spa').intersection(usable_lemmas('por'))
    sort_key = icu.Collator.createInstance(icu.Locale('es_ES.UTF-8')).getSortKey
    return sorted(shared, key=sort_key)
コード例 #21
0
 def find_students_by_class(self):
     """Return the names of the students in the selected class, collated.

     The class is the current text of ``self.classe_list``. The names are read
     from the ``eleve`` table and sorted with the 'ar_utf8' locale collator.
     """
     selected = self.classe_list.currentText()
     cursor.execute(''' SELECT nom FROM eleve WHERE classe = ? ''', (selected,))
     names = [row[0] for row in cursor.fetchall()]
     collator = icu.Collator.createInstance(icu.Locale('ar_utf8'))
     names.sort(key=collator.getSortKey)
     return names
コード例 #22
0
def get_inspire_theme_link_children_tags(inspire_theme_link_parent_id=1):
    """Return the distinct tag titles of a parent's links, sorted for German.

    Args:
        inspire_theme_link_parent_id: ``Links.parent_id`` value to filter on.

    Returns:
        A tuple of unique titles ordered by a de_DE collator.
    """
    rows = Tags.query.join(Links.tags).with_entities(Tags.title).filter(
        Links.parent_id == inspire_theme_link_parent_id).all()

    # A set gives constant-time duplicate checks; first-seen order is kept in
    # the list so the stable sort below behaves as before.
    seen = set()
    titles = []
    for row in rows:
        if row.title not in seen:
            seen.add(row.title)
            titles.append(row.title)

    collator = icu.Collator.createInstance(icu.Locale('de_DE.UTF-8'))
    titles.sort(key=collator.getSortKey)
    return tuple(titles)
コード例 #23
0
def similar_matching():
    """Matches ignoring accent.

    Every non-discartable 'spa' and 'por' lemma is passed through
    ``change_similar_letters``. The forms present in both languages are
    returned sorted for es_ES.
    """
    def normalised_lemmas(lang):
        return {
            change_similar_letters(lemma)
            for lemma in wn.all_lemma_names(lang=lang)
            if not discartable(lemma)
        }

    shared = normalised_lemmas('spa').intersection(normalised_lemmas('por'))
    sort_key = icu.Collator.createInstance(icu.Locale('es_ES.UTF-8')).getSortKey
    return sorted(shared, key=sort_key)
コード例 #24
0
def get_lang_choices(request, with_default=False):
    """Return ``(code, label)`` language choices sorted by label.

    Labels are compared with a collator for the request's negotiated locale.
    With ``with_default`` set, a ``('*', DEFAULT)`` entry is placed first.
    """
    pairs = [(code, label) for code, label in request.locale.languages.items()]
    locale_name = pyramid.i18n.negotiate_locale_name(request)
    collator = icu.Collator.createInstance(icu.Locale(locale_name))
    label_key = functools.cmp_to_key(collator.compare)
    choices = sorted(pairs, key=lambda pair: label_key(pair[1]))
    if with_default:
        choices.insert(0, ('*', _("DEFAULT")))
    return choices
コード例 #25
0
def now():
    '''
    Return the current time as a str, formatted for the th_TH locale.
    Example: "7 มกราคม 2560 20:22:59"
    '''
    thai_locale = icu.Locale('th_TH')
    formatter = icu.DateFormat.createDateTimeInstance(
        icu.DateFormat.LONG, icu.DateFormat.kDefault, thai_locale)
    current = datetime.datetime.now()
    return formatter.format(current)
コード例 #26
0
ファイル: tokenizer_json.py プロジェクト: uhh-lt/158
def tokenize_icu(text, language):
    """Return ``text`` with a space appended after every ICU word segment.

    Segments are delimited by the word break iterator for ``language``, so
    the result also ends with a space when there is at least one segment.
    """
    breaker = icu.BreakIterator.createWordInstance(icu.Locale(language))
    breaker.setText(text)
    pieces = []
    previous = 0
    for boundary in breaker:
        pieces.append(text[previous:boundary] + ' ')
        previous = boundary
    return ''.join(pieces)
コード例 #27
0
 def _testPerformerComposition(self):
     """Register INCONSISTENT_PERFORMER when the two performer lists differ.

     ``track.performers`` and ``track.composedPerformer`` are consistent when
     they have the same length and hold the same values once cleaned with
     ``removeSpecialCharFromArray`` and sorted with the French collator.
     """
     # For accented char sorting
     collator = icu.Collator.createInstance(icu.Locale('fr_FR.UTF-8'))
     performers = self.track.performers
     composed = self.track.composedPerformer

     def normalised(names):
         return sorted(removeSpecialCharFromArray(names),
                       key=collator.getSortKey)

     # Sorted comparison to only test value equality. The artists alphabetic
     # order is tested elsewhere. The length check runs first, so the sorted
     # comparison is skipped when the sizes already differ.
     consistent = (len(performers) == len(composed)
                   and normalised(performers) == normalised(composed))
     if not consistent:
         self.errorCounter += 1
         self.errors.append(ErrorEnum.INCONSISTENT_PERFORMER)
コード例 #28
0
ファイル: generate.py プロジェクト: jacopofar/grammar-quiz
def tokenize(text: str, lang: str):
    """Split a string into tokens.

    The word break iterator for ``lang`` is created on first use and kept in
    ``_breakers`` for later calls.
    """
    breaker = _breakers.get(lang)
    if breaker is None:
        # Is there no word breaker already set up? Instantiate it
        breaker = icu.BreakIterator.createWordInstance(icu.Locale(lang))
        _breakers[lang] = breaker

    breaker.setText(text)
    ends = list(breaker)
    starts = [0] + ends
    return [text[start:end] for start, end in zip(starts, ends)]
コード例 #29
0
ファイル: _token.py プロジェクト: zxlzr/bistring
    def __init__(self, locale: str, constructor: Callable):
        """Store the ICU locale and constructor, and build a first iterator.

        Args:
            locale: Locale name passed to ``icu.Locale``.
            constructor: Callable kept in ``self._constructor``.
        """
        # BreakIterator is not a thread-safe API, so iterators are cached in
        # thread-local storage.
        self._local = threading.local()
        self._constructor = constructor
        self._locale = icu.Locale(locale)

        # Construct one on this thread right away, as an optimization and so
        # that errors surface here.
        self._break_iterator()
コード例 #30
0
 def sort_by_name(self):
     """Number each record's students by Spanish collation of ``full_name``.

     For every record in ``self`` the students in ``student_ids`` are sorted
     by ``full_name`` with an 'es' collator, and each receives its 1-based
     position in the ``seq`` field.
     """
     # One collator serves all records.
     collator = icu.Collator.createInstance(icu.Locale('es'))

     def name_key(student):
         # Collation sort keys order like collator.compare() and, unlike the
         # ``cmp=`` argument of sorted(), also work on Python 3.
         return collator.getSortKey(student.full_name)

     for record in self:
         ordered = sorted(record.student_ids, key=name_key)
         for seq, student in enumerate(ordered, start=1):
             student.write({'seq': seq})