def make_alphabetic(hits, processname, sortnames=False, lang="sv"):
    """
    Loop through hits, apply the function 'processname' on each object and
    then sort the result in alphabetical order.

    processname should append zero or more processed forms of the object to
    the result list. Each processed form is a pair (first_letter, result)
    where first_letter is the letter to group/sort on and result is what the
    html-template wants, e.g. a pair of (name, no_hits).
    """
    # Compile once per call instead of on every fix_lastname invocation
    # (fix_lastname runs inside a sort key, i.e. once per item).
    vonav_pattern = re.compile(r"^(%s) " % "|".join(VONAV_LIST))

    def fix_lastname(name):
        # Strip a leading nobiliary particle and make multi-word last names
        # sort late by replacing spaces with 'z'.
        name = vonav_pattern.sub("", name)
        return name.replace(" ", "z")

    results = []
    for hit in hits:
        processname(hit, results)

    # Split the result into start letters, normalizing Scandinavian variants.
    letter_results = {}
    for first_letter, result in results:
        if first_letter == "Ø":
            first_letter = "Ö"
        if first_letter == "Æ":
            first_letter = "Ä"
        if first_letter == "Ü":
            first_letter = "Y"
        if lang == "en" and first_letter == "Ö":
            first_letter = "O"
        if lang == "en" and first_letter in "ÄÅ":
            first_letter = "A"
        letter_results.setdefault(first_letter, []).append(result)

    # Sort result dictionary alphabetically into list.
    if lang == "en":
        collator = icu.Collator.createInstance(icu.Locale("en_EN.UTF-8"))
    else:
        collator = icu.Collator.createInstance(icu.Locale("sv_SE.UTF-8"))
    # Only the values are mutated here — no need to materialize .items().
    for items in letter_results.values():
        if sortnames:
            items.sort(key=lambda x: collator.getSortKey(
                fix_lastname(x[0]) + " " + x[1]))
        else:
            items.sort(key=lambda x: collator.getSortKey(x[0]))
    return sorted(letter_results.items(),
                  key=lambda x: collator.getSortKey(x[0]))
def make_alphabetic(hits, processname, sortnames=False, lang="sv"):
    """
    Loops through hits, applies the function 'processname' on each object and
    then sorts the result in alphabetical order.

    processname should append zero or more processed forms of the object to
    the result list. Each processed form is a pair (first_letter, result)
    where first_letter is the letter to group/sort on and result is what the
    html-template wants, e.g. a pair of (name, no_hits).
    """
    # Compile once instead of recompiling inside every fix_lastname call.
    vonaf_pattern = re.compile(r"(^von )|(^af )")

    def fix_lastname(name):
        # Drop a leading "von "/"af " and make multi-word names sort late.
        name = vonaf_pattern.sub("", name)
        return name.replace(" ", "z")

    results = []
    for hit in hits:
        processname(hit, results)

    # Split the result into start letters.
    letter_results = {}
    for first_letter, result in results:
        if first_letter == u'Ø':
            first_letter = u'Ö'
        if first_letter == u'Æ':
            first_letter = u'Ä'
        if first_letter == u'Ü':
            first_letter = u'Y'
        if lang == "en" and first_letter == u"Ö":
            first_letter = u"O"
        if lang == "en" and first_letter in u"ÄÅ":
            first_letter = u"A"
        letter_results.setdefault(first_letter, []).append(result)

    # Sort result dictionary alphabetically into list.
    if lang == "en":
        collator = icu.Collator.createInstance(icu.Locale('en_EN.UTF-8'))
    else:
        collator = icu.Collator.createInstance(icu.Locale('sv_SE.UTF-8'))
    # Only values are needed here; the unused key was dropped.
    for items in letter_results.values():
        if sortnames:
            items.sort(key=lambda x: collator.getSortKey(
                fix_lastname(x[0]) + " " + x[1]))
        else:
            items.sort(key=lambda x: collator.getSortKey(x[0]))
    return sorted(letter_results.items(),
                  key=lambda x: collator.getSortKey(x[0]))
def __binaryFind(self, what):
    """Binary search over self.__hints ordered by Polish collation of anyHint().

    Returns the index of the first hint whose anyHint() value collates
    strictly after `what` (i.e. the insertion position of `what`).
    """
    log.log("HintRegisterBrowser.__binaryFind", [what], 0)
    collator = icu.Collator.createInstance(icu.Locale('pl_PL.UTF-8'))

    def __pom(left, right):
        # Recursive helper over the inclusive range [left, right].
        if left == right:
            # Single candidate: position is either at it or just after it.
            if collator.compare(anyHint(self.__hints[left]), what) > 0:
                return left
            else:
                return left + 1
        elif left + 1 == right:
            # Two candidates: resolve without further recursion.
            if collator.compare(anyHint(self.__hints[left]), what) > 0:
                return left
            elif collator.compare(anyHint(self.__hints[right]), what) > 0:
                return right
            else:
                return right + 1
        # More than two elements: bisect around the midpoint.
        lenn = right - left
        center = left + lenn // 2
        if collator.compare(anyHint(self.__hints[center]), what) > 0:
            return __pom(left, center - 1)
        else:
            return __pom(center + 1, right)

    res = __pom(0, len(self.__hints) - 1)
    log.log("HintRegisterBrowser.__binaryFind return", [res], 1)
    return res
def _testMissorderedTags(self):
    """Flag tag lists that are not in (accent-insensitive) alphabetical order.

    Each offending field appends to self.missorderedTag and bumps
    self.missorderedTagsCounter; if any were found, one MISS_ORDERED_TAGS
    error is recorded.
    """
    # For accented char sorting
    collator = icu.Collator.createInstance(icu.Locale('fr_FR.UTF-8'))
    # Artists tag must already be sorted under French collation.
    if sorted(removeSpecialCharFromArray(self.track.artists), key=collator.getSortKey) != removeSpecialCharFromArray(
            self.track.artists):
        self.missorderedTag.append('Artists')
        self.missorderedTagsCounter += 1
    if self.track.remix == '':
        # Non-remix tracks: artists must also match the order encoded in the
        # file name (5th field, comma-separated).
        # NOTE(review): this can append 'Artists' a second time when both
        # checks fail — confirm the double count is intended.
        if sorted(removeSpecialCharFromArray(self.track.artists), key=collator.getSortKey) != removeSpecialCharFromArray(
                self.track.fileNameList[4].split(', ')):
            self.missorderedTag.append('Artists')
            self.missorderedTagsCounter += 1
    if sorted(removeSpecialCharFromArray(self.track.performers), key=collator.getSortKey) != removeSpecialCharFromArray(
            self.track.performers):
        self.missorderedTag.append('Performers')
        self.missorderedTagsCounter += 1
    if sorted(removeSpecialCharFromArray(self.track.feat), key=collator.getSortKey) != removeSpecialCharFromArray(
            self.track.feat):
        self.missorderedTag.append('Featuring')
        self.missorderedTagsCounter += 1
    if sorted(removeSpecialCharFromArray(self.track.remix), key=collator.getSortKey) != removeSpecialCharFromArray(
            self.track.remix):
        self.missorderedTag.append('Remixer')
        self.missorderedTagsCounter += 1
    if self.missorderedTagsCounter > 0:
        self.errorCounter += 1
        self.errors.append(ErrorEnum.MISS_ORDERED_TAGS)
def __init__(self, *args, **kwargs):
    """Initialize base browser state plus a cached Polish-locale collator."""
    RegisterBrowser.__init__(self, *args, **kwargs)
    # Private state; assignments are independent of one another.
    self.__collator = icu.Collator.createInstance(icu.Locale('pl_PL.UTF-8'))
    self.__dBController = None
    self.__binaryType = None
    self.__localVeto = False
    self.__limit = 100
def character_tokenize(self, word):
    """ Returns the tokenization in character level.

    Arguments:
        word {string} -- word to be tokenized in character level.
    Returns:
        [list] -- list of characters (grapheme clusters per ICU).
    Raises:
        ImportError -- if PyICU is not installed.
    """
    try:
        import icu
    except ImportError:
        # Fail explicitly: the original bare `except:` printed a message and
        # then fell through to a confusing NameError on `icu`.
        print("please install PyICU")
        raise
    breaker = icu.BreakIterator.createCharacterInstance(icu.Locale())
    breaker.setText(word)
    chars = []
    start = 0
    # Break boundaries are end offsets; slice between consecutive boundaries.
    for end in breaker:
        chars.append(word[start:end])
        start = end
    return chars
def jwOnSortedFunction(s1, s2, collator=icu.Collator.createInstance(
        icu.Locale('de_DE.UTF-8'))):
    """Jaro-Winkler distance between the two strings after sorting each
    string's characters under the given collation (German by default).

    The default collator is built once at definition time and reused.
    """
    def collate_sort(s):
        return ''.join(sorted(list(s), key=collator.getSortKey))

    return jw_distance.get_jaro_distance(
        collate_sort(s1), collate_sort(s2), winkler=True)
def gen_words(text):
    """Yield the word tokens of *text* using an ICU Thai word breaker."""
    breaker = icu.BreakIterator.createWordInstance(icu.Locale("th"))
    breaker.setText(text)
    start = breaker.first()
    # Each iteration yields the slice between consecutive break boundaries.
    for end in breaker:
        yield text[start:end]
        start = end
def findWord(self, key):
    """Look up `key` in this (possibly nested) dictionary.

    Sets self.__exact to True only on an exact match. On a miss, falls back
    to the entry under the first key that collates (pl_PL) after the probe.
    Returns the stored value, or None when nothing qualifies.
    """
    self.__exact = False
    collator = icu.Collator.createInstance(icu.Locale('pl_PL.UTF-8'))
    if self._level == 0:
        # Leaf level: direct lookup first.
        res = self._dict.get(key)
        if res is not None:
            self.__exact = True
            return res
        # Otherwise take the first key collating after `key`.
        for k in self.__keys:
            if collator.compare(k, key) > 0:
                return self._dict.get(k)
        return None
    else:
        # Inner level: descend by the key's first character.
        h = self._at(key, 0)
        if h != u'':
            t = key[1:]
        else:
            t = u''
        subdict = self._dict.get(h)
        if subdict is not None:
            res = subdict.findWord(t)
            if res is not None:
                self.__exact = subdict.exact()
                return res
        # No hit below: probe the subtrees of keys collating after `h`.
        for k in self.__keys:
            if collator.compare(k, h) > 0:
                res = self._dict.get(k).findWord(u'')
                if res is not None:
                    return res
        return None
def updateSheetOrder(self):
    """Write the collated (de_DE) member names of fetch_group into column B
    of the order spreadsheet, blanking any leftover cells, in one batch.

    Errors are reported via sendMsg rather than raised.
    """
    try:
        users = [str(user.name) for user in self.fetch_group.users]
        sheet = self.client.open(self.sheet_order).sheet1
        collator = icu.Collator.createInstance(icu.Locale('de_DE.UTF-8'))
        users.sort(key=collator.getSortKey)
        print(users)
        # Extra 100 rows so stale entries below the list get cleared.
        cell_list = sheet.range('B2:B' + str(len(users) + 100))
        usr = deque(users)
        for cell in cell_list:
            cell.value = ''
            if usr:  # truthiness check; popleft can never raise here
                cell.value = str(usr.popleft())
        # Update in batch
        sheet.update_cells(cell_list)
    except Exception as e:
        # BUG FIX: 'str' + Exception raised TypeError inside the handler,
        # masking the real error; convert explicitly.
        self.sendMsg(self.fetch_error, msg='update order error: ' + str(e),
                     rich=False, typing=False)
def replace_gerund():
    """Strip Turkish gerund suffixes ("et(tir)(me)#x" / "yap(tır)(ma)#x")
    from the Boun sentiment lexicons, default untagged words to "#n",
    de-duplicate, sort with Turkish collation and write "<name>_n4.txt".
    """
    bounfolder = "/home/dicle/git/serdoo-servis2/django_docker/learning/_lexicons/tr_sentiment_boun"
    names = ["positive_n2", "negative_n2"]
    for n in names:
        # Context manager: the original leaked the file handle.
        with open(os.path.join(bounfolder, n + ".txt"), "r") as fin:
            boun = [w.strip() for w in fin.readlines()]
        # Raw strings: "\s" in a plain literal is an invalid escape sequence
        # (SyntaxWarning on modern Python); regex is unchanged.
        _newlist = [re.sub(r"\set(tir)?(me)?\#[nvpb]", "", w).strip()
                    for w in boun]
        _newlist = [re.sub(r"\syap(tır)?(ma)?\#[nvpb]", "", w).strip()
                    for w in _newlist]
        newlist = []
        for w in _newlist:
            if "#" not in w:
                w = w + "#n"  # default POS tag for untagged entries
            newlist.append(w)
        newlist = list(set(newlist))
        import icu
        collator = icu.Collator.createInstance(icu.Locale('tr_TR.UTF-8'))
        newlist.sort(key=collator.getSortKey)
        with open(os.path.join(bounfolder, n + "_n4.txt"), "w") as fout:
            fout.write("\n".join(newlist))
def gen_words(text):
    """Generator over the Thai word tokens of *text* (ICU word breaker)."""
    brk = icu.BreakIterator.createWordInstance(icu.Locale("th"))
    brk.setText(text)
    prev = brk.first()
    for boundary in brk:
        # Slice between the previous boundary and this one.
        yield text[prev:boundary]
        prev = boundary
def main(song_dir: Path) -> None:
    """
    Writes a .tex file with a list of inputs. The file name is the song
    directory name concatenated with '.autogenerated.tex'. Sorts the inputs
    by a key constructed from the song title and artist (Slovak collation).
    """
    title_pattern = re.compile(
        r".*\\SongTitle(\[[^\[]+\])?\{(?P<title>[^\}]+)\}\{(?P<artist>[^\}]+)\}",
        re.DOTALL | re.UNICODE,
    )
    # Collect (sort key, path) pairs from every song file.
    key_and_file_pairs = []
    for song_file in song_dir.glob("*.tex"):
        match = title_pattern.match(song_file.read_text())
        if not match:
            raise ValueError(f"{song_file} does not seem to be a song file")
        sort_key = match["title"] + " - " + match["artist"]
        key_and_file_pairs.append((sort_key, song_file))

    collator = icu.Collator.createInstance(icu.Locale('sk_SK.UTF-8'))
    key_and_file_pairs.sort(key=lambda pair: collator.getSortKey(pair[0]))

    # Emit one \input per song, in sorted order.
    output_file = song_dir.with_suffix(".autogenerated.tex")
    with output_file.open("w") as f:
        f.write("% THIS FILE IS AUTOGENERATED.\n")
        f.write("% DO NOT EDIT!\n")
        for _, song_file in key_and_file_pairs:
            f.write(f"\\input{{{song_file}}}\n")
def shorten(manuf):
    '''Convert a long manufacturer name to abbreviated and short names.

    Returns either the shortened name alone (when the input was already
    short and simple) or "short<TAB>mixed-case-original".
    '''
    # Normalize whitespace.
    manuf = ' '.join(manuf.split())
    orig_manuf = manuf
    # Add exactly one space on each end.
    # XXX This appears to be for the re.sub below.
    manuf = u' {} '.format(manuf)
    # Convert to consistent case
    manuf = manuf.title()
    # Remove any punctuation
    # XXX Use string.punctuation? Note that it includes '-' and '*'.
    manuf = re.sub(u"[',.()]", ' ', manuf)
    # & isn't needed when Standalone
    manuf = manuf.replace(" & ", " ")
    # Remove any "the", "inc", "plc" ...
    # Raw strings: '\W'/'\s' in plain literals are invalid escape sequences
    # (SyntaxWarning on modern Python); the regexes are unchanged.
    manuf = re.sub(
        r'\W(the|incorporated|inc|plc|systems|corporation|corp|s/a|a/s|ab|ag|kg|gmbh|company|co|limited|ltd|holding|spa)(?= )',
        '', manuf, flags=re.IGNORECASE)
    # Remove all spaces
    manuf = re.sub(r'\s+', '', manuf)

    # Truncate names to a reasonable length, say, 8 characters. If
    # the string contains UTF-8, this may be substantially more than
    # 8 bytes. It might also be less than 8 visible characters. Plain
    # Python slices Unicode strings by code point, which is better
    # than raw bytes but not as good as grapheme clusters. PyICU
    # supports grapheme clusters. https://bugs.python.org/issue30717
    #
    # In our case plain Python truncates 'Savroni̇k Elektroni̇k'
    # to 'Savroni̇', which is 7 visible characters, 8 code points,
    # and 9 bytes.

    # Truncate by code points
    trunc_len = 8

    if have_icu:
        # Truncate by grapheme clusters
        bi_ci = icu.BreakIterator.createCharacterInstance(icu.Locale('en_US'))
        bi_ci.setText(manuf)
        bounds = list(bi_ci)
        bounds = bounds[0:8]
        trunc_len = bounds[-1]

    manuf = manuf[:trunc_len]

    if manuf.lower() == orig_manuf.lower():
        # Original manufacturer name was short and simple.
        return manuf

    mixed_manuf = orig_manuf
    # At least one entry has whitespace in front of a period.
    mixed_manuf = re.sub(r'\s+\.', '.', mixed_manuf)
    # If company is all caps, convert to mixed case (so it doesn't look
    # like we're screaming the company name)
    if mixed_manuf.upper() == mixed_manuf:
        mixed_manuf = mixed_manuf.title()

    return u'{}\t{}'.format(manuf, mixed_manuf)
def cleanup_title(value):
    """Strip wrapping quotes, HTML-unescape, title-case via ICU, then restore
    configured company acronym spellings (settings.COMPANY_ACRONYMS pairs of
    (canonical, lowercase-form)).

    NOTE: uses `unicode` and `HTMLParser` — Python 2 code.
    """
    # Need to use this rather than .title() because .title()
    # does not handle things like "Wouldn't" properly. It
    # converts it to "Wouldn'T" rather than keeping the T
    # lowercase
    # Drop a single leading/trailing quote character, if present.
    if value[0] == '"' or value[0] == "'":
        value = value[1:]
    if value[len(value) - 1] == '"' or value[len(value) - 1] == "'":
        value = value[:len(value) - 1]
    value = value.replace('"', "").strip()
    value = HTMLParser.HTMLParser().unescape(value.lower())
    # ICU title-casing handles apostrophes correctly (see note above).
    en_us_locale = icu.Locale('en_US')
    break_iter = icu.BreakIterator.createTitleInstance(en_us_locale)
    temp_title = icu.UnicodeString(value)
    title = unicode(temp_title.toTitle(break_iter, en_us_locale))
    word_enders = [" ", ",", ".", ";", ":", '"', "'", "-"]
    for acronym in settings.COMPANY_ACRONYMS:
        if '.com' in acronym[0]:
            # .com often comes at the end of a title so we don't want to add
            # the trailing space check
            if acronym[1] in title:
                title = title.replace(acronym[1], acronym[0])
        else:
            # Replace at end-of-title, then before every word-ending char.
            if title.rfind(acronym[1]) == len(title) - len(acronym[1]):
                title = "%s%s" % (title[:len(title) - (len(acronym[1]))],
                                  acronym[0])
            for ender in word_enders:
                if "%s%s" % (acronym[1], ender) in title:
                    title = title.replace("%s%s" % (acronym[1], ender),
                                          "%s%s" % (acronym[0], ender))
    return title
def segment(txt):
    """Take a ``str`` and return a ``list`` of tokens cut by ICU (Thai).

    ICU word breaks are post-processed so that runs of Thai and non-Thai
    characters are separated by commas and then split apart.
    """
    bd = icu.BreakIterator.createWordInstance(icu.Locale("th"))
    txt = txt.replace('\n', '')
    bd.setText(txt)
    # Slice the text at consecutive ICU word boundaries.
    breaks = list(bd)
    result = [txt[x[0]:x[1]] for x in zip([0] + breaks, breaks)]
    result1 = []
    for data in result:
        data1 = list(data)
        data2 = []
        # BUG FIX: the original used data1.index(txt1), which returns the
        # FIRST occurrence of the character and so looked at the wrong
        # "previous" element whenever a character repeats. Use the loop
        # index instead (one element is appended to data2 per character,
        # so positions stay aligned).
        for i, txt1 in enumerate(data1):
            if isThai(txt1) == True:
                if len(data2) == 0:
                    data2.append(txt1)
                else:
                    if isThai(data2[i - 1]) == True:
                        data2.append(txt1)
                    else:
                        # Script change: mark a token boundary.
                        data2.append(',' + txt1)
            else:
                if len(data2) == 0:
                    data2.append(txt1)
                else:
                    if isThai(data2[i - 1]) == True:
                        data2.append(',' + txt1)
                    else:
                        data2.append(txt1)
        data1 = ''.join(data2)
        result1 += data1.split(',')
    return [x for x in result1 if x != ' ']
def print_lexicon(lex):
    """Print an SPDX/columns header, then the lexicon entries sorted under
    Sinhala ('si') collation, each UTF-8 encoded."""
    header_lines = (
        '# SPDX-License-Identifier: Unicode-DFS-2016',
        '# Columns: Form; Pronunciation',
        '',
    )
    for header in header_lines:
        print(header)
    collator = icu.Collator.createInstance(icu.Locale('si'))
    for entry in sorted(lex, key=collator.getSortKey):
        print(entry.encode('utf-8'))
def write_additions(deltas, out):
    """Write one Markdown section per language to `out`, each containing a
    CLDR exemplar-characters XML snippet for the added characters.

    deltas -- mapping of language code -> (chars, refs, cldr_sources);
              cldr_sources is unpacked but unused here.
    out    -- writable text stream.
    """
    for lang, (chars, refs, cldr_sources) in sorted(deltas.items()):
        locale = icu.Locale(lang)
        out.write('\n\n### %s: %s\n\n' % (lang, locale.getDisplayName()))
        # Reference anchors R1..Rn for the <references> block emitted below.
        reflist = ['R%d' % i for i in range(1, len(refs) + 1)]
        references = ' references="%s"' % ' '.join(reflist) if reflist else ''
        # RTL scripts get the corresponding layout orientation.
        if locale.getScript() in ('Arab', 'Thaa', 'Nkoo', 'Syrc'):
            characterOrder = 'right-to-left'
        else:
            characterOrder = 'left-to-right'
        out.write('```xml\n')
        out.write(
            CLDR_EXEMPLAR_XML_START % {
                'language': locale.getLanguage(),
                'script': locale.getScript(),
                'characterOrder': characterOrder,
                'lineOrder': 'top-to-bottom',
                'characters': xmlescape(format_unicodeset(chars)),
                'references': xmlescape(references),
            })
        if refs:
            out.write('\t<references>\n')
            for i, ref in enumerate(refs):
                out.write(
                    CLDR_EXEMPLAR_XML_REFERENCE % {
                        'type': reflist[i],
                        'uri': xmlescape(ref),
                        'text': xmlescape(get_reference_description(ref)),
                    })
            out.write('\t</references>\n')
        out.write('</ldml>\n```\n\n')
def compare(a, b, hint):
    """Comparator that ranks the string sharing the longer common prefix
    with `hint` higher; ties fall back to Polish collation of (b, a).
    """
    collator = icu.Collator.createInstance(icu.Locale('pl_PL.UTF-8'))
    # Compute each common-prefix length once (the original recomputed both
    # inside each branch).
    prefix_a = len(commonprefix(a, hint))
    prefix_b = len(commonprefix(b, hint))
    if prefix_a > prefix_b:
        return 1
    elif prefix_b > prefix_a:
        return -1
    else:
        # NOTE(review): arguments are deliberately (b, a) — this reverses the
        # collation order on ties; confirm that is intended before changing.
        return collator.compare(b, a)
def exact_matching():
    """Lemma names present in both the Spanish and Portuguese wordnets,
    sorted with Spanish collation."""
    def lemma_set(lang):
        # Usable lemma names for one language.
        return {word for word in wn.all_lemma_names(lang=lang)
                if not discartable(word)}

    common_words = lemma_set('spa') & lemma_set('por')
    collator = icu.Collator.createInstance(icu.Locale('es_ES.UTF-8'))
    return sorted(common_words, key=collator.getSortKey)
def find_students_by_class(self):
    """Return pupil names of the currently selected class, sorted with
    Arabic collation."""
    selected_class = self.classe_list.currentText()
    cursor.execute(''' SELECT nom FROM eleve WHERE classe = ? ''',
                   (selected_class,))
    names = [row[0] for row in cursor.fetchall()]
    collator = icu.Collator.createInstance(icu.Locale('ar_utf8'))
    return sorted(names, key=collator.getSortKey)
def get_inspire_theme_link_children_tags(inspire_theme_link_parent_id=1):
    """Return the unique tag titles of links under the given parent,
    sorted with German collation, as a tuple."""
    tags = Tags.query.join(Links.tags).with_entities(Tags.title).filter(
        Links.parent_id == inspire_theme_link_parent_id).all()
    # De-duplicate while preserving insertion order. The original shadowed
    # the builtin `list` and used a short-circuit `and` for its side effect.
    titles = []
    for tag in tags:
        if tag.title not in titles:
            titles.append(tag.title)
    collator = icu.Collator.createInstance(icu.Locale('de_DE.UTF-8'))
    titles.sort(key=collator.getSortKey)
    return tuple(titles)
def similar_matching():
    """Matches ignoring accent."""
    def normalized_lemmas(lang):
        # Accent-normalized usable lemma names for one language.
        return {change_similar_letters(word)
                for word in wn.all_lemma_names(lang=lang)
                if not discartable(word)}

    common_words = normalized_lemmas('spa') & normalized_lemmas('por')
    collator = icu.Collator.createInstance(icu.Locale('es_ES.UTF-8'))
    return sorted(common_words, key=collator.getSortKey)
def get_lang_choices(request, with_default=False):
    """Return (code, display-name) language choices sorted by display name
    under the request's negotiated locale; optionally prefixed with the
    '*' default entry."""
    collator = icu.Collator.createInstance(
        icu.Locale(pyramid.i18n.negotiate_locale_name(request)))
    name_key = functools.cmp_to_key(collator.compare)
    choices = sorted(request.locale.languages.items(),
                     key=lambda item: name_key(item[1]))
    if with_default:
        choices = [('*', _("DEFAULT"))] + choices
    return choices
def now():
    '''Return the current time as a Thai-locale string,
    e.g. "7 มกราคม 2560 20:22:59".'''
    thai_formatter = icu.DateFormat.createDateTimeInstance(
        icu.DateFormat.LONG, icu.DateFormat.kDefault, icu.Locale('th_TH'))
    return thai_formatter.format(datetime.datetime.now())
def tokenize_icu(text, language):
    """Tokenize `text` with an ICU word BreakIterator for `language`.

    Returns the tokens joined by single spaces with a trailing space
    (identical to the original accumulation loop); empty input yields ''.
    """
    bd = icu.BreakIterator.createWordInstance(icu.Locale(language))
    bd.setText(text)
    pieces = []
    start_pos = 0
    for boundary in bd:
        pieces.append(text[start_pos:boundary])
        start_pos = boundary
    # ''.join is O(n); the original `tokens += ...` loop was quadratic.
    return ''.join(piece + ' ' for piece in pieces)
def _testPerformerComposition(self):
    """Record INCONSISTENT_PERFORMER when track.performers does not equal
    track.composedPerformer as an (order-insensitive) collection."""
    # For accented char sorting
    collator = icu.Collator.createInstance(icu.Locale('fr_FR.UTF-8'))
    # If track has featured artists, we append them to the performer tmp string
    # Sorted comparison to only test value equality. The artists alphabetic order is tested elsewhere
    if len(self.track.performers) != len(self.track.composedPerformer) or \
            sorted(removeSpecialCharFromArray(self.track.performers), key=collator.getSortKey) != \
            sorted(removeSpecialCharFromArray(self.track.composedPerformer), key=collator.getSortKey):
        self.errorCounter += 1
        self.errors.append(ErrorEnum.INCONSISTENT_PERFORMER)
def tokenize(text: str, lang: str):
    """Split a string into tokens."""
    # Lazily create and cache one word BreakIterator per language.
    if lang not in _breakers:
        _breakers[lang] = icu.BreakIterator.createWordInstance(
            icu.Locale(lang))
    breaker = _breakers[lang]
    breaker.setText(text)
    bounds = list(breaker)
    # Slice between consecutive boundaries, starting from offset 0.
    return [text[lo:hi] for lo, hi in zip([0] + bounds, bounds)]
def __init__(self, locale: str, constructor: Callable):
    """Wrap an ICU break-iterator factory with per-thread instances.

    locale -- ICU locale name, passed to icu.Locale.
    constructor -- callable that produces a BreakIterator; invoked per
        thread by self._break_iterator() (defined elsewhere in this class).
    """
    # BreakIterator is not a thread-safe API, so store a cache of
    # thread-local iterators
    self._locale = icu.Locale(locale)
    self._constructor = constructor
    self._local = threading.local()
    # Eagerly construct one on this thread as an optimization, and to check
    # for errors
    self._break_iterator()
def sort_by_name(self):
    """Assign sequential 'seq' numbers to each record's students, ordered
    by full name under Spanish collation."""
    # Hoisted: one collator for all records instead of one per record.
    collator = icu.Collator.createInstance(icu.Locale('es'))

    # BUG FIX: sorted() lost its `cmp` argument in Python 3. Sorting on the
    # collation sort key of full_name produces the same ordering as the old
    # cmp=collator.compare over attrgetter('full_name').
    def sort_key(student):
        return collator.getSortKey(student.full_name)

    for record in self:
        for seq, student in enumerate(sorted(record.student_ids,
                                             key=sort_key), start=1):
            student.write({'seq': seq})