class IndexLetter(SearchBase):
    """Whoosh search over the ix_letter index for verb forms that match
    a given index letter.

    Results are collapsed on verb_form (one hit per distinct form) and
    sorted with a Unicode collator so accented forms order naturally.
    """

    def __init__(self, word):
        self._word = word
        self.searcher = None
        self.query = None
        self.collator = Collator()
        self.num_results = 0

    def get_num_results(self):
        # Count of hits from the most recent get_results() call.
        return self.num_results

    def sort_key(self, string):
        """Collation key used to order verb forms; accepts bytes or str."""
        # Robustness fix: only decode when we actually have bytes —
        # calling .decode() on a str raises AttributeError on Python 3.
        if isinstance(string, bytes):
            string = string.decode("utf-8")
        return self.collator.sort_key(string)

    def get_results(self):
        """Run the query lazily and return the sorted, collapsed results."""
        if self.searcher is None:
            self.search()

        facet = FieldFacet("verb_form")
        facet = TranslateFacet(self.sort_key, facet)
        results = self.searcher.search(self.query,
                                       limit=None,
                                       sortedby=facet,
                                       collapse_limit=1,
                                       collapse='verb_form')
        self.num_results = len(results)
        return results

    def search(self):
        """Build the whoosh query against the index_letter field."""
        self.searcher = ix_letter.searcher()
        fields = []
        # NOTE(review): __init__ stores the term in self._word but this
        # reads self.word — presumably SearchBase exposes a `word`
        # property; confirm against the base class.
        qs = u'index_letter:({0})'.format(self.word)
        fields.append("index_letter")
        self.query = MultifieldParser(fields, ix_letter.schema).parse(qs)

    def get_json(self):
        """Serialize the results as JSON; returns (body, HTTP status)."""
        OK = 200
        status = OK
        results = self.get_results()

        all_results = []
        for result in results:
            verb = {}
            verb['verb_form'] = result['verb_form']
            # Only include the infinitive when it differs from the form.
            if result['verb_form'] != result['infinitive']:
                verb['infinitive'] = result['infinitive']
            all_results.append(verb)

        return json.dumps(all_results, indent=4, separators=(',', ': ')), status
class FromFullTest(TestCase):
    """Spot-check Collator.sort_key against known collation element keys."""

    def setUp(self):
        from pyuca import Collator
        self.c = Collator()

    def test_1(self):
        expected = (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000)
        self.assertEqual(self.c.sort_key("\u0332\u0334"), expected)

    def test_2(self):
        expected = (0x1991, 0x0000, 0x0020, 0x004A, 0x0000,
                    0x0002, 0x0002, 0x0000)
        self.assertEqual(self.c.sort_key("\u0430\u0306\u0334"), expected)

    def test_3(self):
        expected = (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000,
                    0x0020, 0x0020, 0x0020, 0x0020, 0x0000,
                    0x0002, 0x0002, 0x0002, 0x0002, 0x0000)
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"), expected)

    def test_4(self):
        expected = (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020, 0x0020,
                    0x0000, 0x0002, 0x0002, 0x0000)
        self.assertEqual(self.c.sort_key("\u4E00\u0021"), expected)

    def test_5(self):
        expected = (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020, 0x0020,
                    0x0000, 0x0002, 0x0002, 0x0000)
        self.assertEqual(self.c.sort_key("\u3400\u0021"), expected)
def show_event(id):
    """Render the public detail page for a single event.

    Collects three sets for the template: composers, the remaining
    participants, and the union of all people involved; composers and
    participants are sorted in Unicode collation order of their names.
    """
    try:
        event = Event.query.filter_by(id=id).first_or_404()
    # Fix: narrowed from a bare `except:` (which also swallowed
    # SystemExit/KeyboardInterrupt); any lookup failure still maps to 404.
    except Exception:
        abort(404)

    # I need to prefill these variables here to simplify the template
    participantes, compositores, personas = set(), set(), set()
    for i in event.participants:
        if i.person and i.activity.name == "Compositor/a":
            i.person.is_composer = True
            compositores.add(i.person)
            personas.add(i.person)
        else:
            participantes.add(i)
            if i.person:
                personas.add(i.person)

    # Now, iterate in performances to extract other composers
    for p in event.performances:
        for c in p.musical_piece.composers:
            c.is_composer = True
            compositores.add(c)
            personas.add(c)

    collator = Collator()
    compositores = sorted(compositores,
                          key=lambda e: collator.sort_key(e.get_name()))
    participantes = sorted(participantes,
                           key=lambda e: collator.sort_key(e.get_name()))
    return render_template('public/detalle.html',
                           e=event,
                           participantes=participantes,
                           compositores=compositores,
                           personas=personas,
                           request=request)
def GenerateCollationEquivalenceTable(unicodecharlist):
    """Map collation-equivalent characters onto one canonical character.

    Characters are bucketed on their first two (rationalized) collation
    keys; within each multi-character bucket every character is mapped to
    the member with the lowest full collation order (e.g. "A" -> "a").

    Returns a dict {codepoint: canonical_codepoint}.
    """
    from functools import cmp_to_key

    charbuckets = {}
    C = Collator()

    def internal_sortfunc(codepointA, codepointB):
        # Old-style cmp function: order by the 3rd/4th rationalized keys.
        # (Renamed the local from `cmp`, which shadowed a builtin.)
        A = rationalizeCollationKeys(C.sort_key(codepointA))
        B = rationalizeCollationKeys(C.sort_key(codepointB))
        result = 0
        if (A[2], A[3]) < (B[2], B[3]):
            result = -1
        elif (A[2], A[3]) > (B[2], B[3]):
            result = 1
        return result

    for codepoint in unicodecharlist:
        # Up to 4 collation keys are returned, we group on first two non-zero keys
        collationkeys = rationalizeCollationKeys(C.sort_key(codepoint))
        if collationkeys[0] == 0:
            continue
        # Not sure why case-ish transitions map to this value in the Unicode standard,
        # but this value seems to be consitently used in this way across all scripts.
        if collationkeys[1][0] != 32:
            continue
        k0 = collationkeys[0]
        k1 = collationkeys[1]
        if k0 not in charbuckets:
            charbuckets[k0] = {}
        if k1 not in charbuckets[k0]:
            charbuckets[k0][k1] = []
        charbuckets[k0][k1].append(codepoint)

    codepointMap = {}
    for k1 in charbuckets:
        for k2 in charbuckets[k1]:
            # This is what we are looking for: buckets containing multiple characters.
            # Find the character with the lowest sort order in the bucket according
            # to it's full collation key sequence and map all of the other characters
            # in the bucket to this "smallest" characeter. For instance this maps
            # "A" to "a".
            if len(charbuckets[k1][k2]) > 1:
                # Bug fix: Python 3's sorted() no longer accepts a cmp
                # function positionally; cmp_to_key (available since 2.7)
                # keeps this working on both lines.
                s = sorted(charbuckets[k1][k2],
                           key=cmp_to_key(internal_sortfunc))
                for codepoint in s[1:]:
                    codepointMap[codepoint] = s[0]
    return codepointMap
class UnicodeCollationNormalizer(SimpleNormalizer):
    """Normalizer that turns a string into its pyuca collation sort key.

    Only, but very, useful for sorting.
    """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        keyPath = self.get_path(session, 'keyFile', 'allkeys.txt')
        # Import lazily: no pyuca installed means no problem until used.
        from pyuca import Collator
        self.collator = Collator(keyPath)

    def process_string(self, session, data):
        """Return the zero-padded decimal collation key for *data*."""
        # Special-case eszett so it sorts like "ss".
        normalized = data.replace(u'\u00DF', 'ss')
        key_ints = self.collator.sort_key(normalized)
        return ''.join("%04d" % value for value in key_ints)
def person(initial="A"):
    """Render the listing of people whose name begins with *initial*.

    Unknown initials redirect back to 'A'. People without a last name
    are matched on their first name instead.
    """
    if initial not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        return redirect(url_for('.person', initial='A'))
    try:
        # A plain last_name ILIKE filter would miss people without a
        # last name, so those rows are matched on first_name.
        pattern = initial + "%"
        missing_last_name = and_(Person.last_name == '',
                                 Person.first_name.ilike(pattern))
        personas = Person.query.filter(
            or_(missing_last_name, Person.last_name.ilike(pattern))).all()

        collator = Collator()
        personas.sort(key=lambda p: collator.sort_key(p.get_name().upper()))
        return render_template('public/person_initial.html',
                               initial=initial,
                               personas=personas)
    except TemplateNotFound:
        abort(404)
def create_language_name_map(self) -> None:
    """Regenerate locale/language_name_map.json from language_options.json.

    Each entry's "name_local" is promoted to "name", and the list is
    written sorted in Unicode collation order of that name.
    """
    deploy_root = settings.DEPLOY_ROOT
    source_path = os.path.join(deploy_root, "locale", "language_options.json")
    target_path = os.path.join(deploy_root, "locale", "language_name_map.json")

    with open(source_path, "rb") as reader:
        languages = orjson.loads(reader.read())

    lang_list = []
    for lang_info in languages["languages"]:
        # Rename the key: "name_local" becomes "name".
        lang_info["name"] = lang_info.pop("name_local")
        lang_list.append(lang_info)

    collator = Collator()
    lang_list.sort(key=lambda lang: collator.sort_key(lang["name"]))

    with open(target_path, "wb") as output_file:
        output_file.write(
            orjson.dumps(
                {"name_map": lang_list},
                option=orjson.OPT_APPEND_NEWLINE
                | orjson.OPT_INDENT_2
                | orjson.OPT_SORT_KEYS,
            ))
def sort_by_name(iterable):
    """Sort by a translatable name, using pyuca for a better result."""
    collator = Collator()

    def name_key(obj):
        return collator.sort_key(str(obj.name))

    return sorted(iterable, key=name_key)
class FromFullTest(unittest.TestCase):
    """Spot-checks of Collator.sort_key, gated on Python/UCA version."""

    def __init__(self, *args, **kwargs):
        from pyuca import Collator
        super(FromFullTest, self).__init__(*args, **kwargs)
        self.c = Collator()
        # Fix: removed a stray no-op tuple expression that sat here
        # doing nothing.

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_1(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_2(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x1991, 0x0000, 0x0020, 0x004A, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_3(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000,
             0x0020, 0x0020, 0x0020, 0x0020, 0x0000,
             0x0002, 0x0002, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_4(self):
        self.assertEqual(
            self.c.sort_key("\u4E00\u0021"),
            (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020, 0x0020,
             0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_5(self):
        self.assertEqual(
            self.c.sort_key("\u3400\u0021"),
            (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020, 0x0020,
             0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_1_old(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x007C, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000)
        )
@unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version") def test_2_old(self): self.assertEqual( self.c.sort_key("\u0430\u0306\u0334"), (0x15B0, 0x0000, 0x0020, 0x007C, 0x0000, 0x0002, 0x0002, 0x0000) ) @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version") def test_3_old(self): self.assertEqual( self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"), (0x205B, 0x206D, 0x2070, 0x120F, 0x0000, 0x0020, 0x0020, 0x0020, 0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000) ) @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version") def test_4_old(self): self.assertEqual( self.c.sort_key("\u4E00\u0021"), (0xFB40, 0xCE00, 0x026E, 0x0000, 0x0020, 0x0020, 0x0000, 0x0002, 0x0002, 0x0000) ) @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version") def test_5_old(self): self.assertEqual( self.c.sort_key("\u3400\u0021"), (0xFB80, 0xB400, 0x026E, 0x0000, 0x0020, 0x0020, 0x0000, 0x0002, 0x0002, 0x0000) )
#!/usr/bin/env python3
"""Group indented continuation lines under their heading line, then print
the groups sorted by Unicode collation order of the heading."""
from collections import defaultdict
import sys

from pyuca import Collator

c = Collator()

filename = sys.argv[1]

entries = defaultdict(list)

key = None
with open(filename) as f:
    for line in f:
        if line.strip() == "":
            continue
        elif line.startswith(" "):
            # Fix: this invariant was previously checked with `assert`,
            # which is stripped under `python -O`; raise explicitly so
            # malformed input always fails loudly.
            if key is None:
                raise ValueError(
                    "continuation line encountered before any heading")
            entries[key].append(line.rstrip())
        else:
            # Unindented line starts a new group.
            key = line.strip()

for key, lines in sorted(entries.items(), key=lambda i: c.sort_key(i[0])):
    print()
    print(key)
    for line in lines:
        print(line)
from morphgnt.utils import load_yaml

# NOTE(review): Python 2 code (print statements, str.decode on file
# lines). `unicodedata`, `sys` and `collator` are read here but not
# defined in this chunk — presumably imported/created earlier in the
# file; confirm.


def n(x):
    # Canonicalize to NFKC so headword comparisons ignore
    # compatibility-form differences.
    return unicodedata.normalize("NFKC", x)

lexemes = load_yaml("lexemes.yaml")

# Build the set of known BDAG headwords (NFKC-normalized unicode).
headwords = set()
with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as f:
    for line in f:
        headwords.add(n(line.strip().decode("utf-8")))

existing_not_in_headwords = []
missing_not_in_headwords = []
added = []

# Walk lexemes in collation order, echoing each entry and recording
# which bdag-headwords are missing from the headword list.
for lexeme, metadata in sorted(lexemes.items(), key=lambda x: collator.sort_key(x[0])):
    if "bdag-headword" in metadata:
        print "{}:\n pos: {}\n bdag-headword: {}".format(lexeme.encode("utf-8"), metadata["pos"], metadata["bdag-headword"].encode("utf-8"))
        if metadata["bdag-headword"] not in headwords:
            existing_not_in_headwords.append(metadata["bdag-headword"].encode("utf-8"))
    else:
        if lexeme in headwords:
            # The lexeme itself is a headword; adopt it as the entry's
            # bdag-headword in the printed output.
            print "{}:\n pos: {}\n bdag-headword: {}".format(lexeme.encode("utf-8"), metadata["pos"], lexeme.encode("utf-8"))
            added.append(lexeme.encode("utf-8"))
        else:
            print "{}:\n pos: {}".format(lexeme.encode("utf-8"), metadata["pos"])
            missing_not_in_headwords.append(lexeme.encode("utf-8"))

# Report diagnostics on stderr so stdout stays machine-consumable.
print >>sys.stderr, "existing"
for word in existing_not_in_headwords:
    print >>sys.stderr, "\t", word
c = Collator()

prev_sort_key = None

success = 0
failure = 0

# Conformance check: sort keys over the non-ignorable test file must be
# non-decreasing from one line to the next.
with open("CollationTest/CollationTest_NON_IGNORABLE.txt") as f:
    for line in f:
        # Drop trailing comments and the expected-key column; keep the
        # whitespace-separated code points.
        points = line.split("#")[0].split(";")[0].strip().split()
        if not points:
            continue
        test_string = "".join(chr(int(point, 16)) for point in points)
        current_key = c.sort_key(test_string)
        formatted = format_sort_key(current_key)
        if prev_sort_key:
            if prev_sort_key > current_key:
                failure += 1
                print(line)
                print(formatted)
            else:
                success += 1
        prev_sort_key = current_key

print()
print("{} success; {} failure".format(success, failure))

if failure > 0:
    sys.exit(1)
success = 0
failure = 0

# NOTE(review): `c`, `prev_sort_key` and `format_sort_key` are read here
# but not defined in this chunk — presumably set up earlier in the file;
# confirm.
path = "CollationTest/{0}/CollationTest_NON_IGNORABLE.txt".format(
    c.UCA_VERSION)
with open(path) as f:
    for i, line in enumerate(f.readlines()):
        # Drop trailing comments and the expected-key column; keep the
        # whitespace-separated code points.
        points = line.split("#", 1)[0].split(";", 1)[0].strip().split()
        if points:
            test_string = "".join(
                chr(int(point, 16)) for point in points
            )
            test_string_sort_key = c.sort_key(test_string)
            if prev_sort_key:
                # Conformance requires non-decreasing sort keys from one
                # test line to the next.
                if prev_sort_key > test_string_sort_key:
                    failure += 1
                    print('-------')
                    print("failed on line {0}:".format(i+1))
                    print(line.rstrip('\n'))
                    print("PREV: {0}".format(format_sort_key(prev_sort_key)))
                    print("THIS: {0}".format(
                        format_sort_key(test_string_sort_key)))
                    print('-------')
                else:
                    success += 1
            prev_sort_key = test_string_sort_key

print("")
def n(x): return unicodedata.normalize("NFKC", x) lexemes = load_yaml("lexemes.yaml") headwords = set() with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as f: for line in f: headwords.add(n(line.strip().decode("utf-8"))) existing_not_in_headwords = [] missing_not_in_headwords = [] added = [] for lexeme, metadata in sorted(lexemes.items(), key=lambda x: collator.sort_key(x[0])): if "bdag-headword" in metadata: print "{}:\n pos: {}\n bdag-headword: {}".format( lexeme.encode("utf-8"), metadata["pos"], metadata["bdag-headword"].encode("utf-8")) if metadata["bdag-headword"] not in headwords: existing_not_in_headwords.append( metadata["bdag-headword"].encode("utf-8")) else: if lexeme in headwords: print "{}:\n pos: {}\n bdag-headword: {}".format( lexeme.encode("utf-8"), metadata["pos"], lexeme.encode("utf-8")) added.append(lexeme.encode("utf-8")) else: print "{}:\n pos: {}".format(lexeme.encode("utf-8"),
class FromFullTest(unittest.TestCase):
    """Spot-checks of Collator.sort_key, gated on Python version."""

    def __init__(self, *args, **kwargs):
        from pyuca import Collator
        super(FromFullTest, self).__init__(*args, **kwargs)
        self.c = Collator()
        # Fix: removed a stray no-op tuple expression that sat here
        # doing nothing.

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_1(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_2(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x1991, 0x0000, 0x0020, 0x004A, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_3(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000,
             0x0020, 0x0020, 0x0020, 0x0020, 0x0000,
             0x0002, 0x0002, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_4(self):
        self.assertEqual(self.c.sort_key("\u4E00\u0021"),
                         (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_5(self):
        self.assertEqual(self.c.sort_key("\u3400\u0021"),
                         (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_1_old(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x007C, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_2_old(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x15B0, 0x0000, 0x0020, 0x007C, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_3_old(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x205B, 0x206D, 0x2070, 0x120F, 0x0000,
             0x0020, 0x0020, 0x0020, 0x0020, 0x0000,
             0x0002, 0x0002, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_4_old(self):
        self.assertEqual(self.c.sort_key("\u4E00\u0021"),
                         (0xFB40, 0xCE00, 0x026E, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_5_old(self):
        self.assertEqual(self.c.sort_key("\u3400\u0021"),
                         (0xFB80, 0xB400, 0x026E, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))
#!/usr/bin/env python3
"""Rewrite a YAML file with its top-level keys in Unicode collation order."""
import sys

import yaml

from pyuca import Collator

c = Collator()

FILENAME = sys.argv[1]

with open(FILENAME) as f:
    # load yaml
    data = yaml.safe_load(f)

# sort based on the keys using pyuca
ordered_items = sorted(data.items(), key=lambda item: c.sort_key(item[0]))
data = dict(ordered_items)

with open(FILENAME, "w") as g:
    yaml.dump(data, g, sort_keys=False, allow_unicode=True)