def __init__(self):
    """Set up the sorter module: HTML template, Silpa and plain UCA collators."""
    base_dir = os.path.dirname(__file__)
    self.template = os.path.join(base_dir, "sort.html")
    # Two collators: one with Silpa-specific weights, one with the stock
    # UCA 6.0.0 table.
    self.silpacollator = Collator(os.path.join(base_dir, "allkeys-silpa-6.0.0.txt"))
    self.ucacollator = Collator(os.path.join(base_dir, "allkeys-6.0.0.txt"))
    self.response = SilpaResponse(self.template)
def __init__(self, session, config, parent):
    """Initialise the normalizer and build a pyuca collator from 'keyFile'."""
    SimpleNormalizer.__init__(self, session, config, parent)
    key_path = self.get_path(session, "keyFile", "allkeys.txt")
    # Import lazily: hosts without pyuca can still import this module.
    from pyuca import Collator
    self.collator = Collator(key_path)
def GenerateCollationEquivalenceTable(unicodecharlist):
    """Map case/variant codepoints onto one canonical "smallest" codepoint.

    Characters are bucketed on their first two (rationalized) collation
    keys; in any bucket holding more than one character, every member is
    mapped to the member that sorts lowest by its remaining key levels
    (e.g. "A" maps to "a").

    Returns a dict {codepoint: canonical_codepoint}.

    Fix: the original called ``sorted(seq, cmpfunc)`` with a comparison
    function as a positional argument (Python 2 only) and shadowed the
    ``cmp`` builtin; replaced by an equivalent ``key=`` function that
    produces the same ordering.
    """
    charbuckets = {}
    C = Collator()

    def tail_sort_key(codepoint):
        # Tie-break inside a bucket on the 3rd/4th collation keys only,
        # exactly as the original comparator did.
        keys = rationalizeCollationKeys(C.sort_key(codepoint))
        return (keys[2], keys[3])

    for codepoint in unicodecharlist:
        # Up to 4 collation keys are returned; we group on the first two
        # non-zero keys.
        collationkeys = rationalizeCollationKeys(C.sort_key(codepoint))
        if collationkeys[0] == 0:
            continue
        # Not sure why case-ish transitions map to this value in the Unicode
        # standard, but this value seems to be consistently used in this way
        # across all scripts.
        if collationkeys[1][0] != 32:
            continue
        k0 = collationkeys[0]
        k1 = collationkeys[1]
        charbuckets.setdefault(k0, {}).setdefault(k1, []).append(codepoint)

    codepointMap = {}
    for k1 in charbuckets:
        for k2 in charbuckets[k1]:
            # This is what we are looking for: buckets containing multiple
            # characters.  Find the character with the lowest sort order in
            # the bucket according to its full collation key sequence and map
            # all of the other characters in the bucket to this "smallest"
            # character.  For instance this maps "A" to "a".
            if len(charbuckets[k1][k2]) > 1:
                s = sorted(charbuckets[k1][k2], key=tail_sort_key)
                for codepoint in s[1:]:
                    codepointMap[codepoint] = s[0]
    return codepointMap
def get_ranked_documents(self, query_string): ranked_documents = list() # Cuando el sistema inicia con la colección procesada y llega una consulta # Se procesa la consulta utilizando la función del analizador query_terms = Analyzer.retrieve_html_str_terms(query_string) # Se buscan los términos de la consulta en el vocabulario # Filtered query terms debe estar ordenado alfabéticamente para luego # utilizar dicho ordenamiento en las entradas de los vectores filtered_query_terms = [ term for term in query_terms if term in self.__collection_vocabulary.keys() ] # Caso en el que ninguno de los términos de la consulta existe en el vocabulario de la colección if len(filtered_query_terms) == 0: return ranked_documents query_terms_np_array = np.array(filtered_query_terms) terms, counts = np.unique(query_terms_np_array, return_counts=True) query_vocabulary = dict(zip(terms, counts)) max_l_freq_lq = max(counts) collator = Collator() final_query_terms = sorted(query_vocabulary.keys(), key=collator.sort_key) # Se obtiene el vector de pesos de la consulta query_weights_vector = self.get_query_weights_vector( final_query_terms, max_l_freq_lq, query_vocabulary) # Se recuperan las listas de posteo de cada palabra involucrada postings_lists = self.__collection_handler.get_postings_lists( final_query_terms) # Se obtienen los vectores de pesos para los documentos de las listas de posteo documents_weights_vectors = SearchEngine.get_documents_weights_short_vectors( postings_lists, final_query_terms) # Se calcula la similaridad del peso de cada documento de la lista de posteo con el peso de la consulta sorted_documents_aliases = sorted(documents_weights_vectors.keys(), key=collator.sort_key) query_documents_dot_products = SearchEngine.get_query_documents_dot_products( query_weights_vector, documents_weights_vectors, sorted_documents_aliases) query_documents_norms_products = self.get_query_documents_norms_products( query_weights_vector, sorted_documents_aliases) similarities = 
query_documents_dot_products / query_documents_norms_products # Se hace el ranking respectivo para ordenar los documentos # Los documentos se identifican por ID en la lista de document_entries, # y lo que se devuelve es el document entry de cada uno ascending_ranked_similarities = np.argsort(similarities) descending_ranked_similarities = ascending_ranked_similarities[::-1] for index in descending_ranked_similarities: document_complete_alias = sorted_documents_aliases[index] ranked_documents.append(document_complete_alias) return ranked_documents
def test_cafe(self):
    """UCA sorting places "café" next to its base word, unlike codepoint sort."""
    from pyuca import Collator
    collator = Collator()
    words = ["cafe", "caff", "café"]
    # Plain string comparison orders by codepoint, leaving "café" last.
    self.assertEqual(sorted(words), ["cafe", "caff", "café"])
    # Collation-aware sorting groups the accented form with its base letter.
    self.assertEqual(sorted(words, key=collator.sort_key),
                     ["cafe", "café", "caff"])
def person(initial="A"):
    """Render the people whose name starts with `initial` (A-Z only)."""
    if initial not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        return redirect(url_for('.person', initial='A'))
    try:
        # First attempt failed, some people have no last name:
        # personas = Person.query.filter(Person.last_name.ilike(initial + "%")).all()
        name_filter = or_(
            and_(Person.last_name == '',
                 Person.first_name.ilike(initial + "%")),
            Person.last_name.ilike(initial + "%"))
        personas = Person.query.filter(name_filter).all()
        # Uppercase before collating so case does not affect the ordering.
        collator = Collator()
        personas.sort(key=lambda e: collator.sort_key(e.get_name().upper()))
        return render_template('public/person_initial.html',
                               initial=initial,
                               personas=personas)
    except TemplateNotFound:
        abort(404)
class IndexLetter(SearchBase):
    """Whoosh search over `index_letter`, returning verb forms in UCA order.

    Duplicate verb forms are collapsed to a single hit.
    """

    def __init__(self, word):
        self._word = word
        self.searcher = None
        self.query = None
        self.collator = Collator()
        self.num_results = 0

    def get_num_results(self):
        return self.num_results

    def sort_key(self, string):
        # Facet values arrive as UTF-8 bytes (Python 2 style); decode before
        # collating.
        s = string.decode("utf-8")
        return self.collator.sort_key(s)

    def get_results(self):
        """Run the query (lazily) and return hits sorted by collated verb form."""
        if self.searcher is None:
            self.search()
        facet = FieldFacet("verb_form")
        # Route the facet values through the collator so accented forms sort
        # correctly.
        facet = TranslateFacet(self.sort_key, facet)
        results = self.searcher.search(self.query, limit=None, sortedby=facet,
                                       collapse_limit=1, collapse='verb_form')
        self.num_results = len(results)
        return results

    def search(self):
        # Build the query against the index_letter field.
        self.searcher = ix_letter.searcher()
        fields = []
        # NOTE(review): this reads self.word while __init__ stores self._word;
        # presumably SearchBase exposes a `word` property — confirm.
        qs = u'index_letter:({0})'.format(self.word)
        fields.append("index_letter")
        self.query = MultifieldParser(fields, ix_letter.schema).parse(qs)

    def get_json(self):
        """Return (JSON string of results, HTTP status)."""
        OK = 200
        status = OK
        results = self.get_results()
        all_results = []
        for result in results:
            verb = {}
            verb['verb_form'] = result['verb_form']
            # Only include the infinitive when it differs from the form itself.
            if result['verb_form'] != result['infinitive']:
                verb['infinitive'] = result['infinitive']
            all_results.append(verb)
        return json.dumps(all_results, indent=4, separators=(',', ': ')), status
class FromFullTest(TestCase):
    """Spot-check Collator.sort_key against known UCA key sequences."""

    def setUp(self):
        # Fresh default Collator (stock allkeys table) for every test.
        from pyuca import Collator
        self.c = Collator()

    def test_1(self):
        # Combining marks only: no primary weight, secondary weights present.
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    def test_2(self):
        # Cyrillic base with combining marks.
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x1991, 0x0000, 0x0020, 0x004A, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    def test_3(self):
        # Tibetan sequence including a control character (U+0001).
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000, 0x0020, 0x0020,
             0x0020, 0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002,
             0x0000)
        )

    def test_4(self):
        # CJK ideograph followed by punctuation (implicit weights).
        self.assertEqual(
            self.c.sort_key("\u4E00\u0021"),
            (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020, 0x0020,
             0x0000, 0x0002, 0x0002, 0x0000)
        )

    def test_5(self):
        # CJK Extension A ideograph followed by punctuation.
        self.assertEqual(
            self.c.sort_key("\u3400\u0021"),
            (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020, 0x0020,
             0x0000, 0x0002, 0x0002, 0x0000)
        )
def sort_language_constants():
    """
    function to generate correct ordering of constants.LANGUAGES list
    sorted by Unicode characters
    """
    collator = Collator()
    # Human-readable display name for every language code, capitalised.
    display_names = [
        Locale(code).get_language_name(code).capitalize()
        for code in constants.LANGUAGES
    ]
    name_to_code = dict(zip(display_names, constants.LANGUAGES))
    # Collation-aware sort so non-ASCII names land in the right place.
    ordered_names = sorted(display_names, key=collator.sort_key)
    return [name_to_code[name] for name in ordered_names]
def show_event(id):
    """Render the detail page for one event with collated participant lists."""
    # first_or_404 already raises a 404 for a missing event; this guard only
    # covers query errors (e.g. a malformed id).  The original bare `except:`
    # would also swallow SystemExit/KeyboardInterrupt, so catch Exception.
    try:
        event = Event.query.filter_by(id=id).first_or_404()
    except Exception:
        abort(404)
    # Prefill these variables here to simplify the template.
    participantes, compositores, personas = set(), set(), set()
    for i in event.participants:
        if i.person and i.activity.name == "Compositor/a":
            i.person.is_composer = True
            compositores.add(i.person)
            personas.add(i.person)
        else:
            participantes.add(i)
            if i.person:
                personas.add(i.person)
    # Now, iterate in performances to extract other composers.
    for p in event.performances:
        for c in p.musical_piece.composers:
            c.is_composer = True
            compositores.add(c)
            personas.add(c)
    # Sort both groups by collated display name.
    collator = Collator()
    compositores = sorted(compositores,
                          key=lambda e: collator.sort_key(e.get_name()))
    participantes = sorted(participantes,
                           key=lambda e: collator.sort_key(e.get_name()))
    return render_template('public/detalle.html',
                           e=event,
                           participantes=participantes,
                           compositores=compositores,
                           personas=personas,
                           request=request)
def tcompare(text1, text2):  # short for "text comparer"
    """Print the non-empty lines common to both texts, in collated order.

    Behaviour matches the original nested scan — a line is printed once per
    matching occurrence in `text2` — but runs in O(n + m) with a Counter
    instead of the accidental O(n * m) double loop.
    """
    from collections import Counter
    c = Collator("allkeys.txt")  # from James Tauber's "pyuca"
    text1split = text1.splitlines()
    text2split = text2.splitlines()
    sortedwords1 = sorted(text1split, key=c.sort_key)
    occurrences2 = Counter(text2split)
    for line1 in sortedwords1:
        if line1 != '':
            # Preserve the original multiplicity: one print per match in text2.
            for _ in range(occurrences2[line1]):
                print(line1)
def create_language_name_map(self) -> None:
    """Build locale/language_name_map.json from locale/language_options.json.

    Each entry's `name_local` becomes `name`, and the list is sorted by
    that name using UCA collation.
    """
    locale_dir = os.path.join(settings.DEPLOY_ROOT, "locale")
    source_path = os.path.join(locale_dir, "language_options.json")
    target_path = os.path.join(locale_dir, "language_name_map.json")
    with open(source_path, "rb") as reader:
        languages = orjson.loads(reader.read())
    lang_list = []
    for lang_info in languages["languages"]:
        # Promote the local name to the canonical `name` key.
        lang_info["name"] = lang_info.pop("name_local")
        lang_list.append(lang_info)
    collator = Collator()
    lang_list.sort(key=lambda lang: collator.sort_key(lang["name"]))
    with open(target_path, "wb") as output_file:
        output_file.write(
            orjson.dumps(
                {"name_map": lang_list},
                option=orjson.OPT_APPEND_NEWLINE
                | orjson.OPT_INDENT_2
                | orjson.OPT_SORT_KEYS,
            ))
class UnicodeCollationNormalizer(SimpleNormalizer):
    """
    Use pyuca to create sort key for string
    Only, but Very, useful for sorting
    """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        key_path = self.get_path(session, 'keyFile', 'allkeys.txt')
        # Deferred import: if pyuca is missing the module still loads.
        from pyuca import Collator
        self.collator = Collator(key_path)

    def process_string(self, session, data):
        """Return a zero-padded string form of the UCA sort key for `data`."""
        # Fix eszett sorting: treat the German sharp s as "ss".
        data = data.replace(u'\u00DF', 'ss')
        # Render each collation integer as a fixed-width decimal so plain
        # string comparison reproduces the collation order.
        return ''.join('%04d' % value for value in self.collator.sort_key(data))
def aTergo(date):
    """Build "a tergo" (reversed-word) index pages from a dump.

    Dump titles are reversed, grouped per language, UCA-sorted, and written
    out one wiki page per initial letter (currently Afrikaans only).
    """
    site = pywikibot.getSite()
    c = Collator('allkeys.txt')
    # If not called from afterDump.py, it can be changed to
    # getListFromXML('foo', True) - will fetch the latest dump.
    pageList = getListFromXML(date)
    sortDict = collections.defaultdict()
    sortedDict = collections.defaultdict()
    # Sweep through the dump and add reversed words to appropriate dictionaries.
    for elem in pageList:
        try:
            # Use the parser in klasa.py to parse an xml entry.
            page = Haslo(elem.title, elem.text)
        except sectionsNotFound:
            continue
        except WrongHeader:
            continue
        else:
            if page.type == 3:
                for section in page.listLangs:
                    if section.lang in sortDict:
                        sortDict[section.lang].append(page.title[::-1])
                    else:
                        sortDict[section.lang] = [page.title[::-1]]
    sortedDict['afrykanerski'] = sorted(sortDict['afrykanerski'], key=c.sort_key)
    letter = sortedDict['afrykanerski'][0][0]
    text = '{| class=hiddentable style="text-align:right"\n|-'
    counter = 0
    # BUG FIX: the original iterated `range(len(...))` without using the index
    # and kept referencing the stale `elem` left over from the dump sweep
    # above; iterate the sorted words themselves instead.
    for word in sortedDict['afrykanerski']:
        if word[0] == letter:
            text = text + '|[[%s|%s]]|\n' % (word[::-1], word)
        else:
            # Letter changed: flush the accumulated page and start a new one.
            pywikibot.Page(
                site,
                'Wikipedysta:Alkamid/atergo/afrykanerski/%s' % letter).put(text)
            text = ''
            letter = word[0]
            text = text + '* %s\n' % word
def sort_by_name(iterable):
    """Sort by a translatable name, using pyuca for a better result."""
    collator = Collator()

    def name_key(obj):
        # Collate on the stringified name attribute.
        return collator.sort_key(str(obj.name))

    return sorted(iterable, key=name_key)
import sys

from pyuca import Collator
from pyuca.utils import format_sort_key

c = Collator()

prev_sort_key = None
success = 0
failure = 0

# Walk the official UCA conformance file: each line's sort key must compare
# greater than or equal to the previous line's key.
with open("CollationTest/CollationTest_NON_IGNORABLE.txt") as f:
    # Iterate lazily instead of f.readlines(), which materialized the whole
    # (large) conformance file in memory.
    for line in f:
        # Strip the trailing comment, then take the code point field.
        points = line.split("#")[0].split(";")[0].strip().split()
        if points:
            test_string = "".join(
                chr(int(point, 16)) for point in points
            )
            test_string_sort_key = c.sort_key(test_string)
            x = format_sort_key(test_string_sort_key)
            if prev_sort_key:
                if prev_sort_key > test_string_sort_key:
                    failure += 1
                    print(line)
                    print(x)
                else:
                    success += 1
            prev_sort_key = test_string_sort_key
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator

c = Collator("corpustxt/allkeys.txt")

arq = "corpustxt/catatau.txt"
# Read the file as one Unicode string from its UTF-8 bytes.
fileObj = codecs.open(arq, "r", "utf-8")
catatau = fileObj.read()

# Split the text into sentences with the Portuguese Punkt tokenizer.
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(catatau)

# Drop duplicated sentences.
frases = list(set(catalinhas))

# Sort with the pyuca collation order so accented words sort naturally.
frases = sorted(frases, key=c.sort_key)
from pyuca import Collator

col = Collator()

# Renamed from `dict`, which shadowed the builtin type.
translations = {}
english_word = ""
# `with` guarantees the file is closed even if parsing raises.
with open("dictionary.txt", "r", encoding="utf8") as file_obj:
    # The file alternates English lines (even) and Greek lines (odd).
    for count, line in enumerate(file_obj):
        if count % 2 == 0:
            # English headword line.
            english_word = line.rstrip("\n")
        else:
            # Greek translation line.  Check if the word is already in the
            # mapping and, if so, make multiple numbered entries.
            # NOTE(review): this only distinguishes two duplicates; a third
            # occurrence of the same English word would collide with the
            # "(1)"/"(2)" scheme again — confirm the input never repeats a
            # word more than twice.
            if english_word in translations:
                translations[english_word + " (1)"] = translations.pop(english_word)
                translations[english_word + " (2)"] = line.rstrip("\n")
            else:
                translations[english_word] = line.rstrip("\n")
def unicode_sorted_key(key):
    """Return the UCA sort key for `key`.

    The Collator is created once and cached on the function: constructing
    one re-parses the entire allkeys table, which is far too expensive to
    repeat on every call (the original rebuilt it per invocation).
    """
    collator = getattr(unicode_sorted_key, "_collator", None)
    if collator is None:
        collator = unicode_sorted_key._collator = Collator()
    return collator.sort_key(key)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator

c = Collator("corpustxt/allkeys.txt")

arq = "corpustxt/exclama.txt"
# Returns a Unicode string from the UTF-8 bytes in the file.
fileObj = codecs.open(arq, "r", "utf-8")
catatau = fileObj.read()

# Sentence segmentation with the Portuguese Punkt model.
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(catatau)

# De-duplicate, then sort with pyuca collation so accents order naturally.
frases = sorted(set(catalinhas), key=c.sort_key)
#frases.reverse()
# ends with a question mark.

txt = ""
conta = 0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator

c = Collator("allkeys.txt")

arq = "catatau.txt"
fileObj = codecs.open(arq, "r", "utf-8")
# Returns a Unicode string from the UTF-8 bytes in the file.
catatau = fileObj.read()

# Split into sentences.
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(catatau)


def cleanupDoc(s):
    """Remove Portuguese stopwords from `s` and rejoin the surviving words."""
    stopset = set(stopwords.words('portuguese'))
    # The original also computed nltk.word_tokenize(s) here but never used
    # the result; the dead call has been removed.
    cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
    return cleanup
def __init__(self):
    """Create the Silpa-specific and the stock UCA 6.0.0 collators."""
    here = os.path.dirname(__file__)
    self.silpacollator = Collator(os.path.join(here, "allkeys-silpa-6.0.0.txt"))
    self.ucacollator = Collator(os.path.join(here, "allkeys-6.0.0.txt"))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator

c = Collator("corpustxt/allkeys.txt")

arq = "corpustxt/catatau_semlinebreaks.txt"
# Decode the UTF-8 file into one Unicode string.
fileObj = codecs.open(arq, "r", "utf-8")
catatau = fileObj.read()

# Break the text into sentences (Portuguese Punkt model).
stok = nltk.data.load("tokenizers/punkt/portuguese.pickle")
catalinhas = stok.tokenize(catatau)

# Remove duplicated sentences, then sort by UCA collation so accented
# words fall in their natural place.
frases = sorted(set(catalinhas), key=c.sort_key)
# frases.reverse()
# ends with a question mark.
#!/usr/bin/env python3

from collections import defaultdict
import sys

from pyuca import Collator

c = Collator()

filename = sys.argv[1]

entries = defaultdict(list)
key = None

# Parse an indented outline: an unindented line starts a new key, and the
# indented lines below it are collected under that key.
with open(filename) as f:
    for line in f:
        stripped = line.strip()
        if not stripped:
            continue
        if line.startswith(" "):
            assert key  # a continuation line before any key is malformed
            entries[key].append(line.rstrip())
        else:
            key = stripped

# Emit each group with the keys in UCA-collated order.
for key, lines in sorted(entries.items(), key=lambda item: c.sort_key(item[0])):
    print()
    print(key)
    for line in lines:
        print(line)
def _get_sorted_copy(self):
    '''
    Returns the SegmentPool as a list of tuples sorted appropriately for
    human consumption in *the current language*. This means that the
    _(NAME) value should determine the sort order of the outer dict and
    the _('segment_config') key should determine the order of the inner
    dicts. In both cases, the keys need to be compared in the provided
    language.

    Further note that the current language is given by get_language()
    and that this will reflect the CMS operator's user settings, NOT the
    current PAGE language.

    NOTE: that the structure of the sorted pool is different. Two of the
    nested dicts are now lists of tuples so that the sort can be
    retained.

        _sorted_segments = [
            (/class/, {
                NAME: _(/name/),
                CFGS: [
                    (/configuration_string/, {
                        LABEL: _(/configuration_string/),
                        OVERRIDES: {
                            /user.id/: /SegmentOverride enum value/,
                            ...
                        },
                        INSTANCES: [ ... ]
                    })
                ]
            })
        ]

    NOTE: On Python 3.0+ systems, we depend on pyuca for collation,
    which produces excellent results. On earlier systems, this is not
    available, so, we use a cruder mapping of accented characters into
    their unaccented ASCII equivalents.
    '''
    sort_key = None
    if sys.version_info >= (3, 0):
        uca = None
        #
        # Unfortunately, the pyuca class–which can provide collation of
        # strings in a thread-safe manner–is for Python 3.0+ only
        #
        try:
            from pyuca import Collator
            uca = Collator()
            sort_key = uca.sort_key
        # NOTE(review): bare except deliberately treats any pyuca failure
        # as "unavailable" and falls through to the unaccent fallback.
        except:
            pass
    if not sort_key:
        #
        # Our fallback position is to use a more simple approach of
        # mapping 'accented' chars to latin equivalents before sorting,
        # this is crude, but better than nothing.
        #
        from .unaccent import unaccented_map

        def sort_key(s):
            return s.translate(unaccented_map())
    pool = self.segments
    clone = []
    for cls_key in sorted(pool.keys()):
        cls_dict = {
            self.NAME: pool[cls_key][self.NAME],
            self.CFGS: list(),
        }
        clone.append((cls_key, cls_dict))
        # We'll build the CFG as a list in arbitrary order for now...
        for cfg_key in pool[cls_key][self.CFGS]:
            # Deep-ish copy of one configuration entry: label, per-user
            # overrides, and plugin instances.
            cfg_dict = {
                self.LABEL: pool[cls_key][self.CFGS][cfg_key][self.LABEL],
                self.OVERRIDES: dict(),
                self.INSTANCES: list(),
            }
            for username, override in pool[cls_key][self.CFGS][cfg_key][
                    self.OVERRIDES].items():
                cfg_dict[self.OVERRIDES][username] = override
            for instance in pool[cls_key][self.CFGS][cfg_key][
                    self.INSTANCES]:
                cfg_dict[self.INSTANCES].append(instance)
            cls_dict[self.CFGS].append((cfg_key, cfg_dict))
        #
        # Now, sort the CFGS by their LABEL, using which every means we
        # have available to us at this moment.
        #
        cls_dict[self.CFGS] = sorted(
            cls_dict[self.CFGS],
            key=lambda x: sort_key(force_text(x[1][self.LABEL])))
    return clone
from distutils.dir_util import copy_tree
from os.path import isfile, join
from subprocess import call
from tempfile import mkdtemp

from django.conf import settings
from django.core.management.base import CommandError
from django.template import Template
from django.utils import translation
from django_countries import countries
from pyuca import Collator

from hosting.models import Place

# Module-wide collator shared by the LaTeX generation helpers.
c = Collator()


class LatexCommand(object):
    # Settings for generating the PasportaServo LaTeX booklet.
    template_name = 'PasportaServo.tex'
    address_only = False
    make_pdf = False
    # TeX sources that make up the document.
    tex_files = [
        template_name,
        'pages/title.tex',
        'pages/address.tex',
    ]

    def activate_translation(self):
        # Render using the site's configured default language.
        translation.activate(settings.LANGUAGE_CODE)
def __init__(self, session, config, parent):
    """Build the normalizer; collation data comes from the 'keyFile' path."""
    SimpleNormalizer.__init__(self, session, config, parent)
    # pyuca is imported lazily so the module still loads when it is absent.
    from pyuca import Collator
    self.collator = Collator(self.get_path(session, 'keyFile', 'allkeys.txt'))
def __init__(self, *args, **kwargs):
    """Create the shared Collator used by every test in this case."""
    from pyuca import Collator
    super(FromFullTest, self).__init__(*args, **kwargs)
    self.c = Collator()
    # A stray, side-effect-free tuple literal `(0, 74, 33, 0, 2, 2, 0)`
    # (dead code) was removed from the end of this method.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator

c = Collator("allkeys.txt")

arq = "catatau.txt"
fileObj = codecs.open(arq, "r", "utf-8")
# The codecs reader yields one Unicode string from the UTF-8 bytes on disk.
catatau = fileObj.read()

# Sentence segmentation (Portuguese Punkt model).
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(catatau)

# Keep unique sentences only.
frases = list(set(catalinhas))

# pyuca collation keeps accented words in their natural order.
frases = sorted(frases, key=c.sort_key)
#frases.reverse()
# ends with a question mark.

txt = ""
conta = 0
#!/usr/bin/env python import sys import unicodedata from pyuca import Collator collator = Collator() from morphgnt.utils import load_yaml def n(x): return unicodedata.normalize("NFKC", x) lexemes = load_yaml("lexemes.yaml") headwords = set() with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as f: for line in f: headwords.add(n(line.strip().decode("utf-8"))) existing_not_in_headwords = [] missing_not_in_headwords = [] added = [] for lexeme, metadata in sorted(lexemes.items(), key=lambda x: collator.sort_key(x[0])): if "bdag-headword" in metadata: print "{}:\n pos: {}\n bdag-headword: {}".format(lexeme.encode("utf-8"), metadata["pos"], metadata["bdag-headword"].encode("utf-8")) if metadata["bdag-headword"] not in headwords: existing_not_in_headwords.append(metadata["bdag-headword"].encode("utf-8")) else: if lexeme in headwords: print "{}:\n pos: {}\n bdag-headword: {}".format(lexeme.encode("utf-8"), metadata["pos"], lexeme.encode("utf-8"))
def test_authors_ordering():
    """The AUTHORS file must be kept in UCA-collated order."""
    with open('AUTHORS', 'rt', encoding='utf8') as fh:
        authors = fh.readlines()
    expected = sorted(authors, key=Collator().sort_key)
    assert authors == expected
"""PytSite Object Document Mapper UI Plugin Widgets """ __author__ = 'Oleksandr Shepetko' __email__ = '*****@*****.**' __license__ = 'MIT' import htmler from typing import List, Callable, Union, Iterable, Tuple from pyuca import Collator from json import dumps as json_dumps from pytsite import lang from plugins import widget, odm, http_api, odm_http_api _pyuca_col = Collator() def _sanitize_kwargs_exclude(kwargs: dict): if not ('exclude' in kwargs and kwargs['exclude']): return if isinstance(kwargs['exclude'], odm.Entity): kwargs['exclude'] = [kwargs['exclude'].ref ] if not kwargs['exclude'].is_new else [] elif isinstance(kwargs['exclude'], str): kwargs['exclude'] = [kwargs['exclude']] elif isinstance(kwargs['exclude'], (list, tuple)): ex = [] for item in kwargs['exclude']: if isinstance(item, odm.Entity): if not item.is_new: ex.append(item.ref)
class FromFullTest(unittest.TestCase):
    """Check Collator.sort_key against expected UCA key sequences for both
    the Python 3 era tables and the older Python 2 era tables."""

    def __init__(self, *args, **kwargs):
        from pyuca import Collator
        super(FromFullTest, self).__init__(*args, **kwargs)
        self.c = Collator()
        # A stray, side-effect-free tuple literal `(0, 74, 33, 0, 2, 2, 0)`
        # (dead code) was removed here.

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_1(self):
        # Combining marks only: no primary weight.
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_2(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x1991, 0x0000, 0x0020, 0x004A, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_3(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000, 0x0020, 0x0020,
             0x0020, 0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002,
             0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_4(self):
        self.assertEqual(
            self.c.sort_key("\u4E00\u0021"),
            (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020, 0x0020,
             0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    @unittest.skipIf(V8_0_0, "not for UCA version 8.0.0")
    @unittest.skipIf(V10_0_0, "not for UCA version 10.0.0")
    def test_5(self):
        self.assertEqual(
            self.c.sort_key("\u3400\u0021"),
            (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020, 0x0020,
             0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_1_old(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x007C, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_2_old(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x15B0, 0x0000, 0x0020, 0x007C, 0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_3_old(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x205B, 0x206D, 0x2070, 0x120F, 0x0000, 0x0020, 0x0020,
             0x0020, 0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002,
             0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_4_old(self):
        self.assertEqual(
            self.c.sort_key("\u4E00\u0021"),
            (0xFB40, 0xCE00, 0x026E, 0x0000, 0x0020, 0x0020,
             0x0000, 0x0002, 0x0002, 0x0000)
        )

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_5_old(self):
        self.assertEqual(
            self.c.sort_key("\u3400\u0021"),
            (0xFB80, 0xB400, 0x026E, 0x0000, 0x0020, 0x0020,
             0x0000, 0x0002, 0x0002, 0x0000)
        )
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator
from string import ascii_lowercase

c = Collator("corpustxt/allkeys.txt")

# File under analysis (same directory tree).
arq = "corpustxt/catatau_semlinebreaks.txt"
fileObj = codecs.open(arq, "r", "utf-8")
mikrofesto = fileObj.read()

# Split into sentences with the Portuguese Punkt model.
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(mikrofesto)

# Tokenize, detaching punctuation from word endings and other tokens.
tokens = nltk.word_tokenize(mikrofesto)

# Stopword cleanup was left disabled:
#cleanupDoc(tokens)
# (next step: shape into an nltk data structure for later normalisation)
#!/usr/bin/env python """ C.11.5 Index and Glossary (p211) """ import string, os from plasTeX.Tokenizer import Token, EscapeSequence from plasTeX import Command, Environment from plasTeX.Logging import getLogger from Sectioning import SectionUtils try: from pyuca import Collator collator = Collator(os.path.join(os.path.dirname(__file__), 'allkeys.txt')).sort_key except ImportError: collator = lambda x: x.lower() class IndexUtils(object): """ Helper functions for generating indexes """ linkType = 'index' level = Command.CHAPTER_LEVEL class Index(Command): """ Utility class used to surface the index entries to the renderer """
class FromFullTest(unittest.TestCase):
    """Check Collator.sort_key against expected UCA key sequences for both
    the Python 3 era tables and the older Python 2 era tables."""

    def __init__(self, *args, **kwargs):
        from pyuca import Collator
        super(FromFullTest, self).__init__(*args, **kwargs)
        self.c = Collator()
        # A stray, side-effect-free tuple literal `(0, 74, 33, 0, 2, 2, 0)`
        # (dead code) was removed here.

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_1(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x004A, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_2(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x1991, 0x0000, 0x0020, 0x004A, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_3(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x2571, 0x2587, 0x258A, 0x15EB, 0x0000, 0x0020, 0x0020, 0x0020,
             0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_4(self):
        self.assertEqual(self.c.sort_key("\u4E00\u0021"),
                         (0xFB40, 0xCE00, 0x025D, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(not PYTHON3, "only matches Python 3's UCA version")
    def test_5(self):
        self.assertEqual(self.c.sort_key("\u3400\u0021"),
                         (0xFB80, 0xB400, 0x025D, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_1_old(self):
        self.assertEqual(
            self.c.sort_key("\u0332\u0334"),
            (0x0000, 0x007C, 0x0021, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_2_old(self):
        self.assertEqual(
            self.c.sort_key("\u0430\u0306\u0334"),
            (0x15B0, 0x0000, 0x0020, 0x007C, 0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_3_old(self):
        self.assertEqual(
            self.c.sort_key("\u0FB2\u0F71\u0001\u0F80\u0061"),
            (0x205B, 0x206D, 0x2070, 0x120F, 0x0000, 0x0020, 0x0020, 0x0020,
             0x0020, 0x0000, 0x0002, 0x0002, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_4_old(self):
        self.assertEqual(self.c.sort_key("\u4E00\u0021"),
                         (0xFB40, 0xCE00, 0x026E, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))

    @unittest.skipIf(PYTHON3, "only matches the older Python 2's UCA version")
    def test_5_old(self):
        self.assertEqual(self.c.sort_key("\u3400\u0021"),
                         (0xFB80, 0xB400, 0x026E, 0x0000, 0x0020, 0x0020,
                          0x0000, 0x0002, 0x0002, 0x0000))
from smithers import data_types
from smithers import redis_keys as rkeys
from smithers.utils import get_epoch_minute

# A real Redis connection when enabled; `False` acts as a disabled marker.
if settings.ENABLE_REDIS:
    redis = get_redis_connection('smithers')
else:
    redis = False

TWITTER_URL = 'https://twitter.com/share'
FB_URL = 'https://www.facebook.com/sharer/sharer.php'
COUNT_FOOTNOTE = ('<a href="#number-modal" class="number-help" '
                  'data-toggle="modal" title="{}">'
                  '<span class="share_total"></span>'
                  '<i class="fa fa-question-circle"></i></a>')

# Shared module-level collator: building one per call would be expensive.
uca_collator = Collator()


def uca_sort_key(country):
    """Sort key function using pyuca on the 2nd element of the argument."""
    return uca_collator.sort_key(country[1])


def get_tw_share_url(**kwargs):
    """Build a Twitter share URL from the given query parameters."""
    # Default to Do-Not-Track for the share widget.
    kwargs.setdefault('dnt', 'true')
    text = kwargs.get('text')
    if text:
        # Encode to UTF-8 bytes — presumably for Python 2's urlencode with
        # non-ASCII text; confirm before porting.
        kwargs['text'] = text.encode('utf8')
    return '?'.join([TWITTER_URL, urlencode(kwargs)])
#!/usr/bin/env python import sys from pyuca import Collator collator = Collator() from morphgnt.utils import load_yaml from morphgnt.utils import nfkc_normalize as n danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml") greenlee = {} with open("../data-cleanup/greenlee-morphology/morphemes-utf8.txt") as f: for line in f: key, value = line.strip().split("\t") greenlee[n(key.decode("utf-8")).split(",")[0]] = { "full-entry": n(key.decode("utf-8")), "components": n(value.decode("utf-8")), } words = [n(word) for word in set(danker.keys()).union(set(greenlee.keys()))] count = 0 for word in sorted(words, key=collator.sort_key): count += 1 print "{}:".format(word.encode("utf-8")) if word in danker: print " danker-full-entry: \"{}\"".format(danker[word]["full-entry"].encode("utf-8")) print " danker-components: \"{}\"".format(danker[word]["components"].encode("utf-8")) if word in greenlee:
def setUp(self):
    # Build a fresh Collator (default allkeys table) for each test.
    from pyuca import Collator
    self.c = Collator()