def __init__(self):
    self.template = os.path.join(os.path.dirname(__file__), "sort.html")
    self.silpacollator = Collator(
        os.path.join(os.path.dirname(__file__), "allkeys-silpa-6.0.0.txt"))
    self.ucacollator = Collator(
        os.path.join(os.path.dirname(__file__), "allkeys-6.0.0.txt"))
    self.response = SilpaResponse(self.template)
def get_ranked_documents(self, query_string):
    ranked_documents = list()
    # Once the system has the processed collection and a query arrives,
    # the query is processed using the analyzer function
    query_terms = Analyzer.retrieve_html_str_terms(query_string)
    # Look up the query terms in the vocabulary.
    # The filtered query terms must be sorted alphabetically so that the same
    # ordering can be used for the entries of the vectors.
    filtered_query_terms = [
        term for term in query_terms
        if term in self.__collection_vocabulary.keys()
    ]
    # Case where none of the query terms exists in the collection vocabulary
    if len(filtered_query_terms) == 0:
        return ranked_documents
    query_terms_np_array = np.array(filtered_query_terms)
    terms, counts = np.unique(query_terms_np_array, return_counts=True)
    query_vocabulary = dict(zip(terms, counts))
    max_l_freq_lq = max(counts)
    collator = Collator()
    final_query_terms = sorted(query_vocabulary.keys(), key=collator.sort_key)
    # Build the query weights vector
    query_weights_vector = self.get_query_weights_vector(
        final_query_terms, max_l_freq_lq, query_vocabulary)
    # Retrieve the postings list of every term involved
    postings_lists = self.__collection_handler.get_postings_lists(
        final_query_terms)
    # Build the weight vectors for the documents in the postings lists
    documents_weights_vectors = SearchEngine.get_documents_weights_short_vectors(
        postings_lists, final_query_terms)
    # Compute the similarity between each posted document's weights and the query weights
    sorted_documents_aliases = sorted(documents_weights_vectors.keys(),
                                      key=collator.sort_key)
    query_documents_dot_products = SearchEngine.get_query_documents_dot_products(
        query_weights_vector, documents_weights_vectors,
        sorted_documents_aliases)
    query_documents_norms_products = self.get_query_documents_norms_products(
        query_weights_vector, sorted_documents_aliases)
    similarities = query_documents_dot_products / query_documents_norms_products
    # Rank the documents.
    # Documents are identified by ID in the document_entries list,
    # and the document entry of each one is returned.
    ascending_ranked_similarities = np.argsort(similarities)
    descending_ranked_similarities = ascending_ranked_similarities[::-1]
    for index in descending_ranked_similarities:
        document_complete_alias = sorted_documents_aliases[index]
        ranked_documents.append(document_complete_alias)
    return ranked_documents
def test_cafe(self):
    from pyuca import Collator
    c = Collator()
    # Plain sorted() compares code points, so "café" sorts after "caff"
    self.assertEqual(sorted(["cafe", "caff", "café"]),
                     ["cafe", "caff", "café"])
    # With the UCA sort key, the accented form sorts next to its base form
    self.assertEqual(sorted(["cafe", "caff", "café"], key=c.sort_key),
                     ["cafe", "café", "caff"])
def sort_language_constants():
    """Generate the correct ordering of the constants.LANGUAGES list,
    sorted by Unicode collation of the language names.
    """
    c = Collator()
    lang_names = [
        Locale(lang).get_language_name(lang).capitalize()
        for lang in constants.LANGUAGES
    ]
    available_languages = dict(zip(lang_names, constants.LANGUAGES))
    sorted_lang_names = sorted(lang_names, key=c.sort_key)
    return [available_languages[lang_name] for lang_name in sorted_lang_names]
def tcompare(text1, text2):  # short for "text comparer"
    c = Collator("allkeys.txt")  # from James Tauber's "pyuca"
    text1split = text1.splitlines()
    text2split = text2.splitlines()
    sortedwords1 = sorted(text1split, key=c.sort_key)
    sortedwords2 = sorted(text2split, key=c.sort_key)
    for line1 in sortedwords1:
        for line2 in sortedwords2:
            if line1 == line2:
                if line1 != '':
                    print(line1)
def aTergo(date):
    site = pywikibot.getSite()
    c = Collator('allkeys.txt')
    # if not called from afterDump.py, it can be changed to
    # getListFromXML('foo', True) - will fetch the latest dump
    pageList = getListFromXML(date)
    sortDict = collections.defaultdict()
    sortedDict = collections.defaultdict()
    # sweep through the dump and add reversed words to the appropriate dictionaries
    for elem in pageList:
        try:
            # use the parser in klasa.py to parse the xml entry
            page = Haslo(elem.title, elem.text)
        except sectionsNotFound:
            continue
        except WrongHeader:
            continue
        else:
            if page.type == 3:
                for section in page.listLangs:
                    if section.lang in sortDict:
                        sortDict[section.lang].append(page.title[::-1])
                    else:
                        sortDict[section.lang] = [page.title[::-1]]
    sortedDict['afrykanerski'] = sorted(sortDict['afrykanerski'],
                                        key=c.sort_key)
    letter = sortedDict['afrykanerski'][0][0]
    text = '{| class=hiddentable style="text-align:right"\n|-'
    counter = 0
    # walk the collation-sorted reversed words, starting a new page per initial letter
    for elem in sortedDict['afrykanerski']:
        if elem[0] == letter:
            text = text + '|[[%s|%s]]|\n' % (elem[::-1], elem)
        else:
            pywikibot.Page(
                site,
                'Wikipedysta:Alkamid/atergo/afrykanerski/%s' % letter).put(text)
            text = ''
            letter = elem[0]
            text = text + '* %s\n' % elem
def person(initial="A"): if initial not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": return redirect(url_for('.person', initial='A')) try: # Primer intento fallido, hay gente sin apellido # personas = Person.query.filter(Person.last_name.ilike(initial + "%")).all() personas = Person.query.filter( or_( and_(Person.last_name == '', Person.first_name.ilike(initial + "%")), Person.last_name.ilike(initial + "%"))).all() collator = Collator() personas = sorted( personas, key=lambda e: collator.sort_key(e.get_name().upper())) return render_template('public/person_initial.html', initial=initial, personas=personas) except TemplateNotFound: abort(404)
def show_event(id):
    try:
        event = Event.query.filter_by(id=id).first_or_404()
    except:
        abort(404)
    # I need to prefill these variables here to simplify the template
    participantes, compositores, personas = set(), set(), set()
    for i in event.participants:
        if i.person and i.activity.name == "Compositor/a":
            i.person.is_composer = True
            compositores.add(i.person)
            personas.add(i.person)
        else:
            participantes.add(i)
            if i.person:
                personas.add(i.person)
    # Now, iterate over performances to extract other composers
    for p in event.performances:
        for c in p.musical_piece.composers:
            c.is_composer = True
            compositores.add(c)
            personas.add(c)
    collator = Collator()
    compositores = sorted(compositores,
                          key=lambda e: collator.sort_key(e.get_name()))
    participantes = sorted(participantes,
                           key=lambda e: collator.sort_key(e.get_name()))
    return render_template('public/detalle.html',
                           e=event,
                           participantes=participantes,
                           compositores=compositores,
                           personas=personas,
                           request=request)
def create_language_name_map(self) -> None:
    join = os.path.join
    deploy_root = settings.DEPLOY_ROOT
    path = join(deploy_root, "locale", "language_options.json")
    output_path = join(deploy_root, "locale", "language_name_map.json")
    with open(path, "rb") as reader:
        languages = orjson.loads(reader.read())
        lang_list = []
        for lang_info in languages["languages"]:
            lang_info["name"] = lang_info["name_local"]
            del lang_info["name_local"]
            lang_list.append(lang_info)
        collator = Collator()
        lang_list.sort(key=lambda lang: collator.sort_key(lang["name"]))
        with open(output_path, "wb") as output_file:
            output_file.write(
                orjson.dumps(
                    {"name_map": lang_list},
                    option=orjson.OPT_APPEND_NEWLINE
                    | orjson.OPT_INDENT_2
                    | orjson.OPT_SORT_KEYS,
                ))
def test_authors_ordering():
    with open('AUTHORS', 'rt', encoding='utf8') as fh:
        authors = fh.readlines()
    ordered_authors = sorted(authors, key=Collator().sort_key)
    assert authors == ordered_authors
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator

c = Collator("allkeys.txt")

arq = "catatau.txt"
fileObj = codecs.open(arq, "r", "utf-8")
catatau = fileObj.read()  # Returns a Unicode string from the UTF-8 bytes in the file

# split into sentences
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(catatau)

# filter out repetitions
a = set(catalinhas)
frases = list(a)

# use the pyuca collation ordering so that accents are taken into account
frases = sorted(frases, key=c.sort_key)
#frases.reverse()

# ends with a question mark.
txt = ""
conta = 0
def __init__(self, session, config, parent):
    SimpleNormalizer.__init__(self, session, config, parent)
    keyPath = self.get_path(session, 'keyFile', 'allkeys.txt')
    # This is handy -- means if no pyuca, no problem
    from pyuca import Collator
    self.collator = Collator(keyPath)
def sort_by_name(iterable):
    """Sort by a translatable name, using pyuca for a better result."""
    c = Collator()
    key = lambda obj: c.sort_key(str(obj.name))
    return sorted(iterable, key=key)
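# Illustrative usage of sort_by_name (the Tag namedtuple below is hypothetical,
# not part of the original module; any object with a .name attribute works):
from collections import namedtuple

Tag = namedtuple('Tag', 'name')
tags = [Tag('Zebra'), Tag('Ärger'), Tag('Apfel')]
print([t.name for t in sort_by_name(tags)])
# ['Apfel', 'Ärger', 'Zebra'] -- 'Ärger' collates with the A's instead of after 'Zebra'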
def _get_sorted_copy(self):
    '''
    Returns the SegmentPool as a list of tuples sorted appropriately for
    human consumption in *the current language*. This means that the
    _(NAME) value should determine the sort order of the outer dict and
    the _('segment_config') key should determine the order of the inner
    dicts. In both cases, the keys need to be compared in the provided
    language.

    Further note that the current language is given by get_language() and
    that this will reflect the CMS operator's user settings, NOT the
    current PAGE language.

    NOTE: the structure of the sorted pool is different. Two of the nested
    dicts are now lists of tuples so that the sort can be retained.

        _sorted_segments = [
            (/class/, {
                NAME: _(/name/),
                CFGS: [
                    (/configuration_string/, {
                        LABEL: _(/configuration_string/),
                        OVERRIDES: {
                            /user.id/: /SegmentOverride enum value/,
                            ...
                        },
                        INSTANCES: [ ... ]
                    })
                ]
            })
        ]

    NOTE: On Python 3.0+ systems, we depend on pyuca for collation, which
    produces excellent results. On earlier systems, this is not available,
    so we use a cruder mapping of accented characters into their unaccented
    ASCII equivalents.
    '''
    sort_key = None

    if sys.version_info >= (3, 0):
        uca = None
        #
        # Unfortunately, the pyuca class (which can provide collation of
        # strings in a thread-safe manner) is for Python 3.0+ only.
        #
        try:
            from pyuca import Collator
            uca = Collator()
            sort_key = uca.sort_key
        except:
            pass

    if not sort_key:
        #
        # Our fallback position is to use a more simple approach of
        # mapping 'accented' chars to latin equivalents before sorting.
        # This is crude, but better than nothing.
        #
        from .unaccent import unaccented_map

        def sort_key(s):
            return s.translate(unaccented_map())

    pool = self.segments
    clone = []

    for cls_key in sorted(pool.keys()):
        cls_dict = {
            self.NAME: pool[cls_key][self.NAME],
            self.CFGS: list(),
        }
        clone.append((cls_key, cls_dict))

        # We'll build the CFGS as a list in arbitrary order for now...
        for cfg_key in pool[cls_key][self.CFGS]:
            cfg_dict = {
                self.LABEL: pool[cls_key][self.CFGS][cfg_key][self.LABEL],
                self.OVERRIDES: dict(),
                self.INSTANCES: list(),
            }
            for username, override in pool[cls_key][self.CFGS][cfg_key][
                    self.OVERRIDES].items():
                cfg_dict[self.OVERRIDES][username] = override
            for instance in pool[cls_key][self.CFGS][cfg_key][
                    self.INSTANCES]:
                cfg_dict[self.INSTANCES].append(instance)
            cls_dict[self.CFGS].append((cfg_key, cfg_dict))

        #
        # Now, sort the CFGS by their LABEL, using whichever means we
        # have available to us at this moment.
        #
        cls_dict[self.CFGS] = sorted(
            cls_dict[self.CFGS],
            key=lambda x: sort_key(force_text(x[1][self.LABEL])))

    return clone
def unicode_sorted_key(key):
    return Collator().sort_key(key)
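# Note: Collator() parses the whole allkeys table when it is constructed, so
# building a new instance on every call is relatively expensive. A possible
# refactor (a sketch, not taken from the original code) caches one
# module-level collator and reuses it:
from pyuca import Collator

_collator = Collator()

def unicode_sorted_key_cached(key):
    return _collator.sort_key(key)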
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import codecs
import nltk
from nltk.corpus import stopwords
from pyuca import Collator
from string import ascii_lowercase

c = Collator("corpustxt/allkeys.txt")

# file to be analyzed (in the same directory)
arq = "corpustxt/catatau_semlinebreaks.txt"
fileObj = codecs.open(arq, "r", "utf-8")
mikrofesto = fileObj.read()

# split into sentences
stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
catalinhas = stok.tokenize(mikrofesto)

# separate punctuation from the ends of words and other tokens
tokens = nltk.word_tokenize(mikrofesto)

# strip connectives (stop words)
#cleanupDoc(tokens)

# format into an nltk data structure to standardize later
def __init__(self, word):
    self._word = word
    self.searcher = None
    self.query = None
    self.collator = Collator()
    self.num_results = 0
def __init__(self, *args, **kwargs):
    from pyuca import Collator
    super(FromFullTest, self).__init__(*args, **kwargs)
    self.c = Collator()
    (0, 74, 33, 0, 2, 2, 0)
#!/usr/bin/env python3
import csv
from pyuca import Collator

c = Collator("bin/allkeys.txt")

codes = {}

lists = {
    'country': {
        'title': 'Country register',
        'url': 'http://country.openregister.org',
        'publisher': 'foreign-commonwealth-office',
        'format': 'register',
        'path': '../data/country/countries.tsv'
    },
    'territory': {
        'title': 'Territory register',
        'url': 'http://territory.openregister.org',
        'publisher': 'foreign-commonwealth-office',
        'format': 'register',
        'path': '../data/territory/territories.tsv'
    },
    'uk': {
        'title': 'UK register',
        'url': 'http://uk.openregister.org',
        'publisher': 'cabinet-office',
        'format': 'register',
        'path': '../data/uk/uk.tsv'
    },
from pyuca import Collator

from smithers import data_types
from smithers import redis_keys as rkeys
from smithers.utils import get_epoch_minute

if settings.ENABLE_REDIS:
    redis = get_redis_connection('smithers')
else:
    redis = False

TWITTER_URL = 'https://twitter.com/share'
FB_URL = 'https://www.facebook.com/sharer/sharer.php'
COUNT_FOOTNOTE = ('<a href="#number-modal" class="number-help" '
                  'data-toggle="modal" title="{}">'
                  '<span class="share_total"></span>'
                  '<i class="fa fa-question-circle"></i></a>')

uca_collator = Collator()


def uca_sort_key(country):
    """Sort key function using pyuca on the 2nd element of the argument."""
    return uca_collator.sort_key(country[1])


def get_tw_share_url(**kwargs):
    kwargs.setdefault('dnt', 'true')
    text = kwargs.get('text')
    if text:
        kwargs['text'] = text.encode('utf8')
    return '?'.join([TWITTER_URL, urlencode(kwargs)])
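# Quick illustration of uca_sort_key on hypothetical (code, name) pairs
# (example data only, not from the original module):
countries_sample = [('at', 'Österreich'), ('us', 'United States'), ('de', 'Deutschland')]
print(sorted(countries_sample, key=uca_sort_key))
# expected: [('de', 'Deutschland'), ('at', 'Österreich'), ('us', 'United States')]
# i.e. 'Österreich' collates with the O's rather than after 'United States'.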
from pyuca import Collator

col = Collator()

file_obj = open("dictionary.txt", "r", encoding="utf8")

count = 0
dict = {}
english_word = ""
for line in file_obj:
    if count % 2 == 0:
        ## do something with english text
        ##print("English: " + line)
        english_word = line.rstrip("\n")
    else:
        ## do something with greek text
        ##print("Greek: " + line)
        ## check if word in the dict and if so make multiple entries
        if english_word in dict:
            dict[english_word + " (1)"] = dict.pop(english_word)
            dict[english_word + " (2)"] = line.rstrip("\n")
        else:
            dict[english_word] = line.rstrip("\n")
    count += 1
file_obj.close()
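# The excerpt ends before `col` is used; presumably the loaded entries are
# then ordered with the collator. A minimal sketch (an assumption, not shown
# in the original) sorting the English-Greek pairs by their Greek translation:
for english, greek in sorted(dict.items(), key=lambda kv: col.sort_key(kv[1])):
    print(english, "->", greek)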
"""PytSite Object Document Mapper UI Plugin Widgets """ __author__ = 'Oleksandr Shepetko' __email__ = '*****@*****.**' __license__ = 'MIT' import htmler from typing import List, Callable, Union, Iterable, Tuple from pyuca import Collator from json import dumps as json_dumps from pytsite import lang from plugins import widget, odm, http_api, odm_http_api _pyuca_col = Collator() def _sanitize_kwargs_exclude(kwargs: dict): if not ('exclude' in kwargs and kwargs['exclude']): return if isinstance(kwargs['exclude'], odm.Entity): kwargs['exclude'] = [kwargs['exclude'].ref ] if not kwargs['exclude'].is_new else [] elif isinstance(kwargs['exclude'], str): kwargs['exclude'] = [kwargs['exclude']] elif isinstance(kwargs['exclude'], (list, tuple)): ex = [] for item in kwargs['exclude']: if isinstance(item, odm.Entity): if not item.is_new: ex.append(item.ref)
#!/usr/bin/env python """ C.11.5 Index and Glossary (p211) """ import string, os from plasTeX.Tokenizer import Token, EscapeSequence from plasTeX import Command, Environment from plasTeX.Logging import getLogger from Sectioning import SectionUtils try: from pyuca import Collator collator = Collator(os.path.join(os.path.dirname(__file__), 'allkeys.txt')).sort_key except ImportError: collator = lambda x: x.lower() class IndexUtils(object): """ Helper functions for generating indexes """ linkType = 'index' level = Command.CHAPTER_LEVEL class Index(Command): """ Utility class used to surface the index entries to the renderer """
def __init__(self):
    self.silpacollator = Collator(
        os.path.join(os.path.dirname(__file__), "allkeys-silpa-6.0.0.txt"))
    self.ucacollator = Collator(
        os.path.join(os.path.dirname(__file__), "allkeys-6.0.0.txt"))
def process_collection(self, document_entries, debug):
    self.__collection_handler.create_tok_dir()
    self.__collection_handler.create_wtd_dir()
    vocabulary = dict()
    documents = dict()
    collator = Collator()
    if debug:
        long_file_lines = list()
        special_file_lines = list()
        dash_file_lines = list()
        terms_per_document_sum = 0
    for document_entry in document_entries:
        document_alias = document_entry.get_alias()
        documents[document_alias] = dict()
        document_html_str = document_entry.get_html_str()
        tok_file_lines = list()
        if debug:
            print('Processing {}...'.format(document_entry.get_alias()))
            long = list()
            special = list()
            dash = list()
            document_terms = Analyzer.retrieve_html_str_terms(
                document_html_str, long, special, dash)
        else:
            document_terms = Analyzer.retrieve_html_str_terms(
                document_html_str)
        if len(document_terms) > 0:
            if debug:
                # running sum used to report the average terms per document
                terms_per_document_sum += len(document_terms)
            document_terms_np_array = np.array(document_terms)
            terms, counts = np.unique(document_terms_np_array,
                                      return_counts=True)
            doc_vocabulary = dict(zip(terms, counts))
            max_l_freq_lj = max(counts)
            # The tok file must be sorted alphabetically
            for term in sorted(doc_vocabulary.keys(), key=collator.sort_key):
                # freq_ij = frequency of term k_i in document d_j
                freq_ij = doc_vocabulary[term]
                # f_ij = normalized frequency of term k_i in document d_j,
                # computed as freq_ij divided by the frequency of the most
                # frequent term in document d_j
                f_ij = freq_ij / max_l_freq_lj
                tok_line = '{:30} {:12} {:20}'.format(
                    term, str(freq_ij), str(round(f_ij, 3)))
                tok_file_lines.append(tok_line)
                self.update_vocabulary_dict(vocabulary, term, freq_ij)
                documents[document_alias][term] = round(f_ij, 3)
        else:
            tok_file_lines.append('\n')
        if debug:
            for long_elem in long:
                line = '{:35} {}'.format(document_entry.get_alias(), long_elem)
                long_file_lines.append(line)
            for special_elem in special:
                line = '{:35} {}'.format(document_entry.get_alias(),
                                         special_elem)
                special_file_lines.append(line)
            for dash_elem in dash:
                line = '{:35} {}'.format(document_entry.get_alias(), dash_elem)
                dash_file_lines.append(line)
        self.__collection_handler.create_file_for_document(
            document_entry.get_alias(), tok_file_lines, DocumentOutputFiles.TOK)
    vocabulary_file_lines = list()
    # The vocabulary file must be sorted alphabetically
    for term in sorted(vocabulary.keys(), key=collator.sort_key):
        values_tuple = vocabulary[term]
        n_i = values_tuple[0]
        idf = Utilities.get_inverse_term_frequency(len(document_entries), n_i)
        vocabulary[term] = (vocabulary[term][0], vocabulary[term][1], idf,
                            vocabulary[term][3])
        line = '{:30} {:12} {:20}'.format(term, str(n_i), str(idf))
        vocabulary_file_lines.append(line)
    self.__collection_handler.create_file_for_collection(
        vocabulary_file_lines, CollectionOutputFiles.VOCABULARY)
    # Index and postings files
    postings_file_lines = list()
    index_file_lines = list()
    postings_file_vocabulary = dict()
    for document_alias, document_terms in documents.items():
        wtd_file_lines = list()
        for term, f_ij in document_terms.items():
            weight = Utilities.get_term_weight(f_ij, vocabulary[term][2])
            line = '{:30} {:20}'.format(term, str(weight))
            wtd_file_lines.append(line)
            Indexer.update_postings_dict(postings_file_vocabulary, term,
                                         document_alias, weight)
        self.__collection_handler.create_file_for_document(
            document_alias, wtd_file_lines, DocumentOutputFiles.WTD)
    current_postings_line = 0
    for term in sorted(postings_file_vocabulary.keys(), key=collator.sort_key):
        documents_list = postings_file_vocabulary[term]
        for values_tuple in documents_list:
            document_alias = values_tuple[0]
            term_weight = values_tuple[1]
            if vocabulary[term][3] is None:
                vocabulary[term] = (vocabulary[term][0], vocabulary[term][1],
                                    vocabulary[term][2], current_postings_line)
            line = '{:30} {:40} {:20}'.format(term, document_alias + '.html',
                                              str(term_weight))
            postings_file_lines.append(line)
            current_postings_line += 1
    for term in sorted(vocabulary.keys(), key=collator.sort_key):
        values_tuple = vocabulary[term]
        postings_entries_count = values_tuple[0]
        postings_initial_position = values_tuple[3]
        line = '{:30} {:12} {:12}'.format(term, str(postings_initial_position),
                                          str(postings_entries_count))
        index_file_lines.append(line)
    self.__collection_handler.create_file_for_collection(
        index_file_lines, CollectionOutputFiles.INDEX)
    self.__collection_handler.create_file_for_collection(
        postings_file_lines, CollectionOutputFiles.POSTINGS)
    if debug:
        Utilities.print_debug_header('Indexing results', True)
        print("Number of words in the vocabulary: ",
              len(vocabulary_file_lines))
        print("Average number of words per document: ",
              terms_per_document_sum / len(document_entries), " words.")
        print("Number of words in long: ", len(long_file_lines))
        print("Number of words in special: ", len(special_file_lines))
        print("Number of words in dash: ", len(dash_file_lines))
        long_file_str = '\n'.join(line for line in long_file_lines)
        Utilities.create_and_save_file('long.txt', long_file_str)
        special_file_str = '\n'.join(line for line in special_file_lines)
        Utilities.create_and_save_file('special.txt', special_file_str)
        dash_file_str = '\n'.join(line for line in dash_file_lines)
        Utilities.create_and_save_file('dash.txt', dash_file_str)
#!/usr/bin/env python

import sys

from pyuca import Collator
collator = Collator()

from morphgnt.utils import load_yaml
from morphgnt.utils import nfkc_normalize as n

danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml")

greenlee = {}
with open("../data-cleanup/greenlee-morphology/morphemes-utf8.txt") as f:
    for line in f:
        key, value = line.strip().split("\t")
        greenlee[n(key.decode("utf-8")).split(",")[0]] = {
            "full-entry": n(key.decode("utf-8")),
            "components": n(value.decode("utf-8")),
        }

words = [n(word) for word in set(danker.keys()).union(set(greenlee.keys()))]

count = 0
for word in sorted(words, key=collator.sort_key):
    count += 1
    print "{}:".format(word.encode("utf-8"))
    if word in danker:
        print " danker-full-entry: \"{}\"".format(danker[word]["full-entry"].encode("utf-8"))
        print " danker-components: \"{}\"".format(danker[word]["components"].encode("utf-8"))
    if word in greenlee:
from distutils.dir_util import copy_tree
from os.path import isfile, join
from subprocess import call
from tempfile import mkdtemp

from django.conf import settings
from django.core.management.base import CommandError
from django.template import Template
from django.utils import translation
from django_countries import countries
from pyuca import Collator

from hosting.models import Place

c = Collator()


class LatexCommand(object):
    template_name = 'PasportaServo.tex'
    address_only = False
    make_pdf = False
    tex_files = [
        template_name,
        'pages/title.tex',
        'pages/address.tex',
    ]

    def activate_translation(self):
        translation.activate(settings.LANGUAGE_CODE)