def sort_set(set_to_sort):
    """Return the items of ``set_to_sort`` as a list in collation order.

    A new ``pyuca.Collator`` is created on every call and its ``sort_key``
    method is used as the ordering key. The input itself is not modified.
    """
    collator = pyuca.Collator()
    ordered = list(set_to_sort)
    ordered.sort(key=collator.sort_key)
    return ordered
import os
import pyuca

# Path of the ``allkeys.txt`` file that sits in the same directory as this
# module; it is handed to the collator as its key table.
_ALLKEYS_PATH = os.path.join(os.path.dirname(__file__), 'allkeys.txt')

# Shared collator instance built from that file.
unicode_collator = pyuca.Collator(_ALLKEYS_PATH)
from django_countries.conf import settings from django.utils import six from django.utils.encoding import force_text from django.utils.translation import override from .base import CountriesBase try: import pyuca except ImportError: pyuca = None # Use UCA sorting if it's available. if pyuca: collator = pyuca.Collator() def sort_key(item): return collator.sort_key(item[1]) else: import unicodedata # Cheap and dirty method to sort against ASCII characters only. def sort_key(item): return (unicodedata.normalize('NFKD', item[1]).encode( 'ascii', 'ignore').decode('ascii')) class Countries(CountriesBase):
(181, 956) >>> name(micro), name(micro_kc) ('MICRO SIGN', 'GREEK SMALL LETTER MU') 使用 '1/2' 替代 '½' 可以接受,微符号也确实是小写的希腊字母 'μ',但是把 '4²' 转换成 '42' 就改变原意了。某些应用程序可以把 '4²' 保存为 '4<sup>2</sup>',但是 normalize 函数对格式一无所知。因此,NFKC 或 NFKD 可能会损失或曲解信息,但是可以为搜索和索引提供便利的中间表述:用户搜索 '1 / 2 inch' 时,如果还能找到包含 '½ inch' 的文档,那么用户会感到满意。 使用 NFKC 和 NFKD 规范化形式时要小心,而且只能在特殊情况中使用,例如搜索和索引,而不能用于持久存储,因为这两种转换会导致数据损失。 通用的Unicode排序 >>> import pyuca #使用PyUCA库 >>> coll = pyuca.Collator() >>> fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola'] >>> sorted_fruits = sorted(fruits, key=coll.sort_key) >>> sorted_fruits ['açaí', 'acerola', 'atemoia', 'cajá', 'caju'] import re re_numbers_str = re.compile(r'\d+') ➊#前两个正则表达式是字符串类型。 re_words_str = re.compile(r'\w+') re_numbers_bytes = re.compile(rb'\d+') ➋#后两个正则表达式是字节序列类型。 re_words_bytes = re.compile(rb'\w+') text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef" ➌ " as 1729 = 1³ + 12³ = 9³ + 10³.") ➍ text_bytes = text_str.encode('utf_8') ➎ #字节序列只能用字节序列正则表达式搜索。 print('Text', repr(text_str), sep='\n ')
def main():
    """Regenerate the ``AUTHORS`` file in the current directory.

    Author lines are taken from ``gitAuthorsOutput()`` (one per line) and
    cleaned up: a ``" <unknown>"`` placeholder is removed, and
    ``users.noreply.github.com`` addresses are turned into GitHub profile
    URLs. Lines found in ``blacklist`` are skipped and every entry of
    ``patchAuthors`` is added. The result is de-duplicated, sorted with a
    ``pyuca`` collator and written, UTF-8 encoded, between a fixed header
    and a fixed "special thanks" footer.
    """
    authorsSet = set()
    authorsText = gitAuthorsOutput()
    for line in authorsText.split("\n"):
        if line == '':
            continue

        # Strip the e-mail placeholder completely when no address is set.
        # See for example "zapman <unknown>".
        line = line.replace(" <unknown>", "")

        # Use the GitHub profile URL instead of the noreply e-mail address.
        if '@users.noreply.github.com' in line:
            line = line.replace('@users.noreply.github.com', '')
            line = line.replace('<', '<https://github.com/')

        if line in blacklist:
            continue

        authorsSet.add(line)

    authorsSet.update(patchAuthors)

    # NOTE(review): both texts below reproduce the literals exactly as they
    # appear in this file, including the single line break in the header.
    # They look like "//" comment lines whose line breaks were lost; confirm
    # against the upstream script before relying on the layout.
    header = (
        "// This is the official list of people who have contributed "
        "// to, and/or hold the copyright to Mumble. "
        "// "
        "// The use of Mumble source code is governed by a BSD-style "
        "// license that can be found in the LICENSE file at the root "
        "// of the Mumble source tree or at <https://www.mumble.info/LICENSE>. "
        "// "
        "// Contributions made on behalf of another entity, such as a "
        "// company are indicated with the following suffix: "
        "// "
        "// John Doe <*****@*****.**> (on behalf of $COMPANY) "
        "// "
        "// It is possible to mix individual contributions with company "
        "// contributions. For example, if a contributor, over time, "
        "// has contributed code copyrighted by the contributor, as well "
        "// as various companies: "
        "// "
        "// John Doe <*****@*****.**> (individually, on behalf of "
        "// $COMPANY1, on behalf of "
        "// $COMPANY2, [...]). "
        "// "
        "// Mumble's code is developed in a Git repository. A log of "
        "// every contribution ever made to Mumble is available in the "
        "// Git repository. The Git repository can be queried to get "
        "// detailed authorship information for copyright and attribution "
        "// purposes for each file that makes up the software. \n"
        "A detailed "
        "// analysis of contributions made to Mumble is available via GitHub's "
        "// contribution statistics: "
        "// "
        "// <https://github.com/mumble-voip/mumble/graphs/contributors> "
    )
    footer = (
        " // Special thanks to: "
        "// "
        "// Thorvald Natvig, for founding the Mumble project "
        "// and maintaining it during its formative years. "
    )

    # The context manager closes the file even if sorting or a write fails.
    with codecs.open("AUTHORS", "w", "utf-8") as f:
        f.write(header)

        # Sort alphabetically
        collator = pyuca.Collator()
        for author in sorted(authorsSet, key=collator.sort_key):
            f.write(author)
            f.write("\n")

        f.write(footer)
import pyuca

# Demo: compare Python's default string ordering with pyuca collation on a
# list of Thai province names.
provinces = [
    "พิษณุโลก",
    "ชัยนาท",
    "เลย",
    "เพชรบุรี",
    "ลำปาง",
    "ขอนแก่น",
    "กรุงเทพ",
    "เชียงใหม่",
    "ตาก",
    "แพร่",
]

# One collator serves both sorts below; there is no need to build a second.
collator = pyuca.Collator()

# Default ordering (plain string comparison).
s = sorted(provinces)
print(s)

# Ordering by collation key.
s_correct = sorted(provinces, key=collator.sort_key)
print(s_correct)

# Same collation order, this time sorting the original list in place.
print(provinces)
provinces.sort(key=collator.sort_key)
print(provinces)
def unicode_cmp():
    """Print a fixed list of fruit names ordered by pyuca's collation key."""
    collator = pyuca.Collator()
    names = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
    names.sort(key=collator.sort_key)
    print(names)
data = csv.reader(file) # = list of strings data = csv.DictReader(file) # read file header print(data.fieldnames) fw=csv.writer fw.writerow(file) import datetime dt = datetime.datetime.now().strftime('%y %m %d %H:%M:%S') import urllib.request with urllib.request.urlopen(link) as f: pass import pyuca # Unicode Collation Algorithm sorted(file,key = pyuca.Collator().sort_key) from collections import Counter od = Orderdict() a = vars() create dict attributes attrs = s =[ %% getattr(self,a) for a in attrs] @property def blood(self): return self.__blood
import pyuca
import sys
import re

# Sort the lines of the file named by the first command-line argument using
# pyuca collation, then overwrite that same file with the sorted lines.

# Read everything and close the handle before the file is reopened for
# writing (opening with "w" truncates it).
with open(sys.argv[1], 'r', encoding='utf-8') as text_in:
    data_in = [line.strip() for line in text_in]

data_sort = sorted(data_in, key=pyuca.Collator().sort_key)

# The context manager flushes and closes the output even if a write fails.
with open(sys.argv[1], "w", encoding='utf-8') as text_out:
    text_out.writelines(data + "\n" for data in data_sort)
def parse(bulletin_url, first_page, last_page, check=None, date=None, coord=False):
    """Scrape per-municipality counts from a PDF bulletin and save them as CSV.

    The PDF at ``bulletin_url`` is downloaded to ``bulletin.pdf``, the tables
    on pages ``first_page``..``last_page`` (inclusive) are read, and the
    municipality / suspected / confirmed columns are cleaned up and written
    to ``{date}.csv`` in the current directory, ordered by pyuca collation of
    the municipality name.

    Args:
        bulletin_url: URL of the PDF bulletin.
        first_page: First PDF page to read.
        last_page: Last PDF page to read (inclusive).
        check: When truthy, compare the scraped data with the reference CSV
            for ``date`` and call ``exit()`` on the first discrepancy.
        date: Bulletin date as ``"%m-%d-%Y"``. Although it defaults to
            ``None`` it is parsed unconditionally, so it is effectively
            required.
        coord: Optional path of a CSV that is right-merged into the data
            (the output keeps ``lat`` and ``lon`` columns).

    Returns:
        None. The result is written to ``{date}.csv``.
    """
    # request to find the documents
    pdf = requests.get(bulletin_url)
    # Context manager so the handle is closed before the PDF is read back.
    with open('bulletin.pdf', 'wb') as pdf_file:
        pdf_file.write(pdf.content)

    # parse DataFrame from pdf
    dfs = read_pdf("bulletin.pdf", stream=True, pages=list(range(first_page, last_page + 1)))
    columns = ['municipio', 'suspeito', 'confirmado']
    # Only five-column tables are used; rows from the third onward and
    # columns 0, 1 and 4 are kept, with "-" cells replaced by 0.
    data = pd.concat(pd.DataFrame(df.iloc[2:,[0,1,4]].replace("-",0).values, columns=columns)
                     for df in dfs if df.shape[1] == 5)
    data = data.reset_index(drop=True)

    # checksum the data: the last row holds the totals reported by the bulletin
    data_checksum = data.fillna(0)
    data_checksum["suspeito"] = data_checksum["suspeito"].astype("int")
    data_checksum["confirmado"] = data_checksum["confirmado"].astype("int")
    total = data_checksum.iloc[-1,]
    data_checksum = data_checksum.iloc[:-1,]
    data = data.iloc[:-1,]
    if not all(sum(data_checksum[feature]) == total[feature] for feature in ["confirmado", "suspeito"]):
        print("Atenção! O total raspado não condiz com o total informado no boletim!")
        for feature in ["confirmado", "suspeito"]:
            print(f"{feature}: {sum(data_checksum[feature])} (raspado), {total[feature]} (boletim)")

    # fix multirow lines: a row without a name carries the numbers of a name
    # that is split across the rows just above and below it
    drop_lines = []
    for index, _row in data[data['municipio'].isnull()].iterrows():
        data.at[index-1, 'municipio'] = ' '.join(data.iloc[[index-1, index+1], 0])
        data.at[index-1, 'suspeito'] = data.iloc[index, 1]
        data.at[index-1, 'confirmado'] = data.iloc[index, 2]
        drop_lines.extend([index, index+1])
    data = data.drop(drop_lines).fillna(0)

    # fixing city names
    data.loc[data["municipio"] == "Governado Dix-Sep Rosado", "municipio"] = "Governador Dix-Sept Rosado"
    data.loc[data["municipio"] == "Lagoa d’Anta", "municipio"] = "Lagoa d'Anta"
    data.loc[data["municipio"] == "Santana dos Matos", "municipio"] = "Santana do Matos"
    if "Assú" in data["municipio"].unique():
        data.loc[data["municipio"] == "Assú", "municipio"] = "Açu"

    # verifying against manually collected data
    data["confirmado"] = data["confirmado"].astype(int)
    data["suspeito"] = data["suspeito"].astype(int)
    data = data.query("suspeito > 0 or confirmado > 0")
    data = data.reset_index(drop=True)

    # checking against existing CSVs
    base_url = "https://raw.githubusercontent.com/leobezerra/covid19/master/data/rn_covid_19_boletins"
    if check:
        df_old = pd.read_csv(f"{base_url}/{date}.csv").query("suspeito > 0 or confirmado > 0")
        data_mun = set(data["municipio"].unique())
        old_mun = set(df_old["municipio"].unique())
        if data_mun != old_mun:
            # Warn only when the two sets really differ; previously this
            # message was printed before the comparison was made.
            print("Atenção! Os municípios raspados e de referência não batem")
            print("Não estão no CSV de referência: ", data_mun - old_mun)
            print("Não estão nos dados raspados: ", old_mun - data_mun)
            exit()
        for m in df_old.municipio:
            old_confirmed = df_old[df_old['municipio'] == m]['confirmado'].iloc[0]
            new_confirmed = data[data['municipio'] == m]['confirmado'].iloc[0]
            if old_confirmed != new_confirmed:
                print(m)
                print(old_confirmed, new_confirmed)
                # NOTE(review): assumed to stop at the first mismatching
                # municipality; confirm the intended nesting of this exit().
                exit()

    # adding latitude and longitude data
    if coord:
        coord_rn = pd.read_csv(coord)
        data = pd.merge(data, coord_rn, how="right").fillna(0)
        data["confirmado"] = data["confirmado"].astype(int)
        data["suspeito"] = data["suspeito"].astype(int)

    # adding date
    data = data.assign(data=datetime.strptime(date, "%m-%d-%Y").strftime("%Y-%m-%d"))
    data = pd.DataFrame(data, columns="municipio,data,confirmado,suspeito,lat,lon".split(","))

    # Pandas is not pythonic: order the rows by merging against a frame of
    # municipality names that is already sorted by collation key
    coll = pyuca.Collator()
    df_municipios = pd.DataFrame(sorted(data["municipio"], key=coll.sort_key), columns=["mun"])
    data = pd.merge(df_municipios, data, left_on="mun", right_on="municipio").drop("mun", axis=1)

    # persisting
    data.to_csv(f"{date}.csv", index=False)
def __iter__(self):
    """
    Yield country records ordered by name.

    Each record pairs the two letter ISO3166-1 country ``code`` with its
    short ``name``. Names are translated before sorting, so the order
    follows the thread's current translation.

    Countries listed in ``settings.COUNTRIES_FIRST`` come before the
    sorted countries, in the order provided, and only appear again in the
    sorted part when ``settings.COUNTRIES_FIRST_REPEAT`` is ``True``. When
    ``settings.COUNTRIES_FIRST_BREAK`` is set, its value is yielded as a
    separator record between the two parts.
    """
    # Reading ``self.countries`` initializes countries_first, so it has to
    # happen before anything else.
    countries = self.countries

    # Translated records for the countries that are shown first.
    countries_first = (self.translate_pair(code) for code in self.countries_first)

    # Choose how a record's name is turned into a sort key.
    if pyuca:
        # UCA sorting, since pyuca is available.
        uca_key = pyuca.Collator().sort_key

        def sort_key(item):
            return uca_key(item[1])

    else:
        # Cheap and dirty fallback that compares ASCII characters only.
        def sort_key(item):
            decomposed = unicodedata.normalize("NFKD", item[1])
            return decomposed.encode("ascii", "ignore").decode("ascii")

    if self.get_option("first_sort"):
        countries_first = sorted(countries_first, key=sort_key)

    yield from countries_first

    if self.countries_first:
        first_break = self.get_option("first_break")
        if first_break:
            yield ("", force_str(first_break))

    # Translate everything up front so sorting sees the translated names.
    if self.get_option("first_repeat"):
        ignore_first = None
    else:
        ignore_first = self.countries_first
    translated = tuple(
        record
        for code in countries
        for record in self.translate_code(code, ignore_first)
    )

    # Finally, the sorted country list.
    yield from sorted(translated, key=sort_key)
from collections import Iterable from time import time from itertools import tee, islice, chain, izip from os import path from Acquisition import aq_inner from zope.component import getMultiAdapter from zope.schema import getFieldsInOrder from zope import i18n import pyuca allkeys = path.join('/'.join(path.split(pyuca.__file__)[:-1]), 'allkeys.txt') collator = pyuca.Collator(allkeys) def flatten(l): """Generator for flattening irregularly nested lists. 'Borrowed' from here: http://stackoverflow.com /questions/2158395/flatten-an-irregular-list-of-lists-in-python """ for el in l: if isinstance(el, Iterable) and not isinstance(el, basestring): for sub in flatten(el): yield sub else: yield el def get_interface_fields(interface):