Python Collatorの例、pyuca.Collator Pythonの例

コード例 #1

0

ファイルを表示

def sort_set(set_to_sort):
    """Return the items of ``set_to_sort`` as a list ordered by pyuca collation keys.

    A fresh ``pyuca.Collator`` is created on every call and its ``sort_key``
    method is used as the sort key.
    """
    return sorted(set_to_sort, key=pyuca.Collator().sort_key)

コード例 #2

0

ファイルを表示

import os
import pyuca

# Module-level collator built from the 'allkeys.txt' file located in the same
# directory as this module.
unicode_collator = pyuca.Collator(
    os.path.join(os.path.dirname(__file__), 'allkeys.txt'))

コード例 #3

0

ファイルを表示

ファイル: __init__.py プロジェクト: DrMeers/django-countries

from django_countries.conf import settings
from django.utils import six
from django.utils.encoding import force_text
from django.utils.translation import override

from .base import CountriesBase

try:
    import pyuca
except ImportError:
    pyuca = None

# Use UCA sorting if it's available.
if pyuca:
    collator = pyuca.Collator()

    def sort_key(item):
        """Return the pyuca collation key for the second element of ``item``."""
        label = item[1]
        return collator.sort_key(label)
else:
    import unicodedata

    # Cheap and dirty fallback: compare on an ASCII-only form of the text.
    def sort_key(item):
        """Return ``item[1]`` NFKD-normalized with all non-ASCII characters dropped."""
        decomposed = unicodedata.normalize('NFKD', item[1])
        ascii_only = decomposed.encode('ascii', 'ignore')
        return ascii_only.decode('ascii')


class Countries(CountriesBase):

コード例 #4

0

ファイルを表示

(181, 956)
>>> name(micro), name(micro_kc)
('MICRO SIGN', 'GREEK SMALL LETTER MU')
使用 '1/2' 替代 '½' 可以接受，微符号也确实是小写的希腊字母 'μ'，但是把 '4²' 转换成 '42' 就改变原意了。某些应用程序可以把 '4²' 保存为 '4<sup>2</sup>'，但是 normalize 函数对格式一无所知。因此，NFKC 或 NFKD 可能会损失或曲解信息，但是可以为搜索和索引提供便利的中间表述：用户搜索 '1 / 2 inch' 时，如果还能找到包含 '½ inch' 的文档，那么用户会感到满意。
使用 NFKC 和 NFKD 规范化形式时要小心，而且只能在特殊情况中使用，例如搜索和索引，而不能用于持久存储，因为这两种转换会导致数据损失。

通用的Unicode排序
>>> import pyuca #使用PyUCA库
>>> coll = pyuca.Collator()
>>> fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
>>> sorted_fruits = sorted(fruits, key=coll.sort_key)
>>> sorted_fruits
['açaí', 'acerola', 'atemoia', 'cajá', 'caju']


import re

# The first two regular expressions are of the str type.
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
# The last two regular expressions are of the bytes type.
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

# Sample text: the escapes are four Tamil digits, followed by ASCII digits
# and superscript-three characters.
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            " as 1729 = 1³ + 12³ = 9³ + 10³.")
# Bytes can only be searched with bytes regular expressions.
text_bytes = text_str.encode('utf_8')
print('Text', repr(text_str), sep='\n ')

コード例 #5

0

ファイルを表示

def main():
    """Regenerate the ``AUTHORS`` file in the current working directory.

    Author lines come from ``gitAuthorsOutput()`` (one author per line).
    Each line is normalized, entries listed in ``blacklist`` are dropped,
    and everyone in ``patchAuthors`` is added. The resulting set is sorted
    with pyuca collation keys and written, UTF-8 encoded, between a fixed
    header and a fixed "special thanks" footer.
    """
    authorsSet = set()
    authorsText = gitAuthorsOutput()
    for line in authorsText.split("\n"):
        if line == '':
            continue

        # Strip the e-mail placeholder completely when no address is set.
        # See for example "zapman <unknown>".
        if " <unknown>" in line:
            line = line.replace(" <unknown>", "")

        # Use the GitHub profile URL instead of the
        # <user>@users.noreply.github.com placeholder address.
        if '@users.noreply.github.com' in line:
            line = line.replace('@users.noreply.github.com', '')
            line = line.replace('<', '<https://github.com/')

        if line in blacklist:
            continue

        authorsSet.add(line)

    authorsSet.update(patchAuthors)

    # Sort alphabetically
    authors = list(authorsSet)
    collator = pyuca.Collator()
    authors.sort(key=collator.sort_key)

    # The context manager guarantees the file is closed (and flushed) even
    # if one of the writes raises.
    with codecs.open("AUTHORS", "w", "utf-8") as f:
        f.write('''// This is the official list of people who have contributed
// to, and/or hold the copyright to Mumble.
//
// The use of Mumble source code is governed by a BSD-style
// license that can be found in the LICENSE file at the root
// of the Mumble source tree or at <https://www.mumble.info/LICENSE>.
//
// Contributions made on behalf of another entity, such as a
// company are indicated with the following suffix:
//
//     John Doe <*****@*****.**> (on behalf of $COMPANY)
//
// It is possible to mix individual contributions with company
// contributions. For example, if a contributor, over time,
// has contributed code copyrighted by the contributor, as well
// as various companies:
//
//     John Doe <*****@*****.**> (individually, on behalf of
//                                $COMPANY1, on behalf of
//                                $COMPANY2, [...]).
//
// Mumble's code is developed in a Git repository. A log of
// every contribution ever made to Mumble is available in the
// Git repository. The Git repository can be queried to get
// detailed authorship information for copyright and attribution
// purposes for each file that makes up the software. A detailed
// analysis of contributions made to Mumble is available via GitHub's
// contribution statistics:
//
// <https://github.com/mumble-voip/mumble/graphs/contributors>

''')

        for author in authors:
            f.write(author)
            f.write("\n")

        f.write("""
// Special thanks to:
//
//    Thorvald Natvig, for founding the Mumble project
//    and maintaining it during its formative years.
""")

コード例 #6

0

ファイルを表示

import pyuca

# Thai province names used to contrast code-point ordering with UCA ordering.
provinces = [
    "พิษณุโลก", "ชัยนาท", "เลย", "เพชรบุรี", "ลำปาง", "ขอนแก่น", "กรุงเทพ",
    "เชียงใหม่", "ตาก", "แพร่"
]
# Plain sorted() compares strings by code point.
s = sorted(provinces)
print(s)

# Build the collator once and reuse it for both sorts instead of
# constructing a new pyuca.Collator() for each call.
collator = pyuca.Collator()
s_correct = sorted(provinces, key=collator.sort_key)
print(s_correct)

# sorted() returned new lists, so `provinces` is still in its original order here.
print(provinces)
# list.sort() reorders `provinces` in place.
provinces.sort(key=collator.sort_key)
print(provinces)

コード例 #7

0

ファイルを表示

def unicode_cmp():
    """Print a sample list of fruit names sorted with pyuca collation keys."""
    collation_key = pyuca.Collator().sort_key
    names = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
    print(sorted(names, key=collation_key))

コード例 #8

0

ファイルを表示

ファイル: 01null.py プロジェクト: trtum01/hello-github-sample

    data = csv.reader(file) # = list of string
    data = csv.DictReader(file) # read file header
    print(data.fieldnames)
    fw=csv.writer
    fw.writerow(file)

import datetime
# Timestamp as "YY MM DD HH:MM:SS". The time part needs the uppercase codes
# %H (hour), %M (minute) and %S (second); lowercase %m is the month, and
# %h / %s are not portable strftime codes.
dt = datetime.datetime.now().strftime('%y %m %d %H:%M:%S')

import urllib.request

with urllib.request.urlopen(link)as f:
    pass

import pyuca  # Unicode Collation Algorithm
# Sort using pyuca collation keys (the closing parenthesis was missing).
sorted(file, key=pyuca.Collator().sort_key)

from collections import Counter, OrderedDict


# `Orderdict` is not a defined name; the insertion-ordered mapping in
# `collections` is called OrderedDict.
od = OrderedDict()

a = vars()
create dict attributes
attrs =
s =[ %% getattr(self,a) for a in attrs]

@property
def blood(self):
    """Return the value stored in the ``__blood`` attribute of ``self``."""
    # NOTE(review): shown here without its enclosing class; inside a class
    # body ``self.__blood`` is name-mangled to ``_ClassName__blood``.
    return self.__blood

コード例 #9

0

ファイルを表示

ファイル: sort_dict.py プロジェクト: NoobNovice/NLP-project492

import pyuca
import sys
import re

# Read every line of the file named by the first CLI argument, stripped of
# surrounding whitespace. The context manager closes the file before it is
# reopened for writing below.
with open(sys.argv[1], 'r', encoding='utf-8') as text_in:
    data_in = [line.strip() for line in text_in]

data_sort = sorted(data_in, key=pyuca.Collator().sort_key)

# Overwrite the same file with the sorted lines, one per line.
with open(sys.argv[1], "w", encoding='utf-8') as text_out:
    for data in data_sort:
        text_out.write(data + "\n")

コード例 #10

0

ファイルを表示

ファイル: rn-parse.py プロジェクト: p-moreira/covid19

def parse(bulletin_url, first_page, last_page, check=None, date=None, coord=False):
    """Scrape a bulletin PDF into a per-municipality CSV file.

    The PDF is downloaded to ``bulletin.pdf``, its tables are read from the
    given page range, cleaned up, optionally verified against a reference
    CSV, optionally merged with coordinates, sorted by municipality name
    using pyuca collation, and written to ``{date}.csv``.

    Args:
        bulletin_url: URL of the PDF, fetched with ``requests.get``.
        first_page: First page passed to ``read_pdf`` (inclusive).
        last_page: Last page passed to ``read_pdf`` (inclusive).
        check: When truthy, compare the scraped data with the reference CSV
            at ``{base_url}/{date}.csv`` and exit on the first mismatch.
        date: Bulletin date as a ``"%m-%d-%Y"`` string. Although it defaults
            to ``None``, it is parsed unconditionally and names the output
            file, so callers must supply it.
        coord: When truthy, passed to ``pd.read_csv`` to load data that is
            right-merged into the result; when falsy, no merge is done.

    Returns:
        None. The result is written to ``{date}.csv``.
    """
    # request to find the documents
    pdf = requests.get(bulletin_url)
    # Close the file explicitly before the PDF reader opens it, so the
    # downloaded bytes are guaranteed to be flushed to disk.
    with open('bulletin.pdf', 'wb') as bulletin_file:
        bulletin_file.write(pdf.content)

    # parse DataFrame from pdf
    dfs = read_pdf("bulletin.pdf", stream=True, pages=list(range(first_page, last_page + 1)))
    columns = ['municipio', 'suspeito', 'confirmado']
    # Only 5-column tables are used; the first two rows of each are skipped
    # and "-" cells are replaced by 0.
    data = pd.concat(pd.DataFrame(df.iloc[2:,[0,1,4]].replace("-",0).values, columns=columns)
                     for df in dfs if df.shape[1] == 5)
    data = data.reset_index(drop=True)

    # checksum the data
    data_checksum = data.fillna(0)
    data_checksum["suspeito"] = data_checksum["suspeito"].astype("int")
    data_checksum["confirmado"] = data_checksum["confirmado"].astype("int")

    # The last row is treated as the totals row and removed from the data.
    total = data_checksum.iloc[-1,]
    data_checksum = data_checksum.iloc[:-1,]
    data = data.iloc[:-1,]

    if not all(sum(data_checksum[feature]) == total[feature] for feature in ["confirmado", "suspeito"]):
        print("Atenção! O total raspado não condiz com o total informado no boletim!")
        for feature in ["confirmado", "suspeito"]:
            print(f"{feature}: {sum(data_checksum[feature])} (raspado), {total[feature]} (boletim)")

    # fix multirow lines
    # A row with a null municipality is the middle of a name split across
    # three rows: the name parts are in the rows above and below, and the
    # counts are in the middle row.
    drop_lines = []
    for index, row in data[data['municipio'].isnull()].iterrows():
        #print(' '.join(data.iloc[[index-1, index+1], 0]))
        data.at[index-1, 'municipio'] = ' '.join(data.iloc[[index-1, index+1], 0])
        data.at[index-1, 'suspeito'] = data.iloc[index, 1]
        data.at[index-1, 'confirmado'] = data.iloc[index, 2]
        drop_lines.extend([index, index+1])

    data = data.drop(drop_lines).fillna(0)

    # fixing city names
    data.loc[data["municipio"] == "Governado Dix-Sep Rosado", "municipio"] = "Governador Dix-Sept Rosado"
    data.loc[data["municipio"] == "Lagoa d’Anta", "municipio"] = "Lagoa d'Anta"
    data.loc[data["municipio"] == "Santana dos Matos", "municipio"] = "Santana do Matos"
    if "Assú" in data["municipio"].unique():
        data.loc[data["municipio"] == "Assú", "municipio"] = "Açu"

    # verifying against manually collected data
    data["confirmado"] = data["confirmado"].astype(int)
    data["suspeito"] = data["suspeito"].astype(int)
    data = data.query("suspeito > 0 or confirmado > 0")
    data = data.reset_index(drop=True)

    # checking against existing CSVs
    base_url = "https://raw.githubusercontent.com/leobezerra/covid19/master/data/rn_covid_19_boletins"
    if check:
        df_old = pd.read_csv(f"{base_url}/{date}.csv").query("suspeito > 0 or confirmado > 0")
        data_mun = set(data["municipio"].unique())
        old_mun = set(df_old["municipio"].unique())
        if data_mun != old_mun:
            # Warn only when the two sets of municipalities actually differ.
            print("Atenção! Os municípios raspados e de referência não batem")
            print("Não estão no CSV de referência: ", data_mun - old_mun)
            print("Não estão nos dados raspados: ", old_mun - data_mun)
            exit()
        for m in df_old.municipio:
            if df_old[df_old['municipio'] == m]['confirmado'].iloc[0] != data[data['municipio'] == m]['confirmado'].iloc[0]:
                print(m)
                print(df_old[df_old['municipio'] == m]['confirmado'].iloc[0], data[data['municipio'] == m]['confirmado'].iloc[0])
                exit()

    # adding latitude and longitude data
    if coord:
        coord_rn = pd.read_csv(coord)
        # Right merge keeps every row of the coordinates table; rows without
        # scraped data get 0 from fillna.
        data = pd.merge(data, coord_rn, how="right").fillna(0)
        data["confirmado"] = data["confirmado"].astype(int)
        data["suspeito"] = data["suspeito"].astype(int)

    # adding date
    data = data.assign(data=datetime.strptime(date, "%m-%d-%Y").strftime("%Y-%m-%d"))
    data = pd.DataFrame(data, columns="municipio,data,confirmado,suspeito,lat,lon".split(","))

    # Pandas is not pythonic
    # Order rows by municipality name using pyuca collation: build a frame of
    # the sorted names and merge the data back onto it.
    coll = pyuca.Collator()
    df_municipios = pd.DataFrame(sorted(data["municipio"], key=coll.sort_key), columns=["mun"])
    data = pd.merge(df_municipios, data, left_on="mun", right_on="municipio").drop("mun", axis=1)

    # persisting
    data.to_csv(f"{date}.csv", index=False)

コード例 #11

0

ファイルを表示

    def __iter__(self):
        """
        Yield country records ordered by name.

        Every record pairs the two letter ISO3166-1 country ``code`` with
        the country's short ``name``. Ordering follows the translation that
        is active in the current thread.

        Countries listed in ``settings.COUNTRIES_FIRST`` come out ahead of
        the sorted list, in the order they were configured; they show up
        again inside the sorted list only when
        ``settings.COUNTRIES_FIRST_REPEAT`` is ``True``.

        When ``settings.COUNTRIES_FIRST_BREAK`` is set, its value is emitted
        as a separator record between the first countries and the sorted
        list.
        """
        # Reading ``self.countries`` also initializes ``countries_first``,
        # so this lookup has to come before anything else.
        countries = self.countries

        # Lazily translated records for the countries shown first.
        countries_first = (self.translate_pair(code) for code in self.countries_first)

        # Pick the sort key: UCA when pyuca is available, otherwise a cheap
        # ASCII-only approximation.
        if pyuca:
            collator = pyuca.Collator()

            def sort_key(item):
                return collator.sort_key(item[1])
        else:
            def sort_key(item):
                decomposed = unicodedata.normalize("NFKD", item[1])
                return decomposed.encode("ascii", "ignore").decode("ascii")

        if self.get_option("first_sort"):
            countries_first = sorted(countries_first, key=sort_key)

        yield from countries_first

        if self.countries_first:
            separator = self.get_option("first_break")
            if separator:
                yield ("", force_str(separator))

        # Translate everything up front so sorting sees translated names.
        if self.get_option("first_repeat"):
            skip_codes = None
        else:
            skip_codes = self.countries_first
        countries = tuple(
            record
            for code in countries
            for record in self.translate_code(code, skip_codes)
        )

        # Finally emit the sorted country list.
        yield from sorted(countries, key=sort_key)

コード例 #12

0

ファイルを表示

from collections import Iterable
from time import time
from itertools import tee, islice, chain, izip
from os import path

from Acquisition import aq_inner
from zope.component import getMultiAdapter
from zope.schema import getFieldsInOrder
from zope import i18n

import pyuca

# Path of the 'allkeys.txt' file that sits in the same directory as the
# pyuca module file, and a shared collator built from it.
allkeys = path.join(path.dirname(pyuca.__file__), 'allkeys.txt')
collator = pyuca.Collator(allkeys)


def flatten(l):
    """Yield the leaf elements of an irregularly nested iterable, depth first.

    Strings are treated as leaves rather than being split into characters.
    Adapted from:
    http://stackoverflow.com/questions/2158395/flatten-an-irregular-list-of-lists-in-python

    """
    for item in l:
        is_nested = isinstance(item, Iterable) and not isinstance(item, basestring)
        if not is_nested:
            yield item
            continue
        # Recurse into nested iterables and re-yield their leaves.
        for leaf in flatten(item):
            yield leaf


def get_interface_fields(interface):