def test_find_something_rare():
    """A strategy limited to category 'Zs' from code point 12288 upwards
    produces a 'Zs' example, and searching it for a non-'Zs' character
    raises ``NoSuchExample``."""

    def is_space_separator(char):
        return unicodedata.category(char) == 'Zs'

    def is_other_category(char):
        return unicodedata.category(char) != 'Zs'

    st = characters(whitelist_categories=['Zs'], min_codepoint=12288)

    find(st, is_space_separator)

    with pytest.raises(NoSuchExample):
        find(st, is_other_category)
Esempio n. 2
def TokenOffsets(string: str):
    """
    Yield the offsets of all Unicode category borders in the *string*,
    including the initial 0 and the final offset value of ``len(string)``.

    Capitalized words special case: A single upper case letter ('Lu')
    followed by lower case letters ('Ll') are treated as a single token.

    Nothing is yielded when *string* is ``None`` or empty.
    """
    if not string:
        return

    yield 0
    last = category(string[0])

    for i in range(1, len(string)):
        current = category(string[i])

        if last != current:
            # "join" capitalized tokens: an 'Lu' -> 'Ll' change is not a
            # border when the upper case letter stands alone, i.e. it is
            # the first character or is not preceded by another 'Lu'.
            joins_capitalized = (
                last == 'Lu' and
                current == 'Ll' and
                (i == 1 or category(string[i - 2]) != 'Lu')
            )
            if not joins_capitalized:
                yield i

        last = current

    yield len(string)
Esempio n. 3
def crear_nombre_usuario(nombre, apellidos):
    """Build a username that no existing ``Gauser`` uses yet.

    The base is the initial of every word of *nombre*, followed by the
    whole first word of *apellidos* and the initial of each further
    surname word. Accents are removed and everything is lowercased.
    When there is no surname the literal ``'sin'`` is used instead.
    A counter starting at 1 is appended and increased until the
    resulting username is free.

    :param nombre: given name(s).
    :param apellidos: surname(s); may be empty.
    :return: the first unused username of the form ``<base><n>``.
    """
    def _palabras(texto):
        # Remove accents (combining marks), lowercase and split into words.
        sin_tildes = ''.join(
            c for c in unicodedata.normalize('NFD', smart_unicode(texto))
            if unicodedata.category(c) != 'Mn')
        return sin_tildes.lower().split()

    nombre = _palabras(nombre)
    apellidos = _palabras(apellidos)

    iniciales_nombre = ''.join(parte[0] for parte in nombre)
    if apellidos:
        # First surname in full, remaining surnames as initials only.
        iniciales_apellidos = apellidos[0] + ''.join(
            apellido[0] for apellido in apellidos[1:])
    else:
        # Users without surnames still get a username.
        iniciales_apellidos = 'sin'

    usuario = iniciales_nombre + iniciales_apellidos
    n = 1
    username = usuario + str(n)
    # Increase the numeric suffix until no Gauser has this username.
    while Gauser.objects.filter(username=username).exists():
        n += 1
        username = usuario + str(n)
    return username
def test_exclude_characters_of_specific_groups():
    """With 'Lu' and 'Nd' blacklisted, characters outside each category can
    be found and no example belongs to either category."""

    def not_uppercase(char):
        return unicodedata.category(char) != 'Lu'

    def not_decimal_digit(char):
        return unicodedata.category(char) != 'Nd'

    def in_excluded_category(char):
        return unicodedata.category(char) in ('Lu', 'Nd')

    st = characters(blacklist_categories=('Lu', 'Nd'))

    find(st, not_uppercase)
    find(st, not_decimal_digit)

    assert_no_examples(st, in_excluded_category)
Esempio n. 5
	def __new__(cls,s,on_fail='die',msg=None):
		"""Create a validated string instance from *s*.

		An object that already has exactly this class is returned as is.
		Otherwise *s* is stripped, decoded from UTF-8 when it is not a
		``str``, and checked: no control or combining characters, width or
		length limits, minimum length, and the class's ``allowed`` /
		``forbidden`` symbol lists. Any failure during validation is passed
		to ``cls.init_fail`` and its result is returned.
		"""
		if type(s) == cls: return s
		# Sanity-check the class configuration itself (outside the try: a
		# misconfigured class is a programming error, not bad input).
		for k in cls.forbidden,cls.allowed:
			assert type(k) == list
			for ch in k: assert type(ch) == str and len(ch) == 1
		try:
			s = s.strip()
			if type(s) != str:
				s = s.decode('utf8')
			for ch in s:
				# Allow:    (L)etter,(N)umber,(P)unctuation,(S)ymbol,(Z)space
				# Disallow: (C)ontrol,(M)combining
				# Combining characters create width formatting issues, so disallow them for now
				major = unicodedata.category(ch)[0]
				if major in 'CM':
					t = { 'C':'control', 'M':'combining' }[major]
					raise ValueError('{}: {} characters not allowed'.format(ascii(ch),t))
			me = str.__new__(cls,s)
			if cls.max_screen_width:
				# Full-width and wide East Asian characters occupy two screen columns
				me.screen_width = len(s) + len([1 for ch in s if unicodedata.east_asian_width(ch) in ('F','W')])
				assert me.screen_width <= cls.max_screen_width,(
					'too wide (>{} screen width)'.format(cls.max_screen_width))
			else:
				assert len(s) <= cls.max_len, 'too long (>{} symbols)'.format(cls.max_len)
			assert len(s) >= cls.min_len, 'too short (<{} symbols)'.format(cls.min_len)
			assert not cls.allowed or set(s).issubset(set(cls.allowed)),\
				'contains non-allowed symbols: {}'.format(' '.join(set(s) - set(cls.allowed)))
			assert not cls.forbidden or not any(ch in s for ch in cls.forbidden),\
				"contains one of these forbidden symbols: '{}'".format("', '".join(cls.forbidden))
			return me
		except Exception as e:
			return cls.init_fail(e,s)
Esempio n. 6
def tokens(source):
    """Split *source* into a stream of token objects.

    Yields, in order of appearance:

    * ``NewlineToken`` for each character found in ``NEWLINE_CHARS``;
    * ``WhitespaceToken`` for a run of characters whose Unicode major
      category is C (other/control) or Z (separator);
    * ``WordToken`` for a run of letters (L) and numbers (N);
    * ``SymbolToken`` for a run of one repeated character of any other
      category.
    """
    p = 0
    while p < len(source):
        ch = source[p]
        cat = category(ch)
        if ch in NEWLINE_CHARS:
            yield NewlineToken(source[p])
            p += 1
        elif cat[0] in "CZ":
            q = p + 1
            while q < len(source) and category(source[q])[0] in "CZ":
                q += 1
            yield WhitespaceToken(source[p:q])
            p = q
        elif cat[0] in "LN":
            q = p + 1
            while q < len(source) and category(source[q])[0] in "LN":
                q += 1
            yield WordToken(source[p:q])
            p = q
        else:
            # Anything else is a symbol; identical neighbours are grouped.
            q = p + 1
            while q < len(source) and source[q] == ch:
                q += 1
            yield SymbolToken(source[p:q])
            p = q
def get_match_list(data, match_list, order_list=None, only_ascii=False, ignorecase=False):
    """
    Search a text for matches against a dictionary of "ID" -> list of
    search strings:
     { "ID1" : ["String 1", "String 2", "String 3"],
       "ID2" : ["String 4", "String 5", "String 6"]
     }
    The dictionary must not contain the same search string under several IDs.
    Search strings are tried from longest to shortest. When a string
    matches, it is removed from the text used for the following ones, so
    that two categories are not detected when one string is part of another.
    Example: with "Idioma Español" and "Español", the text
    "Pablo sabe hablar el Idioma Español" matches "Idioma Español" but not
    "Español", because the longest match has priority.

    :param data: text to search (UTF-8 bytes or text).
    :param match_list: dict mapping an ID to a list of search strings
        (UTF-8 bytes or text).
    :param order_list: optional list of IDs; every key of *match_list*
        must be in it.
    :param only_ascii: strip accents (combining marks) before comparing.
    :param ignorecase: compare case-insensitively (everything uppercased).
    :return: a class object named ``Mtch_list`` with two attributes:
        ``key`` is the ID of the last match found (``None`` if nothing
        matched); ``index`` is the position of that ID in *order_list*,
        ``len(order_list)`` if nothing matched, or ``None`` when no
        *order_list* was given.
    :raises Exception: if a key is missing from *order_list* or a search
        string appears more than once.
    """
    import unicodedata

    def to_text(value):
        # Byte strings are decoded as UTF-8; text is used unchanged.
        return value.decode("utf8") if isinstance(value, bytes) else value

    def strip_marks(text):
        return ''.join(c for c in unicodedata.normalize('NFD', text)
                       if unicodedata.category(c) != 'Mn')

    data = to_text(data)

    # Invert the dictionary to {"String 1": "ID1", "String 4": "ID2", ...}
    match_dict = dict()
    for key in match_list:
        if order_list and key not in order_list:
            raise Exception("key '%s' not in order_list" % key)
        for value in match_list[key]:
            value = to_text(value)
            if value in match_dict:
                raise Exception("Duplicate word in list: '%s'" % value)
            match_dict[value] = key

    # If ignorecase is set, compare everything in upper case
    if ignorecase:
        data = data.upper()
        match_dict = dict((key.upper(), match_dict[key]) for key in match_dict)

    # If only_ascii is set, remove all accents (including the tilde of Ñ)
    if only_ascii:
        data = strip_marks(data)
        match_dict = dict((strip_marks(key), match_dict[key]) for key in match_dict)

    # Try the search strings from longest to shortest, removing each match
    # from the remaining text so shorter substrings of it cannot match.
    matches = []
    remaining = data
    for match in sorted(match_dict, key=len, reverse=True):
        if match in remaining:
            matches.append(match)
            remaining = remaining.replace(match, "")

    key = match_dict[matches[-1]] if matches else None
    if order_list:
        index = order_list.index(key) if matches else len(order_list)
    else:
        index = None
    return type("Mtch_list", (), {"key": key, "index": index})
Esempio n. 8
	def ranking(self):
		"""
		For each result, removes stopwords, ranks the word, augments the query
		and returns True if successful else False

		Each entry of ``self.results`` is read as ``(title, summary)``.
		Punctuation is stripped from both, and every remaining word that is
		not a stop word is passed to ``self.applyRanking`` together with a
		flag for title/summary and a flag telling whether the user marked
		result ``i`` with ``'y'``. The return value is whatever
		``self.augmentQuery()`` returns.
		"""
		print("Indexing results ....")

		for i in range(len(self.results)):
			result = self.results[i]
			title = result[0]
			summary = result[1]
			relevant = self.user_feedback[i] == 'y'

			# Remove punctuation and create lists of words
			titleWords = "".join(c for c in title if not unicodedata.category(c).startswith('P')).split()
			summaryWords = "".join(c for c in summary if not unicodedata.category(c).startswith('P')).split()

			for tw in titleWords:
				if tw.lower() in self.stopWords:
					continue
				self.applyRanking(i, tw, True, relevant)

			for sw in summaryWords:
				if sw.lower() in self.stopWords:
					continue
				self.applyRanking(i, sw, False, relevant)

		print("Indexing results ....")

		return self.augmentQuery()
Esempio n. 9
def normalize_roman(string, additional=None):
    """Removes diacritics from the string and converts to lowercase.

        >>> normalize_roman(u'Eèé')
        'eee'

    Characters of category 'Lo' (letters without case) are kept untouched
    instead of being decomposed.

    :param string: text to normalize.
    :param additional: optional dict of character replacements. Its keys
        are replaced by their values and its values are passed through
        unchanged; every other character is normalized as usual.
    :return: the normalized text.
    """
    if additional:
        # Characters that must bypass the regular normalization.
        safe = set(additional) | set(additional.values())
        return ''.join(
            additional.get(c, c) if c in safe else normalize_roman(c)
            for c in string)

    chars = []
    for c in string:
        if unicodedata.category(c) == 'Lo':
            chars.append(c)
        else:
            # Decompose, then drop the combining marks (the diacritics).
            nor = unicodedata.normalize('NFD', c)
            chars.extend(x for x in nor if unicodedata.category(x) != 'Mn')
    return ''.join(chars).lower()
Esempio n. 10
 def characters(self, content):
     """Handle a chunk of character data (*content*) for the current element.

     NOTE(review): several ``if`` statements below have no visible body in
     this copy of the source, so only the end-of-definition handling can be
     described; confirm against the complete file.
     """
     # Whitespace-stripped copy; the raw ``content`` is still consulted below.
     text = content.strip()
     if self._inTitle:
         if self._headerProcessed:
             # NOTE(review): body of this branch is not visible here.
             if not self._ignoreTitle:
     else :
         if self._headerProcessed:           
             if not self._ignoreText:
                 if len(text) > 0:
                     # NOTE(review): bodies of the next two checks are not visible here.
                     if not self._glossTitleWritten and not self._inTitle:
                     if not self._inParagraph and not self._inGeneratedPara and not self._inArticle and not self._lineGroupPara and not self._inTable:
                     # First text after a closed definition term: decide how the
                     # dash ('Pd' = dash punctuation) between term and text is written.
                     if self._endDfn:
                         if self._keywordTag == 'dfn':
                             if unicodedata.category(content[0]) == 'Pd':
                                 # Text already starts with a dash: only add a space.
                                 self._writeHtml(' ')
                             elif content[0] == ' ':
                                 if unicodedata.category(text[0]) != 'Pd':
                                     self._writeHtml(u' \u2014')
                                 self._writeHtml(u' \u2014 ')
                         else:                                   # 'h4' for fb2
                             # Drop a leading dash from the text.
                             if unicodedata.category(text[0]) == 'Pd':
                                 text = text[1:]
                         self._endDfn = False
Esempio n. 11
  def splitText(text):
    """ Split text into sub segments of size not bigger than MAX_SEGMENT_SIZE.

    The text is first passed through ``cleanSpaces``. While it is longer
    than the limit, it is cut at the last punctuation character of the
    first ``MAX_SEGMENT_SIZE`` characters, or else the last separator
    (category Z*), or else the last character that is neither a letter nor
    a number, or else at the size limit. Whitespace and ASCII punctuation
    at the start of the remainder are dropped.

    Returns the list of segments in order.
    """
    segments = []
    remaining_text = __class__.cleanSpaces(text)

    while len(remaining_text) > __class__.MAX_SEGMENT_SIZE:
      cur_text = remaining_text[:__class__.MAX_SEGMENT_SIZE]

      # try to split at punctuation
      split_idx = __class__.findLastCharIndexMatching(cur_text,
                                                      lambda x: unicodedata.category(x) in ("Ps", "Pe", "Pi", "Pf", "Po"))
      if split_idx is None:
        # try to split at whitespace
        split_idx = __class__.findLastCharIndexMatching(cur_text,
                                                        lambda x: unicodedata.category(x).startswith("Z"))
      if split_idx is None:
        # try to split at anything not a letter or number
        split_idx = __class__.findLastCharIndexMatching(cur_text,
                                                        lambda x: not (unicodedata.category(x)[0] in ("L", "N")))
      if split_idx is None:
        # split at the last char
        split_idx = __class__.MAX_SEGMENT_SIZE - 1

      new_segment = cur_text[:split_idx + 1].rstrip()
      segments.append(new_segment)
      remaining_text = remaining_text[split_idx + 1:].lstrip(string.whitespace + string.punctuation)

    # whatever is left fits in a single segment
    if remaining_text:
      segments.append(remaining_text)

    return segments
Esempio n. 12
def consolidate_ampers(text: str) -> str:
    """Converts all ampersands in a text to a single one (&).

    An ampersand variant is any punctuation or symbol character whose
    Unicode name contains " ampersand" or "ampersand " (case-insensitive),
    for example FULLWIDTH AMPERSAND or SMALL AMPERSAND.

    :param text: A string which should have ampersands converted.
    :return: The text string after all ampersands have been replaced.
    """
    # Change all ampersands to one type of ampersand in a single pass
    return text.translate(_ampersand_table())


# Translation table {code point: "&"}; filled on first use because
# building it scans the whole Unicode range.
_AMPERSAND_TABLE = {}


def _ampersand_table():
    """Return the ampersand translation table, building it on first use."""
    if not _AMPERSAND_TABLE:
        chosen_amper_value = "\u0026"
        pattern = re.compile(r" ampersand|ampersand ", re.IGNORECASE)
        for code in range(sys.maxunicode + 1):
            char = chr(code)
            if char == chosen_amper_value:
                continue
            if not unicodedata.category(char).startswith(('P', 'S')):
                continue
            # A default name avoids ValueError for unnamed characters
            if pattern.search(unicodedata.name(char, "")) is not None:
                _AMPERSAND_TABLE[code] = chosen_amper_value
    return _AMPERSAND_TABLE
def test_characters_of_specific_groups():
    """With only 'Lu' and 'Nd' whitelisted, both categories can be found
    and no example falls outside them."""
    allowed = ("Lu", "Nd")

    def is_uppercase(char):
        return unicodedata.category(char) == "Lu"

    def is_decimal_digit(char):
        return unicodedata.category(char) == "Nd"

    def outside_allowed(char):
        return unicodedata.category(char) not in allowed

    st = characters(whitelist_categories=("Lu", "Nd"))

    find_any(st, is_uppercase)
    find_any(st, is_decimal_digit)

    assert_no_examples(st, outside_allowed)
def test_exclude_characters_of_specific_groups():
    """With 'Lu' and 'Nd' blacklisted, characters outside each category can
    be found and no example belongs to either of them."""
    excluded = ("Lu", "Nd")

    def not_uppercase(char):
        return unicodedata.category(char) != "Lu"

    def not_decimal_digit(char):
        return unicodedata.category(char) != "Nd"

    def inside_excluded(char):
        return unicodedata.category(char) in excluded

    st = characters(blacklist_categories=("Lu", "Nd"))

    find_any(st, not_uppercase)
    find_any(st, not_decimal_digit)

    assert_no_examples(st, inside_excluded)
Esempio n. 15
 def parse(cls, string):
     """Build an instance of *cls* from *string*, read as groups of fields
     separated by "." and "-".

     NOTE(review): several branch bodies and the ``try:``/``else:`` lines
     are not visible in this copy of the source; the description below
     covers only what the remaining lines show. Confirm against the
     complete file.
     """
     from unicodedata import category
     # List of parts; each part is a list of string fields.
     parts = []
     last_ch = None
     for ch in string:
         if last_ch is None:
         elif ch == ".":
             # A separator directly after another separator: the field in
             # between gets "0" appended.
             if last_ch in ".-":
                 parts[-1][-1] += "0"
         elif ch == "-":
             if last_ch in ".-":
                 parts[-1][-1] += "0"
             # Compares the major Unicode category of this character with
             # the previous one (e.g. letter vs. digit).
             if last_ch not in ".-" and category(ch)[0] != category(last_ch)[0]:
                 parts[-1][-1] += ch
         last_ch = ch
     for part in parts:
         for i, x in enumerate(part):
                 # Fields are converted to int; ValueError/TypeError are handled below.
                 part[i] = int(x)
             except (ValueError, TypeError):
         # Drop trailing falsy fields (0 or "") but keep at least one field.
         while len(part) > 1 and not part[-1]:
             part[:] = part[:-1]
     return cls(*map(tuple, parts))
 def is_yelling(stuff):
     """
     :return boolean True if all letters in stuff are uppercased

     Input without any letter is never yelling.
     """
     try:
         text = unicode(stuff)
     except NameError:  # Python 3: ``str`` is the text type
         text = str(stuff)
     # A list, not ``filter``: on Python 3 ``filter`` returns an iterator,
     # which never compares equal to an empty string.
     letters = [c for c in text if 'L' in unicodedata.category(c)]  # 'L' category is for 'letter'
     if not letters:
         return False
     return all('u' in unicodedata.category(c) for c in letters)  # 'u' category is for 'uppercase'
def test_characters_of_specific_groups():
    """With only 'Lu' and 'Nd' whitelisted, both categories can be found
    and no example falls outside them."""
    st = characters(whitelist_categories=('Lu', 'Nd'))

    find(st, lambda c: unicodedata.category(c) == 'Lu')
    find(st, lambda c: unicodedata.category(c) == 'Nd')

    assert_no_examples(
        st, lambda c: unicodedata.category(c) not in ('Lu', 'Nd'))
def test_exclude_characters_of_specific_groups():
    """With 'Lu' and 'Nd' blacklisted, characters outside each category can
    be found, while searching for a blacklisted one raises
    ``NoSuchExample``."""

    def category_of(char):
        return unicodedata.category(char)

    st = characters(blacklist_categories=('Lu', 'Nd'))

    find(st, lambda c: category_of(c) != 'Lu')
    find(st, lambda c: category_of(c) != 'Nd')

    with pytest.raises(NoSuchExample):
        find(st, lambda c: category_of(c) in ('Lu', 'Nd'))
def test_characters_of_specific_groups():
    """With only 'Lu' and 'Nd' whitelisted, both categories can be found,
    while searching for any other category raises ``NoSuchExample``."""

    def category_of(char):
        return unicodedata.category(char)

    st = characters(whitelist_categories=('Lu', 'Nd'))

    find(st, lambda c: category_of(c) == 'Lu')
    find(st, lambda c: category_of(c) == 'Nd')

    with pytest.raises(NoSuchExample):
        find(st, lambda c: category_of(c) not in ('Lu', 'Nd'))
def test_characters_of_specific_groups():
    """With only "Lu" and "Nd" whitelisted, both categories can be found,
    while searching for any other category raises ``NoSuchExample``."""

    def is_uppercase(char):
        return unicodedata.category(char) == "Lu"

    def is_decimal_digit(char):
        return unicodedata.category(char) == "Nd"

    def is_anything_else(char):
        return unicodedata.category(char) not in ("Lu", "Nd")

    st = characters(whitelist_categories=("Lu", "Nd"))

    find(st, is_uppercase)
    find(st, is_decimal_digit)

    with pytest.raises(NoSuchExample):
        find(st, is_anything_else)
Esempio n. 21
    def combine_modifiers(self, string):
        """
        Given a string that is space-delimited on Unicode grapheme clusters,
        group Unicode modifier letters with their preceding base characters,
        deal with tie bars, etc.

        Parameters
        ----------
        string : str
            A Unicode string tokenized into grapheme clusters to be tokenized into simple IPA.

        Returns
        -------
        str
            The regrouped segments, joined by single spaces.
        """
        result = []
        graphemes = string.split()
        temp = ""
        count = len(graphemes)
        # Walk backwards so that modifiers are seen before the base
        # character they attach to; ``result`` is therefore in reverse order.
        for grapheme in reversed(graphemes):
            count -= 1
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Lm" and not ord(grapheme) in [712, 716]:
                temp = grapheme+temp
                # hack for the cases where a space modifier is the first character in the string
                if count == 0:
                    if result:
                        result[-1] = temp+result[-1]
                    else:
                        # nothing to attach to: keep the modifiers as a segment
                        result.append(temp)
                continue
            # catch and repair stress marks
            if len(grapheme) == 1 and ord(grapheme) in [712, 716]:
                if result:
                    result[-1] = grapheme+result[-1]
                else:
                    result.append(grapheme)
                temp = ""
                continue

            # combine contour tone marks (non-accents)
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Sk":
                if len(result) == 0:
                    result.append(grapheme)
                    temp = ""
                    continue
                else:
                    if unicodedata.category(result[-1][0]) == "Sk":
                        result[-1] = grapheme+result[-1]
                        temp = ""
                        continue

            # base character: attach the modifiers collected so far
            result.append(grapheme+temp)
            temp = ""

        # last check for tie bars
        segments = result[::-1]
        i = 0
        r = []
        while i < len(segments):
            # tie bars: join the segment with the following one, if any
            if ord(segments[i][-1]) in [865, 860] and i + 1 < len(segments):
                r.append(segments[i]+segments[i+1])
                i = i+2
            else:
                r.append(segments[i])
                i += 1
        return " ".join(r)
Esempio n. 22
def filterCharacters(s):
	"""
	Strip non printable characters

	Containers (dict, list, tuple) are processed recursively; dict keys
	are left untouched. Byte strings are decoded as UTF-8, filtered and
	encoded again.

	@type  s: dict|list|tuple|bytes|string
	@param s: Object to remove non-printable characters from

	@rtype:	 dict|list|tuple|bytes|string
	@return: An object that corresponds with the original object, nonprintable characters removed.
	         None for any other type of object.
	"""
	validCategories = ('Lu', 'Ll', 'Lt', 'LC', 'Lm', 'Lo', 'L', 'Mn', 'Mc', 'Me', 'M', 'Nd', 'Nl', 'No', 'N', 'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'P', 'Sm', 'Sc', 'Sk', 'So', 'S', 'Zs', 'Zl', 'Zp', 'Z')

	if isinstance(s, dict):
		return {k: filterCharacters(v) for k, v in s.items()}

	if isinstance(s, list):
		return [filterCharacters(item) for item in s]

	if isinstance(s, tuple):
		return tuple(filterCharacters(item) for item in s)

	# Pick the binary and text string types of the running interpreter
	if (3, 0) <= sys.version_info:
		binaryType, textType = bytes, str
	else:
		binaryType, textType = str, unicode

	convertToBytes = isinstance(s, binaryType)
	if convertToBytes:
		s = s.decode('utf-8')

	if not isinstance(s, textType):
		return None

	s = ''.join(c for c in s if unicodedata.category(c) in validCategories)
	if convertToBytes:
		s = s.encode('utf-8')
	return s
def hey(talk_str):
    """Return the reply to *talk_str*.

    Shouting (``str.isupper``) is answered first, then input without any
    letter or number, then questions (last character '?'); everything else
    gets the default reply.
    """
    if talk_str.isupper():
        # all letters uppercase
        return u'Woah, chill out!'

    has_letter_or_number = any(
        category(char)[0] in ('L', 'N') for char in talk_str)
    if not has_letter_or_number:
        # no letters and no numbers
        return u'Fine. Be that way!'

    if talk_str[-1] == '?':
        return u'Sure.'

    return u'Whatever.'
Esempio n. 24
def strip_accents(s):
    """
    normalize given string

    The string is decomposed (NFD) and all combining marks (category
    'Mn') are removed. Byte strings are first decoded as UTF-8, with
    undecodable bytes replaced.
    """
    if isinstance(s, bytes):
        s = s.decode("utf8", "replace")

    return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
 def hey(self, message):
     """Return the reply to *message*.

     The stripped message is printed, then classified: empty input,
     shouting (it has letters and all of them are uppercase), question
     (ends with '?'), or anything else.
     """
     message = message.strip()
     if isinstance(message, bytes):
         # byte strings are handled as ASCII text
         message = message.decode('ascii')
     print(message)
     if message == '':
         return 'Fine. Be that way!'
     letter_categories = [ud.category(x) for x in message if ud.category(x)[0] == "L"]
     if letter_categories and all(cat == "Lu" for cat in letter_categories):
         return 'Woah, chill out!'
     elif message[-1] == '?':
         return 'Sure.'
     else:
         return 'Whatever.'
	def 數字調英文中央加分字符號(self, 語句):
		# Walks the characters of the sentence (語句) while remembering the
		# previous character (舊字), and detects every position where a
		# character whose Unicode category is in 統一碼數字類 is directly
		# followed by one whose category is in 統一碼羅馬字類.
		# NOTE(review): the statements that fill 新語句 (the body of the
		# ``if`` and the per-character append) are not visible in this copy
		# of the source; presumably a separator is inserted at the detected
		# position — confirm against the complete file.
		新語句 = []
		# '0' marks "no previous character yet".
		舊字 = '0'
		for 字 in 語句:
			if 舊字 != '0' and \
				unicodedata.category(舊字) in 統一碼數字類 and \
				unicodedata.category(字) in 統一碼羅馬字類:
			舊字 = 字
		# The collected pieces are joined and passed through 除掉重覆的空白.
		return self.除掉重覆的空白(''.join(新語句))
Esempio n. 27
def subsplit(pos,len,unich,nextch):
   """Return a truthy/falsy flag for the character *unich* found at
   index *pos* of a token of length *len*.

   The result is False for letters other than category 'Lo' and for
   characters of ``_apo_set`` / ``_hyp_set`` that are punctuation in an
   interior position. Punctuation at the first or last position, or at
   the second-to-last position when *nextch* is punctuation too, gives
   True, as does every other character.

   :param pos: index of *unich* inside the token.
   :param len: length of the token (the name shadows the builtin; it is
       kept because it is part of the call interface).
   :param unich: the character to classify.
   :param nextch: the following character, or a falsy value if none.
   """
   cat = unicodedata.category(unich)
   if cat[0]=='L' and cat[1] != 'o':
     return False
   if cat[0]=='P':
     at_edge = pos == 0 or pos >= len-1
     before_final_punct = nextch and pos == len-2 and unicodedata.category(nextch)[0]=='P'
     if at_edge or before_final_punct:
       return True
     if unich in _apo_set or unich in _hyp_set:
       return False
     return True
   return True
Esempio n. 28
def charname(s, verbose=False):
    """Print one line per character of *s*: its index, the character, its
    Unicode name and its general category.

    :param s: the string to describe; anything that is not a ``str`` only
        produces an error message.
    :param verbose: when true, print the long name and description of the
        category taken from ``general_category_values`` instead of the
        two-letter category code.
    """
    if not isinstance(s, str):
        print('Error: argument must be a str.')
        return
    for i, c in enumerate(s):
        # NOTE(review): the right-hand side of this assignment is missing
        # in the source; the Unicode character name is the presumed intent.
        # A default is supplied because unnamed characters raise ValueError.
        name = unicodedata.name(c, '<unnamed>')
        category = unicodedata.category(c)
        if verbose:
            info = general_category_values[category]
            print('%d "%s" %s (%s: %s)' % (i, c, name, info['Long'], info['Description']))
        else:
            print('%d "%s" %s (%s)' % (i, c, name, category))
def main():
    # Reads lines of the form "<uid> <text>" from ``fileinput`` and writes
    # "<uid> <converted text>" to stdout, with the digit tokens of the text
    # rewritten by ``digit_to_chinese`` / ``digit_to_single_chinese``.
    # NOTE(review): several statements (``try:``, the bodies of the
    # ``except`` and of the first ``if`` in the token loop, part of the
    # "年" rule) are not visible in this copy of the source, so it cannot
    # be told from here which rules are mutually exclusive; confirm against
    # the complete file.
    for line in fileinput.input():
            # First field is the utterance id, the rest is the text.
            uid, text = line.strip().split(" ", 1)
        except ValueError:
        # Split the text with RE_NUM and drop empty pieces.
        tokens = [token.strip() for token in RE_NUM.split(text) if len(token.strip()) > 0]
        ntoken = len(tokens)

        for i in range(ntoken):
            # Tokens that do not start with a decimal digit ('Nd').
            if tokens[i] and unicodedata.category(tokens[i][0]) != "Nd":

            # Length rule
            # NOTE(review): ``tokens[i][0] == 0`` compares a character with
            # the integer 0 and is never true for strings; "0" may have
            # been intended.
            if tokens[i][0] == 0 or len(tokens[i]) > len(UNITS):
                tokens[i] = digit_to_single_chinese(tokens[i])

            # Pre-fix rules
            if (i-1) >= 0:
                if RE_LAST_END_SINGLE.match(tokens[i-1]):
                    tokens[i] = digit_to_single_chinese(tokens[i])
                if tokens[i-1].endswith("第") or tokens[i-1].endswith("比"):
                    tokens[i] = digit_to_chinese(tokens[i], liang=False)
                if len(tokens[i-1]) > 1 and tokens[i-1][1] in "零一二三四五六七八九":
                    tokens[i] = digit_to_single_chinese(tokens[i])
            # Post-fix rules
            if (i+1) < ntoken:
                if RE_NEXT_START_SINGLE.match(tokens[i+1]):
                    tokens[i] = digit_to_single_chinese(tokens[i])
                # Number followed by "年" (year)
                if tokens[i+1].startswith("年"):
                    if len(tokens[i]) > 4:
                        tokens[i] = "%s %s" % ( \
                                digit_to_chinese(tokens[i][:-4], liang=False),
                    elif len(tokens[i]) > 2:
                        tokens[i] = digit_to_single_chinese(tokens[i])
                if tokens[i+1].startswith("比"):
                    tokens[i] = digit_to_chinese(tokens[i], liang=False)
                # Next token starts with a digit as well
                if unicodedata.category(tokens[i+1][0]) == "Nd":
                    tokens[i] = digit_to_single_chinese(tokens[i])

            # General fall-back rule
            tokens[i] = digit_to_chinese(tokens[i])
        # Join the tokens, collapse whitespace runs and emit the line.
        sys.stdout.write("%s %s\n" % (uid, RE_SPACES.sub(" ", "".join(tokens).strip())))
Esempio n. 30
def get_all_punctuation_map() -> Dict[int, type(None)]:
    """Creates a dictionary containing all unicode punctuation and symbols.

    The result can be passed to ``str.translate`` to delete those
    characters.

    :return: The dictionary, with the ord() of each char mapped to None.
    """
    # ``+ 1`` so that the last code point is examined as well.
    punctuation_map = dict.fromkeys(
        i for i in range(sys.maxunicode + 1)
        if unicodedata.category(chr(i)).startswith(('P', 'S')))

    return punctuation_map
Esempio n. 31
import re
import sys
import unicodedata

# Character constructor for text strings: ``unichr`` on Python 2,
# ``chr`` on Python 3.
try:
    _char_from_code = unichr
except NameError:
    _char_from_code = chr

# Every character whose Unicode general category starts with "P"
# (punctuation), concatenated in code point order.
punctuation = "".join(
    c for c in map(_char_from_code, range(sys.maxunicode))
    if unicodedata.category(c).startswith("P")
)

class Matcher:
    """Callable that splits a text into the tuple of substrings matched
    by a regular expression (by default runs of word characters)."""

    def __init__(self, word_re=r"\w+"):
        # Raw string: "\w" is not a valid escape in a normal string literal.
        self.word_re = word_re

    def __getstate__(self):
        """Return the pattern; it is the whole pickled state."""
        return self.word_re

    def __setstate__(self, word_re):
        """Restore the pattern from the pickled state."""
        self.word_re = word_re

    def __call__(self, text):
        """Return all non-empty matches of the pattern in *text* as a tuple."""
        tokens = re.findall(self.word_re, text)
        # Remove empty tokens
        tokens = [token for token in tokens if token]
        return tuple(tokens)
def eliminar_acentos(frase):
    """Remove accents (combining marks) from *frase* while keeping the
    lowercase letter 'ñ' intact.

    The text is split at every 'ñ' and each piece is stripped on its own,
    so no placeholder character is needed and a literal '#' in the input
    is left as it is.

    :param frase: text to clean.
    :return: the text without accents.
    """
    def _sin_marcas(texto):
        return ''.join(x for x in unicodedata.normalize('NFD', texto)
                       if unicodedata.category(x) != 'Mn')

    return 'ñ'.join(_sin_marcas(parte) for parte in frase.split('ñ'))
Esempio n. 33
def remove_diacritics(word: str):
    """Return *word* in NFKD form with all combining marks (category
    'Mn') removed."""
    decomposed = unicodedata.normalize('NFKD', word)
    kept = [char for char in decomposed
            if unicodedata.category(char) != 'Mn']
    return ''.join(kept)
Esempio n. 34
def strip_unicode(string):
    """Apply every replacement of ``STRIP_UNICODE`` to *string*, then
    decompose it (NFD) and drop all combining marks (category 'Mn')."""
    for old, new in STRIP_UNICODE.items():
        string = string.replace(old, new)

    decomposed = unicodedata.normalize('NFD', string)
    kept = [char for char in decomposed
            if unicodedata.category(char) != 'Mn']
    return ''.join(kept)
def removeDiacritics(string):
    """Return a code for *string*: accents removed, lowercased, and with
    spaces, apostrophes and hyphens deleted."""
    decomposed = unicodedata.normalize('NFD', string)
    code = ''.join(
        char for char in decomposed if unicodedata.category(char) != 'Mn'
    ).lower()
    for unwanted in (' ', '\'', '-'):
        code = code.replace(unwanted, '')
    return code
Esempio n. 36
 def property_chars(self, prefix):
     """Return a string of all characters below ``sys.maxunicode`` whose
     Unicode general category starts with *prefix*, in code point order."""
     matching = []
     for code in range(sys.maxunicode):
         char = six.unichr(code)
         if unicodedata.category(char).startswith(prefix):
             matching.append(char)
     return "".join(matching)
Esempio n. 37
class Factory:

    random_letters = map(random.choice,
                         repeat(string.ascii_letters + string.digits))

    random_letters_with_spaces = map(
        random.choice, repeat(string.ascii_letters + string.digits + " "))

    # See django.contrib.auth.forms.UserCreationForm.username.
    random_letters_for_usernames = map(random.choice,
                                       repeat(string.ascii_letters + ".@+-"))

    random_http_responses = map(random.choice,

    random_octet = partial(random.randint, 0, 255)

    random_octets = iter(random_octet, None)

    random_unicode_codepoint = partial(random.randint, 0, 0x10FFFF)

    random_unicode_codepoints = iter(random_unicode_codepoint, None)

    random_unicode_characters = (
        char for char in map(chr, random_unicode_codepoints)
        if unicodedata.category(char)[0] in "LMNPS")

    random_unicode_non_ascii_characters = (
        char for char in random_unicode_characters if ord(char) >= 128)

    random_unicode_characters_with_spaces = (
        char for char in map(chr, random_unicode_codepoints)
        if unicodedata.category(char)[0] in "LMNPSZ")

    random_unicode_non_ascii_characters_with_spaces = (
        char for char in random_unicode_characters_with_spaces
        if char == " " or ord(char) >= 128)

    def make_string(self, size=10, spaces=False, prefix=""):
        """Return a `str` filled with random ASCII letters or digits."""
        source = (self.random_letters_with_spaces
                  if spaces else self.random_letters)
        return prefix + "".join(islice(source, size))

    def make_unicode_string(self, size=10, spaces=False, prefix=""):
        """Return a `str` filled with random Unicode characters."""
        source = (self.random_unicode_characters_with_spaces
                  if spaces else self.random_unicode_characters)
        return prefix + "".join(islice(source, size))

    def make_unicode_non_ascii_string(self, size=10, spaces=False, prefix=""):
        """Return a `str` filled with random non-ASCII Unicode characters."""
        source = (self.random_unicode_non_ascii_characters_with_spaces
                  if spaces else self.random_unicode_non_ascii_characters)
        return prefix + "".join(islice(source, size))

    def make_bytes(self, size=10):
        """Return a `bytes` filled with random data."""
        return os.urandom(size)

    def make_username(self, size=10):
        """Create an arbitrary user name (but not the actual user)."""
        return "".join(islice(self.random_letters_for_usernames, size))

    def make_email_address(self, login_size=10):
        """Generate an arbitrary email address."""
        return "*****@*****.**" % self.make_string(size=login_size)

    def make_status_code(self):
        """Return an arbitrary HTTP status code."""
        return next(self.random_http_responses)

    exception_type_names = ("TestException#%d" % i for i in count(1))

    def make_exception_type(self, bases=(Exception, ), **namespace):
        return type(next(self.exception_type_names), bases, namespace)

    def make_exception(self, message=None, bases=(Exception, ), **namespace):
        exc_type = self.make_exception_type(bases, **namespace)
        return exc_type() if message is None else exc_type(message)

    def make_absolute_path(self,
        return path_seperator + path_seperator.join(
            for _ in range(directories))

    def pick_bool(self):
        """Return an arbitrary Boolean value (`True` or `False`)."""
        return random.choice((True, False))

    def pick_enum(self, enum, *, but_not=EMPTY_SET):
        """Pick a random item from an enumeration class.

        :param enum: An enumeration class such as `NODE_STATUS`. Can also be
            an `enum.Enum` subclass.
        :return: The value of one of its items.
        :param but_not: A list of choices' IDs to exclude.
        :type but_not: Sequence.
        if issubclass(enum, Enum):
            return random.choice(
                [value for value in enum if value not in but_not])
            return random.choice([
                value for key, value in vars(enum).items()
                if not key.startswith("_") and value not in but_not

    def pick_port(self, port_min=1024, port_max=65535):
        assert port_min >= 0 and port_max <= 65535
        return random.randint(port_min, port_max)

    def pick_choice(self, choices, but_not=None):
        """Pick a random item from `choices`.

        :param choices: A sequence of choices in Django form choices format:
                ('choice_id_1', "Choice name 1"),
                ('choice_id_2', "Choice name 2"),
        :param but_not: A list of choices' IDs to exclude.
        :type but_not: Sequence.
        :return: The "id" portion of a random choice out of `choices`.
        if but_not is None:
            but_not = ()
        return random.choice(
            [choice for choice in choices if choice[0] not in but_not])[0]

    def make_vlan_tag(self, allow_none=False, *, but_not=EMPTY_SET):
        """Create a random VLAN tag.

        :param allow_none: Whether `None` ("no VLAN") can be allowed as an
            outcome.  If `True`, `None` will be included in the possible
            results with a deliberately over-represented probability, in order
            to help trip up bugs that might only show up once in about 4094
            calls otherwise.
        :param but_not: A set of tags that should not be returned.  Any zero
            or `None` entries will be ignored.
        if allow_none and self.pick_bool():
            return None
            for _ in range(100):
                vlan_tag = random.randint(1, 0xFFE)
                if vlan_tag not in but_not:
                    return vlan_tag
            raise TooManyRandomRetries("Could not find an available VLAN tag.")

    def ip_to_url_format(self, ip):
        # We return either '[ip:v6:address]' or 'a.b.c.d' depending on the
        # family of the IP Address.
        ip_addr = IPAddress(ip)
        if ip_addr.version == 6:
            return "[%s]" % str(ip_addr)
            return "%s" % str(ip_addr)

    def make_ipv4_address(self):
        octets = list(islice(self.random_octets, 4))
        if octets[0] == 0:
            octets[0] = 1
        return "%d.%d.%d.%d" % tuple(octets)

    def make_ipv6_address(self):
        # We return from the fc00::/7 space because that's a private
        # space and shouldn't cause problems of addressing the outside
        # world.
        network = IPNetwork("fc00::/7")
        # We can't use random.choice() because there are too many
        # elements in network.
        random_address_index = random.randint(0, network.size - 1)
        return str(IPAddress(network[random_address_index]))

    def make_ip_address(self, ipv6=None):
        """Create a random ip address.

        :param ipv6: True for ipv6, False for ipv4, None for random.

        :return: an IP Address
        :rtype: string
        """
        if ipv6 is None:
            ipv6 = random.randint(0, 1)
        # intentionally allowing all "true" values, including "1".
        if ipv6:
            return self.make_ipv6_address()
        # This return used to be stranded after the IPv6 return inside the
        # same branch, so an IPv4 request silently produced None.
        return self.make_ipv4_address()

    def make_UUID(self):
        """Return a freshly generated version-1 UUID as a string."""
        identifier = uuid1()
        return str(identifier)

    def make_UUID_with_timestamp(self, timestamp, clock_seq=None, node=None):
        """Return a version-1 UUID string encoding the given timestamp.

        :param timestamp: Seconds since the Unix epoch.
        :param clock_seq: Optional 14-bit clock sequence; random if omitted.
        :param node: Optional 48-bit node value; if omitted, a random value
            with bit 0x010000000000 set is used.
        :return: The UUID in its canonical string form.
        """
        if node is None:
            node = random.getrandbits(48) | 0x010000000000
        if clock_seq is None:
            clock_seq = random.getrandbits(14)
        # Convert seconds to 100ns intervals, then add the fixed offset the
        # original code applies before splitting into UUID time fields.
        timestamp = int(timestamp * 1e9 / 100) + 0x01B21DD213814000
        time_low = timestamp & 0xFFFFFFFF
        time_mid = (timestamp >> 32) & 0xFFFF
        time_hi_version = (timestamp >> 48) & 0x0FFF
        clock_seq_low = clock_seq & 0xFF
        clock_seq_hi_variant = (clock_seq >> 8) & 0x3F
        # Field order is dictated by uuid.UUID(fields=...).  The tuple body
        # was missing; every member is computed above.
        fields = (
            time_low,
            time_mid,
            time_hi_version,
            clock_seq_hi_variant,
            clock_seq_low,
            node,
        )
        return str(UUID(fields=fields, version=1))

    def _make_random_network(
        self,
        slash=None,
        but_not=EMPTY_SET,
        disjoint_from=None,
        random_address_factory=None,
    ):
        """Generate a random IP network.

        NOTE(review): the parameter list and the docstring terminator were
        missing from this block.  The signature is reconstructed from the
        parameters documented here and the defaults the body handles;
        confirm names, order and the `but_not` default against callers.

        :param slash: Netmask or bit width of the network, e.g. 24 or
            '255.255.255.0' for what used to be known as a class-C network.
        :param but_not: Optional iterable of `IPNetwork` objects whose values
            should not be returned.  Use this when you need a different network
            from any returned previously.  The new network may overlap any of
            these, but it won't be identical.
        :param disjoint_from: Optional iterable of `IPNetwork` objects whose
            IP ranges the new network must not overlap.
        :param random_address_factory: A callable that returns a random IP
            address. If not provided, will default to
            `self.make_ipv4_address`.
        :return: A network spanning at least 8 IP addresses (at most 29 bits).
        :rtype: :class:`IPNetwork`
        :raises TooManyRandomRetries: If no acceptable network turned up in
            100 attempts.
        """
        but_not = frozenset(but_not)
        if disjoint_from is None:
            disjoint_from = []
        if slash is None:
            slash = random.randint(16, 29)
        if random_address_factory is None:
            random_address_factory = self.make_ipv4_address
        # Look randomly for a network that matches our criteria.
        for _ in range(100):
            network = IPNetwork(
                "%s/%s" % (random_address_factory(), slash)).cidr
            forbidden = network in but_not
            clashes = network_clashes(network, disjoint_from)
            if not forbidden and not clashes:
                return network
        raise TooManyRandomRetries("Could not find available network")

    def make_ipv4_network(self,
                          slash=None,
                          but_not=EMPTY_SET,
                          disjoint_from=None):
        """Generate a random IPv4 network.

        NOTE(review): the tail of the signature, the docstring terminator and
        the arguments of the delegated call were missing.  They are
        reconstructed from the parameters documented here and the parameter
        names `_make_random_network` uses in its body; confirm against
        callers.

        :param slash: Netmask or bit width of the network, e.g. 24 or
            '255.255.255.0' for what used to be known as a class-C network.
        :param but_not: Optional iterable of `IPNetwork` objects whose values
            should not be returned.  Use this when you need a different network
            from any returned previously.  The new network may overlap any of
            these, but it won't be identical.
        :param disjoint_from: Optional iterable of `IPNetwork` objects whose
            IP ranges the new network must not overlap.
        :return: A network spanning at least 16 IP addresses (at most 28 bits).
        :rtype: :class:`IPNetwork`
        """
        if slash is None:
            slash = random.randint(16, 28)
        return self._make_random_network(
            slash=slash,
            but_not=but_not,
            disjoint_from=disjoint_from,
            random_address_factory=self.make_ipv4_address,
        )

    def make_ipv6_network(self,
                          slash=None,
                          but_not=EMPTY_SET,
                          disjoint_from=None):
        """Generate a random IPv6 network.

        NOTE(review): the tail of the signature, the docstring terminator and
        the arguments of the delegated call were missing.  They are
        reconstructed from the parameters documented here and the parameter
        names `_make_random_network` uses in its body; confirm against
        callers.

        :param slash: Netmask or bit width of the network. If not
            specified, will default to a bit width of between 112 (65536
            addresses) and 125 (8 addresses);
        :param but_not: Optional iterable of `IPNetwork` objects whose values
            should not be returned.  Use this when you need a different network
            from any returned previously.  The new network may overlap any of
            these, but it won't be identical.
        :param disjoint_from: Optional iterable of `IPNetwork` objects whose
            IP ranges the new network must not overlap.
        :return: A network spanning at least 8 IP addresses.
        :rtype: :class:`IPNetwork`
        """
        if slash is None:
            slash = random.randint(112, 125)
        return self._make_random_network(
            slash=slash,
            but_not=but_not,
            disjoint_from=disjoint_from,
            random_address_factory=self.make_ipv6_address,
        )

    def make_ip4_or_6_network(self, version=None, host_bits=None):
        """Generate a random IPv4 or IPv6 network.

        :param version: 4 for IPv4; any other non-None value selects IPv6.
            When None, one of the two is chosen at random.
        :param host_bits: Optional number of host bits; the prefix length
            passed on is 32 (IPv4) or 128 (IPv6) minus this value.
        :return: Whatever `make_ipv4_network` / `make_ipv6_network` returns.
        """
        slash = None
        if version is None:
            version = random.choice([4, 6])
        if version == 4:
            if host_bits is not None:
                slash = 32 - host_bits
            return self.make_ipv4_network(slash=slash)
        # The IPv6 branch used to sit after the IPv4 return inside the same
        # `if`, making it unreachable; IPv6 requests returned None.
        if host_bits is not None:
            slash = 128 - host_bits
        return self.make_ipv6_network(slash=slash)

    def pick_ip_in_dynamic_range(self, ngi, *, but_not=EMPTY_SET):
        """Pick a random IP address from the dynamic range of `ngi`.

        :param ngi: Object exposing `get_dynamic_ip_range()`, whose result
            has integer `first` and `last` attributes.
        :param but_not: Addresses that must not be returned; `None` entries
            are ignored.
        :return: The chosen address as a string.
        :raises TooManyRandomRetries: If 100 random draws were all excluded.
        """
        # Fetch the range once instead of once per bound.
        dynamic_range = ngi.get_dynamic_ip_range()
        first, last = dynamic_range.first, dynamic_range.last
        but_not = {IPAddress(but) for but in but_not if but is not None}
        for _ in range(100):
            address = IPAddress(random.randint(first, last))
            if address not in but_not:
                return str(address)
        # The message used to say "static range", copied from the sibling
        # method, which misreported where the search had failed.
        raise TooManyRandomRetries(
            "Could not find available IP in dynamic range")

    def pick_ip_in_static_range(self, ngi, *, but_not=EMPTY_SET):
        """Pick a random IP address from the static range of `ngi`.

        `None` entries in `but_not` are ignored.  Returns the address as a
        string, or raises `TooManyRandomRetries` after 100 excluded draws.
        """
        low = ngi.get_static_ip_range().first
        high = ngi.get_static_ip_range().last
        excluded = set()
        for entry in but_not:
            if entry is not None:
                excluded.add(IPAddress(entry))
        attempts_left = 100
        while attempts_left > 0:
            candidate = IPAddress(random.randint(low, high))
            if candidate not in excluded:
                return str(candidate)
            attempts_left -= 1
        raise TooManyRandomRetries(
            "Could not find available IP in static range")

    def pick_ip_in_network(self, network, *, but_not=EMPTY_SET):
        """Pick a random IP address from `network`.

        :param network: Network object exposing `version`, `prefixlen`,
            `first`, `last` and `size`.
        :param but_not: Addresses that must not be returned.  `None` entries
            and addresses outside `network` are ignored.
        :return: The chosen address as a string.
        :raises ValueError: If the number of excluded addresses equals the
            number of selectable addresses.
        :raises TooManyRandomRetries: If 100 random draws were all excluded.
        """
        # The element expression and closing brace of this comprehension were
        # missing.  Elements must be IPAddress objects, because candidates
        # are compared against this set as IPAddress below.
        but_not = {
            IPAddress(but)
            for but in but_not
            if but is not None and IPAddress(but) in network
        }
        # Unless the prefix length is very small, make sure we don't select
        # a normally-unusable IP address.
        if network.version == 6 and network.prefixlen < 127:
            # Don't pick the all-zeroes address, since it has special meaning
            # in IPv6 as the subnet-router anycast address. IPv6 does not have
            # a broadcast address, though.
            first, last = network.first + 1, network.last
            network_size = network.size - 1
        elif network.prefixlen < 31:
            # Don't pick broadcast or network addresses.
            first, last = network.first + 1, network.last - 1
            network_size = network.size - 2
        else:
            # Tiny networks: every address is usable.  Without this `else:`
            # the assignments below were a syntax error.
            first, last = network.first, network.last
            network_size = network.size
        if len(but_not) == network_size:
            raise ValueError(
                "No IP addresses available in network: %s (but_not=%r)" %
                (network, but_not))
        for _ in range(100):
            address = IPAddress(random.randint(first, last))
            if address not in but_not:
                return str(address)
        raise TooManyRandomRetries(
            "Could not find available IP in network: %s (but_not=%r)" %
            (network, but_not))

    def make_ip_range(self, network):
        """Return a pair of IP addresses from the given network.

        NOTE(review): the inner expression of the `tuple(...)` call was
        missing.  It is reconstructed as a sorted pair of `IPAddress` objects
        drawn with `self.pick_ip_in_network`, which matches the documented
        return type and the surviving `for _ in range(2)))` fragment; confirm
        against the upstream source.

        :param network: Return IP addresses within this network.
        :return: A pair of `IPAddress`, the first strictly lower than the
            second.
        :raises TooManyRandomRetries: If 100 attempts never produced two
            distinct addresses.
        """
        for _ in range(100):
            ip_range = tuple(
                sorted(
                    IPAddress(self.pick_ip_in_network(network))
                    for _ in range(2)))
            if ip_range[0] < ip_range[1]:
                return ip_range
        raise TooManyRandomRetries(
            "Could not find available IP range in network: %s" % network)

    def make_ipv4_range(self, network=None):
        """Return a pair of IPv4 addresses.

        :param network: Return IP addresses within this network.  When
            omitted, a network from `make_ipv4_network()` is used.
        :return: A pair of `IPAddress`, as produced by `make_ip_range`.
        """
        if network is None:
            network = self.make_ipv4_network()
        return self.make_ip_range(network=network)

    def make_ipv6_range(self, network=None):
        """Return a pair of IPv6 addresses.

        :param network: Return IP addresses within this network.  When
            omitted, a network from `make_ipv6_network()` is used.
        :return: A pair of `IPAddress`, as produced by `make_ip_range`.
        """
        if network is None:
            network = self.make_ipv6_network()
        return self.make_ip_range(network=network)

    def make_mac_address(self, delimiter=":"):
        """Return a MAC address built from six values of `self.random_octets`.

        Each octet is rendered as two lowercase hex digits; `delimiter` (a
        string) is placed between them.
        """
        assert isinstance(delimiter, str)
        six_octets = islice(self.random_octets, 6)
        hex_pairs = ("{:02x}".format(octet) for octet in six_octets)
        return delimiter.join(hex_pairs)

    def make_random_leases(self, num_leases=1):
        """Create a dict of arbitrary ip-to-mac address mappings."""
        # Deliberately a loop on the dict's size rather than a comprehension:
        # colliding random IPs overwrite an entry, and the loop keeps going
        # until enough distinct IPs exist.
        mapping = {}
        while True:
            if len(mapping) >= num_leases:
                return mapping
            # The MAC is generated before the IP, as in a subscript
            # assignment where the right-hand side is evaluated first.
            mac = self.make_mac_address()
            ip = self.make_ipv4_address()
            mapping[ip] = mac

    def make_date(self, year=2017):
        """Return a random naive local `datetime` within the given year.

        :param year: Calendar year to draw from; defaults to 2017.
        :return: A `datetime.datetime` on a whole second, at or after
            January 1st of `year` and before January 1st of the next year.
        """
        start = time.mktime(datetime.datetime(year, 1, 1).timetuple())
        end = time.mktime(datetime.datetime(year + 1, 1, 1).timetuple())
        # time.mktime() returns floats, and random.randrange() rejects
        # non-integer arguments on current Python versions, so convert first.
        stamp = random.randrange(int(start), int(end))
        return datetime.datetime.fromtimestamp(stamp)

    def make_timedelta(self):
        """Return a random non-negative `timedelta` of up to about 3 years.

        Days, seconds and microseconds are drawn independently: 0..1095
        days, 0..86399 seconds and 0..999999 microseconds.
        """
        # The closing parenthesis of this call was missing.
        return datetime.timedelta(
            days=random.randint(0, 3 * 365),
            seconds=random.randint(0, 24 * 60 * 60 - 1),
            microseconds=random.randint(0, 999999),
        )

    def make_file(self, location, name=None, contents=None):
        """Create a file, and write data to it.

        Prefer the eponymous convenience wrapper in
        :class:`maastesting.testcase.MAASTestCase`.  It creates a temporary
        directory and arranges for its eventual cleanup.

        :param location: Directory.  Use a temporary directory for this, and
            make sure it gets cleaned up after the test!
        :param name: Optional name for the file.  If none is given, one will
            be made up.
        :param contents: Optional contents for the file. If omitted, some
            arbitrary ASCII text will be written. If Unicode content is
            provided, it will be encoded with UTF-8.
        :type contents: bytes, or str (which is encoded as UTF-8).
        :return: Path to the file.
        """
        if name is None:
            name = self.make_string()
        if contents is None:
            contents = self.make_string().encode("ascii")
        if isinstance(contents, str):
            contents = contents.encode("utf-8")
        path = os.path.join(location, name)
        with open(path, "wb") as f:
            # The body of this `with` was missing, so nothing was written.
            f.write(contents)
        return path

    def make_name(self, prefix=None, sep="-", size=6):
        """Generate a random name.

        :param prefix: Optional prefix.  Pass one to help make test failures
            and tracebacks easier to read!  If you don't, you might as well
            use `make_string`.
        :param sep: Separator that will go between the prefix and the random
            portion of the name.  Defaults to a dash.
        :param size: Length of the random portion of the name.  Don't get
            hung up on this; you may need more if uniqueness is really
            important or less if it doesn't but legibility does, but
            generally, use the default.
        :return: A randomized unicode string.
        """
        random_part = self.make_string(size=size)
        if prefix is None:
            return random_part
        # This branch used to be stranded after the return above (its
        # `else:` was missing), so prefixed names were never produced.
        return prefix + sep + random_part

    def make_hostname(self, prefix="host", *args, **kwargs):
        """Generate a random hostname.

        The returned hostname is lowercase because python's urlparse
        implicitly lowercases the hostnames.

        :param prefix: Prefix handed to `make_name`; defaults to "host".
        :param args: Further positional arguments for `make_name`.
        :param kwargs: Further keyword arguments for `make_name`.
        :return: The lowercased result of `make_name`.
        """
        # Pass `prefix` positionally.  Combining `prefix=prefix` with `*args`
        # bound the first extra positional argument to `prefix` as well and
        # raised "got multiple values for argument 'prefix'".
        return self.make_name(prefix, *args, **kwargs).lower()

    # Only schemes that permit parameters in the URL are eligible: without
    # that, the params are not parsed back out of the path and a URL with
    # params cannot be round-tripped.  The empty scheme is excluded.
    _make_parsed_url_schemes = tuple(
        filter(lambda scheme: scheme != "", urllib.parse.uses_params))

    # NOTE(review): this block is truncated.  The parameter list is missing
    # (the body reads scheme, netloc, path, port, params, query and fragment;
    # their order and defaults cannot be determined from here), the docstring
    # is never terminated, and the final ParseResult(...) call is cut off
    # after `query,`.  Restore from the upstream source before relying on it.
    def make_parsed_url(
        """Generate a random parsed URL object.

        Contains randomly generated values for all parts of a URL: scheme,
        location, path, parameters, query, and fragment. However, each part
        can be overridden individually.

        If port=None or port=True, make_port() will be used to select a random
        port, while port=False will create a netloc for the URL that does not
        specify a port. To specify a port in netloc, port parameter
        must be False.

        :return: Instance of :py:class:`urlparse.ParseResult`.
        if port is not False and netloc is not None and netloc.count(":") == 1:
            raise AssertionError(
                "A port number has been requested, however the given netloc "
                "spec %r already contains a port number." % (netloc, ))
        if scheme is None:
            # Select a scheme that allows parameters; see above.
            scheme = random.choice(self._make_parsed_url_schemes)
        if port is None or port is True:
            # NOTE(review): the docstring mentions make_port(), but the
            # method actually called here is pick_port().
            port = self.pick_port()
        if netloc is None:
            # NOTE(review): the format string is empty, so applying `%` to a
            # str argument raises TypeError ("not all arguments converted").
            # The original template appears to have been lost - confirm.
            netloc = "" % self.make_name("netloc").lower()
            # bool is excluded explicitly because it is a subclass of int.
            if isinstance(port, int) and not isinstance(port, bool):
                netloc += ":%d" % port
        if path is None:
            # A leading forward-slash will be added in geturl() if we
            # don't, so ensure it's here now so tests can compare URLs
            # without worrying about it.
            path = self.make_name("/path")
            # Same here with the forward-slash prefix.
            if not path.startswith("/"):
                path = "/" + path
        if params is None:
            params = self.make_name("params")
        if query is None:
            query = self.make_name("query")
        if fragment is None:
            fragment = self.make_name("fragment")
        return urllib.parse.ParseResult(scheme, netloc, path, params, query,

    # NOTE(review): this block is truncated.  The parameter list is missing,
    # the docstring is never terminated, and the call to make_parsed_url()
    # is cut off after `query,`.  The docstring promises a string, so some
    # conversion of the parsed URL presumably followed - confirm upstream.
    def make_url(
        """Generate a random URL.

        Contains randomly generated values for all parts of a URL: scheme,
        location, path, parameters, query, and fragment. However, each part
        can be overridden individually.

        :return: string
        return self.make_parsed_url(scheme, netloc, path, params, query,

    # NOTE(review): truncated - the arguments of the make_parsed_url() call
    # and its closing parenthesis are missing, so which URL parts are blanked
    # out cannot be determined from here.
    def make_simple_http_url(self, netloc=None, path=None, port=None):
        """Create an arbitrary HTTP URL with only a location and path."""
        return self.make_parsed_url(

    def make_names(self, *prefixes):
        """Generate random names.

        Yields a name for each prefix specified.

        :param prefixes: Zero or more prefixes. See `make_name`.
        :return: A generator; names are created lazily, in the order the
            prefixes were given.
        """
        for prefix in prefixes:
            yield self.make_name(prefix)

    def make_tarball(self, location, contents):
        """Create a tarball containing the given files.

        :param location: Path to a directory where the tarball can be stored.
        :param contents: A dict mapping file names to file contents.  Where
            the value is `None`, the file will contain arbitrary data.
        :return: Path to a gzip-compressed tarball.
        :raises subprocess.CalledProcessError: If `tar` exits non-zero.
        """
        tarball = os.path.join(location, "%s.tar.gz" % self.make_name())
        with TempDirectory() as working_dir:
            source = working_dir.path
            for name, content in contents.items():
                self.make_file(source, name, content)

            # Archive the staging directory's contents ("."), not the
            # directory itself, so entries are relative.
            subprocess.check_call(["tar", "-C", source, "-czf", tarball, "."])

        return tarball

    # NOTE(review): truncated - the body of `if content_type is not None:`
    # is missing (presumably it recorded the content type on `headers`), and
    # the addinfourl(...) call is cut off after its first argument.
    def make_response(self, status_code, content, content_type=None):
        """Return a similar response to that which `urllib` returns."""
        headers = http.client.HTTPMessage()
        if content_type is not None:
        return urllib.request.addinfourl(fp=io.BytesIO(content),

    def make_streams(self, stdin=None, stdout=None, stderr=None):
        """Make a fake return value for a SSHClient.exec_command.

        Returns the `(stdin, stdout, stderr)` triple unchanged, except that a
        missing stdout is replaced by a fresh `mock.Mock()`.
        """
        # stdout can't be None (the original comment says something is
        # called on it), so substitute a mock when none was supplied.
        effective_stdout = mock.Mock() if stdout is None else stdout
        return (stdin, effective_stdout, stderr)

    # NOTE(review): truncated - the remaining arguments of the
    # CalledProcessError(...) call and its closing parenthesis are missing.
    def make_CalledProcessError(self):
        """Make a fake :py:class:`subprocess.CalledProcessError`."""
        return subprocess.CalledProcessError(
            # Non-zero exit status between 1 and 10 inclusive.
            returncode=random.randint(1, 10),

    # NOTE(review): this block is truncated.  The rest of the signature is
    # missing (the body reads `generic_only` and `can_be_release_or_version`,
    # whose defaults are not visible), the `try:` that pairs with
    # `except AttributeError:` is missing, and the list literals assigned to
    # `supported_releases`, `strings` and `strings +=` are never closed.
    def make_kernel_string(self,
        ubuntu = UbuntuDistroInfo()
        # Only select from MAAS supported releases so we don't have to deal
        # with versions name overlap(e.g Warty and Wily).
            ubuntu_rows = ubuntu._rows
        except AttributeError:
            # Fallback for objects without `_rows`: build dict rows from the
            # `_releases` attribute instead.
            ubuntu_rows = [row.__dict__ for row in ubuntu._releases]
        supported_releases = [
            release for release in ubuntu_rows
            # Keep releases whose major version number is 12 or later.
            if int(release["version"].split(".")[0]) >= 12
        release = random.choice(supported_releases)
        # Remove 'LTS' from version if it exists
        version_str = release["version"].split(" ")[0]
        strings = [
            # First letter of the series name.
            "hwe-%s" % release["series"][0],
            "hwe-%s" % version_str,
            "hwe-%s-edge" % version_str,
        if not generic_only:
            strings += [
                "hwe-%s-lowlatency" % version_str,
                "hwe-%s-lowlatency-edge" % version_str,
        if can_be_release_or_version:
            strings += [release["series"], version_str]
        return random.choice(strings)

    # NOTE(review): this block is truncated.  The `self` parameter is missing
    # from the signature although the body calls `self.make_bytes`, and most
    # of the byte literals that belong under the field comments in `packet`
    # are gone (several comments below have no literal after them).  The
    # packet built from what is visible is therefore not the intended layout.
    def make_dhcp_packet(
        transaction_id: bytes = None,
        truncated: bool = False,
        truncated_option_value: bool = False,
        bad_cookie: bool = False,
        truncated_option_length: bool = False,
        include_server_identifier: bool = False,
        server_ip: str = "",
        include_end_option: bool = True,
    ) -> bytes:
        """Returns a [possibly invalid] DHCP packet."""
        if transaction_id is None:
            transaction_id = self.make_bytes(size=4)
        options = b""
        if include_server_identifier:
            # 0x36 == 54 (Server Identifier option)
            # NOTE(review): the default server_ip is an empty string, which
            # presumably cannot be parsed by IPAddress - the original default
            # looks lost; confirm.
            ip_bytes = int(IPAddress(server_ip).value).to_bytes(4, "big")
            options += b"\x36\x04" + ip_bytes
        if truncated_option_value:
            # Option 0x36 declares 4 bytes of data but only 2 follow.
            options += b"\x36\x04\x7f\x01"
            include_end_option = False
        if truncated_option_length:
            # Option code with no length byte at all.
            options += b"\x36"
            include_end_option = False
        # Currently, we only validate the transaction ID, and the fact that
        # the reply packet has a "Server Identifier" option. This might be
        # considered a bug, but in practice it works out.
        packet = (
            # Message type: 0x02 (BOOTP operation: reply).
            # Hardware type: Ethernet
            # Hardware address length: 6
            # Hops: 0
            b"\x00" +
            # Transaction ID
            transaction_id +
            # Seconds
            # Flags
            # Client IP address:
            # Your (client) IP address:
            # Next server IP address:
            # Relay agent IP address:
            b"\x00\x00\x00\x00" +
            # Client hardware address
            # Hardware address padding
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" +
            # Server host name
            (b"\x00" * 67) +
            # Boot filename
            (b"\x00" * 125) +
            # Cookie
            (b"\x63\x82\x53\x63" if not bad_cookie else b"xxxx") +
            # "DHCP Offer" option
            b"\x35\x01\x02" + options +
            # End options.
            (b"\xff" if include_end_option else b""))
        if truncated:
            # Keep only the first 200 bytes of the packet.
            packet = packet[:200]
        return packet
def unicode_to_ascii(s):
    """Return `s` NFD-decomposed with all 'Mn' (nonspacing mark) characters removed.

    Despite the name, characters outside ASCII that are not nonspacing marks
    are kept as they are.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [c for c in decomposed if unicodedata.category(c) != 'Mn']
    return ''.join(kept)
Esempio n. 39
    # NOTE(review): this snippet is truncated.  The two lines below are the
    # tail of a comprehension whose opening is missing; it selects code
    # points whose category starts with "M" (presumably building MARK_SET).
    # UNICODE_NSM is not defined in the visible text.
    for c in range(sys.maxunicode + 1)
    if unicodedata.category(chr(c))[0] == "M"
print("len(UNICODE_NSM) = {}".format(len(UNICODE_NSM)))
print("len(MARK_SET) = {}".format(len(MARK_SET)))

filepath = "UnicodeData.txt"
with open(filepath) as f:
    # NOTE(review): the right-hand side is missing (presumably a read of f).
    text =

# Only the first 10000 characters are used.
text = text[:10000]

# NOTE(review): main() is cut off - the `functions` list is never closed,
# nothing after it is visible, and loop_count is not defined in view.
def main():
    ground_truth = loop_count(text)
    functions = [  # (loop_count, 'loop_count'),
        # (generator_count, 'generator_count'),
Esempio n. 40
def category_count(text):
    """Count the characters of `text` whose category is not 'Mn'."""
    total = 0
    for char in text:
        if unicodedata.category(char) != "Mn":
            total += 1
    return total
Esempio n. 41
import cloudscraper
from bs4 import BeautifulSoup
from bs4.element import Comment, Tag
from requests import Response, Session

from ..assets.user_agents import user_agents
from ..utils.ssl_no_verify import no_ssl_verification
from .exeptions import LNException

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

LINE_SEP = '<br>'

# Code points (as ints) whose category is 'Cf' or 'Cc'.
INVISIBLE_CHARS = [
    codepoint
    for codepoint in range(sys.maxunicode)
    if unicodedata.category(chr(codepoint)) in ('Cf', 'Cc')
]
# One-shot iterator over the ranges 0x00-0x1f and 0x7f-0x9f followed by
# INVISIBLE_CHARS; it is consumed when the mapping below is built.
NONPRINTABLE = itertools.chain(range(0x00, 0x20), range(0x7f, 0xa0), INVISIBLE_CHARS)
# Every such code point mapped to None.
NONPRINTABLE_MAPPING = dict.fromkeys(NONPRINTABLE)

# Semaphores keyed by string; filled in by get_domain_semaphore().
REQUEST_SEMAPHORES: Dict[str, Semaphore] = dict()

# NOTE(review): truncated - the body of the `if` is missing.  Presumably it
# created and stored a Semaphore for `host`; the intended limit is not
# visible here.  As shown the function does not parse.
def get_domain_semaphore(url):
    # Fall back to the raw url when no hostname can be parsed out of it.
    host = urlparse(url).hostname or url
    if host not in REQUEST_SEMAPHORES:
    return REQUEST_SEMAPHORES[host]

class Crawler(ABC):
    """Abstract base class serving as the blueprint for new crawlers."""
Esempio n. 42
 def remove_accents(self, data):
     """Return only the letters of `data` after NFKD normalization.

     Every character whose category does not start with 'L' is dropped,
     which removes accents but also digits, spaces and punctuation.
     """
     decomposed = unicodedata.normalize('NFKD', data)
     letters = [x for x in decomposed
                if unicodedata.category(x).startswith('L')]
     return ''.join(letters)
Esempio n. 43
def strip_accents(s):
    """Return `s` NFD-decomposed, minus every 'Mn' (nonspacing mark) character."""
    pieces = []
    for c in unicodedata.normalize('NFD', s):
        if unicodedata.category(c) == 'Mn':
            continue
        pieces.append(c)
    return ''.join(pieces)
Esempio n. 44
# NOTE(review): this function appears truncated.  Nothing visible ever adds
# to `s`, so as written it always returns "emptystring", and the two lines
# after that return are unreachable.  Presumably an append of the token and
# an `else:` were lost - confirm against the original.
def normalize_text(text,nlp):
    s = []
    for tok in nlp.tokenizer(text.lower()):
        if not tok.is_stop:
            # Only alphabetic tokens longer than one character.
            if tok.is_alpha and not (tok.is_digit or len(tok.text) == 1):
                if not tok.is_ascii:
                    # Drop 'Mn' (nonspacing mark) characters after NFD
                    # decomposition; note `tok` becomes a plain str here.
                    tok = ''.join(c for c in unicodedata.normalize('NFD', tok.text.lower()) if unicodedata.category(c) != 'Mn')
    if not s:
        return "emptystring"
        s = ' '.join(s)
        return s
Esempio n. 45
File: Progetto: kowh-ai/ckan
    def remove_control_characters(s):
        """Return `s` without characters whose category starts with "C".

        Any falsy input (empty string, None) yields the empty string.
        """
        if s:
            kept = [ch for ch in s
                    if not unicodedata.category(ch).startswith("C")]
            return "".join(kept)
        return ""
import unicodedata
import sys

# NOTE(review): this script fragment is incomplete.  `s` is never defined in
# the visible text, and the `digitmap` literal is never closed.
# Translation table mapping every combining code point to None.
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode)
                         if unicodedata.combining(chr(c)))
# unicodedata.combining(chr(c))

b = unicodedata.normalize('NFD', s)


# Maps all unicode decimal digit characters to their equivalent in ASCII
digitmap = {
    c: ord('0') + unicodedata.digit(chr(c))
    # NOTE(review): unicodedata.category() returns 'Nd' for decimal digits;
    # the comparison with 'ND' never matches, so this map would be empty.
    for c in range(sys.maxunicode) if unicodedata.category(chr(c)) == 'ND'

# Arabic digits
x = '\u0661\u0662\u0663'


b = unicodedata.normalize('NFD', s)
# NOTE(review): the result of this expression is discarded.
b.encode('ascii', 'ignore').decode('ascii')

# coding: utf-8
from __future__ import print_function
from __future__ import unicode_literals

from unidecode import unidecode
import unicodedata

# Every character obtained by decoding a single byte (0-255) as ISO-8859-2.
hun_characters = {bytearray([i]).decode('iso-8859-2') for i in range(256)}
# The subset of those characters in a punctuation category ('P*').  The
# element expression and closing brace were missing; elements must be the
# characters themselves, because the tables below call ord() on each.
hun_punctuation = {
    c
    for c in hun_characters
    if unicodedata.category(c).startswith('P')
}
# str.translate() tables: one deletes punctuation, the other replaces each
# punctuation character with a space.
translate_remove_hun_punctuation = {ord(c): None for c in hun_punctuation}
translate_space_for_hun_punctuation = {ord(c): ' ' for c in hun_punctuation}

def lower(words):
    """Return a tuple holding the lowercased form of every item in `words`."""
    lowered = [w.lower() for w in words]
    return tuple(lowered)

def remove_accents(words):
    """Return a tuple with `unidecode` applied to every item in `words`.

    Each result is converted to the type of the '' literal (the module's
    text type).
    """
    text_type = ''.__class__
    converted = [text_type(unidecode(w)) for w in words]
    return tuple(converted)

def lower_without_accents(words):
    """Strip accents from every word, then lowercase; returns a tuple."""
    unaccented = remove_accents(words)
    return lower(unaccented)

def remove_punctuations(words):