def test_find_something_rare():
    """Space separators from U+3000 upwards can be found, and nothing else can."""
    def is_space_separator(char):
        return unicodedata.category(char) == 'Zs'

    def is_not_space_separator(char):
        return unicodedata.category(char) != 'Zs'

    # 12288 == 0x3000, so only the rare, high space separators remain
    rare_spaces = characters(whitelist_categories=['Zs'], min_codepoint=12288)

    find(rare_spaces, is_space_separator)

    with pytest.raises(NoSuchExample):
        find(rare_spaces, is_not_space_separator)
Example #2
0
File: strtok.py Project: fnl/libfnl
def TokenOffsets(string: str):
    """
    Generate every offset in the *string* at which the Unicode category
    changes, starting with 0 and ending with ``len(string)``.

    Nothing is generated for ``None`` or for an empty string.

    Capitalized words special case: a single upper case letter ('Lu') that
    is followed by lower case letters ('Ll') stays in the same token as those
    letters, unless the upper case letter is itself preceded by another
    upper case letter.
    """
    if string is None or len(string) == 0:
        return

    yield 0
    previous = category(string[0])

    for offset, char in enumerate(string[1:], 1):
        present = category(char)

        if previous != present:
            # A lone capital followed by lower case is "joined" to it.
            starts_capitalized_word = (
                previous == 'Lu' and
                present == 'Ll' and
                (offset == 1 or category(string[offset - 2]) != 'Lu')
            )
            if not starts_capitalized_word:
                yield offset

        previous = present

    yield len(string)
Example #3
0
def crear_nombre_usuario(nombre, apellidos):
    """Build a username that no existing ``Gauser`` is using yet.

    Both arguments have their combining accents removed, are lower-cased and
    are split on whitespace.  The base of the username is the initial of every
    given name, followed by the whole first surname and the initial of each
    further surname (or ``'sin'`` when there is no surname at all).  A counter
    starting at 1 is appended and incremented until the name is free.

    :param nombre: given name(s) of the person
    :param apellidos: surname(s) of the person; may be empty
    :return: the first ``<base><n>`` for which no ``Gauser`` exists
    """
    def _palabras(texto):
        # NFD splits accented letters into base letter + combining mark
        # (category 'Mn'); dropping the marks removes the accents.
        sin_tildes = ''.join(
            c for c in unicodedata.normalize('NFD', smart_unicode(texto))
            if unicodedata.category(c) != 'Mn')
        return sin_tildes.lower().split()

    nombre = _palabras(nombre)
    apellidos = _palabras(apellidos)

    iniciales_nombre = ''.join(parte[0] for parte in nombre)
    if apellidos:
        # Whole first surname plus the initial of any other surname.
        iniciales_apellidos = apellidos[0] + ''.join(
            apellido[0] for apellido in apellidos[1:])
    else:
        # Users without surnames still need a non-empty base.
        iniciales_apellidos = 'sin'
    usuario = iniciales_nombre + iniciales_apellidos

    # Ask the database whether the name exists instead of treating *any*
    # exception from ``get`` (including database errors or multiple rows) as
    # "the username is free".
    n = 1
    while Gauser.objects.filter(username=usuario + str(n)).exists():
        n += 1
    return usuario + str(n)
def test_exclude_characters_of_specific_groups():
    """Blacklisting 'Lu' and 'Nd' leaves other characters but never those two."""
    def category_of(char):
        return unicodedata.category(char)

    st = characters(blacklist_categories=('Lu', 'Nd'))

    # Characters outside each excluded category must still be reachable.
    find(st, lambda char: category_of(char) != 'Lu')
    find(st, lambda char: category_of(char) != 'Nd')

    assert_no_examples(st, lambda char: category_of(char) in ('Lu', 'Nd'))
Example #5
0
File: obj.py Project: mmgen/mmgen
	def __new__(cls,s,on_fail='die',msg=None):
		"""
		Validate s and return it as a new instance of cls.

		s is returned unchanged when it is already exactly of type cls.
		Otherwise it is stripped, decoded from UTF-8 when it is not a str,
		rejected if it contains control or combining characters, and checked
		against the class's width/length limits and its allowed/forbidden
		character lists.

		Any exception raised during validation is handed to cls.init_fail()
		together with s, and that call's result is returned instead.

		on_fail is passed to cls.arg_chk(); msg is not used in this method body.
		"""
		# Already the right type: nothing to validate
		if type(s) == cls: return s
		cls.arg_chk(on_fail)
		# Sanity-check the class configuration: both attributes must be lists
		# of single-character strings
		for k in cls.forbidden,cls.allowed:
			assert type(k) == list
			for ch in k: assert type(ch) == str and len(ch) == 1
		try:
			s = s.strip()
			if type(s) != str:
				s = s.decode('utf8')
			for ch in s:
				# Allow:    (L)etter,(N)umber,(P)unctuation,(S)ymbol,(Z)space
				# Disallow: (C)ontrol,(M)combining
				# Combining characters create width formatting issues, so disallow them for now
				if unicodedata.category(ch)[0] in 'CM':
					t = { 'C':'control', 'M':'combining' }[unicodedata.category(ch)[0]]
					raise ValueError('{}: {} characters not allowed'.format(ascii(ch),t))
			me = str.__new__(cls,s)
			if cls.max_screen_width:
				# Fullwidth ('F') and wide ('W') East Asian characters are counted twice
				me.screen_width = len(s) + len([1 for ch in s if unicodedata.east_asian_width(ch) in ('F','W')])
				assert me.screen_width <= cls.max_screen_width,(
					'too wide (>{} screen width)'.format(cls.max_screen_width))
			else:
				assert len(s) <= cls.max_len, 'too long (>{} symbols)'.format(cls.max_len)
			assert len(s) >= cls.min_len, 'too short (<{} symbols)'.format(cls.min_len)
			# An empty cls.allowed / cls.forbidden list disables the respective check
			assert not cls.allowed or set(list(s)).issubset(set(cls.allowed)),\
				'contains non-allowed symbols: {}'.format(' '.join(set(list(s)) - set(cls.allowed)))
			assert not cls.forbidden or not any(ch in s for ch in cls.forbidden),\
				"contains one of these forbidden symbols: '{}'".format("', '".join(cls.forbidden))
			return me
		except Exception as e:
			# Validation failures (including the asserts above) are reported via init_fail
			return cls.init_fail(e,s)
def tokens(source):
    """Split *source* into newline, whitespace, word and symbol tokens.

    A character found in ``NEWLINE_CHARS`` becomes a ``NewlineToken`` of its
    own.  Otherwise a run of control/separator characters (categories C*, Z*)
    becomes a ``WhitespaceToken``, a run of letters/numbers (L*, N*) becomes a
    ``WordToken``, and a run of one repeated other character becomes a
    ``SymbolToken``.
    """
    def scan(start, same_run):
        # Index just past the run that begins at *start*.
        end = start + 1
        while end < len(source) and same_run(source[end]):
            end += 1
        return end

    def is_blank(char):
        return category(char)[0] in "CZ"

    def is_wordy(char):
        return category(char)[0] in "LN"

    pos = 0
    while pos < len(source):
        first = source[pos]
        group = category(first)[0]
        if first in NEWLINE_CHARS:
            yield NewlineToken(first)
            pos += 1
            continue
        if group in "CZ":
            make, same_run = WhitespaceToken, is_blank
        elif group in "LN":
            make, same_run = WordToken, is_wordy
        else:
            # Symbols only group with identical neighbours.
            make, same_run = SymbolToken, (lambda char: char == first)
        stop = scan(pos, same_run)
        yield make(source[pos:stop])
        pos = stop
def get_match_list(data, match_list, order_list=None, only_ascii=False, ignorecase=False):
    """
    Look for matches in a text, given a dictionary of "ID" / "list of search strings":
     { "ID1" : ["String 1", "String 2", "String 3"],
       "ID2" : ["String 4", "String 5", "String 6"]
     }

     The dictionary must not contain the same search string under several IDs.

     Search strings are tried from longest to shortest.  Once a string matches,
     it is removed from the text before the following strings are tried, so
     that two categories are not detected when one string is part of another:
     for example, with "Idioma Español" and "Español", the text
     "Pablo sabe hablar el Idioma Español" matches "Idioma Español" but not
     "Español", because the longer match takes priority.

     data and the search strings are UTF-8 encoded byte strings.
     If order_list is given, every ID of match_list must appear in it.
     If ignorecase is true, text and search strings are upper-cased first.
     If only_ascii is true, combining accents are stripped from both first.

     Returns a class object (created with type()) named "Mtch_list" with two
     attributes: ``key`` is the ID of the last match recorded (None without a
     match); ``index`` is that ID's position in order_list, len(order_list)
     when nothing matched, or None when no order_list was given.

     Raises Exception for an ID missing from order_list or a duplicated
     search string.
    """
    import unicodedata

    def _strip_accents(text):
        # NFD separates base letters from combining marks ('Mn'), which are dropped.
        return ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

    def _result(key, index):
        # Callers receive a class object exposing ``key`` and ``index``.
        return type("Mtch_list", (), {"key": key, "index": index})

    matches = []

    # Convert the text to unicode
    data = unicode(data, "utf8")

    # Invert the dictionary to {"String 1": "ID1", "String 2": "ID1", "String 4": "ID2"}, with unicode keys
    match_dict = dict()
    for key in match_list:
        if order_list and key not in order_list:
            raise Exception("key '%s' not in order_list" % key)
        for value in match_list[key]:
            # Compare the decoded form: the stored keys are unicode, so checking the
            # raw byte string could miss duplicates containing non-ASCII characters.
            decoded = unicode(value, "utf8")
            if decoded in match_dict:
                raise Exception("Duplicate word in list: '%s'" % value)
            match_dict[decoded] = key

    # If ignorecase is True, upper-case everything
    if ignorecase:
        data = data.upper()
        match_dict = dict((key.upper(), match_dict[key]) for key in match_dict)

    # If only_ascii is True, strip all accents (including the tilde of Ñ)
    if only_ascii:
        data = _strip_accents(data)
        match_dict = dict((_strip_accents(key), match_dict[key]) for key in match_dict)

    # Try the search strings from longest to shortest.
    for match in sorted(match_dict, key=len, reverse=True):
        s = data
        # Text already claimed by an earlier (longer) match no longer counts.
        for a in matches:
            s = s.replace(a, "")
        if match in s:
            matches.append(match)

    if matches:
        key = match_dict[matches[-1]]
        return _result(key, order_list.index(key) if order_list else None)
    return _result(None, len(order_list) if order_list else None)
Example #8
0
	def ranking(self):
		"""
		For each result, removes stopwords, ranks the word, augments the query
		and returns True if successful else False
		"""
		print "Indexing results ...."

		for i in range(len(self.results)):
			result = self.results[i]
			title = result[0]
			summary = result[1]

			# Remove punctuation and create lists of words
			titleWords = "".join(c for c in title if not unicodedata.category(c).startswith('P')).split()
			summaryWords = "".join(c for c in summary if not unicodedata.category(c).startswith('P')).split()

			for tw in titleWords:
				if tw.lower() in self.stopWords:
					continue
				if self.user_feedback[i] == 'y':
					self.applyRanking(i, tw, True, True)
				else:
					self.applyRanking(i, tw, True, False)

			for sw in summaryWords:
				if sw.lower() in self.stopWords:
					continue
				if self.user_feedback[i] == 'y':
					self.applyRanking(i, sw, False, True)
				else:
					self.applyRanking(i, sw, False, False)

		print "Indexing results ...."

		return self.augmentQuery()
Example #9
0
def normalize_roman(string, additional=None):
    """Removes diacritics from the string and converts to lowercase.

        >>> normalize_roman(u'Eèé')
        u'eee'

    Characters of category 'Lo' (letters without case) are kept untouched.

    If *additional* (a mapping of single characters to replacement strings)
    is given, its keys are replaced by their values and characters equal to
    one of its values are passed through unchanged; every other character is
    normalized as described above.
    """
    if additional:
        # Built from the mapping itself and its values so that it works on
        # Python 3 as well, where dict views cannot be concatenated with ``+``.
        protected = set(additional) | set(additional.values())

        def convert(c):
            if c in additional:
                return additional[c]
            if c in protected:
                return c
            return normalize_roman(c)

        return ''.join(convert(c) for c in string)

    chars = []
    for c in string:
        if unicodedata.category(c) == 'Lo':
            chars.append(c)
        else:
            # NFD splits off the combining marks ('Mn'), which are dropped.
            decomposed = unicodedata.normalize('NFD', c)
            chars.extend(x for x in decomposed if unicodedata.category(x) != 'Mn')
    return ''.join(chars).lower()
Example #10
0
 def characters(self, content):
     """
     Pass a chunk of character data to self._writeHtml, depending on state.

     Inside a title, content is written as-is once the header has been
     processed, unless titles are ignored.  Outside a title, content is only
     written when the header has been processed, text is not ignored and the
     chunk is not whitespace-only; a default title and a generated paragraph
     are started first when needed.  Directly after a definition term
     (self._endDfn) a dash separator is inserted or normalized before the
     text, and the flag is cleared.
     """
     text = content.strip()
     
     if self._inTitle:
         if self._headerProcessed:
             if not self._ignoreTitle:
                 self._writeHtml(content)
                 
     else :
         if self._headerProcessed:           
             if not self._ignoreText:
                 # whitespace-only chunks are dropped
                 if len(text) > 0:
                     if not self._glossTitleWritten and not self._inTitle:
                         self._writeDefaultTitle()                                        
                     # text outside of every paragraph-like container gets a generated paragraph
                     if not self._inParagraph and not self._inGeneratedPara and not self._inArticle and not self._lineGroupPara and not self._inTable:
                         self._startGeneratedPara()
                     if self._endDfn:
                         if self._keywordTag == 'dfn':
                             # 'Pd' is the Unicode category of dash punctuation
                             if unicodedata.category(content[0]) == 'Pd':
                                 # content already starts with a dash: only add a space before it
                                 self._writeHtml(' ')
                             elif content[0] == ' ':
                                 # leading space: add an em dash unless a dash follows the whitespace
                                 if unicodedata.category(text[0]) != 'Pd':
                                     self._writeHtml(u' \u2014')
                             else:
                                 self._writeHtml(u' \u2014 ')
                             self._writeHtml(content)
                         else:                                   # 'h4' for fb2
                             # drop a leading dash, then write the trimmed text
                             if unicodedata.category(text[0]) == 'Pd':
                                 text = text[1:]
                             self._writeHtml(text.strip())
                         self._endDfn = False
                     else:
                         self._writeHtml(content)
Example #11
0
  def splitText(text):
    """ Cut text into consecutive segments of at most MAX_SEGMENT_SIZE characters.

    Each cut is made after the last character of the current window that
    satisfies the first applicable criterion below, or after the window's
    last character when none applies.
    """
    # where to cut, from most to least preferred
    split_criteria = (
      # punctuation
      # https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
      lambda x: unicodedata.category(x) in ("Ps", "Pe", "Pi", "Pf", "Po"),
      # whitespace
      lambda x: unicodedata.category(x).startswith("Z"),
      # anything not a letter or number
      lambda x: not (unicodedata.category(x)[0] in ("L", "N")),
    )

    pieces = []
    rest = __class__.cleanSpaces(text)

    while len(rest) > __class__.MAX_SEGMENT_SIZE:
      window = rest[:__class__.MAX_SEGMENT_SIZE]

      cut = None
      for criterion in split_criteria:
        cut = __class__.findLastCharIndexMatching(window, criterion)
        if cut is not None:
          break
      if cut is None:
        # no suitable character at all: cut at the last char
        cut = __class__.MAX_SEGMENT_SIZE - 1

      pieces.append(window[:cut + 1].rstrip())
      rest = rest[cut + 1:].lstrip(string.whitespace + string.punctuation)

    if rest:
      pieces.append(rest)

    return pieces
Example #12
0
# Translation table ``ord(ampersand variant) -> "&"``; filled on first use so
# the scan over all of Unicode happens once instead of on every call.
_AMPERSAND_TABLE = {}


def consolidate_ampers(text: str) -> str:
    """Converts all ampersands in a text to a single one (&).

    An ampersand variant is any punctuation or symbol character whose Unicode
    name contains the word "ampersand" next to a space (for example
    FULLWIDTH AMPERSAND or TURNED AMPERSAND).

    :param text: A string which should have ampersands converted.
    :return: The text string after all ampersands have been replaced.
    """

    chosen_amper_value = "\u0026"

    if not _AMPERSAND_TABLE:
        name_pattern = re.compile(r" ampersand|ampersand ", re.IGNORECASE)
        for codepoint in range(sys.maxunicode + 1):
            char = chr(codepoint)
            # The default "" keeps unnamed characters from raising ValueError.
            if (unicodedata.category(char)[0] in "PS"
                    and char != chosen_amper_value
                    and name_pattern.search(unicodedata.name(char, ""))):
                _AMPERSAND_TABLE[codepoint] = chosen_amper_value

    # Change all ampersands to one type of ampersand
    return text.translate(_AMPERSAND_TABLE)
def test_characters_of_specific_groups():
    """Whitelisting "Lu" and "Nd" produces both categories and nothing else."""
    def in_category(name):
        return lambda char: unicodedata.category(char) == name

    st = characters(whitelist_categories=("Lu", "Nd"))

    find_any(st, in_category("Lu"))
    find_any(st, in_category("Nd"))

    assert_no_examples(st, lambda char: unicodedata.category(char) not in ("Lu", "Nd"))
def test_exclude_characters_of_specific_groups():
    """Blacklisting "Lu" and "Nd" leaves other characters but never those two."""
    def not_in_category(name):
        return lambda char: unicodedata.category(char) != name

    st = characters(blacklist_categories=("Lu", "Nd"))

    find_any(st, not_in_category("Lu"))
    find_any(st, not_in_category("Nd"))

    assert_no_examples(st, lambda char: unicodedata.category(char) in ("Lu", "Nd"))
Example #15
0
 def parse(cls, string):
     """
     Split a version-like string into groups and build a cls from them.

     "-" starts a new group and "." starts a new field within the current
     group; a change of the major Unicode category between two neighbouring
     characters also starts a new group.  An empty field that is directly
     followed by another separator becomes "0".  Numeric fields are converted
     to int and trailing empty/zero fields are dropped from each group (the
     first field is always kept).  Each group is passed to cls as a tuple.
     """
     from unicodedata import category
     groups = []
     previous = None
     for char in string:
         if previous is None:
             # the very first character always opens the first group
             groups.append([char])
         elif char == "." or char == "-":
             if previous in ".-":
                 groups[-1][-1] += "0"
             if char == ".":
                 groups[-1].append("")
             else:
                 groups.append([""])
         elif previous in ".-" or category(char)[0] == category(previous)[0]:
             groups[-1][-1] += char
         else:
             groups.append([char])
         previous = char
     for group in groups:
         for index, field in enumerate(group):
             try:
                 group[index] = int(field)
             except (ValueError, TypeError):
                 pass
         while len(group) > 1 and not group[-1]:
             del group[-1]
     return cls(*[tuple(group) for group in groups])
 def is_yelling(stuff):
     """
     :return boolean True if stuff contains at least one letter and all of
         its letters are uppercased
     """
     try:
         text = unicode(stuff)  # Python 2
     except NameError:
         text = str(stuff)  # Python 3: str is already Unicode
     # Categories starting with 'L' are letters; 'Lu' is the uppercase one.
     # A list (not filter()) is used so the emptiness check also works on
     # Python 3, where filter() returns an iterator.
     letters = [c for c in text if unicodedata.category(c).startswith('L')]
     if not letters:
         return False
     return all(unicodedata.category(c) == 'Lu' for c in letters)
def test_characters_of_specific_groups():
    """Whitelisting 'Lu' and 'Nd' produces both categories and nothing else."""
    wanted = ('Lu', 'Nd')

    def is_unwanted(char):
        return unicodedata.category(char) not in ('Lu', 'Nd')

    st = characters(whitelist_categories=wanted)

    find(st, lambda char: unicodedata.category(char) == 'Lu')
    find(st, lambda char: unicodedata.category(char) == 'Nd')

    assert_no_examples(st, is_unwanted)
def test_exclude_characters_of_specific_groups():
    """Blacklisting 'Lu' and 'Nd' leaves other characters but never those two."""
    def is_excluded(char):
        return unicodedata.category(char) in ('Lu', 'Nd')

    st = characters(blacklist_categories=('Lu', 'Nd'))

    find(st, lambda char: unicodedata.category(char) != 'Lu')
    find(st, lambda char: unicodedata.category(char) != 'Nd')

    # Searching for an excluded character has to come up empty.
    with pytest.raises(NoSuchExample):
        find(st, is_excluded)
def test_characters_of_specific_groups():
    """Whitelisting 'Lu' and 'Nd' produces both categories and nothing else."""
    def is_outside_whitelist(char):
        return unicodedata.category(char) not in ('Lu', 'Nd')

    st = characters(whitelist_categories=('Lu', 'Nd'))

    find(st, lambda char: unicodedata.category(char) == 'Lu')
    find(st, lambda char: unicodedata.category(char) == 'Nd')

    # Searching for anything else has to come up empty.
    with pytest.raises(NoSuchExample):
        find(st, is_outside_whitelist)
def test_characters_of_specific_groups():
    """Whitelisting "Lu" and "Nd" produces both categories and nothing else."""
    def has_category(name):
        return lambda char: unicodedata.category(char) == name

    st = characters(whitelist_categories=("Lu", "Nd"))

    find(st, has_category("Lu"))
    find(st, has_category("Nd"))

    # Searching for anything else has to come up empty.
    with pytest.raises(NoSuchExample):
        find(st, lambda char: unicodedata.category(char) not in ("Lu", "Nd"))
Example #21
0
    def combine_modifiers(self, string):
        """
        Given a string that is space-delimited on Unicode grapheme clusters,
        group Unicode modifier letters with their preceding base characters,
        deal with tie bars, etc.

        Modifiers, stress marks and tie bars that have no neighbouring segment
        to attach to are kept as segments of their own.

        Parameters
        ----------
        string : str
            A Unicode string tokenized into grapheme clusters to be tokenized into simple IPA.

        Returns
        -------
        str
            The regrouped segments, joined by single spaces.

        """
        stress_marks = (712, 716)  # U+02C8 and U+02CC
        tie_bars = (865, 860)  # U+0361 and U+035C

        result = []
        graphemes = string.split()
        temp = ""
        count = len(graphemes)
        # Walk backwards so that modifiers are collected before their base.
        for grapheme in reversed(graphemes):
            count -= 1
            single = len(grapheme) == 1
            if single and unicodedata.category(grapheme) == "Lm" and ord(grapheme) not in stress_marks:
                temp = grapheme + temp
                # hack for the cases where a space modifier is the first character in the string
                if count == 0:
                    if result:
                        result[-1] = temp + result[-1]
                    else:
                        # the string consists of modifier letters only
                        result.append(temp)
                continue
            # catch and repair stress marks
            if single and ord(grapheme) in stress_marks:
                if result:
                    result[-1] = grapheme + result[-1]
                else:
                    # stress mark at the very end: nothing follows it
                    result.append(grapheme)
                temp = ""
                continue

            # combine contour tone marks (non-accents)
            if single and unicodedata.category(grapheme) == "Sk":
                if not result:
                    result.append(grapheme)
                    temp = ""
                    continue
                if unicodedata.category(result[-1][0]) == "Sk":
                    result[-1] = grapheme + result[-1]
                    temp = ""
                    continue

            result.append(grapheme + temp)
            temp = ""

        # last check for tie bars
        segments = result[::-1]
        i = 0
        r = []
        while i < len(segments):
            # a tie bar joins its segment with the following one, if there is one
            if ord(segments[i][-1]) in tie_bars and i + 1 < len(segments):
                r.append(segments[i] + segments[i + 1])
                i += 2
            else:
                r.append(segments[i])
                i += 1
        return " ".join(r)
Example #22
0
def filterCharacters(s):
	"""
	Remove non-printable characters from an object

	Dict values and list/tuple items are filtered recursively (dict keys are
	left untouched). Byte strings are decoded as UTF-8, filtered and encoded
	again.

	@type  s: dict|list|tuple|bytes|string
	@param s: Object to remove non-printable characters from

	@rtype:	 dict|list|tuple|bytes|string
	@return: An object that corresponds with the original object, nonprintable characters removed.
	         None for any other kind of object.
	"""

	printable = ('Lu', 'Ll', 'Lt', 'LC', 'Lm', 'Lo', 'L', 'Mn', 'Mc', 'Me', 'M', 'Nd', 'Nl', 'No', 'N', 'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'P', 'Sm', 'Sc', 'Sk', 'So', 'S', 'Zs', 'Zl', 'Zp', 'Z')

	# Containers: filter every member
	if isinstance(s, dict):
		return dict((key, filterCharacters(value)) for key, value in s.items())
	if isinstance(s, list):
		return [filterCharacters(member) for member in s]
	if isinstance(s, tuple):
		return tuple(filterCharacters(member) for member in s)

	# The byte and text string types are named differently on Python 2 and 3
	if (3, 0) <= sys.version_info:
		byteType, textType = bytes, str
	else:
		byteType, textType = str, unicode

	wasBytes = isinstance(s, byteType)
	if wasBytes:
		s = s.decode('utf-8')
	if not isinstance(s, textType):
		return None

	cleaned = ''.join(c for c in s if unicodedata.category(c) in printable)
	if wasBytes:
		cleaned = cleaned.encode('utf-8')
	return cleaned
def hey(talk_str):
    """Answer a remark: shouting, silence, a question or anything else."""
    # all letters uppercase
    if talk_str.isupper():
        return u'Woah, chill out!'

    # no letters and no numbers
    says_something = any(category(c)[0] in ('L', 'N') for c in talk_str)
    if not says_something:
        return u'Fine. Be that way!'

    # a question that is not shouted
    if talk_str[-1] == '?':
        return u'Sure.'

    return u'Whatever.'
Example #24
0
def strip_accents(s):
    """
    Return *s* as a Unicode string with its combining accents removed.

    Anything that is not already a Unicode string is decoded as UTF-8 first,
    with undecodable bytes replaced.
    """
    # ``type(u"")`` is ``unicode`` on Python 2 and ``str`` on Python 3, which
    # keeps the function usable on both.
    if not isinstance(s, type(u"")):
        s = s.decode("utf8", "replace")
    # NFD splits accented letters into base + combining mark ('Mn').
    return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
Example #25
0
 def hey(self, message):
     message = unicode(message.strip())
     print message
     if message == '':
         return 'Fine. Be that way!'
     elif all([ud.category(x) == "Lu" for x in message if ud.category(x)[0] == "L"]) and any([ud.category(x)[0] == "L" for x in message]):
         return 'Woah, chill out!'
     elif message[-1] == '?':
         return 'Sure.'
     else:
         return 'Whatever.'
	def 數字調英文中央加分字符號(self, 語句):
		# Insert 分字符號 between a character whose category is in 統一碼數字類
		# and a directly following character whose category is in 統一碼羅馬字類,
		# then hand the result to 除掉重覆的空白.
		pieces = []
		# '0' marks "no previous character"; a real '0' is treated the same way
		previous = '0'
		for current in 語句:
			needs_separator = (
				previous != '0'
				and unicodedata.category(previous) in 統一碼數字類
				and unicodedata.category(current) in 統一碼羅馬字類
			)
			if needs_separator:
				pieces.append(分字符號)
			pieces.append(current)
			previous = current
		return self.除掉重覆的空白(''.join(pieces))
Example #27
0
def subsplit(pos,len,unich,nextch):
   """Tell whether the character unich at index pos of a len-long token is a split point.

   Letters other than category 'Lo' never are.  Punctuation always is at the
   first or last position, or when it sits directly before a final
   punctuation character nextch; elsewhere it is unless it belongs to
   _apo_set or _hyp_set.  Every other character is.
   """
   kind = unicodedata.category(unich)
   major = kind[0]
   if major == 'L' and kind[1] != 'o':
      return False
   if major != 'P':
      return True
   if pos == 0 or pos >= len-1 or nextch and pos == len-2 and unicodedata.category(nextch)[0]=='P':
      return True
   return not (unich in _apo_set or unich in _hyp_set)
Example #28
0
def charname(s, verbose=False):
    """Print index, character, Unicode name and general category for each character of s.

    With verbose set, the category's 'Long' name and 'Description' from
    general_category_values are printed instead of the two-letter code.
    A message is printed first when s is not a str.
    """
    if type(s) != str:
        print('Error: argument must be a str.')
    for index, char in enumerate(s):
        char_name = unicodedata.name(char)
        code = unicodedata.category(char)
        if not verbose:
            print('%d "%s" %s (%s)' % (index, char, char_name, code))
            continue
        details = general_category_values[code]
        print('%d "%s" %s (%s: %s)' % (index, char, char_name, details['Long'], details['Description']))
def main():
    """Rewrite the numbers in "<uid> <text>" lines as Chinese and print the lines.

    Lines are read through ``fileinput.input()``; lines without text after the
    uid are skipped.  The text is split with ``RE_NUM`` and every token that
    starts with a decimal digit is replaced according to the first rule that
    applies (length, preceding token, following token, general fall-back).
    The result is written to stdout as ``<uid> <text>`` with runs matched by
    ``RE_SPACES`` collapsed to one space.
    """
    for line in fileinput.input():
        try:
            uid, text = line.strip().split(" ", 1)
        except ValueError:
            # nothing but a uid on this line
            continue
        tokens = [token.strip() for token in RE_NUM.split(text) if len(token.strip()) > 0]
        ntoken = len(tokens)

        for i in range(ntoken):
            # Only tokens that start with a decimal digit are rewritten
            if tokens[i] and unicodedata.category(tokens[i][0]) != "Nd":
                continue

            # Length rule: a leading zero or more digits than there are units.
            # The zero must be compared as a character; comparing with the
            # integer 0 is always False for a string.
            # (digit_to_single_chinese presumably reads digit by digit - confirm)
            if tokens[i][0] == "0" or len(tokens[i]) > len(UNITS):
                tokens[i] = digit_to_single_chinese(tokens[i])
                continue

            # Pre-fix rules
            if (i-1) >= 0:
                if RE_LAST_END_SINGLE.match(tokens[i-1]):
                    tokens[i] = digit_to_single_chinese(tokens[i])
                    continue
                if tokens[i-1].endswith("第") or tokens[i-1].endswith("比"):
                    tokens[i] = digit_to_chinese(tokens[i], liang=False)
                    continue
                if len(tokens[i-1]) > 1 and tokens[i-1][1] in "零一二三四五六七八九":
                    tokens[i] = digit_to_single_chinese(tokens[i])
                    continue
            # Post-fix rules
            if (i+1) < ntoken:
                if RE_NEXT_START_SINGLE.match(tokens[i+1]):
                    tokens[i] = digit_to_single_chinese(tokens[i])
                    continue
                if tokens[i+1].startswith("年"):
                    # More than four digits before "年": the last four are
                    # converted separately from the leading part
                    if len(tokens[i]) > 4:
                        tokens[i] = "%s %s" % ( \
                                digit_to_chinese(tokens[i][:-4], liang=False),
                                digit_to_single_chinese(tokens[i][-4:]))
                        continue
                    elif len(tokens[i]) > 2:
                        tokens[i] = digit_to_single_chinese(tokens[i])
                        continue
                if tokens[i+1].startswith("比"):
                    tokens[i] = digit_to_chinese(tokens[i], liang=False)
                    continue
                if unicodedata.category(tokens[i+1][0]) == "Nd":
                    tokens[i] = digit_to_single_chinese(tokens[i])
                    continue

            # General fall-back rule
            tokens[i] = digit_to_chinese(tokens[i])
        sys.stdout.write("%s %s\n" % (uid, RE_SPACES.sub(" ", "".join(tokens).strip())))
Example #30
0
def get_all_punctuation_map() -> Dict[int, type(None)]:
    """Builds a translation table that deletes all unicode punctuation and symbols.

    :return: A dictionary mapping the ord() of every character whose general
        category starts with 'P' or 'S' to None.
    """

    punctuation_map = {}
    for codepoint in range(sys.maxunicode):
        major_class = unicodedata.category(chr(codepoint))[0]
        if major_class == 'P' or major_class == 'S':
            punctuation_map[codepoint] = None

    return punctuation_map
Example #31
0
import re
import sys
import unicodedata

# Resolve the code-point-to-character function once: ``unichr`` exists only on
# Python 2, ``chr`` covers the full range on Python 3.
try:
    _code_to_char = unichr
except NameError:
    _code_to_char = chr

# Every character whose Unicode general category is a punctuation one ("P*").
punctuation = "".join(
    c for c in map(_code_to_char, range(sys.maxunicode))
    if unicodedata.category(c).startswith("P")
)


class Matcher:
    """Callable that extracts the tokens matching a regular expression from a text."""

    # A raw string keeps ``\w`` from being read as an (invalid) string escape.
    def __init__(self, word_re=r"\w+"):
        """Store *word_re*, the pattern handed to ``re.findall`` for every text."""
        self.word_re = word_re

    def __getstate__(self):
        """Return the pattern; it is the only state needed to restore the matcher."""
        return self.word_re

    def __setstate__(self, word_re):
        """Restore the matcher from the pattern returned by ``__getstate__``."""
        self.word_re = word_re

    def __call__(self, text):
        """Return all non-empty matches of the pattern in *text* as a tuple."""
        tokens = re.findall(self.word_re, text)
        # Remove empty tokens
        tokens = [token for token in tokens if token]
        return tuple(tokens)
def eliminar_acentos(frase):
    """Return *frase* without accents, keeping every lower-case 'ñ'.

    The text is processed piecewise around each 'ñ' instead of swapping the
    'ñ' for a placeholder character, so a '#' (or any other character) that
    is already part of the text is left alone.
    """
    def sin_acentos(texto):
        # NFD splits accented letters into base + combining mark ('Mn').
        return ''.join(x for x in unicodedata.normalize('NFD', texto)
                       if unicodedata.category(x) != 'Mn')

    return 'ñ'.join(sin_acentos(trozo) for trozo in frase.split('ñ'))
Example #33
0
def remove_diacritics(word: str):
    """Return *word* in NFKD form with all nonspacing marks ('Mn') removed."""
    decomposed = unicodedata.normalize('NFKD', word)
    kept = [c for c in decomposed if unicodedata.category(c) != 'Mn']
    return ''.join(kept)
Example #34
0
def strip_unicode(string):
    """Apply the STRIP_UNICODE replacements, then drop all nonspacing marks ('Mn')."""
    for needle in STRIP_UNICODE:
        string = string.replace(needle, STRIP_UNICODE[needle])
    decomposed = unicodedata.normalize('NFD', string)
    kept = [c for c in decomposed if unicodedata.category(c) != 'Mn']
    return ''.join(kept)
def removeDiacritics(string):
    """Return *string* lower-cased, without accents, spaces, apostrophes or hyphens."""
    decomposed = unicodedata.normalize('NFD', string)
    code = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    code = code.lower()
    for unwanted in (' ', '\'', '-'):
        code = code.replace(unwanted, '')
    return code
Example #36
0
 def property_chars(self, prefix):
     """Return, as one string, every character below sys.maxunicode whose category starts with prefix."""
     matching = []
     for codepoint in range(sys.maxunicode):
         char = six.unichr(codepoint)
         if unicodedata.category(char).startswith(prefix):
             matching.append(char)
     return "".join(matching)
Example #37
0
class Factory:

    # The attributes below are shared, class-level sources of random values.
    # Each one is an iterator that is consumed a few items at a time by the
    # make_* methods (typically through islice or next).

    # Random characters drawn from ASCII letters and digits.
    random_letters = map(random.choice,
                         repeat(string.ascii_letters + string.digits))

    # As above, with the space character added to the alphabet.
    random_letters_with_spaces = map(
        random.choice, repeat(string.ascii_letters + string.digits + " "))

    # See django.contrib.auth.forms.UserCreationForm.username.
    random_letters_for_usernames = map(random.choice,
                                       repeat(string.ascii_letters + ".@+-"))

    # Random keys of http.client.responses (HTTP status codes).
    random_http_responses = map(random.choice,
                                repeat(tuple(http.client.responses)))

    # Callable returning one random integer in the range 0..255.
    random_octet = partial(random.randint, 0, 255)

    # iter(callable, sentinel) stops only when the callable returns the
    # sentinel; randint never returns None, so this does not end.
    random_octets = iter(random_octet, None)

    # Callable returning one random code point in the range 0..0x10FFFF.
    random_unicode_codepoint = partial(random.randint, 0, 0x10FFFF)

    random_unicode_codepoints = iter(random_unicode_codepoint, None)

    # Characters whose general category is a letter, mark, number,
    # punctuation or symbol.
    random_unicode_characters = (
        char for char in map(chr, random_unicode_codepoints)
        if unicodedata.category(char)[0] in "LMNPS")

    # Filters the generator above, so both draw from the same stream.
    random_unicode_non_ascii_characters = (
        char for char in random_unicode_characters if ord(char) >= 128)

    # Same as random_unicode_characters, plus separators (category Z).
    random_unicode_characters_with_spaces = (
        char for char in map(chr, random_unicode_codepoints)
        if unicodedata.category(char)[0] in "LMNPSZ")

    # The ASCII space is the only character below 128 that is let through.
    random_unicode_non_ascii_characters_with_spaces = (
        char for char in random_unicode_characters_with_spaces
        if char == " " or ord(char) >= 128)

    def make_string(self, size=10, spaces=False, prefix=""):
        """Build a random string of ASCII letters and digits.

        :param size: Number of random characters to draw.
        :param spaces: When true, the space character may be drawn as well.
        :param prefix: Text placed in front of the random characters; it
            does not count towards `size`.
        :return: `prefix` followed by `size` random characters.
        """
        if spaces:
            letters = self.random_letters_with_spaces
        else:
            letters = self.random_letters
        return prefix + "".join(islice(letters, size))

    def make_unicode_string(self, size=10, spaces=False, prefix=""):
        """Build a random string of Unicode characters.

        :param size: Number of random characters to draw.
        :param spaces: When true, separator characters may be drawn as well.
        :param prefix: Text placed in front of the random characters.
        :return: `prefix` followed by `size` random characters.
        """
        if spaces:
            characters = self.random_unicode_characters_with_spaces
        else:
            characters = self.random_unicode_characters
        return prefix + "".join(islice(characters, size))

    def make_unicode_non_ascii_string(self, size=10, spaces=False, prefix=""):
        """Build a random string of non-ASCII Unicode characters.

        :param size: Number of random characters to draw.
        :param spaces: When true, separators (including the ASCII space)
            may be drawn as well.
        :param prefix: Text placed in front of the random characters.
        :return: `prefix` followed by `size` random characters.
        """
        if spaces:
            characters = self.random_unicode_non_ascii_characters_with_spaces
        else:
            characters = self.random_unicode_non_ascii_characters
        return prefix + "".join(islice(characters, size))

    def make_bytes(self, size=10):
        """Return `size` random bytes obtained from `os.urandom`."""
        random_data = os.urandom(size)
        return random_data

    def make_username(self, size=10):
        """Return a random user name of `size` characters.

        Only the name is generated; no user object is created.
        """
        characters = islice(self.random_letters_for_usernames, size)
        return "".join(characters)

    def make_email_address(self, login_size=10):
        """Generate an arbitrary email address.

        :param login_size: Length of the random local part.
        :return: A string of the form "<random>@example.com".
        """
        # The template must contain a "%s" placeholder; without one the "%"
        # operator raises TypeError ("not all arguments converted").
        # example.com is a domain reserved for documentation and testing.
        return "%s@example.com" % self.make_string(size=login_size)

    def make_status_code(self):
        """Return a random key of `http.client.responses`."""
        status_code = next(self.random_http_responses)
        return status_code

    # Names for generated exception types: "TestException#1",
    # "TestException#2", ... (presumably `count` is itertools.count, which
    # would make this generator endless).
    exception_type_names = ("TestException#%d" % i for i in count(1))

    def make_exception_type(self, bases=(Exception, ), **namespace):
        """Create a new exception class with a fresh, numbered name.

        :param bases: Base classes of the new type.
        :param namespace: Attributes to define on the new type.
        :return: The newly created class.
        """
        type_name = next(self.exception_type_names)
        return type(type_name, bases, namespace)

    def make_exception(self, message=None, bases=(Exception, ), **namespace):
        """Create an instance of a freshly made exception type.

        :param message: Optional single argument for the exception; when
            `None` the exception is built without arguments.
        :param bases: Base classes of the new type.
        :param namespace: Attributes to define on the new type.
        """
        exc_type = self.make_exception_type(bases, **namespace)
        if message is None:
            return exc_type()
        return exc_type(message)

    def make_absolute_path(self,
                           directories=3,
                           directory_length=10,
                           path_seperator="/"):
        """Build a random absolute path.

        :param directories: Number of path components.
        :param directory_length: Length of each random component.
        :param path_seperator: Separator placed before every component.
        :return: The path as a string, starting with the separator.
        """
        components = (
            self.make_string(size=directory_length)
            for _ in range(directories))
        joined = path_seperator.join(components)
        return path_seperator + joined

    def pick_bool(self):
        """Return `True` or `False`, chosen at random."""
        outcomes = (True, False)
        return random.choice(outcomes)

    def pick_enum(self, enum, *, but_not=EMPTY_SET):
        """Pick a random item from an enumeration class.

        :param enum: Either an `enum.Enum` subclass, or a plain class whose
            public attributes (names not starting with "_") are the choices,
            such as `NODE_STATUS`.
        :param but_not: Items that must not be returned.
        :type but_not: Sequence.
        :return: A member of the `Enum` subclass, or the value of one of
            the plain class's public attributes.
        """
        if issubclass(enum, Enum):
            candidates = [member for member in enum if member not in but_not]
        else:
            candidates = [
                value for key, value in vars(enum).items()
                if not key.startswith("_") and value not in but_not
            ]
        return random.choice(candidates)

    def pick_port(self, port_min=1024, port_max=65535):
        """Return a random port number between the two bounds, inclusive.

        The bounds are asserted to lie within 0..65535.
        """
        assert port_min >= 0 and port_max <= 65535
        port = random.randint(port_min, port_max)
        return port

    def pick_choice(self, choices, but_not=None):
        """Pick a random item from `choices`.

        :param choices: A sequence of choices in Django form choices format:
            [
                ('choice_id_1', "Choice name 1"),
                ('choice_id_2', "Choice name 2"),
            ]
        :param but_not: A list of choices' IDs to exclude.
        :type but_not: Sequence.
        :return: The "id" portion of a random choice out of `choices`.
        """
        excluded = () if but_not is None else but_not
        eligible = [choice for choice in choices if choice[0] not in excluded]
        chosen = random.choice(eligible)
        return chosen[0]

    def make_vlan_tag(self, allow_none=False, *, but_not=EMPTY_SET):
        """Create a random VLAN tag.

        :param allow_none: Whether `None` ("no VLAN") can be allowed as an
            outcome.  If `True`, `None` is returned about half of the time,
            a deliberately over-represented probability meant to trip up
            bugs that would otherwise show up only rarely.
        :param but_not: A set of tags that should not be returned.
        :return: `None`, or a tag between 1 and 0xFFE inclusive.
        :raise TooManyRandomRetries: If 100 draws all hit excluded tags.
        """
        if allow_none and self.pick_bool():
            return None
        attempts = 0
        while attempts < 100:
            candidate = random.randint(1, 0xFFE)
            if candidate not in but_not:
                return candidate
            attempts += 1
        raise TooManyRandomRetries("Could not find an available VLAN tag.")

    def ip_to_url_format(self, ip):
        """Format an IP address for use inside a URL.

        :return: '[ip:v6:address]' for an IPv6 address, otherwise the plain
            textual form such as 'a.b.c.d'.
        """
        ip_addr = IPAddress(ip)
        if ip_addr.version != 6:
            return "%s" % str(ip_addr)
        return "[%s]" % str(ip_addr)

    def make_ipv4_address(self):
        """Return a random dotted-quad IPv4 address.

        The first octet is never 0: a drawn 0 is replaced by 1.
        """
        first, second, third, fourth = islice(self.random_octets, 4)
        if first == 0:
            first = 1
        return "%d.%d.%d.%d" % (first, second, third, fourth)

    def make_ipv6_address(self):
        """Return a random IPv6 address from fc00::/7, as a string."""
        # fc00::/7 is a private space, so the generated addresses should
        # not cause problems by addressing the outside world.
        private_space = IPNetwork("fc00::/7")
        # The network has far too many elements for random.choice(), so a
        # random index is drawn instead.
        offset = random.randint(0, private_space.size - 1)
        address = IPAddress(private_space[offset])
        return str(address)

    def make_ip_address(self, ipv6=None):
        """Create a random ip address.

        :param ipv6: True for ipv6, False for ipv4, None for random.

        :return: an IP Address
        :rtype: string
        """
        if ipv6 is None:
            ipv6 = random.randint(0, 1)
        # Any truthy value, including 1, intentionally selects IPv6.
        make_address = (
            self.make_ipv6_address if ipv6 else self.make_ipv4_address)
        return make_address()

    def make_UUID(self):
        """Return the string form of a new `uuid1()` value."""
        new_uuid = uuid1()
        return str(new_uuid)

    def make_UUID_with_timestamp(self, timestamp, clock_seq=None, node=None):
        """Build a version-1 UUID string from an explicit timestamp.

        :param timestamp: Time in seconds; it is converted to 100-nanosecond
            units and offset by 0x01B21DD213814000 before being split into
            the UUID time fields.
        :param clock_seq: Optional clock sequence; 14 random bits if omitted.
        :param node: Optional node value; if omitted, 48 random bits with
            bit 0x010000000000 set.
        :return: The UUID as a string.
        """
        if node is None:
            node = random.getrandbits(48) | 0x010000000000
        if clock_seq is None:
            clock_seq = random.getrandbits(14)
        ticks = int(timestamp * 1e9 / 100) + 0x01B21DD213814000
        fields = (
            ticks & 0xFFFFFFFF,          # time_low
            (ticks >> 32) & 0xFFFF,      # time_mid
            (ticks >> 48) & 0x0FFF,      # time_hi_version
            (clock_seq >> 8) & 0x3F,     # clock_seq_hi_variant
            clock_seq & 0xFF,            # clock_seq_low
            node,
        )
        return str(UUID(fields=fields, version=1))

    def _make_random_network(
        self,
        slash=None,
        but_not=EMPTY_SET,
        disjoint_from=None,
        random_address_factory=None,
    ):
        """Generate a random IP network.

        :param slash: Netmask or bit width of the network, e.g. 24 or
            '255.255.255.0'.  Defaults to a random width from 16 to 29.
        :param but_not: Optional iterable of `IPNetwork` objects that must
            not be returned.  The new network may overlap any of these, but
            it won't be identical to one of them.
        :param disjoint_from: Optional iterable of `IPNetwork` objects whose
            IP ranges the new network must not overlap.
        :param random_address_factory: A callable returning a random IP
            address.  Defaults to `self.make_ipv4_address`.
        :return: The first network, out of at most 100 candidates, that
            satisfies both restrictions.
        :rtype: :class:`IPNetwork`
        :raise TooManyRandomRetries: If no candidate was acceptable.
        """
        excluded = frozenset(but_not)
        if disjoint_from is None:
            disjoint_from = []
        if slash is None:
            slash = random.randint(16, 29)
        if random_address_factory is None:
            random_address_factory = self.make_ipv4_address
        # Keep drawing candidates until one matches our criteria.
        for _attempt in range(100):
            cidr_text = "%s/%s" % (random_address_factory(), slash)
            network = IPNetwork(cidr_text).cidr
            is_forbidden = network in excluded
            has_clash = network_clashes(network, disjoint_from)
            if is_forbidden or has_clash:
                continue
            return network
        raise TooManyRandomRetries("Could not find available network")

    def make_ipv4_network(self,
                          slash=None,
                          *,
                          but_not=EMPTY_SET,
                          disjoint_from=None):
        """Generate a random IPv4 network.

        :param slash: Netmask or bit width of the network, e.g. 24 or
            '255.255.255.0'.  Defaults to a random width from 16 to 28.
        :param but_not: Optional iterable of `IPNetwork` objects that must
            not be returned.  The new network may overlap any of these, but
            it won't be identical to one of them.
        :param disjoint_from: Optional iterable of `IPNetwork` objects whose
            IP ranges the new network must not overlap.
        :return: The generated network.
        :rtype: :class:`IPNetwork`
        """
        prefix_length = random.randint(16, 28) if slash is None else slash
        return self._make_random_network(
            slash=prefix_length,
            but_not=but_not,
            disjoint_from=disjoint_from,
            random_address_factory=self.make_ipv4_address,
        )

    def make_ipv6_network(self,
                          slash=None,
                          *,
                          but_not=EMPTY_SET,
                          disjoint_from=None):
        """Generate a random IPv6 network.

        :param slash: Netmask or bit width of the network.  Defaults to a
            random width from 112 (65536 addresses) to 125 (8 addresses).
        :param but_not: Optional iterable of `IPNetwork` objects that must
            not be returned.  The new network may overlap any of these, but
            it won't be identical to one of them.
        :param disjoint_from: Optional iterable of `IPNetwork` objects whose
            IP ranges the new network must not overlap.
        :return: The generated network.
        :rtype: :class:`IPNetwork`
        """
        prefix_length = random.randint(112, 125) if slash is None else slash
        return self._make_random_network(
            slash=prefix_length,
            but_not=but_not,
            disjoint_from=disjoint_from,
            random_address_factory=self.make_ipv6_address,
        )

    def make_ip4_or_6_network(self, version=None, host_bits=None):
        """Generate a random IPv4 or IPv6 network.

        :param version: 4 for IPv4; any other value selects IPv6.  Chosen
            at random between 4 and 6 when `None`.
        :param host_bits: Optional number of host bits; the prefix length
            becomes the address width (32 or 128) minus this value.
        """
        if version is None:
            version = random.choice([4, 6])
        if version == 4:
            address_bits, make_network = 32, self.make_ipv4_network
        else:
            address_bits, make_network = 128, self.make_ipv6_network
        slash = None if host_bits is None else address_bits - host_bits
        return make_network(slash=slash)

    def pick_ip_in_dynamic_range(self, ngi, *, but_not=EMPTY_SET):
        """Pick a random IP address from the dynamic range of `ngi`.

        :param ngi: Object whose `get_dynamic_ip_range()` result exposes
            integer `first` and `last` bounds.
        :param but_not: Addresses that must not be returned; `None` entries
            are ignored.
        :return: The chosen address, as a string.
        :raise TooManyRandomRetries: If 100 draws all hit excluded
            addresses.
        """
        first = ngi.get_dynamic_ip_range().first
        last = ngi.get_dynamic_ip_range().last
        but_not = {IPAddress(but) for but in but_not if but is not None}
        for _ in range(100):
            address = IPAddress(random.randint(first, last))
            if address not in but_not:
                return str(address)
        # The message names the dynamic range; it previously said "static",
        # copied from pick_ip_in_static_range.
        raise TooManyRandomRetries(
            "Could not find available IP in dynamic range")

    def pick_ip_in_static_range(self, ngi, *, but_not=EMPTY_SET):
        """Pick a random IP address from the static range of `ngi`.

        :param ngi: Object whose `get_static_ip_range()` result exposes
            integer `first` and `last` bounds.
        :param but_not: Addresses that must not be returned; `None` entries
            are ignored.
        :return: The chosen address, as a string.
        :raise TooManyRandomRetries: If 100 draws all hit excluded
            addresses.
        """
        lowest = ngi.get_static_ip_range().first
        highest = ngi.get_static_ip_range().last
        excluded = set()
        for but in but_not:
            if but is not None:
                excluded.add(IPAddress(but))
        for _attempt in range(100):
            candidate = IPAddress(random.randint(lowest, highest))
            if candidate in excluded:
                continue
            return str(candidate)
        raise TooManyRandomRetries(
            "Could not find available IP in static range")

    def pick_ip_in_network(self, network, *, but_not=EMPTY_SET):
        """Pick a random usable IP address inside `network`.

        :param network: The network to pick from.
        :param but_not: Addresses that must not be returned; `None` entries
            and addresses outside `network` are ignored.
        :return: The chosen address, as a string.
        :raise ValueError: If the exclusions cover every usable address.
        :raise TooManyRandomRetries: If 100 draws all hit excluded
            addresses.
        """
        excluded = set()
        for but in but_not:
            if but is None:
                continue
            address = IPAddress(but)
            if address in network:
                excluded.add(address)
        # Unless the prefix length is very small, make sure we don't select
        # a normally-unusable IP address.
        first, last, usable = network.first, network.last, network.size
        if network.version == 6 and network.prefixlen < 127:
            # Skip the all-zeroes address: in IPv6 it is the subnet-router
            # anycast address.  IPv6 has no broadcast address, though.
            first += 1
            usable -= 1
        elif network.prefixlen < 31:
            # Skip both the network and the broadcast address.
            first += 1
            last -= 1
            usable -= 2
        if len(excluded) == usable:
            raise ValueError(
                "No IP addresses available in network: %s (but_not=%r)" %
                (network, excluded))
        for _attempt in range(100):
            candidate = IPAddress(random.randint(first, last))
            if candidate not in excluded:
                return str(candidate)
        raise TooManyRandomRetries(
            "Could not find available IP in network: %s (but_not=%r)" %
            (network, excluded))

    def make_ip_range(self, network):
        """Return a pair of IP addresses from the given network.

        :param network: Return IP addresses within this network.
        :return: A pair of `IPAddress`, the first strictly lower than the
            second.
        :raise TooManyRandomRetries: If 100 attempts never produced two
            distinct addresses.
        """
        for _ in range(100):
            # Addresses are picked through `self` rather than a module-level
            # factory instance, so subclass overrides are honoured and the
            # method does not depend on a global being defined.
            ip_range = tuple(
                sorted(
                    IPAddress(self.pick_ip_in_network(network))
                    for _ in range(2)))
            if ip_range[0] < ip_range[1]:
                return ip_range
        raise TooManyRandomRetries(
            "Could not find available IP range in network: %s" % network)

    def make_ipv4_range(self, network=None):
        """Return a pair of IPv4 addresses.

        :param network: Return IP addresses within this network.  A random
            IPv4 network is generated when omitted.
        :return: A pair of `IPAddress`.
        """
        chosen_network = (
            self.make_ipv4_network() if network is None else network)
        return self.make_ip_range(network=chosen_network)

    def make_ipv6_range(self, network=None):
        """Return a pair of IPv6 addresses.

        :param network: Return IP addresses within this network.  A random
            IPv6 network is generated when omitted.
        :return: A pair of `IPAddress`.
        """
        chosen_network = (
            self.make_ipv6_network() if network is None else network)
        return self.make_ip_range(network=chosen_network)

    def make_mac_address(self, delimiter=":"):
        """Return a random MAC address.

        :param delimiter: String placed between the six octets.
        :return: Six two-digit lowercase hexadecimal octets joined by
            `delimiter`.
        """
        assert isinstance(delimiter, str)
        hex_octets = [
            format(octet, "02x") for octet in islice(self.random_octets, 6)]
        return delimiter.join(hex_octets)

    def make_random_leases(self, num_leases=1):
        """Create a dict of arbitrary ip-to-mac address mappings.

        :param num_leases: Number of distinct IP addresses wanted.
        :return: A dict mapping IPv4 address strings to MAC address strings.
        """
        # A loop rather than a dict comprehension: random IP addresses may
        # collide, and the loop keeps going until enough distinct keys exist.
        leases = {}
        while len(leases) < num_leases:
            # The MAC is drawn before the IP, which is the order in which a
            # subscript assignment evaluates its two sides.
            mac_address = self.make_mac_address()
            ip_address = self.make_ipv4_address()
            leases[ip_address] = mac_address
        return leases

    def make_date(self, year=2017):
        """Return a random `datetime` within the given year.

        :param year: The calendar year to pick from.
        :return: A naive `datetime.datetime` at whole-second resolution, at
            or after January 1st of `year` and before January 1st of the
            following year (local time).
        """
        # time.mktime() returns floats, but random.randrange() requires
        # integers (float arguments raise TypeError on current Python
        # versions), so the bounds are converted first.
        start = int(time.mktime(datetime.datetime(year, 1, 1).timetuple()))
        end = int(time.mktime(datetime.datetime(year + 1, 1, 1).timetuple()))
        stamp = random.randrange(start, end)
        return datetime.datetime.fromtimestamp(stamp)

    def make_timedelta(self):
        """Return a random `timedelta`.

        The value has up to 3 * 365 days, plus a random number of seconds
        below one day and a random number of microseconds below one second.
        """
        days = random.randint(0, 3 * 365)
        seconds = random.randint(0, 24 * 60 * 60 - 1)
        microseconds = random.randint(0, 999999)
        return datetime.timedelta(
            days=days, seconds=seconds, microseconds=microseconds)

    def make_file(self, location, name=None, contents=None):
        """Create a file, and write data to it.

        Prefer the eponymous convenience wrapper in
        :class:`maastesting.testcase.MAASTestCase`.  It creates a temporary
        directory and arranges for its eventual cleanup.

        :param location: Directory.  Use a temporary directory for this, and
            make sure it gets cleaned up after the test!
        :param name: Optional name for the file.  If none is given, one will
            be made up.
        :param contents: Optional contents for the file.  If omitted, some
            arbitrary ASCII text is written.  A `str` is encoded as UTF-8;
            anything else is written as given.
        :return: Path to the file.
        """
        if name is None:
            name = self.make_string()
        if contents is None:
            payload = self.make_string().encode("ascii")
        elif isinstance(contents, str):
            payload = contents.encode("utf-8")
        else:
            payload = contents
        path = os.path.join(location, name)
        with open(path, "wb") as stream:
            stream.write(payload)
        return path

    def make_name(self, prefix=None, sep="-", size=6):
        """Generate a random name.

        :param prefix: Optional prefix.  Pass one to help make test failures
            and tracebacks easier to read!  Without it this is the same as
            calling `make_string`.
        :param sep: Separator that goes between the prefix and the random
            portion of the name.  Defaults to a dash.
        :param size: Length of the random portion of the name.  The default
            is usually fine; use more if uniqueness really matters, or less
            if legibility matters more.
        :return: A randomized unicode string.
        """
        if prefix is not None:
            return prefix + sep + self.make_string(size=size)
        return self.make_string(size=size)

    def make_hostname(self, prefix="host", *args, **kwargs):
        """Generate a random hostname.

        The returned hostname is lowercase because python's urlparse
        implicitly lowercases the hostnames.

        :param prefix: Prefix handed to `make_name`.
        :param args: Further positional arguments for `make_name` (`sep`,
            then `size`).
        :param kwargs: Further keyword arguments for `make_name`.
        """
        # The prefix is passed positionally: combining `prefix=prefix` with
        # `*args` would put the first extra positional argument into
        # make_name's `prefix` slot as well and raise TypeError.
        return self.make_name(prefix, *args, **kwargs).lower()

    # Always select from a scheme that allows parameters in the URL so
    # that we can round-trip a URL with params successfully (otherwise
    # the params don't get parsed out of the path).  The empty-string
    # entry of urllib.parse.uses_params is left out.
    _make_parsed_url_schemes = tuple(scheme
                                     for scheme in urllib.parse.uses_params
                                     if scheme != "")

    def make_parsed_url(
        self,
        scheme=None,
        netloc=None,
        path=None,
        port=None,
        params=None,
        query=None,
        fragment=None,
    ):
        """Generate a random parsed URL object.

        Contains randomly generated values for all parts of a URL: scheme,
        location, path, parameters, query, and fragment. However, each part
        can be overridden individually.

        If port=None or port=True, pick_port() will be used to select a
        random port, while port=False will create a netloc for the URL that
        does not specify a port. A port is only ever appended to a generated
        netloc; to pass a netloc that already contains a port, the port
        parameter must be False.

        :raise AssertionError: If `port` is not False and the given `netloc`
            contains exactly one colon.
        :return: Instance of :py:class:`urlparse.ParseResult`.
        """
        if port is not False and netloc is not None and netloc.count(":") == 1:
            raise AssertionError(
                "A port number has been requested, however the given netloc "
                "spec %r already contains a port number." % (netloc, ))
        if scheme is None:
            # Select a scheme that allows parameters; see above.
            scheme = random.choice(self._make_parsed_url_schemes)
        if port is None or port is True:
            port = self.pick_port()
        if netloc is None:
            netloc = "%s.example.com" % self.make_name("netloc").lower()
            # bool is a subclass of int, so False must be ruled out
            # explicitly to keep it from being formatted as port 0.
            if isinstance(port, int) and not isinstance(port, bool):
                netloc += ":%d" % port
        if path is None:
            # A leading forward-slash will be added in geturl() if we
            # don't, so ensure it's here now so tests can compare URLs
            # without worrying about it.
            path = self.make_name("/path")
        else:
            # Same here with the forward-slash prefix.
            if not path.startswith("/"):
                path = "/" + path
        if params is None:
            params = self.make_name("params")
        if query is None:
            query = self.make_name("query")
        if fragment is None:
            fragment = self.make_name("fragment")
        return urllib.parse.ParseResult(scheme, netloc, path, params, query,
                                        fragment)

    def make_url(
        self,
        scheme=None,
        netloc=None,
        path=None,
        params=None,
        query=None,
        fragment=None,
    ):
        """Generate a random URL.

        Every part of the URL (scheme, location, path, parameters, query
        and fragment) is randomly generated unless it is given explicitly.

        :return: string
        """
        parsed = self.make_parsed_url(
            scheme, netloc, path, params, query, fragment)
        return parsed.geturl()

    def make_simple_http_url(self, netloc=None, path=None, port=None):
        """Create an arbitrary HTTP URL with only a location and path.

        The parameters, query and fragment parts are all empty.
        """
        url_parts = {
            "scheme": "http",
            "netloc": netloc,
            "path": path,
            "port": port,
            "params": "",
            "query": "",
            "fragment": "",
        }
        return self.make_parsed_url(**url_parts).geturl()

    def make_names(self, *prefixes):
        """Generate random names.

        Lazily yields one name per prefix, in the order given.

        :param prefixes: Zero or more prefixes. See `make_name`.
        """
        yield from (self.make_name(prefix) for prefix in prefixes)

    def make_tarball(self, location, contents):
        """Create a tarball containing the given files.

        The files are first written to a temporary directory, which is then
        archived by running the external `tar` command.

        :param location: Path to a directory where the tarball can be stored.
        :param contents: A dict mapping file names to file contents.  Where
            the value is `None`, the file will contain arbitrary data.
        :return: Path to a gzip-compressed tarball.
        """
        archive_name = "%s.tar.gz" % self.make_name()
        tarball = os.path.join(location, archive_name)
        with TempDirectory() as working_dir:
            source = working_dir.path
            for file_name, file_contents in contents.items():
                self.make_file(source, file_name, file_contents)

            command = ["tar", "-C", source, "-czf", tarball, "."]
            subprocess.check_call(command)

        return tarball

    def make_response(self, status_code, content, content_type=None):
        """Return a similar response to that which `urllib` returns.

        :param status_code: Value for the response's `code`.
        :param content: Bytes served as the response body.
        :param content_type: Optional content type set on the headers.
        """
        headers = http.client.HTTPMessage()
        if content_type is not None:
            headers.set_type(content_type)
        body = io.BytesIO(content)
        return urllib.request.addinfourl(
            fp=body, headers=headers, url=None, code=status_code)

    def make_streams(self, stdin=None, stdout=None, stderr=None):
        """Make a fake return value for a SSHClient.exec_command.

        :return: The tuple `(stdin, stdout, stderr)`, where a missing
            `stdout` is replaced by a `mock.Mock()`.
        """
        # stdout.read() is called, so stdout can't be None.
        effective_stdout = mock.Mock() if stdout is None else stdout
        return (stdin, effective_stdout, stderr)

    def make_CalledProcessError(self):
        """Make a fake :py:class:`subprocess.CalledProcessError`.

        :return: An error with a random return code from 1 to 10, a
            one-element random command and random bytes as output.
        """
        return subprocess.CalledProcessError(
            returncode=random.randint(1, 10),
            cmd=[self.make_name("command")],
            # Use this instance rather than a module-level factory object,
            # consistent with the `cmd` argument above.
            output=self.make_bytes(),
        )

    def make_kernel_string(self,
                           can_be_release_or_version=False,
                           generic_only=False):
        """Return a random kernel name for a random Ubuntu release.

        :param can_be_release_or_version: When true, the bare series name
            and the bare version string are possible results as well.
        :param generic_only: When true, the "lowlatency" variants are left
            out.
        :return: One of the "hwe-..." strings built from the release (or,
            if allowed, its series name or version).
        """
        distro_info = UbuntuDistroInfo()
        # Only select from MAAS supported releases so we don't have to deal
        # with versions name overlap(e.g Warty and Wily).
        try:
            rows = distro_info._rows
        except AttributeError:
            rows = [row.__dict__ for row in distro_info._releases]
        candidates = []
        for row in rows:
            major_version = int(row["version"].split(".")[0])
            if major_version >= 12:
                candidates.append(row)
        release = random.choice(candidates)
        # Remove 'LTS' from version if it exists
        version_str = release["version"].split(" ")[0]
        kernels = [
            "hwe-%s" % release["series"][0],
            "hwe-%s" % version_str,
            "hwe-%s-edge" % version_str,
        ]
        if not generic_only:
            kernels.append("hwe-%s-lowlatency" % version_str)
            kernels.append("hwe-%s-lowlatency-edge" % version_str)
        if can_be_release_or_version:
            kernels.append(release["series"])
            kernels.append(version_str)
        return random.choice(kernels)

    def make_dhcp_packet(
        self,
        transaction_id: bytes = None,
        truncated: bool = False,
        truncated_option_value: bool = False,
        bad_cookie: bool = False,
        truncated_option_length: bool = False,
        include_server_identifier: bool = False,
        server_ip: str = "127.1.1.1",
        include_end_option: bool = True,
    ) -> bytes:
        """Returns a [possibly invalid] DHCP packet.

        :param transaction_id: Bytes used as the transaction ID; four random
            bytes are generated when omitted.
        :param truncated: Cut the finished packet down to its first 200
            bytes.
        :param truncated_option_value: Append an option 0x36 that declares a
            length of 4 but carries only two value bytes; also suppresses
            the end option.
        :param bad_cookie: Write b"xxxx" in place of the cookie bytes.
        :param truncated_option_length: Append a lone 0x36 option code with
            no length byte; also suppresses the end option.
        :param include_server_identifier: Append a complete option 0x36
            (Server Identifier) holding `server_ip`.
        :param server_ip: Address encoded in the Server Identifier option.
        :param include_end_option: Terminate the options with 0xff.
        :return: The packet, as bytes.
        """
        if transaction_id is None:
            transaction_id = self.make_bytes(size=4)
        options = b""
        if include_server_identifier:
            # 0x36 == 54 (Server Identifier option)
            ip_bytes = int(IPAddress(server_ip).value).to_bytes(4, "big")
            options += b"\x36\x04" + ip_bytes
        if truncated_option_value:
            options += b"\x36\x04\x7f\x01"
            include_end_option = False
        if truncated_option_length:
            options += b"\x36"
            include_end_option = False
        # Currently, we only validate the transaction ID, and the fact that
        # the reply packet has a "Server Identifier" option. This might be
        # considered a bug, but in practice it works out.
        packet = (
            # Message type: 0x02 (BOOTP operation: reply).
            b"\x02"
            # Hardware type: Ethernet
            b"\x01"
            # Hardware address length: 6
            b"\x06"
            # Hops: 0
            b"\x00" +
            # Transaction ID
            transaction_id +
            # Seconds
            b"\x00\x00"
            # Flags
            b"\x00\x00"
            # Client IP address: 0.0.0.0
            b"\x00\x00\x00\x00"
            # Your (client) IP address: 0.0.0.0
            b"\x00\x00\x00\x00"
            # Next server IP address: 0.0.0.0
            b"\x00\x00\x00\x00"
            # Relay agent IP address: 0.0.0.0
            b"\x00\x00\x00\x00" +
            # Client hardware address
            b"\x01\x02\x03\x04\x05\x06"
            # Hardware address padding
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" +
            # Server host name
            (b"\x00" * 67) +
            # Boot filename
            (b"\x00" * 125) +
            # Cookie
            (b"\x63\x82\x53\x63" if not bad_cookie else b"xxxx") +
            # "DHCP Offer" option
            b"\x35\x01\x02" + options +
            # End options.
            (b"\xff" if include_end_option else b""))
        if truncated:
            packet = packet[:200]
        return packet
def unicode_to_ascii(s):
    """Return *s* with its combining accents removed.

    The string is NFD-decomposed and every character of category 'Mn' is
    dropped.  Characters that have no decomposition are kept as they are,
    so the result is not guaranteed to be pure ASCII.
    """
    pieces = []
    for char in unicodedata.normalize('NFD', s):
        if unicodedata.category(char) == 'Mn':
            continue
        pieces.append(char)
    return ''.join(pieces)
Example #39
0
    "\U000E01E5",
    "\U000E01E6",
    "\U000E01E7",
    "\U000E01E8",
    "\U000E01E9",
    "\U000E01EA",
    "\U000E01EB",
    "\U000E01EC",
    "\U000E01ED",
    "\U000E01EE",
    "\U000E01EF",
]
# Every character whose general category is a Mark (first letter "M").
MARK_SET = set(
    filter(lambda char: unicodedata.category(char)[0] == "M",
           map(chr, range(sys.maxunicode + 1))))
print("len(UNICODE_NSM) = {}".format(len(UNICODE_NSM)))
print("len(MARK_SET) = {}".format(len(MARK_SET)))

filepath = "UnicodeData.txt"
with open(filepath) as f:
    # The whole file is read, but only its first 10000 characters are kept.
    text = f.read()[:10000]


def main():
    ground_truth = loop_count(text)
    functions = [  # (loop_count, 'loop_count'),
        # (generator_count, 'generator_count'),
Example #40
0
def category_count(text):
    """Count the characters of *text* that are not nonspacing marks ('Mn')."""
    total = 0
    for char in text:
        if unicodedata.category(char) != "Mn":
            total += 1
    return total
Example #41
0
import cloudscraper
from bs4 import BeautifulSoup
from bs4.element import Comment, Tag
from requests import Response, Session

from ..assets.user_agents import user_agents
from ..utils.ssl_no_verify import no_ssl_verification
from .exeptions import LNException

logger = logging.getLogger(__name__)


LINE_SEP = '<br>'

# Code points of every format (Cf) and control (Cc) character.
INVISIBLE_CHARS = [c for c in range(sys.maxunicode) if unicodedata.category(chr(c)) in {'Cf', 'Cc'}]
# Stored as a tuple rather than a bare itertools.chain object: a chain is a
# one-shot iterator, so it would already be exhausted (and appear empty)
# once NONPRINTABLE_MAPPING below has been built from it.
NONPRINTABLE = tuple(itertools.chain(range(0x00, 0x20), range(0x7f, 0xa0), INVISIBLE_CHARS))
# Table for str.translate() that deletes every non-printable code point.
NONPRINTABLE_MAPPING = {character: None for character in NONPRINTABLE}

MAX_CONCURRENT_REQUEST_PER_DOMAIN = 15
# One semaphore per host name, filled in by get_domain_semaphore().
REQUEST_SEMAPHORES: Dict[str, Semaphore] = {}

def get_domain_semaphore(url):
    """Return the semaphore associated with the host of *url*.

    The key is the URL's hostname, or the whole *url* when no hostname can
    be parsed from it.  A semaphore allowing
    MAX_CONCURRENT_REQUEST_PER_DOMAIN holders is created and remembered in
    REQUEST_SEMAPHORES the first time a key is seen.
    """
    host = urlparse(url).hostname or url
    try:
        return REQUEST_SEMAPHORES[host]
    except KeyError:
        semaphore = Semaphore(MAX_CONCURRENT_REQUEST_PER_DOMAIN)
        REQUEST_SEMAPHORES[host] = semaphore
        return semaphore


class Crawler(ABC):
    '''Blueprint for creating new crawlers'''
Example #42
0
 def remove_accents(self, data):
     """Return only the letters of *data* after NFKD decomposition.

     Accents are removed because combining marks are not letters; for the
     same reason digits, whitespace, punctuation and every other character
     whose category does not start with 'L' are dropped too.
     """
     decomposed = unicodedata.normalize('NFKD', data)
     letters = [
         char for char in decomposed
         if unicodedata.category(char)[0] == 'L'
     ]
     return ''.join(letters)
Example #43
0
def strip_accents(s):
    """Return *s* in NFD form with all ``Mn`` (nonspacing mark) characters removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
Example #44
0
def normalize_text(text,nlp):
    """Return the kept tokens of the lowercased *text*, joined by spaces.

    ``nlp.tokenizer`` is called on the lowercased text.  A token is kept
    when it is not a stop word, is alphabetic, is not a digit and is longer
    than one character.  Non-ASCII tokens have their ``Mn`` characters
    removed after NFD decomposition.  When nothing is kept the literal
    string ``"emptystring"`` is returned.
    """
    kept = []
    for tok in nlp.tokenizer(text.lower()):
        # Guard clauses mirror the original nested conditions, in order.
        if tok.is_stop:
            continue
        if not tok.is_alpha:
            continue
        if tok.is_digit or len(tok.text) == 1:
            continue
        if tok.is_ascii:
            kept.append(tok.text)
            continue
        decomposed = unicodedata.normalize('NFD', tok.text.lower())
        kept.append(''.join(
            ch for ch in decomposed if unicodedata.category(ch) != 'Mn'))
    if kept:
        return ' '.join(kept)
    return "emptystring"
Example #45
0
File: feed.py Project: kowh-ai/ckan
    def remove_control_characters(s):
        """Return *s* without characters whose category starts with ``C``.

        A falsy *s* (``None`` or the empty string) yields ``""``.
        """
        if not s:
            return ""

        kept = [ch for ch in s
                if not unicodedata.category(ch).startswith("C")]
        return "".join(kept)
import unicodedata
import sys

# Translation table mapping every code point with a non-zero canonical
# combining class to None.
cmb_chrs = {
    cp: None
    for cp in range(sys.maxunicode)
    if unicodedata.combining(chr(cp)) != 0
}

# ``s`` is defined outside this excerpt.
b = unicodedata.normalize('NFD', s)
print(b)

print(b.translate(cmb_chrs))

# Maps every Unicode decimal digit code point to the code point of the
# ASCII digit with the same value, for use with str.translate.
# The general category of decimal digits is spelled 'Nd';
# unicodedata.category() never returns 'ND', so comparing against that
# spelling left the table empty.  The range includes sys.maxunicode itself
# so that the whole code space is scanned.
digitmap = {
    c: ord('0') + unicodedata.digit(chr(c))
    for c in range(sys.maxunicode + 1)
    if unicodedata.category(chr(c)) == 'Nd'
}
print(len(digitmap))

# Arabic digits
x = '\u0661\u0662\u0663'
print(x.translate(digitmap))

# Show the input string (``s`` is defined outside this excerpt).
print(s)

# Decompose ``s`` into NFD form.
b = unicodedata.normalize('NFD', s)
# NOTE(review): the result of this encode/decode round trip is discarded;
# strings are immutable, so ``b`` is unchanged by this line -- confirm
# whether the ASCII-only result was meant to be printed or assigned.
b.encode('ascii', 'ignore').decode('ascii')
print(b)

# ``remap`` is defined outside this excerpt; presumably a str.translate
# table -- verify against its definition.
print(b.translate(remap))
# coding: utf-8
from __future__ import print_function
from __future__ import unicode_literals


from unidecode import unidecode
import unicodedata

# Every character that one of the 256 byte values decodes to under ISO-8859-2.
hun_characters = set(bytearray(range(256)).decode('iso-8859-2'))
# The subset whose Unicode general category starts with 'P' (punctuation).
hun_punctuation = set(
    ch for ch in hun_characters if unicodedata.category(ch)[0] == 'P'
)
# str.translate tables keyed by code point: the first deletes the
# punctuation characters, the second replaces each with a single space.
translate_remove_hun_punctuation = dict.fromkeys(map(ord, hun_punctuation))
translate_space_for_hun_punctuation = dict.fromkeys(
    map(ord, hun_punctuation), ' ')


def lower(words):
    """Return a tuple holding the lowercased form of each item in *words*."""
    lowered = []
    for word in words:
        lowered.append(word.lower())
    return tuple(lowered)


def remove_accents(words):
    """Return a tuple with each item of *words* passed through ``unidecode``.

    Each result is converted to the type of a plain string literal in this
    module.
    """
    text_type = type('')
    return tuple(text_type(unidecode(word)) for word in words)


def lower_without_accents(words):
    """Apply ``remove_accents`` to *words*, then ``lower`` to the result."""
    unaccented = remove_accents(words)
    return lower(unaccented)


def remove_punctuations(words):