def clean_tokenize(self, input_text, accentmark, minimunlen, numeric, alpha,stopwords):
        """
        Tokenize a document and filter the resulting tokens.

        :param input_text: string to tokenize
        :param accentmark: when truthy, the text only has its combining
            characters removed before tokenizing; when falsy it is also
            NFKD-decomposed, lower-cased, passed through
            ``translate(punctuation)`` and every non-ASCII character is
            replaced by a space
        :param minimunlen: only tokens strictly longer than this are kept
        :param numeric: when truthy, tokens for which ``isnumeric()`` is
            true are dropped
        :param alpha: when truthy, tokens for which ``isalpha()`` is false
            are dropped
        :param stopwords: when truthy, tokens whose lower-cased form is in
            ``self.ALL_STOPWORDS`` are dropped
        :return: list of the tokens that passed every enabled filter
        """
        if accentmark:
            text = u"".join(ch for ch in input_text if not unicodedata.combining(ch))
            words = word_tokenize(text, language='spanish')
        else:
            text = input_text
            # Newlines become spaces; carriage returns, curly quotes and
            # full stops are removed outright.
            for old, new in (('\n', ' '), ('\r', ''), ('”', ''), ('“', ''), ('.', '')):
                text = text.replace(old, new)
            decomposed = unicodedata.normalize('NFKD', text)
            unaccented = u"".join(
                ch for ch in decomposed if not unicodedata.combining(ch)).lower()
            without_punct = unaccented.translate(punctuation)
            ascii_only = ''.join(ch if ord(ch) < 128 else ' ' for ch in without_punct)
            words = word_tokenize(str(ascii_only).lower(), language='spanish')

        kept = []
        for token in words:
            if len(token) <= minimunlen:
                continue
            if stopwords and token.lower() in self.ALL_STOPWORDS:
                continue
            if numeric and token.isnumeric():
                continue
            if alpha and not token.isalpha():
                continue
            kept.append(token)
        return kept
Esempio n. 2
0
    def preprocess_str(line: str, return_mapping: bool = False) -> Union[Tuple[str, List[int], List[int]], str]:
        """ Drop Unicode combining characters from a string.

        Before filtering, every ``''`` and every double backtick is replaced
        by a double quote followed by a space.

        Args:
            line: string to process
            return_mapping: whether to also return the index mappings

        Returns:
            The preprocessed string alone when ``return_mapping`` is false,
            otherwise ``(preprocessed, raw2preprocessed, preprocessed2raw)``.
            Both mappings have ``len(line) + 1`` entries (measured after the
            quote replacement) and are pre-filled with ``len(line)``; a
            dropped raw character maps to ``-1``.

        """
        line = line.replace("''", '" ').replace("``", '" ')
        if not return_mapping:
            return ''.join(ch for ch in line if not unicodedata.combining(ch))

        filler = len(line)
        raw_to_prep = [filler] * (filler + 1)
        prep_to_raw = [filler] * (filler + 1)
        kept = []
        for raw_idx, ch in enumerate(line):
            if unicodedata.combining(ch):
                raw_to_prep[raw_idx] = -1
                continue
            prep_idx = len(kept)
            kept.append(ch)
            raw_to_prep[raw_idx] = prep_idx
            prep_to_raw[prep_idx] = raw_idx
        return ''.join(kept), raw_to_prep, prep_to_raw
Esempio n. 3
0
 def clean_song_data(self, artist, title):
     """Normalise an artist/title pair for lyrics lookup.

     Both values are lower-cased, NFKD-decomposed and stripped of
     combining characters, run through the LYRICS_ARTIST_REPLACE /
     LYRICS_TITLE_REPLACE substitutions, and the title additionally has
     every LYRICS_TITLE_STRIP pattern removed. When
     ``self.ignore_brackets`` is set, parenthesised text is removed from
     the title as well.

     Returns an ``(artist, title)`` tuple with surrounding whitespace
     stripped.
     """
     # convert to lowercase
     artist = artist.lower()
     title = title.lower()

     # remove accents
     artist = unicodedata.normalize('NFKD', artist)
     artist = "".join([c for c in artist if not unicodedata.combining(c)])
     title = unicodedata.normalize('NFKD', title)
     title = "".join([c for c in title if not unicodedata.combining(c)])

     # Work on a copy: appending to the shared module-level list would make
     # it grow on every call and leak the bracket pattern into later calls.
     strip_patterns = list(LYRICS_TITLE_STRIP)
     if self.ignore_brackets:
         strip_patterns.append(r"\(.*\)")

     # replace ampersands and the like
     for exp in LYRICS_ARTIST_REPLACE:
         artist = re.sub(exp[0], exp[1], artist)
     for exp in LYRICS_TITLE_REPLACE:
         title = re.sub(exp[0], exp[1], title)

     # strip things like "(live at Somewhere)", "(acoustic)", etc
     for exp in strip_patterns:
         title = re.sub(exp, '', title)

     # trim leading/trailing whitespace
     title = title.strip()
     artist = artist.strip()

     return (artist, title)
Esempio n. 4
0
def shave_marks_latin(txt):
    """Remove diacritic marks that follow an ASCII Latin base character.

    The text is decomposed (NFD); combining marks that follow an ASCII
    letter are dropped, marks on any other base character are kept, and
    the result is recomposed (NFC).
    """
    # Use the qualified name like the rest of the block, so the function
    # does not rely on a separate ``from unicodedata import normalize``.
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    keepers = []
    for c in norm_txt:
        is_mark = unicodedata.combining(c)
        if is_mark and latin_base:
            continue  # diacritic on a Latin base character: drop it
        keepers.append(c)
        if not is_mark:
            # A non-combining character starts a new base character.
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)
Esempio n. 5
0
def shave_marks_latin(txt):
    """Strip diacritics from ASCII Latin letters, leaving other scripts intact."""
    decomposed = unicodedata.normalize('NFD', txt)
    kept = []
    after_latin = False
    for ch in decomposed:
        if unicodedata.combining(ch):
            # A mark survives only when its base character is not Latin.
            if not after_latin:
                kept.append(ch)
        else:
            # Every non-combining character is a new base character.
            kept.append(ch)
            after_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))
Esempio n. 6
0
def remove_initial_vowel(word):
    """Strip a leading run of vowels (and their combining marks) from ``word``.

    The word is NFKD-decomposed first. Characters are consumed from the
    front while they are in ``vowels`` or are combining characters. As soon
    as the consumed run, ignoring combining characters, is neither in
    ``vowels`` nor in ``diphthongs``, the text still remaining at that
    point (including the character just examined) is returned. An empty
    string is returned for empty input or when the whole word is consumed.
    """
    if not word:
        return u''
    remaining = unicodedata.normalize('NFKD', word)
    consumed = u''
    while True:
        head = remaining[0]
        if not (head in vowels or unicodedata.combining(head)):
            return remaining
        consumed += head
        bare = u''.join(ch for ch in consumed if not unicodedata.combining(ch))
        if bare and bare not in vowels and bare not in diphthongs:
            return remaining
        if len(remaining) == 1:
            return u''
        remaining = remaining[1:]
def shave_marks_latin(txt):
    """Remove diacritic marks from ASCII Latin base characters.

    The text is decomposed (NFD); a combining mark is dropped only when
    the most recent base character is an ASCII letter; the result is
    recomposed (NFC).
    """
    process_txt = unicodedata.normalize('NFD', txt)
    keepers = []
    latin_base = False
    for char in process_txt:
        is_mark = unicodedata.combining(char)
        # The only character ever discarded is a mark on a Latin base; the
        # extra elif branches of the earlier version could never run.
        if not (is_mark and latin_base):
            keepers.append(char)
        if not is_mark:
            latin_base = char in string.ascii_letters
    text = ''.join(keepers)
    return unicodedata.normalize('NFC', text)
Esempio n. 8
0
    def _format_for_latex(self, text):
        """ Function to sanitize text, so that it can be typeset by latex.
            This sanitation consists of three operations:
                1. Normalize the text to NFC.
                    This compresses diacritics where possible.
                2. Replacement of unknown unicode characters with a default.
                3. Replacement of non-typesettable character with their
                    latex counterpart or equivalent character.

            Combining characters that remain after normalization are never
            copied to the output themselves; they only influence how the
            character before them is rendered.

            Parameters
            ----------
            text : str
                The text to sanitize.

            Returns
            -------
            The text after sanitation.
        """
        correct_line = ''
        normalized_line = unicodedata.normalize('NFC', text)
        for idx, c in enumerate(normalized_line):
            try:
                # Leftover combining marks are skipped here; they are handled
                # via the look-ahead on the character that precedes them.
                if unicodedata.combining(c) != 0:
                    continue
                # Look ahead one character; on the last character this raises
                # IndexError and the except branch below takes over.
                next_char = normalized_line[idx+1]
                name = unicodedata.name(c, None)
                codepoint = ord(next_char)
                # The mapping is consulted with the *next* character's code
                # point first, and the resulting command is applied to c.
                if codepoint in special_char_mapping.keys():
                    latex_command = special_char_mapping[codepoint]
                    correct_line += self._build_latex_replacement(latex_command, c)
                # NOTE(review): this lookup uses the character itself as key,
                # whereas the one above uses an integer code point -- confirm
                # the mapping really holds both kinds of keys.
                elif c in special_char_mapping.keys():
                    latex_command = special_char_mapping[c]
                    correct_line += self._build_latex_replacement(latex_command, c)
                elif name is None:
                    # Character without a Unicode name: register 'latexsym' in
                    # the preamble and emit a placeholder box.
                    self._preamble['latexsym'] = ''
                    correct_line += '□'
                else:
                    correct_line += c
            except IndexError:
                # Reached for the last character (no look-ahead available).
                # NOTE(review): an IndexError raised inside
                # _build_latex_replacement would also land here, and this path
                # has no "unnamed character" placeholder -- confirm intended.
                if unicodedata.combining(c) != 0:
                    continue
                if c in special_char_mapping.keys():
                    latex_command = special_char_mapping[c]
                    correct_line += self._build_latex_replacement(latex_command, c)
                else:
                    correct_line += c

        return correct_line
Esempio n. 9
0
def strip_accents(text):
    """
    Return ``text`` NFD-decomposed with every combining character removed.
    """
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return ''.join(base_chars)
Esempio n. 10
0
def _char_block_width(char):
    """Estimate how many display cells the code point ``char`` occupies.

    ``char`` is an integer code point. Returns 2 for the listed East Asian
    and full-width ranges, 0 for combining characters and C0 control
    characters, -1 for backspace and delete, and 1 for everything else.
    """
    # Basic Latin, which is probably the most common case
    if 0x0021 <= char <= 0x007e:
        return 1
    # Chinese, Japanese, Korean (common)
    if 0x4e00 <= char <= 0x9fff:
        return 2
    # Hangul
    if 0xac00 <= char <= 0xd7af:
        return 2
    # Backspace and delete. Checked before the combining lookup: these code
    # points have combining class 0, so the order does not change the result.
    if char in (0x0008, 0x007f):
        return -1
    # Other control characters: the whole U+0000..U+001F range, not just its
    # two end points.
    if 0x0000 <= char <= 0x001f:
        return 0
    # Combining?
    if unicodedata.combining(uni_chr(char)):
        return 0
    # Hiragana and Katakana
    if 0x3040 <= char <= 0x309f or 0x30a0 <= char <= 0x30ff:
        return 2
    # Full-width Latin characters
    if 0xff01 <= char <= 0xff60:
        return 2
    # CJK punctuation
    if 0x3000 <= char <= 0x303e:
        return 2
    # Take a guess
    return 1
Esempio n. 11
0
def about(u, cp=None, name=None):
    """Format a one-line ``U+XXXX NAME (char)`` description of ``u``.

    ``cp`` may be given as a hexadecimal string; when omitted the code
    point is taken from ``u`` itself. ``name`` overrides the lookup in
    ``cp_names``; when no entry is found ``'No Name Found'`` is used.
    Combining characters are shown with a placeholder prefix in the
    parentheses.
    """
    global data_loaded

    ## load UnicodeData on first use
    if not data_loaded:
        load_data()
        data_loaded = True

    ## no cp given: grab it from the character; otherwise parse the hex string
    cp = ord(u) if cp is None else int(cp, 16)

    if name is None:
        ## the lookup key is the U+XXXX number: upper-case hex, at least
        ## four digits
        search_cp = '%04X' % (cp)
        name = cp_names[search_cp] if search_cp in cp_names else 'No Name Found'

    ## TODO: Replace this...
    if unicodedata.combining(u):
        template = 'U+%04X %s (\xe2\x97\x8c%s)'
    else:
        template = 'U+%04X %s (%s)'

    return template % (cp, name, u.encode('utf-8'))
Esempio n. 12
0
def codepoint(bot, trigger):
    """Reply with the code point and name of the requested character.

    The argument is either a single character or a hexadecimal code point
    (optionally prefixed with ``U+``). Missing or invalid input is answered
    with an error reply and ``module.NOLIMIT`` is returned.
    """
    arg = trigger.group(2)
    if not arg:
        bot.reply('What code point do you want me to look up?')
        return module.NOLIMIT

    # Prefer the stripped argument unless stripping leaves nothing.
    arg = arg.strip() or arg

    # Anything longer than one character is read as a hex code point.
    if len(arg) > 1:
        hex_digits = arg[2:] if arg.startswith('U+') else arg
        try:
            arg = unichr(int(hex_digits, 16))
        except (ValueError, TypeError):
            bot.reply("That's not a valid code point.")
            return module.NOLIMIT

    point, name = get_codepoint_name(arg)
    if name is None:
        name = '(No name found)'

    if unicodedata.combining(arg):
        template = 'U+%s %s (\xe2\x97\x8c%s)'
    else:
        template = 'U+%s %s (%s)'

    bot.say(template % (point, name, arg))
Esempio n. 13
0
File: models.py Progetto: Almad/ella
def remove_diacritical(text):
    """Return ``text`` NFKD-decomposed with all combining characters removed."""
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
def artist_search(results, media, lang, artist_name):
  """Add Vevo artists whose normalized name equals the normalized query.

  The query is NFKD-decomposed and stripped of combining characters
  before it is sent to the local Vevo search service. Each exact match
  (after normalization by the plexmusic agent) is added to ``results``
  with a score that starts at 100 and drops by one per match.
  """

  # Decompose (NFKD); byte strings are decoded as UTF-8 first.
  try:
    artist_name = unicodedata.normalize('NFKD', artist_name.decode('utf-8'))
  except UnicodeError:
    artist_name = unicodedata.normalize('NFKD', artist_name)

  # Strip diacritics.
  artist_name = u''.join(point for point in artist_name if not unicodedata.combining(point))

  search_url = 'http://127.0.0.1:32400/services/vevo/search?q=%s&artistsLimit=6&videosLimit=1' % (String.Quote(artist_name))
  json_obj = JSON.ObjectFromURL(search_url)

  score = 100
  normalized_artist_name = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs = dict(artist=artist_name))
  for artist in json_obj['artists']:

    # Require a perfect match after normalization to avoid false positives.
    normalized_artist_result = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs = dict(artist=artist['name']))
    Log('Sanity checking normalized artist: %s against Vevo result: %s' % (normalized_artist_name, normalized_artist_result))
    if normalized_artist_name != normalized_artist_result:
      continue
    results.add(SearchResult(
      id = artist['urlSafeName'],
      score = score
    ))
    score -= 1
	def __remove_acento(self, letra):
		"""Return ``letra`` without accents, leaving the cedilla untouched."""
		# The cedilla is a key on the keyboard, so it must be kept as-is.
		if letra in ('ç', 'Ç'):
			return letra

		decomposed = unicodedata.normalize('NFKD', letra)
		base_chars = (c for c in decomposed if not unicodedata.combining(c))
		return u''.join(base_chars)
Esempio n. 16
0
def make_sortable(text):
    """Return a lower-cased, accent-free UTF-8 sort key for ``text``.

    ``text`` is a UTF-8 encoded byte string. It is decoded *before* being
    lower-cased so that non-ASCII capitals (e.g. "É") are lowered too;
    lower-casing the raw bytes only affects ASCII letters. The text is then
    NFKD-decomposed, combining characters are dropped, and the result is
    re-encoded as UTF-8.
    """
    decoded = text.decode('utf-8').lower()
    normalized = unicodedata.normalize('NFKD', decoded)
    stripped = u''.join([c for c in normalized if not unicodedata.combining(c)])
    return stripped.encode('utf-8')
Esempio n. 17
0
def removeAccents(str):
    """Return the text NFKD-decomposed with combining characters removed.

    See: http://stackoverflow.com/questions/517923/
    """
    decomposed = unicodedata.normalize('NFKD', str)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
Esempio n. 18
0
def codepoint(bot, trigger):
    """Tell the channel the code point and name of the requested character.

    The argument is either a single character or a hexadecimal code point
    (optionally prefixed with ``U+``). Missing or invalid input is answered
    with an error reply and ``NOLIMIT`` is returned.
    """
    # group(2) is None when the command is given without an argument.
    arg = (trigger.group(2) or '').strip()
    if len(arg) == 0:
        bot.reply('What code point do you want me to look up?')
        return NOLIMIT
    elif len(arg) > 1:
        if arg.startswith('U+'):
            arg = arg[2:]
        try:
            arg = unichr(int(arg, 16))
        except (ValueError, TypeError, OverflowError):
            # Not hexadecimal, or outside the valid code point range.
            bot.reply("That's not a valid code point.")
            return NOLIMIT

    # Upper-case hex, zero-padded to at least four digits
    point = '%04X' % ord(u'' + arg)
    try:
        name = unicodedata.name(arg)
    except ValueError:
        # Send the message to the channel; merely returning the string
        # would never show it to the user.
        bot.say('U+%s (No name found)' % point)
        return

    if not unicodedata.combining(arg):
        template = 'U+%s %s (%s)'
    else:
        template = 'U+%s %s (\xe2\x97\x8c%s)'
    bot.say(template % (point, name, arg))
Esempio n. 19
0
def normalizeUnicode(text, encoding='humanascii'):
    """
    This method is used for normalization of unicode characters to the base ASCII
    letters. Output is ASCII encoded string (or char) with only ASCII letters,
    digits, punctuation and whitespace characters. Case is preserved.

    Byte-string input is decoded as UTF-8 and the result is returned UTF-8
    encoded; unicode input yields a unicode result.

    With ``encoding='identifier'`` characters that are not in ``allowedid``
    but are ASCII-encodable become ``_``, runs of underscores are collapsed,
    and a leading ``_`` is added when the result does not start with an
    ASCII letter (including when nothing is left at all).
    """
    if text == "":
        return ""

    unicodeinput = True
    if not isinstance(text, unicode):
        text = unicode(text, 'utf-8')
        unicodeinput = False

    res = ''
    global allowed, allowedid
    if encoding == 'humanascii' or encoding == 'identifier':
        enc = 'ascii'
    else:
        enc = encoding
    for ch in text:
        if (encoding == 'humanascii') and (ch in allowed):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        if (encoding == 'identifier') and (ch in allowedid):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        else:
            try:
                # Only probes whether ch is representable in the target encoding.
                ch.encode(enc,'strict')
                if encoding == 'identifier':
                    res += '_'
                else:
                    res += ch
            except UnicodeEncodeError:
                ordinal = ord(ch)
                if ordinal in mapping:
                    # try to apply custom mappings
                    res += mapping.get(ordinal)
                elif decomposition(ch) or len(normalize('NFKD',ch)) > 1:
                    normalized = filter(lambda i: not combining(i), normalize('NFKD', ch)).strip()
                    # normalized string may contain non-letter chars too. Remove them
                    # normalized string may result to  more than one char
                    if encoding == 'identifier':
                        res += ''.join([c for c in normalized if c in allowedid])
                    else:
                        res += ''.join([c for c in normalized if c in allowed])
                else:
                    # hex string instead of unknown char
                    res += "%x" % ordinal
    if encoding == 'identifier':
        res = res.strip('_').replace('_____','_').replace('____','_').replace('___','_').replace('__','_')
        # Guard against an empty result before looking at the first character;
        # indexing an empty string would raise IndexError.
        head = res.strip('_')
        if not head or head[0] not in string.ascii_letters:
            res = '_' + res
    if unicodeinput:
        return res
    else:
        return res.encode('utf-8')
Esempio n. 20
0
def remove_accents(input_str):
    """Best-effort replacement of accented latin letters by their base letter.

    The input is passed through ``ustr``, NFKD-decomposed, and every
    combining character is dropped. This changes the meaning of the text
    and only works for characters that decompose into base + mark."""
    decomposed = unicodedata.normalize('NFKD', ustr(input_str))
    base_chars = (ch for ch in decomposed if not unicodedata.combining(ch))
    return u''.join(base_chars)
Esempio n. 21
0
    def _text_chars(self, length, truncate, text, whole_words):
        """
        Truncate ``text`` once it holds more than ``length`` characters.

        Combining characters are not counted. When ``whole_words`` is set
        and the cut falls inside a word, the partial word is dropped, and
        trailing whitespace and punctuation are removed. The truncated text
        is passed through ``self.add_truncation_text``; text that fits is
        returned unchanged.
        """
        visible = 0
        for idx, ch in enumerate(text):
            if unicodedata.combining(ch):
                # Combining characters do not add to the visible length.
                continue
            visible += 1
            if visible <= length:
                continue

            # ch is the first character beyond the limit: cut just before it.
            cut = text[:idx]
            if whole_words:
                if not ch.isspace():
                    # The cut is not at whitespace, so fall back to the
                    # text before the last space.
                    cut = cut.rsplit(' ', 1)[0]
                # Remove trailing whitespace and punctuation
                cut = cut.rstrip(string.whitespace + string.punctuation)
            return self.add_truncation_text(cut, truncate)

        # No truncation was necessary.
        return text
Esempio n. 22
0
def remove_diacritics(s: str) -> str:
    """Return ``str(s)`` NFKD-decomposed with all combining characters removed."""
    decomposed = unicodedata.normalize('NFKD', str(s))
    kept = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return "".join(kept)
Esempio n. 23
0
    def process(self, data):
        """Feed raw terminal output into the emulator, one byte at a time.

        Each byte of ``data`` first goes through a small UTF-8 decoder
        (``self.utf8_buffer`` collects the bytes of a multi-byte sequence,
        ``self.utf8_len`` counts the continuation bytes still expected).
        Every completed character is then merged into the previous cell if
        it is a combining character, or dispatched to the window-title,
        special-character, escape-sequence or plain-write handling.
        ``self.update_callback`` is invoked once at the end when it is set.
        """
        for raw_ch in data:
            if self.utf8_len == 0:
                # Not inside a multi-byte sequence: classify the lead byte.
                if ord(raw_ch) < 128:
                    ch = unicode(raw_ch)
                elif ord(raw_ch) < 0xc0:
                    # Unexpected continuation character
                    ch = unichr(ord(raw_ch))
                elif ord(raw_ch) < 0xe0:
                    self.utf8_buffer = raw_ch
                    self.utf8_len = 1
                elif ord(raw_ch) < 0xf0:
                    self.utf8_buffer = raw_ch
                    self.utf8_len = 2
                elif ord(raw_ch) < 0xf8:
                    self.utf8_buffer = raw_ch
                    self.utf8_len = 3
                elif ord(raw_ch) < 0xfc:
                    self.utf8_buffer = raw_ch
                    self.utf8_len = 4
                elif ord(raw_ch) < 0xfe:
                    self.utf8_buffer = raw_ch
                    self.utf8_len = 5
                else:
                    # Invalid first byte
                    ch = unichr(ord(raw_ch))
            else:
                if (ord(raw_ch) & 0xc0) != 0x80:
                    # Invalid continuation character
                    ch = unichr(ord(raw_ch))
                    self.utf8_len = 0
                else:
                    self.utf8_buffer += raw_ch
                    self.utf8_len -= 1
                    if self.utf8_len == 0:
                        # Sequence complete; undecodable input is replaced
                        # rather than raising.
                        ch = unicode(self.utf8_buffer, 'utf8', 'replace')

            # Still waiting for continuation bytes: nothing to dispatch yet.
            if self.utf8_len > 0:
                continue

            # Check for combining characters
            try:
                if (unicodedata.combining(ch) != 0) and (self.cursor_col > 0):
                    # Combining character, so combine it with the previously written character
                    last_ch = self.screen[self.cursor_row][self.cursor_col - 1]
                    combined = unicodedata.normalize("NFC", last_ch + ch)
                    if len(combined) == 1:
                        # Successful combine, write out new character
                        self.screen[self.cursor_row][self.cursor_col -
                                                     1] = combined
                        self.dirty.add(self.cursor_row)
                        continue
            except TypeError:
                # Invalid character
                ch = u' '

            if self.window_title_mode:
                if ch == u'\007':  # Bell character ends window title
                    if self.title_callback and not self.ignored_window_title:
                        self.title_callback(self.unprocessed_input)
                    self.unprocessed_input = u""
                    self.window_title_mode = False
                else:
                    # Accumulate the title text until the bell arrives.
                    self.unprocessed_input += ch
            elif ch in self.special_chars:
                # Special characters are dispatched to their registered handler.
                self.special_chars[ch]()
            elif self.escape_mode:
                self.unprocessed_input += ch
                if len(self.unprocessed_input) == 1:
                    if (ch != u'[') and (ch != u']') and (
                            ch not in self.charset_escapes):
                        # Special type of escape sequence, no parameters
                        self.process_escape(self.unprocessed_input)
                        self.unprocessed_input = u""
                        self.escape_mode = False
                elif (len(self.unprocessed_input)
                      == 2) and (self.unprocessed_input[0]
                                 in self.charset_escapes):
                    if self.unprocessed_input == "(0":
                        # Select line drawing character set
                        self.line_draw = True
                    else:
                        # Other character set escape, just use UTF8
                        self.line_draw = False
                    self.unprocessed_input = u""
                    self.escape_mode = False
                elif (ch >= u'@') and (ch <= u'~'):
                    # Ending character found, process sequence
                    self.process_escape(self.unprocessed_input)
                    self.unprocessed_input = u""
                    self.escape_mode = False
                else:
                    # Parameter character, add to pending string
                    if self.unprocessed_input.startswith(u']') and (ch
                                                                    == u';'):
                        # Setting window title, wait for bell character to finish
                        self.start_window_title(self.unprocessed_input)
                        self.unprocessed_input = u""
                        self.escape_mode = False
                        self.window_title_mode = True
            elif ch == u'\033':
                # ESC: hand over to the escape handler.
                self.escape()
            else:
                self.write_char(ch)

        if self.update_callback:
            self.update_callback()
def sanitize_key(input_str):
    """Build a normalized lookup key from ``input_str``.

    Backslash-escaped characters are removed, the text is stripped and
    lower-cased, every occurrence of ``"the "`` is deleted, and finally
    accents are removed via NFKD decomposition.
    """
    cleaned = re.sub(r"\\.", "", input_str)
    cleaned = cleaned.strip().lower().replace("the ", "")
    # remove accents from unicode characters
    decomposed = unicodedata.normalize('NFKD', unicode(cleaned.lower()))
    return u"".join(ch for ch in decomposed if not unicodedata.combining(ch))
Esempio n. 25
0
def to_ascii(value):
    "Return str(value) NFKD-decomposed without combining characters; '' for None."
    if value is None:
        return ""
    decomposed = unicodedata.normalize("NFKD", str(value))
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
Esempio n. 26
0
def remove_diacritics(value):
    """Return string with diacritics removed. Value must be unicode.

    Falsy values (empty string, None) are returned unchanged.
    """
    if not value:
        return value
    decomposed = unicodedata.normalize('NFKD', value)
    return u''.join([ch for ch in decomposed if not unicodedata.combining(ch)])
def normalized_metadata_frequencies(loader_obj):
    """Write a ``normalized_<field>_frequencies`` file for every metadata field.

    For each field in ``loader_obj.metadata_fields`` the tab-separated
    ``<field>_frequencies`` file under ``loader_obj.destination +
    '/frequencies'`` is read, and one line ``<normalized word>\\t<word>`` is
    written per input line, where the normalized word is lower-cased,
    NFKD-decomposed and stripped of combining characters.

    Processing is best-effort: a field whose files cannot be read, parsed
    or written is skipped silently.
    """
    print('%s: Generating normalized metadata frequencies...' % time.ctime())
    frequencies = loader_obj.destination + '/frequencies'
    for field in loader_obj.metadata_fields:
        output_path = frequencies + "/normalized_" + field + "_frequencies"
        source_path = frequencies + "/" + field + "_frequencies"
        try:
            # Context managers close both files even when a line fails;
            # the output file is opened first, as before.
            with open(output_path, "w") as output, open(source_path) as source:
                for line in source:
                    word, _count = line.split("\t")
                    norm_word = word.decode('utf-8').lower()
                    norm_word = [i for i in unicodedata.normalize("NFKD", norm_word) if not unicodedata.combining(i)]
                    norm_word = ''.join(norm_word).encode('utf-8')
                    print(norm_word + "\t" + word, file=output)
        except Exception:
            # Deliberately best-effort, but KeyboardInterrupt and SystemExit
            # are no longer swallowed.
            pass
Esempio n. 28
0
def _strip_accents(s):
    """Return ``s`` NFD-decomposed with all combining characters removed."""
    decomposed = unicodedata.normalize('NFD', s)
    return u''.join([c for c in decomposed if not unicodedata.combining(c)])
Esempio n. 29
0
 def strip_accents(self, text):
     """Return ``text`` NFKD-decomposed without combining characters.

     Returns None when called with ``self`` set to None.
     """
     if self is None:
         return
     decomposed = unicodedata.normalize('NFKD', text)
     return ''.join(c for c in decomposed if not unicodedata.combining(c))
def remove_accents(input_str):
    """Return ``unicode(input_str)`` NFKD-decomposed without combining characters."""
    decomposed = unicodedata.normalize('NFKD', unicode(input_str))
    base_chars = (ch for ch in decomposed if not unicodedata.combining(ch))
    return u"".join(base_chars)
Esempio n. 31
0
File: screens.py Progetto: zed/pyte
    def draw(self, data):
        """Display decoded characters at the current cursor position and
        advances the cursor if :data:`~pyte.modes.DECAWM` is set.

        :param str data: text to display.

        .. versionchanged:: 0.5.0

           Character width is taken into account. Specifically, zero-width
           and unprintable characters do not affect screen state. Full-width
           characters are rendered into two consecutive character containers.
        """
        # Translate through whichever of the two character sets is active.
        data = data.translate(
            self.g1_charset if self.charset else self.g0_charset)

        for char in data:
            char_width = wcwidth(char)

            # If this was the last column in a line and auto wrap mode is
            # enabled, move the cursor to the beginning of the next line,
            # otherwise replace characters already displayed with newly
            # entered.
            if self.cursor.x == self.columns:
                if mo.DECAWM in self.mode:
                    self.dirty.add(self.cursor.y)
                    self.carriage_return()
                    self.linefeed()
                elif char_width > 0:
                    self.cursor.x -= char_width

            # If Insert mode is set, new characters move old characters to
            # the right, otherwise terminal is in Replace mode and new
            # characters replace old characters at cursor position.
            if mo.IRM in self.mode and char_width > 0:
                self.insert_characters(char_width)

            line = self.buffer[self.cursor.y]
            if char_width == 1:
                line[self.cursor.x] = self.cursor.attrs._replace(data=char)
            elif char_width == 2:
                # A two-cell character has a stub slot after it.
                line[self.cursor.x] = self.cursor.attrs._replace(data=char)
                if self.cursor.x + 1 < self.columns:
                    line[self.cursor.x + 1] = self.cursor.attrs \
                        ._replace(data="")
            elif char_width == 0 and unicodedata.combining(char):
                # A zero-cell character is combined with the previous
                # character either on this or preceding line.
                if self.cursor.x:
                    last = line[self.cursor.x - 1]
                    normalized = unicodedata.normalize("NFC", last.data + char)
                    line[self.cursor.x - 1] = last._replace(data=normalized)
                elif self.cursor.y:
                    # Cursor is in column 0: attach the mark to the last
                    # cell of the line above.
                    last = self.buffer[self.cursor.y - 1][self.columns - 1]
                    normalized = unicodedata.normalize("NFC", last.data + char)
                    self.buffer[self.cursor.y - 1][self.columns - 1] = \
                        last._replace(data=normalized)
            else:
                # NOTE: ``break`` leaves the loop, so the rest of ``data``
                # after this character is not drawn.
                break  # Unprintable character or doesn't advance the cursor.

            # .. note:: We can't use :meth:`cursor_forward()`, because that
            #           way, we'll never know when to linefeed.
            if char_width > 0:
                # Clamp to ``columns`` so the wrap check above can trigger
                # on the next character.
                self.cursor.x = min(self.cursor.x + char_width, self.columns)

        self.dirty.add(self.cursor.y)
Esempio n. 32
0
# NOTE(review): s1 and s2 are not defined in this excerpt; presumably they
# are two spellings of the same text (composed vs. decomposed) set up earlier.
print(s2)

# (b) Examine equality and length
print('s1 == s2', s1 == s2)
print(len(s1), len(s2))

# (c) Normalize and try the same experiment
import unicodedata

n_s1 = unicodedata.normalize('NFC', s1)
n_s2 = unicodedata.normalize('NFC', s2)

print('n_s1 == n_s2', n_s1 == n_s2)
print(len(n_s1), len(n_s2))

# (d) Example of normalizing to a decomposed form and stripping accents
t1 = unicodedata.normalize('NFD', s1)
print(''.join(c for c in t1 if not unicodedata.combining(c)))

s = '\ufb01'  # A single character

# Show the same character under the canonical (NFD) and the two
# compatibility (NFKD, NFKC) normalization forms.
print(unicodedata.normalize('NFD', s))
print(unicodedata.normalize('NFKD', s))
print(unicodedata.normalize('NFKC', s))

t1 = unicodedata.normalize('NFD', s1)
# combining() tests whether a character is a combining character
print(''.join(c for c in t1 if not unicodedata.combining(c)))


Esempio n. 33
0
def shave_marks(text):
    """Remove every diacritic mark from ``text`` and return it recomposed."""
    # NFD splits each character into its base character and combining marks.
    decomposed = unicodedata.normalize('NFD', text)
    # Keep only the base characters, i.e. filter out all combining marks.
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    # Recompose whatever can still be composed.
    return unicodedata.normalize('NFC', ''.join(base_chars))
Esempio n. 34
0
import re
import string
import sys
import unicodedata


# Characters a word may consist of: ASCII lower-case letters, digits and
# the apostrophe.
LOWER_LETTERS = set(string.ascii_lowercase)
ACCEPTED_LETTERS = set.union(LOWER_LETTERS, set("0123456789"), {"'"})


if __name__ == "__main__":
    # Each line read from standard input is cleaned independently.
    for text in sys.stdin:
        # decompose (NFKD) and drop combining marks, i.e. strip accents
        nfkd_form = unicodedata.normalize("NFKD", text.strip())
        nfkd_text = u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
        # lowercase text
        nfkd_text = nfkd_text.lower()
        # replace hyphens with spaces
        nfkd_text = nfkd_text.replace("-", " ")
        # change & -> and
        nfkd_text = nfkd_text.replace("&", " and ")
        # collapse runs of spaces
        nfkd_text = re.sub(" +", " ", nfkd_text).strip()
        # keep only words made up entirely of accepted characters;
        # otherwise skip the word
        # NOTE(review): cleaned_text is not used within the visible lines;
        # the snippet appears to end before the result is output.
        cleaned_text = []
        for word in nfkd_text.split(" "):
            word = word.lower()
            if len(set(word).intersection(ACCEPTED_LETTERS)) > 0:
                # add word if it contains acceptable tokens
                if len(set(word) - ACCEPTED_LETTERS) == 0:
                    cleaned_text.append(word)
Esempio n. 35
0
def _combining_class(cp: int) -> int:
    """Return the canonical combining class of code point ``cp``.

    Raises:
        ValueError: if the combining class is 0 and the code point has no
            character name in ``unicodedata``.
    """
    ch = chr(cp)
    v = unicodedata.combining(ch)
    # name() raises ValueError itself when called without a default, so the
    # explicit check below could never fire; pass a default to make the
    # intended error message reachable.
    if v == 0 and unicodedata.name(ch, None) is None:
        raise ValueError('Unknown character in unicodedata')
    return v
Esempio n. 36
0
 def filter(self, original, filename):
     """Return ``filename`` NFKD-decomposed without combining characters.

     ``original`` is not used by this implementation.
     """
     # NOTE(review): assuming this is a method body, ``filter`` below is the
     # builtin, not this method. Its result type depends on the Python
     # version (a string on Python 2, a lazy iterator on Python 3) --
     # confirm what callers expect.
     return filter(lambda s: not unicodedata.combining(s),
                   unicodedata.normalize('NFKD', filename))
Esempio n. 37
0
def to_unicode(text):
    """Return ``text`` NFKD-decomposed with all combining characters removed."""
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = (ch for ch in decomposed if not unicodedata.combining(ch))
    return u"".join(base_chars)
Esempio n. 38
0
def remove_accents(input_str):
    """Strip accents from unicode input; any other value is returned unchanged."""
    if not isinstance(input_str, unicode):
        return input_str
    decomposed = unicodedata.normalize('NFKD', input_str)
    return u"".join(ch for ch in decomposed if not unicodedata.combining(ch))
Esempio n. 39
0
def strip_accents(s):
    """Return ``s`` NFKD-decomposed with all combining characters removed."""
    decomposed = normalize('NFKD', s)
    return ''.join([ch for ch in decomposed if not combining(ch)])
Esempio n. 40
0
async def remove_special_chars(string):
    """Strip accents and special characters from ``string`` and lowercase it.

    Based on https://gist.github.com/boniattirodrigo/67429ada53b7337d2e79

    Args:
        string: text to clean.

    Returns:
        The lowercased text with combining marks removed and every character
        other than ASCII letters, digits, spaces and backslashes deleted.
    """
    nfkd = unicodedata.normalize('NFKD', string)
    no_accents = u"".join([c for c in nfkd if not unicodedata.combining(c)])
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequence ``\]``. The pattern value (a negated class that also keeps the
    # backslash) is unchanged.
    return re.sub(r'[^a-zA-Z0-9 \\]', '', no_accents).lower()
 def remove_accents(self, string):
     """Return ``string`` converted to text with its accents removed.

     The value is coerced to the text type, decomposed with NFKD, and every
     combining character is dropped.
     """
     # ``unicode`` only exists on Python 2; calling it unconditionally raises
     # NameError on Python 3, so fall back to ``str`` there.
     try:
         text_type = unicode  # noqa: F821
     except NameError:
         text_type = str
     nkfd_form = unicodedata.normalize('NFKD', text_type(string))
     return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
Esempio n. 42
0
    def getGameLog(self, last: int = 0) -> None:
        """Build ``self.gameLog`` from the player's basketball-reference game log.

        The player is located in the league totals table
        (``self.league.totalsHTML``) by last name, the link to the player's
        page is turned into a game-log URL, and the rows read from that page
        are converted to one dict per game. A final ``'Average'`` entry with
        per-game averages is appended.

        Args:
            last: number of trailing rows of the fetched slice to process.
                With the default ``0`` the slice ``rows[-0:]`` is the whole
                list, so every fetched row is processed.

        Returns:
            None. The result is stored in ``self.gameLog``. If the player is
            not found, an error is printed and ``self.gameLog`` is not touched.
        """
        # Find href to player page from league totals page
        html = self.league.totalsHTML

        rows = html.find('tbody').findAll('tr', {'class': 'full_table'})

        # Search key: second space-separated token of the name, lower-cased
        # and stripped of accents.
        lname = self.name.split(" ")[1].lower()
        lname = u"".join([
            c for c in unicodedata.normalize('NFKD', lname)
            if not unicodedata.combining(c)
        ])

        nameFound = False
        # Binary search on the normalized last name.
        # NOTE(review): assumes the table rows are ordered by that key -- confirm.
        while len(rows) != 0:
            mid = len(rows) // 2
            name = rows[mid].find('td').find('a').text
            currlname = name.split(" ")[1].lower()
            currlname = u"".join([
                c for c in unicodedata.normalize('NFKD', currlname)
                if not unicodedata.combining(c)
            ])

            if currlname == lname:
                # Several players may share a last name: scan the remaining
                # window for the exact full-name match.
                for row in rows:
                    if row.find('td').find('a').text == self.name:
                        nameLink = row.find('td', {
                            'data-stat': 'player'
                        }).find('a')
                        playerLink = nameLink['href'].replace('.html', "")
                        nameFound = True
                        break
                break
            elif currlname < lname:
                rows = rows[mid + 1:]
            else:
                rows = rows[:mid]

        if not nameFound:
            print("Error: Name not found")
            return None

        # Use href to find game log page
        gamesPage = f"https://www.basketball-reference.com{playerLink}/gamelog/2020"

        html = getSoup(gamesPage)

        # Only a fixed window of table rows is considered.
        rows = html.find('tbody').findAll('tr')[21:35]

        # add to games to game log
        avg = {'date': 'Average'}
        for cat in categories[1:]:
            avg[cat] = 0
        avg['fpts'] = 0
        gp = 0  # number of games actually played

        self.gameLog = []
        for row in rows[-last:]:
            try:
                date = row.find('td', {'data-stat': 'date_game'}).text
            except AttributeError:
                # Row without a date cell: skip it.
                continue

            game = {'date': date}

            fpts = 0
            played = True
            for cat in categories[1:]:
                cell = row.find('td', {'data-stat': cat})
                if cell is None:  # player is out, has no stats
                    played = False
                    game['mp'] = row.find('td', {'data-stat': 'reason'}).text
                    break  # move to next game
                else:
                    game[cat] = cell.text

                if game[cat] != "" and cat != 'mp':
                    game[cat] = float(
                        game[cat])  # cast all values to float if possible
                    avg[cat] += game[cat]
                elif cat != 'mp':
                    game[cat] = 0
                else:
                    # Minutes played are "MM:SS"; accumulate them as
                    # fractional minutes.
                    minutes = game[cat].split(":")
                    avg[cat] += int(minutes[0]) + (float(minutes[1]) / 60)

                if cat in self.league.scoring:  # if category is scored
                    fpts += self.league.scoring[cat] * game[
                        cat]  # add to total

            if played:
                game['fpts'] = fpts
                avg['fpts'] += game['fpts']
                gp += 1

            self.gameLog.append(game)

        # Turn the totals into per-game averages. When no game was played the
        # totals are all zero and are left as they are, instead of dividing by
        # zero.
        if gp:
            for key in avg:
                if key != 'date':
                    avg[key] /= gp

        self.gameLog.append(avg)
Esempio n. 43
0
def normalize(s):
    """Strip combining marks from the NFKD form of ``s``, then apply SEARCH_TRANS."""
    decomposed = unicodedata.normalize('NFKD', s)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    stripped = ''.join(base_chars)
    return stripped.translate(SEARCH_TRANS)
Esempio n. 44
0
def strip_combining_chars(text):
    """Remove combining characters from ``text`` without normalizing it first.

    On interpreters older than Python 3, ``str`` values are returned as-is.
    """
    running_py2 = sys.version_info < (3, 0)
    if running_py2 and isinstance(text, str):
        return text
    kept = [ch for ch in text if not unicodedata.combining(ch)]
    return u''.join(kept)
def removeAccents(value):
    """Return ``value`` converted to text with its accents removed.

    The value is coerced to the text type, decomposed with NFKD, and every
    combining character is dropped.
    """
    # ``unicode`` only exists on Python 2; calling it unconditionally raises
    # NameError on Python 3, so fall back to ``str`` there.
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str
    return u"".join([
        c for c in unicodedata.normalize('NFKD', text_type(value))
        if not unicodedata.combining(c)
    ])
Esempio n. 46
0
 def normalize(self, token):
     """Yield a single value: ``token`` lowercased with accents removed."""
     decomposed = unicodedata.normalize("NFKD", token.lower())
     base_chars = (ch for ch in decomposed if not unicodedata.combining(ch))
     yield u"".join(base_chars)
Esempio n. 47
0
def shave_marks(txt):
    """Remove all diacritic marks."""
    # Decompose every character into its base character and combining marks.
    decomposed = normalize('NFD', txt)
    # Filter out all combining characters.
    kept = [ch for ch in decomposed if not combining(ch)]
    # Recompose what is left.
    return normalize('NFC', ''.join(kept))
Esempio n. 48
0
def _char_ok(char):
    """Tell whether ``char`` is acceptable: either it is not a combining
    mark (class 0) or it is an iota subscript (class 240)."""
    return combining(char) in (0, 240)
Esempio n. 49
0
def normalize(input_str):
    """Join ``input_str`` into one string and strip its accents.

    The pieces are concatenated, decomposed with NFKD, and every combining
    character is dropped.
    """
    joined = u"".join(input_str)
    decomposed = unicodedata.normalize('NFKD', joined)
    return u"".join(ch for ch in decomposed if not unicodedata.combining(ch))
Esempio n. 50
0
# Apply the translation table `remap` to `s` and show the result.
# NOTE(review): `s` and `remap` are not defined in this excerpt; presumably
# they are set up earlier in the example.
a = s.translate(remap)
print(a)
'''
python is awesome
'''
'''
可以看到,类似\t和\f这样的空格符已经被重新映射成一个单独的空格,回车符\r已经完全被删除掉了.
可以利用这种得新映射的思想进一步构建出更加庞大的转换表.例如,我们把所有的Unicode组合字符都去掉:
'''

import unicodedata
import sys

# Translation table whose keys are the code points of all combining characters
# and whose values are None, so str.translate() deletes those characters.
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode)
                         if unicodedata.combining(chr(c)))

# Decompose `a` (NFD) so that accents become separate combining characters.
b = unicodedata.normalize('NFD', a)

print(b)

# Delete the combining characters, leaving only the base characters.
print(b.translate(cmb_chrs))
'''
python is awesome

python is awesome
'''
'''
在这个例子中,我们使用dict.fromkeys()方法构建一个将每个组合字符都映射为None的字典.
原始输入会通过unicodedata.normalize()方法转换为分离形式,然后再通过translate()方法删除所有的重音符号.我们也可以利用相似的技术来去掉其他类型的字符(例如控制字符).
下面来看另一个例子.这里有一张转换表将所有的unicode十进制数字映射为它们对应的ASCII版本:
Esempio n. 51
0
def remove_accents_lower(input_str):
    """Return ``input_str`` with accents removed, converted to lowercase."""
    base_chars = []
    for ch in unicodedata.normalize('NFKD', input_str):
        # Skip combining marks; keep everything else.
        if not unicodedata.combining(ch):
            base_chars.append(ch)
    without_accents = u"".join(base_chars)
    return without_accents.lower()