def clean_tokenize(self, input_text, accentmark, minimunlen, numeric, alpha, stopwords):
    """
    Clean a document, removing accents, punctuation and symbols.

    :param input_text: string to clean
    :param accentmark: keep accent marks if True, strip them if False
    :param minimunlen: minimum token length to keep
    :param numeric: drop purely numeric tokens if True
    :param alpha: drop non-alphabetic tokens if True
    :param stopwords: drop Spanish stop words if True
    :return: list of cleaned tokens without punctuation and stop words
    """
    final_text = []
    if not accentmark:
        text = (input_text.replace('\n', ' ').replace('\r', '')
                .replace('”', '').replace('“', '').replace('.', ''))
        nfkd_form = unicodedata.normalize('NFKD', text)
        unicode_text = u"".join([c for c in nfkd_form if not unicodedata.combining(c)]).lower()
        # `punctuation` is assumed to be a translation table defined elsewhere in the module.
        clean_text = unicode_text.translate(punctuation)
        clean_text = str(''.join([i if ord(i) < 128 else ' ' for i in clean_text])).lower()
        words = word_tokenize(clean_text, language='spanish')
    else:
        text = u"".join([c for c in input_text if not unicodedata.combining(c)])
        words = word_tokenize(text, language='spanish')
    for word in words:
        result = True
        if len(word) > minimunlen:
            if stopwords:
                if word.lower() in self.ALL_STOPWORDS:
                    result = False
            if result and numeric and word.isnumeric():
                result = False
            elif result and alpha and not word.isalpha():
                result = False
            if result:
                final_text.append(word)
    return final_text
def preprocess_str(line: str, return_mapping: bool = False) -> Union[Tuple[str, List[int], List[int]], str]:
    """
    Removes unicode combining characters (and normalizes quote artifacts) from a string.

    Args:
        line: string to process
        return_mapping: whether to also return the mappings between the raw
            and preprocessed strings

    Returns:
        preprocessed line, raw2preprocessed mapping, preprocessed2raw mapping
    """
    line = line.replace("''", '" ').replace("``", '" ')
    if not return_mapping:
        return ''.join(c for c in line if not unicodedata.combining(c))
    r2p = [len(line)] * (len(line) + 1)
    p2r = [len(line)] * (len(line) + 1)
    s = ''
    for i, c in enumerate(line):
        if unicodedata.combining(c):
            r2p[i] = -1
        else:
            s += c
            r2p[i] = len(s) - 1
            p2r[len(s) - 1] = i
    return s, r2p, p2r
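# A quick usage sketch of preprocess_str() above (hypothetical input; assumes
# `import unicodedata` and the typing imports are in scope). The two mappings
# let you translate offsets between the raw and preprocessed strings.
s, r2p, p2r = preprocess_str('cafe\u0301', return_mapping=True)
print(s)    # 'cafe' -- the combining acute accent was dropped
print(r2p)  # [0, 1, 2, 3, -1, 5] -- raw index 4 (the accent) maps to -1
print(p2r)  # [0, 1, 2, 3, 5, 5]  -- preprocessed index -> raw index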
def clean_song_data(self, artist, title):
    # convert to lowercase
    artist = artist.lower()
    title = title.lower()

    # remove accents
    artist = unicodedata.normalize('NFKD', artist)
    artist = "".join([c for c in artist if not unicodedata.combining(c)])
    title = unicodedata.normalize('NFKD', title)
    title = "".join([c for c in title if not unicodedata.combining(c)])

    if self.ignore_brackets:
        LYRICS_TITLE_STRIP.append(r"\(.*\)")

    # replace ampersands and the like
    for exp in LYRICS_ARTIST_REPLACE:
        artist = re.sub(exp[0], exp[1], artist)
    for exp in LYRICS_TITLE_REPLACE:
        title = re.sub(exp[0], exp[1], title)

    # strip things like "(live at Somewhere)", "(acoustic)", etc
    for exp in LYRICS_TITLE_STRIP:
        title = re.sub(exp, '', title)

    # compress spaces
    title = title.strip()
    artist = artist.strip()

    return (artist, title)
def shave_marks_latin(txt):
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    keepers = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:
            continue
        keepers.append(c)
        if not unicodedata.combining(c):
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)
def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters."""
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    keepers = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:
            continue  # Ignore diacritic on a Latin base char.
        keepers.append(c)
        # If it isn't a combining char, it's a new base char.
        if not unicodedata.combining(c):
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)
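# Usage sketch for shave_marks_latin() above (assumes `import string` and
# `import unicodedata` are in scope): diacritics are removed from Latin
# letters only, so Greek accent marks survive. The sample strings are
# illustrative.
print(shave_marks_latin('café au lait'))      # -> 'cafe au lait'
print(shave_marks_latin('Ζέφυρος, Zéfiro'))   # -> 'Ζέφυρος, Zefiro'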
def remove_initial_vowel(word):
    if not word:
        return u''
    word = unicodedata.normalize('NFKD', word)
    removed = u''
    while word[0] in vowels or unicodedata.combining(word[0]):
        removed += word[0]
        test = u''.join([c for c in removed if not unicodedata.combining(c)])
        if test and test not in vowels and test not in diphthongs:
            return word
        if len(word) == 1:
            return u''
        word = word[1:]
    return word
def shave_marks_latin(txt):
    process_txt = unicodedata.normalize('NFD', txt)
    keepers = []
    latin_base = False
    for char in process_txt:
        # Keep everything except combining marks that follow a Latin base char.
        # (The original elif branches were unreachable duplicates of this test.)
        if not unicodedata.combining(char) or not latin_base:
            keepers.append(char)
        if not unicodedata.combining(char):
            latin_base = char in string.ascii_letters
    text = ''.join(keepers)
    return unicodedata.normalize('NFC', text)
def _format_for_latex(self, text):
    """
    Sanitize text so that it can be typeset by LaTeX.

    This sanitation consists of three operations:
    1. Normalize the text to NFC. This composes diacritics where possible.
    2. Replace unknown unicode characters with a default.
    3. Replace non-typesettable characters with their LaTeX counterpart
       or an equivalent character.

    Parameters
    ----------
    text : str
        The text to sanitize.

    Returns
    -------
    The text after sanitation.
    """
    correct_line = ''
    normalized_line = unicodedata.normalize('NFC', text)
    for idx, c in enumerate(normalized_line):
        try:
            if unicodedata.combining(c) != 0:
                continue
            next_char = normalized_line[idx + 1]
            name = unicodedata.name(c, None)
            codepoint = ord(next_char)
            if codepoint in special_char_mapping:
                latex_command = special_char_mapping[codepoint]
                correct_line += self._build_latex_replacement(latex_command, c)
            elif c in special_char_mapping:
                latex_command = special_char_mapping[c]
                correct_line += self._build_latex_replacement(latex_command, c)
            elif name is None:
                self._preamble['latexsym'] = ''
                correct_line += '□'
            else:
                correct_line += c
        except IndexError:
            # Last character of the line: there is no next_char to inspect.
            if unicodedata.combining(c) != 0:
                continue
            if c in special_char_mapping:
                latex_command = special_char_mapping[c]
                correct_line += self._build_latex_replacement(latex_command, c)
            else:
                correct_line += c
    return correct_line
def strip_accents(text):
    """Remove accents (diacritics) from all characters."""
    return ''.join(char for char in unicodedata.normalize('NFD', text)
                   if not unicodedata.combining(char))
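# Minimal check of strip_accents() above (assumes `import unicodedata`);
# the sample words are illustrative.
assert strip_accents('Málaga') == 'Malaga'
assert strip_accents('naïve résumé') == 'naive resume'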
def _char_block_width(char):
    # Basic Latin, which is probably the most common case
    if 0x0021 <= char <= 0x007e:
        return 1
    # Chinese, Japanese, Korean (common)
    if 0x4e00 <= char <= 0x9fff:
        return 2
    # Hangul
    if 0xac00 <= char <= 0xd7af:
        return 2
    # Combining?
    if unicodedata.combining(uni_chr(char)):
        return 0
    # Hiragana and Katakana
    if 0x3040 <= char <= 0x309f or 0x30a0 <= char <= 0x30ff:
        return 2
    # Full-width Latin characters
    if 0xff01 <= char <= 0xff60:
        return 2
    # CJK punctuation
    if 0x3000 <= char <= 0x303e:
        return 2
    # Backspace and delete
    if char in (0x0008, 0x007f):
        return -1
    # Other control characters
    elif 0x0000 <= char <= 0x001f:
        return 0
    # Take a guess
    return 1
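# A possible way to use _char_block_width() above to measure the display
# width of a whole string. `uni_chr` is assumed to be a Python 2/3 compat
# alias for chr/unichr; on Python 3 it is simply chr. This helper is a
# sketch, not part of the original module.
import unicodedata

uni_chr = chr  # assumed alias on Python 3

def string_block_width(text):
    """Sum the per-character cell widths; combining marks add nothing."""
    return sum(_char_block_width(ord(c)) for c in text)

print(string_block_width('abc'))       # -> 3
print(string_block_width('漢字'))      # -> 4 (each CJK char occupies two cells)
print(string_block_width('e\u0301'))   # -> 1 (the combining acute is zero-width)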
def about(u, cp=None, name=None):
    global data_loaded
    # load UnicodeData
    if not data_loaded:
        load_data()
        data_loaded = True
    if cp is None:
        # cp is not provided, so we can safely grab the codepoint
        cp = ord(u)
    else:
        # codepoint is provided but is in hexadecimal
        cp = int(cp, 16)
    if name is None:
        name = 'No Name Found'
    # We need the U+XXXX numbers, which are hex numbers;
    # that is how the numbers are formatted in the UnicodeData file.
    search_cp = '%04X' % cp
    if search_cp in cp_names:
        name = cp_names[search_cp]
    # TODO: Replace this...
    if not unicodedata.combining(u):
        template = 'U+%04X %s (%s)'
    else:
        # Prefix combining characters with a dotted circle (U+25CC).
        template = 'U+%04X %s (\xe2\x97\x8c%s)'
    return template % (cp, name, u.encode('utf-8'))
def codepoint(bot, trigger):
    arg = trigger.group(2)
    if not arg:
        bot.reply('What code point do you want me to look up?')
        return module.NOLIMIT
    stripped = arg.strip()
    if len(stripped) > 0:
        arg = stripped
    if len(arg) > 1:
        if arg.startswith('U+'):
            arg = arg[2:]
        try:
            arg = unichr(int(arg, 16))
        except (ValueError, TypeError):
            bot.reply("That's not a valid code point.")
            return module.NOLIMIT
    point, name = get_codepoint_name(arg)
    if name is None:
        name = '(No name found)'
    template = 'U+%s %s (\xe2\x97\x8c%s)'
    if not unicodedata.combining(arg):
        template = 'U+%s %s (%s)'
    bot.say(template % (point, name, arg))
def remove_diacritical(text):
    line = unicodedata.normalize("NFKD", text)
    output = ""
    for c in line:
        if not unicodedata.combining(c):
            output += c
    return output
def artist_search(results, media, lang, artist_name):
    # Precompose.
    try:
        artist_name = unicodedata.normalize('NFKD', artist_name.decode('utf-8'))
    except UnicodeError:
        artist_name = unicodedata.normalize('NFKD', artist_name)

    # Strip diacritics.
    stripped = u''
    for point in artist_name:
        if not unicodedata.combining(point):
            stripped += point
    artist_name = stripped

    json_obj = JSON.ObjectFromURL('http://127.0.0.1:32400/services/vevo/search?q=%s&artistsLimit=6&videosLimit=1' % (String.Quote(artist_name)))

    score = 100
    normalized_artist_name = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs=dict(artist=artist_name))
    for artist in json_obj['artists']:
        # Require a perfect match after normalization to avoid false positives.
        normalized_artist_result = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs=dict(artist=artist['name']))
        Log('Sanity checking normalized artist: %s against Vevo result: %s' % (normalized_artist_name, normalized_artist_result))
        if normalized_artist_name == normalized_artist_result:
            results.add(SearchResult(
                id=artist['urlSafeName'],
                score=score
            ))
            score = score - 1
def __remove_acento(self, letra):
    if letra == 'ç' or letra == 'Ç':
        # The cedilla is on the keyboard, so it must be kept as typed.
        return letra
    nkfd_form = unicodedata.normalize('NFKD', letra)
    return u''.join([c for c in nkfd_form if not unicodedata.combining(c)])
def make_sortable(text):
    text = text.lower()
    text = text.decode('utf-8')
    normalized = unicodedata.normalize('NFKD', text)
    text = u''.join([c for c in normalized if not unicodedata.combining(c)])
    text = text.encode('utf-8')
    return text
def removeAccents(text):
    """Remove any form of UTF-8 accents.

    See: http://stackoverflow.com/questions/517923/
    """
    # Parameter renamed from `str` to avoid shadowing the builtin.
    nkfd_form = unicodedata.normalize('NFKD', text)
    return "".join([c for c in nkfd_form if not unicodedata.combining(c)])
def codepoint(bot, trigger):
    arg = trigger.group(2).strip()
    if len(arg) == 0:
        bot.reply('What code point do you want me to look up?')
        return NOLIMIT
    elif len(arg) > 1:
        if arg.startswith('U+'):
            arg = arg[2:]
        try:
            arg = unichr(int(arg, 16))
        except (ValueError, TypeError):
            bot.reply("That's not a valid code point.")
            return NOLIMIT
    # Get the hex value for the code point, and drop the 0x from the front
    point = str(hex(ord(u'' + arg)))[2:]
    # Make the hex 4 characters long with preceding 0s, and all upper case
    point = point.rjust(4, str('0')).upper()
    try:
        name = unicodedata.name(arg)
    except ValueError:
        return 'U+%s (No name found)' % point
    if not unicodedata.combining(arg):
        template = 'U+%s %s (%s)'
    else:
        template = 'U+%s %s (\xe2\x97\x8c%s)'
    bot.say(template % (point, name, arg))
def normalizeUnicode(text, encoding='humanascii'):
    """
    Normalize unicode characters to their base ASCII letters. The output is
    an ASCII-encoded string (or char) containing only ASCII letters, digits,
    punctuation and whitespace characters. Case is preserved.
    """
    if text == "":
        return ""

    unicodeinput = True
    if not isinstance(text, unicode):
        text = unicode(text, 'utf-8')
        unicodeinput = False

    res = ''
    global allowed, allowedid
    if encoding == 'humanascii' or encoding == 'identifier':
        enc = 'ascii'
    else:
        enc = encoding
    for ch in text:
        if (encoding == 'humanascii') and (ch in allowed):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        if (encoding == 'identifier') and (ch in allowedid):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        else:
            try:
                ch.encode(enc, 'strict')
                if encoding == 'identifier':
                    res += '_'
                else:
                    res += ch
            except UnicodeEncodeError:
                ordinal = ord(ch)
                if ordinal in mapping:
                    # try to apply custom mappings
                    res += mapping.get(ordinal)
                elif decomposition(ch) or len(normalize('NFKD', ch)) > 1:
                    normalized = filter(lambda i: not combining(i),
                                        normalize('NFKD', ch)).strip()
                    # The normalized string may contain non-letter chars
                    # and may be longer than one char; keep only the
                    # allowed characters.
                    if encoding == 'identifier':
                        res += ''.join([c for c in normalized if c in allowedid])
                    else:
                        res += ''.join([c for c in normalized if c in allowed])
                else:
                    # hex string instead of unknown char
                    res += "%x" % ordinal
    if encoding == 'identifier':
        res = res.strip('_').replace('_____', '_').replace('____', '_').replace('___', '_').replace('__', '_')
        if not res.strip('_')[0] in string.ascii_letters:
            res = '_' + res
    if unicodeinput:
        return res
    else:
        return res.encode('utf-8')
def remove_accents(input_str):
    """Suboptimal-but-better-than-nothing way to replace accented latin
    letters by an ASCII equivalent. Will obviously change the meaning of
    input_str and work only for some cases."""
    input_str = ustr(input_str)
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    return u''.join([c for c in nkfd_form if not unicodedata.combining(c)])
def _text_chars(self, length, truncate, text, whole_words):
    """Truncate a string after a certain number of chars."""
    s_len = 0
    end_index = None
    for i, char in enumerate(text):
        if unicodedata.combining(char):
            # Don't consider combining characters
            # as adding to the string length
            continue
        s_len += 1
        if end_index is None and s_len > length:
            end_index = i
        if s_len > length:
            truncated = text[:end_index or 0]
            if whole_words:
                if not char.isspace():
                    # The cut falls inside a word; back up to the
                    # previous whole word.
                    truncated = truncated.rsplit(' ', 1)[0]
                # Remove trailing whitespace and punctuation
                truncated = truncated.rstrip(
                    string.whitespace + string.punctuation
                )
            # Return the truncated string
            return self.add_truncation_text(truncated, truncate)
    # Return the original string since no truncation was necessary
    return text
def remove_diacritics(s: str) -> str:
    """Canonicalises and removes all diacritics from the given string"""
    return "".join(c for c in unicodedata.normalize('NFKD', str(s))
                   if not unicodedata.combining(c))
def process(self, data):
    for raw_ch in data:
        if self.utf8_len == 0:
            if ord(raw_ch) < 128:
                ch = unicode(raw_ch)
            elif ord(raw_ch) < 0xc0:
                # Unexpected continuation character
                ch = unichr(ord(raw_ch))
            elif ord(raw_ch) < 0xe0:
                self.utf8_buffer = raw_ch
                self.utf8_len = 1
            elif ord(raw_ch) < 0xf0:
                self.utf8_buffer = raw_ch
                self.utf8_len = 2
            elif ord(raw_ch) < 0xf8:
                self.utf8_buffer = raw_ch
                self.utf8_len = 3
            elif ord(raw_ch) < 0xfc:
                self.utf8_buffer = raw_ch
                self.utf8_len = 4
            elif ord(raw_ch) < 0xfe:
                self.utf8_buffer = raw_ch
                self.utf8_len = 5
            else:
                # Invalid first byte
                ch = unichr(ord(raw_ch))
        else:
            if (ord(raw_ch) & 0xc0) != 0x80:
                # Invalid continuation character
                ch = unichr(ord(raw_ch))
                self.utf8_len = 0
            else:
                self.utf8_buffer += raw_ch
                self.utf8_len -= 1
                if self.utf8_len == 0:
                    ch = unicode(self.utf8_buffer, 'utf8', 'replace')

        if self.utf8_len > 0:
            continue

        # Check for combining characters
        try:
            if (unicodedata.combining(ch) != 0) and (self.cursor_col > 0):
                # Combining character, so combine it with the previously
                # written character
                last_ch = self.screen[self.cursor_row][self.cursor_col - 1]
                combined = unicodedata.normalize("NFC", last_ch + ch)
                if len(combined) == 1:
                    # Successful combine, write out new character
                    self.screen[self.cursor_row][self.cursor_col - 1] = combined
                    self.dirty.add(self.cursor_row)
                    continue
        except TypeError:
            # Invalid character
            ch = u' '

        if self.window_title_mode:
            if ch == u'\007':
                # Bell character ends window title
                if self.title_callback and not self.ignored_window_title:
                    self.title_callback(self.unprocessed_input)
                self.unprocessed_input = u""
                self.window_title_mode = False
            else:
                self.unprocessed_input += ch
        elif ch in self.special_chars:
            self.special_chars[ch]()
        elif self.escape_mode:
            self.unprocessed_input += ch
            if len(self.unprocessed_input) == 1:
                if (ch != u'[') and (ch != u']') and (ch not in self.charset_escapes):
                    # Special type of escape sequence, no parameters
                    self.process_escape(self.unprocessed_input)
                    self.unprocessed_input = u""
                    self.escape_mode = False
            elif (len(self.unprocessed_input) == 2) and (self.unprocessed_input[0] in self.charset_escapes):
                if self.unprocessed_input == "(0":
                    # Select line drawing character set
                    self.line_draw = True
                else:
                    # Other character set escape, just use UTF8
                    self.line_draw = False
                self.unprocessed_input = u""
                self.escape_mode = False
            elif (ch >= u'@') and (ch <= u'~'):
                # Ending character found, process sequence
                self.process_escape(self.unprocessed_input)
                self.unprocessed_input = u""
                self.escape_mode = False
            else:
                # Parameter character, add to pending string
                if self.unprocessed_input.startswith(u']') and (ch == u';'):
                    # Setting window title, wait for bell character to finish
                    self.start_window_title(self.unprocessed_input)
                    self.unprocessed_input = u""
                    self.escape_mode = False
                    self.window_title_mode = True
        elif ch == u'\033':
            self.escape()
        else:
            self.write_char(ch)

    if self.update_callback:
        self.update_callback()
def sanitize_key(input_str):
    input_str = re.sub(r"\\.", "", input_str).strip().lower().replace("the ", "")
    # remove accents from unicode characters
    nkfd_form = unicodedata.normalize('NFKD', unicode(input_str.lower()))
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
def to_ascii(value):
    "Convert any non-ASCII character to its closest ASCII equivalent."
    if value is None:
        return ""
    value = unicodedata.normalize("NFKD", str(value))
    return "".join([c for c in value if not unicodedata.combining(c)])
def remove_diacritics(value):
    """Return string with diacritics removed. Value must be unicode."""
    if value:
        value = u''.join(char for char in unicodedata.normalize('NFKD', value)
                         if not unicodedata.combining(char))
    return value
def normalized_metadata_frequencies(loader_obj):
    print('%s: Generating normalized metadata frequencies...' % time.ctime())
    frequencies = loader_obj.destination + '/frequencies'
    for field in loader_obj.metadata_fields:
        try:
            output = open(frequencies + "/normalized_" + field + "_frequencies", "w")
            for line in open(frequencies + "/" + field + "_frequencies"):
                word, count = line.split("\t")
                norm_word = word.decode('utf-8').lower()
                norm_word = [i for i in unicodedata.normalize("NFKD", norm_word)
                             if not unicodedata.combining(i)]
                norm_word = ''.join(norm_word).encode('utf-8')
                print(norm_word + "\t" + word, file=output)
            output.close()
        except Exception:
            # Skip fields whose frequency file is missing or malformed.
            pass
def _strip_accents(s):
    return u''.join(c for c in unicodedata.normalize('NFD', s)
                    if not unicodedata.combining(c))
def strip_accents(self, text):
    # Guard against empty input (the original guard tested `self`, which
    # can never be None on an instance method).
    if not text:
        return text
    text = unicodedata.normalize('NFKD', text)
    return ''.join([c for c in text if not unicodedata.combining(c)])
def remove_accents(input_str):
    nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
def draw(self, data):
    """Display decoded characters at the current cursor position and
    advance the cursor if :data:`~pyte.modes.DECAWM` is set.

    :param str data: text to display.

    .. versionchanged:: 0.5.0

       Character width is taken into account. Specifically, zero-width
       and unprintable characters do not affect screen state. Full-width
       characters are rendered into two consecutive character containers.
    """
    data = data.translate(
        self.g1_charset if self.charset else self.g0_charset)

    for char in data:
        char_width = wcwidth(char)

        # If this was the last column in a line and auto wrap mode is
        # enabled, move the cursor to the beginning of the next line,
        # otherwise replace characters already displayed with newly
        # entered.
        if self.cursor.x == self.columns:
            if mo.DECAWM in self.mode:
                self.dirty.add(self.cursor.y)
                self.carriage_return()
                self.linefeed()
            elif char_width > 0:
                self.cursor.x -= char_width

        # If Insert mode is set, new characters move old characters to
        # the right, otherwise terminal is in Replace mode and new
        # characters replace old characters at cursor position.
        if mo.IRM in self.mode and char_width > 0:
            self.insert_characters(char_width)

        line = self.buffer[self.cursor.y]
        if char_width == 1:
            line[self.cursor.x] = self.cursor.attrs._replace(data=char)
        elif char_width == 2:
            # A two-cell character has a stub slot after it.
            line[self.cursor.x] = self.cursor.attrs._replace(data=char)
            if self.cursor.x + 1 < self.columns:
                line[self.cursor.x + 1] = self.cursor.attrs \
                    ._replace(data="")
        elif char_width == 0 and unicodedata.combining(char):
            # A zero-cell character is combined with the previous
            # character either on this or the preceding line.
            if self.cursor.x:
                last = line[self.cursor.x - 1]
                normalized = unicodedata.normalize("NFC", last.data + char)
                line[self.cursor.x - 1] = last._replace(data=normalized)
            elif self.cursor.y:
                last = self.buffer[self.cursor.y - 1][self.columns - 1]
                normalized = unicodedata.normalize("NFC", last.data + char)
                self.buffer[self.cursor.y - 1][self.columns - 1] = \
                    last._replace(data=normalized)
        else:
            break  # Unprintable character or doesn't advance the cursor.

        # .. note:: We can't use :meth:`cursor_forward()`, because that
        #    way, we'll never know when to linefeed.
        if char_width > 0:
            self.cursor.x = min(self.cursor.x + char_width, self.columns)

    self.dirty.add(self.cursor.y)
print(s2)

# (b) Examine equality and length
print('s1 == s2', s1 == s2)
print(len(s1), len(s2))

# (c) Normalize and try the same experiment
import unicodedata

n_s1 = unicodedata.normalize('NFC', s1)
n_s2 = unicodedata.normalize('NFC', s2)
print('n_s1 == n_s2', n_s1 == n_s2)
print(len(n_s1), len(n_s2))

# (d) Example of normalizing to a decomposed form and stripping accents;
# combining() tests whether a character is a combining character.
t1 = unicodedata.normalize('NFD', s1)
print(''.join(c for c in t1 if not unicodedata.combining(c)))

s = '\ufb01'   # A single character (the "fi" ligature)
print(unicodedata.normalize('NFD', s))
print(unicodedata.normalize('NFKD', s))
print(unicodedata.normalize('NFKC', s))
def shave_marks(text):
    # Decompose all characters into base characters and combining marks ('NFD').
    norm_text = unicodedata.normalize('NFD', text)
    # Filter out all combining marks.
    shaved = ''.join(c for c in norm_text if not unicodedata.combining(c))
    # Recompose all characters.
    return unicodedata.normalize('NFC', shaved)
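# Usage sketch for shave_marks() above (assumes `import unicodedata`); this
# strips every diacritic, regardless of the base character's script. The
# sample string is illustrative.
order = 'café au lait, açaí, caffè latte'
print(shave_marks(order))  # -> 'cafe au lait, acai, caffe latte'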
import re
import string
import sys
import unicodedata

LOWER_LETTERS = set(string.ascii_lowercase)
ACCEPTED_LETTERS = set.union(LOWER_LETTERS, set("0123456789"), {"'"})

if __name__ == "__main__":
    for text in sys.stdin:
        # keep only ASCII symbols
        nfkd_form = unicodedata.normalize("NFKD", text.strip())
        nfkd_text = u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
        # lowercase text
        nfkd_text = nfkd_text.lower()
        # remove hyphens
        nfkd_text = nfkd_text.replace("-", " ")
        # change & -> and
        nfkd_text = nfkd_text.replace("&", " and ")
        nfkd_text = re.sub(" +", " ", nfkd_text).strip()
        # Keep words that have at least one accepted symbol and contain only
        # accepted tokens; otherwise skip the word.
        cleaned_text = []
        for word in nfkd_text.split(" "):
            word = word.lower()
            if len(set(word).intersection(ACCEPTED_LETTERS)) > 0:
                # add the word if it contains only acceptable tokens
                if len(set(word) - ACCEPTED_LETTERS) == 0:
                    cleaned_text.append(word)
        print(" ".join(cleaned_text))  # presumed output step; the original snippet ends before it
def _combining_class(cp: int) -> int:
    v = unicodedata.combining(chr(cp))
    if v == 0:
        if not unicodedata.name(chr(cp)):
            raise ValueError('Unknown character in unicodedata')
    return v
def filter(self, original, filename):
    # Build the string eagerly so the result is a str on both Python 2 and 3
    # (the built-in filter() returns a lazy iterator on Python 3).
    return ''.join(c for c in unicodedata.normalize('NFKD', filename)
                   if not unicodedata.combining(c))
def to_unicode(text):
    nfkd_form = unicodedata.normalize('NFKD', text)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
def remove_accents(input_str):
    if isinstance(input_str, unicode):
        nfkd_form = unicodedata.normalize('NFKD', input_str)
        return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
    else:
        return input_str
def strip_accents(s):
    return ''.join(c for c in normalize('NFKD', s) if not combining(c))
async def remove_special_chars(string):
    """https://gist.github.com/boniattirodrigo/67429ada53b7337d2e79"""
    nfkd = unicodedata.normalize('NFKD', string)
    no_accents = u"".join([c for c in nfkd if not unicodedata.combining(c)])
    # Keep only letters, digits and spaces (the original pattern's escaping
    # was mangled and accidentally kept backslashes).
    return re.sub(r'[^a-zA-Z0-9 ]', '', no_accents).lower()
def remove_accents(self, string):
    nkfd_form = unicodedata.normalize('NFKD', unicode(string))
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
def getGameLog(self, last: int = 0) -> None:
    # Find href to player page from league totals page
    html = self.league.totalsHTML
    rows = html.find('tbody').findAll('tr', {'class': 'full_table'})
    lname = self.name.split(" ")[1].lower()
    lname = u"".join([
        c for c in unicodedata.normalize('NFKD', lname)
        if not unicodedata.combining(c)
    ])

    # Binary-search the alphabetically sorted rows by last name.
    nameFound = False
    while len(rows) != 0:
        mid = len(rows) // 2
        name = rows[mid].find('td').find('a').text
        currlname = name.split(" ")[1].lower()
        currlname = u"".join([
            c for c in unicodedata.normalize('NFKD', currlname)
            if not unicodedata.combining(c)
        ])
        if currlname == lname:
            for row in rows:
                if row.find('td').find('a').text == self.name:
                    nameLink = row.find('td', {'data-stat': 'player'}).find('a')
                    playerLink = nameLink['href'].replace('.html', "")
                    nameFound = True
                    break
            break
        elif currlname < lname:
            rows = rows[mid + 1:]
        else:
            rows = rows[:mid]
    if not nameFound:
        print("Error: Name not found")
        return None

    # Use href to find game log page
    gamesPage = f"https://www.basketball-reference.com{playerLink}/gamelog/2020"
    html = getSoup(gamesPage)
    rows = html.find('tbody').findAll('tr')[21:35]

    # Add games to the game log
    avg = {'date': 'Average'}
    for cat in categories[1:]:
        avg[cat] = 0
    avg['fpts'] = 0
    gp = 0
    self.gameLog = []
    for row in rows[-last:]:
        try:
            date = row.find('td', {'data-stat': 'date_game'}).text
        except AttributeError:
            continue
        game = {'date': date}
        fpts = 0
        played = True
        for cat in categories[1:]:
            cell = row.find('td', {'data-stat': cat})
            if cell is None:
                # Player is out and has no stats.
                played = False
                game['mp'] = row.find('td', {'data-stat': 'reason'}).text
                break  # move to next game
            else:
                game[cat] = cell.text
                if game[cat] != "" and cat != 'mp':
                    # Cast all values to float if possible.
                    game[cat] = float(game[cat])
                    avg[cat] += game[cat]
                elif cat != 'mp':
                    game[cat] = 0
                else:
                    minutes = game[cat].split(":")
                    avg[cat] += int(minutes[0]) + (float(minutes[1]) / 60)
                if cat in self.league.scoring:
                    # The category is scored: add it to the fantasy-point total.
                    fpts += self.league.scoring[cat] * game[cat]
        if played:
            game['fpts'] = fpts
            avg['fpts'] += game['fpts']
            gp += 1
        self.gameLog.append(game)
    for key in avg:
        if key != 'date':
            avg[key] /= gp
    self.gameLog.append(avg)
def normalize(s):
    return ''.join(c for c in unicodedata.normalize('NFKD', s)
                   if not unicodedata.combining(c)).translate(SEARCH_TRANS)
def strip_combining_chars(text):
    if isinstance(text, str) and sys.version_info < (3, 0):
        # Python 2 byte strings cannot carry combining characters.
        return text
    return u''.join([c for c in text if not unicodedata.combining(c)])
def removeAccents(value):
    return u"".join([
        c for c in unicodedata.normalize('NFKD', unicode(value))
        if not unicodedata.combining(c)
    ])
def normalize(self, token):
    nkfd = unicodedata.normalize("NFKD", token.lower())
    yield u"".join([c for c in nkfd if not unicodedata.combining(c)])
def shave_marks(txt):
    """Remove all diacritic marks."""
    # Decompose characters into base characters and combining marks.
    norm_txt = normalize('NFD', txt)
    # Filter out all combining marks.
    shaved = ''.join(c for c in norm_txt if not combining(c))
    # Recompose the characters.
    return normalize('NFC', shaved)
def _char_ok(char):
    """Confirm that the character is not a combining mark, or that it is
    an iota subscript."""
    cc = combining(char)
    return cc == 0 or cc == 240
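# For reference: combining class 240 is U+0345 COMBINING GREEK YPOGEGRAMMENI
# (the iota subscript), which is why _char_ok() above special-cases it.
# A quick check (assumes `from unicodedata import combining, normalize`):
print(combining('\u0345'))  # -> 240 (iota subscript: allowed)
print(combining('\u0301'))  # -> 230 (combining acute: rejected)
print([_char_ok(c) for c in normalize('NFD', '\u1fb3')])  # -> [True, True]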
def normalize(input_str):
    # join() also accepts a sequence of characters, so this flattens the
    # input into a single unicode string before normalizing.
    ascii_string = u"".join(input_str)
    nkfd_form = unicodedata.normalize('NFKD', ascii_string)
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
a = s.translate(remap)
print(a)
'''
python is awesome
'''
'''
As you can see, whitespace characters such as \t and \f have been remapped
to a single space, and the carriage return \r has been removed entirely.

You can take this remapping idea further and build much larger translation
tables. For example, let's remove all Unicode combining characters:
'''
import unicodedata
import sys

cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode)
                         if unicodedata.combining(chr(c)))
b = unicodedata.normalize('NFD', a)
print(b)
print(b.translate(cmb_chrs))
'''
python is awesome
python is awesome
'''
'''
In this example, dict.fromkeys() builds a dictionary that maps every
combining character to None. The original input is converted to a
decomposed form with unicodedata.normalize(), and then translate() deletes
all of the accent marks. A similar technique can be used to remove other
kinds of characters (control characters, for example).

Now for another example. Here is a translation table that maps all Unicode
decimal digit characters to their equivalent ASCII versions:
'''
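'''
A self-contained variant of the combining-character table above, using an
accented input string so the effect is actually visible (the sample string
is illustrative, not from the original text):
'''
import sys
import unicodedata

cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode)
                         if unicodedata.combining(chr(c)))
s = 'pýtĥöñ is awesome'
decomposed = unicodedata.normalize('NFD', s)
print(decomposed.translate(cmb_chrs))  # -> 'python is awesome'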
def remove_accents_lower(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)]).lower()