def _unicode(content):
    """Decode ``content`` into a ``_str`` text object.

    The codec comes from ``self.encoding``; ``self`` is not a parameter
    here and is resolved from the surrounding scope. When that attribute
    is ``None`` the codec is taken from ``chardet.detect``. Decoding is
    requested with ``errors='replace'``.

    Falsy ``content`` short-circuits to ``_str('')``.
    """
    # Start from the charset already recorded on the enclosing object.
    codec = self.encoding

    if not content:
        return _str('')

    # Nothing was declared, so fall back to chardet's guess.
    if self.encoding is None:
        codec = chardet.detect(content)['encoding']

    try:
        return _str(content, codec, errors='replace')
    except (LookupError, TypeError):
        # LookupError: the codec name is unknown, e.g. a misspelled charset.
        # TypeError: the codec is None (no declared or detected encoding).
        # In both cases decode blindly, without naming a codec.
        return _str(content, errors='replace')
def auto_detect(text):
    """Guess the charset of ``text``.

    A quick filemagic check runs first. If its result mentions "UTF-8"
    the answer is ``"utf-8"``; otherwise chardet is consulted and its
    encoding name is returned lower-cased. When chardet reports no
    encoding, ``None`` is returned.
    """
    # Rapid first pass using filemagic.
    with magic.Magic() as detector:
        fingerprint = detector.id_buffer(text)

    # UTF-8 is recognised quickly by filemagic, so accept that verdict
    # straight away.
    if fingerprint is not None and "UTF-8" in fingerprint:
        return "utf-8"

    # Anything else goes through chardet, which is more precise but slower.
    guessed = chardet.detect(text)['encoding']
    return None if guessed is None else guessed.lower()
def _autodetect_encoding(binary_data):
    """Return the encoding chardet reports for ``binary_data``.

    The result is the value stored under the ``"encoding"`` key of the
    mapping returned by ``chardet.detect``.
    """
    detection = chardet.detect(binary_data)
    return detection["encoding"]
def _encoding(content):
    """Return the encoding chardet reports for ``content``.

    The result is the value stored under the ``'encoding'`` key of the
    mapping returned by ``chardet.detect``.
    """
    report = chardet.detect(content)
    return report['encoding']
def _autodetect_encoding(binary_data):
    """Look up the encoding that chardet detects in ``binary_data``.

    Runs ``chardet.detect`` on the input and hands back the entry found
    under its ``'encoding'`` key.
    """
    outcome = chardet.detect(binary_data)
    return outcome['encoding']
def guess_encoding(s):
    """Guess the encoding of ``s``, falling back to ``'utf-8'``.

    The chardet module is obtained through ``get_chardet_module()``. When
    that call returns a falsy value (presumably chardet is not installed;
    confirm against that helper), or when detection yields no encoding,
    ``'utf-8'`` is returned.
    """
    # Bind the name up front: without this, a falsy chardet module left
    # ``encoding`` unassigned and the return below could not fall back.
    encoding = None

    chardet = get_chardet_module()
    if chardet:
        encoding = chardet.detect(s)['encoding']

    return encoding or 'utf-8'