def detect_encoding(main, file_path):
    """Guess the text encoding of the file at ``file_path``.

    The bytes handed to ``charset_normalizer`` are either the whole file
    (when the ``number_lines_no_limit`` setting is truthy) or only its first
    ``number_lines`` lines. Both settings are read from
    ``main.settings_custom['files']['auto_detection_settings']``.

    Returns the name of the detected encoding. ``'utf_8'`` is returned when
    nothing could be detected, or when the file cannot actually be decoded
    with the detected encoding.
    """
    from itertools import islice

    # Look the settings up once instead of on every line of the file.
    detection_settings = main.settings_custom['files']['auto_detection_settings']

    with open(file_path, 'rb') as f:
        if detection_settings['number_lines_no_limit']:
            text = f.read()
        else:
            # islice + join collects the sample in linear time; repeated
            # ``bytes +=`` copies the accumulated buffer for every line.
            # A negative limit is treated as "no lines" (islice rejects it).
            line_limit = max(detection_settings['number_lines'], 0)
            text = b''.join(islice(f, line_limit))

    results = charset_normalizer.from_bytes(text)
    encoding = results.best().encoding if results else 'utf_8'

    # The guess may be based on a sample only, so verify that the complete
    # file really decodes with it before trusting it.
    if encoding != 'utf_8':
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                f.read()
        except UnicodeDecodeError:
            # Fall back to UTF-8 if the detected encoding does not fit.
            encoding = 'utf_8'

    return encoding
def test_alphabet_property_undefined_range():
    """A UTF-8 BOM followed by one four-byte sequence must be detected as
    UTF-8 and expose an empty ``alphabets`` list."""
    utf8_bom = b'\xef\xbb\xbf'
    four_byte_sequence = b'\xf0\x9f\xa9\xb3'

    match = from_bytes(utf8_bom + four_byte_sequence).best()

    assert match is not None, "Payload should have given something, detection failure"
    assert match.encoding == "utf_8", "UTF-8 payload wrongly detected"
    assert match.alphabets == [], "This property in that edge case, should return a empty list"
def _charset_guess_encoding(
    data: Iterable[bytes], default_encoding: str = "utf-8"
) -> Tuple[str, float]:
    """Return ``(encoding, coherence)`` for the first chunk of ``data`` that
    ``from_bytes`` can find a best match for.

    Chunks are examined lazily and in order; nothing after the first
    successful chunk is consumed. When no chunk yields a match the result is
    ``(default_encoding, 0.0)``.
    """
    best_per_chunk = (from_bytes(chunk).best() for chunk in data)
    winner = next((match for match in best_per_chunk if match), None)

    if winner is None:
        return (default_encoding, 0.0)
    return (winner.encoding, winner.coherence)
def predict_encoding(text: bytes, default: Encoding = DEFAULT_ENCODING) -> Encoding:
    """Guess the character encoding of a byte string.

    Runs ``from_bytes`` over ``text`` (with ``explain`` switched off) and
    returns whatever ``tidy_result`` makes of the detection result, passing
    ``default`` through to it.
    """
    return tidy_result(from_bytes(text, explain=False), default=default)
def detect_encoding(bytesobject):
    """Read all input or first chunk and return a list of candidate encodings.

    Input that passes ``isutf8`` short-circuits to ``['utf-8']``. Otherwise
    the candidates are, in order: the lower-cased cchardet guess (when that
    module is available and it produced one), followed by every encoding
    reported by charset_normalizer. Names listed in ``UNICODE_ALIASES`` are
    removed from the result.
    """
    # alternatives: https://github.com/scrapy/w3lib/blob/master/w3lib/encoding.py
    # cheap unicode test first
    if isutf8(bytesobject):
        return ['utf-8']

    candidates = []

    # optional additional module
    if cchardet_detect is not None:
        detected = cchardet_detect(bytesobject)['encoding']
        if detected is not None:
            candidates.append(detected.lower())

    # charset_normalizer: try the first part, fall back on the full document
    matches = from_bytes(bytesobject[:15000])
    if not matches:
        matches = from_bytes(bytesobject)

    # collect every alternative it proposes
    for match in matches:
        candidates.append(match.encoding)

    # UTF-8 was already ruled out by the test at the top
    filtered = []
    for name in candidates:
        if name not in UNICODE_ALIASES:
            filtered.append(name)
    return filtered
def test_large_payload_u8_sig_basic_entry():
    """A ``TOO_BIG_SEQUENCE``-character payload encoded as ``utf_8_sig`` must
    be detected as UTF-8 with the BOM flag set, and ``.raw`` must keep the
    payload's full length."""
    digits = '0' * TOO_BIG_SEQUENCE
    raw_bytes = digits.encode("utf_8_sig")

    match = from_bytes(raw_bytes).best()

    assert match is not None, "Large U8 payload case detection completely failed"
    assert match.encoding == "utf_8", "Large U8 payload case detection wrongly detected!"
    assert match.bom is True, "SIG/BOM property should be True"

    expected_size = len(raw_bytes)
    assert len(match.raw) == expected_size, "Large payload should remain untouched when accessed through .raw"
def apparent_encoding(self) -> typing.Optional[str]:
    """
    Return the encoding, as determined by `charset_normalizer`.

    ``None`` is returned when fewer than 32 bytes of content are available
    or when no match is found.
    """
    # charset_normalizer will issue warnings if we run it with fewer bytes
    # than this cutoff, so do not even try.
    if len(getattr(self, "_content", b"")) < 32:
        return None

    best = charset_normalizer.from_bytes(self.content).best()
    if best is None:
        return None
    return best.encoding
def test_misleading_large_sequence():
    """A long run of plain ASCII followed by a short non-ASCII tail must
    still be detected as UTF-8."""
    ascii_prefix = "hello simple ascii " * TOO_BIG_SEQUENCE
    non_ascii_tail = '我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。'
    payload = (ascii_prefix + non_ascii_tail).encode('utf_8')

    matches = from_bytes(payload)
    assert len(matches) > 0

    best = matches.best()
    assert best is not None
    assert best.encoding == 'utf_8'
    assert str(best) is not None
def read(file_name: str, log: logging.Logger) -> str:
    """
    Read a file into a string, detecting its character encoding.

    @param file_name The item to read
    @param log The log
    @return The decoded content of the item, or an empty string when the
            file does not exist or its encoding cannot be detected (both
            cases are reported through the log).
    """
    try:
        with open(file_name, "rb") as file:
            raw = file.read()
    except FileNotFoundError as err:
        log.error("%s -> %s", file_name, err)
        return ""

    best = charset_normalizer.from_bytes(raw).best()
    if best is None:
        # best() yields None when no encoding fits; str(None) would
        # otherwise hand the literal text "None" back as file content.
        log.error("%s -> %s", file_name, "unable to detect the character encoding")
        return ""
    return str(best)
def predict_file_encoding(fh: BinaryIO, default: Encoding = DEFAULT_ENCODING) -> Encoding:
    """Guess encoding from a file handle.

    The handle is read in 10 KiB chunks from its current position; each
    chunk is passed to ``from_bytes`` on its own, and reading stops at the
    first chunk that produces a non-empty result or at end of data. The
    handle is then moved back to the position it had on entry and the
    outcome is handed to ``tidy_result`` together with ``default``.
    """
    chunk_size = 1024 * 10

    origin = fh.tell()
    matches: CharsetMatches = CharsetMatches()

    chunk = fh.read(chunk_size)
    while chunk:
        matches = from_bytes(chunk, explain=False)
        if matches:
            break
        chunk = fh.read(chunk_size)

    fh.seek(origin)
    return tidy_result(matches, default=default)
def detect_encoding(content: ContentBytes) -> str:
    """
    Guess the encoding of ``content``, defaulting to UTF-8.

    Detection is only attempted when the content is longer than
    ``TOO_SMALL_SEQUENCE``: on shorter text it can return a random encoding,
    leading to confusing results given the `charset_normalizer`
    version (< 2.0.5). UTF-8 is also returned when no match is found.

    >>> too_short = ']"foo"'
    >>> detected = from_bytes(too_short.encode()).best().encoding
    >>> detected
    'ascii'
    >>> too_short.encode().decode(detected)
    ']"foo"'
    """
    if len(content) <= TOO_SMALL_SEQUENCE:
        return UTF8

    best = from_bytes(bytes(content)).best()
    return best.encoding if best else UTF8
def validate_utf8(file):
    """Check that ``file`` is UTF-8 encoded and report non-normalized text.

    The first 1000 lines are passed to ``charset_normalizer``. Unless the
    best match is UTF-8 (or plain ASCII, which is a strict subset of UTF-8)
    with a coherence of at least 0.99, a warning is emitted and the process
    exits with status 1.

    The file is then decoded as UTF-8 and compared with ``Text(content)``;
    a warning is emitted when the two differ.
    NOTE(review): ``Text`` is presumably an NFC-normalizing wrapper, going
    by the variable name and the warning text; confirm against its definition.
    """
    from itertools import islice

    base_name = basename(file)

    with open(file, "rb") as f:
        # Linear-time sample of at most 1000 lines (no repeated bytes +=).
        text = b"".join(islice(f, 1000))

    best = charset_normalizer.from_bytes(text).best()

    # charset_normalizer reports Python codec names ("utf_8", "ascii"),
    # never the hyphenated "utf-8"; normalise the spelling before comparing.
    # best is None when no encoding fits at all.
    detected = "" if best is None else best.encoding.lower().replace("-", "_")
    if best is None or detected not in ("utf_8", "ascii") or best.coherence < 0.99:
        warn(message=f"File {file} should encoding with UTF-8", level=1)
        sys.exit(1)

    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, which is not UTF-8 everywhere.
    with open(file, "r", encoding="utf-8") as f:
        content = f.read()

    normalized_nfc_content = Text(content)
    if normalized_nfc_content != content:
        warn(
            message=f"File {base_name} should normalized to NFC",
            error_type="Format nfc-normalized-failed",
            file=base_name,
            level=1,
        )
def decode_str(decrypted_text: bytes, encoding: str) -> Tuple[str, str]:
    """
    Decode ``decrypted_text`` and return ``(decoded_text, message)``.

    If ``encoding`` is given it is used directly and the message is empty.
    Otherwise the encoding is detected with ``from_bytes`` and the first
    match is used; the decoded text stays empty when there is no match. Any
    warning raised during detection is reported in the message, which
    suggests re-running with an explicit encoding.
    """
    if encoding:
        return decrypted_text.decode(encoding), ''

    decoded = ''
    note = ''
    with warnings.catch_warnings(record=True) as caught:
        matches = from_bytes(decrypted_text)
        if len(matches):
            first_match = matches[0]
            decoded = str(first_match)
            demisto.debug(f"Decode decrypted text using {first_match.encoding} encoding")
        if caught:
            note = f'Note: encoding detection ended with warning: {caught[0].message} Characters may be missing.' \
                   ' You can try running this command again and pass the encoding code as argument.\n'
    return decoded, note