Example #1
import charset_normalizer


def detect_encoding(main, file_path):
    text = b''

    with open(file_path, 'rb') as f:
        if main.settings_custom['files']['auto_detection_settings'][
                'number_lines_no_limit']:
            text = f.read()
        else:
            for i, line in enumerate(f):
                if i < main.settings_custom['files'][
                        'auto_detection_settings']['number_lines']:
                    text += line
                else:
                    break

    results = charset_normalizer.from_bytes(text)

    if results:
        encoding = results.best().encoding
    else:
        encoding = 'utf_8'

    # Test decodability
    if encoding != 'utf_8':
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                f.read()
        # Fall back to UTF-8 if fail
        except UnicodeDecodeError:
            encoding = 'utf_8'

    return encoding
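
A minimal sketch of how this detector might be exercised; `_FakeMain` is a hypothetical stand-in that mimics only the nested settings keys the function reads, not the host application's real settings object:

import tempfile

class _FakeMain:
    # hypothetical stand-in: only the keys detect_encoding actually reads
    settings_custom = {
        'files': {
            'auto_detection_settings': {
                'number_lines_no_limit': False,
                'number_lines': 100,
            },
        },
    }

with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as f:
    f.write('Grüße aus Köln, schöne Stadt am Rhein! '.encode('cp1252') * 8)

print(detect_encoding(_FakeMain(), f.name))  # e.g. 'cp1252'; 'utf_8' if detection fails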
Example #2
from charset_normalizer import from_bytes


def test_alphabet_property_undefined_range():
    payload = b'\xef\xbb\xbf\xf0\x9f\xa9\xb3'

    best_guess = from_bytes(payload).best()

    assert best_guess is not None, "Payload should have yielded a result; detection failed"
    assert best_guess.encoding == "utf_8", "UTF-8 payload wrongly detected"
    assert best_guess.alphabets == [], "This property should return an empty list in that edge case"
Example #3
from typing import Iterable, Tuple

from charset_normalizer import from_bytes


def _charset_guess_encoding(
    data: Iterable[bytes], default_encoding: str = "utf-8"
) -> Tuple[str, float]:
    for line in data:
        result = from_bytes(line).best()
        if result:
            return (result.encoding, result.coherence)
    return (default_encoding, 0.0)
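
Fed an iterator of byte chunks, the helper returns as soon as the first chunk yields a match; exact coherence values vary with input and library version:

lines = iter(['première ligne accentuée du fichier. '.encode('cp1252') * 4,
              b'never inspected'])
print(_charset_guess_encoding(lines))  # e.g. ('cp1252', <coherence>)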
Example #4
def predict_encoding(text: bytes,
                     default: Encoding = DEFAULT_ENCODING) -> Encoding:
    """Guess string encoding.

    Given a piece of text, apply character encoding detection to
    guess the appropriate encoding of the text.
    """
    result = from_bytes(text, explain=False)
    return tidy_result(result, default=default)
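
`Encoding`, `DEFAULT_ENCODING`, and `tidy_result` belong to the surrounding project; a plausible, purely illustrative set of stand-ins (defined before the function, so its annotations and defaults resolve) makes it callable:

from charset_normalizer import from_bytes

Encoding = str  # assumption: the project aliases Encoding to str
DEFAULT_ENCODING: Encoding = 'utf-8'

def tidy_result(result, default: Encoding = DEFAULT_ENCODING) -> Encoding:
    # pick the best match, falling back to the default when detection fails
    best = result.best()
    return best.encoding if best is not None else default

print(predict_encoding('Hola, ¿qué tal estás hoy? '.encode('cp1252') * 8))  # e.g. 'cp1252'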
Example #5
def detect_encoding(bytesobject):
    """"Read all input or first chunk and return a list of encodings"""
    # alternatives: https://github.com/scrapy/w3lib/blob/master/w3lib/encoding.py
    # unicode-test
    if isutf8(bytesobject):
        return ['utf-8']
    guesses = []
    # additional module
    if cchardet_detect is not None:
        cchardet_guess = cchardet_detect(bytesobject)['encoding']
        if cchardet_guess is not None:
            guesses.append(cchardet_guess.lower())
    # try charset_normalizer on first part, fallback on full document
    detection_results = from_bytes(bytesobject[:15000]) or from_bytes(bytesobject)
    # return alternatives
    if len(detection_results) > 0:
        guesses.extend([r.encoding for r in detection_results])
    # it cannot be utf-8 (tested above)
    return [g for g in guesses if g not in UNICODE_ALIASES]
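
`isutf8`, `cchardet_detect`, and `UNICODE_ALIASES` are project helpers; with rough stand-ins like these in scope (illustrations only, not the project's real definitions), the function can be tried on non-UTF-8 input:

from charset_normalizer import from_bytes

UNICODE_ALIASES = {'utf-8', 'utf_8'}  # assumption: spellings the project treats as UTF-8
cchardet_detect = None                # optional cchardet dependency treated as absent

def isutf8(data):
    # naive stand-in for the project's UTF-8 probe
    try:
        data.decode('utf-8')
    except UnicodeDecodeError:
        return False
    return True

print(detect_encoding('Zażółć gęślą jaźń, czyli polski tekst testowy. '.encode('cp1250')))
# e.g. ['cp1250', 'iso8859_2', ...]; the candidate list varies by library version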
Example #6
from charset_normalizer import from_bytes
from charset_normalizer.constant import TOO_BIG_SEQUENCE


def test_large_payload_u8_sig_basic_entry():
    payload = ('0' * TOO_BIG_SEQUENCE).encode("utf_8_sig")
    best_guess = from_bytes(payload).best()

    assert best_guess is not None, "Large U8 payload case detection completely failed"
    assert best_guess.encoding == "utf_8", "Large U8 payload case detection wrongly detected!"
    assert best_guess.bom is True, "SIG/BOM property should be True"
    assert len(best_guess.raw) == len(
        payload
    ), "Large payload should remain untouched when accessed through .raw"
Example #7
def apparent_encoding(self) -> typing.Optional[str]:
    """
    Return the encoding, as determined by `charset_normalizer`.
    """
    content = getattr(self, "_content", b"")
    if len(content) < 32:
        # charset_normalizer will issue warnings if we run it with
        # fewer bytes than this cutoff.
        return None
    match = charset_normalizer.from_bytes(self.content).best()
    return None if match is None else match.encoding
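
The 32-byte cutoff appears to match charset_normalizer's TOO_SMALL_SEQUENCE constant (32 in current releases). A standalone version of the same logic, for experimenting without a response object:

import typing

import charset_normalizer


def apparent_encoding_standalone(content: bytes) -> typing.Optional[str]:
    # same cutoff and detection as the method above, minus the object plumbing
    if len(content) < 32:
        return None
    match = charset_normalizer.from_bytes(content).best()
    return None if match is None else match.encoding


print(apparent_encoding_standalone(b'too short'))                            # None
print(apparent_encoding_standalone('こんにちは、世界。'.encode('utf_8') * 4))  # e.g. 'utf_8'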
Example #8
from charset_normalizer import from_bytes
from charset_normalizer.constant import TOO_BIG_SEQUENCE


def test_misleading_large_sequence():
    content = (("hello simple ascii " * TOO_BIG_SEQUENCE) +
               ('我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。')).encode('utf_8')

    guesses = from_bytes(content)

    assert len(guesses) > 0
    match = guesses.best()
    assert match is not None
    assert match.encoding == 'utf_8'
    assert str(match) is not None
Example #9
import logging

import charset_normalizer


def read(file_name: str, log: logging.Logger) -> str:
    """
    Read a file into a string.

    @param      file_name  The item to read
    @param      log        The log

    @return     The content of the item.
    """
    try:
        with open(file_name, "rb") as file:
            # NB: best() returns None when detection fails, making this the string 'None'
            return str(charset_normalizer.from_bytes(file.read()).best())
    except FileNotFoundError as err:
        log.error("%s -> %s", file_name, err)
        return ""
Example #10
from typing import BinaryIO

from charset_normalizer import CharsetMatches, from_bytes


def predict_file_encoding(fh: BinaryIO,
                          default: Encoding = DEFAULT_ENCODING) -> Encoding:
    """Guess encoding from a file handle."""
    start = fh.tell()
    result: CharsetMatches = CharsetMatches()

    while True:
        data = fh.read(1024 * 10)
        if not data:
            break

        result = from_bytes(data, explain=False)
        if result:
            break

    fh.seek(start)
    return tidy_result(result, default=default)
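
Reusing the stand-ins sketched after Example #4 (defined before this function so its defaults resolve), an in-memory file shows that the caller's read position is restored:

import io

fh = io.BytesIO('Grüße aus München, servus beinand! '.encode('cp1252') * 8)
print(predict_file_encoding(fh))  # e.g. 'cp1252'
print(fh.tell())                  # 0: the caller's position is untouched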
Example #11
def detect_encoding(content: ContentBytes) -> str:
    """
    We default to UTF-8 if the text is too short, because detection
    can return an arbitrary encoding and lead to confusing results,
    depending on the `charset_normalizer` version (< 2.0.5).

    >>> too_short = ']"foo"'
    >>> detected = from_bytes(too_short.encode()).best().encoding
    >>> detected
    'ascii'
    >>> too_short.encode().decode(detected)
    ']"foo"'
    """
    encoding = UTF8
    if len(content) > TOO_SMALL_SEQUENCE:
        match = from_bytes(bytes(content)).best()
        if match:
            encoding = match.encoding
    return encoding
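
`ContentBytes`, `UTF8`, and `TOO_SMALL_SEQUENCE` come from the surrounding project; with stand-ins such as these defined before the function, the short-circuit is easy to observe:

from charset_normalizer import from_bytes

ContentBytes = bytes     # assumption: a plain alias in the original project
UTF8 = 'utf-8'
TOO_SMALL_SEQUENCE = 32  # matches charset_normalizer.constant.TOO_SMALL_SEQUENCE

print(detect_encoding(b']"foo"'))  # 'utf-8': at or below the size threshold
print(detect_encoding('un texte accentué nettement plus long que le seuil. '.encode('cp1252')))
# e.g. 'cp1252'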
Example #12
def validate_utf8(file):
    base_name = basename(file)
    text = b''
    with open(file, "rb") as f:
        for i, line in enumerate(f):
            if i < 1000:
                text += line
            else:
                break
    results = charset_normalizer.from_bytes(text).best()
    # charset_normalizer reports Python codec names such as 'utf_8', not 'utf-8'
    if results is None or not (results.encoding == "utf_8" and results.coherence >= 0.99):
        warn(message=f"File {file} should be encoded in UTF-8", level=1)
        sys.exit(1)
    with open(file, "r", encoding="utf-8") as f:
        content = f.read()
    normalized_nfc_content = Text(content)
    if normalized_nfc_content != content:
        warn(message=f"File {base_name} should normalized to NFC",
             error_type="Format nfc-normalized-failed",
             file=base_name,
             level=1)
Example #13
import warnings
from typing import Tuple

from charset_normalizer import from_bytes


def decode_str(decrypted_text: bytes, encoding: str) -> Tuple[str, str]:
    """
    Detect the encoding using charset_normalizer. If detection finishes with a warning,
    a note suggesting to pass the encoding explicitly is added to the returned message.
    If an encoding is given, it is used directly.
    """
    msg = ''
    out = ''
    if not encoding:
        with warnings.catch_warnings(record=True) as e:
            charset_match = from_bytes(decrypted_text)
            if len(charset_match):
                out = str(charset_match[0])
                demisto.debug(
                    f"Decode decrypted text using {charset_match[0].encoding} encoding"
                )
            if e:
                msg = f'Note: encoding detection ended with warning: {e[0].message} Characters may be missing.' \
                      ' You can try running this command again and pass the encoding code as argument.\n'
    else:
        out = decrypted_text.decode(encoding)

    return out, msg
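
When an explicit encoding is supplied, the detection branch (and its demisto logging, a platform global here) is bypassed entirely:

out, msg = decode_str('déjà vu, déjà entendu'.encode('latin_1'), 'latin_1')
print(out)        # déjà vu, déjà entendu
print(repr(msg))  # ''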