예제 #1
0
def detect_file_enc(file_name):
    """
    Guess the character encoding of the file at `file_name`.

    The file is opened once in binary mode. Its first bytes are compared
    against the UTF-8 byte-order mark, then the file is rewound and fed
    line by line to a ``UniversalDetector`` until the detector reports
    that it is done or the file is exhausted.

    :param file_name: Path of the file to inspect.
    :return: The detector's ``result`` dictionary. When the file starts
        with the UTF-8 BOM, its ``'encoding'`` entry is overwritten with
        ``'utf-8-sig'``.
    """
    utf_8_bom_flag = b'\xef\xbb\xbf'

    detector = det_.UniversalDetector()
    # A single ``with`` block guarantees the handle is released even if
    # reading or feeding raises part-way through.
    with open(file_name, 'rb') as stream:
        has_utf_8_bom = stream.read(len(utf_8_bom_flag)) == utf_8_bom_flag

        # Rewind so the detector sees the file from its very first byte,
        # exactly as it would with a freshly opened handle.
        stream.seek(0)

        # Iterate lazily instead of ``readlines()`` so that an early
        # ``done`` really avoids reading the rest of a large file.
        for line in stream:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    if has_utf_8_bom:
        detector.result['encoding'] = 'utf-8-sig'

    return detector.result
예제 #2
0
def detect(path):
    """
    Return the encoding code guessed for the file at `path`, or ``None``.

    A byte-order mark found by ``detect_bom`` wins outright. Otherwise the
    file is streamed line by line through chardet's ``UniversalDetector``
    and the detected name is passed through ``translate_code``.

    Raise :exc:`IOError` if reading fails.
    """
    encoding = detect_bom(path)
    if encoding is not None:
        return encoding

    # Imported lazily so chardet is only loaded when no BOM settled the matter.
    from chardet import universaldetector
    sniffer = universaldetector.UniversalDetector()
    with open(path, "rb") as stream:
        sniffer.reset()
        for chunk in stream:
            sniffer.feed(chunk)
            if sniffer.done:
                break
    sniffer.close()

    guess = sniffer.result["encoding"]
    if guess is not None:
        try:
            # chardet reports what appear to be IANA names; they have to be
            # mapped to Python's names, and some of them have no Python
            # counterpart at all.
            return translate_code(guess)
        except ValueError:
            pass
    return None
예제 #3
0
def detect(aBuf):
    """
    Run chardet's ``UniversalDetector`` over `aBuf` in a single pass.

    The whole buffer is fed at once, the detector is closed, and its
    ``result`` attribute is returned unchanged.
    """
    from chardet.universaldetector import UniversalDetector

    sniffer = UniversalDetector()
    sniffer.reset()
    sniffer.feed(aBuf)
    sniffer.close()
    return sniffer.result
예제 #4
0
def detect_encodings(data):
    """
    Collect every character encoding chardet considered for `data`.

    The detector is driven by hand (instead of via ``chardet.detect``) so
    that the confidence of each individual prober can be read back after
    detection, not only the winning result.

    :param data: An array of bytes to treat as text data
    :type  data: bytes

    :return: A dictionary mapping possible encodings to confidence levels
    :rtype:  dict
    """
    # Nothing to analyse: an empty payload is reported as plain ascii.
    if not data:
        return {'ascii': 1.0}

    sniffer = universaldetector.UniversalDetector()
    sniffer.reset()
    sniffer.feed(data)
    verdict = sniffer.close()

    # ``close()`` handed back nothing usable: report utf-8 instead.
    if not verdict:
        return {'utf-8': 1.0}

    candidates = {}
    candidates[verdict['encoding']] = verdict['confidence']

    # NOTE(review): ``_mCharSetProbers`` is a private chardet attribute;
    # its name and the prober getters depend on the installed chardet version.
    for candidate in sniffer._mCharSetProbers:
        if not candidate:
            continue
        name = candidate.get_charset_name()
        candidates[name] = candidate.get_confidence()

    return candidates
예제 #5
0
def detect_encoding(string):
    """
    Return chardet's encoding guess for `string`, or ``'gb18030'``.

    The guess is used only when the detector produced a result whose
    confidence is at least 0.6; in every other case the fixed fallback
    ``'gb18030'`` is returned.
    """
    sniffer = universaldetector.UniversalDetector()
    sniffer.reset()
    sniffer.feed(string)
    sniffer.close()

    verdict = sniffer.result
    trusted = bool(verdict) and verdict['confidence'] >= 0.6
    return verdict['encoding'] if trusted else 'gb18030'
예제 #6
0
def bigfilechardet(filename, block=256):
    """
    Detect the encoding of a file by feeding it to chardet in small chunks.

    The file is read `block` bytes at a time so that detection can stop
    as soon as the detector is done, without loading the whole file.

    :param filename: Path of the file to inspect.
    :param block: Number of bytes read per chunk.
    :return: The ``"encoding"`` entry of the detector's result.
    """
    sniffer = universaldetector.UniversalDetector()
    with open(filename, 'rb') as stream:
        while not sniffer.done:
            chunk = stream.read(block)
            if not chunk:
                # End of file reached before the detector made up its mind.
                break
            sniffer.feed(chunk)
    sniffer.close()
    return sniffer.result.get("encoding")
예제 #7
0
def run_cmd_with_cmd_str_and_decode_gracefully(cmd_str,
                                               print_flag=False,
                                               capture_stdout=True):
    """
    Run `cmd_str` through the shell and return ``(exit_status, output)``.

    When `capture_stdout` is true, the command's stdout and stderr are
    captured together, the encoding of the captured bytes is guessed with
    a ``UniversalDetector`` (falling back to the system encoding when no
    guess is available), and the decoded text is returned. Decoding never
    raises: if the chosen encoding is unknown or does not fit the data,
    undecodable bytes are replaced instead.

    When `capture_stdout` is false, the command writes straight to the
    inherited stdout and the returned output is ``None``.

    :param cmd_str: Command line handed to the shell. It is executed with
        ``shell=True``, so it must never be built from untrusted input.
    :param print_flag: Print the decoded output (capturing mode only).
    :param capture_stdout: Capture and decode the output instead of
        letting it go to the inherited stdout.
    :return: Tuple ``(exit_status, decoded_output_or_None)``.
    """
    if not capture_stdout:
        proc = subprocess.Popen(cmd_str, shell=True, stderr=subprocess.STDOUT)
        return proc.wait(), None

    proc = subprocess.Popen(cmd_str,
                            shell=True,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    # Nothing is ever written to the child's stdin; close it right away so
    # a command that reads stdin sees EOF instead of blocking forever.
    proc.stdin.close()

    encoding = None
    detector = detector_.UniversalDetector()
    captured = bytearray()
    try:
        # Iterate the pipe lazily rather than materialising every line
        # with ``readlines()`` first.
        for line in proc.stdout:
            captured.extend(line)

            if not encoding:
                detector.feed(line)
                if detector.done:
                    encoding = detector.result['encoding']
    finally:
        # Release the pipe's file descriptor even if feeding fails.
        proc.stdout.close()

    rtn_val = proc.wait()
    detector.close()

    if not encoding:
        encoding = (detector.result['encoding']
                    or system_util.get_system_encoding())

    rtn_str = ''
    if captured:
        try:
            rtn_str = captured.decode(encoding)
        except LookupError:
            # The detector named a codec Python does not know.
            rtn_str = captured.decode(system_util.get_system_encoding(),
                                      'replace')
        except UnicodeDecodeError:
            # The guess was wrong for part of the data; keep what decodes.
            rtn_str = captured.decode(encoding, 'replace')

    if print_flag:
        print(rtn_str)

    return rtn_val, rtn_str
예제 #8
0
def convertAsciiToUtf8(txt):
    """
    Return `txt` as a ``unicode`` object, guessing its encoding if needed.

    Values that are already ``unicode`` are returned untouched. Otherwise
    UTF-8 is tried first; if that fails, the encoding detected by chardet
    is tried, and ``splitTextToConvertToUtf8`` is the last resort (called
    with the detected encoding, or ``'utf-8'`` when nothing was detected).
    """
    if isinstance(txt, unicode):
        return txt

    try:
        return txt.decode('utf-8')
    except UnicodeDecodeError:
        pass

    # Not valid UTF-8: ask chardet what the bytes look like.
    sniffer = universaldetector.UniversalDetector()
    sniffer.feed(txt)
    sniffer.close()
    guessed = sniffer.result['encoding']

    if not guessed:
        return splitTextToConvertToUtf8(txt, 'utf-8')

    try:
        return txt.decode(guessed)
    except UnicodeDecodeError:
        return splitTextToConvertToUtf8(txt, guessed)