Exemple #1
0
    def detect_list_encoding(self, items, default=DEFAULT_ENCODING):
        """Guess one encoding for the byte strings found in ``items``.

        Elements that are not ``bytes`` are skipped. The remaining ones are
        fed, in order, to a ``chardet.UniversalDetector`` until the detector
        reports that it is done or the items run out. The detector's result
        is then passed to ``normalize_result`` together with ``default`` and
        that call's return value is returned.
        """
        sniffer = chardet.UniversalDetector()
        # Lazily filter so iteration over ``items`` still stops at the break.
        byte_chunks = (chunk for chunk in items if isinstance(chunk, bytes))
        for chunk in byte_chunks:
            sniffer.feed(chunk)
            if sniffer.done:
                break

        sniffer.close()
        return normalize_result(sniffer.result, default)
Exemple #2
0
    def read_file_decoded(self, file_path):
        """Read ``file_path`` as bytes and return its contents as ``str``.

        The encoding comes from ``self.result.encoding``. When that is
        ``None`` the file's bytes are run through ``chardet.detect`` and the
        detection result is converted to an encoding name by
        ``normalize_result(result, self.DEFAULT_ENCODING)``.

        Args:
            file_path: Path of the file to read.

        Returns:
            The decoded file contents.

        Raises:
            ProcessingException: If the bytes cannot be decoded with the
                chosen encoding. The underlying ``UnicodeDecodeError`` is
                attached as the exception's ``__cause__``.
        """
        encoding = self.result.encoding
        with open(file_path, 'rb') as fh:
            body = fh.read()

        if encoding is None:
            # Detection only needs the bytes, so the file is closed by now.
            result = chardet.detect(body)
            encoding = normalize_result(result, self.DEFAULT_ENCODING)

        # Keep the try body to the single call that can fail to decode.
        try:
            text = body.decode(encoding)
        except UnicodeDecodeError as ude:
            raise ProcessingException('Error decoding file as %s: %s' %
                                      (encoding, ude)) from ude

        if encoding != self.DEFAULT_ENCODING:
            log.info("Decoding [%s] as: %s", self.result, encoding)
        return text
Exemple #3
0
    def read_file_decoded(self, entity, file_path):
        """Read ``file_path`` and decode it with the entity's encodings.

        If ``entity`` has no ``encoding`` value, the file's bytes are run
        through ``chardet.detect``, the result is converted by
        ``normalize_result(result, self.DEFAULT_ENCODING)`` and stored on
        the entity via ``entity.set('encoding', ...)``. Every value yielded
        by ``entity.get('encoding')`` is then tried in order; the first one
        that decodes the bytes wins.

        Args:
            entity: Object exposing ``has``, ``get`` and ``set`` for the
                ``'encoding'`` property.
            file_path: Path of the file to read.

        Returns:
            The decoded file contents, or ``None`` when
            ``entity.get('encoding')`` yields no values at all.

        Raises:
            ProcessingException: If every candidate encoding fails to
                decode the bytes. The message names the last encoding tried
                and the exception is chained from its ``UnicodeDecodeError``.
        """
        with open(file_path, 'rb') as fh:
            body = fh.read()

        if not entity.has('encoding'):
            result = chardet.detect(body)
            encoding = normalize_result(result, self.DEFAULT_ENCODING)
            entity.set('encoding', encoding)

        failure = None
        for encoding in entity.get('encoding'):
            try:
                text = body.decode(encoding)
            except UnicodeDecodeError as ude:
                # Remember the failure and move on to the next candidate
                # instead of aborting on the first encoding that does not fit.
                failure = (encoding, ude)
                continue
            if encoding != self.DEFAULT_ENCODING:
                log.info("Decoding [%r] as: %s", entity, encoding)
            return text

        if failure is not None:
            encoding, ude = failure
            raise ProcessingException('Error decoding file as %s: %s' %
                                      (encoding, ude)) from ude
        # No encoding values were available, so nothing was decoded.
        return None