def detect_list_encoding(self, items, default=DEFAULT_ENCODING):
    """Guess a shared encoding for the byte strings found in ``items``.

    Every ``bytes`` element is fed, in order, to a single
    ``chardet.UniversalDetector``; elements of any other type are skipped.
    Feeding stops early once the detector reports it is done.

    :param items: iterable of values, of which only ``bytes`` are inspected.
    :param default: passed through to ``normalize_result`` as its default.
    :return: whatever ``normalize_result`` yields for the detector result.
    """
    sniffer = chardet.UniversalDetector()
    for chunk in items:
        if isinstance(chunk, bytes):
            sniffer.feed(chunk)
            if sniffer.done:
                # The detector has made up its mind; more input is pointless.
                break
    # close() finalises the detection before the result is read.
    sniffer.close()
    outcome = sniffer.result
    return normalize_result(outcome, default)
def read_file_decoded(self, file_path):
    """Read ``file_path`` in binary mode and return its content as text.

    The encoding comes from ``self.result.encoding``. When that is ``None``
    the encoding is guessed from the file content with ``chardet.detect``
    and passed through ``normalize_result`` with ``self.DEFAULT_ENCODING``
    as the default.

    :param file_path: path of the file to read.
    :return: the decoded file content.
    :raises ProcessingException: if the content cannot be decoded with the
        chosen encoding; the original ``UnicodeDecodeError`` is attached as
        the explicit cause.
    """
    encoding = self.result.encoding
    with open(file_path, 'rb') as fh:
        body = fh.read()

    if encoding is None:
        result = chardet.detect(body)
        encoding = normalize_result(result, self.DEFAULT_ENCODING)

    # Keep the try body down to the one call that can fail to decode.
    try:
        text = body.decode(encoding)
    except UnicodeDecodeError as ude:
        # Chain explicitly so the traceback shows the decode error as cause.
        raise ProcessingException('Error decoding file as %s: %s' %
                                  (encoding, ude)) from ude

    if encoding != self.DEFAULT_ENCODING:
        log.info("Decoding [%s] as: %s", self.result, encoding)
    return text
def read_file_decoded(self, entity, file_path):
    """Return the content of ``file_path`` decoded using the entity's encoding.

    The file is read as bytes. If ``entity`` has no ``'encoding'`` value yet,
    one is guessed from the content with ``chardet.detect``, passed through
    ``normalize_result`` (default ``self.DEFAULT_ENCODING``) and stored on
    the entity. The first value yielded by ``entity.get('encoding')`` is then
    used to decode; that attempt either returns the text or raises.

    :param entity: object exposing ``has``, ``set`` and ``get`` for the
        ``'encoding'`` property.
    :param file_path: path of the file to read.
    :return: the decoded content; ``None`` if ``entity.get('encoding')``
        yields nothing.
    :raises ProcessingException: if decoding fails, chained from the
        underlying ``UnicodeDecodeError``.
    """
    with open(file_path, 'rb') as handle:
        raw = handle.read()

    if not entity.has('encoding'):
        guess = chardet.detect(raw)
        entity.set('encoding', normalize_result(guess, self.DEFAULT_ENCODING))

    for codec in entity.get('encoding'):
        try:
            text = raw.decode(codec)
        except UnicodeDecodeError as ude:
            message = 'Error decoding file as %s: %s' % (codec, ude)
            raise ProcessingException(message) from ude
        if codec != self.DEFAULT_ENCODING:
            log.info("Decoding [%r] as: %s", entity, codec)
        # Only the first encoding is ever tried: success returns here.
        return text