def file_encoding(path):
    """Guess the text encoding of the file at *path*.

    The file is read line by line in binary mode and every line is fed to a
    ``cchardet.UniversalDetector`` until the detector reports it is done (or
    the file ends).  While reading, occurrences of two sets of marker bytes
    are tallied: one set treated as characteristic of ISO-8859-2 and one
    treated as characteristic of Windows-1250.

    Returns:
        A ``(encoding, backup_encoding)`` tuple.  ``encoding`` is whatever the
        detector reported (possibly ``None``).  ``backup_encoding`` is
        ``'utf-8'`` unless the detector's confidence is below 0.95 and at
        least one marker byte was seen, in which case it is
        ``'Windows-1250'`` when the Windows-1250 markers outnumber the
        ISO-8859-2 ones and ``'iso-8859-2'`` otherwise.
    """
    iso_markers = (b'\xb1', b'\xac', b'\xbc', b'\xa1', b'\xb6', b'\xa6')
    cp_markers = (b'\xb9', b'\xa5', b'\x9f', b'\x8f', b'\x8c', b'\x9c')
    iso_hits = 0
    cp_hits = 0
    detector = cchardet.UniversalDetector()

    with open(path, 'rb') as handle:
        for raw_line in handle:
            # Tally marker bytes only for the lines the detector also sees.
            iso_hits += sum(raw_line.count(marker) for marker in iso_markers)
            cp_hits += sum(raw_line.count(marker) for marker in cp_markers)
            detector.feed(raw_line)
            if detector.done:
                break
    detector.close()

    encoding = detector.result.get('encoding')
    # A missing or zero confidence is treated as 0.0.
    confidence = detector.result.get('confidence') or 0.0

    low_confidence = confidence < 0.95
    if low_confidence and (cp_hits or iso_hits):
        if cp_hits > iso_hits:
            backup_encoding = 'Windows-1250'
        else:
            backup_encoding = 'iso-8859-2'
    else:
        backup_encoding = 'utf-8'

    return encoding, backup_encoding
def detect_encoding(bytesio: io.BytesIO) -> str:
    """
    Detect charset, as Python-friendly encoding string.

    Peculiarities:

    * Reads file by CHARDET_CHUNK_SIZE defined in settings.py
    * Stops seeking when detector.done flag True
    * Seeks back to beginning of file for downstream usage
    * Returns "UTF-8" in case of empty file or ASCII -- since the parse
      framework is designed to be UTF-native.

    The ASCII check ignores letter case, because the spelling the detector
    reports for plain ASCII input ("ASCII" vs "ascii") depends on which
    chardet-compatible library is bound to the name ``chardet``.
    """
    detector = chardet.UniversalDetector()
    while not detector.done:
        chunk = bytesio.read(settings.CHARDET_CHUNK_SIZE)
        if not chunk:
            break  # EOF
        detector.feed(chunk)
    detector.close()

    # Rewind so downstream consumers can read the data from the start.
    bytesio.seek(0)

    encoding = detector.result["encoding"]
    if encoding is None:
        # There isn't enough data for the detector to decide.
        return "UTF-8"
    if encoding.upper() == "ASCII":
        # ASCII is a strict subset of UTF-8, so report the superset.
        return "UTF-8"
    return encoding
def test_github_issue_20(self):
    """
    Regression check for https://github.com/PyYoshi/cChardet/issues/20

    Runs a lone ``0x8f`` byte through both the one-shot and the incremental
    API.  There are no assertions: the test passes when neither call raises.
    """
    payload = b'\x8f'

    # One-shot detection.
    cchardet.detect(payload)

    # Incremental detection over the same input.
    incremental = cchardet.UniversalDetector()
    incremental.feed(payload)
    incremental.close()
def guess_encoding_from_stream(stream, chunk_size=4096, chardet_threshold=0.5):
    """Detect the encoding of *stream* by feeding it to chardet in chunks.

    Args:
        stream: Object with a ``read(size)`` method.
        chunk_size: Number of bytes requested per read.
        chardet_threshold: Minimum confidence the detector must report.

    Returns:
        The encoding name reported by the detector.

    Raises:
        ValueError: If the detector reports no confidence (missing or zero)
            or a confidence below ``chardet_threshold``.

    The stream is not rewound; reading stops at the first empty chunk or on
    the first read after the detector reports it is done.
    """
    detector = chardet.UniversalDetector()

    while True:
        data = stream.read(chunk_size)
        # The read happens before the done-check, so one chunk beyond the
        # point of detection is consumed from the stream.
        if detector.done or not data:
            break
        detector.feed(data)
    detector.close()

    outcome = detector.result
    score = outcome.get("confidence")
    if score and score >= chardet_threshold:
        return outcome["encoding"]
    raise ValueError("Failed to detect encoding")
def guess_file_encoding(fh, default=DEFAULT_ENCODING):
    """Guess encoding from a file handle.

    Reads up to 1024 chunks of 1024 units each from the handle's current
    position, feeding each chunk to a chardet detector and stopping early at
    end of data or once the detector is done.  The handle is then moved back
    to where it started, and the detector result is passed through
    ``normalize_result`` together with ``default``.
    """
    max_reads = 1024
    read_size = 1024

    origin = fh.tell()
    detector = chardet.UniversalDetector()
    for _ in six.moves.range(max_reads):
        block = fh.read(read_size)
        if len(block) == 0:
            break
        detector.feed(block)
        if detector.done:
            break
    detector.close()

    # Restore the position the caller handed us.
    fh.seek(origin)
    return normalize_result(detector.result, default=default)
def test_detector(self):
    """Feed a sample file line by line and expect ``shift_jis`` to be detected."""
    expected = "shift_jis"
    sample_path = "tests/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt"

    detector = cchardet.UniversalDetector()
    with open(sample_path, 'rb') as sample:
        for line in sample:
            detector.feed(line)
            # Stop reading as soon as the detector has made up its mind.
            if detector.done:
                break
    detector.close()

    actual = detector.result['encoding'].lower()
    eq_(expected, actual, 'Expected %s, but got %s' % (expected, actual))
def detect_encoding(bytesio: io.BytesIO):
    """
    Detect the charset of ``bytesio`` and return the detector's encoding name.

    Behavior:

    * Data is read in chunks of ``settings.CHARDET_CHUNK_SIZE``.
    * Reading stops at end of data or once ``detector.done`` is set.
    * The buffer is rewound to position 0 before returning.
    * The detector's ``"encoding"`` value is returned unmodified.
    """
    detector = chardet.UniversalDetector()

    data = bytesio.read(settings.CHARDET_CHUNK_SIZE)
    while data:
        detector.feed(data)
        if detector.done:
            break
        data = bytesio.read(settings.CHARDET_CHUNK_SIZE)
    detector.close()

    # Leave the buffer ready for downstream readers.
    bytesio.seek(0)
    result = detector.result
    return result["encoding"]