def evaluation_chardet():
    """Score chardet's UTF-8 vs Windows-1252 decisions against the cached evaluation set.

    Loads ``cache/evaluation_data.pickle`` (a sequence of
    ``(bytes_tensor, enc_tensor, enc_string)`` triples), asks chardet for all
    candidate encodings of each sample, and reduces the answer to a binary
    label (1 = UTF-8, 0 = Windows-1252). Prints accuracy, precision, recall,
    F1 and the confusion matrix.
    """
    with open('cache/evaluation_data.pickle', 'rb') as handle:
        samples = pickle.load(handle)

    print("Evaluating chardet...")
    predicted = []
    true_labels = []

    for idx, (bytes_tensor, enc_tensor, enc_string) in enumerate(samples):
        gold = enc_tensor.detach().cpu().numpy()
        true_labels.append(gold)

        # Get chardet version 4.0.0 if this doesn't work
        candidates = chardet.detect_all(enc_string)

        # Only compare the UTF-8 and Windows-1252 predictions.
        # This skips bugs in chardet's Korean CP949 and Turkish
        # Windows-1254/ISO-8859-9 detectors.
        utf8_conf = 0
        win1252_conf = 0
        for candidate in candidates:
            name = candidate["encoding"]
            if name == "utf-8":
                utf8_conf = candidate["confidence"]
            # ISO-8859-1 is a subset of Windows-1252, treat them the same
            elif name in ["Windows-1252", "ISO-8859-1"] and candidate["confidence"] > win1252_conf:
                win1252_conf = candidate["confidence"]

        if utf8_conf > win1252_conf:
            guess = 1
        elif utf8_conf < win1252_conf:
            guess = 0
        else:
            # If chardet detects neither..., default to Windows-1252 for
            # output purposes.
            print("Tie in chardet prediction")
            print(gold, candidates, enc_string)
            guess = 0

        # Log false positives (predicted UTF-8 on a Windows-1252 sample).
        if gold == 0 and guess == 1:
            print("label: ", gold, "predicted: ", guess, enc_string)

        predicted.append(guess)
        if idx % 1000 == 0:
            print(f"{idx}/{len(samples)}")

    print(f"Chardet Accuracy: {sklearn.metrics.accuracy_score(true_labels, predicted)}")
    print(f"Chardet Precision: {sklearn.metrics.precision_score(true_labels, predicted)}")
    print(f"Chardet Recall: {sklearn.metrics.recall_score(true_labels, predicted)}")
    print(f"Chardet F1 Score: {sklearn.metrics.f1_score(true_labels, predicted)}")
    print(f"Chardet Confusion Matrix: {sklearn.metrics.confusion_matrix(true_labels, predicted)}")
def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
    """Property test: ``chardet.detect`` must agree with ``detect_all``'s top result.

    Bug fix: the original wrapped the ``detect``/``detect_all`` calls inside the
    same ``try`` as the assertion, so if either call raised, the handler
    referenced the still-unbound ``result``/``results`` and died with a
    ``NameError`` that masked the real failure. The detection calls are now
    outside the ``try`` so their errors propagate untouched, and only the
    assertion is wrapped to attach the diagnostic message.
    """
    try:
        data = txt.encode(enc)
    except UnicodeEncodeError:
        # Hypothesis: discard examples the chosen codec cannot encode.
        assume(False)
    result = chardet.detect(data)
    results = chardet.detect_all(data)
    try:
        assert result["encoding"] == results[0]["encoding"]
    except AssertionError as err:
        # Re-raise with both answers visible, keeping the original as the cause.
        raise Exception(f"{result} != {results}") from err
def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
    """Property test: ``chardet.detect`` must agree with ``detect_all``'s top result.

    Bug fix: the original put the ``detect``/``detect_all`` calls inside the
    same ``try`` as the assertion, so if either call raised, the handler
    referenced the still-unbound ``result``/``results`` and failed with a
    ``NameError`` that hid the real error. The detection calls now run outside
    the ``try``; only the assertion is wrapped to attach the diagnostic
    message.
    """
    try:
        data = txt.encode(enc)
    except UnicodeEncodeError:
        # Hypothesis: discard examples the chosen codec cannot encode.
        assume(False)
    result = chardet.detect(data)
    results = chardet.detect_all(data)
    try:
        assert result['encoding'] == results[0]['encoding']
    except AssertionError as err:
        # Re-raise with both answers visible, keeping the original as the cause.
        raise Exception('%s != %s' % (result, results)) from err
def decode_stream(f):
    """Detect the character encoding of the given byte stream and return it
    wrapped as a text stream.

    Reads the whole stream to feed chardet, then rewinds it so the returned
    ``io.TextIOWrapper`` decodes from the beginning.
    """
    raw = f.read()
    f.seek(0)
    candidates = [guess["encoding"] for guess in chardet.detect_all(raw)]
    # Always go with UTF-8 if it appears to be an option; otherwise take
    # chardet's top candidate.
    encoding = "utf-8" if "utf-8" in candidates else candidates[0]
    # A missing or plain-ASCII answer is safely decodable as UTF-8.
    if not encoding or encoding == "ascii":
        encoding = "utf-8"
    return io.TextIOWrapper(f, encoding=encoding)
def test_encoding_detection_rename_legacy(file_name, encoding):
    # Verify chardet (with should_rename_legacy=True) either names the expected
    # encoding for this file, or at least decodes the bytes to the same
    # NFKD-normalized text — i.e. only mismatches that change decoded output
    # count as failures.
    with open(file_name, "rb") as f:
        input_bytes = f.read()
    result = chardet.detect(input_bytes, should_rename_legacy=True)
    # Ground truth: decode with the labeled encoding. LookupError means this
    # Python build doesn't know the codec; fall back to empty text.
    try:
        expected_unicode = input_bytes.decode(encoding)
    except LookupError:
        expected_unicode = ""
    # Decode with the detected encoding; TypeError covers a None detection.
    try:
        detected_unicode = input_bytes.decode(result["encoding"])
    except (LookupError, UnicodeDecodeError, TypeError):
        detected_unicode = ""
    if result:
        encoding_match = (result["encoding"] or "").lower() == encoding
    else:
        encoding_match = False
    # Only care about mismatches that would actually result in different
    # behavior when decoding
    expected_unicode = normalize("NFKD", expected_unicode)
    detected_unicode = normalize("NFKD", detected_unicode)
    if not encoding_match and expected_unicode != detected_unicode:
        # Build a readable diff: wrap both texts at 100 columns, then keep only
        # the first 20 changed lines from ndiff.
        wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
        wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
        diff = "".join([
            line for line in ndiff(wrapped_expected.splitlines(True),
                                   wrapped_detected.splitlines(True))
            if not line.startswith("  ")
        ][:20])
        # Include every candidate chardet considered, for the failure message.
        all_encodings = chardet.detect_all(input_bytes,
                                           ignore_threshold=True,
                                           should_rename_legacy=True)
    else:
        # Name mismatch is tolerated when decoded text agrees.
        diff = ""
        encoding_match = True
        all_encodings = [result]
    assert encoding_match, (
        f"Expected {encoding}, but got {result} for {file_name}. First 20 "
        f"lines of character differences: \n{diff}\n"
        f"All encodings: {pformat(all_encodings)}")