import pickle

import chardet
import sklearn.metrics


def evaluation_chardet():
    with open('cache/evaluation_data.pickle', 'rb') as h:
        e_data = pickle.load(h)

    print("Evaluating chardet...")
    chardet_predictions = []
    labels = []
    for n, (bytes_tensor, enc_tensor, enc_string) in enumerate(e_data):

        label = enc_tensor.detach().cpu().numpy()
        labels.append(label)

        # detect_all() requires chardet >= 4.0.0; it expects the raw bytes
        chardet_prediction = chardet.detect_all(enc_string)

        # Only compare the UTF-8 and Windows-1252 predictions
        # This skips bugs in chardet's Korean CP949 and Turkish Windows-1254/ISO-8859-9 detectors
        utf8 = 0
        windows1252 = 0
        for e in chardet_prediction:
            if e["encoding"] == "utf-8":
                utf8 = e["confidence"]

            # ISO-8859-1 is a subset of Windows-1252, so treat them the same
            elif (e["encoding"] in ("Windows-1252", "ISO-8859-1")
                  and e["confidence"] > windows1252):
                windows1252 = e["confidence"]

        if utf8 > windows1252:
            chardet_prediction_ = 1
        elif utf8 < windows1252:
            chardet_prediction_ = 0
        # Tie (including the case where chardet detects neither); default to
        # Windows-1252 for output purposes
        else:
            print("Tie in chardet prediction")
            print(label, chardet_prediction, enc_string)
            chardet_prediction_ = 0

        if label == 0 and chardet_prediction_ == 1:
            print("label: ", label, "predicted: ", chardet_prediction_,
                  enc_string)

        chardet_predictions.append(chardet_prediction_)

        if n % 1000 == 0:
            print(str(n) + "/" + str(len(e_data)))

    print("Chardet Accuracy: " +
          str(sklearn.metrics.accuracy_score(labels, chardet_predictions)))
    print("Chardet Precision: " +
          str(sklearn.metrics.precision_score(labels, chardet_predictions)))
    print("Chardet Recall: " +
          str(sklearn.metrics.recall_score(labels, chardet_predictions)))
    print("Chardet F1 Score: " +
          str(sklearn.metrics.f1_score(labels, chardet_predictions)))
    print("Chardet Confusion Matrix: " +
          str(sklearn.metrics.confusion_matrix(labels, chardet_predictions)))
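
The comparison above leans on the shape of detect_all()'s return value: a list of candidate dicts with "encoding", "confidence", and "language" keys, best guess first (the agreement tests below rely on that ordering). A minimal illustrative sketch; the sample string is arbitrary:

import chardet

# Each candidate looks like {'encoding': ..., 'confidence': ..., 'language': ...}
candidates = chardet.detect_all("déjà vu".encode("utf-8"))
for c in candidates:
    print(c["encoding"], c["confidence"])

# Picking the single best candidate by confidence
best = max(candidates, key=lambda c: c["confidence"] or 0.0)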
Example #2
import chardet
from hypothesis import assume


def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
    try:
        data = txt.encode(enc)
    except UnicodeEncodeError:
        # Skip inputs that the chosen codec cannot represent
        assume(False)
    result = chardet.detect(data)
    results = chardet.detect_all(data)
    # Put both results in the assertion message for easier debugging
    assert result["encoding"] == results[0]["encoding"], f"{result} != {results}"
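
The snippet above omits the hypothesis decorators that generate txt, enc, and rnd. A hedged sketch of how such a property test might be driven; the strategy choices and the encoding list here are assumptions, not taken from the original:

from hypothesis import given, settings
from hypothesis import strategies as st

@given(
    st.text(min_size=1),
    st.sampled_from(["ascii", "utf-8", "utf-16", "windows-1252"]),  # assumed list
    st.randoms(),
)
@settings(max_examples=200)
def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
    ...  # body as in the example above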
Example #4
import io

import chardet


def decode_stream(f):
    """
    Detects the character encoding of the given byte stream and returns it
    wrapped as a text stream.
    """

    # Read the whole stream for detection, then rewind so the wrapper below
    # starts from the beginning
    data = f.read()
    f.seek(0)

    encodings = [d["encoding"] for d in chardet.detect_all(data)]

    if "utf-8" in encodings:  # always go with UTF-8 if it appears to be an option
        encoding = "utf-8"
    else:
        encoding = encodings[0]

    # No confident result, or plain ASCII (a strict subset of UTF-8):
    # fall back to UTF-8
    if not encoding or encoding == "ascii":
        encoding = "utf-8"

    return io.TextIOWrapper(f, encoding=encoding)
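
A short usage sketch for decode_stream() above; the file name is hypothetical. The file must be opened in binary mode so chardet sees raw bytes:

with open("unknown_encoding.txt", "rb") as f:  # hypothetical file name
    text = decode_stream(f).read()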
Example #5
import textwrap
from difflib import ndiff
from pprint import pformat
from unicodedata import normalize

import chardet


def test_encoding_detection_rename_legacy(file_name, encoding):
    with open(file_name, "rb") as f:
        input_bytes = f.read()
        result = chardet.detect(input_bytes, should_rename_legacy=True)
        try:
            expected_unicode = input_bytes.decode(encoding)
        except LookupError:
            expected_unicode = ""
        try:
            detected_unicode = input_bytes.decode(result["encoding"])
        except (LookupError, UnicodeDecodeError, TypeError):
            detected_unicode = ""
    if result:
        encoding_match = (result["encoding"] or "").lower() == encoding
    else:
        encoding_match = False
    # Only care about mismatches that would actually result in different
    # behavior when decoding
    expected_unicode = normalize("NFKD", expected_unicode)
    detected_unicode = normalize("NFKD", detected_unicode)
    if not encoding_match and expected_unicode != detected_unicode:
        wrapped_expected = "\n".join(textwrap.wrap(expected_unicode,
                                                   100)) + "\n"
        wrapped_detected = "\n".join(textwrap.wrap(detected_unicode,
                                                   100)) + "\n"
        diff = "".join([
            line for line in ndiff(wrapped_expected.splitlines(True),
                                   wrapped_detected.splitlines(True))
            if not line.startswith(" ")
        ][:20])
        all_encodings = chardet.detect_all(input_bytes,
                                           ignore_threshold=True,
                                           should_rename_legacy=True)
    else:
        diff = ""
        encoding_match = True
        all_encodings = [result]
    assert encoding_match, (
        f"Expected {encoding}, but got {result} for {file_name}.  First 20 "
        f"lines of character differences: \n{diff}\n"
        f"All encodings: {pformat(all_encodings)}")