def classify_gibberish(body, site):
    """Decide whether a post body should be scored for gibberish, and score it.

    The body is first reduced with ``strip_unwanted(body, site)``. Scoring is
    skipped when the remaining text is empty, when it is a "frequent
    sentence", or when ``site`` is one of the excluded sites.

    Returns a 2-tuple:
        ``(False, 1)`` when scoring was skipped, otherwise
        ``(True, score)`` where ``score`` is whatever
        ``gibberishclassifier.classify`` returns for the stripped text.
    """
    # NOTE(review): presumably excluded because their content is not
    # English prose -- confirm the reason for each entry.
    excluded_sites = (
        "ja.stackoverflow.com",
        "ru.stackoverflow.com",
        "pt.stackoverflow.com",
        "rus.stackexchange.com",
        "codegolf.stackexchange.com",
    )

    text = strip_unwanted(body, site)

    # Nothing left to look at once the unwanted parts are removed.
    if text == "":
        return False, 1

    # A lone "frequent sentence" is very short, so classifying it would
    # give an inaccurate result.
    if is_frequent_sentence(text):
        return False, 1

    if site in excluded_sites:
        return False, 1

    return True, gibberishclassifier.classify(text)
Example #2
0
def test_frequent_sentences(sentence, expected):
    """Check that ``is_frequent_sentence(sentence)`` equals ``expected``.

    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator that is not visible in this excerpt -- confirm.
    """
    actual = is_frequent_sentence(sentence)
    assert actual == expected