def classify_gibberish(body, site):
    """Score a post body for gibberish, unless classification is skipped.

    The body is first reduced to plain text with ``strip_unwanted``.

    Returns:
        A ``(classified, score)`` tuple. ``(False, 1)`` means the text was
        not classified; ``(True, score)`` carries the value returned by
        ``gibberishclassifier.classify`` for the stripped text.
    """
    text = strip_unwanted(body, site)

    # Nothing is left to classify once the unwanted parts are stripped.
    if text == "":
        return False, 1

    # A lone "frequent sentence" is very short, so classifying it would
    # give an inaccurate result.
    if is_frequent_sentence(text):
        return False, 1

    # Sites on which posts are never classified.
    excluded_sites = (
        "ja.stackoverflow.com",
        "ru.stackoverflow.com",
        "pt.stackoverflow.com",
        "rus.stackexchange.com",
        "codegolf.stackexchange.com",
    )
    if site in excluded_sites:
        return False, 1

    return True, gibberishclassifier.classify(text)
def test_frequent_sentences(sentence, expected):
    """Check that ``is_frequent_sentence(sentence)`` equals ``expected``."""
    # NOTE(review): the arguments are presumably supplied by a parametrize
    # decorator outside this view -- confirm.
    outcome = is_frequent_sentence(sentence)
    assert outcome == expected