def test_html_encoded_chars(self):
    # A shouting, mixed-case headline containing ampersands and nested
    # apostrophes must come back title-cased with that punctuation intact.
    # NOTE(review): the test name suggests HTML entities, but the input
    # shown here contains a plain '&' -- confirm the fixture is as intended.
    raw_title = (
        "OBAMA's TEAM WELCOMES CASTRO & CUBA TO AMERICA & "
        "TO RECeive CRITICISM: 'WOULDN'T DISAGREE'..."
    )
    expected_title = (
        "Obama's Team Welcomes Castro & Cuba To America & "
        "To Receive Criticism: 'Wouldn't Disagree'..."
    )
    cleaned_title = cleanup_title(raw_title)
    self.assertEqual(cleaned_title, expected_title)
def test_acronym_check(self):
    # Country acronyms (with or without dots, in any casing) are expected
    # to be upper-cased, the remaining words title-cased, and the single
    # quotes wrapping the whole title removed.
    raw_title = (
        "'What's this another new US Usa U.S. U.s.a. "
        "title it's a miracle...'"
    )
    expected_title = (
        "What's This Another New US USA U.S. U.S.A. "
        "Title It's A Miracle..."
    )
    cleaned_title = cleanup_title(raw_title)
    self.assertEqual(cleaned_title, expected_title)
def validate_title(self, value):
    """
    Validate and normalise a news article title.

    The title must pass ``only_roman_chars``, is then normalised with
    ``cleanup_title``, must not be more than 70% similar (per
    ``SequenceMatcher.ratio``, compared case-insensitively) to any entry
    of ``settings.DEFAULT_EXCLUDE_ARTICLES`` and must not already be the
    title of a stored ``NewsArticle`` node.

    :param value: the title as submitted
    :return: the cleaned-up title
    :raises ValidationError: if any of the checks above fails
    """
    if not only_roman_chars(value):
        raise ValidationError("Can only have roman characters")
    # Clean up the title
    value = cleanup_title(value)
    # Check if title is in our excluded list or close to it.
    # The lower-cased title does not change between iterations, so it is
    # computed once up front.
    lowered_title = value.lower()
    for excluded_article in settings.DEFAULT_EXCLUDE_ARTICLES:
        # If the title has something similar to an article
        # we don't want to include, reject it.
        # Not a list comprehension to ease readability. Not a
        # huge issue since this is normally called in a task.
        similarity = SequenceMatcher(
            a=excluded_article.lower(), b=lowered_title).ratio()
        if similarity > .70:
            raise ValidationError("Contains content that is not allowed")
    # Check if title already exists.
    # The title is handed to the database as a query parameter rather than
    # interpolated into the query text: a title containing a double quote
    # or a backslash would otherwise break (or inject into) the Cypher.
    # NOTE(review): `{title}` is the legacy Cypher parameter placeholder;
    # Neo4j 4+ only accepts `$title` -- confirm which one the deployed
    # server expects.
    query = 'MATCH (news:NewsArticle {title: {title}}) RETURN news'
    res, _ = db.cypher_query(query, {"title": value})
    if res.one is not None:
        raise ValidationError("This field must be unique")
    return value
def test_lowercase(self):
    # A lower-case connective ("of") inside an otherwise capitalised
    # title is expected to be capitalised as well.
    raw_title = "Friends of Israel - The New Yorker"
    expected_title = "Friends Of Israel - The New Yorker"
    cleaned_title = cleanup_title(raw_title)
    self.assertEqual(cleaned_title, expected_title)
def test_quotes(self):
    # Double quotes wrapping the whole title are expected to be stripped,
    # with the remaining words title-cased.
    raw_title = '"Yet another title! What is this!"'
    expected_title = "Yet Another Title! What Is This!"
    cleaned_title = cleanup_title(raw_title)
    self.assertEqual(cleaned_title, expected_title)