def test_dont_condense_whitespace(self):
    """Check that a dash-separated string and its dash-free twin differ.

    Both strings go through
    ``StringProcessor.replace_non_letters_non_numbers_with_whitespace`` and
    the two results must not compare equal (per the test name, whitespace
    introduced by the replacement must not be condensed).
    """
    clean = StringProcessor.replace_non_letters_non_numbers_with_whitespace
    with_dash = "new york mets - atlanta braves"
    without_dash = "new york mets atlanta braves"
    self.assertNotEqual(clean(with_dash), clean(without_dash))
def test_replace_non_letters_non_numbers_with_whitespace(self):
    """Verify that every non-word character left after processing is a space.

    Each sample is passed through
    ``StringProcessor.replace_non_letters_non_numbers_with_whitespace`` and
    the result is scanned with a non-word-character pattern; every match
    must be exactly one space.
    """
    samples = [
        "new york mets - atlanta braves",
        "Cães danados",
        "New York //// Mets $$$",
        "Ça va?",
    ]
    # The pattern does not depend on the sample, so compile it once.
    non_word = re.compile(r"(?ui)[\W]")
    for sample in samples:
        processed = StringProcessor.replace_non_letters_non_numbers_with_whitespace(sample)
        for match in non_word.finditer(processed):
            # The message identifies which input produced the stray character.
            self.assertEqual(
                match.group(), " ",
                "unexpected %r left in %r (from %r)"
                % (match.group(), processed, sample),
            )
def test_replace_non_letters_non_numbers_with_whitespace(self):
    """Verify that every non-word character left after processing is a space.

    Each sample is passed through
    ``StringProcessor.replace_non_letters_non_numbers_with_whitespace`` and
    the result is scanned with a non-word-character pattern; every match
    must be exactly one space.
    """
    samples = [
        "new york mets - atlanta braves",
        "Cães danados",
        "New York //// Mets $$$",
        "Ça va?",
    ]
    # The pattern does not depend on the sample, so compile it once.
    non_word = re.compile(r"(?ui)[\W]")
    for sample in samples:
        processed = StringProcessor.replace_non_letters_non_numbers_with_whitespace(sample)
        for match in non_word.finditer(processed):
            # The message identifies which input produced the stray character.
            self.assertEqual(
                match.group(), " ",
                "unexpected %r left in %r (from %r)"
                % (match.group(), processed, sample),
            )
def full_process(s, force_ascii=False):
    """Normalize a string through the StringProcessor pipeline.

    Steps, in order:
      1. if ``force_ascii`` is truthy, convert ``s`` with ``asciidammit``;
      2. keep only letters and numbers, replacing everything else with
         whitespace;
      3. force the result to lower case;
      4. remove leading and trailing whitespace.

    Args:
        s: the string to process.
        force_ascii: when truthy, force conversion to ASCII first.

    Returns:
        The processed string.
    """
    source = asciidammit(s) if force_ascii else s
    # Keep only letters and numbers (see Unicode docs).
    spaced = StringProcessor.replace_non_letters_non_numbers_with_whitespace(source)
    # Lower-case first, then trim the surrounding whitespace.
    return StringProcessor.strip(StringProcessor.to_lower_case(spaced))
def test_dont_condense_whitespace(self):
    """Check that a dash-separated string and its dash-free twin differ.

    Both inputs are processed with
    ``StringProcessor.replace_non_letters_non_numbers_with_whitespace``; the
    outputs must not be equal (per the test name, the whitespace produced by
    the replacement must not be condensed).
    """
    inputs = (
        "new york mets - atlanta braves",
        "new york mets atlanta braves",
    )
    dashed, plain = (
        StringProcessor.replace_non_letters_non_numbers_with_whitespace(text)
        for text in inputs
    )
    self.assertNotEqual(dashed, plain)