Beispiel #1
0
 def test_replace_non_lettters_non_numbers_with_whitespace(self):
     """Check that any non-word character left after processing is a single space.

     Each sample string is run through
     StringProcessor.replace_non_lettters_non_numbers_with_whitespace, and
     every match of the non-word pattern in the result is asserted to be " ".
     """
     strings = ["new york mets - atlanta braves", "Cães danados", "New York //// Mets $$$", "Ça va?"]
     # The pattern does not depend on the string under test, so compile it once.
     regex = re.compile(r"(?ui)[\W]")
     for string in strings:
         proc_string = StringProcessor.replace_non_lettters_non_numbers_with_whitespace(string)
         for expr in regex.finditer(proc_string):
             # assertEqual replaces the deprecated assertEquals alias,
             # which no longer exists in Python 3.12+.
             self.assertEqual(expr.group(), " ")
Beispiel #2
0
 def test_replace_non_lettters_non_numbers_with_whitespace(self):
     """Check that any non-word character left after processing is a single space.

     Uses explicit unicode literals as input. Each one is run through
     StringProcessor.replace_non_lettters_non_numbers_with_whitespace, and
     every match of the non-word pattern in the result is asserted to be " ".
     """
     strings = [u"new york mets - atlanta braves", u"Cães danados", u"New York //// Mets $$$", u"Ça va?"]
     # The pattern does not depend on the string under test, so compile it once.
     regex = re.compile(r"(?ui)[\W]")
     for string in strings:
         proc_string = StringProcessor.replace_non_lettters_non_numbers_with_whitespace(string)
         for expr in regex.finditer(proc_string):
             # assertEqual replaces the deprecated assertEquals alias,
             # which no longer exists in Python 3.12+.
             self.assertEqual(expr.group(), " ")
Beispiel #3
0
def full_process(s, force_ascii=False):
    """Normalize a string for comparison.

    Returns an empty string when ``s`` is None. Otherwise the input is
    passed, in order, through these steps:
        -- ``asciidammit`` (only when ``force_ascii`` is truthy)
        -- replacement of everything but letters and numbers with whitespace
        -- conversion to lower case
        -- removal of leading and trailing whitespace
    """
    # Nothing to normalize: bail out before touching any helper.
    if s is None:
        return ""

    text = asciidammit(s) if force_ascii else s

    # Only letters and numbers survive this step (see Unicode docs).
    cleaned = StringProcessor.replace_non_lettters_non_numbers_with_whitespace(text)
    lowered = StringProcessor.to_lower_case(cleaned)
    return StringProcessor.strip(lowered)
Beispiel #4
0
 def test_dont_condense_whitespace(self):
     """The processed form of a string containing " - " must differ from
     the processed form of the same string without the dash."""
     with_dash, without_dash = (
         StringProcessor.replace_non_lettters_non_numbers_with_whitespace(text)
         for text in ("new york mets - atlanta braves", "new york mets atlanta braves")
     )
     self.assertNotEqual(with_dash, without_dash)
Beispiel #5
0
 def test_dont_condense_whitespace(self):
     """Processing a string with " - " in it must not yield the same
     result as processing that string with the dash left out."""
     process = StringProcessor.replace_non_lettters_non_numbers_with_whitespace
     dashed = process("new york mets - atlanta braves")
     plain = process("new york mets atlanta braves")
     self.assertNotEqual(dashed, plain)