def test_dont_condense_whitespace(self):
    # The two inputs differ only by the " - " separator. After every
    # non-letter/non-number character has been replaced, the processed
    # strings must still compare unequal.
    replace = StringProcessor.replace_non_letters_non_numbers_with_whitespace
    with_separator = replace("new york mets - atlanta braves")
    without_separator = replace("new york mets atlanta braves")
    self.assertNotEqual(with_separator, without_separator)
def query_processor(s, force_ascii=False):
    """Normalise a query string for matching.

    Every match of the module-level ``processor_regex`` in ``s`` is replaced
    by a single space (per the original comment, this keeps only letters,
    numbers and some special path characters); the result is then
    lower-cased via ``StringProcessor.to_lower_case`` and passed through
    ``StringProcessor.strip``.

    Args:
        s: The string to normalise.
        force_ascii: Accepted but not used by this function.

    Returns:
        The normalised string.
    """
    return StringProcessor.strip(
        StringProcessor.to_lower_case(processor_regex.sub(" ", s))
    )
def test_replace_non_letters_non_numbers_with_whitespace(self):
    # Any character left in the processed output that is not a word
    # character (case-insensitive, Unicode-aware \W) must be a single space.
    non_word = re.compile(r"(?ui)[\W]")
    samples = [
        "new york mets - atlanta braves",
        "Cães danados",
        "New York //// Mets $$$",
        "Ça va?",
    ]
    for sample in samples:
        processed = StringProcessor.replace_non_letters_non_numbers_with_whitespace(sample)
        for match in non_word.finditer(processed):
            self.assertEqual(match.group(), " ")
def full_process(s, force_ascii=False):
    """Normalise a string for comparison.

    Processing steps, in order:
        -- optionally convert to ASCII (when ``force_ascii`` is true)
        -- replace everything but letters and numbers with whitespace
        -- force to lower case
        -- trim leading and trailing whitespace

    Args:
        s: The string to process. ``None`` is accepted and treated as an
            empty value.
        force_ascii: When true, ``s`` is first passed through
            ``asciidammit``.

    Returns:
        The processed string, or ``""`` when ``s`` is ``None``.
    """
    # Guard against None up front: the helpers below operate on strings and
    # would otherwise fail on a missing value.
    if s is None:
        return ""

    if force_ascii:
        s = asciidammit(s)

    # Keep only letters and numbers (see Unicode docs).
    cleaned = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s)
    lowered = StringProcessor.to_lower_case(cleaned)
    return StringProcessor.strip(lowered)
def test_replace_non_lettters_non_numbers_with_whitespace(self):
    # Any character left in the processed output that is not a word
    # character (case-insensitive, Unicode-aware \W) must be a single space.
    #
    # NOTE(review): the "lettters" spelling is kept deliberately; it has to
    # match the StringProcessor method name this test calls -- confirm
    # against the StringProcessor definition before renaming either.
    strings = [
        u"new york mets - atlanta braves",
        u"Cães danados",
        u"New York //// Mets $$$",
        u"Ça va?",
    ]
    # Compile once; the pattern does not depend on the loop variable.
    regex = re.compile(r"(?ui)[\W]")
    for string in strings:
        proc_string = StringProcessor.replace_non_lettters_non_numbers_with_whitespace(string)
        for expr in regex.finditer(proc_string):
            # assertEqual replaces the deprecated assertEquals alias, which
            # was removed from unittest in Python 3.12.
            self.assertEqual(expr.group(), " ")
def semi_process(s, force_ascii=False):
    """Lightly normalise a string; a reduced variation on ``full_process``.

    Unlike Fuzzywuzzy's ``full_process`` this function:
        -- does NOT remove non-letter/non-number characters, so that
           consecutive spans stay intact
        -- does NOT lower-case, since annotators marked verbatim spans and
           case is therefore a useful signal
        -- DOES trim leading and trailing whitespace

    Args:
        s: The string to process, or ``None``.
        force_ascii: When true, ``s`` is first passed through
            ``asciidammit``.

    Returns:
        The stripped string, or ``""`` when ``s`` is ``None``.
    """
    if s is None:
        return ""

    text = asciidammit(s) if force_ascii else s
    return StringProcessor.strip(text)
def test_dont_condense_whitespace(self):
    # Process both inputs (one with a " - " separator, one without) and
    # check that replacing non-letters/non-numbers does not make them equal.
    raw_pair = (
        "new york mets - atlanta braves",
        "new york mets atlanta braves",
    )
    first, second = (
        StringProcessor.replace_non_letters_non_numbers_with_whitespace(raw)
        for raw in raw_pair
    )
    self.assertNotEqual(first, second)
def string_processor(string):
    """Return ``string`` lower-cased and then stripped.

    The input is passed through ``StringProcessor.to_lower_case`` and the
    result through ``StringProcessor.strip``; no characters are replaced.

    Args:
        string: The string to normalise.

    Returns:
        The lower-cased, stripped string.
    """
    return StringProcessor.strip(StringProcessor.to_lower_case(string))