Example #1
0
 def test_collapse_spaces(self):
     """Check web.collapse_spaces() by default and with indentation=True."""
     # (input, expected) pairs for the default call: runs of spaces shrink
     # to one space, surrounding whitespace (including a trailing newline)
     # is dropped, and a lone "\xa0" yields an empty string.
     cases = (
         ("    ", ""),
         (" .. ", ".."),
         (".  .", ". ."),
         (". \n", "."),
         ("\xa0", ""),
     )
     for raw, expected in cases:
         self.assertEqual(web.collapse_spaces(raw), expected)
     # Assert preserve indentation: leading spaces are kept, trailing
     # whitespace is still removed.
     self.assertEqual(web.collapse_spaces("  . \n", indentation=True), "  .")
     # Parenthesized so the line parses on Python 3 as well as Python 2
     # (the bare print statement is a SyntaxError on Python 3).
     print("pattern.web.collapse_spaces()")
Example #2
0
 def test_collapse_spaces(self):
     """Exercise web.collapse_spaces() without and with indentation=True."""
     # Assert collapse multiple spaces: each pair is (input, expected).
     for a, b in (("    ", ""), (" .. ", ".."), (".  .", ". ."),
                  (". \n", "."), ("\xa0", "")):
         self.assertEqual(web.collapse_spaces(a), b)
     # Assert preserve indentation.
     self.assertEqual(web.collapse_spaces("  . \n", indentation=True),
                      "  .")
     # Function-call form of print: valid on both Python 2 and Python 3,
     # whereas the statement form only parses on Python 2.
     print("pattern.web.collapse_spaces()")
def cleanup(text, remove_punctuation=True):
    """Flatten *text* into one stripped line of space-separated content.

    When *remove_punctuation* is true, every character other than an ASCII
    letter, an ASCII digit or whitespace is deleted first (this also drops
    underscores and non-ASCII letters). The text is then passed through
    collapse_linebreaks() with threshold=1 and any remaining newline is
    replaced by a space, after which collapse_tabs() and collapse_spaces()
    are applied with indentation=False and a single space as replacement.

    Returns the result with leading and trailing whitespace stripped.
    """
    if remove_punctuation:
        # Raw string: "\s" inside a plain literal is an invalid string
        # escape (a warning on current Python 3). "\s" already matches
        # "\n", so the separate "\n" of the old pattern is unnecessary.
        text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    # NOTE(review): the collapse_* helpers are defined outside this block;
    # their exact collapsing rules should be confirmed against them.
    text = collapse_linebreaks(text, threshold=1).replace('\n', ' ')
    text = collapse_tabs(text, indentation=False, replace=' ')
    text = collapse_spaces(text, indentation=False, replace=' ')
    return text.strip()