def test_collapse_spaces(self):
    # Each pair is (input, expected result of web.collapse_spaces() when
    # called with default arguments).
    cases = (
        (" ", ""),
        (" .. ", ".."),
        (". .", ". ."),
        (". \n", "."),
        ("\xa0", ""),
    )
    # Assert collapse multiple spaces.
    for raw, expected in cases:
        self.assertEqual(web.collapse_spaces(raw), expected)
    # Assert preserve indentation.
    self.assertEqual(web.collapse_spaces(" . \n", indentation=True), " .")
    # Parenthesized single-argument form prints the same text whether it is
    # parsed as a Python 2 print statement or a Python 3 function call.
    print("pattern.web.collapse_spaces()")
def cleanup(text, remove_punctuation=True):
    """Return *text* flattened to a single stripped line of text.

    When ``remove_punctuation`` is true, every run of characters that is
    neither an ASCII letter, an ASCII digit, nor whitespace is deleted first
    (this also drops non-ASCII letters, not only punctuation).

    The text is then passed through ``collapse_linebreaks`` (threshold=1),
    any remaining newlines are replaced by spaces, and the result is passed
    through ``collapse_tabs`` and ``collapse_spaces`` with a single space as
    the replacement. Leading and trailing whitespace is stripped from the
    final string.
    """
    if remove_punctuation:
        # Raw string so that ``\s`` reaches the regex engine untouched rather
        # than being an invalid string escape; ``\n`` is matched identically
        # either way (and is already covered by ``\s``).
        text = re.sub(r'[^A-Za-z0-9\s\n]+', '', text)
    # Collapse line breaks, then turn whatever newlines remain into spaces so
    # the later passes see one line.
    text = collapse_linebreaks(text, threshold=1).replace('\n', ' ')
    text = collapse_tabs(text, indentation=False, replace=' ')
    text = collapse_spaces(text, indentation=False, replace=' ')
    return text.strip()