def cleanup(text, remove_punctuation=True):
    """Normalize *text* into a single whitespace-trimmed line.

    Args:
        text: The string to clean.
        remove_punctuation: When true (the default), delete every run of
            characters that are not ASCII letters, digits, or whitespace
            before the whitespace is normalized.

    Returns:
        The cleaned string with line breaks replaced by spaces and
        leading/trailing whitespace stripped.
    """
    if remove_punctuation:
        # Raw string so that ``\s`` reaches the regex engine as written
        # instead of relying on Python passing an unknown string escape
        # through unchanged. The pattern itself is the same as before.
        text = re.sub(r'[^A-Za-z0-9\s\n]+', '', text)
    # collapse_linebreaks/collapse_tabs/collapse_spaces are defined
    # elsewhere; presumably they squeeze repeated line breaks, tabs and
    # spaces respectively -- confirm against their definitions.
    text = collapse_linebreaks(text, threshold=1).replace('\n', ' ')
    text = collapse_tabs(text, indentation=False, replace=' ')
    text = collapse_spaces(text, indentation=False, replace=' ')
    return text.strip()
def test_collapse_linebreaks(self):
    # Assert collapse multiple linebreaks.
    # Each pair is (input, expected output of web.collapse_linebreaks):
    # a run of newlines becomes a single newline, "\r\n" becomes "\n",
    # whitespace after a newline is kept, and whitespace before a
    # newline is dropped.
    cases = (
        ("\n\n\n", "\n"),
        (".\n\n.", ".\n."),
        (".\r\n.", ".\n."),
        (".\n .", ".\n ."),
        (" \n .", "\n ."),
    )
    for a, b in cases:
        self.assertEqual(web.collapse_linebreaks(a), b)
    # Parenthesized single-argument form prints the same text on both
    # Python 2 and Python 3; the bare print statement is a SyntaxError
    # on Python 3.
    print("pattern.web.collapse_linebreaks()")