def cleanup(text, remove_punctuation=True):
    """Flatten ``text`` into a single stripped line.

    When ``remove_punctuation`` is true, every run of characters that are
    not ASCII letters, digits or whitespace is deleted first (this also
    removes non-ASCII letters). The text is then passed through
    ``collapse_linebreaks``, remaining linebreaks are replaced by spaces,
    and the result goes through ``collapse_tabs`` and ``collapse_spaces``
    before leading/trailing whitespace is stripped.

    NOTE(review): the ``collapse_*`` helpers are defined outside this block;
    confirm their exact collapsing rules there.
    """
    if remove_punctuation:
        # Raw string: "\s" is not a valid str escape, so the non-raw form
        # triggers an invalid-escape warning on modern Python. The regex
        # engine interprets "\s" and "\n" itself, so matching is unchanged.
        text = re.sub(r'[^A-Za-z0-9\s\n]+', '', text)
    text = collapse_linebreaks(text, threshold=1).replace('\n', ' ')
    text = collapse_tabs(text, indentation=False, replace=' ')
    text = collapse_spaces(text, indentation=False, replace=' ')
    return text.strip()
Exemple #2
0
 def test_collapse_linebreaks(self):
     """Check web.collapse_linebreaks() against sample input/output pairs."""
     # Each entry is (input, expected output).
     cases = (
         ("\n\n\n", "\n"),      # a run of linebreaks yields a single one
         (".\n\n.", ".\n."),    # same, between other characters
         (".\r\n.", ".\n."),    # "\r\n" comes back as "\n"
         (".\n  .", ".\n  ."),  # spaces after a linebreak are kept
         (" \n  .", "\n  ."),   # a space before a linebreak is dropped
     )
     for text, expected in cases:
         self.assertEqual(web.collapse_linebreaks(text), expected)
     # Parenthesised single-argument print is valid on Python 2 and 3 alike;
     # the bare print statement is a SyntaxError on Python 3.
     print("pattern.web.collapse_linebreaks()")
Exemple #3
0
 def test_collapse_linebreaks(self):
     """Assert that web.collapse_linebreaks() returns the expected strings."""
     # (source string, string expected back from collapse_linebreaks)
     samples = (
         ("\n\n\n", "\n"),      # consecutive linebreaks -> one linebreak
         (".\n\n.", ".\n."),    # same, with surrounding characters
         (".\r\n.", ".\n."),    # "\r\n" is expected back as "\n"
         (".\n  .", ".\n  ."),  # spaces following the linebreak stay
         (" \n  .", "\n  ."),   # the space preceding the linebreak goes
     )
     for source, wanted in samples:
         self.assertEqual(web.collapse_linebreaks(source), wanted)
     # print(...) with one argument behaves the same on Python 2 and 3,
     # whereas the print statement does not parse on Python 3.
     print("pattern.web.collapse_linebreaks()")