def clean_element(xml: str, selector=':root', progress_iterator=None,
                  num_processes=1, **kwargs) -> str:
    """Return xml with the text of the selected elements cleaned up.

    Every non-whitespace text node found below an element matching
    ``selector`` is run through ``cleanup`` and replaced by the result.

    Args:
        xml: The markup to process.
        selector: CSS selector choosing the elements whose text is cleaned.
        progress_iterator: Optional callable that wraps an iterable and
            yields its items in order (e.g. a progress bar).
        num_processes: When greater than 1, the cleanup calls are spread
            over a ``multiprocessing.Pool`` of that size.
        **kwargs: Passed to text_cleanup.raw.cleanup().

    Returns:
        The processed document serialised back to a string.
    """
    # Use html.parser so that it doesn't try to fix the structure
    soup = BeautifulSoup(xml, 'html.parser')

    # Build the entire list first to avoid modifying a live iterator.
    # Text nodes compare and hash like plain strings, so collecting them in
    # a set would merge distinct nodes that happen to hold the same text and
    # leave all but one of them untouched.  De-duplicate by identity instead
    # (a node can be reached twice when matched elements are nested).
    # The cleanup options in kwargs are deliberately not handed to select().
    nodes = []
    seen_ids = set()
    for element in soup.select(selector):
        for node in element.strings:
            if id(node) in seen_ids or node.isspace():
                continue
            seen_ids.add(id(node))
            nodes.append(node)

    if progress_iterator is None:
        def progress_iterator(items):
            return items

    # Plain str copies: these are what gets sent to the worker processes.
    texts = [str(node) for node in nodes]

    if num_processes > 1:
        with multiprocessing.Pool(num_processes) as pool:
            futures = [
                pool.apply_async(cleanup, (text,), kwargs) for text in texts]
            # Collect inside the with-block: leaving it terminates the pool.
            fixed = [future.get() for future in progress_iterator(futures)]
    else:
        # Wrap the inputs so that progress advances while the work happens.
        fixed = [cleanup(text, **kwargs) for text in progress_iterator(texts)]

    for node, new in zip(nodes, fixed):
        node.replace_with(new)
        # Maybe show small diff here?
    return str(soup)
def test_complicated_sample(self):
    # Longer passage mixing several defects: missing spaces ("feared-itwas",
    # "cutthe", "alsorequested"), a misspelling ("monuscript") and a word
    # split by a hyphen ("mini- mize").
    # NOTE(review): the hyphen-split word suggests these literals originally
    # spanned several lines -- confirm their internal whitespace is intact.
    sample = """ In the context of 1960, Stranger in a Strange Land was a book that his publishers feared-itwas too far off the beaten path. So, in order to mini- mize possible losses, Robert was asked to cutthe monuscript down to 150,000 words-a loss of about 70,000 words. Other changes were alsorequested, before the editor was willing to take a chance on publication. """
    expected = """ In the context of 1960, Stranger in a Strange Land was a book that his publishers feared-it was too far off the beaten path. So, in order to minimize possible losses, Robert was asked to cut the manuscript down to 150,000 words-a loss of about 70,000 words. Other changes were also requested, before the editor was willing to take a chance on publication. """
    # raw.cleanup() is called with its default options.
    result = raw.cleanup(sample)
    self.assertEqual(result, expected)
def main(argv=None):
    """Entry point for text-cleanup cli.

    Reads the input (a file or stdin), cleans it either as raw text or as
    XML, and writes the result to the chosen output (a file or stdout).

    Args:
        argv: Command-line arguments without the program name.  ``None``
            (the default) lets argparse fall back to ``sys.argv[1:]``; an
            explicit list, including an empty one, is used as given.
    """
    # The text is the description; passed positionally it would become the
    # program name shown in the usage line.
    parser = argparse.ArgumentParser(description="Clean up text.")
    parser.add_argument('input', nargs='?',
                        type=argparse.FileType(encoding='utf-8'),
                        help="The input file to clean up.",
                        default=sys.stdin)
    parser.add_argument('--output',
                        type=argparse.FileType(mode='w', encoding='utf-8'),
                        help="Write results to this filename.",
                        default=sys.stdout)
    parser.add_argument(
        '--selector', '-s',
        help="Only clean elements matching this CSS selector. Implies --xml.")
    parser.add_argument('--xml', action='store_true',
                        help="Assume XML input.")
    parser.add_argument('--num_processes', '-n', metavar='N',
                        help="Utilize N processes.",
                        type=int, default=1)
    # store_true: each kind of correction stays enabled unless its
    # --disallow_* flag is given.
    parser.add_argument('--disallow_substitution', action='store_true',
                        help='Do not allow the correction to substitute '
                             'letters.')
    parser.add_argument('--disallow_deletion', action='store_true',
                        help='Do not allow the correction to delete letters.')
    parser.add_argument('--disallow_insertion', action='store_true',
                        help='Do not allow the correction to insert letters.')
    parser.add_argument('--avoid_capitalized_words', action='store_true',
                        help=("Ignore words starting with a capital letter "
                              "unless we're *really* sure."))
    # Only one option lives in this group so far.
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--reformat-only', action='store_true',
        help="Prettify XML input without changing any of the text.")

    args = parser.parse_args(argv)

    # Fix dependencies between arguments (e.g. x implies y)
    if args.selector or args.reformat_only:
        args.xml = True
    if args.selector is None:
        args.selector = ':root'

    # Options shared by the raw-text and the XML code paths.
    cleanup_options = {
        'insertion': not args.disallow_insertion,
        'deletion': not args.disallow_deletion,
        'substitution': not args.disallow_substitution,
        'avoid_capitalized_words': args.avoid_capitalized_words,
    }

    content = args.input.read()
    if not args.xml:
        output = raw.cleanup(content, **cleanup_options)
    elif args.reformat_only:
        output = XML.reformat(content)
    else:
        output = XML.clean_element(
            content, args.selector,
            progress_iterator=progressbar.progressbar,
            num_processes=args.num_processes,
            **cleanup_options,
        )
    args.output.write(output)
def test_complicated_punctuation(self):
    # Misspelled words sitting right next to quotes, an apostrophe and a
    # hyphen.
    garbled = """"Wait! I con't!" he said agaon-twice thot day now."""
    corrected = """"Wait! I can't!" he said again-twice that day now."""
    self.assertEqual(raw.cleanup(garbled), corrected)
def test_missing_spaces_with_errors(self):
    # Words run together, one of which ("texthqs") is also misspelled.
    garbled = "This texthqs missingspaces, but also someerrors."
    corrected = "This text has missing spaces, but also some errors."
    self.assertEqual(raw.cleanup(garbled), corrected)
def test_missing_spaces(self):
    # Correctly spelled words that merely lack the space between them.
    garbled = "This texthas a few missingspaces."
    corrected = "This text has a few missing spaces."
    self.assertEqual(raw.cleanup(garbled), corrected)
def test_simple_misspelling(self):
    # A single wrong letter ("tixt") in an otherwise clean sentence.
    garbled = "This tixt has one error."
    corrected = "This text has one error."
    self.assertEqual(raw.cleanup(garbled), corrected)
def test_noop(self):
    # The sentence serves as both the input and the expected output.
    clean_text = "This text has no errors."
    self.assertEqual(raw.cleanup(clean_text), clean_text)