def clean_markup(markup, keep_links=False, ignore_headers=True):
    """
    Clean Wikimarkup to produce plaintext.

    :param markup: the Wikimarkup string to clean.
    :param keep_links: Set to True to keep internal and external links.
    :param ignore_headers: if set to True, the output list will not contain
        headers, only body paragraphs.

    Returns a list of paragraphs (unicode strings).
    """
    if not keep_links:
        ignoreTag("a")
    extractor = Extractor(0, "", [])

    # clean_text returns (list of strings, ancillary data); only the
    # paragraphs are needed here.
    paragraphs, _ = extractor.clean_text(markup,
                                         mark_headers=True,
                                         expand_templates=False,
                                         escape_doc=True)
    resetIgnoredTags()

    if ignore_headers:
        # Headers are marked with a leading "## " by mark_headers=True.
        # A list comprehension (rather than filter()) guarantees a real list
        # is returned on Python 3, where filter() yields a lazy iterator —
        # the docstring promises a list.
        paragraphs = [p for p in paragraphs if not p.startswith("## ")]

    return paragraphs
def clean_markup(markup, keep_links=False, ignore_headers=True):
    """
    Clean Wikimarkup to produce plaintext.

    :param markup: the Wikimarkup string to clean.
    :param keep_links: Set to True to keep internal and external links.
    :param ignore_headers: if set to True, the output list will not contain
        headers, only body paragraphs.

    Returns a list of paragraphs (unicode strings).
    """
    if not keep_links:
        ignoreTag("a")
    extractor = Extractor(0, "", [])

    # clean_text returns a list of strings in this variant.
    paragraphs = extractor.clean_text(markup,
                                      mark_headers=True,
                                      expand_templates=False,
                                      escape_doc=True)
    resetIgnoredTags()

    if ignore_headers:
        # Headers are marked with a leading "## " by mark_headers=True.
        # A list comprehension (rather than filter()) guarantees a real list
        # is returned on Python 3, where filter() yields a lazy iterator —
        # the docstring promises a list.
        paragraphs = [p for p in paragraphs if not p.startswith("## ")]

    return paragraphs
def main():
    """Command-line entry point.

    Parses arguments, configures the global extraction options
    (``acceptedNamespaces``, ``expand_templates``) and the ``Extractor``
    class flags, then either processes a single article (``--article``)
    or runs the full dump extraction via ``process_dump``.
    """
    global acceptedNamespaces
    global expand_templates, templateCache

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="XML wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument(
        "-o", "--output", default="text",
        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument(
        "-b", "--bytes", default="1M",
        help=
        "maximum bytes per output file (default %(default)s); 0 means to put a single article per file",
        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")
    groupO.add_argument(
        "--json", action="store_true",
        help="write output in json format instead of the default <doc> format")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html", action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l", "--links", action="store_true",
                        help="preserve links")
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    groupP.add_argument("--no-templates", action="store_false",
                        help="Do not expand templates")
    groupP.add_argument(
        "--html-safe", default=True,
        help="use to produce HTML safe output within <doc>...</doc>")
    default_process_count = cpu_count() - 1
    parser.add_argument(
        "--processes", type=int, default=default_process_count,
        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug", action="store_true",
                        help="print debug info")
    groupS.add_argument(
        "-a", "--article", action="store_true",
        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + __version__,
                        help="print program version")

    args = parser.parse_args()

    Extractor.keepLinks = args.links
    Extractor.HtmlFormatting = args.html
    if args.html:
        Extractor.keepLinks = True
    Extractor.to_json = args.json

    expand_templates = args.no_templates

    try:
        # Suffix k/m/g selects the power of 1024; an unrecognized suffix
        # yields power == 0 (plain bytes).
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        # 0 bytes means put a single article per file.
        file_size = 0 if args.bytes == '0' else int(
            args.bytes[:-1]) * 1024 ** power
        if file_size and file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)
    if args.debug:
        logger.setLevel(logging.DEBUG)

    input_file = args.input

    if not Extractor.keepLinks:
        ignoreTag('a')

    # sharing cache of parser templates is too slow:
    # manager = Manager()
    # templateCache = manager.dict()

    if args.article:
        if args.templates:
            if os.path.exists(args.templates):
                with open(args.templates) as file:
                    load_templates(file)

        urlbase = ''
        with open(input_file) as input:
            for id, revid, title, page in collect_pages(input):
                Extractor(id, revid, urlbase, title, page).extract(sys.stdout)
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        # Narrowed from a bare `except:` — a bare except also swallowed
        # SystemExit/KeyboardInterrupt; os.makedirs failures raise OSError.
        except OSError:
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes, args.html_safe)
def main():
    """Command-line entry point.

    Parses arguments, configures the global extraction options
    (``urlbase``, ``acceptedNamespaces``, ``expand_templates``,
    ``escape_doc``) and the ``Extractor`` class flags, then either
    processes a single article (``--article``) or runs the full dump
    extraction via ``process_dump``.
    """
    global urlbase, acceptedNamespaces
    global expand_templates, templateCache, escape_doc

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="XML wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument(
        "-o", "--output", default="text",
        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument(
        "-b", "--bytes", default="1M",
        help="maximum bytes per output file (default %(default)s)",
        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html", action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l", "--links", action="store_true",
                        help="preserve links")
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    groupP.add_argument("--no-templates", action="store_false",
                        help="Do not expand templates")
    groupP.add_argument(
        "--escapedoc", action="store_true",
        help="use to escape the contents of the output <doc>...</doc>")
    default_process_count = cpu_count() - 1
    parser.add_argument(
        "--processes", type=int, default=default_process_count,
        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug", action="store_true",
                        help="print debug info")
    groupS.add_argument(
        "-a", "--article", action="store_true",
        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")

    args = parser.parse_args()

    Extractor.keepLinks = args.links
    Extractor.toHTML = args.html
    if args.html:
        Extractor.keepLinks = True

    expand_templates = args.no_templates
    escape_doc = args.escapedoc

    try:
        # Suffix k/m/g selects the power of 1024; an unrecognized suffix
        # yields power == 0 (plain bytes).
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        file_size = int(args.bytes[:-1]) * 1024 ** power
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)
    if args.debug:
        logger.setLevel(logging.DEBUG)

    input_file = args.input

    if not Extractor.keepLinks:
        ignoreTag('a')

    # sharing cache of parser templates is too slow:
    # manager = Manager()
    # templateCache = manager.dict()

    if args.article:
        if args.templates:
            if os.path.exists(args.templates):
                with open(args.templates) as file:
                    load_templates(file)

        with open(input_file) as file:
            # NOTE(review): str.decode only exists on Python 2; on Python 3
            # file.read() already returns str and this line raises
            # AttributeError — confirm the supported Python version.
            page = file.read().decode('utf-8')
            m = re.search(r'<id>(.*)</id>', page)
            id = m.group(1) if m else 0
            m = re.search(r'<title>(.*)</title>', page)
            if m:
                title = m.group(1)
            else:
                logging.error('Missing title element')
                return
            Extractor(id, title, [page]).extract(sys.stdout)
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        # Narrowed from a bare `except:` — a bare except also swallowed
        # SystemExit/KeyboardInterrupt; os.makedirs failures raise OSError.
        except OSError:
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes)
def main():
    """Command-line entry point.

    Parses arguments, configures the global extraction options
    (``urlbase``, ``acceptedNamespaces``, ``expand_templates``,
    ``escape_doc``) and the ``Extractor`` class flags, then either
    processes a single article (``--article``) or runs the full dump
    extraction via ``process_dump``.
    """
    global urlbase, acceptedNamespaces
    global expand_templates, templateCache, escape_doc

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="XML wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument(
        "-o", "--output", default="text",
        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument(
        "-b", "--bytes", default="1M",
        help="maximum bytes per output file (default %(default)s)",
        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html", action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l", "--links", action="store_true",
                        help="preserve links")
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    groupP.add_argument("--no-templates", action="store_false",
                        help="Do not expand templates")
    groupP.add_argument(
        "--escapedoc", action="store_true",
        help="use to escape the contents of the output <doc>...</doc>")
    default_process_count = cpu_count() - 1
    parser.add_argument(
        "--processes", type=int, default=default_process_count,
        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug", action="store_true",
                        help="print debug info")
    groupS.add_argument(
        "-a", "--article", action="store_true",
        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")

    args = parser.parse_args()

    Extractor.keepLinks = args.links
    Extractor.toHTML = args.html
    if args.html:
        Extractor.keepLinks = True

    expand_templates = args.no_templates
    escape_doc = args.escapedoc

    try:
        # Suffix k/m/g selects the power of 1024; an unrecognized suffix
        # yields power == 0 (plain bytes).
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        file_size = int(args.bytes[:-1]) * 1024**power
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)
    if args.debug:
        logger.setLevel(logging.DEBUG)

    input_file = args.input

    if not Extractor.keepLinks:
        ignoreTag('a')

    # sharing cache of parser templates is too slow:
    # manager = Manager()
    # templateCache = manager.dict()

    if args.article:
        if args.templates:
            if os.path.exists(args.templates):
                with open(args.templates) as file:
                    load_templates(file)

        with open(input_file) as file:
            # NOTE(review): str.decode only exists on Python 2; on Python 3
            # file.read() already returns str and this line raises
            # AttributeError — confirm the supported Python version.
            page = file.read().decode('utf-8')
            m = re.search(r'<id>(.*)</id>', page)
            id = m.group(1) if m else 0
            m = re.search(r'<title>(.*)</title>', page)
            if m:
                title = m.group(1)
            else:
                logging.error('Missing title element')
                return
            Extractor(id, title, [page]).extract(sys.stdout)
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        # Narrowed from a bare `except:` — a bare except also swallowed
        # SystemExit/KeyboardInterrupt; os.makedirs failures raise OSError.
        except OSError:
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes)