Ejemplo n.º 1
0
def clean_markup(markup, keep_links=False, ignore_headers=True):
    """
    Clean Wikimarkup to produce plaintext.

    :param markup: raw wiki markup string to clean.
    :param keep_links: Set to True to keep internal and external links.
    :param ignore_headers: if set to True, the output list will not contain
        headers ("## "-prefixed entries), only paragraph text.

    Returns a list of paragraphs (unicode strings).
    """

    if not keep_links:
        ignoreTag("a")

    extractor = Extractor(0, "", [])

    # clean_text returns a list of paragraph strings (second item unused here)
    paragraphs, _ = extractor.clean_text(markup,
                                         mark_headers=True,
                                         expand_templates=False,
                                         escape_doc=True)
    resetIgnoredTags()

    if ignore_headers:
        # List comprehension instead of filter(): on Python 3 filter()
        # returns a lazy iterator, breaking the documented list return type.
        paragraphs = [p for p in paragraphs if not p.startswith("## ")]

    return paragraphs
Ejemplo n.º 2
0
def clean_markup(markup, keep_links=False, ignore_headers=True):
    """
    Clean Wikimarkup to produce plaintext.

    :param markup: raw wiki markup string to clean.
    :param keep_links: Set to True to keep internal and external links.
    :param ignore_headers: if set to True, the output list will not contain
        headers ("## "-prefixed entries), only paragraph text.

    Returns a list of paragraphs (unicode strings).
    """

    if not keep_links:
        ignoreTag("a")

    extractor = Extractor(0, "", [])

    # returns a list of strings
    paragraphs = extractor.clean_text(markup, mark_headers=True, expand_templates=False, escape_doc=True)
    resetIgnoredTags()

    if ignore_headers:
        # List comprehension instead of filter(): on Python 3 filter()
        # returns a lazy iterator, breaking the documented list return type.
        paragraphs = [p for p in paragraphs if not p.startswith("## ")]

    return paragraphs
Ejemplo n.º 3
0
def main():
    """
    Command-line entry point.

    Parses options, configures the Extractor class and the module-level
    globals, then either analyzes a single article (--article) or runs a
    full extraction of the XML wiki dump via process_dump().
    """
    global acceptedNamespaces
    global expand_templates, templateCache

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="XML wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument(
        "-o",
        "--output",
        default="text",
        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument(
        "-b",
        "--bytes",
        default="1M",
        help=
        "maximum bytes per output file (default %(default)s); 0 means to put a single article per file",
        metavar="n[KMG]")
    groupO.add_argument("-c",
                        "--compress",
                        action="store_true",
                        help="compress output files using bzip")
    groupO.add_argument(
        "--json",
        action="store_true",
        help="write output in json format instead of the default <doc> format")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html",
                        action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l",
                        "--links",
                        action="store_true",
                        help="preserve links")
    groupP.add_argument("-ns",
                        "--namespaces",
                        default="",
                        metavar="ns1,ns2",
                        help="accepted namespaces")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    groupP.add_argument("--no-templates",
                        action="store_false",
                        help="Do not expand templates")
    groupP.add_argument(
        "--html-safe",
        default=True,
        help="use to produce HTML safe output within <doc>...</doc>")
    default_process_count = cpu_count() - 1
    parser.add_argument(
        "--processes",
        type=int,
        default=default_process_count,
        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q",
                        "--quiet",
                        action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug",
                        action="store_true",
                        help="print debug info")
    groupS.add_argument(
        "-a",
        "--article",
        action="store_true",
        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("-v",
                        "--version",
                        action="version",
                        version='%(prog)s ' + __version__,
                        help="print program version")

    args = parser.parse_args()

    Extractor.keepLinks = args.links
    Extractor.HtmlFormatting = args.html
    if args.html:
        # HTML output implies keeping links (subsumes --links).
        Extractor.keepLinks = True
    Extractor.to_json = args.json

    expand_templates = args.no_templates

    try:
        # Map the K/M/G suffix to a power of 1024; 'find' returns -1 (so
        # power 0) when the last character is not a known suffix.
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        # 0 bytes means put a single article per file.
        file_size = 0 if args.bytes == '0' else int(
            args.bytes[:-1]) * 1024**power
        if file_size and file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)
    if args.debug:
        logger.setLevel(logging.DEBUG)

    input_file = args.input

    if not Extractor.keepLinks:
        ignoreTag('a')

    # sharing cache of parser templates is too slow:
    # manager = Manager()
    # templateCache = manager.dict()

    if args.article:
        # Debug mode: extract one article from a plain file to stdout.
        if args.templates:
            if os.path.exists(args.templates):
                with open(args.templates) as template_file:
                    load_templates(template_file)

        urlbase = ''
        # Local names chosen so as not to shadow the builtins input/id.
        with open(input_file) as dump_file:
            for page_id, revid, title, page in collect_pages(dump_file):
                Extractor(page_id, revid, urlbase, title, page).extract(sys.stdout)
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except OSError:
            # Narrowed from a bare except: only filesystem errors expected here.
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes, args.html_safe)
Ejemplo n.º 4
0
def main():
    """
    Command-line entry point.

    Parses options, configures the Extractor class and the module-level
    globals, then either analyzes a single article (--article) or runs a
    full extraction of the XML wiki dump via process_dump().
    """
    global urlbase, acceptedNamespaces
    global expand_templates, templateCache, escape_doc

    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=__doc__)
    parser.add_argument("input",
                        help="XML wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument("-o", "--output", default="text",
                        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument("-b", "--bytes", default="1M",
                        help="maximum bytes per output file (default %(default)s)",
                        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html", action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l", "--links", action="store_true",
                        help="preserve links")
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    groupP.add_argument("--no-templates", action="store_false",
                        help="Do not expand templates")
    groupP.add_argument("--escapedoc", action="store_true",
                        help="use to escape the contents of the output <doc>...</doc>")
    default_process_count = cpu_count() - 1
    parser.add_argument("--processes", type=int, default=default_process_count,
                        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug", action="store_true",
                        help="print debug info")
    groupS.add_argument("-a", "--article", action="store_true",
                        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")

    args = parser.parse_args()

    Extractor.keepLinks = args.links
    Extractor.toHTML = args.html
    if args.html:
        # HTML output implies keeping links (subsumes --links).
        Extractor.keepLinks = True

    expand_templates = args.no_templates
    escape_doc = args.escapedoc

    try:
        # Map the K/M/G suffix to a power of 1024; 'find' returns -1 (so
        # power 0) when the last character is not a known suffix.
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        file_size = int(args.bytes[:-1]) * 1024 ** power
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)
    if args.debug:
        logger.setLevel(logging.DEBUG)

    input_file = args.input

    if not Extractor.keepLinks:
        ignoreTag('a')

    # sharing cache of parser templates is too slow:
    # manager = Manager()
    # templateCache = manager.dict()

    if args.article:
        # Debug mode: extract one article from a plain file to stdout.
        if args.templates:
            if os.path.exists(args.templates):
                with open(args.templates) as template_file:
                    load_templates(template_file)

        # Open as UTF-8 text: on Python 3 read() already yields str, and
        # str has no .decode(), so the old read().decode('utf-8') crashed.
        with open(input_file, encoding='utf-8') as dump_file:
            page = dump_file.read()
            m = re.search(r'<id>(.*)</id>', page)
            page_id = m.group(1) if m else 0
            m = re.search(r'<title>(.*)</title>', page)
            if m:
                title = m.group(1)
            else:
                logging.error('Missing title element')
                return
            Extractor(page_id, title, [page]).extract(sys.stdout)
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except OSError:
            # Narrowed from a bare except: only filesystem errors expected here.
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes)
Ejemplo n.º 5
0
def main():
    """
    Command-line entry point.

    Parses options, configures the Extractor class and the module-level
    globals, then either analyzes a single article (--article) or runs a
    full extraction of the XML wiki dump via process_dump().
    """
    global urlbase, acceptedNamespaces
    global expand_templates, templateCache, escape_doc

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="XML wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument(
        "-o",
        "--output",
        default="text",
        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument(
        "-b",
        "--bytes",
        default="1M",
        help="maximum bytes per output file (default %(default)s)",
        metavar="n[KMG]")
    groupO.add_argument("-c",
                        "--compress",
                        action="store_true",
                        help="compress output files using bzip")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html",
                        action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l",
                        "--links",
                        action="store_true",
                        help="preserve links")
    groupP.add_argument("-ns",
                        "--namespaces",
                        default="",
                        metavar="ns1,ns2",
                        help="accepted namespaces")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    groupP.add_argument("--no-templates",
                        action="store_false",
                        help="Do not expand templates")
    groupP.add_argument(
        "--escapedoc",
        action="store_true",
        help="use to escape the contents of the output <doc>...</doc>")
    default_process_count = cpu_count() - 1
    parser.add_argument(
        "--processes",
        type=int,
        default=default_process_count,
        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q",
                        "--quiet",
                        action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug",
                        action="store_true",
                        help="print debug info")
    groupS.add_argument(
        "-a",
        "--article",
        action="store_true",
        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("-v",
                        "--version",
                        action="version",
                        version='%(prog)s ' + version,
                        help="print program version")

    args = parser.parse_args()

    Extractor.keepLinks = args.links
    Extractor.toHTML = args.html
    if args.html:
        # HTML output implies keeping links (subsumes --links).
        Extractor.keepLinks = True

    expand_templates = args.no_templates
    escape_doc = args.escapedoc

    try:
        # Map the K/M/G suffix to a power of 1024; 'find' returns -1 (so
        # power 0) when the last character is not a known suffix.
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        file_size = int(args.bytes[:-1]) * 1024**power
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)
    if args.debug:
        logger.setLevel(logging.DEBUG)

    input_file = args.input

    if not Extractor.keepLinks:
        ignoreTag('a')

    # sharing cache of parser templates is too slow:
    # manager = Manager()
    # templateCache = manager.dict()

    if args.article:
        # Debug mode: extract one article from a plain file to stdout.
        if args.templates:
            if os.path.exists(args.templates):
                with open(args.templates) as template_file:
                    load_templates(template_file)

        # Open as UTF-8 text: on Python 3 read() already yields str, and
        # str has no .decode(), so the old read().decode('utf-8') crashed.
        with open(input_file, encoding='utf-8') as dump_file:
            page = dump_file.read()
            m = re.search(r'<id>(.*)</id>', page)
            page_id = m.group(1) if m else 0
            m = re.search(r'<title>(.*)</title>', page)
            if m:
                title = m.group(1)
            else:
                logging.error('Missing title element')
                return
            Extractor(page_id, title, [page]).extract(sys.stdout)
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except OSError:
            # Narrowed from a bare except: only filesystem errors expected here.
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes)