Esempio n. 1
0
def html2text(data, method='lynx'):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx' (default) - Use "lynx -dump" for conversion
     'html2text'      - Use "html2text -nobs" for conversion
     're'             - A simple regex-based HTML tag stripper
     'pyhtml2text'    - Use Python module "html2text", keeps link targets

    :param data: the HTML document, as a str
    :param method: name of the conversion backend (see list above)
    :returns: the plain-text rendering, as a str
    :raises ValueError: if ``method`` is not one of the names listed above
    """
    if method == 're':
        # Drop everything that looks like a tag, then discard blank lines
        # and trailing whitespace on the remaining ones.
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        return '\n'.join(line.rstrip() for line in stripped_tags.splitlines() if line.strip())

    if method == 'pyhtml2text':
        # Imported lazily (and under an alias, so it does not shadow this
        # function's name) because the module is only needed for this method.
        import html2text as pyhtml2text
        return pyhtml2text.HTML2Text().handle(data)

    if method == 'lynx':
        cmd = ['lynx', '-nonumbers', '-dump', '-stdin', '-assume_charset=UTF-8', '-display_charset=UTF-8']
    elif method == 'html2text':
        cmd = ['html2text', '-nobs', '-utf8']
    else:
        raise ValueError('Unknown html2text method: %r' % (method,))

    # Both external tools are asked to emit UTF-8 (see the flags above).
    stdout_encoding = 'utf-8'

    logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding)

    # Run the child with the parent's environment, but force a UTF-8 locale.
    env = dict(os.environ)
    env['LANG'] = 'en_US.utf-8'
    env['LC_ALL'] = 'en_US.utf-8'

    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=env)
    # stderr is not piped, so the second element of the result is always None.
    stdout, _ = proc.communicate(data.encode('utf-8'))
    stdout = stdout.decode(stdout_encoding)

    if method == 'lynx':
        # The temp directory is a literal path, not a pattern: escape it so
        # regex metacharacters in TMPDIR cannot change what is matched.
        tmpdir = re.escape(os.environ.get('TMPDIR', '/tmp'))

        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]

        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112
        stdout = re.sub(r'file://%s/[^/]*/' % (tmpdir,), '', stdout)

        # Use the following regular expression to remove the unnecessary
        # parts, so that [RANDOM STRING] (changing on each call) does not
        # expose itself as change on the website (it's a Lynx-related thing)
        # Thanks to Evert Meulie for pointing that out
        stdout = re.sub(r'file://localhost%s/[^/]*/' % (tmpdir,), '', stdout)
        # Also remove file names like L9816-5928TMP.html
        stdout = re.sub(r'L\d+-\d+TMP\.html', '', stdout)

    return stdout.strip()
Esempio n. 2
0
def html2text(data, baseurl, method, options):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx'           - Use "lynx -dump" for conversion
                        options: see "lynx -help" output for options that work with "-dump"
     'html2text'      - Use "html2text -nobs" for conversion
                        options: https://linux.die.net/man/1/html2text
     'bs4'            - Use Beautiful Soup library to extract the text
                        options: "parser" only, bs4 supports "lxml", "html5lib", and "html.parser"
                        https://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use
     're'             - A simple regex-based HTML tag stripper
     'pyhtml2text'    - Use Python module "html2text"
                        options: https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options

    :param data: the HTML document, as a str
    :param baseurl: assigned to the parser's ``baseurl`` attribute; only
                    used by the 'pyhtml2text' method
    :param method: name of the conversion backend (see list above)
    :param options: mapping of backend-specific options (may be None or
                    empty); it is never modified. For 'lynx' and 'html2text'
                    a value of True adds the bare flag "-<name>", False/None
                    skips the option, any other value is passed as the
                    flag's argument.
    :returns: the plain-text rendering, as a str
    :raises ValueError: if ``method`` is not one of the names listed above
    """
    options = options or {}

    if method == 're':
        # Drop everything that looks like a tag, then discard blank lines
        # and trailing whitespace on the remaining ones.
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        return '\n'.join(line.rstrip() for line in stripped_tags.splitlines()
                         if line.strip())

    if method == 'pyhtml2text':
        # Imported lazily (and under an alias, so it does not shadow this
        # function's name) because the module is only needed for this method.
        import html2text as pyhtml2text
        parser = pyhtml2text.HTML2Text()
        parser.baseurl = baseurl
        # Options map directly onto (lower-cased) parser attributes.
        for k, v in options.items():
            setattr(parser, k.lower(), v)
        return parser.handle(data)

    if method == 'bs4':
        from bs4 import BeautifulSoup
        # .get() rather than .pop(): the caller's dict must stay intact so
        # that the same options can be reused for the next call.
        parser = options.get('parser', 'lxml')
        soup = BeautifulSoup(data, parser)
        return soup.get_text(strip=True)

    if method == 'lynx':
        # Each argv element is one argument: the charset values have to be
        # attached with "=", not embedded after a space.
        cmd = [
            'lynx', '-nonumbers', '-dump', '-stdin', '-assume_charset=UTF-8',
            '-display_charset=UTF-8'
        ]
    elif method == 'html2text':
        cmd = ['html2text', '-nobs', '-utf8']
    else:
        raise ValueError('Unknown html2text method: %r' % (method, ))

    # Both external tools are asked to emit UTF-8 (see the flags above).
    stdout_encoding = 'utf-8'

    for k, v in options.items():
        if v is True:
            # Plain switch without a value
            cmd.append('-%s' % (k, ))
        elif v is False or v is None:
            # Explicitly disabled switch: do not pass it at all
            continue
        elif method == 'lynx':
            # lynx documents valued options as -name=value
            cmd.append('-%s=%s' % (k, v))
        else:
            # html2text takes the value as the following argument
            cmd.extend(['-%s' % (k, ), str(v)])

    logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding)

    # Run the child with the parent's environment, but force a UTF-8 locale.
    env = dict(os.environ)
    env['LANG'] = 'en_US.utf-8'
    env['LC_ALL'] = 'en_US.utf-8'

    proc = subprocess.Popen(cmd,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            env=env)
    # stderr is not piped, so the second element of the result is always None.
    stdout, _ = proc.communicate(data.encode('utf-8'))
    stdout = stdout.decode(stdout_encoding)

    if method == 'lynx':
        # The temp directory is a literal path, not a pattern: escape it so
        # regex metacharacters in TMPDIR cannot change what is matched.
        tmpdir = re.escape(os.environ.get('TMPDIR', '/tmp'))

        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]

        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112
        stdout = re.sub(r'file://%s/[^/]*/' % (tmpdir, ), '', stdout)

        # Use the following regular expression to remove the unnecessary
        # parts, so that [RANDOM STRING] (changing on each call) does not
        # expose itself as change on the website (it's a Lynx-related thing)
        # Thanks to Evert Meulie for pointing that out
        stdout = re.sub(r'file://localhost%s/[^/]*/' % (tmpdir, ), '', stdout)
        # Also remove file names like L9816-5928TMP.html
        stdout = re.sub(r'L\d+-\d+TMP\.html', '', stdout)

    return stdout.strip()
Esempio n. 3
0
def html2text(data, method, options):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx'           - Use "lynx -dump" for conversion
                        options: see "lynx -help" output for options that work with "-dump"
     'html2text'      - Use "html2text -nobs" for conversion
                        options: https://linux.die.net/man/1/html2text
     'bs4'            - Use Beautiful Soup library to extract the text
                        options: "parser" only, bs4 supports "lxml", "html5lib", and "html.parser"
                        http://beautiful-soup-4.readthedocs.io/en/latest/#specifying-the-parser-to-use
     're'             - A simple regex-based HTML tag stripper
     'pyhtml2text'    - Use Python module "html2text"
                        options: https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options

    :param data: the HTML document, as a str
    :param method: name of the conversion backend (see list above)
    :param options: mapping of backend-specific options (may be None or
                    empty); it is never modified. For 'lynx' and 'html2text'
                    a value of True adds the bare flag "-<name>", False/None
                    skips the option, any other value is passed as the
                    flag's argument.
    :returns: the plain-text rendering, as a str
    :raises ValueError: if ``method`` is not one of the names listed above
    """
    options = options or {}

    if method == 're':
        # Drop everything that looks like a tag, then discard blank lines
        # and trailing whitespace on the remaining ones.
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        return '\n'.join(line.rstrip() for line in stripped_tags.splitlines() if line.strip())

    if method == 'pyhtml2text':
        # Imported lazily (and under an alias, so it does not shadow this
        # function's name) because the module is only needed for this method.
        import html2text as pyhtml2text
        parser = pyhtml2text.HTML2Text()
        # Options map directly onto (lower-cased) parser attributes.
        for k, v in options.items():
            setattr(parser, k.lower(), v)
        return parser.handle(data)

    if method == 'bs4':
        from bs4 import BeautifulSoup
        # .get() rather than .pop(): the caller's dict must stay intact so
        # that the same options can be reused for the next call.
        parser = options.get('parser', 'html.parser')
        soup = BeautifulSoup(data, parser)
        return soup.get_text(strip=True)

    if method == 'lynx':
        # Each argv element is one argument: the charset values have to be
        # attached with "=", not embedded after a space.
        cmd = ['lynx', '-nonumbers', '-dump', '-stdin', '-assume_charset=UTF-8', '-display_charset=UTF-8']
    elif method == 'html2text':
        cmd = ['html2text', '-nobs', '-utf8']
    else:
        raise ValueError('Unknown html2text method: %r' % (method,))

    # Both external tools are asked to emit UTF-8 (see the flags above).
    stdout_encoding = 'utf-8'

    for k, v in options.items():
        if v is True:
            # Plain switch without a value
            cmd.append('-%s' % (k,))
        elif v is False or v is None:
            # Explicitly disabled switch: do not pass it at all
            continue
        elif method == 'lynx':
            # lynx documents valued options as -name=value
            cmd.append('-%s=%s' % (k, v))
        else:
            # html2text takes the value as the following argument
            cmd.extend(['-%s' % (k,), str(v)])

    logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding)

    # Run the child with the parent's environment, but force a UTF-8 locale.
    env = dict(os.environ)
    env['LANG'] = 'en_US.utf-8'
    env['LC_ALL'] = 'en_US.utf-8'

    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=env)
    # stderr is not piped, so the second element of the result is always None.
    stdout, _ = proc.communicate(data.encode('utf-8'))
    stdout = stdout.decode(stdout_encoding)

    if method == 'lynx':
        # The temp directory is a literal path, not a pattern: escape it so
        # regex metacharacters in TMPDIR cannot change what is matched.
        tmpdir = re.escape(os.environ.get('TMPDIR', '/tmp'))

        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]

        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112
        stdout = re.sub(r'file://%s/[^/]*/' % (tmpdir,), '', stdout)

        # Use the following regular expression to remove the unnecessary
        # parts, so that [RANDOM STRING] (changing on each call) does not
        # expose itself as change on the website (it's a Lynx-related thing)
        # Thanks to Evert Meulie for pointing that out
        stdout = re.sub(r'file://localhost%s/[^/]*/' % (tmpdir,), '', stdout)
        # Also remove file names like L9816-5928TMP.html
        stdout = re.sub(r'L\d+-\d+TMP\.html', '', stdout)

    return stdout.strip()