def html2text(data, method='lynx'):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx' (default) - Use "lynx -dump" for conversion
     'html2text'      - Use "html2text -nobs" for conversion
     're'             - A simple regex-based HTML tag stripper
     'pyhtml2text'    - Use Python module "html2text", keeps link targets

    Args:
        data: The HTML document as a string.
        method: Name of the conversion backend (see above).

    Returns:
        The plain-text rendering of ``data`` as a string.

    Raises:
        ValueError: If ``method`` is not one of the names listed above.
    """
    if method == 're':
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        # Drop blank lines and trailing whitespace so that pure layout
        # changes do not show up as differences.
        return '\n'.join(line.rstrip()
                         for line in stripped_tags.splitlines()
                         if line.strip())

    if method == 'pyhtml2text':
        # Imported lazily (and aliased) so the optional dependency is only
        # needed when requested and does not shadow this function's name.
        import html2text as pyhtml2text
        return pyhtml2text.HTML2Text().handle(data)

    if method == 'lynx':
        cmd = ['lynx', '-nonumbers', '-dump', '-stdin',
               '-assume_charset=UTF-8', '-display_charset=UTF-8']
        stdout_encoding = 'utf-8'
    elif method == 'html2text':
        cmd = ['html2text', '-nobs', '-utf8']
        stdout_encoding = 'utf-8'
    else:
        raise ValueError('Unknown html2text method: %r' % (method,))

    logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding)

    # Force a UTF-8 locale for the child process so that its output
    # matches the encoding we decode with below.
    env = dict(os.environ)
    env['LANG'] = 'en_US.utf-8'
    env['LC_ALL'] = 'en_US.utf-8'

    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, env=env)
    stdout, _ = proc.communicate(data.encode('utf-8'))
    text = stdout.decode(stdout_encoding)

    if method == 'lynx':
        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]
        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112
        #
        # Remove that prefix so that [RANDOM STRING] (changing on each call)
        # does not expose itself as change on the website (it's a
        # Lynx-related thing). Thanks to Evert Meulie for pointing that out.
        #
        # TMPDIR is escaped because it is an arbitrary path, not a regex,
        # and a trailing slash is removed so the pattern still matches.
        tmpdir = re.escape(os.environ.get('TMPDIR', '/tmp').rstrip('/'))
        text = re.sub(r'file://(?:localhost)?%s/[^/]*/' % (tmpdir,), '', text)
        # Also remove file names like L9816-5928TMP.html
        text = re.sub(r'L\d+-\d+TMP\.html', '', text)

    return text.strip()
def html2text(data, baseurl, method, options):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx'        - Use "lynx -dump" for conversion
                     options: see "lynx -help" output for options that work with "-dump"
     'html2text'   - Use "html2text -nobs" for conversion
                     options: https://linux.die.net/man/1/html2text
     'bs4'         - Use Beautiful Soup library to prettify the HTML
                     options: "parser" only, bs4 supports "lxml", "html5lib", and "html.parser"
                     https://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use
     're'          - A simple regex-based HTML tag stripper
     'pyhtml2text' - Use Python module "html2text"
                     options: https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options

    Args:
        data: The HTML document as a string.
        baseurl: Base URL assigned to the "pyhtml2text" parser; the other
            methods do not use it.
        method: Name of the conversion backend (see above).
        options: Mapping of backend-specific options (may be None or empty).
            For 'lynx' and 'html2text' each entry becomes a command-line
            switch: a value of True yields "-key", any other value yields
            "-key value". The mapping is never modified.

    Returns:
        The plain-text rendering of ``data`` as a string.

    Raises:
        ValueError: If ``method`` is not one of the names listed above.
    """
    # Work on a private copy so the caller's mapping is never mutated,
    # and so that None is accepted as "no options".
    options = dict(options) if options else {}

    if method == 're':
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        # Drop blank lines and trailing whitespace so that pure layout
        # changes do not show up as differences.
        return '\n'.join(line.rstrip()
                         for line in stripped_tags.splitlines()
                         if line.strip())

    if method == 'pyhtml2text':
        # Imported lazily (and aliased) so the optional dependency is only
        # needed when requested and does not shadow this function's name.
        import html2text as pyhtml2text
        parser = pyhtml2text.HTML2Text()
        parser.baseurl = baseurl
        for key, value in options.items():
            setattr(parser, key.lower(), value)
        return parser.handle(data)

    if method == 'bs4':
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(data, options.get('parser', 'lxml'))
        return soup.get_text(strip=True)

    if method == 'lynx':
        cmd = [
            'lynx', '-nonumbers', '-dump', '-stdin',
            '-assume_charset UTF-8', '-display_charset UTF-8',
        ]
    elif method == 'html2text':
        cmd = ['html2text', '-nobs', '-utf8']
    else:
        raise ValueError('Unknown html2text method: %r' % (method, ))

    stdout_encoding = 'utf-8'

    for key, value in options.items():
        # A value of True marks a bare flag; anything else is passed along
        # as the switch's argument.
        # NOTE(review): "-key value" is emitted as ONE argv entry, like the
        # built-in lynx switches above; confirm the html2text CLI accepts
        # that form.
        if value is True:
            cmd.append('-%s' % key)
        else:
            cmd.append('-%s %s' % (key, value))

    logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding)

    # Force a UTF-8 locale for the child process so that its output
    # matches the encoding we decode with below.
    env = dict(os.environ)
    env['LANG'] = 'en_US.utf-8'
    env['LC_ALL'] = 'en_US.utf-8'

    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, env=env)
    stdout, _ = proc.communicate(data.encode('utf-8'))
    text = stdout.decode(stdout_encoding)

    if method == 'lynx':
        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]
        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112
        #
        # Remove that prefix so that [RANDOM STRING] (changing on each call)
        # does not expose itself as change on the website (it's a
        # Lynx-related thing). Thanks to Evert Meulie for pointing that out.
        #
        # TMPDIR is escaped because it is an arbitrary path, not a regex,
        # and a trailing slash is removed so the pattern still matches.
        tmpdir = re.escape(os.environ.get('TMPDIR', '/tmp').rstrip('/'))
        text = re.sub(r'file://(?:localhost)?%s/[^/]*/' % (tmpdir, ), '', text)
        # Also remove file names like L9816-5928TMP.html
        text = re.sub(r'L\d+-\d+TMP\.html', '', text)

    return text.strip()
def html2text(data, method, options):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx'        - Use "lynx -dump" for conversion
                     options: see "lynx -help" output for options that work with "-dump"
     'html2text'   - Use "html2text -nobs" for conversion
                     options: https://linux.die.net/man/1/html2text
     'bs4'         - Use Beautiful Soup library to prettify the HTML
                     options: "parser" only, bs4 supports "lxml", "html5lib", and "html.parser"
                     http://beautiful-soup-4.readthedocs.io/en/latest/#specifying-the-parser-to-use
     're'          - A simple regex-based HTML tag stripper
     'pyhtml2text' - Use Python module "html2text"
                     options: https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options

    Args:
        data: The HTML document as a string.
        method: Name of the conversion backend (see above).
        options: Mapping of backend-specific options (may be None or empty).
            For 'lynx' and 'html2text' each entry becomes a command-line
            switch: a value of True yields "-key", any other value yields
            "-key value". The mapping is never modified.

    Returns:
        The plain-text rendering of ``data`` as a string.

    Raises:
        ValueError: If ``method`` is not one of the names listed above.
    """
    # Work on a private copy so the caller's mapping is never mutated,
    # and so that None is accepted as "no options".
    options = dict(options) if options else {}

    if method == 're':
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        # Drop blank lines and trailing whitespace so that pure layout
        # changes do not show up as differences.
        return '\n'.join(line.rstrip()
                         for line in stripped_tags.splitlines()
                         if line.strip())

    if method == 'pyhtml2text':
        # Imported lazily (and aliased) so the optional dependency is only
        # needed when requested and does not shadow this function's name.
        import html2text as pyhtml2text
        parser = pyhtml2text.HTML2Text()
        for key, value in options.items():
            setattr(parser, key.lower(), value)
        return parser.handle(data)

    if method == 'bs4':
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(data, options.get('parser', 'html.parser'))
        return soup.get_text(strip=True)

    if method == 'lynx':
        cmd = ['lynx', '-nonumbers', '-dump', '-stdin',
               '-assume_charset UTF-8', '-display_charset UTF-8']
    elif method == 'html2text':
        cmd = ['html2text', '-nobs', '-utf8']
    else:
        raise ValueError('Unknown html2text method: %r' % (method,))

    stdout_encoding = 'utf-8'

    for key, value in options.items():
        # A value of True marks a bare flag; anything else is passed along
        # as the switch's argument.
        # NOTE(review): "-key value" is emitted as ONE argv entry, like the
        # built-in lynx switches above; confirm the html2text CLI accepts
        # that form.
        if value is True:
            cmd.append('-%s' % key)
        else:
            cmd.append('-%s %s' % (key, value))

    logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding)

    # Force a UTF-8 locale for the child process so that its output
    # matches the encoding we decode with below.
    env = dict(os.environ)
    env['LANG'] = 'en_US.utf-8'
    env['LC_ALL'] = 'en_US.utf-8'

    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, env=env)
    stdout, _ = proc.communicate(data.encode('utf-8'))
    text = stdout.decode(stdout_encoding)

    if method == 'lynx':
        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]
        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112
        #
        # Remove that prefix so that [RANDOM STRING] (changing on each call)
        # does not expose itself as change on the website (it's a
        # Lynx-related thing). Thanks to Evert Meulie for pointing that out.
        #
        # TMPDIR is escaped because it is an arbitrary path, not a regex,
        # and a trailing slash is removed so the pattern still matches.
        tmpdir = re.escape(os.environ.get('TMPDIR', '/tmp').rstrip('/'))
        text = re.sub(r'file://(?:localhost)?%s/[^/]*/' % (tmpdir,), '', text)
        # Also remove file names like L9816-5928TMP.html
        text = re.sub(r'L\d+-\d+TMP\.html', '', text)

    return text.strip()