def force_unicode(raw):
    '''
    Return `raw` as a unicode string.

    Uses BeautifulSoup's UnicodeDammit (in HTML mode) to detect the
    encoding and decode `raw`.  If that yields nothing, falls back to
    decoding `raw` as utf8 while ignoring all undecodable bytes.

    If the module-level `encoding_re` matches the start of the decoded
    text, the result is rebuilt from its 'start_xml' and 'remainder'
    groups, i.e. whatever the pattern matched between those two groups
    is dropped (presumably the encoding declaration -- see the
    definition of `encoding_re`).

    :param raw: byte string (or unicode string) to convert
    :returns: unicode string
    '''
    converted = UnicodeDammit(raw, isHTML=True)

    ## work on a local copy rather than mutating the UnicodeDammit object
    text = converted.unicode
    if not text:
        if isinstance(raw, unicode):
            ## already unicode (e.g. u''): calling
            ## unicode(raw, 'utf8', ...) on a unicode object raises
            ## "TypeError: decoding Unicode is not supported"
            text = raw
        else:
            ## last resort: assume utf8 and drop undecodable bytes
            text = unicode(raw, 'utf8', errors='ignore')

    encoding_m = encoding_re.match(text)
    if encoding_m:
        text = encoding_m.group('start_xml') + encoding_m.group('remainder')

    return text
def make_clean_html_super(raw, stream_item=None, log_dir_path=None): ''' Treat 'raw' as though it is HTML, even if we have no idea what it really is, and attempt to get a properly formatted HTML document with all HTML-escaped characters converted to their unicode. ''' ## attempt to get HTML and force it to unicode fixed_html = None ## count the number of attempts, so can get progressively more ## aggressive with forcing the character set attempt = 0 ## keep all the tracebacks, so we can read them if we want to ## analyze a particular document all_exc = [] ## the last attempt leads sets this to True to end the looping no_more_attempts = False while not no_more_attempts: attempt += 1 try: ## default attempt uses vanilla lxml.html root = lxml.html.fromstring(raw) ## if that worked, then we will be able to generate a ## valid HTML string fixed_html = lxml.html.tostring(root, encoding='unicode') except UnicodeDecodeError, exc: ## most common failure is a bogus encoding all_exc.append(exc) try: converted = UnicodeDammit(raw, isHTML=True) if not converted.unicode: raise Exception( 'UnicodeDammit failed, appeared to be %r tried [%s]' % ( converted.originalEncoding, ', '.join(converted.triedEncodings))) encoding_m = encoding_re.match(converted.unicode) if encoding_m: converted.unicode = \ encoding_m.group('start_xml') + \ encoding_m.group('remainder') root = lxml.html.fromstring(converted.unicode) ## if that worked, then we will be able to generate a ## valid HTML string fixed_html = lxml.html.tostring(root, encoding='unicode') ## hack in a logging step here so we can manually inspect ## this fallback stage. if log_dir_path and stream_item: stream_item.body.clean_html = fixed_html.encode('utf8') stream_item.body.logs.append( make_traceback_log(all_exc) ) except Exception, exc: ## UnicodeDammit failed all_exc.append(exc) fixed_html = None
def make_clean_html_super(raw, stream_item=None, log_dir_path=None): ''' Treat 'raw' as though it is HTML, even if we have no idea what it really is, and attempt to get a properly formatted HTML document with all HTML-escaped characters converted to their unicode. ''' ## attempt to get HTML and force it to unicode fixed_html = None ## count the number of attempts, so can get progressively more ## aggressive with forcing the character set attempt = 0 ## keep all the tracebacks, so we can read them if we want to ## analyze a particular document all_exc = [] ## the last attempt leads sets this to True to end the looping no_more_attempts = False while not no_more_attempts: attempt += 1 try: ## default attempt uses vanilla lxml.html root = lxml.html.fromstring(raw) ## if that worked, then we will be able to generate a ## valid HTML string fixed_html = lxml.html.tostring(root, encoding='unicode') except UnicodeDecodeError, exc: ## most common failure is a bogus encoding all_exc.append(exc) try: converted = UnicodeDammit(raw, isHTML=True) if not converted.unicode: raise Exception( 'UnicodeDammit failed, appeared to be %r tried [%s]' % (converted.originalEncoding, ', '.join( converted.triedEncodings))) encoding_m = encoding_re.match(converted.unicode) if encoding_m: converted.unicode = \ encoding_m.group('start_xml') + \ encoding_m.group('remainder') root = lxml.html.fromstring(converted.unicode) ## if that worked, then we will be able to generate a ## valid HTML string fixed_html = lxml.html.tostring(root, encoding='unicode') ## hack in a logging step here so we can manually inspect ## this fallback stage. if log_dir_path and stream_item: stream_item.body.clean_html = fixed_html.encode('utf8') stream_item.body.logs.append(make_traceback_log(all_exc)) except Exception, exc: ## UnicodeDammit failed all_exc.append(exc) fixed_html = None