コード例 #1
0
def force_unicode(raw):
    '''
    Convert 'raw' to a unicode string.

    BeautifulSoup's UnicodeDammit is tried first (with isHTML=True).
    If it produces nothing, 'raw' is decoded as utf8 with all decoding
    errors ignored.  If encoding_re matches at the start of the
    resulting text, the text is rebuilt from only the 'start_xml' and
    'remainder' groups of the match, dropping whatever the pattern
    matched outside those two groups (presumably the encoding
    declaration -- encoding_re is defined elsewhere in this module).

    :param raw: the document, as a byte string or a unicode string
    :returns: the unicode text
    '''
    text = UnicodeDammit(raw, isHTML=True).unicode
    if not text:
        if isinstance(raw, unicode):
            ## unicode() refuses to decode something that is already
            ## unicode (TypeError), so an empty unicode input is
            ## passed through untouched
            text = raw
        else:
            text = unicode(raw, 'utf8', errors='ignore')

    encoding_m = encoding_re.match(text)
    if encoding_m:
        text = encoding_m.group('start_xml') + encoding_m.group('remainder')

    return text
コード例 #2
0
def force_unicode(raw):
    '''
    Convert 'raw' to a unicode string.

    BeautifulSoup's UnicodeDammit is tried first (with isHTML=True).
    If it produces nothing, 'raw' is decoded as utf8 with all decoding
    errors ignored.  If encoding_re matches at the start of the
    resulting text, the text is rebuilt from only the 'start_xml' and
    'remainder' groups of the match, dropping whatever the pattern
    matched outside those two groups (presumably the encoding
    declaration -- encoding_re is defined elsewhere in this module).

    :param raw: the document, as a byte string or a unicode string
    :returns: the unicode text
    '''
    text = UnicodeDammit(raw, isHTML=True).unicode
    if not text:
        if isinstance(raw, unicode):
            ## unicode() refuses to decode something that is already
            ## unicode (TypeError), so an empty unicode input is
            ## passed through untouched
            text = raw
        else:
            text = unicode(raw, 'utf8', errors='ignore')

    encoding_m = encoding_re.match(text)
    if encoding_m:
        text = encoding_m.group('start_xml') + encoding_m.group('remainder')

    return text
コード例 #3
0
def make_clean_html_super(raw, stream_item=None, log_dir_path=None):
    '''
    Treat 'raw' as though it is HTML, even if we have no idea what it
    really is, and attempt to get a properly formatted HTML document
    with all HTML-escaped characters converted to their unicode.

    Each pass of the loop first parses 'raw' with lxml.html and
    serializes the tree back to a unicode string.  If that raises
    UnicodeDecodeError, the text is decoded with UnicodeDammit, a
    match of encoding_re is reduced to its 'start_xml' and 'remainder'
    groups, and the parse is retried on the decoded text.

    :param raw: the document to parse
    :param stream_item: optional; when given together with
      log_dir_path, a successful fallback stores the utf8-encoded
      result in stream_item.body.clean_html and appends a traceback
      log to stream_item.body.logs
    :param log_dir_path: optional; in the code shown here only its
      truthiness is tested

    NOTE(review): the portion of the function visible here contains
    neither a return statement nor an assignment that ends the loop;
    the listing appears to be cut off -- confirm against the full
    source.
    '''
    ## attempt to get HTML and force it to unicode
    fixed_html = None

    ## count the number of attempts, so can get progressively more
    ## aggressive with forcing the character set
    ## NOTE(review): in the code shown here the counter is only
    ## incremented, never read
    attempt = 0

    ## keep all the tracebacks, so we can read them if we want to
    ## analyze a particular document
    all_exc = []

    ## the last attempt sets this to True to end the looping
    ## NOTE(review): no such assignment is visible in this excerpt
    no_more_attempts = False
    while not no_more_attempts:
        attempt += 1

        try:
            ## default attempt uses vanilla lxml.html
            root = lxml.html.fromstring(raw)
            ## if that worked, then we will be able to generate a
            ## valid HTML string
            fixed_html = lxml.html.tostring(root, encoding='unicode')

        except UnicodeDecodeError, exc:
            ## most common failure is a bogus encoding
            all_exc.append(exc)
            try:
                converted = UnicodeDammit(raw, isHTML=True)
                if not converted.unicode:
                    ## no usable text came back; raise so that the
                    ## handler below records the failure
                    raise Exception(
                        'UnicodeDammit failed, appeared to be %r tried [%s]' % (
                            converted.originalEncoding,
                            ', '.join(converted.triedEncodings)))

                ## on a match, keep only the 'start_xml' and
                ## 'remainder' groups of the decoded text
                encoding_m = encoding_re.match(converted.unicode)
                if encoding_m:
                    converted.unicode = \
                        encoding_m.group('start_xml') + \
                        encoding_m.group('remainder')

                root = lxml.html.fromstring(converted.unicode)
                ## if that worked, then we will be able to generate a
                ## valid HTML string
                fixed_html = lxml.html.tostring(root, encoding='unicode')

                ## hack in a logging step here so we can manually inspect
                ## this fallback stage.
                if log_dir_path and stream_item:
                    stream_item.body.clean_html = fixed_html.encode('utf8')
                    stream_item.body.logs.append( make_traceback_log(all_exc) )

            except Exception, exc:
                ## UnicodeDammit failed, or the retry on its output
                ## raised; record it and leave no result
                all_exc.append(exc)
                fixed_html = None
コード例 #4
0
def make_clean_html_super(raw, stream_item=None, log_dir_path=None):
    '''
    Treat 'raw' as though it is HTML, even if we have no idea what it
    really is, and attempt to get a properly formatted HTML document
    with all HTML-escaped characters converted to their unicode.

    Each pass of the loop first parses 'raw' with lxml.html and
    serializes the tree back to a unicode string.  If that raises
    UnicodeDecodeError, the text is decoded with UnicodeDammit, a
    match of encoding_re is reduced to its 'start_xml' and 'remainder'
    groups, and the parse is retried on the decoded text.

    :param raw: the document to parse
    :param stream_item: optional; when given together with
      log_dir_path, a successful fallback stores the utf8-encoded
      result in stream_item.body.clean_html and appends a traceback
      log to stream_item.body.logs
    :param log_dir_path: optional; in the code shown here only its
      truthiness is tested

    NOTE(review): the portion of the function visible here contains
    neither a return statement nor an assignment that ends the loop;
    the listing appears to be cut off -- confirm against the full
    source.
    '''
    ## attempt to get HTML and force it to unicode
    fixed_html = None

    ## count the number of attempts, so can get progressively more
    ## aggressive with forcing the character set
    ## NOTE(review): in the code shown here the counter is only
    ## incremented, never read
    attempt = 0

    ## keep all the tracebacks, so we can read them if we want to
    ## analyze a particular document
    all_exc = []

    ## the last attempt sets this to True to end the looping
    ## NOTE(review): no such assignment is visible in this excerpt
    no_more_attempts = False
    while not no_more_attempts:
        attempt += 1

        try:
            ## default attempt uses vanilla lxml.html
            root = lxml.html.fromstring(raw)
            ## if that worked, then we will be able to generate a
            ## valid HTML string
            fixed_html = lxml.html.tostring(root, encoding='unicode')

        except UnicodeDecodeError, exc:
            ## most common failure is a bogus encoding
            all_exc.append(exc)
            try:
                converted = UnicodeDammit(raw, isHTML=True)
                if not converted.unicode:
                    ## no usable text came back; raise so that the
                    ## handler below records the failure
                    raise Exception(
                        'UnicodeDammit failed, appeared to be %r tried [%s]' %
                        (converted.originalEncoding, ', '.join(
                            converted.triedEncodings)))

                ## on a match, keep only the 'start_xml' and
                ## 'remainder' groups of the decoded text
                encoding_m = encoding_re.match(converted.unicode)
                if encoding_m:
                    converted.unicode = \
                        encoding_m.group('start_xml') + \
                        encoding_m.group('remainder')

                root = lxml.html.fromstring(converted.unicode)
                ## if that worked, then we will be able to generate a
                ## valid HTML string
                fixed_html = lxml.html.tostring(root, encoding='unicode')

                ## hack in a logging step here so we can manually inspect
                ## this fallback stage.
                if log_dir_path and stream_item:
                    stream_item.body.clean_html = fixed_html.encode('utf8')
                    stream_item.body.logs.append(make_traceback_log(all_exc))

            except Exception, exc:
                ## UnicodeDammit failed, or the retry on its output
                ## raised; record it and leave no result
                all_exc.append(exc)
                fixed_html = None