Example #1
def replace_entities(ustring, placeholder=" "):

    """Replaces HTML special characters by readable characters.

    As taken from Leif K-Brooks algorithm on:
    http://groups-beta.google.com/group/comp.lang.python
    
    """

    def _repl_func(match):
        try:
            if match.group(1): # Numeric character reference
                return unichr( int(match.group(2)) ) 
            else:
                try: return cp1252[ unichr(int(match.group(3))) ].strip()
                except: return unichr( name2codepoint[match.group(3)] )
        except:
            return placeholder

    # Force to Unicode.
    if not isinstance(ustring, unicode):
        ustring = UnicodeDammit(ustring).unicode
    
    # Replace the non-breaking space; don't want some weird unicode
    # character here that truncate_spaces() doesn't know of:
    ustring = ustring.replace(u"\xa0", u" ")
    
    # The ^> makes sure nothing inside a tag (i.e. href with query arguments) gets processed.
    _entity_re = re.compile(r'&(?:(#)(\d+)|([^;^> ]+));') 
    return _entity_re.sub(_repl_func, ustring) 
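# Usage sketch (not part of the original example): assuming replace_entities()
# and its imports (re, unichr, the cp1252 map, name2codepoint from
# htmlentitydefs, UnicodeDammit) are in scope as above, a call looks like this:
sample = u"caf&eacute; &amp; friends &copy;"
print repr(replace_entities(sample))   # u'caf\xe9 & friends \xa9'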
Example #2
def make_tree(html):
    """
    Returns an lxml tree for the given HTML string (either Unicode or
    bytestring).

    This is better than lxml.html.document_fromstring because this takes care
    of a few known issues.
    """
    # Normalize newlines. Otherwise, "\r" gets converted to an HTML entity
    # by lxml.
    html = re.sub('\r\n', '\n', html)

    # Remove <?xml> declaration in Unicode objects, because it causes an error:
    # "ValueError: Unicode strings with encoding declaration are not supported."
    # Note that the error only occurs if the <?xml> tag has an "encoding"
    # attribute, but we remove it in all cases, as there's no downside to
    # removing it.
    if isinstance(html, unicode):
        html = re.sub(r'^\s*<\?xml\s+.*?\?>', '', html)
    else:
        html = UnicodeDammit(html, isHTML=True).unicode
    html = html.strip()
    if html:
        try:
            return document_fromstring(html)
        except:
            # Fall back to using the (slow) BeautifulSoup parser.
            return lxml.html.soupparser.fromstring(html)
    else:
        root = Element('body')
        root.text = u''
        return ElementTree(root)
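# Usage sketch (not part of the original example): assuming make_tree() and its
# dependencies (re, UnicodeDammit, document_fromstring, lxml.html.soupparser,
# Element, ElementTree) are importable as in the original module:
tree = make_tree('<html><body><p>Hello</p></body></html>')
print tree.xpath('//p/text()')   # ['Hello']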
Example #3
def _prep_query(query):
    '''Prepare a query for Wikipedia. Queries must capitalize (most)
    words, must be unicode, and must not contain accented characters
    (e.g. ü). For example, the query http://en.wikipedia.org/wiki/Olga_Kurylenko
    works, but the query http://en.wikipedia.org/wiki/olga_kurylenko does not.

    Args:
        query (str) : Original query
    Returns:
        Wikipedia-formatted query
    
    '''

    # Ensure unicode
    query = UnicodeDammit(query).unicode

    # Replace accents (ü -> u)
    query = geotools.strip_accents(query)
    
    # Split and capitalize query terms
    terms = map(
        lambda s: s.capitalize() if s not in _no_cap else s,
        query.lower().split(' ')
    )
    
    # Join query terms
    query = ' '.join(terms)

    # Return completed query
    return query
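# The capitalization step above, shown in isolation (a sketch; _no_cap is
# assumed to be a module-level collection of words that stay lowercase):
_no_cap = set(['of', 'the', 'and'])

def _cap_terms(query):
    terms = map(lambda s: s.capitalize() if s not in _no_cap else s,
                query.lower().split(' '))
    return ' '.join(terms)

print _cap_terms('war of the worlds')   # 'War of the Worlds'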
Example #5
    def _solve_encoding(self, encoding, text):

        result = text
        if encoding:

            if (encoding in ["guess", "detect", "unicodedammit"]):
                dammit = UnicodeDammit(text)
                encoding = dammit.originalEncoding
                logger.debug(
                    "Detected content encoding as %s (using 'unicodedammit' detection)"
                    % encoding)
                result = dammit.unicode

            else:
                if (encoding in ["chardet"]):
                    chardet_result = chardet.detect(text)
                    encoding = chardet_result['encoding']
                    logger.debug(
                        "Detected content encoding as %s (using 'chardet' detection)"
                        % encoding)

                try:
                    result = text.decode(encoding, self.encoding_errors)
                except UnicodeDecodeError:
                    if (self.encoding_abort):
                        raise Exception(
                            "Error decoding unicode with encoding '%s' on data: %r"
                            % (encoding, text))
                    logger.warn(
                        "Error decoding unicode with encoding '%s' on data: %r"
                        % (encoding, text))
                    result = text.decode("latin-1")

        return result
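# Isolated sketch of the 'chardet' branch above (detection results vary with
# the chardet version and the input bytes; the sample data is hypothetical):
import chardet

raw = 'caf\xe9 r\xe9sum\xe9'                  # latin-1 encoded bytes
guess = chardet.detect(raw)                   # e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.7}
text = raw.decode(guess['encoding'] or 'latin-1', 'replace')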
Example #6
def decodeText(txt, headers=None):
    """
	Takes an HTTP response body (i.e. the text) and the corresponding HTTP headers (a dict or dict-like
	object; httplib.HTTPResponse will do; see parseHttpHeaders() if you have a string);
	outputs the text as a unicode string. The encoding is guessed using BeautifulSoup.UnicodeDammit
	(which in turn uses chardet if installed), enhanced by the HTTP-suggested encoding.

	Raises MimeTypeError (subclass of ValueError) if headers do not indicate a text/* mime-type.
	"""
    from BeautifulSoup import UnicodeDammit

    # guess the charset suggested by HTTP headers
    httpCharset = []
    if headers:
        contentType = headers.get('content-type', '')

        if not contentType.startswith('text/'):
            raise MimeTypeError(
                "Can only decode text documents (mime type text/*; got %s)" %
                contentType)

        m = re.search('charset=([\w0-9\-]+)', contentType)
        if m:
            httpCharset = [m.group(1).replace('windows-', 'cp')]

    ud = UnicodeDammit(
        txt, isHTML=True, overrideEncodings=httpCharset
    )  # overrideEncodings is not enforced by UnicodeDammit, it's just tried
    return ud.unicode
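# The charset-sniffing step above, shown on its own (a sketch; the header value
# is illustrative):
import re

content_type = 'text/html; charset=ISO-8859-1'
m = re.search(r'charset=([\w0-9\-]+)', content_type)
httpCharset = [m.group(1).replace('windows-', 'cp')] if m else []
print httpCharset   # ['ISO-8859-1']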
Example #7
 def _sniff_encoding(self, resource_info):
     with open(resource_info.filename) as f:
         data = f.read()
         proposed = ["utf-8", "latin1"]
         converted = UnicodeDammit(data, proposed, isHTML=True)
     del data
     return converted.originalEncoding
Example #8
def decode_html(html, charset='ascii'):
    "Decode html_string to unicode"
    try:
        body = unicode(html, charset)
    except (UnicodeDecodeError, LookupError,):
        body = UnicodeDammit(html, isHTML=True).unicode
    return body
Example #9
def _decoder(data):
    """Simple helper to enforce a decent charset handling."""
    converted = UnicodeDammit(data, isHTML=True)
    if not converted.unicode:
        raise UnicodeDecodeError("Failed to detect encoding, tried [%s]",
                                 ', '.join(converted.triedEncodings))
    return converted.unicode
def force_unicode(raw):
    '''
    Uses BeautifulSoup.UnicodeDammit to try to force to unicode, and
    if that fails, it assumes utf8 and just ignores all errors.
    '''
    converted = UnicodeDammit(raw, isHTML=True)
    if not converted.unicode:
        converted.unicode = unicode(raw, 'utf8', errors='ignore')

    encoding_m = encoding_re.match(converted.unicode)
    if encoding_m:
        converted.unicode = \
            encoding_m.group('start_xml') + \
            encoding_m.group('remainder')

    return converted.unicode
Example #11
def decode_html(html_string):
    # See http://stackoverflow.com/a/16427392/82216
    converted = UnicodeDammit(html_string, isHTML=True)
    if not converted.unicode:
        raise UnicodeDecodeError("Failed to detect encoding, tried [%s]",
                                 ', '.join(converted.triedEncodings))
    return converted.unicode
Example #13
def parse(filename, window_width=1000):
    logger.info('Got HTML to parse: %s' % filename)
    copy_fname = conv_fname = None  # so the finally block is safe if tempfile creation fails
    try:
        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
            copy_fname = f.name
        with tempfile.NamedTemporaryFile(delete=False) as f:
            conv_fname = f.name
        shutil.copy2(filename, copy_fname)
        # try to determine encoding and decode
        with open(copy_fname, 'rb') as f:
            converted = UnicodeDammit(f.read(), isHTML=True)
        if converted.unicode:
            with open(copy_fname, 'wb') as f:
                f.write(converted.unicode.encode('utf8'))
        args = ['wkhtmltopdf', '--encoding', 'utf-8', copy_fname, conv_fname]
        env = {'DISPLAY': ':99'}
        logger.debug('Calling wkhtmltopdf with arguments %r' % args)
        subprocess.check_call(args, env=env)
        logger.debug('Wkhtmltopdf has done the job')
        return pdf.parse(conv_fname, window_width)
    except subprocess.CalledProcessError as err:
        logger.error('wkhtmltopdf failed to convert "%s" because of %s\n%s' %
                     (filename, err, traceback.format_exc()))
        raise PreprocError()
    finally:
        if copy_fname and os.path.exists(copy_fname):
            os.remove(copy_fname)
        if conv_fname and os.path.exists(conv_fname):
            os.remove(conv_fname)
Example #14
def decode_html(html_string):
    """Convert a string into the UTF-8 encoding"""
    converted = UnicodeDammit(html_string, isHTML=True)
    if not converted.unicode:
        raise UnicodeDecodeError("Failed to detect encoding, tried [%s]",
                                 ', '.join(converted.triedEncodings))
    return converted.unicode
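# Note (not part of the original example): these snippets use the BeautifulSoup 3
# API. Under BeautifulSoup 4 the class lives in bs4.dammit and the attribute
# names change; a minimal sketch, assuming bs4 is installed:
from bs4.dammit import UnicodeDammit as UnicodeDammit4

converted = UnicodeDammit4('<p>caf\xc3\xa9</p>', is_html=True)
text = converted.unicode_markup            # was .unicode in BeautifulSoup 3
encoding = converted.original_encoding     # was .originalEncoding
tried = converted.tried_encodings          # was .triedEncodings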
Example #15
def parse(raw_content, base_href=None, notify=lambda *args: None):
    try:
        content = UnicodeDammit(raw_content, isHTML=True).markup
        cleaned = _remove_crufty_html(content)
        return create_doc(cleaned, base_href)
    except HTMLParseError, e:
        notify("parsing failed:", e)
Example #16
 def handle_text(self):
   '''
   Takes care of converting body text to unicode, if it's text at all.
   Sets self.originalEncoding to the original character encoding, and converts
   the body to unicode if possible. Must come after handle_compression, and
   after self.mediaType is valid.
   '''
   self.encoding = None
   self.text = None
   # if the body is text
   if (self.mediaType and
     (self.mediaType.type == 'text' or
       (self.mediaType.type == 'application' and
        'xml' in self.mediaType.subtype))):
     # if there was a charset parameter in HTTP header, store it
     if 'charset' in self.mediaType.params:
       override_encodings = [self.mediaType.params['charset']]
     else:
       override_encodings = []
     # if there even is data (otherwise, dammit.originalEncoding might be None)
     if self.body != '':
       if UnicodeDammit:
         # honestly, I don't mind not abiding by RFC 2023. UnicodeDammit just
         # does what makes sense, and if the content is remotely standards-
         # compliant, it will do the right thing.
         dammit = UnicodeDammit(self.body, override_encodings)
         # if unicode was found
         if dammit.unicode:
           self.text = dammit.unicode
           self.originalEncoding = dammit.originalEncoding
         else:
           # unicode could not be decoded, at all
           # HAR can't write data, but body might still be useful as-is
           pass
       else:
         # try the braindead version, just guess content-type or utf-8
         u = None
         # try our list of encodings + utf8 with strict errors
         for e in override_encodings + ['utf8', 'iso-8859-1']:
           try:
             u = self.body.decode(e, 'strict')
             self.originalEncoding = e
             break # if ^^ didn't throw, we're done
           except UnicodeError:
             logging.warning("Decoding unicocde response.")
             pass
         # if none of those worked, try utf8 with 'replace' error mode
         if not u:
           # unicode has failed
           u = self.body.decode('utf8', 'replace')
           self.originalEncoding = None # ???
         self.text = u or None
   else:
     # body is not text
     self.encoding = "base64"
     self.text = base64.b64encode(self.body)
   
   # BLAZE - Removing body for now, to preserve memory
   self.text = None
Example #17
def parse(raw_content, base_href=None, notify=lambda x: None):
    try:
        content = UnicodeDammit(raw_content, isHTML=True).markup
        cleaned = _remove_crufty_html(content)
        debug("Cleaned content: %s" % (cleaned, ))
        return create_doc(cleaned, base_href)
    except HTMLParseError, e:
        notify("parsing (%s) failed: %s" % (parse_method.__name__, e))
Example #18
    def generate_rows(self,
                      dataset_schema=None,
                      dataset_partitioning=None,
                      partition_id=None,
                      records_limit=-1):
        """
        The main reading method.
        """

        url_book = self.mirror
        lid = len(str(self.book_id))
        fullbid = str(self.book_id)
        rootbid = fullbid  # sometimes the id used to access a file has a suffix, e.g. fullbid=14285-8 for book 14285
        print type(lid)

        stopit = 0
        for i in range(lid - 1):
            if (fullbid[i + 1] != "-") and (stopit == 0):
                url_book += '/' + fullbid[i]
            else:
                stopit = 1
                rootbid = fullbid[0:i]
        url_book += '/' + rootbid + '/' + fullbid + '.txt'

        print url_book
        response = url2.urlopen(url_book)
        raw = response.read()  #.decode('utf8')
        converted = UnicodeDammit(raw)
        raw = converted.unicode
        start_book = raw.find("START OF")
        end_book = raw.rfind('END OF')
        preamb = raw[:start_book]

        author = [
            i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
            if i.find('Author') != -1
        ][0]
        title = [
            i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
            if i.find('Title') != -1
        ][0]
        date = [
            i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
            if i.find('Release Date') != -1
        ][0]
        book_paraph = raw[start_book:end_book].split("\r\n\r\n")

        print "Book length %s" % len(raw)
        print "N paragraphs:", len(book_paraph)

        for id_p, p in enumerate(book_paraph):
            yield {'id': id_p, 'author': author, 'title': title, 'text': p}
Example #19
    def __init__(self,
                 text,
                 url,
                 verbose=VERBOSE,
                 maxpage=MAXPAGE,
                 checker=None,
                 options=None,
                 logger=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.logger = logger
        self.checker = checker
        self.options = options

        # The parsing of the page is done in the __init__() routine in
        # order to initialize the list of names the file contains.
        # The parser is stored in an instance variable and the URL is
        # passed to MyHTMLParser().
        size = len(self.text)

        if self.maxpage and size > self.maxpage:
            self.logger.info("%s Skip huge file (%.0f Kbytes)" %
                             (self.url, (size * 0.001)))
            self.parser = None
            return

        if options:
            text = self.reformat(text, url)
        self.logger.debug("Parsing %s (%d bytes)" % (self.url, size))
        #text = clean_html(text)
        try:
            converted = UnicodeDammit(text, isHTML=True)
            if not converted.unicode:
                raise UnicodeDecodeError(
                    "Failed to detect encoding, tried [%s]",
                    ', '.join(converted.triedEncodings))
            # print converted.originalEncoding
            self.parser = lxml.html.fromstring(converted.unicode)
            #self.parser = lxml.html.soupparser.fromstring(text)
            self.parser.resolve_base_href()
            self._html = tostring(self.parser,
                                  encoding=unicode,
                                  method="html",
                                  pretty_print=True)
            assert self._html is not None
            return
        except (UnicodeDecodeError, HTMLParseError):
            self.logger.error("HTMLParseError %s" % url)
            pass
Example #20
def markdown_to_html(text, markdown_extensions):
    """
    When the input is Markdown, convert it to HTML so we can parse that.
    """
    logger.info("Converting Markdown to HTML using extensions: %s.",
                ", ".join(sorted(markdown_extensions)))
    # We import the markdown module here so that the markdown module is not
    # required to use html2vimdoc when the input is HTML.
    from markdown import markdown
    # The Python Markdown module only accepts Unicode and ASCII strings, but we
    # don't know what the encoding of the Markdown text is. BeautifulSoup comes
    # to the rescue with the aptly named UnicodeDammit class :-).
    return markdown(UnicodeDammit(text).unicode,
                    extensions=markdown_extensions)
Example #21
def text_from_html(html):
    """Remove ALL tags and return all plain text.
    """
    text = preprocess_to_string(html,
                                drop_tags=_html_droptags,
                                drop_trees=_html_droptrees)
    if not text:
        # Maybe there was something there but not really HTML.
        if text and not isinstance(text, unicode):
            text = UnicodeDammit(html, isHTML=False).unicode.strip()
        else:
            text = u''
    text = convert_entities(text)
    return text
Example #22
    def get_url(self, url):
        """ fetch url, return it as an lxml.html doc """

        content = urllib2.urlopen(url).read()
        #        content = re.sub( """<?xml version="1.0" encoding="(.*?)"?>""", '', content)
        #"""<?xml version="1.0" encoding="ISO-8859-1"?>"""

        converted = UnicodeDammit(content, isHTML=True)

        if not converted.unicode:
            raise UnicodeDecodeError("Failed to detect encoding, tried [%s]",
                                     ', '.join(converted.triedEncodings))
        doc = fromstring(converted.unicode)
        doc.make_links_absolute(url)
        return doc
Example #23
def format_results(results):

    from BeautifulSoup import UnicodeDammit

    new_results = []
    for line in results:
        new_line = []
        for elem in line:
            if elem is None:
                new_line.append('')
            else:
                new_line.append(UnicodeDammit(elem).unicode)
        new_results.append('\t'.join(new_line))

    return '\n'.join(new_results)
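# Usage sketch (hypothetical data, not from the original source): None cells
# become empty tab-separated fields.
rows = [('Spam', None, '42'), ('Bar', 'ok', None)]
print format_results(rows)   # prints two tab-separated lines: 'Spam\t\t42' and 'Bar\tok\t'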
Example #24
    def generate_rows(self,
                      dataset_schema=None,
                      dataset_partitioning=None,
                      partition_id=None,
                      records_limit=-1):
        """
        The main reading method.
        """

        url_book = self.mirror
        lid = len(str(self.book_id))
        bid = str(self.book_id)
        print type(lid)

        for i in range(lid - 1):
            url_book += '/' + bid[i]
        url_book += '/' + bid + '/' + bid + '.txt'

        print url_book
        response = url2.urlopen(url_book)
        raw = response.read()  #.decode('utf8')
        converted = UnicodeDammit(raw)
        raw = converted.unicode
        start_book = raw.find("START OF")
        end_book = raw.rfind('END OF')
        preamb = raw[:start_book]

        author = [
            i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
            if i.find('Author') != -1
        ][0]
        title = [
            i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
            if i.find('Title') != -1
        ][0]
        date = [
            i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
            if i.find('Release Date') != -1
        ][0]
        book_paraph = raw[start_book:end_book].split("\r\n\r\n")

        print "Book length %s" % len(raw)
        print "N paragraphs:", len(book_paraph)

        for id_p, p in enumerate(book_paraph):
            yield {'id': id_p, 'author': author, 'title': title, 'text': p}
Example #25
    def get_value(self, value):
        #
        # This is called when XML is being rendered.
        # If a `transform` callable was passed into the constructor, it will
        # be used to modify the passed value.
        #
        value = self.transform(value) if self.transform else value

        #
        # Ugh - BeerXMLv1 is ASCII (ISO-8859-1), so we need to coerce
        # accented and other international characters to normalized ASCII
        # equivalents as best we can.
        #
        if isinstance(value, basestring):
            value = unicodedata.normalize('NFKD',
                                          UnicodeDammit(value).unicode).encode(
                                              'ascii', 'ignore')

        return {self.name: value}
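# The NFKD-plus-ASCII-ignore idiom used above, shown on its own (a minimal sketch):
import unicodedata

value = u'K\xf6lsch M\xe4rzen'
print unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')   # Kolsch Marzen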
Example #26
    def from_string(self,
                    string,
                    isHTML=False,
                    encoding=None,
                    remove_blank_text=False):
        if string is None: return None

        if encoding == None:
            ud = UnicodeDammit(str(string), isHTML=isHTML)
            markup = ud.markup.encode('utf-8')
        else:
            markup = str(string).encode(encoding)

        if isHTML:
            try:
                return html.fromstring(markup, parser=html_parser)
            except:
                self._core.log_exception(
                    'Error parsing with lxml, falling back to soupparser')
                return soupparser.fromstring(string)
        else:
            return etree.fromstring(
                markup, parser=(xml_parser if remove_blank_text else None))
def sanitize_xml(data, log=None):
    u"""Take a string of bytes or unicode representing XML data and turn it into a UTF-8 string with characters that are invalid in that version of XML removed.

    >>> sanitize_xml("<?xml encoding='UTF-8'?><hello>hi</hello>")
    '<?xml encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml(u"<?xml encoding='UTF-8'?><hello>hi</hello>")
    '<?xml encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml("<?xml encoding='UTF-16'?><hello>hi</hello>")
    '<?xml encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml('<?xml encoding="UTF-16"?><hello>hi</hello>')
    '<?xml encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml('<?xml encoding="blah"?><hello>hi</hello>')
    '<?xml encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml('<?xml encoding="blah" ?><hello>hi</hello>')
    '<?xml encoding="UTF-8"?><hello>hi</hello>'

    >>> sanitize_xml('<?xml version="1.0" encoding="blah" ?><hello>hi</hello>')
    '<?xml version="1.0" encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml('<?xml version="1.1" encoding="blah" ?><hello>hi</hello>')
    '<?xml version="1.1" encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml('<hello>hi</hello>')
    '<?xml version="1.0" encoding="UTF-8"?><hello>hi</hello>'

    >>> sanitize_xml(u'\u2026')
    '<?xml version="1.0" encoding="UTF-8"?>\\xe2\\x80\\xa6'
    >>> sanitize_xml('hello\\x00world')
    '<?xml version="1.0" encoding="UTF-8"?>helloworld'
    >>> def log(msg): print msg
    ...
    >>> sanitize_xml('hello\\0world', log)
    Found first disallowed character u'\\x00' at position 44
    '<?xml version="1.0" encoding="UTF-8"?>helloworld'


    \x7f is allowed in XML 1.0, but not in XML 1.1

    >>> sanitize_xml(
    ... '<?xml version="1.0" encoding="UTF-8"?><hello>\\x7f</hello>')
    '<?xml version="1.0" encoding="UTF-8"?><hello>\\x7f</hello>'
    >>> sanitize_xml('<?xml version="1.1"?><hello>\\x7f</hello>', log)
    Found first disallowed character u'\\x7f' at position 46
    '<?xml version="1.1" encoding="UTF-8"?><hello></hello>'


    The \x80 in the following makes UnicodeDammit interpret the string using the
    windows-1252 encoding, so it gets translated into a Euro symbol.

    >>> sanitize_xml('hello\\x80world', log)
    '<?xml version="1.0" encoding="UTF-8"?>hello\\xe2\\x82\\xacworld'

    If we pass in a unicode string instead so that UnicodeDammit is bypassed
    then it gets properly ignored...

    >>> sanitize_xml(u'hello\u0080world', log).decode('utf_8')
    u'<?xml version="1.0" encoding="UTF-8"?>hello\\x80world'

    unless we use XML 1.1 where it is properly disallowed and so stripped:

    >>> sanitize_xml(u'<?xml version="1.1" ?>hello\u0080world', log)
    Found first disallowed character u'\\x80' at position 44
    '<?xml version="1.1" encoding="UTF-8"?>helloworld'

    >>> sanitize_xml('<hello>&#xB;</hello>', log)
    Found first disallowed character reference &#xB; at position 46
    '<?xml version="1.0" encoding="UTF-8"?><hello></hello>'
    >>> sanitize_xml(u'<?xml version="1.1"?><hello>&#xB;</hello>', log)
    '<?xml version="1.1" encoding="UTF-8"?><hello>&#xB;</hello>'
    >>> sanitize_xml('<hello>&#x0;&#x01;&#x007;</hello>', log)
    Found first disallowed character reference &#x0; at position 46
    '<?xml version="1.0" encoding="UTF-8"?><hello></hello>'
    >>> sanitize_xml('<hello>&#x0a;&#xd;&#0;blah&#7;&#13;</hello>', log)
    Found first disallowed character reference &#0; at position 57
    '<?xml version="1.0" encoding="UTF-8"?><hello>&#x0a;&#xd;blah&#13;</hello>'
    """

    if isinstance(data, unicode):
        u = data
    else:
        u = UnicodeDammit(data, smartQuotesTo=None).unicode

    # The text may have a prolog that specifies a character encoding, but we're
    # going to re-encode it as UTF-8 so make sure the prolog reflects that.
    m = re.match("""^<\?xml[\s]*([^\?]*?)[\s]*\?>""", u)

    if not m:
        # no prolog found, so add one of our own
        u = '<?xml version="1.0" encoding="UTF-8"?>' + u
        version = 0
    else:
        new_encoding = 'encoding="UTF-8"'

        attr = m.group(1)

        encoding_m = re.search("""encoding[\s]*=[\s]*['"].*?['"]""", attr)
        if encoding_m: # replace the encoding
            attr = \
                attr[:encoding_m.start()] + \
                new_encoding + \
                attr[encoding_m.end():]
        else: # or add it if there wasn't one in the prolog already
            attr = attr + ' ' + new_encoding

        u = '<?xml ' + attr + '?>' + u[m.end():]

        # see if the prolog has a version number too
        m2 = re.search("""[\s]*version[\s]*=[\s]*['"](.*?)['"]""", attr)
        if m2:
            if m2.group(1) == u'1.0':
                version = 0
            else:
                # anything unknown is going to be >1.1, so assume the 1.1
                # invalid character rules
                version = 1
        else:
            # version number is optional for XML 1.0
            version = 0

    allowed = u'\x09\x0a\x0d\x20-\x7e\xa0-\ud7ff\ue000-\ufffd'

    if version == 0:
        allowed = allowed + u'\x7f-\x9f'
    else:
        allowed = allowed + u'\x85'

    allowed_as_references = allowed
    if version != 0:
        allowed_as_references = allowed_as_references + u'\x01-\x1f\x7f-\x9f'

    everything_but = '[^%s]'
    disallowed = re.compile(
        everything_but % allowed)
    disallowed_as_references = re.compile(
        everything_but % allowed_as_references)

    logged_first = False

    skip_replacement = False
    if log:
        m = disallowed.search(u)
        if m:
            log('Found first disallowed character %s at position %d' % (
                repr(m.group(0)), m.start() + 1))
            logged_first = True
        else:
            # no point searching again in a moment
            skip_replacement = True

    if not skip_replacement:
        u = disallowed.sub('', u)

    reference = re.compile('&#(x)?0*([0-9a-fA-F]+);')
    search_pos = 0
    while True:
        m = reference.search(u, search_pos)
        if not m:
            break

        c = unichr(int(m.group(2), 16 if m.group(1) == 'x' else 10))

        if disallowed_as_references.match(c):
            if log and not logged_first:
                log(('Found first disallowed character reference %s ' +
                    'at position %d') % (
                        m.group(0), m.start() + 1))
                logged_first = True
            u = u[:m.start()] + u[m.end():]
            search_pos = m.start()
        else:
            search_pos = m.end()

    return u.encode('utf_8')
Example #28
                # return no encoding.
                # Use the most common encodings for each national suffix
                if u'.ru' in ref.link or u'.su' in ref.link:
                    # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
                    # encoding, no page encoding
                    enc = enc + ['koi8-r', 'windows-1251']
                elif u'.jp' in ref.link:
                    enc.append("shift jis 2004")
                    enc.append("cp932")
                elif u'.kr' in ref.link:
                    enc.append("euc-kr")
                    enc.append("cp949")
                elif u'.zh' in ref.link:
                    enc.append("gbk")

                u = UnicodeDammit(linkedpagetext, overrideEncodings=enc)

                if not u.unicode:
                    # Some pages have UTF-8 AND windows-1252 characters;
                    # can't easily parse them (~1 in 1000).
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output('%s : Hybrid encoding...' % ref.link)
                    continue

                # Retrieves the first non-empty string inside <title> tags
                for m in self.TITLE.finditer(u.unicode):
                    t = m.group()
                    if t:
                        ref.title = t
                        ref.transform()
def lhget(*args, **kwargs):
    r = requests.get(*args, **kwargs)
    html = UnicodeDammit(r.content).unicode
    tree = lh.fromstring(html)
    return tree
Example #30
def decode_html(html_string):
    converted = UnicodeDammit(html_string, isHTML=True)
    if not converted.unicode:
        raise UnicodeDecodeError(
            ', '.join(converted.triedEncodings))
    return converted.unicode
Example #31
 def _computeLinks(self):
     self._computeRelpaths()
     htmls = self.resources['mimes']['text/html']
     total = len(htmls)
     i = 1
     for url in htmls:
         if self.cancel:
             return
         if self.client:
             self.client.call(
                 'eXe.app.getController("Toolbar").updateImportProgressWindow',
                 _(u'Analyzing HTML file labels %d of %d: %s') %
                 (i, total, str(url)))
         content = open(url.path).read()
         encoding = detect(content)['encoding']
         ucontent = unicode(content, encoding)
         soup = BeautifulSoup(ucontent, fromEncoding=encoding)
         declaredHTMLEncoding = getattr(soup, 'declaredHTMLEncoding')
         if declaredHTMLEncoding:
             ucontent = UnicodeDammit(content,
                                      [declaredHTMLEncoding]).unicode
             encoding = declaredHTMLEncoding
         else:
             pass
         url.setContent(ucontent, encoding)
         url.setSoup(soup)
         for tag in soup.findAll():
             if self.cancel:
                 return
             if not tag.attrs:
                 continue
             matches = []
             for key, value in tag.attrs:
                 if value == "":
                     continue
                 unq_value = unquote(value)
                 unq_low_value = unquote(value.lower())
                 for l, rl in self.resources['urls'][
                         url.parentpath].relpaths:
                     low_rl = rl.lower()
                     if rl in unq_value:
                         L = Link(self.resources['urls'][l], rl, url, tag,
                                  key, rl)
                         matches.append(L)
                     elif low_rl in unq_value:
                         L = Link(self.resources['urls'][l], rl, url, tag,
                                  key, low_rl)
                         matches.append(L)
                     elif l in unq_value:
                         L = Link(self.resources['urls'][l], rl, url, tag,
                                  key, l)
                         matches.append(L)
             matches_final = []
             for l1 in matches:
                 matches_ = [m for m in matches if m != l1]
                 found = False
                 for l2 in matches_:
                     if re.search(re.escape(l1.relative), l2.relative):
                         found = True
                 if not found:
                     matches_final.append(l1)
             if matches_final:
                 for match in matches_final:
                     url.addLink(match)
                     url.addRLink(str(match.url))
         i += 1
     csss = self.resources[
         'mimes']['text/css'] if 'text/css' in self.resources['mimes'].keys(
         ) else None
     csss_and_htmls = csss + htmls if csss else htmls
     total = len(csss_and_htmls)
     i = 1
     for url in csss_and_htmls:
         if self.cancel:
             return
         if url.mime == 'text/css':
             tipo = 'CSS'
         else:
             tipo = 'HTML'
         content = url.getContent()
         if not content:
             content = open(url.path).read()
             encoding = detect(content)['encoding']
             content = unicode(content, encoding)
             url.setContent(content, encoding)
         if self.client:
             self.client.call(
                 'eXe.app.getController("Toolbar").updateImportProgressWindow',
                 _(u'Exhaustively analyzed file %s %d of %d: %s') %
                 (tipo, i, total, str(url)))
         matches = []
         for l, rl in self.resources['urls'][url.parentpath].relpaths:
             low_rl = rl.lower()
             if rl in content:
                 L = Link(self.resources['urls'][l], rl, url, match=rl)
                 matches.append(L)
             elif low_rl in content:
                 L = Link(self.resources['urls'][l], rl, url, match=low_rl)
                 matches.append(L)
         matches_final = []
         for l1 in matches:
             matches_ = [m for m in matches if m != l1]
             found = False
             for l2 in matches_:
                 if re.search(re.escape(l1.relative), l2.relative):
                     found = True
             if not found:
                 matches_final.append(l1)
         if matches_final:
             for match in matches_final:
                 if not [
                         link for link in url.links
                         if link.relative == match.relative
                 ]:
                     url.addLink(match)
                     url.addRLink(str(match.url))
         i += 1
Example #32
 def message(self, user, message, length=380):
     message = message.replace('\n', '').replace('\r', '')
     message = UnicodeDammit(message)
     self.send_message(user, message.unicode.encode("utf-8"), length)
 - text - after bte
 - keywords
 - description
 - author
 - title
 """
 result = {}
 try: 
     conn = urllib2.urlopen(url)
     webfile = conn.read()
 except Exception, e: 
     logger.info("Cannot download URL:%s\t%s", url, e)
 else:
     if not webfile:
         return result
     converted = UnicodeDammit(webfile) #, isHTML=True)
     if not converted.unicode:
         logger.info("UnicodeDammit failed to detect encoding, tried [%s]", \
              ', '.join(converted.triedEncodings))
         return result
     logger.debug("UnicodeDammit: originalEncoding:%s, triedEncodings:%s",
              converted.originalEncoding, ', '.join(converted.triedEncodings))
     result['raw'] = converted.unicode
     result['text'] = bte.html2text(converted.unicode)
     root = None
     try:
         root = lxml.html.fromstring(webfile)
     except lxml.etree.ParserError, e:
         logger.info("Can not parse URL:%s\t%s", url, e)
         return dict()
     find = {'description' : "./head/meta[@name=\"description\"]/@content",
def make_clean_html_super(raw, stream_item=None, log_dir_path=None):
    '''
    Treat 'raw' as though it is HTML, even if we have no idea what it
    really is, and attempt to get a properly formatted HTML document
    with all HTML-escaped characters converted to their unicode equivalents.
    '''
    ## attempt to get HTML and force it to unicode
    fixed_html = None

    ## count the number of attempts, so we can get progressively more
    ## aggressive about forcing the character set
    attempt = 0

    ## keep all the tracebacks, so we can read them if we want to
    ## analyze a particular document
    all_exc = []

    ## the last attempt sets this to True to end the loop
    no_more_attempts = False
    while not no_more_attempts:
        attempt += 1

        try:
            ## default attempt uses vanilla lxml.html
            root = lxml.html.fromstring(raw)
            ## if that worked, then we will be able to generate a
            ## valid HTML string
            fixed_html = lxml.html.tostring(root, encoding='unicode')

        except UnicodeDecodeError, exc:
            ## most common failure is a bogus encoding
            all_exc.append(exc)
            try:
                converted = UnicodeDammit(raw, isHTML=True)
                if not converted.unicode:
                    raise Exception(
                        'UnicodeDammit failed, appeared to be %r tried [%s]' % (
                            converted.originalEncoding,
                            ', '.join(converted.triedEncodings)))

                encoding_m = encoding_re.match(converted.unicode)
                if encoding_m:
                    converted.unicode = \
                        encoding_m.group('start_xml') + \
                        encoding_m.group('remainder')

                root = lxml.html.fromstring(converted.unicode)
                ## if that worked, then we will be able to generate a
                ## valid HTML string
                fixed_html = lxml.html.tostring(root, encoding='unicode')

                ## hack in a logging step here so we can manually inspect
                ## this fallback stage.
                if log_dir_path and stream_item:
                    stream_item.body.clean_html = fixed_html.encode('utf8')
                    stream_item.body.logs.append( make_traceback_log(all_exc) )

            except Exception, exc:
                ## UnicodeDammit failed
                all_exc.append(exc)
                fixed_html = None
Example #35
 def unicode_cleansed(content, base_href):
     content = UnicodeDammit(content, isHTML=True).markup
     cleaned = _remove_crufty_html(content)
     debug("Cleaned content: %s" % (cleaned, ))
     return beautiful_soup(cleaned, base_href)