def test_to_unicode(self):
    """Exercise ``to_unicode`` with byte strings, unicode and other objects."""
    # Plain (byte, under Python 2) string without an explicit encoding; the
    # expected value shows it is treated as UTF-8.
    from_utf8 = to_unicode('lel\xc3\xb1e')
    self.assertEqual(from_utf8, u'lel\xf1e')

    # Same character, but supplied as latin-1 with the encoding named.
    from_latin1 = to_unicode('lel\xf1e', 'latin-1')
    self.assertEqual(from_latin1, u'lel\xf1e')

    # Unicode input is expected to come back unchanged.
    from_unicode = to_unicode(u'\xf1e\xf1e\xf1e')
    self.assertEqual(from_unicode, u'\xf1e\xf1e\xf1e')

    # A non-string object is expected to be rendered as its text form.
    from_list = to_unicode([10, 11])
    self.assertEqual(from_list, u'[10, 11]')

    # With errors='replace' the invalid byte must surface as U+FFFD.
    lossy = to_unicode('a\xedb', 'utf-8', errors='replace')
    self.assertIn(u'\ufffd', lossy)
Example #2
0
 def _extract_links_from_html(self, html, response_encoding):
     """Build a ``Link`` for every accepted link reported by ``html``.

     ``html`` must provide ``iterlinks()`` (yielding 4-tuples of element,
     attribute name, attribute value and position) and ``base_url``
     (presumably an lxml HTML element -- confirm against the caller).
     A link is considered only when ``self.tag_func`` accepts the element
     tag and ``self.attr_func`` accepts the attribute name.

     ``response_encoding`` is the encoding first tried when converting a
     unicode URL with ``to_str``; UTF-8 is used if that fails.

     Any exception raised while processing a single link is logged as a
     warning and that link is skipped, so one bad link does not abort the
     extraction. Returns the list of ``Link`` objects built.
     """
     links = []
     for el, attr, attr_val, _pos in html.iterlinks():
         if not (self.tag_func(el.tag) and self.attr_func(attr)):
             continue
         try:
             url = attr_val
             if isinstance(url, unicode):
                 try:
                     url = to_str(url, response_encoding)
                 except UnicodeEncodeError:
                     # The URL contains characters the response encoding
                     # cannot represent; fall back to UTF-8.
                     url = to_str(url, 'utf-8')
             url = requote_url(url)
             url = correct_relative_path(url)
             text = to_unicode(el.text or u'', 'utf-8')
             # ``rel`` is a whitespace-separated list of link types, so look
             # for the token instead of comparing the whole attribute value
             # (``rel="nofollow noopener"`` must count as nofollow too).
             rel = el.attrib.get('rel')
             nofollow = rel is not None and 'nofollow' in rel.split()
         except Exception as e:
             log.msg(
                 format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                 level=log.WARNING, url=html.base_url, etype=type(e),
                 error=e)
         else:
             links.append(Link(url=url, text=text, nofollow=nofollow))
     return links
Example #3
0
 def _extract_links_from_html(self, html, response_encoding):
     """Build a ``Link`` for every accepted link reported by ``html``.

     ``html`` must provide ``iterlinks()`` (yielding 4-tuples of element,
     attribute name, attribute value and position) and ``base_url``
     (presumably an lxml HTML element -- confirm against the caller).
     A link is considered only when ``self.tag_func`` accepts the element
     tag and ``self.attr_func`` accepts the attribute name.

     The URL is converted with ``to_unicode`` (as UTF-8), then ``to_str``
     using ``response_encoding``, and finally passed to ``requote_url``.

     Any exception raised while processing a single link is logged as a
     warning and that link is skipped. Returns the list of ``Link``
     objects built.
     """
     links = []
     for element, attribute, raw_url, _pos in html.iterlinks():
         if not self.tag_func(element.tag):
             continue
         if not self.attr_func(attribute):
             continue
         try:
             url = requote_url(
                 to_str(to_unicode(raw_url, 'utf-8'), response_encoding))
             text = to_unicode(element.text or u'', 'utf-8')
             # ``rel`` is a whitespace-separated list of link types, so look
             # for the token instead of comparing the whole attribute value
             # (``rel="nofollow noopener"`` must count as nofollow too).
             rel = element.attrib.get('rel')
             nofollow = rel is not None and 'nofollow' in rel.split()
         except Exception as exc:
             # Named ``exc`` so it does not shadow the element loop variable.
             log.msg(
                 format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                 level=log.WARNING, url=html.base_url, etype=type(exc),
                 error=exc)
         else:
             links.append(Link(url=url, text=text, nofollow=nofollow))
     return links
Example #4
0
def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    '''Remove entities from the given text by converting them to
    corresponding unicode character.

    `text` can be a unicode string or a regular string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If 'keep' is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric (&#nnnn; and &#xhhhh;) and named (&nbsp; &gt;)
    entities.

    If remove_illegal is True, entities that can't be converted are removed.
    If remove_illegal is False, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).
    '''

    def convert_entity(m):
        # Replacement callback for ``_ent_re``: as the branches below use
        # them, group 1 is non-empty for numeric references, group 2 for
        # hexadecimal ones, and group 3 holds the entity body.
        entity_body = m.group(3)
        if m.group(1):
            try:
                if m.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                if 0x80 <= number <= 0x9f:
                    return chr(number).decode('cp1252')
            except ValueError:
                # Either the body is not a valid number or the byte has no
                # cp1252 mapping (UnicodeDecodeError is a ValueError).
                number = None
        else:
            if entity_body in keep:
                return m.group(0)
            else:
                number = name2codepoint.get(entity_body)
        if number is not None:
            try:
                return unichr(number)
            except (ValueError, OverflowError):
                # ValueError: code point outside the unicode range.
                # OverflowError: number too large to convert at all, e.g.
                # '&#99999999999999999999;'. Both mean an illegal entity.
                pass

        return u'' if remove_illegal else m.group(0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))
Example #5
0
def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    '''Remove entities from the given text by converting them to
    corresponding unicode character.

    `text` can be a unicode string or a regular string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If 'keep' is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric (&#nnnn; and &#xhhhh;) and named (&nbsp; &gt;)
    entities.

    If remove_illegal is True, entities that can't be converted are removed.
    If remove_illegal is False, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).
    '''
    def convert_entity(m):
        # Replacement callback for ``_ent_re``: as the branches below use
        # them, group 1 is non-empty for numeric references, group 2 for
        # hexadecimal ones, and group 3 holds the entity body.
        entity_body = m.group(3)
        number = None
        if m.group(1):
            base = 16 if m.group(2) else 10
            try:
                number = int(entity_body, base)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                if 0x80 <= number <= 0x9f:
                    return chr(number).decode('cp1252')
            except ValueError:
                # Either the body is not a valid number or the byte has no
                # cp1252 mapping (UnicodeDecodeError is a ValueError).
                number = None
        elif entity_body in keep:
            return m.group(0)
        else:
            number = name2codepoint.get(entity_body)

        if number is not None:
            try:
                return unichr(number)
            except (ValueError, OverflowError):
                # ValueError: code point outside the unicode range.
                # OverflowError: number too large to convert at all, e.g.
                # '&#99999999999999999999;'. Both mean an illegal entity.
                pass

        return u'' if remove_illegal else m.group(0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))
Example #6
0
 def _extract_links_from_html(self, html, response_encoding):
     """Build a ``Link`` for every accepted link reported by ``html``.

     ``html`` must provide ``iterlinks()`` (yielding 4-tuples of element,
     attribute name, attribute value and position) and ``base_url``
     (presumably an lxml HTML element -- confirm against the caller).
     A link is considered only when ``self.tag_func`` accepts the element
     tag and ``self.attr_func`` accepts the attribute name.

     The URL is converted with ``to_unicode`` (as UTF-8), then ``to_str``
     using ``response_encoding``, and then passed through ``requote_url``
     and ``correct_relative_path``.

     Any exception raised while processing a single link is logged as a
     warning and that link is skipped. Returns the list of ``Link``
     objects built.
     """
     links = []
     for element, attribute, raw_url, _pos in html.iterlinks():
         if not (self.tag_func(element.tag) and self.attr_func(attribute)):
             continue
         try:
             encoded_url = to_str(to_unicode(raw_url, 'utf-8'),
                                  response_encoding)
             url = correct_relative_path(requote_url(encoded_url))
             text = to_unicode(element.text or u'', 'utf-8')
             # ``rel`` is a whitespace-separated list of link types, so look
             # for the token instead of comparing the whole attribute value
             # (``rel="nofollow noopener"`` must count as nofollow too).
             rel = element.attrib.get('rel')
             nofollow = rel is not None and 'nofollow' in rel.split()
         except Exception as exc:
             # Named ``exc`` so it does not shadow the element loop variable.
             log.msg(
                 format=
                 'Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                 level=log.WARNING,
                 url=html.base_url,
                 etype=type(exc),
                 error=exc)
         else:
             links.append(
                 Link(url=url, text=text, nofollow=nofollow))
     return links
Example #7
0
 def test_to_unicode(self):
     """Exercise ``to_unicode`` with byte strings, unicode and other objects."""
     # (positional arguments for to_unicode, expected unicode result)
     equality_cases = (
         # no explicit encoding: expected to be read as UTF-8
         (('lel\xc3\xb1e',), u'lel\xf1e'),
         # explicit latin-1 encoding
         (('lel\xf1e', 'latin-1'), u'lel\xf1e'),
         # unicode input comes back unchanged
         ((u'\xf1e\xf1e\xf1e',), u'\xf1e\xf1e\xf1e'),
         # non-string objects are rendered as text
         (([10, 11],), u'[10, 11]'),
     )
     for call_args, expected in equality_cases:
         self.assertEqual(to_unicode(*call_args), expected)

     # With errors='replace' the invalid byte must surface as U+FFFD.
     replaced = to_unicode('a\xedb', 'utf-8', errors='replace')
     self.assertIn(u'\ufffd', replaced)