def test_to_unicode(self):
    # to_unicode should decode utf-8 byte strings by default,
    # honour an explicit encoding, pass unicode through untouched,
    # stringify non-text objects, and respect errors='replace'.
    self.assertEqual(to_unicode('lel\xc3\xb1e'), u'lel\xf1e')
    self.assertEqual(to_unicode('lel\xf1e', 'latin-1'), u'lel\xf1e')
    self.assertEqual(to_unicode(u'\xf1e\xf1e\xf1e'), u'\xf1e\xf1e\xf1e')
    self.assertEqual(to_unicode([10, 11]), u'[10, 11]')
    self.assertIn(u'\ufffd', to_unicode('a\xedb', 'utf-8', errors='replace'))
def _extract_links_from_html(self, html, response_encoding):
    """Collect Link objects from every anchor in *html* that passes
    the tag/attribute filters, falling back to utf-8 when the link
    cannot be encoded with *response_encoding*."""
    links = []
    for el, attr, attr_val, pos in html.iterlinks():
        # Guard clauses: skip anything the configured filters reject.
        if not self.tag_func(el.tag):
            continue
        if not self.attr_func(attr):
            continue
        try:
            url = attr_val
            if isinstance(url, unicode):
                try:
                    url = to_str(url, response_encoding)
                except UnicodeEncodeError:
                    # Response encoding cannot represent the URL; fall
                    # back to utf-8 rather than dropping the link.
                    url = to_str(url, 'utf-8')
            url = requote_url(url)
            url = correct_relative_path(url)
            text = to_unicode(el.text or u'', 'utf-8')
            nofollow = (el.attrib.get('rel') == 'nofollow')
        except Exception as e:
            # Best-effort extraction: log and move on to the next link.
            log.msg(
                format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                level=log.WARNING, url=html.base_url, etype=type(e), error=e)
        else:
            links.append(Link(url=url, text=text, nofollow=nofollow))
    return links
def _extract_links_from_html(self, html, response_encoding):
    """Build Link objects from the anchors in *html* whose tag and
    attribute both pass the configured filter callables."""
    links = []
    for element, attribute, attr_value, _pos in html.iterlinks():
        # Short-circuit preserves the original check order:
        # tag filter first, then attribute filter.
        if not (self.tag_func(element.tag) and self.attr_func(attribute)):
            continue
        try:
            # Normalise to unicode, re-encode with the response
            # encoding, then percent-requote the result.
            url = to_unicode(attr_value, 'utf-8')
            url = requote_url(to_str(url, response_encoding))
            text = to_unicode(element.text or u'', 'utf-8')
            nofollow = (element.attrib.get('rel') == 'nofollow')
        except Exception as e:
            # Extraction is best-effort: log the failure and continue.
            log.msg(
                format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                level=log.WARNING, url=html.base_url, etype=type(e), error=e)
        else:
            links.append(Link(url=url, text=text, nofollow=nofollow))
    return links
def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    """Convert HTML entities in the given text to the corresponding
    unicode characters.

    `text` may be a unicode string or a byte string in `encoding`
    (default 'utf-8').  Entity names listed in `keep` are left in the
    text untouched.  Both numeric (&#nnnn; / &#xhhhh;) and named
    (e.g. &gt;) entities are supported.

    When `remove_illegal` is True, entities that cannot be converted
    are dropped; otherwise they are kept as-is.

    Always returns a unicode string.
    """
    def convert_entity(m):
        body = m.group(3)
        if m.group(1):
            # Numeric character reference; group(2) marks a hex form.
            base = 16 if m.group(2) else 10
            try:
                number = int(body, base)
                # References in 0x80-0x9f are interpreted by browsers
                # as the characters mapped to those bytes in the
                # Windows-1252 encoding.  See:
                # http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                # (UnicodeDecodeError is a ValueError subclass, so an
                # undefined cp1252 byte also falls through to None.)
                if 0x80 <= number <= 0x9f:
                    return chr(number).decode('cp1252')
            except ValueError:
                number = None
        else:
            # Named entity: honour the keep-list before lookup.
            if body in keep:
                return m.group(0)
            number = name2codepoint.get(body)
        if number is not None:
            try:
                return unichr(number)
            except ValueError:
                pass
        return u'' if remove_illegal else m.group(0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))
def _extract_links_from_html(self, html, response_encoding):
    """Extract Link objects from *html*, normalising each URL with
    the response encoding and resolving relative-path artifacts."""
    links = []
    for node, attr_name, raw_url, _position in html.iterlinks():
        # Same filter order as before: tag first, then attribute.
        if not self.tag_func(node.tag):
            continue
        if not self.attr_func(attr_name):
            continue
        try:
            encoded = to_str(to_unicode(raw_url, 'utf-8'), response_encoding)
            url = correct_relative_path(requote_url(encoded))
            text = to_unicode(node.text or u'', 'utf-8')
            nofollow = (node.attrib.get('rel') == 'nofollow')
        except Exception as e:
            # Log and skip this link; keep extracting the rest.
            log.msg(
                format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                level=log.WARNING, url=html.base_url, etype=type(e), error=e)
        else:
            links.append(Link(url=url, text=text, nofollow=nofollow))
    return links