def _extract_links_from_html(self, html, response_encoding):
    """Build a ``Link`` for every matching link yielded by ``html``.

    Iterates ``html.iterlinks()`` and keeps the entries whose element tag
    is accepted by ``self.tag_func`` and whose attribute name is accepted
    by ``self.attr_func``.

    :param html: parsed document exposing ``iterlinks()`` and ``base_url``
        (presumably an lxml HTML element -- confirm against the caller).
    :param response_encoding: encoding used to turn a unicode URL into a
        byte string; UTF-8 is used instead when that encoding cannot
        represent the URL.
    :returns: list of ``Link`` objects in ``iterlinks()`` order.

    Any exception raised while processing a single link is logged as a
    warning and that link is skipped; the remaining links are still
    processed.
    """
    links = []
    for el, attr, attr_val, _pos in html.iterlinks():
        # Guard clause: both filters must accept the entry.
        if not (self.tag_func(el.tag) and self.attr_func(attr)):
            continue
        try:
            url = attr_val
            if isinstance(url, unicode):
                try:
                    url = to_str(url, response_encoding)
                except UnicodeEncodeError:
                    # The response encoding cannot represent this URL;
                    # fall back to UTF-8 rather than dropping the link.
                    url = to_str(url, 'utf-8')
            url = requote_url(url)
            url = correct_relative_path(url)
            text = to_unicode(el.text or u'', 'utf-8')
            # ``rel`` is a whitespace-separated, case-insensitive token
            # list, so "nofollow noopener" must count as nofollow too.
            rel = el.attrib.get('rel')
            nofollow = 'nofollow' in (rel or '').lower().split()
        except Exception as exc:
            # Deliberately best-effort: one bad link must not abort the
            # extraction of the others.
            log.msg(
                format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                level=log.WARNING,
                url=html.base_url,
                etype=type(exc),
                error=exc)
        else:
            links.append(Link(url=url, text=text, nofollow=nofollow))
    return links
def test_correct_relative_path(self):
    """Check ``correct_relative_path`` against a table of sample URLs.

    Each pair is ``(input URL, expected result)``. The samples cover a URL
    that must come back unchanged and URLs containing ``.`` and ``..``
    path segments.
    """
    cases = [
        ('http://digineff.cz/art/sout/fotky-s-p-b-hem.html',
         'http://digineff.cz/art/sout/fotky-s-p-b-hem.html'),
        ('http://www.test.com/.', 'http://www.test.com/'),
        ('http://www.test.com/./', 'http://www.test.com/'),
        ('http://www.test.com/..', 'http://www.test.com/'),
        ('http://www.test.com/../', 'http://www.test.com/'),
        ('http://www.test.com/./.././..', 'http://www.test.com/'),
        ('http://www.test.com/./a/./b/../c', 'http://www.test.com/a/c'),
    ]
    # Checked in order; the first mismatch fails the test.
    for raw_url, expected in cases:
        self.assertEqual(correct_relative_path(raw_url), expected)
def test_correct_relative_path(self):
    """Verify the URLs returned by ``correct_relative_path``.

    Covers one URL that is expected back unchanged and several URLs whose
    ``.`` / ``..`` path segments are expected to be resolved.
    """
    def check(raw_url, expected):
        # Single assertion shared by every sample below.
        self.assertEqual(correct_relative_path(raw_url), expected)

    root = 'http://www.test.com/'

    check('http://digineff.cz/art/sout/fotky-s-p-b-hem.html',
          'http://digineff.cz/art/sout/fotky-s-p-b-hem.html')
    check('http://www.test.com/.', root)
    check('http://www.test.com/./', root)
    check('http://www.test.com/..', root)
    check('http://www.test.com/../', root)
    check('http://www.test.com/./.././..', root)
    check('http://www.test.com/./a/./b/../c', 'http://www.test.com/a/c')
def _extract_links_from_html(self, html, response_encoding):
    """Build a ``Link`` for every matching link yielded by ``html``.

    Iterates ``html.iterlinks()`` and keeps the entries whose element tag
    is accepted by ``self.tag_func`` and whose attribute name is accepted
    by ``self.attr_func``.

    :param html: parsed document exposing ``iterlinks()`` and ``base_url``
        (presumably an lxml HTML element -- confirm against the caller).
    :param response_encoding: encoding the URL is converted to with
        ``to_str``; UTF-8 is used instead when that encoding cannot
        represent the URL.
    :returns: list of ``Link`` objects in ``iterlinks()`` order.

    Any exception raised while processing a single link is logged as a
    warning and that link is skipped; the remaining links are still
    processed.
    """
    links = []
    for el, attr, attr_val, _pos in html.iterlinks():
        # Guard clause: both filters must accept the entry.
        if not (self.tag_func(el.tag) and self.attr_func(attr)):
            continue
        try:
            url = to_unicode(attr_val, 'utf-8')
            try:
                url = to_str(url, response_encoding)
            except UnicodeEncodeError:
                # The response encoding cannot represent this URL;
                # fall back to UTF-8 rather than dropping the link.
                url = to_str(url, 'utf-8')
            url = correct_relative_path(requote_url(url))
            text = to_unicode(el.text or u'', 'utf-8')
            # ``rel`` is a whitespace-separated, case-insensitive token
            # list, so "nofollow noopener" must count as nofollow too.
            rel = el.attrib.get('rel')
            nofollow = 'nofollow' in (rel or '').lower().split()
        except Exception as exc:
            # Bound to ``exc`` so the handler never rebinds the loop's
            # element variable. Deliberately best-effort: one bad link
            # must not abort the extraction of the others.
            log.msg(
                format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                level=log.WARNING,
                url=html.base_url,
                etype=type(exc),
                error=exc)
        else:
            links.append(Link(url=url, text=text, nofollow=nofollow))
    return links