def _extract_links_from_html(self, html, response_encoding):
    """Build ``Link`` objects for the accepted links of a parsed document.

    Iterates over ``html.iterlinks()`` and keeps only the entries whose
    element tag is accepted by ``self.tag_func`` and whose attribute name
    is accepted by ``self.attr_func``.  A link that fails to be processed
    is logged as a warning and skipped; it never aborts the extraction.

    :param html: parsed document exposing ``iterlinks()`` and ``base_url``
        (presumably an lxml HTML element -- confirm against the caller).
    :param response_encoding: encoding used to turn unicode URLs into byte
        strings; UTF-8 is used instead when the URL cannot be encoded
        with it.
    :return: list of ``Link`` objects, in the order ``iterlinks()``
        yielded them.
    """
    links = []
    for el, attr, attr_val, _pos in html.iterlinks():
        # Guard clause: skip anything the tag/attribute filters reject.
        if not (self.tag_func(el.tag) and self.attr_func(attr)):
            continue
        try:
            url = attr_val
            if isinstance(url, unicode):
                try:
                    url = to_str(url, response_encoding)
                except UnicodeEncodeError:
                    # The URL has characters the page encoding cannot
                    # represent; fall back to UTF-8 rather than drop it.
                    url = to_str(url, 'utf-8')
            url = correct_relative_path(requote_url(url))
            text = to_unicode(el.text or u'', 'utf-8')
            # ``rel`` holds whitespace-separated, case-insensitive tokens
            # (e.g. "nofollow noopener"), so test for membership instead
            # of comparing the whole attribute value.
            rel = el.attrib.get('rel') or ''
            nofollow = 'nofollow' in rel.lower().split()
        except Exception as exc:
            # Best effort: one malformed link must not lose the others.
            log.msg(
                format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                level=log.WARNING, url=html.base_url, etype=type(exc),
                error=exc)
        else:
            links.append(Link(url=url, text=text, nofollow=nofollow))
    return links
Beispiel #2
0
 def test_correct_relative_path(self):
     """Verify ``correct_relative_path`` against a table of URLs.

     Each entry is ``(input URL, expected result)``.  The cases cover a
     URL without dot segments and URLs containing ``.`` / ``..`` path
     segments.
     """
     cases = (
         ('http://digineff.cz/art/sout/fotky-s-p-b-hem.html',
          'http://digineff.cz/art/sout/fotky-s-p-b-hem.html'),
         ('http://www.test.com/.', 'http://www.test.com/'),
         ('http://www.test.com/./', 'http://www.test.com/'),
         ('http://www.test.com/..', 'http://www.test.com/'),
         ('http://www.test.com/../', 'http://www.test.com/'),
         ('http://www.test.com/./.././..', 'http://www.test.com/'),
         ('http://www.test.com/./a/./b/../c', 'http://www.test.com/a/c'),
     )
     for given_url, expected_url in cases:
         self.assertEqual(correct_relative_path(given_url), expected_url)
Beispiel #3
0
 def test_correct_relative_path(self):
     """Exercise ``correct_relative_path`` on URLs with dot segments.

     Covers three groups: a URL that has no dot segments, URLs whose
     expected result is the bare site root, and a URL with dot segments
     mixed in between regular path components.
     """
     def expect(url, wanted):
         # Single place where the function under test is invoked.
         self.assertEqual(correct_relative_path(url), wanted)

     plain_url = 'http://digineff.cz/art/sout/fotky-s-p-b-hem.html'
     expect(plain_url, 'http://digineff.cz/art/sout/fotky-s-p-b-hem.html')

     # Every one of these is expected to come back as the site root.
     site_root = 'http://www.test.com/'
     for dotted_url in (
             'http://www.test.com/.',
             'http://www.test.com/./',
             'http://www.test.com/..',
             'http://www.test.com/../',
             'http://www.test.com/./.././..'):
         expect(dotted_url, site_root)

     expect('http://www.test.com/./a/./b/../c', 'http://www.test.com/a/c')
Beispiel #4
0
 def _extract_links_from_html(self, html, response_encoding):
     """Build ``Link`` objects for the accepted links of a parsed document.

     Iterates over ``html.iterlinks()`` and keeps only the entries whose
     element tag is accepted by ``self.tag_func`` and whose attribute name
     is accepted by ``self.attr_func``.  A link that fails to be processed
     is logged as a warning and skipped; it never aborts the extraction.

     :param html: parsed document exposing ``iterlinks()`` and ``base_url``
         (presumably an lxml HTML element -- confirm against the caller).
     :param response_encoding: encoding used to turn the URL into a byte
         string; UTF-8 is used instead when the URL cannot be encoded
         with it.
     :return: list of ``Link`` objects, in the order ``iterlinks()``
         yielded them.
     """
     links = []
     for element, attr_name, raw_url, _pos in html.iterlinks():
         # Guard clause: skip anything the tag/attribute filters reject.
         if not (self.tag_func(element.tag) and self.attr_func(attr_name)):
             continue
         try:
             unicode_url = to_unicode(raw_url, 'utf-8')
             try:
                 encoded_url = to_str(unicode_url, response_encoding)
             except UnicodeEncodeError:
                 # The URL has characters the page encoding cannot
                 # represent; fall back to UTF-8 rather than drop it.
                 encoded_url = to_str(unicode_url, 'utf-8')
             url = correct_relative_path(requote_url(encoded_url))
             text = to_unicode(element.text or u'', 'utf-8')
             # ``rel`` holds whitespace-separated, case-insensitive tokens
             # (e.g. "nofollow noopener"), so test for membership instead
             # of comparing the whole attribute value.
             rel = element.attrib.get('rel') or ''
             nofollow = 'nofollow' in rel.lower().split()
         except Exception as exc:
             # Named ``exc`` so it does not rebind the loop's element
             # variable.  Best effort: one bad link must not lose the rest.
             log.msg(
                 format=
                 'Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                 level=log.WARNING,
                 url=html.base_url,
                 etype=type(exc),
                 error=exc)
         else:
             links.append(Link(url=url, text=text, nofollow=nofollow))
     return links