Example #1
 def parse_html(self, response, lru):
     lrulinks = []
     # handle redirects
     realdepth = response.meta['depth']
     if 300 < response.status < 400:
         redir_url = response.headers['Location']
         if redir_url.startswith('/'):
             redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                   redir_url)
         elif redir_url.startswith(
                 './') or not redir_url.startswith('http'):
             redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                   redir_url[1:])
         links = [{'url': redir_url}]
         response.meta['depth'] -= 1
     else:
         links = self.link_extractor.extract_links(response)
     for link in links:
         try:
             url = link.url
         except AttributeError:
             url = link['url']
         try:
             lrulink = url_to_lru_clean(url)
         except ValueError as e:
             self.log("Error converting URL to LRU: %s" % e, log.ERROR)
             continue
         lrulinks.append(lrulink)
         if self._should_follow(response.meta['depth'], lru, lrulink) and \
                 not url_has_any_extension(url, self.ignored_exts):
             yield self._request(url)
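This snippet leans on LRU helpers (`url_to_lru_clean`, `lru_get_host_url`, `lru_get_path_url`) that are defined elsewhere in the project. Below is a minimal sketch of what they could look like, assuming an LRU is a pipe-separated string of stems such as "s:http|h:com|h:example|p:some|p:path|"; only the helper names come from the snippet, the implementations are illustrative rather than the project's actual code.

 from urllib.parse import urlparse

 def url_to_lru_clean(url):
     # Break an absolute URL into "s:", "h:" and "p:" stems; raise ValueError
     # on anything that is not a full URL, as the caller expects.
     parsed = urlparse(url)
     if not parsed.scheme or not parsed.netloc:
         raise ValueError("not an absolute URL: %s" % url)
     stems = ["s:%s" % parsed.scheme]
     stems += ["h:%s" % part for part in reversed(parsed.netloc.split('.'))]
     stems += ["p:%s" % part for part in parsed.path.split('/') if part]
     return "|".join(stems) + "|"

 def lru_get_host_url(lru):
     # Rebuild "scheme://host/" from the "s:" and "h:" stems.
     stems = lru.strip('|').split('|')
     scheme = [s[2:] for s in stems if s.startswith('s:')][0]
     host = ".".join(reversed([s[2:] for s in stems if s.startswith('h:')]))
     return "%s://%s/" % (scheme, host)

 def lru_get_path_url(lru):
     # Rebuild "scheme://host/path/" by appending the "p:" stems.
     path = "/".join(s[2:] for s in lru.strip('|').split('|') if s.startswith('p:'))
     return lru_get_host_url(lru) + (path + "/" if path else "")

With definitions along these lines, `lru_get_host_url("s:http|h:com|h:example|p:a|")` would return "http://example.com/", which is what the redirect branch strips and prepends to a root-relative Location header.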
Example #2
 def parse_html(self, response, lru):
     lrulinks = []
     # handle redirects
     realdepth = response.meta['depth']
     if 300 < response.status < 400:
         redir_url = response.headers['Location']
         if redir_url.startswith('/'):
             redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
         elif redir_url.startswith('./') or not redir_url.startswith('http'):
             redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
         links = [{'url': redir_url}]
         response.meta['depth'] -= 1
     else:
         try:
             links = self.link_extractor.extract_links(response)
         except Exception as e:
             self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR)
             links = []
             self.errors += 1
     for link in links:
         try:
             url = link.url
         except AttributeError:
             url = link['url']
         try:
             lrulink = url_to_lru_clean(url)
         except ValueError as e:
             self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR)
             continue
         lrulinks.append((url, lrulink))
         if self._should_follow(response.meta['depth'], lru, lrulink) and \
                 not url_has_any_extension(url, self.ignored_exts):
             yield self._request(url)
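Compared with the first version, this one guards `extract_links` with a broad try/except, since a malformed page can make the extractor raise, and it records `(url, lrulink)` pairs instead of bare LRUs. Neither version shows how `self.link_extractor` is created; assuming it is a standard Scrapy `LinkExtractor` (whose `Link` objects expose a `.url` attribute, hence the `link.url` / `link['url']` fallback in the loop), the spider's constructor might look roughly like this. The class name and the `ignored_exts` values are placeholders:

 from scrapy import Spider
 from scrapy.linkextractors import LinkExtractor

 class PagesCrawler(Spider):                  # hypothetical spider class
     name = "pages"

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # Link objects returned by extract_links() carry a .url attribute,
         # while the hand-built redirect entries are plain {'url': ...} dicts.
         self.link_extractor = LinkExtractor()
         self.ignored_exts = ('.pdf', '.zip', '.doc')   # illustrative list
         self.errors = 0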
Example #3
 def parse_html(self, response, lru):
     lrulinks = []
     # handle redirects
     realdepth = response.meta['depth']
     if 300 < response.status < 400:
         redir_url = response.headers['Location']
         if redir_url.startswith('/'):
             redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                   redir_url)
         elif redir_url.startswith('../'):
             lrustart = lru[:lru.rfind('|p:')]
             while redir_url.startswith('../'):
                 lrustart = lrustart[:lrustart.rfind('|p:')]
                 redir_url = redir_url[3:]
             redir_url = "%s/%s" % (lru_to_url(lrustart + '|'), redir_url)
         elif redir_url.startswith(
                 './') or not redir_url.startswith('http'):
             redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                   redir_url[1:])
         links = [{'url': redir_url}]
         response.meta['depth'] -= 1
     else:
         try:
             links = self.link_extractor.extract_links(response)
         except Exception as e:
             self.log(
                 "ERROR: links extractor crashed on %s: %s %s" %
                 (response, type(e), e), logging.ERROR)
             links = []
             self.errors += 1
     for link in links:
         try:
             url = link.url
         except AttributeError:
             url = link['url']
         try:
             lrulink = url_to_lru_clean(url, TLDS_TREE)
         except (ValueError, IndexError) as e:
             self.log("Error converting URL %s to LRU: %s" % (url, e),
                      logging.ERROR)
             continue
         lrulinks.append((url, lrulink))
         if self._should_follow(response.meta['depth'], lrulink) and \
                 not url_has_any_extension(url, self.ignored_exts):
             yield self._request(url)
     response.meta['depth'] = realdepth
     yield self._make_html_page(response, lru, lrulinks)
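This third version adds handling for Location headers that climb the path with "../", passes a TLDS_TREE argument to `url_to_lru_clean`, also catches IndexError, and restores `response.meta['depth']` before yielding the page item. A short, self-contained trace of the new "../" branch follows; the LRU value and the behaviour of `lru_to_url` are assumed for illustration:

 # Illustrative values; lru_to_url is assumed to rebuild a URL from the stems.
 lru = "s:http|h:com|h:example|p:a|p:b|p:page.html|"
 redir_url = "../../other.html"

 lrustart = lru[:lru.rfind('|p:')]        # "s:http|h:com|h:example|p:a|p:b"
 while redir_url.startswith('../'):
     lrustart = lrustart[:lrustart.rfind('|p:')]
     redir_url = redir_url[3:]
 # lrustart is now "s:http|h:com|h:example" and redir_url is "other.html";
 # lru_to_url(lrustart + '|') would yield "http://example.com", so the final
 # redirect target becomes "http://example.com/other.html".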