def parse_html(self, response, lru):
    """Yield follow-up requests for the links found in *response*.

    For a redirect response (status strictly between 300 and 400) the only
    link considered is the ``Location`` header, made absolute against *lru*
    when it is relative, and ``response.meta['depth']`` is decremented by
    one. Otherwise links come from ``self.link_extractor``.

    A request (``self._request(url)``) is yielded for every link that
    ``self._should_follow`` accepts and whose URL does not carry one of
    ``self.ignored_exts``. Links whose URL cannot be converted to an LRU
    are logged and skipped.
    """
    # NOTE(review): lrulinks is filled below but never read in this block;
    # presumably it is meant for the page item -- confirm.
    lrulinks = []
    # handle redirects
    # NOTE(review): realdepth is saved but never read in this block;
    # presumably it is meant to restore meta['depth'] -- confirm.
    realdepth = response.meta['depth']
    if 300 < response.status < 400:
        redir_url = response.headers['Location']
        if redir_url.startswith('/'):
            # Host-absolute path: prefix with the host URL.
            redir_url = "%s%s" % (getURLHostFromLRU(lru).strip('/'),
                                  redir_url)
        elif not redir_url.startswith('http'):
            # Relative target. Only an explicit "./" prefix is stripped;
            # previously the first character was dropped unconditionally,
            # which mangled bare targets such as "page.html".
            if redir_url.startswith('./'):
                redir_url = redir_url[2:]
            redir_url = "%s/%s" % (getURLPathFromLRU(lru).strip('/'),
                                   redir_url)
        links = [{'url': redir_url}]
        response.meta['depth'] -= 1
    else:
        links = self.link_extractor.extract_links(response)
    for link in links:
        # Extracted links expose .url; the redirect entry is a plain dict.
        try:
            url = link.url
        except AttributeError:
            url = link['url']
        try:
            lrulink = url_to_lru_clean(url)
        except ValueError as e:
            self.log("Error converting URL to LRU: %s" % e, log.ERROR)
            continue
        lrulinks.append(lrulink)
        if self._should_follow(response.meta['depth'], lru, lrulink) and \
                not url_has_any_extension(url, self.ignored_exts):
            yield self._request(url)
def parse_html(self, response, lru):
    """Yield the requests to follow from *response*.

    Redirect responses (status strictly between 300 and 400) contribute a
    single link, their ``Location`` header resolved against *lru*, and
    lower ``response.meta['depth']`` by one. Any other response has its
    links pulled by ``self.link_extractor``.

    Each link is converted to an LRU; conversion failures (``ValueError``)
    are logged and skipped. A request built by ``self._request`` is yielded
    when ``self._should_follow`` accepts the link and its URL has none of
    the extensions in ``self.ignored_exts``.
    """
    # NOTE(review): lrulinks is collected but not consumed in this block;
    # presumably used when building the page item -- confirm.
    lrulinks = []
    # handle redirects
    # NOTE(review): realdepth is stored but not read in this block;
    # presumably intended to restore meta['depth'] afterwards -- confirm.
    realdepth = response.meta['depth']
    if 300 < response.status < 400:
        target = self._resolve_redirect_url(response.headers['Location'], lru)
        links = [{'url': target}]
        response.meta['depth'] -= 1
    else:
        links = self.link_extractor.extract_links(response)
    for link in links:
        # The redirect entry is a dict, extracted links carry a .url attribute.
        try:
            url = link.url
        except AttributeError:
            url = link['url']
        try:
            lrulink = url_to_lru_clean(url)
        except ValueError as e:
            self.log("Error converting URL to LRU: %s" % e, log.ERROR)
            continue
        lrulinks.append(lrulink)
        if not self._should_follow(response.meta['depth'], lru, lrulink):
            continue
        if url_has_any_extension(url, self.ignored_exts):
            continue
        yield self._request(url)


def _resolve_redirect_url(self, location, lru):
    """Return the redirect target *location* made absolute against *lru*."""
    if location.startswith('/'):
        # Host-absolute path: prefix with the host URL.
        return "%s%s" % (getURLHostFromLRU(lru).strip('/'), location)
    if location.startswith('http'):
        # Already absolute: use as is.
        return location
    # Relative target. Strip only a literal "./" prefix; the earlier code
    # removed the first character of every relative target, so a bare
    # "page.html" lost its leading letter.
    if location.startswith('./'):
        location = location[2:]
    return "%s/%s" % (getURLPathFromLRU(lru).strip('/'), location)
def _new_page(self, url, lru=None):
    """Build a ``Page`` holding *url*, its LRU and a creation timestamp.

    When *lru* is not supplied it is computed from *url* with
    ``url_to_lru_clean``. The timestamp is ``time.time()`` scaled by 1000
    and truncated to an int.
    """
    page_lru = url_to_lru_clean(url) if lru is None else lru
    page = Page()
    page['url'] = url
    page['lru'] = page_lru
    now_ms = int(time.time() * 1000)
    page['timestamp'] = now_ms
    return page
def _new_page(self, url, lru=None):
    """Create a ``Page`` populated with 'url', 'lru' and 'timestamp'.

    *lru* falls back to ``url_to_lru_clean(url)`` when it is ``None``;
    'timestamp' is ``int(time.time() * 1000)`` taken at creation.
    """
    if lru is not None:
        resolved = lru
    else:
        resolved = url_to_lru_clean(url)
    item = Page()
    for key, value in (('url', url), ('lru', resolved)):
        item[key] = value
    item['timestamp'] = int(time.time() * 1000)
    return item
def handle_response(self, response):
    """Dispatch *response* to the HTML parser or the raw-page builder.

    Responses with a status strictly between 300 and 400, and responses
    that are ``HtmlResponse`` instances, go to ``self.parse_html``; every
    other response goes to ``self._make_raw_page``. Both receive the LRU
    computed from ``response.url``.
    """
    lru = url_to_lru_clean(response.url)
    is_redirect = 300 < response.status < 400
    if not is_redirect and not isinstance(response, HtmlResponse):
        return self._make_raw_page(response, lru)
    return self.parse_html(response, lru)