def file_path(self, request, response=None, info=None):
    """Return the relative storage path for a downloaded file.

    Args:
        request: Either a ``Request`` or, when invoked through the legacy
            ``image_key``/``file_key`` hooks, the bare URL itself.
        response: Unused; accepted for interface compatibility.
        info: Unused; accepted for interface compatibility.

    Returns:
        * ``<group name>/<mapped name><DEFAULT_EXT>`` when the request
          carries a truthy ``group`` attribute;
        * otherwise whatever ``URL.geturl()`` yields once the scheme has
          been blanked and ``DEFAULT_EXT`` appended to a path whose last
          segment has no extension;
        * ``err/<sha1 of the URL><DEFAULT_EXT>`` if either of the above
          raises.
        The result is prefixed with ``request.spider.subdir`` when that
        attribute exists and is truthy.
    """
    # check if called from image_key or file_key with url as first argument
    if not isinstance(request, Request):
        raw_url = request
    else:
        raw_url = request.url

    group = getattr(request, "group", None)
    try:
        if group:
            filename = "{0}{1}".format(
                group["urls"][request.url], self.DEFAULT_EXT)
            path = os.path.join(group["name"], filename)
        else:
            url = URL(raw_url)
            url.scheme = ""
            _, ext = os.path.splitext(url.path.split("/")[-1])
            if not ext:
                url.path = url.path.strip("/") + self.DEFAULT_EXT
            path = url.geturl()
    except Exception:
        # Hash the untouched URL text: hashlib needs bytes on Python 3, and
        # the parsed URL object must never be what gets hashed.
        if isinstance(raw_url, bytes):
            digest_input = raw_url
        else:
            digest_input = raw_url.encode("utf-8")
        path = os.path.join(
            "err", hashlib.sha1(digest_input).hexdigest() + self.DEFAULT_EXT)

    # A bare URL string has no ``spider``; only prefix when one is attached.
    spider = getattr(request, "spider", None)
    subdir = getattr(spider, "subdir", None)
    if subdir:
        path = os.path.join(subdir, path)
    return path
def file_path(self, request, response=None, info=None):
    """Return the relative storage path for a downloaded file.

    Args:
        request: Either a ``Request`` or, when invoked through the legacy
            ``image_key``/``file_key`` hooks, the bare URL itself.
        response: Unused; accepted for interface compatibility.
        info: Unused; accepted for interface compatibility.

    Returns:
        * ``<group name>/<mapped name><DEFAULT_EXT>`` when the request
          carries a truthy ``group`` attribute;
        * otherwise whatever ``URL.geturl()`` yields once the scheme has
          been blanked and ``DEFAULT_EXT`` appended to a path whose last
          segment has no extension;
        * ``err/<sha1 of the URL><DEFAULT_EXT>`` if either of the above
          raises.
        The result is prefixed with ``request.spider.subdir`` when that
        attribute exists and is truthy.
    """
    # check if called from image_key or file_key with url as first argument
    if not isinstance(request, Request):
        raw_url = request
    else:
        raw_url = request.url

    group = getattr(request, "group", None)
    try:
        if group:
            filename = "{0}{1}".format(
                group["urls"][request.url], self.DEFAULT_EXT)
            path = os.path.join(group["name"], filename)
        else:
            url = URL(raw_url)
            url.scheme = ''
            _, ext = os.path.splitext(url.path.split('/')[-1])
            if not ext:
                url.path = url.path.strip('/') + self.DEFAULT_EXT
            path = url.geturl()
    except Exception:
        # Hash the untouched URL text: hashlib needs bytes on Python 3, and
        # the parsed URL object must never be what gets hashed.
        if isinstance(raw_url, bytes):
            digest_input = raw_url
        else:
            digest_input = raw_url.encode("utf-8")
        path = os.path.join(
            "err", hashlib.sha1(digest_input).hexdigest() + self.DEFAULT_EXT)

    # A bare URL string has no ``spider``; only prefix when one is attached.
    spider = getattr(request, "spider", None)
    subdir = getattr(spider, "subdir", None)
    if subdir:
        path = os.path.join(subdir, path)
    return path
def extract_links(self, response):
    """Build the list of ``Link`` objects to follow from ``response``.

    Candidate hrefs come from the elements matched by the ``list_css``
    rule plus whatever ``extract_next_links`` returns. Any error raised
    while gathering them is logged and the hrefs gathered so far are still
    used. Each href is passed through ``URL.s_get_full_url`` together with
    the response URL (presumably to make it absolute -- confirm against
    the ``URL`` helper); falsy results are dropped.

    Returns an empty list when no ``list_css`` rule is configured.
    """
    selector = Selector(response)
    css_rule = self.get_css("list_css")
    if not css_rule:
        return []

    hrefs = []
    try:
        hrefs.extend(selector.css(css_rule).xpath('@href').extract())
        hrefs.extend(self.extract_next_links(response))
    except Exception as exc:
        self.logger.error("%s" % exc)

    # Lazily resolve each href so resolution and Link construction stay
    # interleaved, one href at a time.
    resolved = (
        URL.s_get_full_url(URL(href), URL(response.url)) for href in hrefs
    )
    return [Link(url=full_url) for full_url in resolved if full_url]
def extract_links(self, response):
    """Return ``Link`` objects for every followable href in ``response``.

    Hrefs are taken from the nodes matched by the ``list_css`` rule and
    from ``extract_next_links``. A failure while collecting is logged, not
    raised, and the hrefs collected up to that point are kept. Every href
    is run through ``URL.s_get_full_url`` with the response URL (assumed
    to produce an absolute URL -- verify in the ``URL`` helper); hrefs for
    which it returns a falsy value are skipped.

    Returns an empty list when no ``list_css`` rule is configured.
    """
    page = Selector(response)
    rule = self.get_css("list_css")
    if not rule:
        return []

    candidates = []
    try:
        page_hrefs = page.css(rule).xpath("@href").extract()
        candidates.extend(page_hrefs)
        candidates.extend(self.extract_next_links(response))
    except Exception as failure:
        self.logger.error("%s" % failure)

    found = []
    for href in candidates:
        absolute = URL.s_get_full_url(URL(href), URL(response.url))
        if not absolute:
            continue
        found.append(Link(url=absolute))
    return found