Example #1
0
    def file_path(self, request, response=None, info=None):
        """Build the relative storage path for a downloaded file.

        ``request`` is normally a ``Request``; a bare URL string is also
        accepted (the legacy ``image_key``/``file_key`` call style).

        * If ``request`` carries a truthy ``group`` attribute, the path is
          ``group["name"]`` joined with ``group["urls"][request.url]`` plus
          ``self.DEFAULT_EXT``.
        * Otherwise the URL is wrapped in ``URL``, its scheme is cleared,
          ``self.DEFAULT_EXT`` is appended when the last path segment has no
          extension, and the result of ``geturl()`` is used as the path.
        * If either form raises, the path falls back to
          ``err/<sha1 hex digest of the original URL>`` plus
          ``self.DEFAULT_EXT``.

        When the request has a spider with a truthy ``subdir``, that
        directory is prepended to the result.
        """
        # Keep the untouched URL around: the fallback below must hash the
        # original string, not the URL wrapper built in the try block.
        raw_url = request.url if isinstance(request, Request) else request

        group = getattr(request, "group", None)
        try:
            if group:
                filename = "{0}{1}".format(group["urls"][request.url], self.DEFAULT_EXT)
                path = os.path.join(group["name"], filename)
            else:
                parsed = URL(raw_url)
                parsed.scheme = ""
                _, ext = os.path.splitext(parsed.path.split("/")[-1])
                if not ext:
                    parsed.path = parsed.path.strip("/") + self.DEFAULT_EXT
                path = parsed.geturl()
        except Exception:
            # hashlib only accepts bytes-like input, so encode text first;
            # otherwise the fallback itself would raise TypeError.
            digest_source = raw_url
            if not isinstance(digest_source, bytes):
                digest_source = digest_source.encode("utf-8")
            path = os.path.join("err", hashlib.sha1(digest_source).hexdigest() + self.DEFAULT_EXT)

        # A bare URL string has no ``spider`` attribute; look it up
        # defensively so that call style does not raise AttributeError.
        spider = getattr(request, "spider", None)
        subdir = getattr(spider, "subdir", None)
        if subdir:
            path = os.path.join(subdir, path)
        return path
Example #2
0
    def file_path(self, request, response=None, info=None):
        """Return the relative path under which a download is stored.

        ``request`` is either a ``Request`` or, for the legacy
        ``image_key``/``file_key`` call style, the URL itself.

        Path selection, in order:

        1. A truthy ``group`` attribute on ``request`` yields
           ``group["name"]`` joined with ``group["urls"][request.url]``
           plus ``self.DEFAULT_EXT``.
        2. Otherwise the URL is wrapped in ``URL``, its scheme is cleared,
           ``self.DEFAULT_EXT`` is appended if the last path segment has no
           extension, and ``geturl()`` provides the path.
        3. If step 1 or 2 raises, the path is ``err/`` followed by the SHA-1
           hex digest of the original URL plus ``self.DEFAULT_EXT``.

        The spider's ``subdir`` (when present and truthy) is prepended.
        """
        # check if called from image_key or file_key with url as first argument
        if isinstance(request, Request):
            source_url = request.url
        else:
            source_url = request

        group = getattr(request, "group", None)
        try:
            if group:
                filename = "{0}{1}".format(group["urls"][request.url],
                                           self.DEFAULT_EXT)
                path = os.path.join(group["name"], filename)
            else:
                # Use a separate name so ``source_url`` stays a plain string
                # for the error fallback below.
                url_obj = URL(source_url)
                url_obj.scheme = ''
                _, ext = os.path.splitext(url_obj.path.split('/')[-1])
                if not ext:
                    url_obj.path = url_obj.path.strip('/') + self.DEFAULT_EXT
                path = url_obj.geturl()
        except Exception:
            # hashlib needs bytes-like input; hashing a text string or a URL
            # wrapper would make this fallback raise TypeError.
            payload = source_url
            if not isinstance(payload, bytes):
                payload = payload.encode('utf-8')
            path = os.path.join(
                "err",
                hashlib.sha1(payload).hexdigest() + self.DEFAULT_EXT)

        # When ``request`` is a bare URL string there is no ``spider`` to
        # read, so resolve the sub-directory without assuming it exists.
        subdir = getattr(getattr(request, "spider", None), "subdir", None)
        if subdir:
            path = os.path.join(subdir, path)
        return path
Example #3
0
    def extract_links(self, response):
        """Collect ``Link`` objects for the list entries found in *response*.

        The CSS selector is read from ``self.get_css("list_css")``; when it is
        empty an empty list is returned. The ``href`` attributes of the
        matching nodes are gathered, followed by whatever
        ``self.extract_next_links(response)`` yields. Any exception raised
        during that gathering is logged and the hrefs collected so far are
        kept. Each href is then resolved against ``response.url`` through
        ``URL.s_get_full_url``; falsy results are dropped.
        """
        selector = Selector(response)
        list_css = self.get_css("list_css")
        if not list_css:
            return []

        hrefs = []
        try:
            hrefs.extend(selector.css(list_css).xpath('@href').extract())
            hrefs.extend(self.extract_next_links(response))
        except Exception as err:
            self.logger.error("%s" % err)

        # Resolve lazily so each href is resolved and wrapped in turn.
        resolved = (
            URL.s_get_full_url(URL(href), URL(response.url))
            for href in hrefs
        )
        return [Link(url=full_url) for full_url in resolved if full_url]
Example #4
0
    def extract_links(self, response):
        """Return the ``Link`` objects extracted from a list page.

        Uses the selector from ``self.get_css("list_css")`` to pull ``href``
        values out of *response*, appends the values returned by
        ``self.extract_next_links(response)``, and resolves each one against
        ``response.url`` via ``URL.s_get_full_url``. Returns ``[]`` when no
        selector is configured. Errors while gathering hrefs are logged and
        do not abort the method.
        """
        page = Selector(response)
        css_rule = self.get_css("list_css")
        if not css_rule:
            return []

        found = []
        try:
            for href in page.css(css_rule).xpath("@href").extract():
                found.append(href)
            found.extend(self.extract_next_links(response))
        except Exception as err:
            self.logger.error("%s" % err)

        result = []
        for href in found:
            absolute = URL.s_get_full_url(URL(href), URL(response.url))
            if not absolute:
                # Skip hrefs that could not be resolved to a full URL.
                continue
            result.append(Link(url=absolute))

        return result