def parse(self, response):
    """Yield one detail-page request for every thumbnail link on the page.

    Each ``<a>`` matched by ``#archive .floated-thumb .post-thumb a`` produces
    a ``Request`` whose URL is the anchor's ``href`` resolved against
    ``response.url``.  The thumbnail ``src`` found inside the anchor is
    resolved the same way and passed along in ``meta["front_image_url"]``.
    The response of each request is handled by ``self.parse_detail``.

    A missing ``href`` or ``src`` is treated as the empty string before being
    joined with the page URL.
    """
    thumb_links = response.css("#archive .floated-thumb .post-thumb a")
    for anchor in thumb_links:
        thumb_src = anchor.css("img::attr(src)").extract_first("")
        detail_href = anchor.css("::attr(href)").extract_first("")

        detail_url = basejoin(response.url, detail_href)
        request_meta = {"front_image_url": basejoin(response.url, thumb_src)}

        yield Request(
            url=detail_url,
            meta=request_meta,
            callback=self.parse_detail,
        )
def __call__(self, url, baseURL=None):
    """Try each candidate in a multi-value url and return the first that loads.

    url -- a single url string or a sequence of url strings; every entry
        is passed through ``as_unicode`` before use
    baseURL -- optional base url; when truthy, each candidate is resolved
        against it with ``basejoin`` before being fetched

    Candidates are fetched in order with ``self.get`` and the search stops
    at the first one for which both the returned file object and the
    filename are not None.

    raises IOError when no candidate yields both a file and a filename

    returns (resolvedURL, absolute_filename, open_file, headers) on success;
        according to the original documentation, headers is None for
        local files
    """
    log.info("Loading: %s, %s", url, baseURL)

    # Normalise the argument to a list of unicode strings.
    url = as_unicode(url)
    if isinstance(url, unicode):
        candidates = [url]
    else:
        candidates = [as_unicode(entry) for entry in url]

    handle = None
    for candidate in candidates:
        # Resolve to the "absolute" url when a base was supplied.
        target = basejoin(baseURL, candidate) if baseURL else candidate
        resolvedURL, handle, filename, headers = self.get(target)
        if handle is not None and filename is not None:
            break

    # `filename` is only consulted when `handle` is truthy, which implies the
    # loop body ran at least once and therefore bound it.
    if not handle or not filename:
        raise IOError("""Unable to download url %s""" % candidates)

    return (resolvedURL, os.path.abspath(filename), handle, headers)
def redirect_internal(self, url, fp, errcode, errmsg, headers, data): if 'location' in headers: newurl = headers['location'] elif 'uri' in headers: newurl = headers['uri'] else: return void = fp.read() fp.close() newurl = basejoin(self.type + ':' + url, newurl) return self.open(newurl)
def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    """Resolve and follow a redirect taken from the response headers.

    The target is read from the ``location`` header, falling back to
    ``uri``.  It is joined with the original url (rebuilt as
    ``self.type + ":" + url``) so that relative targets work, and then
    opened with ``self.open``.

    url -- the original url without its scheme prefix
    fp -- the open response object; its body is read and it is closed
    errcode, errmsg -- status code and message of the redirect response,
        included in the IOError raised for a disallowed target
    headers -- mapping of response headers
    data -- accepted for interface compatibility; not used here

    Returns None when no redirect header is present, otherwise whatever
    ``self.open`` returns.

    Raises IOError when the resolved target is not an http, https or ftp
    url.
    """
    if 'location' in headers:
        newurl = headers['location']
    elif 'uri' in headers:
        newurl = headers['uri']
    else:
        return
    # Consume the remaining body as before; close the handle even if the
    # read raises.
    try:
        fp.read()
    finally:
        fp.close()
    # In case the server sent a relative URL, join with original:
    newurl = basejoin(self.type + ":" + url, newurl)
    # The redirect target is server-controlled input: refuse any scheme
    # other than HTTP, HTTPS or FTP (e.g. file://).
    allowed_prefixes = ('http://', 'https://', 'ftp://')
    if not newurl.lower().startswith(allowed_prefixes):
        raise IOError(
            'redirect error', errcode,
            errmsg + " - Redirection to url '%s' is not allowed" % newurl,
            headers)
    return self.open(newurl)
def redirect_internal(self, url, fp, errcode, errmsg, headers, data): if 'location' in headers: newurl = headers['location'] elif 'uri' in headers: newurl = headers['uri'] else: return fp.close() newurl = basejoin(self.type + ':' + url, newurl) newurl_lower = newurl.lower() if not (newurl_lower.startswith('http://') or newurl_lower.startswith('https://') or newurl_lower.startswith('ftp://')): raise IOError('redirect error', errcode, errmsg + " - Redirection to url '%s' is not allowed" % newurl, headers) return self.open(newurl)
def redirect_internal(self, url, fp, errcode, errmsg, headers, data): if 'location' in headers: newurl = headers['location'] elif 'uri' in headers: newurl = headers['uri'] else: return fp.close() newurl = basejoin(self.type + ':' + url, newurl) newurl_lower = newurl.lower() if not (newurl_lower.startswith('http://') or newurl_lower.startswith('https://') or newurl_lower.startswith('ftp://')): raise IOError( 'redirect error', errcode, errmsg + " - Redirection to url '%s' is not allowed" % newurl, headers) return self.open(newurl)