def __init__(self, request, timeout=180):
    self._url = urldefrag(request.url)[0]
    # converting to bytes to comply with the Twisted interface
    self.url = to_bytes(self._url, encoding='ascii')
    self.method = to_bytes(request.method, encoding='ascii')
    self.body = request.body or None
    self.headers = Headers(request.headers)
    self.response_headers = None
    self.timeout = request.meta.get('download_timeout') or timeout
    self.start_time = time()
    self.deferred = defer.Deferred().addCallback(self._build_response, request)

    # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
    # to have _disconnectedDeferred. See Twisted r32329. As Scrapy
    # implements its own logic to handle redirects, there is no need
    # to add the callback _waitForDisconnect. Specifically, this avoids
    # the AttributeError exception when clientConnectionFailed is called.
    self._disconnectedDeferred = defer.Deferred()

    self._set_connection_attributes(request)

    # set Host header based on url
    self.headers.setdefault('Host', self.netloc)

    # set Content-Length based on the length of the body
    if self.body is not None:
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault("Connection", "close")
    # Content-Length must be specified in POST method even with no body
    elif self.method == b'POST':
        self.headers['Content-Length'] = 0
def get_defaults_spider_mw(self):
    crawler = get_crawler(Spider)
    spider = crawler._create_spider('foo')
    defaults = {
        to_bytes(k): [to_bytes(v)]
        for k, v in crawler.settings.get('DEFAULT_REQUEST_HEADERS').items()
    }
    return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
def testHostHeader(self):
    # if we pass the Host header explicitly, it should be used; otherwise
    # it should be extracted from the url
    return defer.gatherResults([
        getPage(self.getURL("host")).addCallback(
            self.assertEquals, to_bytes("127.0.0.1:%d" % self.portno)),
        getPage(self.getURL("host"), headers={"Host": "www.example.com"}).addCallback(
            self.assertEquals, to_bytes("www.example.com"))])
def renderRequest(self, request, nlist):
    s = """<html> <head></head> <body>"""
    args = request.args.copy()
    for nl in nlist:
        args[b"n"] = [to_bytes(str(nl))]
        argstr = urlencode(args, doseq=True)
        s += "<a href='/follow?%s'>follow %d</a><br>" % (argstr, nl)
    s += """</body>"""
    request.write(to_bytes(s))
    request.finish()
def response_httprepr(response):
    """Return raw HTTP representation (as bytes) of the given response. This
    is provided only for reference, since it's not the exact stream of bytes
    that was received (that's not exposed by Twisted).
    """
    s = b"HTTP/1.1 " + to_bytes(str(response.status)) + b" " + \
        to_bytes(http.RESPONSES.get(response.status, b'')) + b"\r\n"
    if response.headers:
        s += response.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += response.body
    return s
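# Usage sketch for reference (assumes response_httprepr is importable from
# scrapy.utils.response, as in upstream Scrapy; the reason phrase comes from
# twisted.web.http.RESPONSES):
from scrapy.http import Response
from scrapy.utils.response import response_httprepr

resp = Response('http://example.com', status=404, body=b'gone')
print(response_httprepr(resp))  # e.g. b'HTTP/1.1 404 Not Found\r\n\r\ngone'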
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    return (
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )
def request_httprepr(request):
    """Return the raw HTTP representation (as bytes) of the given request.
    This is provided only for reference since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).
    """
    parsed = urlparse_cached(request)
    path = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
    s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
    s += b"Host: " + to_bytes(parsed.hostname) + b"\r\n"
    if request.headers:
        s += request.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += request.body
    return s
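# Usage sketch (assumes request_httprepr is importable from
# scrapy.utils.request, as in upstream Scrapy; a bare Request carries no
# headers and an empty body, so only the request line and Host appear):
from scrapy import Request
from scrapy.utils.request import request_httprepr

req = Request('http://example.com/page?x=1')
print(request_httprepr(req))  # b'GET /page?x=1 HTTP/1.1\r\nHost: example.com\r\n\r\n'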
def multpart_encode(form_data):
    boundary_str = ''.join(
        random.choice(string.digits + string.ascii_letters) for i in range(20))
    _boundary = to_bytes(boundary_str)
    # the Content-Type header value stays a native str; only the body parts
    # need to be bytes (str + bytes concatenation would raise a TypeError)
    content_type = "multipart/form-data; boundary=" + boundary_str
    body = []
    for name, value in form_data.items():
        body.append(b'--' + _boundary)
        body.append(b'Content-Disposition: form-data; name="' + to_bytes(name) + b'"')
        body.append(b'')
        body.append(to_bytes(value))
    body.append(b'--' + _boundary + b'--')
    return content_type, b'\r\n'.join(body)
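# Usage sketch for the helper above (hypothetical form data; the boundary is
# random, so the exact body differs between runs):
content_type, body = multpart_encode({'user': 'anna', 'token': '42'})
# content_type -> 'multipart/form-data; boundary=<20 random alphanumerics>'
# body begins  -> b'--<boundary>\r\nContent-Disposition: form-data; name="user"\r\n\r\nanna\r\n--...'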
def _build_response(self, result, request, protocol):
    self.result = result
    respcls = responsetypes.from_args(url=request.url)
    protocol.close()
    body = protocol.filename or protocol.body.read()
    headers = {"local filename": protocol.filename or '', "size": protocol.size}
    return respcls(url=request.url, status=200, body=to_bytes(body), headers=headers)
def thumb_path(self, request, thumb_id, response=None, info=None):
    url = request.url
    image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
    if self.conf["IMAGES_STORE_FORMAT"] == 'FLAT':
        return '{ig}.jpg'.format(ig=image_guid)
    else:
        return 'thumbs/{p}/{ig}.jpg'.format(p=thumb_id, ig=image_guid)
def thumb_path(self, request, thumb_id, response=None, info=None):
    ## start of deprecation warning block (can be removed in the future)
    def _warn():
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn(
            "ImagesPipeline.thumb_key(url) method is deprecated, please use "
            "thumb_path(request, thumb_id, response=None, info=None) instead",
            category=ScrapyDeprecationWarning, stacklevel=1,
        )

    # check if called from thumb_key with url as first argument
    if not isinstance(request, Request):
        _warn()
        url = request
    else:
        url = request.url

    # detect if thumb_key() method has been overridden
    if not hasattr(self.thumb_key, "_base"):
        _warn()
        return self.thumb_key(url, thumb_id)
    ## end of deprecation warning block

    thumb_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
    return "thumbs/%s/%s.jpg" % (thumb_id, thumb_guid)
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an
    already parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_bytes(url, encoding))
def requestTunnel(self, protocol):
    """Asks the proxy to open a tunnel."""
    tunnelReq = (
        b'CONNECT ' +
        to_bytes(self._tunneledHost, encoding='ascii') + b':' +
        to_bytes(str(self._tunneledPort)) +
        b' HTTP/1.1\r\n')
    if self._proxyAuthHeader:
        tunnelReq += \
            b'Proxy-Authorization: ' + self._proxyAuthHeader + b'\r\n'
    tunnelReq += b'\r\n'
    protocol.transport.write(tunnelReq)
    self._protocolDataReceived = protocol.dataReceived
    protocol.dataReceived = self.processProxyResponse
    self._protocol = protocol
    return protocol
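# Worked example (hypothetical values): with _tunneledHost 'example.com',
# _tunneledPort 443 and no _proxyAuthHeader set, the bytes written to the
# proxy transport are:
#
#   b'CONNECT example.com:443 HTTP/1.1\r\n\r\n'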
def _test_request(self, **kwargs):
    r = self.request_class('http://scrapytest.org/rpc2', **kwargs)
    self.assertEqual(r.headers[b'Content-Type'], b'text/xml')
    self.assertEqual(r.body, to_bytes(xmlrpclib.dumps(**kwargs)))
    self.assertEqual(r.method, 'POST')
    self.assertEqual(r.encoding, kwargs.get('encoding', 'utf-8'))
    # assertTrue takes (expr, msg); passing True as a second positional
    # argument only set the failure message, so drop it
    self.assertTrue(r.dont_filter)
def export_item(self, item):
    itemdict = dict(self._get_serialized_fields(item))
    if hasattr(item, '_context'):
        itemdict['@context'] = item._context
    data = self.encoder.encode(itemdict) + '\n'
    self.file.write(to_bytes(data, self.encoding))
def testParse(self):
    lip = '127.0.0.1'
    tests = (
        ("http://127.0.0.1?c=v&c2=v2#fragment", ('http', lip, lip, 80, '/?c=v&c2=v2')),
        ("http://127.0.0.1/?c=v&c2=v2#fragment", ('http', lip, lip, 80, '/?c=v&c2=v2')),
        ("http://127.0.0.1/foo?c=v&c2=v2#frag", ('http', lip, lip, 80, '/foo?c=v&c2=v2')),
        ("http://127.0.0.1:100?c=v&c2=v2#fragment", ('http', lip+':100', lip, 100, '/?c=v&c2=v2')),
        ("http://127.0.0.1:100/?c=v&c2=v2#frag", ('http', lip+':100', lip, 100, '/?c=v&c2=v2')),
        ("http://127.0.0.1:100/foo?c=v&c2=v2#frag", ('http', lip+':100', lip, 100, '/foo?c=v&c2=v2')),
        ("http://127.0.0.1", ('http', lip, lip, 80, '/')),
        ("http://127.0.0.1/", ('http', lip, lip, 80, '/')),
        ("http://127.0.0.1/foo", ('http', lip, lip, 80, '/foo')),
        ("http://127.0.0.1?param=value", ('http', lip, lip, 80, '/?param=value')),
        ("http://127.0.0.1/?param=value", ('http', lip, lip, 80, '/?param=value')),
        ("http://127.0.0.1:12345/foo", ('http', lip+':12345', lip, 12345, '/foo')),
        ("http://spam:12345/foo", ('http', 'spam:12345', 'spam', 12345, '/foo')),
        ("http://spam.test.org/foo", ('http', 'spam.test.org', 'spam.test.org', 80, '/foo')),
        ("https://127.0.0.1/foo", ('https', lip, lip, 443, '/foo')),
        ("https://127.0.0.1/?param=value", ('https', lip, lip, 443, '/?param=value')),
        ("https://127.0.0.1:12345/", ('https', lip+':12345', lip, 12345, '/')),
        ("http://scrapytest.org/foo ", ('http', 'scrapytest.org', 'scrapytest.org', 80, '/foo')),
        ("http://egg:7890 ", ('http', 'egg:7890', 'egg', 7890, '/')),
    )
    for url, test in tests:
        test = tuple(to_bytes(x) if not isinstance(x, int) else x
                     for x in test)
        self.assertEquals(client._parse(url), test, url)
def file_path(self, request, response=None, info=None):
    ## start of deprecation warning block (can be removed in the future)
    def _warn():
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                      'please use file_path(request, response=None, info=None) instead',
                      category=ScrapyDeprecationWarning, stacklevel=1)

    # check if called from image_key or file_key with url as first argument
    if not isinstance(request, Request):
        _warn()
        url = request
    else:
        url = request.url

    # detect if file_key() or image_key() methods have been overridden
    if not hasattr(self.file_key, '_base'):
        _warn()
        return self.file_key(url)
    elif not hasattr(self.image_key, '_base'):
        _warn()
        return self.image_key(url)
    ## end of deprecation warning block

    image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
    return 'full/%s.jpg' % (image_guid)
def file_path(self, request, response=None, info=None):
    ## start of deprecation warning block (can be removed in the future)
    def _warn():
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                      'file_path(request, response=None, info=None) instead',
                      category=ScrapyDeprecationWarning, stacklevel=1)

    # check if called from file_key with url as first argument
    if not isinstance(request, Request):
        _warn()
        url = request
    else:
        url = request.url

    # detect if file_key() method has been overridden
    if not hasattr(self.file_key, '_base'):
        _warn()
        return self.file_key(url)
    ## end of deprecation warning block

    media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
    media_ext = os.path.splitext(url)[1]  # change to request.url after deprecation
    return 'full/%s%s' % (media_guid, media_ext)
def export_item(self, item):
    if self.first_item:
        self.first_item = False
    else:
        self.file.write(b",\n")
    itemdict = dict(self._get_serialized_fields(item))
    self.file.write(to_bytes(self.encoder.encode(itemdict)))
def to_native_str(text, encoding=None, errors='strict'):
    """Return str representation of `text` (bytes in Python 2.x and unicode
    in Python 3.x).
    """
    if six.PY2:
        return to_bytes(text, encoding, errors)
    else:
        return to_unicode(text, encoding, errors)
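# Round-trip sketch of the helpers this function dispatches to (to_bytes and
# to_unicode live in scrapy.utils.python; the default encoding is utf-8):
from scrapy.utils.python import to_bytes, to_unicode

assert to_bytes(u'caf\xe9') == b'caf\xc3\xa9'
assert to_unicode(b'caf\xc3\xa9') == u'caf\xe9'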
def test_3xx_and_invalid_gzipped_body_must_redirect(self):
    """Regression test for a failure when redirecting a compressed request.

    This happens when the httpcompression middleware is executed before the
    redirect middleware and attempts to decompress a non-compressed body.
    In particular, when some website returns a 30x response with the header
    'Content-Encoding: gzip', producing the error below:

        exceptions.IOError: Not a gzipped file
    """
    req = Request('http://example.com')
    body = b'<p>You are being redirected</p>'
    resp = Response(req.url, status=302, body=body, headers={
        'Content-Length': str(len(body)),
        'Content-Type': 'text/html',
        'Content-Encoding': 'gzip',
        'Location': 'http://example.com/login',
    })
    ret = self._download(request=req, response=resp)
    self.assertTrue(isinstance(ret, Request),
                    "Not redirected: {0!r}".format(ret))
    self.assertEqual(to_bytes(ret.url), resp.headers['Location'],
                     "Not redirected to location header")
def test_get_key(self):
    mw = self.mwcls(self.temp_dir, reset=True)
    test_req1 = Request('http://url1')
    self.assertEqual(mw._get_key(test_req1),
                     to_bytes(request_fingerprint(test_req1)))
    test_req2 = Request('http://url2', meta={'deltafetch_key': 'dfkey1'})
    self.assertEqual(mw._get_key(test_req2), b'dfkey1')
def writeLine(self, data):
    """
    Write the data to the process stdin, adding the new-line delimiter
    if necessary
    """
    data = to_bytes(data)
    if not data.endswith(b'\n'):
        data += self.__delimiter
    self.transport.write(data)
def render_GET(self, request):
    output = {
        'headers': dict(
            (to_unicode(k), [to_unicode(v) for v in vs])
            for k, vs in request.requestHeaders.getAllRawHeaders()),
        'body': to_unicode(request.content.read()),
    }
    return to_bytes(json.dumps(output))
def export_item(self, item):
    if self.first_item:
        self.first_item = False
    else:
        self.file.write(b',\n')
    itemdict = dict(self._get_serialized_fields(item))
    data = self.encoder.encode(itemdict)
    self.file.write(to_bytes(data, self.encoding))
def browser_open(burl):
    path = urlparse(burl).path
    if not os.path.exists(path):
        path = burl.replace('file://', '')
    with open(path, "rb") as f:
        bbody = f.read()
    self.assertIn(b'<base href="' + to_bytes(url) + b'">', bbody)
    return True
def file_path(self, request, response=None, info=None):
    url = request.url
    image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
    if self.conf["IMAGES_STORE_FORMAT"] == 'FLAT':
        return '{ig}.jpg'.format(ig=image_guid)
    elif self.conf["IMAGES_STORE_FORMAT"] == 'THUMBS':
        return 'thumbs/{p}/{ig}.jpg'.format(
            p=next(iter(list(settings.get('IMAGES_THUMBS').keys()))),
            ig=image_guid)
    else:
        return 'full/{ig}.jpg'.format(ig=image_guid)
def _failed(self, result, request):
    message = result.getErrorMessage()
    if result.type == CommandFailed:
        m = _CODE_RE.search(message)
        if m:
            ftpcode = m.group()
            httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
            return Response(url=request.url, status=httpcode, body=to_bytes(message))
    raise result.type(result.value)
def request_fingerprint(request, include_headers=None):
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource
    the request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example are cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers use the
    include_headers argument, which is a list of Request headers to include.
    """
    if include_headers:
        include_headers = tuple(to_bytes(h.lower())
                                for h in sorted(include_headers))
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url)))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
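# Usage sketch illustrating the docstring above (assumes a Scrapy version
# that still ships scrapy.utils.request.request_fingerprint):
from scrapy import Request
from scrapy.utils.request import request_fingerprint

# canonicalize_url sorts the query string, so these two fingerprints match:
fp1 = request_fingerprint(Request('http://www.example.com/query?id=111&cat=222'))
fp2 = request_fingerprint(Request('http://www.example.com/query?cat=222&id=111'))
assert fp1 == fp2

# headers are ignored unless explicitly included via include_headers:
fp3 = request_fingerprint(
    Request('http://www.example.com/query?id=111&cat=222',
            headers={'Cookie': 'session=abc'}),
    include_headers=['Cookie'])
assert fp1 != fp3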
def test_timeoutNotTriggering(self):
    """
    When a non-zero timeout is passed to L{getPage} and the page is
    retrieved before the timeout period elapses, the L{Deferred} is
    called back with the contents of the page.
    """
    d = getPage(self.getURL("host"), timeout=100)
    d.addCallback(
        self.assertEquals, to_bytes("127.0.0.1:%d" % self.portno))
    return d
def getPage(url, contextFactory=None, response_transform=None, *args, **kwargs):
    """Adapted version of twisted.web.client.getPage"""
    def _clientfactory(url, *args, **kwargs):
        url = to_unicode(url)
        timeout = kwargs.pop('timeout', 0)
        f = client.ScrapyHTTPClientFactory(
            Request(url, *args, **kwargs), timeout=timeout)
        f.deferred.addCallback(response_transform or (lambda r: r.body))
        return f

    from twisted.web.client import _makeGetterFactory
    return _makeGetterFactory(
        to_bytes(url), _clientfactory, contextFactory=contextFactory,
        *args, **kwargs).deferred
def _get_agent(self, request, timeout):
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        proxyHost = to_unicode(proxyHost)
        omitConnectTunnel = b'noconnect' in proxyParams
        if scheme == b'https' and not omitConnectTunnel:
            proxyConf = (proxyHost, proxyPort,
                         request.headers.get(b'Proxy-Authorization', None))
            return self._TunnelingAgent(reactor, proxyConf,
                                        contextFactory=self._contextFactory,
                                        connectTimeout=timeout,
                                        bindAddress=bindaddress,
                                        pool=self._pool)
        else:
            return self._ProxyAgent(reactor,
                                    proxyURI=to_bytes(proxy, encoding='ascii'),
                                    connectTimeout=timeout,
                                    bindAddress=bindaddress,
                                    pool=self._pool)
    return self._Agent(reactor, contextFactory=self._contextFactory,
                       connectTimeout=timeout, bindAddress=bindaddress,
                       pool=self._pool)
def process_item(self, item, spider):
    session = self.session()
    processing = Processing(s3_raw=hashlib.sha1(to_bytes(
        item['doc_url'])).hexdigest()).save_to_db(session)
    policy = Policy(country=item['country'],
                    geo_code=item['geo_code'],
                    level=item['level'],
                    source=item['source'],
                    title=item['title'],
                    reference=item['reference'],
                    authorship=item['authorship'],
                    resume=item['resume'],
                    publication_date=item['publication_date'],
                    enforcement_date=item['enforcement_date'],
                    url=item['url'],
                    doc_url=item['doc_url'],
                    doc_name=item['doc_name'],
                    doc_type=item['doc_type'],
                    processing_id=processing.id).save_to_db(session)
def process_request(self, request, spider):
    cookies = spider.cookies
    driver = init_chromium(spider.selenium_hostname, cookies)
    print('SeleniumMiddleware - getting the page')
    driver.get(request.url)
    # request.meta['driver'] = self.driver  # to access driver from response
    print('waiting for page loading')
    spider.wait_page_completion(driver=driver)
    print('SeleniumMiddleware - retrieving body')
    body = to_bytes(driver.page_source)  # body must be of type bytes
    request.meta['driver'] = driver
    return HtmlResponse(driver.current_url, body=body,
                        encoding='utf-8', request=request)
def file_path(self, request, response=None, info=None):
    """Return the relative storage path for the file, derived from the request."""
    def _warn():
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn(
            'FilesPipeline.file_key(url) method is deprecated, please use '
            'file_path(request, response=None, info=None) instead',
            category=ScrapyDeprecationWarning, stacklevel=1)

    # check if called from file_key with url as first argument
    if not isinstance(request, Request):
        _warn()
        url = request
    else:
        url = request.url

    media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # sha1 hash of the url
    media_ext = os.path.splitext(url)[1]
    return 'files/%s%s' % (media_guid, media_ext)
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work
    """
    # HtmlResponse and TextResponse live in scrapy.http, not the scrapy
    # top-level package
    from scrapy.http import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            body = body.replace(b'<head>', to_bytes(repl))
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    os.write(fd, body)
    os.close(fd)
    return _openfunc("file://%s" % fname)
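# Typical debugging usage from a spider callback (open_in_browser ships in
# scrapy.utils.response in upstream Scrapy):
#
#   from scrapy.utils.response import open_in_browser
#
#   def parse(self, response):
#       open_in_browser(response)  # inspect what Scrapy actually received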
def file_path(self, request, response=None, info=None, *, item=None):  # function 2
    img_path = ""
    try:
        # HERE ARE CUSTOM CHANGES
        marketplace = request.meta.get("marketplace")
        asin = request.meta.get("asin")
        img_path = os.path.join(marketplace, asin + ".jpg")
    except Exception:  # avoid a bare except; fall back to the default scheme
        media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        media_ext = os.path.splitext(request.url)[1]
        # Handles empty and wild extensions by trying to guess the
        # mime type then extension or default to empty string otherwise
        if media_ext not in mimetypes.types_map:
            media_ext = ''
            media_type = mimetypes.guess_type(request.url)[0]
            if media_type:
                media_ext = mimetypes.guess_extension(media_type)
        img_path = f'full/{media_guid}{media_ext}'
    return img_path
def run(output_path, download_workers):
    crawl(download_workers)
    reactor.run()  # the script will block here until the last crawl call is finished
    encoder = ScrapyJSONEncoder()
    with open("results/publications.jl", "rb+") as publications_info_file:
        with open(output_path, "wb+") as output_file:
            output_file.write(b"[")
            first = True
            while True:
                line = publications_info_file.readline()
                if not line:
                    break
                if first:
                    output_file.write(b"\n")
                    first = False
                else:
                    output_file.write(b",\n")
                company_info_data = json.loads(line)
                data = encoder.encode(company_info_data)
                output_file.write(to_bytes(data))
            output_file.write(b"\n]\n")
def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
    from twisted.internet import reactor
    bind_address = request.meta.get('bindaddress') or self._bind_address
    proxy = request.meta.get('proxy')
    if proxy:
        _, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
        scheme = _parse(request.url)[0]
        proxy_host = proxy_host.decode()
        omit_connect_tunnel = b'noconnect' in proxy_params
        if omit_connect_tunnel:
            warnings.warn(
                "Using HTTPS proxies in the noconnect mode is not "
                "supported by the downloader handler. If you use Zyte "
                "Smart Proxy Manager, it doesn't require this mode "
                "anymore, so you should update scrapy-crawlera to "
                "scrapy-zyte-smartproxy and remove '?noconnect' from the "
                "Zyte Smart Proxy Manager URL.")
        if scheme == b'https' and not omit_connect_tunnel:
            # ToDo
            raise NotImplementedError(
                'Tunneling via CONNECT method using HTTP/2.0 is not yet supported')
        return self._ProxyAgent(
            reactor=reactor,
            context_factory=self._context_factory,
            proxy_uri=URI.fromBytes(to_bytes(proxy, encoding='ascii')),
            connect_timeout=timeout,
            bind_address=bind_address,
            pool=self._pool,
        )
    return self._Agent(
        reactor=reactor,
        context_factory=self._context_factory,
        connect_timeout=timeout,
        bind_address=bind_address,
        pool=self._pool,
    )
def page_parse(self, response):
    # pagetitle = response.xpath('/html/head/title').extract_first()
    # print("detail parsing:", pagetitle)
    text1 = response.xpath('//*[@class="cp-js"]/p/text()').extract()
    # text2 = response.xpath('//*[@class="wz-box"]/div/p/text()').extract_first()
    item = WineItem()
    item['url'] = response.url
    item['slogan'] = re.sub(r'—*\s+', '', text1[0])  # slogan
    item['name'] = re.sub(r':', '', text1[1])  # name
    item['odor_type'] = re.sub(r':', '', text1[2])  # aroma type
    item['volume'] = re.sub(r':', '', text1[3])  # alcohol content
    item['net_weight'] = re.sub(r':', '', text1[4])  # net content
    item['general_agency'] = re.sub(r':', '', text1[5])  # general agency
    item['package'] = re.sub(r':', '', text1[6])  # packaging
    item['price'] = ''  # price
    item['sku'] = hashlib.sha1(to_bytes(item['name'])).hexdigest()
    item['images_urls_local'] = 'http://192.168.33.55/pics/' + item[
        'sku'] + '/1.jpg'
    # general agency
    # print(re.sub(r':+', '', text1[5]))
    # text2 = response.xpath('//*[@class="wz-box"]/div/p/text()').extract()
    # print(re.sub(r'\s+', '', text2))
    pictureUrls = response.xpath(
        '//*[@class="items"]/ul/li/img/@bimg').extract()
    img_urls = []
    for picture in pictureUrls:
        # resolve relative image urls against the page url
        # (urljoin lives in urllib.parse, not urllib.request)
        pictureUrl = response.urljoin(picture)
        img_urls.append(pictureUrl)
    item['images_urls'] = img_urls
    yield item
def process_request(self, request, spider):
    driver = spider.driver
    print('SeleniumMiddleware - getting the page')
    driver.get(request.url)
    # request.meta['driver'] = self.driver  # to access driver from response
    print('waiting for page loading')
    profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
    get_by_xpath(driver, profile_xpath)

    # wait until links to other users are shown so the crawl can continue
    get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)

    print('SeleniumMiddleware - retrieving body')
    body = to_bytes(driver.page_source)  # body must be of type bytes
    return HtmlResponse(driver.current_url, body=body,
                        encoding='utf-8', request=request)
def parse_items(self, response):
    items = []
    item = AquariumScapeItem()
    src = response.css('#index_photo img::attr(src)').extract_first()
    tank_img = response.urljoin(src)
    item['image_urls'] = [tank_img]
    image_guid = hashlib.sha1(to_bytes(tank_img)).hexdigest()  # change to request.url after deprecation
    item['image_sha1'] = '%s.jpg' % (image_guid)
    item['user_name'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "reeftank_panel_top", " " ))]//span/text()').extract()
    item['rank'] = response.selector.css('.margin_b+ b::text').extract()
    item['average_category_score'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "clear", " " )) and (((count(preceding-sibling::*) + 1) = 7) and parent::*)]//b/text()').extract()
    item['average_tank_score'] = response.selector.css('.clear:nth-child(5) b::text').extract()
    item['num_ratings'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "clear", " " )) and (((count(preceding-sibling::*) + 1) = 3) and parent::*)]//b/text()').extract()
    item['fish_kept'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "spe-list2", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "right", " " ))]/text()').extract()
    item['plants_kept'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "spe-list1", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "right", " " ))]/text()').extract()
    item['comments'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "image_comment", " " ))]/text()').extract()

    # Return all the found items
    items.append(item)
    return items
def file_path(self, request, response=None, info=None):
    ## start of deprecation warning block (can be removed in the future)
    def _warn():
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn(
            'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
            'please use file_path(request, response=None, info=None) instead',
            category=ScrapyDeprecationWarning, stacklevel=1)

    # check if called from image_key or file_key with url as first argument
    if not isinstance(request, Request):
        _warn()
        url = request
    else:
        url = request.url

    # detect if file_key() or image_key() methods have been overridden
    if not hasattr(self.file_key, '_base'):
        _warn()
        return self.file_key(url)
    elif not hasattr(self.image_key, '_base'):
        _warn()
        return self.image_key(url)
    ## end of deprecation warning block

    image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
    folder = request.meta['folder']
    if len(folder):
        folder = folder[0]
        # pdb.set_trace()
        # strip characters that are invalid in file system paths
        table = dict((ord(char), None) for char in "|\\?*<\":>+[]/'")
        folder = folder.translate(table)
    else:
        folder = u'未知'  # "unknown"
    # pdb.set_trace()
    return '%s/%s.jpg' % (folder, image_guid)
def jsonrpc_client_call(url, method, *args, **kwargs):
    """Execute a JSON-RPC call on the given url"""
    if args and kwargs:
        raise ValueError(
            "Pass *args or **kwargs but not both to jsonrpc_client_call")
    req = {
        'jsonrpc': '2.0',
        'method': method,
        'params': args or kwargs,
        'id': 1
    }
    data = to_bytes(json.dumps(req))
    body = urllib.request.urlopen(url, data).read()
    res = json.loads(body.decode('utf-8'))
    if 'result' in res:
        return res['result']
    elif 'error' in res:
        er = res['error']
        raise JsonRpcError(er['code'], er['message'], er['data'])
    else:
        msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
        raise ValueError(msg)
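# Usage sketch (hypothetical endpoint and method name; any JSON-RPC 2.0
# service would do):
#
#   result = jsonrpc_client_call('http://localhost:6080/jsonrpc', 'engine.pause')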
def export_item(self, item):
    if self.first_item:
        self.first_item = False
    else:
        self.file.write(b',')
        self._beautify_newline()
    __item_properties = dict(self._get_serialized_fields(item))
    __item = {
        'type': 'Feature',
        'properties': __item_properties,
        'geometry': {
            'type': 'Point',
            'coordinates': (
                float(__item_properties.get('longitude', 0.0)),
                float(__item_properties.get('latitude', 0.0)))}}
    self.file.write(to_bytes(
        self.encoder.encode(__item), self.encoding))
def test_ftp_local_filename(self):
    f, local_fname = tempfile.mkstemp()
    local_fname = to_bytes(local_fname)
    os.close(f)
    meta = {"ftp_local_filename": local_fname}
    meta.update(self.req_meta)
    request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                      meta=meta)
    d = self.download_handler.download_request(request, None)

    def _test(r):
        self.assertEqual(r.body, local_fname)
        self.assertEqual(r.headers, {
            b'Local Filename': [local_fname],
            b'Size': [b'17']
        })
        self.assertTrue(os.path.exists(local_fname))
        with open(local_fname, "rb") as f:
            self.assertEqual(f.read(), b"I have the power!")
        os.remove(local_fname)

    return self._add_test_callbacks(d, _test)
def file_path(self, request, response=None, info=None):
    def _warn():
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                      'please use file_path(request, response=None, info=None) instead',
                      category=ScrapyDeprecationWarning, stacklevel=1)

    if not isinstance(request, Request):
        _warn()
        url = request
    else:
        url = request.url

    if not hasattr(self.file_key, '_base'):
        _warn()
        return self.file_key(url)
    elif not hasattr(self.image_key, '_base'):
        _warn()
        return self.image_key(url)

    image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
    return 'full/movie250/%s.jpg' % (image_guid)
def process_item(self, item, spider):
    session = self.session()
    processing = Processing(
        s3_raw=hashlib.sha1(to_bytes(item['doc_url'])).hexdigest())
    session.add(processing)
    policy = Policy(
        country=item['country'],
        state=item['state'],
        data_source=item['data_source'],
        law_class=item['law_class'],
        title=item['title'],
        reference=item['reference'],
        authorship=item['authorship'],
        resume=item['summary'],
        publication_date=item['publication_date'],
        url=item['url'],
        doc_url=item['doc_url'] + self.API_key,
        doc_name=item['doc_name'],
        processing=processing,
    )
    session.merge(policy)
    # print(policy)
    session.commit()
def _downloadImg(imgUrl):
    savePrefix = "/root/SPIDERIMAGESDB/DATASOURCE/Lofter/" + strDate + "/"
    if not os.path.isdir(savePrefix):
        os.makedirs(savePrefix)
    image_guid = hashlib.sha1(to_bytes(imgUrl)).hexdigest()
    image_name = None
    if ".jpg" in imgUrl:
        image_name = image_guid + ".jpg"
    if ".png" in imgUrl:
        image_name = image_guid + ".png"
    if ".jpeg" in imgUrl:
        image_name = image_guid + ".jpeg"
    if image_name is None:
        return
    try:
        with open(savePrefix + image_name, "wb") as writer:
            writer.write(urllib.request.urlopen(imgUrl).read())
    except Exception as es:
        print(es)
    with codecs.open("./download" + strDate + ".csv", mode='a', encoding='utf8') as writer:
        writer.write(imgUrl + ',Lofter/' + strDate + '/' + image_name)
        writer.write('\n')
    print(imgUrl, image_name)
def process_request(self, request, spider):
    self.driver.get(request.url)
    if spider.name == 'wanted' and request.meta != {}:
        time.sleep(2)
        some_tag = self.driver.find_element_by_class_name(
            '_3h_f6TfissC0l7ogPcn7lY')
        action = ActionChains(self.driver)
        action.move_to_element(some_tag).perform()
        time.sleep(1)
    else:
        time.sleep(1.2)
    if spider.name == 'wanted' and request.meta == {}:
        self.infinite_scroll_control()
    elif spider.name == 'roketpunch' and 'job_card_company' in request.meta:
        self.see_more_button_click()
    elif spider.name == 'naver' and request.meta == {}:
        self.naver_more_button()
    body = to_bytes(text=self.driver.page_source)
    return HtmlResponse(url=request.url, body=body,
                        encoding='utf-8', request=request)
def file_path(self, request, response=None, info=None):
    ## start of deprecation warning block (can be removed in the future)
    def _warn():
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn(
            'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
            'please use file_path(request, response=None, info=None) instead',
            category=ScrapyDeprecationWarning, stacklevel=1)

    # check if called from image_key or file_key with url as first argument
    if not isinstance(request, Request):
        _warn()
        url = request
    else:
        url = request.url

    # detect if file_key() or image_key() methods have been overridden
    if not hasattr(self.file_key, '_base'):
        _warn()
        return self.file_key(url)
    elif not hasattr(self.image_key, '_base'):
        _warn()
        return self.image_key(url)
    ## end of deprecation warning block

    image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
    # Override of the base method: if the item carries save_sub_dir, store
    # the image in that subdirectory
    item = request.meta['item']
    if 'save_sub_dir' in item:
        path = '%s/full/%s.jpg' % (item['save_sub_dir'], image_guid)
    else:
        path = 'full/%s.jpg' % image_guid
    return path
def process_request(self, request, spider):
    # Called for each request that goes through the downloader middleware.
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    if spider.browser:
        request.meta['browser'] = self.browser  # to access driver from response
        self.browser.get(request.url)
        # wait for JS evaluation
        time.sleep(15)
        body = to_bytes(self.browser.page_source)  # body must be of type bytes
        return HtmlResponse(self.browser.current_url, body=body,
                            encoding='utf-8', request=request)
    else:
        return None
def parse_page(self, response):
    item = MeituItem()
    item['referer'] = response.meta['item']['referer']
    href = response.xpath("//div[@class='content-pic']/a/img/@src").extract_first()
    image_name = response.xpath("//div[@class='content']/h5/text()").extract_first()
    item['image_name'] = image_name.split('(')[0]
    item['image_hash'] = hashlib.sha1(to_bytes(item['image_name'])).hexdigest()
    try:
        item['image_num'] = image_name.split('(')[1].split(')')[0].zfill(2)
    except IndexError:
        item['image_num'] = '01'
    item['image_urls'] = [href]
    item['spider_name'] = self.name
    yield item

    # next page ('下一页' is the page's "next page" link text)
    next_href = response.xpath("//a[text()='下一页']/@href").extract_first()
    if next_href:
        next_href = urljoin(response.url, next_href)
        yield scrapy.Request(
            url=next_href,
            callback=self.parse_page,
            meta={'item': {'referer': response.url}}
        )
def process_request(self, request, spider):
    driver = spider.driver
    logger.info('SeleniumMiddleware - getting the page')
    driver.get(request.url)

    logger.info('SeleniumMiddleware - click more options')
    more_option = get_by_xpath(
        driver, '//div/div/button[@class="more-options dropdown-caret"]')
    more_option.send_keys(Keys.NULL)
    more_option.click()

    logger.info('SeleniumMiddleware - wait for names')
    name = get_by_xpath(driver, '//ul[@class="browse-map-list"]/li/h4/a')
    name.send_keys(Keys.NULL)

    # request.meta['driver'] = self.driver  # to access driver from response
    logging.info('SeleniumMiddleware - retrieving body')
    body = to_bytes(driver.page_source)  # body must be of type bytes
    return HtmlResponse(driver.current_url, body=body,
                        encoding='utf-8', request=request)
def store_response(self, spider, request, response):
    """Store the given response in the cache."""
    rpath = self._get_request_path(spider, request)
    if not os.path.exists(rpath):
        os.makedirs(rpath)
    metadata = {
        'url': request.url,
        'method': request.method,
        'status': response.status,
        'response_url': response.url,
        'timestamp': time(),
    }
    with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
        f.write(to_bytes(repr(metadata)))
    with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
        pickle.dump(metadata, f, protocol=2)
    with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(response.headers))
    with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
        f.write(response.body)
    with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(request.headers))
    with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
        f.write(request.body)
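# Resulting on-disk layout per cached request (the directory comes from
# _get_request_path, typically a fingerprint-derived path):
#
#   <rpath>/meta               repr() of the metadata dict
#   <rpath>/pickled_meta       pickled metadata (protocol 2)
#   <rpath>/response_headers   raw response headers
#   <rpath>/response_body      raw response body
#   <rpath>/request_headers    raw request headers
#   <rpath>/request_body       raw request body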
def store_response(self, spider, request, response):
    """Store the given response in the cache."""
    rpath = self._get_request_path(spider, request)
    if not os.path.exists(rpath):
        os.makedirs(rpath)
    metadata = {
        "url": request.url,
        "method": request.method,
        "status": response.status,
        "response_url": response.url,
        "timestamp": time(),
    }
    with self._open(os.path.join(rpath, "meta"), "wb") as f:
        f.write(to_bytes(repr(metadata)))
    with self._open(os.path.join(rpath, "pickled_meta"), "wb") as f:
        pickle.dump(metadata, f, protocol=4)
    with self._open(os.path.join(rpath, "response_headers"), "wb") as f:
        f.write(headers_dict_to_raw(response.headers))
    with self._open(os.path.join(rpath, "response_body"), "wb") as f:
        f.write(response.body)
    with self._open(os.path.join(rpath, "request_headers"), "wb") as f:
        f.write(headers_dict_to_raw(request.headers))
    with self._open(os.path.join(rpath, "request_body"), "wb") as f:
        f.write(request.body)
def process_item(self, item, spider):
    '''
    Download the images for an item.
    :param item:
    :param spider:
    :return:
    '''
    for image in item['image_urls'].split(','):
        # image file name
        image_guid = hashlib.sha1(to_bytes(image)).hexdigest()
        image_name = '%s.jpg' % (image_guid)
        house_id = item['from_url'].split('/')[-1].replace('.html', '')
        file_path = '%s/%s' % (self.imagepath, house_id)
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        image_path = '%s/%s/%s' % (self.imagepath, house_id, image_name)
        if not os.path.exists(image_path):
            urlretrieve(image, image_path)
        else:
            raise DropItem('It exists!')
def file_path(self, request, response=None, info=None):
    def _warn():
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn(
            'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
            'please use file_path(request, response=None, info=None) instead',
            category=ScrapyDeprecationWarning, stacklevel=1)

    if not isinstance(request, Request):
        _warn()
        url = request
    else:
        url = request.url

    image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
    word = str(request.meta['word']).split('word=')[-1]
    try:
        # Python 2: urllib.unquote
        word = urllib.unquote(word).decode('utf-8')
    except Exception as ex:
        # Python 3: urllib.parse.unquote
        word = urllib.parse.unquote(word)
    print(word + "/%s.jpg" % image_guid)
    return word + '/%s.jpg' % (image_guid)
def thumb_path(self, request, thumb_id, response=None, info=None):
    ## start of deprecation warning block (can be removed in the future)
    def _warn():
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings
        warnings.warn('ImagesPipeline.thumb_key(url) method is deprecated, please use '
                      'thumb_path(request, thumb_id, response=None, info=None) instead',
                      category=ScrapyDeprecationWarning, stacklevel=1)

    # check if called from thumb_key with url as first argument
    if not isinstance(request, Request):
        _warn()
        url = request
    else:
        url = request.url

    # detect if thumb_key() method has been overridden
    if not hasattr(self.thumb_key, '_base'):
        _warn()
        return self.thumb_key(url, thumb_id)
    ## end of deprecation warning block

    thumb_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
    return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
def file_path(self, request, response=None, info=None):
    start_time = self._get_start_time(info.spider)
    start_time_str = start_time.strftime('%Y%m%d_%H%M%S')
    content_type = ''
    if response:
        # This is to cover the case when the url has . after the last /
        # and the text after the . is not a file extension but the
        # response is json
        content_type = to_native_str(response.headers['Content-Type'])
    url = request.url
    media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
    media_ext = os.path.splitext(url)[1]
    if hasattr(info.spider, 'ext'):
        media_ext = info.spider.ext
    elif not media_ext or ('json' in content_type and media_ext != '.json'):
        media_ext = '.json'
    # Put files in a directory named after the scraper they came from,
    # and the scraper start time
    if hasattr(info.spider, 'sample') and info.spider.sample == 'true':
        return '%s_sample/%s/%s%s' % (info.spider.name, start_time_str,
                                      media_guid, media_ext)
    else:
        return '%s/%s/%s%s' % (info.spider.name, start_time_str,
                               media_guid, media_ext)