def __init__(self, url=None, callback=None, method=None, formdata=None,
             body=None, **kwargs):
    # First init FormRequest to get url, body and method
    if formdata:
        FormRequest.__init__(
            self, url=url, method=method, formdata=formdata)
        url, method, body = self.url, self.method, self.body
    # Then pass all other kwargs to SplashRequest
    SplashRequest.__init__(
        self, url=url, callback=callback, method=method, body=body,
        **kwargs)
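# The __init__ above follows the SplashFormRequest pattern from scrapy-splash:
# a request class inheriting from both SplashRequest and FormRequest, so a form
# submission can be rendered through Splash. A minimal usage sketch, assuming
# the method belongs to such a subclass; the login URL and form fields below
# are illustrative, not taken from the snippet:
from scrapy_splash import SplashFormRequest


def start_requests(self):
    yield SplashFormRequest(
        'http://example.com/login',               # assumed URL
        formdata={'user': 'admin', 'pass': 'x'},  # assumed form fields
        args={'wait': 0.5},
        callback=self.parse,
    )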
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url, args={'images': 0, 'timeout': 3})
def start_requests(self):
    yield SplashRequest(
        self.start_urls[0],
        callback=self.parse_splash,
        args={'wait': 10},          # maximum wait time, in seconds
        endpoint='render.html')     # fixed endpoint of the Splash service
def test_splash_request_url_with_fragment():
    mw = _get_mw()
    url = "http://example.com#id1"
    req = SplashRequest(url)
    req = mw.process_request(req, None)
    assert json.loads(to_native_str(req.body)) == {'url': url}
def test_magic_response():
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    req = SplashRequest('http://example.com/',
                        endpoint='execute',
                        args={'lua_source': 'function main() end'},
                        magic_response=True,
                        cookies=[{'name': 'foo', 'value': 'bar'}])
    req = cookie_mw.process_request(req, None) or req
    req = mw.process_request(req, None) or req

    resp_data = {
        'url': "http://exmaple.com/#id42",
        'html': '<html><body>Hello 404</body></html>',
        'http_status': 404,
        'headers': [
            {'name': 'Content-Type', 'value': "text/html"},
            {'name': 'X-My-Header', 'value': "foo"},
            {'name': 'Set-Cookie', 'value': "bar=baz"},
        ],
        'cookies': [
            {'name': 'foo', 'value': 'bar'},
            {'name': 'bar', 'value': 'baz', 'domain': '.example.com'},
            {'name': 'session', 'value': '12345', 'path': '/',
             'expires': '2055-07-24T19:20:30Z'},
        ],
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    resp2 = cookie_mw.process_response(req, resp2, None)
    assert isinstance(resp2, scrapy_splash.SplashJsonResponse)
    assert resp2.data == resp_data
    assert resp2.body == b'<html><body>Hello 404</body></html>'
    assert resp2.text == '<html><body>Hello 404</body></html>'
    assert resp2.headers == {
        b'Content-Type': [b'text/html'],
        b'X-My-Header': [b'foo'],
        b'Set-Cookie': [b'bar=baz'],
    }
    assert resp2.status == 404
    assert resp2.url == "http://exmaple.com/#id42"
    assert len(resp2.cookiejar) == 3
    cookies = [c for c in resp2.cookiejar]
    assert {(c.name, c.value) for c in cookies} == {
        ('bar', 'baz'),
        ('foo', 'bar'),
        ('session', '12345'),
    }

    # send second request using the same session and check the resulting cookies
    req = SplashRequest('http://example.com/foo',
                        endpoint='execute',
                        args={'lua_source': 'function main() end'},
                        magic_response=True,
                        cookies={'spam': 'ham'})
    req = cookie_mw.process_request(req, None) or req
    req = mw.process_request(req, None) or req

    resp_data = {
        'html': '<html><body>Hello</body></html>',
        'headers': [
            {'name': 'Content-Type', 'value': "text/html"},
            {'name': 'X-My-Header', 'value': "foo"},
            {'name': 'Set-Cookie', 'value': "bar=baz"},
        ],
        'cookies': [
            {'name': 'spam', 'value': 'ham'},
            {'name': 'egg', 'value': 'spam'},
            {'name': 'bar', 'value': 'baz', 'domain': '.example.com'},
            # {'name': 'foo', 'value': ''},  -- this won't be in response
            {'name': 'session', 'value': '12345', 'path': '/',
             'expires': '2056-07-24T19:20:30Z'},
        ],
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    resp2 = cookie_mw.process_response(req, resp2, None)
    assert isinstance(resp2, scrapy_splash.SplashJsonResponse)
    assert resp2.data == resp_data
    cookies = [c for c in resp2.cookiejar]
    assert {c.name for c in cookies} == {'session', 'egg', 'bar', 'spam'}
    for c in cookies:
        if c.name == 'session':
            assert c.expires == 2731692030
        if c.name == 'spam':
            assert c.value == 'ham'
def start_requests(self):
    yield SplashRequest(url='https://quotes.toscrape.com/login',
                        endpoint='execute',
                        args={'lua_source': self.script},
                        callback=self.parse)
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url, self.parse, args={'wait': 0.5})
def start_requests(self):
    page = "https://www.duquesnelight.com/outages-safety/current-outages"
    yield SplashRequest(url=page, callback=self.parse,
                        endpoint='render.html', args={'wait': 2})
def parse(self, response):
    url_selectors = response.css("div.event-body.clearfix div.left h3 a::attr(href)")
    for url in url_selectors.extract()[:10]:
        yield SplashRequest(url.encode('utf-8'), callback=self.parse_item,
                            endpoint='execute', args={'lua_source': script2})
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url, self.parse, endpoint='execute',
                            args={'lua_source': script})
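# Several snippets here pass a lua_source argument (script, self.script,
# homepage_script) that is defined elsewhere and not shown. A minimal sketch of
# what such a script for Splash's /execute endpoint typically looks like: load
# the page, wait for JavaScript to settle, and return the rendered HTML. The
# variable name matches the snippet above; the script body itself is an
# assumption, not the spider's actual script.
script = """
function main(splash, args)
    assert(splash:go(args.url))
    assert(splash:wait(0.5))
    return {
        html = splash:html(),
        url = splash:url(),
    }
end
"""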
def start_requests(self):
    parameters = urlencode({"show": 100, "qs": getattr(self, "query", "")})
    yield SplashRequest(
        f"https://www.sciencedirect.com/search?{parameters}"
    )
def process_links(self, response):
    links = response.xpath('//div[@class="rate"]/a/@href').extract()
    for link in links:
        linkurl = urljoin('https://www.vrbo.com', link)
        yield SplashRequest(linkurl, self.parse_listings)
def parse(self, response):
    post_links = response.css(
        "a.rpl-search-results__item::attr(data-print-url)"
    ).getall()
    for url in post_links:
        yield SplashRequest(url, self.parse_item, args={"wait": 0.5})
def parse(self, response):
    car_urls = response.xpath('//*[@class="title"]/a/@href').extract()
    for car_url in car_urls:
        absolute_car_url = response.urljoin(car_url + '/')
        yield Request(absolute_car_url, callback=self.parse_car)

    script_at_page_1 = """function main(splash)
        assert(splash:go(splash.args.url))
        splash:wait(5)

        local get_element_dim_by_xpath = splash:jsfunc([[
            function(xpath) {
                var element = document.evaluate(xpath, document, null,
                    XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                var element_rect = element.getClientRects()[0];
                return {"x": element_rect.left, "y": element_rect.top}
            }
        ]])

        -- Find the YEAR drop down
        local year_drop_dimensions = get_element_dim_by_xpath(
            '//h2[contains(@class, "label ") and contains(text(), "Year ")]')
        splash:set_viewport_full()
        splash:mouse_click(year_drop_dimensions.x, year_drop_dimensions.y)
        splash:wait(1.5)

        -- Clicks the 202X year
        local year_dimensions = get_element_dim_by_xpath(
            '//li[contains(@data-value, "2020")]/span')
        splash:set_viewport_full()
        splash:mouse_click(year_dimensions.x, year_dimensions.y)
        splash:wait(5)

        -- Find the MAKE drop down
        local make_drop_dimensions = get_element_dim_by_xpath(
            '//h2[contains(@class, "label ") and contains(text(), "Make ")]')
        splash:set_viewport_full()
        splash:mouse_click(make_drop_dimensions.x, make_drop_dimensions.y)
        splash:wait(1.5)

        -- Clicks the Toyota make
        local make_dimensions = get_element_dim_by_xpath(
            '//li[contains(@data-filters, "make_toyota")]/span')
        splash:set_viewport_full()
        splash:mouse_click(make_dimensions.x, make_dimensions.y)
        splash:wait(5)

        next_button = splash:select("*[class='page-next ']")
        next_button.mouse_click()
        splash:wait(4)

        return {
            url = splash:url(),
            html = splash:html()
        }
    end"""

    script_at_page_2 = """function main(splash)
        assert(splash:go(splash.args.url))
        splash:wait(5)

        next_button = splash:select("*[class='page-next ']")
        next_button.mouse_click()
        splash:wait(4)

        return {
            url = splash:url(),
            html = splash:html()
        }
    end"""

    # Compare by value, not identity, to pick the right script for this page
    script = None
    if response.url != self.start_urls[0]:
        script = script_at_page_2
    else:
        script = script_at_page_1

    yield SplashRequest(url=response.url, callback=self.parse,
                        endpoint='execute', args={'lua_source': script})
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url, callback=self.homepage_parse,
                            endpoint='execute',
                            args={'lua_source': homepage_script, 'timeout': 90})
def parse_pages(self, response):
    pages = response.xpath(
        '//*[@id="_sectionLayoutContainer_ctl01_dgResult2"]/tbody/tr[1]/td/table/tbody/tr/td/a/@href'
    ).extract()
    pages = list(set(pages))
    post_action = response.xpath('//*[@id="_f"]/@action').extract_first()
    url_parsed = urlparse(post_action)
    url_qs = parse_qs(url_parsed.query)
    period = int(url_qs['CisObdobia'][0])
    current_page = response.xpath(
        '//*[@id="_sectionLayoutContainer_ctl01_dgResult2"]/tbody/tr[1]/td/table/tbody/tr/td/span/text()'
    ).extract_first()
    if not current_page:
        current_page = '1'
    initial_string = '{}_{}'.format(period, current_page)
    if initial_string not in self.crawled_pages:
        self.crawled_pages[initial_string] = True

    cleaned_pages = []
    for page in pages:
        page_match = re.match(r'.*(Page.*[0-9]).*', page)
        if not page_match:
            continue
        page_num = page_match.groups()[0].split('$')[-1]
        crawled_string = '{}_{}'.format(period, page_num)
        if crawled_string in self.crawled_pages:
            continue
        cleaned_pages.append(page_match.groups()[0])

    for page in cleaned_pages:
        eventargument = page
        page_num = eventargument.split('$')[-1]
        crawled_string = '{}_{}'.format(period, page_num)
        viewstate = response.css(
            'input#__VIEWSTATE::attr(value)').extract_first()
        eventvalidation = response.css(
            'input#__EVENTVALIDATION::attr(value)').extract_first()
        viewstategenerator = response.css(
            'input#__VIEWSTATEGENERATOR::attr(value)').extract_first()
        scroll_x = response.css(
            'input#__SCROLLPOSITIONX::attr(value)').extract_first() or '0'
        scroll_y = response.css(
            'input#__SCROLLPOSITIONY::attr(value)').extract_first() or '0'
        eventtarget = '_sectionLayoutContainer$ctl01$dgResult2'
        body = {
            '__EVENTTARGET': eventtarget,
            '__EVENTARGUMENT': eventargument,
            '__VIEWSTATE': viewstate,
            '__VIEWSTATEGENERATOR': viewstategenerator,
            '__EVENTVALIDATION': eventvalidation,
            '__SCROLLPOSITIONX': scroll_x,
            '__SCROLLPOSITIONY': scroll_y,
            '_searchText': '',
            '_sectionLayoutContainer$ctl01$ctlNazov': '',
            '_sectionLayoutContainer$ctl01$ctlCisObdobia': str(period),
            '_sectionLayoutContainer$ctl01$ctlCPT': '',
            '_sectionLayoutContainer$ctl01$ctlTypTlace': '-1',
            '_sectionLayoutContainer$ctl01$DatumOd': self.date_from,
            '_sectionLayoutContainer$ctl01$DatumDo': '',
            '_sectionLayoutContainer$ctl01$Type': 'optSearchType',
            '_sectionLayoutContainer$ctl01$ctl00$txtDescriptorText': '',
            '_sectionLayoutContainer$ctl01$ctl00$cboLanguages': '3',
            '_sectionLayoutContainer$ctl00$_calendarYear': '2018',
            '_sectionLayoutContainer$ctl00$_calendarMonth': '9',
            '_sectionLayoutContainer$ctl00$_calendarApp': 'nrcpt_all',
            '_sectionLayoutContainer$ctl00$_calendarLang': '',
            '_sectionLayoutContainer$ctl00$_monthSelector': '9',
            '_sectionLayoutContainer$ctl00$_yearSelector': '2018'
        }
        if crawled_string not in self.crawled_pages:
            self.crawled_pages[crawled_string] = True
            yield SplashRequest('{}{}'.format(self.BASE_URL, post_action),
                                self.parse_pages,
                                args={
                                    'http_method': 'POST',
                                    'body': urlencode(body),
                                },
                                meta={'page': True})

    items = response.xpath(
        '//*[@id="_sectionLayoutContainer_ctl01_dgResult2"]/tbody/tr[position()>1]'
    )
    for item in items:
        press_url = item.xpath('td[1]/a/@href').extract_first()
        if press_url:
            url = '{}{}'.format(self.BASE_URL, press_url)
            yield scrapy.Request(url, callback=self.parse_press,
                                 meta={'period_num': period})
def start_requests(self):
    yield SplashRequest(
        url='http://quotes.toscrape.com/js',
        callback=self.parse,
    )
def parse(self, response): for li in response.xpath('//*[@id="houseList"]/li'): if "clearfix zry" not in li.xpath('@class').extract(): house_item = HouseItem() try: house_item['time_unit'] = re.findall(".*\((.+)\).*", li.xpath('div[3]/p[1]/span/text()').extract()[0])[ 0] except IndexError: house_item['time_unit'] = '' try: house_item['rentType'] = li.xpath('div[2]/div/p[1]/span[4]/text()').extract()[0] except IndexError: house_item['rentType'] = '' try: house_item['floorLoc'] = \ re.findall("^(\d+)/.*", li.xpath('div[2]/div/p[1]/span[2]/text()').extract()[0])[0] except IndexError: house_item['floorLoc'] = '' try: house_item['floorTotal'] = \ re.findall(".*/(\d+).*$", li.xpath('div[2]/div/p[1]/span[2]/text()').extract()[0])[0] except IndexError: house_item['floorTotal'] = '' try: for span in li.xpath('div[2]/p/span[2]/span[@class="subway"]'): if span.xpath('text()').extract()[0].find('暖') == 1 or span.xpath('text()').extract()[0].find( '空调') == 1: house_item['heatingType'] = span.xpath('text()').extract()[0] break except IndexError: house_item['heatingType'] = '' try: house_item['nearestSubWayDist'] = \ re.findall(".*?(\d+)米.*", li.xpath('div[2]/div/p[2]/span/text()').extract()[0])[0] except IndexError: house_item['nearestSubWayDist'] = '' try: house_item['confStatus'] = '0' if li.xpath('div[1]/a/img/@src').extract()[0].find( 'defaultPZZ') >= 0 else '1' except IndexError: house_item['confStatus'] = '' detail_page_link = li.xpath('div[2]/h3/a/@href').extract()[0] if detail_page_link: detail_page_link = detail_page_link if detail_page_link.find( 'www') >= 0 else 'http://' + detail_page_link detail_page_link = detail_page_link if detail_page_link.find( 'http') >= 0 else 'http:' + detail_page_link request = scrapy.Request(detail_page_link, callback=self.parse_detail_item, headers=config.ZIRU_REQUEST_HEADERS) request.meta['house_item'] = house_item yield request # 请求下一页数据 if "next" in response.xpath('//*[@id="page"]/a/@class').extract(): current_page_link = response.url if re.match('.*/z/nl/z[1|2|6]-r\d-.+?\.html\?p=\d+$', current_page_link): current_page_p = int(re.findall(".*\?p=(\d+).*", current_page_link)[0]) + 1 current_page_prefix = re.findall("^(.+\?p=).*", current_page_link)[0] next_page_link = current_page_prefix + str(current_page_p) yield SplashRequest(next_page_link, self.parse, args={'wait': 0.5}, headers=config.ZIRU_REQUEST_HEADERS)
def start_requests(self):
    yield SplashRequest(self.index_url, self.parse,
                        args={'wait': 1}, meta={"pn": 1})
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url=url, callback=self.parse, args={'wait': 4})
def splash_request(self, request):
    return SplashRequest(url=request.url, callback=self.parse_details,
                         args={'wait': 2})
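# Hedged sketch of how a helper like splash_request() above is commonly wired
# into a CrawlSpider rule so every followed link is rendered through Splash.
# The LinkExtractor settings are illustrative; the one-argument process_request
# signature matches Scrapy versions before 2.0.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

rules = (
    Rule(LinkExtractor(), process_request='splash_request', follow=True),
)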
def test_splash_request_parameters():
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    def cb():
        pass

    req = SplashRequest("http://example.com/#!start", cb, 'POST',
                        body="foo=bar",
                        splash_url="http://mysplash.example.com",
                        slot_policy=SlotPolicy.SINGLE_SLOT,
                        endpoint="execute",
                        splash_headers={'X-My-Header': 'value'},
                        args={
                            "lua_source": "function main() end",
                            "myarg": 3.0,
                        },
                        magic_response=False,
                        headers={'X-My-Header': 'value'})
    req2 = cookie_mw.process_request(req, None) or req
    req2 = mw.process_request(req2, None)
    assert req2.meta['splash'] == {
        'endpoint': 'execute',
        'splash_url': "http://mysplash.example.com",
        'slot_policy': SlotPolicy.SINGLE_SLOT,
        'splash_headers': {'X-My-Header': 'value'},
        'magic_response': False,
        'session_id': 'default',
        'http_status_from_error_code': True,
        'args': {
            'url': "http://example.com/#!start",
            'http_method': 'POST',
            'body': 'foo=bar',
            'cookies': [],
            'lua_source': 'function main() end',
            'myarg': 3.0,
            'headers': {
                'X-My-Header': 'value',
            }
        },
    }
    assert req2.callback == cb
    assert req2.headers == {
        b'Content-Type': [b'application/json'],
        b'X-My-Header': [b'value'],
    }

    # check response post-processing
    res = {
        'html': '<html><body>Hello</body></html>',
        'num_divs': 0.0,
    }
    res_body = json.dumps(res)
    response = TextResponse("http://mysplash.example.com/execute",
                            # Scrapy doesn't pass request to constructor
                            # request=req2,
                            headers={b'Content-Type': b'application/json'},
                            body=res_body.encode('utf8'))
    response2 = mw.process_response(req2, response, None)
    response2 = cookie_mw.process_response(req2, response2, None)
    assert isinstance(response2, scrapy_splash.SplashJsonResponse)
    assert response2 is not response
    assert response2.real_url == req2.url
    assert response2.url == req.meta['splash']['args']['url']
    assert response2.data == res
    assert response2.body == res_body.encode('utf8')
    assert response2.text == response2.body_as_unicode() == res_body
    assert response2.encoding == 'utf8'
    assert response2.headers == {b'Content-Type': [b'application/json']}
    assert response2.status == 200
def start_requests(self):
    # Process AngularJS page with Network West Midlands disruptions data using Splash
    base_url = 'https://www.networkwestmidlands.com/plan-your-journey/disruptions/#/params?DisruptionType=&when=All&TransportModeA=5&TransportModeB=0&TransportModeC=4'
    yield SplashRequest(url=base_url, callback=self.parse, args={'wait': 1.0})
def test_cache_args():
    spider = scrapy.Spider(name='foo')
    mw = _get_mw()
    mw.crawler.spider = spider
    mw.spider_opened(spider)
    dedupe_mw = SplashDeduplicateArgsMiddleware()

    # ========= Send first request - it should use save_args:
    lua_source = 'function main(splash) end'
    req = SplashRequest('http://example.com/foo',
                        endpoint='execute',
                        args={'lua_source': lua_source},
                        cache_args=['lua_source'])
    assert req.meta['splash']['args']['lua_source'] == lua_source

    # <---- spider
    req, = list(dedupe_mw.process_start_requests([req], spider))
    # ----> scheduler
    assert req.meta['splash']['args']['lua_source'] != lua_source
    assert list(mw._argument_values.values()) == [lua_source]
    assert list(mw._argument_values.keys()) == [req.meta['splash']['args']['lua_source']]

    # <---- scheduler
    # process request before sending it to the downloader
    req = mw.process_request(req, spider) or req
    # -----> downloader
    assert req.meta['splash']['args']['lua_source'] == lua_source
    assert req.meta['splash']['args']['save_args'] == ['lua_source']
    assert 'load_args' not in req.meta['splash']['args']
    assert req.meta['splash']['_local_arg_fingerprints'] == {
        'lua_source': list(mw._argument_values.keys())[0]
    }

    # <---- downloader
    resp_body = b'{}'
    resp = TextResponse("http://example.com",
                        headers={
                            b'Content-Type': b'application/json',
                            b'X-Splash-Saved-Arguments': b'lua_source=ba001160ef96fe2a3f938fea9e6762e204a562b3'
                        },
                        body=resp_body)
    resp = mw.process_response(req, resp, None)

    # ============ Send second request - it should use load_args
    req2 = SplashRequest('http://example.com/bar',
                         endpoint='execute',
                         args={'lua_source': lua_source},
                         cache_args=['lua_source'])
    req2, item = list(dedupe_mw.process_spider_output(resp, [req2, {'key': 'value'}], spider))
    assert item == {'key': 'value'}

    # ----> scheduler
    assert req2.meta['splash']['args']['lua_source'] != lua_source

    # <---- scheduler
    # process request before sending it to the downloader
    req2 = mw.process_request(req2, spider) or req2
    # -----> downloader
    assert req2.meta['splash']['args']['load_args'] == {"lua_source": "ba001160ef96fe2a3f938fea9e6762e204a562b3"}
    assert "lua_source" not in req2.meta['splash']['args']
    assert "save_args" not in req2.meta['splash']['args']
    assert json.loads(req2.body.decode('utf8')) == {
        'load_args': {'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'},
        'url': 'http://example.com/bar'
    }

    # <---- downloader
    resp = TextResponse("http://example.com/bar",
                        headers={b'Content-Type': b'application/json'},
                        body=b'{}')
    resp = mw.process_response(req, resp, spider)

    # =========== Third request is dispatched to another server where
    # =========== arguments are expired:
    req3 = SplashRequest('http://example.com/baz',
                         endpoint='execute',
                         args={'lua_source': lua_source},
                         cache_args=['lua_source'])
    req3, = list(dedupe_mw.process_spider_output(resp, [req3], spider))

    # ----> scheduler
    assert req3.meta['splash']['args']['lua_source'] != lua_source

    # <---- scheduler
    req3 = mw.process_request(req3, spider) or req3
    # -----> downloader
    assert json.loads(req3.body.decode('utf8')) == {
        'load_args': {'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'},
        'url': 'http://example.com/baz'
    }

    # <---- downloader
    resp_body = json.dumps({
        "type": "ExpiredArguments",
        "description": "Arguments stored with ``save_args`` are expired",
        "info": {"expired": ["html"]},
        "error": 498
    })
    resp = TextResponse("127.0.0.1:8050",
                        headers={b'Content-Type': b'application/json'},
                        status=498,
                        body=resp_body.encode('utf8'))
    req4 = mw.process_response(req3, resp, spider)
    assert isinstance(req4, SplashRequest)

    # process this request again
    req4, = list(dedupe_mw.process_spider_output(resp, [req4], spider))
    req4 = mw.process_request(req4, spider) or req4

    # it should become save_args request after all middlewares
    assert json.loads(req4.body.decode('utf8')) == {
        'lua_source': 'function main(splash) end',
        'save_args': ['lua_source'],
        'url': 'http://example.com/baz'
    }
    assert mw._remote_keys == {}
def modify_realtime_request(self, request):
    user_url_input = request.meta["url"]
    return SplashRequest(user_url_input, self.parse,
                         args={'lua_source': self.script},
                         endpoint='execute')
def start_requests(self): yield SplashRequest(url="https://www.livecoin.net/en", callback=self.parse, endpoint="execute", args={'lua_source': self.script})
def start_requests(self):
    yield SplashRequest(
        url='https://shop.vons.com/aisles/bread-bakery/bakery-bread.2118.html',
        callback=self.parse,
    )
def parse(self, response):
    # print(response.headers)
    # print(response.status)
    # print(response.meta)
    # print(response.data)

    # TODO: handle lua script error
    # {'type': 'ScriptError', 'info': {'error': "'}' expected (to close '{' at line 47) near 'error_retry'",
    #  'message': '[string "..."]:53: \'}\' expected (to close \'{\' at line 47) near \'error_retry\'',
    #  'type': 'LUA_INIT_ERROR', 'source': '[string "..."]', 'line_number': 53},
    #  'error': 400, 'description': 'Error happened while executing Lua script'}
    if response.status == 504:
        # no response
        # print('504 detected')
        pass

    # LUA ERROR
    # TODO: log errors
    elif 'error' in response.data:
        if response.data['error'] == 'network99':
            ## splash restart ##
            error_retry = response.meta.get('error_retry', 0)
            if error_retry < 3:
                error_retry += 1
                url = response.data['last_url']
                father = response.meta['father']

                self.logger.error(
                    'Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                time.sleep(10)
                if 'cookies' in response.data:
                    all_cookies = response.data['cookies']  # TODO: use initial cookie ?????
                else:
                    all_cookies = []
                l_cookies = self.build_request_arg(all_cookies)
                yield SplashRequest(url, self.parse,
                                    errback=self.errback_catcher,
                                    endpoint='execute',
                                    dont_filter=True,
                                    meta={
                                        'father': father,
                                        'current_url': url,
                                        'error_retry': error_retry
                                    },
                                    args=l_cookies)
            else:
                if self.requested_mode == 'test':
                    crawlers.save_test_ail_crawlers_result(
                        False, 'Connection to proxy refused')
                print('Connection to proxy refused')
        elif response.data['error'] == 'network3':
            if self.requested_mode == 'test':
                crawlers.save_test_ail_crawlers_result(
                    False,
                    'HostNotFoundError: the remote host name was not found (invalid hostname)')
            print('HostNotFoundError: the remote host name was not found (invalid hostname)')
        else:
            if self.requested_mode == 'test':
                crawlers.save_test_ail_crawlers_result(
                    False, response.data['error'])
            print(response.data['error'])

    elif response.status != 200:
        print('other response: {}'.format(response.status))
        # detect connection to proxy refused
        error_log = json.loads(response.body.decode())
        print(error_log)
    # elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
    #     pass  # ignore response
    else:
        ## TEST MODE ##
        if self.requested_mode == 'test':
            if 'It works!' in response.data['html']:
                crawlers.save_test_ail_crawlers_result(True, 'It works!')
            else:
                print('TEST ERROR')
                crawlers.save_test_ail_crawlers_result(False, 'TEST ERROR')
            return
        ## -- ##

        item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
        self.save_crawled_item(item_id, response.data['html'])
        crawlers.create_item_metadata(item_id, self.domains[0],
                                      response.data['last_url'],
                                      self.port, response.meta['father'])

        if self.root_key is None:
            self.root_key = item_id
            crawlers.add_domain_root_item(item_id, self.domain_type,
                                          self.domains[0],
                                          self.date_epoch, self.port)
            crawlers.create_domain_metadata(self.domain_type, self.domains[0],
                                            self.port, self.full_date,
                                            self.date_month)

        if 'cookies' in response.data:
            all_cookies = response.data['cookies']
        else:
            all_cookies = []

        # SCREENSHOT
        if 'png' in response.data and self.png:
            sha256_string = Screenshot.save_crawled_screeshot(
                response.data['png'], 5000000, f_save=self.requested_mode)
            if sha256_string:
                Screenshot.save_item_relationship(sha256_string, item_id)
                Screenshot.save_domain_relationship(sha256_string, self.domains[0])

        # HAR
        if 'har' in response.data and self.har:
            crawlers.save_har(self.har_dir, item_id, response.data['har'])

        le = LinkExtractor(allow_domains=self.domains, unique=True)
        for link in le.extract_links(response):
            l_cookies = self.build_request_arg(all_cookies)
            yield SplashRequest(link.url, self.parse,
                                errback=self.errback_catcher,
                                endpoint='execute',
                                meta={
                                    'father': item_id,
                                    'current_url': link.url
                                },
                                args=l_cookies)
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url, endpoint="render.html", callback=self.parse)
def start_requests(self):
    for url in self.urls:
        yield SplashRequest(url, callback=self.parse,
                            endpoint='render.html', args={'wait': '5'})
def start_requests(self):
    yield SplashRequest('http://quotes.toscrape.com/js')
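# All of the SplashRequest snippets above assume the scrapy-splash middlewares
# are enabled in the project's settings.py. A typical configuration sketch
# follows; the SPLASH_URL value is an assumption and should point at your own
# Splash instance.
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'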