def __init__(self, url=None, callback=None, method=None, formdata=None,
              body=None, **kwargs):
     # First init FormRequest to get url, body and method
     if formdata:
         FormRequest.__init__(
             self, url=url, method=method, formdata=formdata)
         url, method, body = self.url, self.method, self.body
     # Then pass all other kwargs to SplashRequest
     SplashRequest.__init__(
         self, url=url, callback=callback, method=method, body=body,
         **kwargs)
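Taken on its own, this __init__ reads like the body of a request class that mixes Scrapy's FormRequest with SplashRequest. A minimal, hedged sketch of how such a class could be declared and used (the class name SplashFormRequest and the login URL and form fields below are illustrative assumptions):

from scrapy import FormRequest
from scrapy_splash import SplashRequest


class SplashFormRequest(SplashRequest, FormRequest):
    # Body as in the __init__ above: FormRequest builds url/method/body from
    # formdata, then SplashRequest handles the Splash-specific keyword args.
    def __init__(self, url=None, callback=None, method=None, formdata=None,
                 body=None, **kwargs):
        if formdata:
            FormRequest.__init__(
                self, url=url, method=method, formdata=formdata)
            url, method, body = self.url, self.method, self.body
        SplashRequest.__init__(
            self, url=url, callback=callback, method=method, body=body,
            **kwargs)


# Usage sketch inside a spider (URL and field names are placeholders):
#     yield SplashFormRequest(
#         'http://quotes.toscrape.com/login',
#         formdata={'username': 'user', 'password': 'pass'},
#         callback=self.after_login,
#     )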
Example #2
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url, args={'images': 0, 'timeout': 3})
Example #3
 def start_requests(self):
     yield SplashRequest(
         self.start_urls[0],
         callback=self.parse_splash,
         args={'wait': 10},  # maximum timeout, in seconds
         endpoint='render.html')  # fixed parameter: the Splash render.html endpoint
def test_splash_request_url_with_fragment():
    mw = _get_mw()
    url = "http://example.com#id1"
    req = SplashRequest(url)
    req = mw.process_request(req, None)
    assert json.loads(to_native_str(req.body)) == {'url': url}
def test_magic_response():
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    req = SplashRequest('http://example.com/',
                        endpoint='execute',
                        args={'lua_source': 'function main() end'},
                        magic_response=True,
                        cookies=[{'name': 'foo', 'value': 'bar'}])
    req = cookie_mw.process_request(req, None) or req
    req = mw.process_request(req, None) or req

    resp_data = {
        'url': "http://example.com/#id42",
        'html': '<html><body>Hello 404</body></html>',
        'http_status': 404,
        'headers': [
            {'name': 'Content-Type', 'value': "text/html"},
            {'name': 'X-My-Header', 'value': "foo"},
            {'name': 'Set-Cookie', 'value': "bar=baz"},
        ],
        'cookies': [
            {'name': 'foo', 'value': 'bar'},
            {'name': 'bar', 'value': 'baz', 'domain': '.example.com'},
            {'name': 'session', 'value': '12345', 'path': '/',
             'expires': '2055-07-24T19:20:30Z'},
        ],
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    resp2 = cookie_mw.process_response(req, resp2, None)
    assert isinstance(resp2, scrapy_splash.SplashJsonResponse)
    assert resp2.data == resp_data
    assert resp2.body == b'<html><body>Hello 404</body></html>'
    assert resp2.text == '<html><body>Hello 404</body></html>'
    assert resp2.headers == {
        b'Content-Type': [b'text/html'],
        b'X-My-Header': [b'foo'],
        b'Set-Cookie': [b'bar=baz'],
    }
    assert resp2.status == 404
    assert resp2.url == "http://example.com/#id42"
    assert len(resp2.cookiejar) == 3
    cookies = [c for c in resp2.cookiejar]
    assert {(c.name, c.value) for c in cookies} == {
        ('bar', 'baz'),
        ('foo', 'bar'),
        ('session', '12345')
    }

    # send second request using the same session and check the resulting cookies
    req = SplashRequest('http://example.com/foo',
                        endpoint='execute',
                        args={'lua_source': 'function main() end'},
                        magic_response=True,
                        cookies={'spam': 'ham'})
    req = cookie_mw.process_request(req, None) or req
    req = mw.process_request(req, None) or req

    resp_data = {
        'html': '<html><body>Hello</body></html>',
        'headers': [
            {'name': 'Content-Type', 'value': "text/html"},
            {'name': 'X-My-Header', 'value': "foo"},
            {'name': 'Set-Cookie', 'value': "bar=baz"},
        ],
        'cookies': [
            {'name': 'spam', 'value': 'ham'},
            {'name': 'egg', 'value': 'spam'},
            {'name': 'bar', 'value': 'baz', 'domain': '.example.com'},
            # {'name': 'foo', 'value': ''},  # this one won't be in the response
            {'name': 'session', 'value': '12345', 'path': '/',
             'expires': '2056-07-24T19:20:30Z'},
        ],
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    resp2 = cookie_mw.process_response(req, resp2, None)
    assert isinstance(resp2, scrapy_splash.SplashJsonResponse)
    assert resp2.data == resp_data
    cookies = [c for c in resp2.cookiejar]
    assert {c.name for c in cookies} == {'session', 'egg', 'bar', 'spam'}
    for c in cookies:
        if c.name == 'session':
            assert c.expires == 2731692030
        if c.name == 'spam':
            assert c.value == 'ham'
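The resp_data dicts above stand in for what an 'execute' Lua script returns; with magic_response=True the url, headers, http_status and cookies fields from that JSON are mapped onto the Scrapy response, which is what the assertions check. A hedged spider-side sketch that would produce such a response (spider name, URL and script are illustrative assumptions, not part of the test suite):

import scrapy
from scrapy_splash import SplashRequest

# Lua script returning the fields that magic_response=True maps onto the
# response: url, headers, http_status, cookies and html.
LUA_MAGIC = """
function main(splash)
    assert(splash:go(splash.args.url))
    local entries = splash:history()
    local last_response = entries[#entries].response
    return {
        url = splash:url(),
        headers = last_response.headers,
        http_status = last_response.status,
        cookies = splash:get_cookies(),
        html = splash:html(),
    }
end
"""

class MagicResponseSpider(scrapy.Spider):
    name = 'magic_response_example'  # assumed name

    def start_requests(self):
        yield SplashRequest('http://example.com/', self.parse,
                            endpoint='execute', magic_response=True,
                            args={'lua_source': LUA_MAGIC})

    def parse(self, response):
        # response.status, response.headers and the session cookiejar now
        # reflect the values returned by the Lua script.
        self.logger.info('status=%s url=%s', response.status, response.url)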
Example #6
 def start_requests(self):
     yield SplashRequest(url='https://quotes.toscrape.com/login',
                         endpoint='execute',
                         args={'lua_source': self.script},
                         callback=self.parse)
Example #7
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url, self.parse, args={'wait': 0.5})
 def start_requests(self):
     page = "https://www.duquesnelight.com/outages-safety/current-outages"
     yield SplashRequest(url=page,
                         callback=self.parse,
                         endpoint='render.html',
                         args={'wait': 2})
Example #9
 def parse(self, response):
     url_selectors = response.css("div.event-body.clearfix div.left h3 a::attr(href)")
     for url in url_selectors.extract()[:10]:
         yield SplashRequest(url.encode('utf-8'), callback=self.parse_item, endpoint='execute', args={'lua_source': script2})
Example #10
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url, self.parse, endpoint='execute', args={'lua_source': script})
Example #11
 def start_requests(self):
     parameters = urlencode({"show": 100, "qs": getattr(self, "query", "")})
     yield SplashRequest(
         f"https://www.sciencedirect.com/search?{parameters}"
     )
Example #12
 def process_links(self, response):
     links = response.xpath('//div[@class="rate"]/a/@href').extract()
     for link in links:
         linkurl = urljoin('https://www.vrbo.com', link)
         yield SplashRequest(linkurl, self.parse_listings)
Example #13
 def parse(self, response):
     post_links = response.css(
         "a.rpl-search-results__item::attr(data-print-url)"
     ).getall()
     for url in post_links:
         yield SplashRequest(url, self.parse_item, args={"wait": 0.5})
Example #14
    def parse(self, response):
        car_urls = response.xpath('//*[@class="title"]/a/@href').extract()
        for car_url in car_urls:
            absolute_car_url = response.urljoin(car_url + '/')
            yield Request(absolute_car_url, callback=self.parse_car)

        script_at_page_1 = """function main(splash)
                                assert(splash:go(splash.args.url))
                                splash:wait(5)
                            
                                local get_element_dim_by_xpath = splash:jsfunc([[
                                    function(xpath) {
                                        var element = document.evaluate(xpath, document, null,
                                            XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                                        var element_rect = element.getClientRects()[0];
                                        return {"x": element_rect.left, "y": element_rect.top}
                                    }
                                ]])
                            
                                -- -- Find the YEAR drop down
                                local year_drop_dimensions = get_element_dim_by_xpath(
                                    '//h2[contains(@class, "label ") and contains(text(), "Year ")]')
                                splash:set_viewport_full()
                                splash:mouse_click(year_drop_dimensions.x, year_drop_dimensions.y)
                                splash:wait(1.5)
                            
                                -- -- Clicks the 202X year
                                local year_dimensions = get_element_dim_by_xpath(
                                    '//li[contains(@data-value, "2020")]/span')
                                splash:set_viewport_full()
                                splash:mouse_click(year_dimensions.x, year_dimensions.y)
                                splash:wait(5)
                            
                                -- Find the MAKE drop down
                                local make_drop_dimensions = get_element_dim_by_xpath(
                                    '//h2[contains(@class, "label ") and contains(text(), "Make ")]')
                                splash:set_viewport_full()
                                splash:mouse_click(make_drop_dimensions.x, make_drop_dimensions.y)
                                splash:wait(1.5)
                            
                                -- Clicks the Toyota make
                                local make_dimensions = get_element_dim_by_xpath(
                                    '//li[contains(@data-filters, "make_toyota")]/span')
                                splash:set_viewport_full()
                                splash:mouse_click(make_dimensions.x, make_dimensions.y)
                                splash:wait(5)
                            
                                local next_button = splash:select("*[class='page-next ']")
                                next_button:mouse_click()
                                splash:wait(4)
                                return {
                                    url = splash:url(),
                                    html = splash:html()
                                }
                            end"""

        script_at_page_2 = """function main(splash)
                                assert(splash:go(splash.args.url))
                                splash:wait(5)
                            
                                local next_button = splash:select("*[class='page-next ']")
                                next_button:mouse_click()
                                splash:wait(4)
                                return {
                                    url = splash:url(),
                                    html = splash:html()
                                }
                            end"""

        script = None
        if response.url != self.start_urls[0]:
            script = script_at_page_2
        else:
            script = script_at_page_1

        yield SplashRequest(url=response.url,
                            callback=self.parse,
                            endpoint='execute',
                            args={'lua_source': script})
Example #15
    def start_requests(self):

        for url in self.start_urls:
            yield SplashRequest(url, callback=self.homepage_parse, endpoint='execute',
                                args={'lua_source': homepage_script, 'timeout': 90})
Example #16
    def parse_pages(self, response):
        pages = response.xpath(
            '//*[@id="_sectionLayoutContainer_ctl01_dgResult2"]/tbody/tr[1]/td/table/tbody/tr/td/a/@href'
        ).extract()
        pages = list(set(pages))
        post_action = response.xpath('//*[@id="_f"]/@action').extract_first()
        url_parsed = urlparse(post_action)
        url_qs = parse_qs(url_parsed.query)
        period = int(url_qs['CisObdobia'][0])
        current_page = response.xpath(
            '//*[@id="_sectionLayoutContainer_ctl01_dgResult2"]/tbody/tr[1]/td/table/tbody/tr/td/span/text()'
        ).extract_first()

        if not current_page:
            current_page = '1'

        initial_string = '{}_{}'.format(period, current_page)
        if initial_string not in self.crawled_pages:
            self.crawled_pages[initial_string] = True

        cleaned_pages = []
        for page in pages:
            page_match = re.match(r'.*(Page.*[0-9]).*', page)
            if not page_match:
                continue
            page_num = page_match.groups()[0].split('$')[-1]
            crawled_string = '{}_{}'.format(period, page_num)
            if crawled_string in self.crawled_pages:
                continue
            cleaned_pages.append(page_match.groups()[0])

        for page in cleaned_pages:
            eventargument = page
            page_num = eventargument.split('$')[-1]
            crawled_string = '{}_{}'.format(period, page_num)

            viewstate = response.css(
                'input#__VIEWSTATE::attr(value)').extract_first()
            eventvalidation = response.css(
                'input#__EVENTVALIDATION::attr(value)').extract_first()
            viewstategenerator = response.css(
                'input#__VIEWSTATEGENERATOR::attr(value)').extract_first()
            scroll_x = response.css(
                'input#__SCROLLPOSITIONX::attr(value)').extract_first() or '0'
            scroll_y = response.css(
                'input#__SCROLLPOSITIONY::attr(value)').extract_first() or '0'
            eventtarget = '_sectionLayoutContainer$ctl01$dgResult2'
            body = {
                '__EVENTTARGET': eventtarget,
                '__EVENTARGUMENT': eventargument,
                '__VIEWSTATE': viewstate,
                '__VIEWSTATEGENERATOR': viewstategenerator,
                '__EVENTVALIDATION': eventvalidation,
                '__SCROLLPOSITIONX': scroll_x,
                '__SCROLLPOSITIONY': scroll_y,
                '_searchText': '',
                '_sectionLayoutContainer$ctl01$ctlNazov': '',
                '_sectionLayoutContainer$ctl01$ctlCisObdobia': str(period),
                '_sectionLayoutContainer$ctl01$ctlCPT': '',
                '_sectionLayoutContainer$ctl01$ctlTypTlace': '-1',
                '_sectionLayoutContainer$ctl01$DatumOd': self.date_from,
                '_sectionLayoutContainer$ctl01$DatumDo': '',
                '_sectionLayoutContainer$ctl01$Type': 'optSearchType',
                '_sectionLayoutContainer$ctl01$ctl00$txtDescriptorText': '',
                '_sectionLayoutContainer$ctl01$ctl00$cboLanguages': '3',
                '_sectionLayoutContainer$ctl00$_calendarYear': '2018',
                '_sectionLayoutContainer$ctl00$_calendarMonth': '9',
                '_sectionLayoutContainer$ctl00$_calendarApp': 'nrcpt_all',
                '_sectionLayoutContainer$ctl00$_calendarLang': '',
                '_sectionLayoutContainer$ctl00$_monthSelector': '9',
                '_sectionLayoutContainer$ctl00$_yearSelector': '2018'
            }
            if crawled_string not in self.crawled_pages:
                self.crawled_pages[crawled_string] = True
            yield SplashRequest('{}{}'.format(self.BASE_URL, post_action),
                                self.parse_pages,
                                args={
                                    'http_method': 'POST',
                                    'body': urlencode(body),
                                },
                                meta={'page': True})

        items = response.xpath(
            '//*[@id="_sectionLayoutContainer_ctl01_dgResult2"]/tbody/tr[position()>1]'
        )
        for item in items:
            press_url = item.xpath('td[1]/a/@href').extract_first()
            if press_url:
                url = '{}{}'.format(self.BASE_URL, press_url)
                yield scrapy.Request(url,
                                     callback=self.parse_press,
                                     meta={'period_num': period})
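The pagination requests above let Splash perform the ASP.NET postback itself: the form fields are urlencoded and passed through the http_method and body arguments. A stripped-down, hedged sketch of that pattern as a helper method (the method name is an assumption; BASE_URL and parse_pages come from the spider above):

from urllib.parse import urlencode
from scrapy_splash import SplashRequest

def request_postback(self, post_action, form_fields):
    # Splash render endpoints accept 'http_method' and 'body' args, so the
    # rendered page is the result of the POST rather than a plain GET.
    return SplashRequest(
        '{}{}'.format(self.BASE_URL, post_action),
        self.parse_pages,
        args={'http_method': 'POST', 'body': urlencode(form_fields)},
        meta={'page': True},
    )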
Example #17
 def start_requests(self):
     yield SplashRequest(
         url='http://quotes.toscrape.com/js',
         callback=self.parse,
     )
Example #18
    def parse(self, response):
        for li in response.xpath('//*[@id="houseList"]/li'):
            if "clearfix zry" not in li.xpath('@class').extract():
                house_item = HouseItem()
                try:
                    house_item['time_unit'] = re.findall(
                        r".*\((.+)\).*", li.xpath('div[3]/p[1]/span/text()').extract()[0])[0]
                except IndexError:
                    house_item['time_unit'] = ''
                try:
                    house_item['rentType'] = li.xpath('div[2]/div/p[1]/span[4]/text()').extract()[0]
                except IndexError:
                    house_item['rentType'] = ''
                try:
                    house_item['floorLoc'] = re.findall(
                        r"^(\d+)/.*", li.xpath('div[2]/div/p[1]/span[2]/text()').extract()[0])[0]
                except IndexError:
                    house_item['floorLoc'] = ''
                try:
                    house_item['floorTotal'] = re.findall(
                        r".*/(\d+).*$", li.xpath('div[2]/div/p[1]/span[2]/text()').extract()[0])[0]
                except IndexError:
                    house_item['floorTotal'] = ''
                try:
                    for span in li.xpath('div[2]/p/span[2]/span[@class="subway"]'):
                        span_text = span.xpath('text()').extract()[0]
                        if '暖' in span_text or '空调' in span_text:
                            house_item['heatingType'] = span_text
                            break
                except IndexError:
                    house_item['heatingType'] = ''

                try:
                    house_item['nearestSubWayDist'] = re.findall(
                        r".*?(\d+)米.*", li.xpath('div[2]/div/p[2]/span/text()').extract()[0])[0]
                except IndexError:
                    house_item['nearestSubWayDist'] = ''
                try:
                    house_item['confStatus'] = '0' if li.xpath('div[1]/a/img/@src').extract()[0].find(
                        'defaultPZZ') >= 0 else '1'
                except IndexError:
                    house_item['confStatus'] = ''

                detail_page_link = li.xpath('div[2]/h3/a/@href').extract()[0]
                if detail_page_link:
                    detail_page_link = detail_page_link if detail_page_link.find(
                        'www') >= 0 else 'http://' + detail_page_link
                    detail_page_link = detail_page_link if detail_page_link.find(
                        'http') >= 0 else 'http:' + detail_page_link
                    request = scrapy.Request(detail_page_link, callback=self.parse_detail_item, headers=config.ZIRU_REQUEST_HEADERS)
                    request.meta['house_item'] = house_item
                    yield request

        # Request the next page of data
        if "next" in response.xpath('//*[@id="page"]/a/@class').extract():
            current_page_link = response.url
            if re.match(r'.*/z/nl/z[1|2|6]-r\d-.+?\.html\?p=\d+$', current_page_link):
                current_page_p = int(re.findall(r".*\?p=(\d+).*", current_page_link)[0]) + 1
                current_page_prefix = re.findall(r"^(.+\?p=).*", current_page_link)[0]
                next_page_link = current_page_prefix + str(current_page_p)
                yield SplashRequest(next_page_link, self.parse, args={'wait': 0.5}, headers=config.ZIRU_REQUEST_HEADERS)
Example #19
 def start_requests(self):
     yield SplashRequest(self.index_url,
                         self.parse,
                         args={'wait': 1},
                         meta={"pn": 1})
Example #20
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url=url, callback=self.parse, args={'wait': 4})
Example #21
def splash_request(self, request):
    return SplashRequest(url=request.url,
                         callback=self.parse_details,
                         args={'wait': 2})
def test_splash_request_parameters():
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    def cb():
        pass

    req = SplashRequest("http://example.com/#!start", cb, 'POST',
        body="foo=bar",
        splash_url="http://mysplash.example.com",
        slot_policy=SlotPolicy.SINGLE_SLOT,
        endpoint="execute",
        splash_headers={'X-My-Header': 'value'},
        args={
            "lua_source": "function main() end",
            "myarg": 3.0,
        },
        magic_response=False,
        headers={'X-My-Header': 'value'}
    )
    req2 = cookie_mw.process_request(req, None) or req
    req2 = mw.process_request(req2, None)
    assert req2.meta['splash'] == {
        'endpoint': 'execute',
        'splash_url': "http://mysplash.example.com",
        'slot_policy': SlotPolicy.SINGLE_SLOT,
        'splash_headers': {'X-My-Header': 'value'},
        'magic_response': False,
        'session_id': 'default',
        'http_status_from_error_code': True,
        'args': {
            'url': "http://example.com/#!start",
            'http_method': 'POST',
            'body': 'foo=bar',
            'cookies': [],
            'lua_source': 'function main() end',
            'myarg': 3.0,
            'headers': {
                'X-My-Header': 'value',
            }
        },
    }
    assert req2.callback == cb
    assert req2.headers == {
        b'Content-Type': [b'application/json'],
        b'X-My-Header': [b'value'],
    }

    # check response post-processing
    res = {
        'html': '<html><body>Hello</body></html>',
        'num_divs': 0.0,
    }
    res_body = json.dumps(res)
    response = TextResponse("http://mysplash.example.com/execute",
                            # Scrapy doesn't pass request to constructor
                            # request=req2,
                            headers={b'Content-Type': b'application/json'},
                            body=res_body.encode('utf8'))
    response2 = mw.process_response(req2, response, None)
    response2 = cookie_mw.process_response(req2, response2, None)
    assert isinstance(response2, scrapy_splash.SplashJsonResponse)
    assert response2 is not response
    assert response2.real_url == req2.url
    assert response2.url == req.meta['splash']['args']['url']
    assert response2.data == res
    assert response2.body == res_body.encode('utf8')
    assert response2.text == response2.body_as_unicode() == res_body
    assert response2.encoding == 'utf8'
    assert response2.headers == {b'Content-Type': [b'application/json']}
    assert response2.status == 200
 def start_requests(self):
     # Process AngularJS page with Network West Midlands disruptions data using Splash
     base_url = 'https://www.networkwestmidlands.com/plan-your-journey/disruptions/#/params?DisruptionType=&when=All&TransportModeA=5&TransportModeB=0&TransportModeC=4'
     yield SplashRequest(url=base_url, callback=self.parse, args={'wait': 1.0})
def test_cache_args():
    spider = scrapy.Spider(name='foo')
    mw = _get_mw()
    mw.crawler.spider = spider
    mw.spider_opened(spider)
    dedupe_mw = SplashDeduplicateArgsMiddleware()

    # ========= Send first request - it should use save_args:
    lua_source = 'function main(splash) end'
    req = SplashRequest('http://example.com/foo',
                        endpoint='execute',
                        args={'lua_source': lua_source},
                        cache_args=['lua_source'])

    assert req.meta['splash']['args']['lua_source'] == lua_source
    # <---- spider
    req, = list(dedupe_mw.process_start_requests([req], spider))
    # ----> scheduler
    assert req.meta['splash']['args']['lua_source'] != lua_source
    assert list(mw._argument_values.values()) == [lua_source]
    assert list(mw._argument_values.keys()) == [req.meta['splash']['args']['lua_source']]
    # <---- scheduler
    # process request before sending it to the downloader
    req = mw.process_request(req, spider) or req
    # -----> downloader
    assert req.meta['splash']['args']['lua_source'] == lua_source
    assert req.meta['splash']['args']['save_args'] == ['lua_source']
    assert 'load_args' not in req.meta['splash']['args']
    assert req.meta['splash']['_local_arg_fingerprints'] == {
        'lua_source': list(mw._argument_values.keys())[0]
    }
    # <---- downloader
    resp_body = b'{}'
    resp = TextResponse("http://example.com",
                        headers={
                            b'Content-Type': b'application/json',
                            b'X-Splash-Saved-Arguments': b'lua_source=ba001160ef96fe2a3f938fea9e6762e204a562b3'
                        },
                        body=resp_body)
    resp = mw.process_response(req, resp, None)

    # ============ Send second request - it should use load_args
    req2 = SplashRequest('http://example.com/bar',
                        endpoint='execute',
                        args={'lua_source': lua_source},
                        cache_args=['lua_source'])
    req2, item = list(dedupe_mw.process_spider_output(resp, [req2, {'key': 'value'}], spider))
    assert item == {'key': 'value'}
    # ----> scheduler
    assert req2.meta['splash']['args']['lua_source'] != lua_source
    # <---- scheduler
    # process request before sending it to the downloader
    req2 = mw.process_request(req2, spider) or req2
    # -----> downloader
    assert req2.meta['splash']['args']['load_args'] == {"lua_source": "ba001160ef96fe2a3f938fea9e6762e204a562b3"}
    assert "lua_source" not in req2.meta['splash']['args']
    assert "save_args" not in req2.meta['splash']['args']
    assert json.loads(req2.body.decode('utf8')) == {
        'load_args': {'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'},
        'url': 'http://example.com/bar'
    }
    # <---- downloader
    resp = TextResponse("http://example.com/bar",
                        headers={b'Content-Type': b'application/json'},
                        body=b'{}')
    resp = mw.process_response(req, resp, spider)

    # =========== Third request is dispatched to another server where
    # =========== arguments are expired:
    req3 = SplashRequest('http://example.com/baz',
                         endpoint='execute',
                         args={'lua_source': lua_source},
                         cache_args=['lua_source'])
    req3, = list(dedupe_mw.process_spider_output(resp, [req3], spider))
    # ----> scheduler
    assert req3.meta['splash']['args']['lua_source'] != lua_source
    # <---- scheduler
    req3 = mw.process_request(req3, spider) or req3
    # -----> downloader
    assert json.loads(req3.body.decode('utf8')) == {
        'load_args': {'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'},
        'url': 'http://example.com/baz'
    }
    # <---- downloader

    resp_body = json.dumps({
        "type": "ExpiredArguments",
        "description": "Arguments stored with ``save_args`` are expired",
        "info": {"expired": ["html"]},
        "error": 498
    })
    resp = TextResponse("127.0.0.1:8050",
                        headers={b'Content-Type': b'application/json'},
                        status=498,
                        body=resp_body.encode('utf8'))
    req4 = mw.process_response(req3, resp, spider)
    assert isinstance(req4, SplashRequest)

    # process this request again
    req4, = list(dedupe_mw.process_spider_output(resp, [req4], spider))
    req4 = mw.process_request(req4, spider) or req4

    # it should become save_args request after all middlewares
    assert json.loads(req4.body.decode('utf8')) == {
        'lua_source': 'function main(splash) end',
        'save_args': ['lua_source'],
        'url': 'http://example.com/baz'
    }
    assert mw._remote_keys == {}
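test_cache_args exercises the cache_args feature: the first request uploads lua_source via save_args, later requests reference it by hash via load_args, and an HTTP 498 ExpiredArguments response triggers a retry that re-sends the full value. A hedged sketch of how a spider opts into this (spider name, URL and script are illustrative assumptions):

import scrapy
from scrapy_splash import SplashRequest

LUA_SOURCE = """
function main(splash)
    assert(splash:go(splash.args.url))
    splash:wait(0.5)
    return {html = splash:html()}
end
"""

class CachedArgsSpider(scrapy.Spider):
    name = 'cached_args_example'  # assumed name
    start_urls = ['http://quotes.toscrape.com/js']

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(
                url, self.parse,
                endpoint='execute',
                args={'lua_source': LUA_SOURCE},
                cache_args=['lua_source'],  # send the script body only once
            )

    def parse(self, response):
        yield {'title': response.css('title::text').get()}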
Example #25
 def modify_realtime_request(self, request):
     user_url_input = request.meta["url"]
     return SplashRequest(user_url_input,
                          self.parse,
                          args={'lua_source': self.script},
                          endpoint='execute')
Example #26
 def start_requests(self):
     yield SplashRequest(url="https://www.livecoin.net/en",
                         callback=self.parse,
                         endpoint="execute",
                         args={'lua_source': self.script})
Example #27
 def start_requests(self):
     yield SplashRequest(
         url='https://shop.vons.com/aisles/bread-bakery/bakery-bread.2118.html',
         callback=self.parse,
     )
Example #28
        def parse(self, response):
            #print(response.headers)
            #print(response.status)
            #print(response.meta)
            #print(response.data) # # TODO: handle lua script error
            #{'type': 'ScriptError', 'info': {'error': "'}' expected (to close '{' at line 47) near 'error_retry'",
            #'message': '[string "..."]:53: \'}\' expected (to close \'{\' at line 47) near \'error_retry\'',
            #'type': 'LUA_INIT_ERROR', 'source': '[string "..."]', 'line_number': 53},
            #'error': 400, 'description': 'Error happened while executing Lua script'}
            if response.status == 504:
                # no response
                #print('504 detected')
                pass

            # LUA ERROR # # TODO: logs errors
            elif 'error' in response.data:
                if (response.data['error'] == 'network99'):
                    ## splash restart ##
                    error_retry = response.meta.get('error_retry', 0)
                    if error_retry < 3:
                        error_retry += 1
                        url = response.data['last_url']
                        father = response.meta['father']

                        self.logger.error(
                            'Splash, ResponseNeverReceived for %s, retry in 10s ...',
                            url)
                        time.sleep(10)
                        if 'cookies' in response.data:
                            # TODO: use the initial cookies?
                            all_cookies = response.data['cookies']
                        else:
                            all_cookies = []
                        l_cookies = self.build_request_arg(all_cookies)
                        yield SplashRequest(url,
                                            self.parse,
                                            errback=self.errback_catcher,
                                            endpoint='execute',
                                            dont_filter=True,
                                            meta={
                                                'father': father,
                                                'current_url': url,
                                                'error_retry': error_retry
                                            },
                                            args=l_cookies)
                    else:
                        if self.requested_mode == 'test':
                            crawlers.save_test_ail_crawlers_result(
                                False, 'Connection to proxy refused')
                        print('Connection to proxy refused')
                elif response.data['error'] == 'network3':
                    if self.requested_mode == 'test':
                        crawlers.save_test_ail_crawlers_result(
                            False,
                            'HostNotFoundError: the remote host name was not found (invalid hostname)'
                        )
                    print(
                        'HostNotFoundError: the remote host name was not found (invalid hostname)'
                    )
                else:
                    if self.requested_mode == 'test':
                        crawlers.save_test_ail_crawlers_result(
                            False, response.data['error'])
                    print(response.data['error'])

            elif response.status != 200:
                print('other response: {}'.format(response.status))
                # detect connection to proxy refused
                error_log = (json.loads(response.body.decode()))
                print(error_log)
            #elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
            #    pass # ignore response
            else:
                ## TEST MODE ##
                if self.requested_mode == 'test':
                    if 'It works!' in response.data['html']:
                        crawlers.save_test_ail_crawlers_result(
                            True, 'It works!')
                    else:
                        print('TEST ERROR')
                        crawlers.save_test_ail_crawlers_result(
                            False, 'TEST ERROR')
                    return
                ## -- ##

                item_id = crawlers.create_item_id(self.item_dir,
                                                  self.domains[0])
                self.save_crawled_item(item_id, response.data['html'])
                crawlers.create_item_metadata(item_id, self.domains[0],
                                              response.data['last_url'],
                                              self.port,
                                              response.meta['father'])

                if self.root_key is None:
                    self.root_key = item_id
                    crawlers.add_domain_root_item(item_id, self.domain_type,
                                                  self.domains[0],
                                                  self.date_epoch, self.port)
                    crawlers.create_domain_metadata(self.domain_type,
                                                    self.domains[0], self.port,
                                                    self.full_date,
                                                    self.date_month)

                if 'cookies' in response.data:
                    all_cookies = response.data['cookies']
                else:
                    all_cookies = []

                # SCREENSHOT
                if 'png' in response.data and self.png:
                    sha256_string = Screenshot.save_crawled_screeshot(
                        response.data['png'],
                        5000000,
                        f_save=self.requested_mode)
                    if sha256_string:
                        Screenshot.save_item_relationship(
                            sha256_string, item_id)
                        Screenshot.save_domain_relationship(
                            sha256_string, self.domains[0])
                # HAR
                if 'har' in response.data and self.har:
                    crawlers.save_har(self.har_dir, item_id,
                                      response.data['har'])

                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    l_cookies = self.build_request_arg(all_cookies)
                    yield SplashRequest(link.url,
                                        self.parse,
                                        errback=self.errback_catcher,
                                        endpoint='execute',
                                        meta={
                                            'father': item_id,
                                            'current_url': link.url
                                        },
                                        args=l_cookies)
 def start_requests(self):
     for url in self.start_urls:
         yield SplashRequest(url,
                             endpoint="render.html",
                             callback=self.parse)
Example #30
 def start_requests(self):
     for url in self.urls:
         yield SplashRequest(url,
                             callback=self.parse,
                             endpoint='render.html',
                             args={'wait': '5'})
Example #31
 def start_requests(self):
     yield SplashRequest('http://quotes.toscrape.com/js')