def test_apikey(self):
    self.spider.crawlera_enabled = True
    self.settings['CRAWLERA_APIKEY'] = apikey = 'apikey'
    proxyauth = basic_auth_header(apikey, '')
    self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)

    self.spider.crawlera_apikey = apikey = 'notfromsettings'
    proxyauth = basic_auth_header(apikey, '')
    self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
def _add_auth_header(self, request):
    if self._user_name is not None and self._user_password is not None:
        request.add_header('Authorization',
                           basic_auth_header(self._user_name, self._user_password))
    else:  # try netrc
        try:
            host = urlparse(self._get_server_url()).hostname
            a = netrc().authenticators(host)
            request.add_header('Authorization', basic_auth_header(a[0], a[2]))
        except (NetrcParseError, IOError, TypeError):
            pass
def test_userpass(self):
    self.spider.use_hubproxy = True
    self.settings['HUBPROXY_USER'] = user = '******'
    self.settings['HUBPROXY_PASS'] = pass_ = 'secret'
    proxyauth = basic_auth_header(user, pass_)
    self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)

    self.spider.hubproxy_user = user = '******'
    self.spider.hubproxy_pass = pass_ = 'anothersecret'
    proxyauth = basic_auth_header(user, pass_)
    self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
def test_userpass(self):
    self.spider.crawlera_enabled = True
    self.settings["CRAWLERA_USER"] = user = "******"
    self.settings["CRAWLERA_PASS"] = pass_ = "secret"
    proxyauth = basic_auth_header(user, pass_)
    self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)

    self.spider.crawlera_user = user = "******"
    self.spider.crawlera_pass = pass_ = "anothersecret"
    proxyauth = basic_auth_header(user, pass_)
    self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
def test_userpass(self):
    self.spider.crawlera_enabled = True
    self.settings['CRAWLERA_USER'] = user = '******'
    self.settings['CRAWLERA_PASS'] = pass_ = 'secret'
    proxyauth = basic_auth_header(user, pass_)
    self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)

    self.spider.crawlera_user = user = '******'
    self.spider.crawlera_pass = pass_ = 'anothersecret'
    proxyauth = basic_auth_header(user, pass_)
    self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
def test_apikey_assignment(self):
    self.spider.crawlera_enabled = True

    apikey = 'someapikey'
    self.settings['CRAWLERA_APIKEY'] = None
    self.settings['CRAWLERA_USER'] = apikey
    self.settings['CRAWLERA_PASS'] = ''
    proxyauth = basic_auth_header(apikey, '')
    self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)

    self.settings['CRAWLERA_USER'] = None
    self.settings['CRAWLERA_APIKEY'] = apikey
    self.settings['CRAWLERA_PASS'] = ''
    proxyauth = basic_auth_header(apikey, '')
    self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
def __init__(self, user, password, maxbans, url, crawler, enabled=False):
    self.url = url
    self.user = user
    self.auth = basic_auth_header(user, password)
    self.crawler = crawler
    self.enabled = enabled
    self.maxbans = maxbans
    self.bans = 0
def __init__(self, crawler, splash_base_url, slot_policy):
    self.crawler = crawler
    self.splash_base_url = splash_base_url
    self.slot_policy = slot_policy

    self.splash_auth = None
    user = crawler.settings.get('SPLASH_USER')
    passwd = crawler.settings.get('SPLASH_PASS', '')
    if user:
        self.splash_auth = basic_auth_header(user, passwd)
def open_spider(self, spider):
    try:
        self.enabled = spider.use_hubproxy
        self.user = spider.hubproxy_user
        self.auth = basic_auth_header(spider.hubproxy_user, spider.hubproxy_pass)
    except AttributeError:
        pass

    if self.enabled:
        log.msg("Using hubproxy at %s (user: %s)" % (self.url, self.user),
                spider=spider)
def test_get_basic_auth(self):
    curl_command = 'curl "https://api.test.com/" -u ' \
                   '"some_username:some_password"'
    expected_result = {
        "method": "GET",
        "url": "https://api.test.com/",
        "headers": [
            ("Authorization",
             basic_auth_header("some_username", "some_password")),
        ],
    }
    self._test_command(curl_command, expected_result)
def _assert_enabled(self, spider,
                    settings=None,
                    proxyurl='http://proxy.zyte.com:8011',
                    proxyauth=basic_auth_header('apikey', ''),
                    maxbans=400,
                    download_timeout=190):
    crawler = self._mock_crawler(spider, settings)
    mw = self.mwcls.from_crawler(crawler)
    mw.open_spider(spider)
    req = Request('http://www.scrapytest.org')
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta.get('proxy'), proxyurl)
    self.assertEqual(req.meta.get('download_timeout'), download_timeout)
    self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
    res = self._mock_zyte_smartproxy_response(req.url)
    assert mw.process_response(req, res, spider) is res

    # disabled if 'dont_proxy=True' is set
    req = Request('http://www.scrapytest.org')
    req.meta['dont_proxy'] = True
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta.get('proxy'), None)
    self.assertEqual(req.meta.get('download_timeout'), None)
    self.assertEqual(req.headers.get('Proxy-Authorization'), None)
    res = self._mock_zyte_smartproxy_response(req.url)
    assert mw.process_response(req, res, spider) is res
    del req.meta['dont_proxy']

    if maxbans > 0:
        # assert ban count is reset after a successful response
        res = self._mock_zyte_smartproxy_response('http://ban.me', status=self.bancode)
        assert mw.process_response(req, res, spider) is res
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        res = self._mock_zyte_smartproxy_response('http://unban.me')
        assert mw.process_response(req, res, spider) is res
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        self.assertEqual(mw._bans[None], 0)

    # check for not banning before maxbans for bancode
    for x in range(maxbans + 1):
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        res = self._mock_zyte_smartproxy_response(
            'http://ban.me/%d' % x,
            status=self.bancode,
            headers={'X-Crawlera-Error': 'banned'},
        )
        assert mw.process_response(req, res, spider) is res

    # max bans reached and close_spider called
    self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned'))
def update_request(request: Request, spider: Spider) -> Request:
    proxy = spider.settings.get("PROXY")
    proxy_auth = spider.settings.get("PROXY_AUTH")
    if proxy:
        if proxy_auth:
            request.headers["Proxy-Authorization"] = basic_auth_header(*proxy_auth.split(":"))
        if "http" not in proxy:
            proxy = "http://{}".format(proxy)
        request.meta["proxy"] = proxy
    else:
        raise RuntimeError("Proxy url is empty! Proxy is not working!")
    return request
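The helper above reads its proxy endpoint and credentials from two project settings. A minimal, hypothetical settings sketch (the setting names come from the snippet, the values are placeholders, not real configuration):

# Hypothetical values for the settings consumed by update_request() above.
PROXY = "proxy.example.com:8080"          # scheme is added automatically if missing
PROXY_AUTH = "username:password"          # split on ':' and passed to basic_auth_header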
def process_request(self, request, spider):
    """
    The request will be passed to the AutoExtract server only if the request
    is explicitly enabled with `{'autoextract': {'enabled': True}}` meta.
    The page type value must also be present, either in the
    AUTOEXTRACT_PAGE_TYPE option, or in `{'autoextract': {'pageType': '...'}}` meta.
    """
    if not self._is_enabled_for_request(request):
        return

    # If the request was already processed by AutoExtract
    if request.meta.get(AUTOEXTRACT_META_KEY):
        return

    if request.method != 'GET':
        raise AutoExtractError('Only GET requests are supported by AutoExtract')

    request.meta[AUTOEXTRACT_META_KEY] = {
        'original_url': request.url,
        'timing': {
            'start_ts': time.time()
        },
    }

    # Maybe overwrite the page type value from the request
    page_type = self._check_page_type(request)
    logger.debug('Process AutoExtract request for %s URL %s',
                 page_type, request, extra={'spider': spider})

    # Define request timeout
    request.meta['download_timeout'] = self.timeout

    # Define concurrency settings
    self._set_download_slot(request, request.meta)

    payload = [{'url': request.url, 'pageType': page_type}]
    headers = Headers({
        'Content-Type': 'application/json',
        'Authorization': basic_auth_header(self._api_user, self._api_pass)
    })

    new_request = request.replace(
        url=self._api_url,
        method='POST',
        headers=headers,
        body=json.dumps(payload, sort_keys=True),
    )

    self.inc_metric('autoextract/request_count')
    return new_request
def process_request(self, request, spider):
    # Called for each request that goes through the downloader middleware.
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    request.meta['proxy'] = Auth.PROXY
    request.headers['Proxy-Authorization'] = basic_auth_header(
        Auth.PROXY_USERNAME, Auth.PROXY_PASSWORD)
def _configure_js(self, spec, settings):
    self._job_id = settings.get('JOB', '')
    self.js_enabled = False
    self.SPLASH_HOST = None
    if settings.get('SPLASH_URL'):
        self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
        self.js_enabled = spec.get('js_enabled', False)
    if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                            settings.get('SPLASH_USER') is not None):
        self.splash_auth = basic_auth_header(
            settings.get('SPLASH_USER', ''),
            settings.get('SPLASH_PASS', ''))
    self._filter_js_urls = self._build_js_url_filter(spec)
def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
             **kw):
    super(IblSpider, self).__init__(name, **kw)
    self._job_id = settings.get('JOB', '')
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, six.string_types) and key in STRING_KEYS:
            val = val.splitlines()
        spec[key] = val

    self._item_template_pages = sorted(
        ((t['scrapes'], t) for t in spec['templates']
         if t.get('page_type', 'item') == 'item'),
        key=itemgetter(0))
    self._templates = [templ for _, templ in self._item_template_pages]

    self.plugins = IndexedDict()
    for plugin_class, plugin_name in zip(load_plugins(settings),
                                         load_plugin_names(settings)):
        instance = plugin_class()
        instance.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = instance

    self.js_enabled = False
    self.SPLASH_HOST = None
    if settings.get('SPLASH_URL'):
        self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
        self.js_enabled = spec.get('js_enabled', False)
    if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                            settings.get('SPLASH_USER') is not None):
        self.splash_auth = basic_auth_header(
            settings.get('SPLASH_USER', ''),
            settings.get('SPLASH_PASS', ''))
    self._filter_js_urls = self._build_js_url_filter(spec)

    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get(
        'allowed_domains',
        self._get_allowed_domains(self._templates))
    self.page_actions = spec.get('page_actions', [])
    if not self.allowed_domains:
        self.allowed_domains = None
def update_request(request: Request, spider: Spider) -> Request:
    if 'proxy' not in request.meta.keys():
        proxy = spider.settings.get("PROXY")
        proxy_auth = spider.settings.get("PROXY_AUTH")
        if not proxy:
            raise Exception('Proxy enabled but not configured')
        if proxy_auth:
            request.headers["Proxy-Authorization"] = basic_auth_header(*proxy_auth.split(":"))
        if "http" not in proxy:
            proxy = "http://{}".format(proxy)
        request.meta["proxy"] = proxy
    return request
def start_requests(self):
    urls = ['https://www.whatismyip.com/']
    for url in urls:
        yield scrapy.Request(
            url=url,
            callback=self.parse,
            meta={
                'proxy': 'https://gate.smartproxy.com:7000'  # Your desired endpoint
            },
            headers={
                'Proxy-Authorization': basic_auth_header(
                    'username', 'password')  # Your username and password for the proxy user
            })
def start_requests(self):
    yield SplashRequest(
        url='http://quotes.toscrape.com/js/',
        endpoint='execute',
        splash_headers={
            'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
        },
        args={
            'lua_source': self.LUA_SOURCE,
            'crawlera_user': self.settings['CRAWLERA_APIKEY'],
        },
        # tell Splash to cache the lua script, to avoid sending it for every request
        cache_args=['lua_source'],
    )
def _assert_enabled(self, spider,
                    settings=None,
                    proxyurl='http://proxy.crawlera.com:8010',
                    proxyauth=basic_auth_header('apikey', ''),
                    maxbans=400,
                    download_timeout=190):
    crawler = self._mock_crawler(spider, settings)
    mw = self.mwcls.from_crawler(crawler)
    mw.open_spider(spider)
    req = Request('http://www.scrapytest.org')
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta.get('proxy'), proxyurl)
    self.assertEqual(req.meta.get('download_timeout'), download_timeout)
    self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
    res = Response(req.url)
    assert mw.process_response(req, res, spider) is res

    # disabled if 'dont_proxy=True' is set
    req = Request('http://www.scrapytest.org')
    req.meta['dont_proxy'] = True
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta.get('proxy'), None)
    self.assertEqual(req.meta.get('download_timeout'), None)
    self.assertEqual(req.headers.get('Proxy-Authorization'), None)
    res = Response(req.url)
    assert mw.process_response(req, res, spider) is res
    del req.meta['dont_proxy']

    if maxbans > 0:
        # assert ban count is reset after a successful response
        res = Response('http://ban.me', status=self.bancode)
        assert mw.process_response(req, res, spider) is res
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        res = Response('http://unban.me')
        assert mw.process_response(req, res, spider) is res
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        self.assertEqual(mw._bans[None], 0)

    # check for not banning before maxbans for bancode
    for x in range(maxbans + 1):
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        res = Response(
            'http://ban.me/%d' % x,
            status=self.bancode,
            headers={'X-Crawlera-Error': 'banned'},
        )
        assert mw.process_response(req, res, spider) is res

    # max bans reached and close_spider called
    self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned'))
def start_requests(self):
    self.initProxy()
    proxyIP = [
        "https://107.172.80.209:4444",
        "https://107.175.235.86:4444",
        "https://149.20.244.136:4444",
        "https://152.44.107.127:4444",
        "https://199.34.83.177:4444",
        "https://104.202.30.219:4444",
        "https://107.172.225.111:4444",
        "https://107.175.229.254:4444"
    ]
    maxpage = 10
    self.d = open('goru.txt', 'wb')
    cookies = ''
    self.maxPage = 30
    self.initCsvFile()
    # url = 'https://www.loopnet.com/for-sale/hospitality/{}/?sk=8166f58f12eb275a5d8c99813f65a9ea'.format(i)
    self.fh = open("hello.txt", "wb")
    i = 1
    while i < maxpage:
        # print(url.format(i))
        time.sleep(random.randint(1, 5))
        # url = 'https://www.loopnet.com/for-sale/san-francisco-ca/hospitality-properties/{}/?bb=nmlx09n8zOwp6_w7B'.format(i)
        url = 'https://www.loopnet.com/for-sale/hospitality/{}/?sk=8166f58f12eb275a5d8c99813f65a9ea'.format(i)
        headers = {
            "Host": "www.loopnet.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "DNT": "1",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "en-US,en;q=0.8"
        }
        req = scrapy.Request(url, callback=self.parse, headers=headers)
        i += 1
        t = random.randint(0, len(proxyIP) - 1)
        req.meta['proxy'] = proxyIP[t]
        req.headers['Proxy-Authorization'] = basic_auth_header(
            '2b37ecba9f', '4ojgLl8h')
        yield req
def _assert_enabled(self, spider,
                    settings=None,
                    proxyurl='http://proxy.scrapinghub.com:8010',
                    proxyauth=basic_auth_header('user', 'pass'),
                    bancode=503,
                    maxbans=20,
                    download_timeout=1800,
                    ):
    crawler = self._mock_crawler(settings)
    mw = self.mwcls.from_crawler(crawler)
    mw.open_spider(spider)
    req = Request('http://www.scrapytest.org')
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta.get('proxy'), proxyurl)
    self.assertEqual(req.meta.get('download_timeout'), download_timeout)
    self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
    res = Response(req.url)
    assert mw.process_response(req, res, spider) is res

    # disabled if 'dont_proxy' is set
    req = Request('http://www.scrapytest.org')
    req.meta['dont_proxy'] = True
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta.get('proxy'), None)
    self.assertEqual(req.meta.get('download_timeout'), None)
    self.assertEqual(req.headers.get('Proxy-Authorization'), None)
    res = Response(req.url)
    assert mw.process_response(req, res, spider) is res

    if maxbans > 0:
        # assert ban count is reset after a successful response
        res = Response('http://ban.me', status=bancode)
        assert mw.process_response(req, res, spider) is res
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        res = Response('http://unban.me')
        assert mw.process_response(req, res, spider) is res
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)

    # check for not banning before maxbans for bancode
    for x in xrange(maxbans + 1):
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        res = Response('http://ban.me/%d' % x, status=bancode)
        assert mw.process_response(req, res, spider) is res

    # max bans reached and close_spider called
    self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned'))
def _assert_enabled(self, spider,
                    settings=None,
                    proxyurl="http://proxy.crawlera.com:8010?noconnect",
                    proxyauth=basic_auth_header("user", "pass"),
                    bancode=503,
                    maxbans=20,
                    download_timeout=1800,
                    ):
    crawler = self._mock_crawler(settings)
    mw = self.mwcls.from_crawler(crawler)
    mw.open_spider(spider)
    req = Request("http://www.scrapytest.org")
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta.get("proxy"), proxyurl)
    self.assertEqual(req.meta.get("download_timeout"), download_timeout)
    self.assertEqual(req.headers.get("Proxy-Authorization"), proxyauth)
    res = Response(req.url)
    assert mw.process_response(req, res, spider) is res

    # disabled if 'dont_proxy' is set
    req = Request("http://www.scrapytest.org")
    req.meta["dont_proxy"] = True
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta.get("proxy"), None)
    self.assertEqual(req.meta.get("download_timeout"), None)
    self.assertEqual(req.headers.get("Proxy-Authorization"), None)
    res = Response(req.url)
    assert mw.process_response(req, res, spider) is res

    if maxbans > 0:
        # assert ban count is reset after a successful response
        res = Response("http://ban.me", status=bancode)
        assert mw.process_response(req, res, spider) is res
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        res = Response("http://unban.me")
        assert mw.process_response(req, res, spider) is res
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        self.assertEqual(mw._bans[None], 0)

    # check for not banning before maxbans for bancode
    for x in xrange(maxbans + 1):
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        res = Response("http://ban.me/%d" % x, status=bancode)
        assert mw.process_response(req, res, spider) is res

    # max bans reached and close_spider called
    self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, "banned"))
def start_requests(self):
    urls = [
        'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[1,2,2019]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
        'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[20,11,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
        'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[1,8,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
        'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[17,5,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
        'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[18,4,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
        'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[15,3,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
        'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[27,11,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
        'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[15,9,2017]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
        'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[5,5,2017]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
    ]
    for url in urls:
        url = 'https://www.willistowerswatson.com/en/press#first={}&sort=%40displayz45xdate%20descending'
        request = SplashRequest(
            url=url,
            splash_headers={
                'Authorization': basic_auth_header('535209af07354fbbb4110611b27f7504', '')
            },
            args={'wait': 0.5, 'timeout': 15},
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/201,00101 Firefox/62.0'},
            callback=self.parse)
        yield request
def start_requests(self):
    url = 'https://investorrelations.discover.com/newsroom/press-releases/default.aspx'
    request = SplashRequest(
        url=url,
        splash_headers={
            'Authorization': basic_auth_header('535209af07354fbbb4110611b27f7504', '')
        },
        args={'wait': 0.5},
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/201,00101 Firefox/62.0'
        },
        callback=self.parse)
    yield request
def process_request(self, request, spider):
    # Called for each request that goes through the downloader middleware.
    # request.meta['proxy'] = "https://5.79.73.131:13200"
    # request.headers['Proxy-Authorization'] = basic_auth_header('*****@*****.**', 'xbyte123')
    request.meta['proxy'] = "http://zproxy.lum-superproxy.io:22225"
    request.headers['Proxy-Authorization'] = basic_auth_header(
        'lum-customer-xbyte-zone-zone_us-country-us', '0gi0pioy3oey')

    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    return None
def process_request(self, request, spider):
    # Called for each request that goes through the downloader middleware.
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called

    # request.meta['proxy'] = "https://134.122.17.137:8080"
    # proxy_user_pass = "******"
    # encoded_user_pass = base64.encodestring(proxy_user_pass)
    # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
    proxy = random.choice(proxy_list)
    request.meta['proxy'] = 'http://' + proxy + ':6060'
    request.headers['Proxy-Authorization'] = basic_auth_header('*****@*****.**', 'Hotthdrn591!')
def spider_opened(self, spider):
    usr = getattr(spider, 'http_user', '')
    pwd = getattr(spider, 'http_pass', '')
    if usr or pwd:
        self.auth = basic_auth_header(usr, pwd)
        if not hasattr(spider, 'http_auth_domain'):
            warnings.warn(
                'Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security '
                'problems if the spider makes requests to several different domains. http_auth_domain '
                'will be set to the domain of the first request, please set it to the correct value '
                'explicitly.',
                category=ScrapyDeprecationWarning)
            self.domain_unset = True
        else:
            self.domain = spider.http_auth_domain
            self.domain_unset = False
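The middleware above pulls its credentials from spider attributes. A hypothetical spider sketch showing those attributes (the names http_user, http_pass and http_auth_domain come from the snippet; the class, values and domain are placeholders):

import scrapy

class ExampleAuthSpider(scrapy.Spider):
    name = 'example_auth'
    http_user = 'someuser'
    http_pass = 'somepass'
    http_auth_domain = 'example.com'  # setting this avoids the deprecation warning above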
def parse(self, response):
    for quote in response.css('div.quote'):
        yield {
            'text': quote.css('span.text::text').extract_first(),
            'author': quote.css('span small::text').extract_first(),
            'tags': quote.css('div.tags a.tag::text').extract(),
        }

    next_page = response.css('li.next > a::attr(href)').extract_first()
    if next_page:
        yield SplashRequest(
            response.urljoin(next_page),
            splash_headers={
                'Authorization': basic_auth_header(self.settings['APIKEY'], ''),
            },
        )
def start_requests(self):
    url = 'https://shopee.co.id/%F0%9F%92%95Celana-Pendek-Pria-i.8497368.57564317'
    yield SplashRequest(
        url, self.extract_headers,
        args={
            'wait': 10,
            'images_enabled': False,
            "timeout": 60,
            'lua_source': self.LUA_SOURCE,
            'crawlera_user': self.settings['CRAWLERA_APIKEY'],
        },
        endpoint='render.har',  # optional; default is render.html
        splash_headers={
            'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
        },
        cache_args=['lua_source'],
    )
def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
             **kw):
    super(IblSpider, self).__init__(name, **kw)
    self._job_id = settings.get('JOB', '')
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, six.string_types) and key in STRING_KEYS:
            val = val.splitlines()
        spec[key] = val

    self._item_template_pages = sorted(
        ((t['scrapes'], t) for t in spec['templates']
         if t.get('page_type', 'item') == 'item'),
        key=itemgetter(0))
    self._templates = [templ for _, templ in self._item_template_pages]

    self.plugins = IndexedDict()
    for plugin_class, plugin_name in zip(load_plugins(settings),
                                         load_plugin_names(settings)):
        instance = plugin_class()
        instance.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = instance

    self.js_enabled = False
    self.SPLASH_HOST = None
    if settings.get('SPLASH_URL'):
        self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
        self.js_enabled = spec.get('js_enabled', False)
    if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                            settings.get('SPLASH_USER') is not None):
        self.splash_auth = basic_auth_header(
            settings.get('SPLASH_USER', ''),
            settings.get('SPLASH_PASS', ''))
    self._filter_js_urls = self._build_js_url_filter(spec)

    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get(
        'allowed_domains',
        self._get_allowed_domains(self._templates)
    )
    self.page_actions = spec.get('page_actions', [])
    if not self.allowed_domains:
        self.allowed_domains = None
def start_requests(self):
    print("start_requests")
    # yield SplashRequest(url=parameters.starturl, endpoint='execute', args={'html': 1, 'png': 1, 'wait': parameters.wait, 'timeout': 300, 'lua_source': self.script})
    yield SplashRequest(
        url=parameters.starturl,
        endpoint='execute',
        splash_headers={
            'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
        },
        args={
            'lua_source': self.LUA_SOURCE,
            'crawlera_user': self.settings['CRAWLERA_APIKEY'],
            'timeout': 60,
        },
        # tell Splash to cache the lua script, to avoid sending it for every request
        cache_args=['lua_source'],
    )
def _configure_js(self, spec, settings):
    self.js_enabled = False
    self.SPLASH_HOST = None
    if settings.get('SPLASH_URL'):
        self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
        self.js_enabled = spec.get('js_enabled', False)
    if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                            settings.get('SPLASH_USER') is not None):
        self.splash_auth = basic_auth_header(
            settings.get('SPLASH_USER', ''),
            settings.get('SPLASH_PASS', ''))
    self.splash_wait = settings.getint('SPLASH_WAIT', 5)
    self.splash_timeout = settings.getint('SPLASH_TIMEOUT', 30)
    self.splash_js_source = settings.get('SPLASH_JS_SOURCE', 'function(){}')
    self.splash_lua_source = settings.get('SPLASH_LUA_SOURCE', '')
    self._filter_js_urls = self._build_js_url_filter(spec)
def _assert_enabled(spider,
                    settings=None,
                    url='http://quotes.toscrape.com',
                    api_url='autoextract.scrapinghub.com',
                    api_auth=basic_auth_header('apikey', '')):
    mw = _mock_mw(spider, settings)

    req = Request(url, meta=AUTOX_META)
    out = mw.process_request(req, spider)
    assert api_url in out.url
    assert out.meta['autoextract'].get('enabled')
    assert out.headers.get('Authorization') == api_auth

    resp = Response(out.url, request=out, body=b'[{}]')
    proc = mw.process_response(out, resp, spider)
    assert proc.meta['autoextract'].get('original_url') == url
    assert isinstance(proc.meta['autoextract'].get('article'), dict)
def start_requests(self):
    for place_id in self.place_ids:
        yield SplashRequest(
            url='https://www.google.com/maps/place/?q=place_id:' + place_id,
            endpoint='execute',
            splash_headers={
                'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'wait': 0.5,
                'timeout': 60,
                'lua_source': self.LUA_SOURCE,
            },
            cache_args=['lua_source'],
        )
def _configure_js(self, spec, settings):
    self.js_enabled = False
    self.SPLASH_HOST = None
    if settings.get('SPLASH_URL'):
        self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
        self.js_enabled = spec.get('js_enabled', False)
    if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                            settings.get('SPLASH_USER') is not None):
        self.splash_auth = basic_auth_header(
            settings.get('SPLASH_USER', ''),
            settings.get('SPLASH_PASS', ''))
    self.splash_wait = settings.getint('SPLASH_WAIT', 5)
    self.splash_timeout = settings.getint('SPLASH_TIMEOUT', 30)
    self.splash_js_source = settings.get(
        'SPLASH_JS_SOURCE', 'function(){}')
    self.splash_lua_source = settings.get('SPLASH_LUA_SOURCE', '')
    self._filter_js_urls = self._build_js_url_filter(spec)
def _parse_headers_and_cookies(parsed_args):
    headers = []
    cookies = {}
    for header in parsed_args.headers or ():
        name, val = header.split(':', 1)
        name = name.strip()
        val = val.strip()
        if name.title() == 'Cookie':
            for name, morsel in SimpleCookie(val).items():
                cookies[name] = morsel.value
        else:
            headers.append((name, val))

    if parsed_args.auth:
        user, password = parsed_args.auth.split(':', 1)
        headers.append(('Authorization', basic_auth_header(user, password)))

    return headers, cookies
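An illustrative call for the parser above, assuming an argparse-style namespace with the same headers and auth attributes it consumes (the attribute values are made up for the example):

from argparse import Namespace

parsed = Namespace(headers=['Accept: application/json'],
                   auth='some_username:some_password')
headers, cookies = _parse_headers_and_cookies(parsed)
# headers -> [('Accept', 'application/json'),
#             ('Authorization', basic_auth_header('some_username', 'some_password'))]
# cookies -> {}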
def __init__(self, proxy_list, proxy_username, proxy_password,
             logstats_interval, stop_if_no_proxies, max_proxies_to_try,
             backoff_base, backoff_cap, crawler):
    backoff = partial(exp_backoff_full_jitter, base=backoff_base, cap=backoff_cap)
    self.proxies = Proxies(self.cleanup_proxy_list(proxy_list), backoff=backoff)
    if proxy_username and proxy_password:
        self.auth_header = basic_auth_header(proxy_username, proxy_password)
    else:
        self.auth_header = None
    self.logstats_interval = logstats_interval
    self.reanimate_interval = 5
    self.stop_if_no_proxies = stop_if_no_proxies
    self.max_proxies_to_try = max_proxies_to_try
    self.stats = crawler.stats
    self.log_task = None
    self.reanimate_task = None
def parse(self, response):
    # print("ok")
    # print(response.body)
    # self.d.write(response.body)
    # self.d.close()
    sel = Selector(response)
    results = sel.xpath(
        "//article[contains(@class, 'placard') and contains(@class, 'tier')]"
    ).extract()
    # print(results)
    for result in results:
        href = ''
        try:
            # href1 = result.xpath(".//header//h4/a/@href").extract()[0]
            href = result[result.find("https:"):result.find("/\',$event")]
            # print("href>>>>>>>" + href)
        except:
            continue
        listingid = self.getListingID(result)
        # print("lid>>>>>>>" + listingid)
        city = self.getCity(result)
        state = self.getState(result)
        zip1 = self.getZip(result)
        propertyType = self.getProperty(result)
        image = self.getImage(result)
        time.sleep(random.randint(1, 5))
        print("href>>>>>>>" + href)
        req = scrapy.Request(href,
                             callback=self.parsePage,
                             meta={
                                 'listingid': listingid,
                                 'city': city,
                                 'state': state,
                                 'zip': zip1,
                                 'propertyType': propertyType,
                                 'image': image
                             })
        t = random.randint(0, len(self.proxyIP) - 1)
        req.meta['proxy'] = self.proxyIP[t]
        req.headers['Proxy-Authorization'] = basic_auth_header(
            '2b37ecba9f', '4ojgLl8h')
        yield req
def _build_request(self, rule, link):
    print(link.url)
    # r = SplashRequest(url=link.url, endpoint='execute', callback=self._response_downloaded, args={'html': 1, 'png': 1, 'wait': parameters.wait, 'timeout': 300, 'lua_source': self.script})
    r = SplashRequest(
        url=link.url,
        endpoint='execute',
        callback=self._response_downloaded,
        dont_filter=True,
        splash_headers={
            'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
        },
        args={
            'lua_source': self.LUA_SOURCE,
            'crawlera_user': self.settings['CRAWLERA_APIKEY'],
            'timeout': 60,
        },
        cache_args=['lua_source'],
    )
    r.meta.update(rule=rule, link_text=link.text)
    return r
def __init__(self, candidate_id, user, password):
    self.candidate_id, self.username, self.password = candidate_id, user, password
    self.report_url = 'https://stage-sc.consumerdirect.com/member/credit-report/3b/'
    self.login_url = 'https://stage-sc.consumerdirect.com/login/'
    self.auth = basic_auth_header(
        current_app.config['SMART_CREDIT_HTTP_USER'],
        current_app.config['SMART_CREDIT_HTTP_PASS'])
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,hi;q=0.8,ru;q=0.7',
        'Authorization': self.auth,
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'stage-sc.consumerdirect.com',
        'Origin': 'https://stage-sc.consumerdirect.com',
        'Referer': 'https://stage-sc.consumerdirect.com/login/',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '******',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/78.0.3904.97 Safari/537.36'
    }
def test_patch(self):
    curl_command = (
        'curl "https://example.com/api/fake" -u "username:password" -H "Ac'
        'cept: application/vnd.go.cd.v4+json" -H "Content-Type: applicatio'
        'n/json" -X PATCH -d \'{"hostname": "agent02.example.com", "agent'
        '_config_state": "Enabled", "resources": ["Java","Linux"], "enviro'
        'nments": ["Dev"]}\''
    )
    expected_result = {
        "method": "PATCH",
        "url": "https://example.com/api/fake",
        "headers": [
            ("Accept", "application/vnd.go.cd.v4+json"),
            ("Content-Type", "application/json"),
            ("Authorization", basic_auth_header("username", "password")),
        ],
        "body": '{"hostname": "agent02.example.com", "agent_config_state"'
                ': "Enabled", "resources": ["Java","Linux"], "environments'
                '": ["Dev"]}',
    }
    self._test_command(curl_command, expected_result)
def _read_settings(self, spider: Spider) -> None:
    settings = spider.crawler.settings
    if not settings.get("CRAWLERA_FETCH_APIKEY"):
        self.enabled = False
        logger.info("Crawlera Fetch API cannot be used without an apikey")
        return

    self.apikey = settings["CRAWLERA_FETCH_APIKEY"]
    self.apipass = settings.get("CRAWLERA_FETCH_APIPASS", "")
    self.auth_header = basic_auth_header(self.apikey, self.apipass)

    if settings.get("CRAWLERA_FETCH_URL"):
        self.url = settings["CRAWLERA_FETCH_URL"]

    self.download_slot_policy = settings.get(
        "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY", DownloadSlotPolicy.Domain)
    self.raise_on_error = settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR", True)
    self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {})
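A hypothetical settings sketch for the middleware above; the setting names are the ones _read_settings consumes, while the values shown are placeholders, not real credentials or endpoints:

# Placeholder values for the settings read by _read_settings() above.
CRAWLERA_FETCH_APIKEY = '<your apikey>'
CRAWLERA_FETCH_APIPASS = ''                          # optional, defaults to ''
CRAWLERA_FETCH_URL = 'http://fetch.example.com/'     # optional endpoint override
CRAWLERA_FETCH_RAISE_ON_ERROR = True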
def _assert_enabled(self, spider,
                    settings=None,
                    proxyurl='http://proxy.scrapinghub.com:8010',
                    proxyauth=basic_auth_header('user', 'pass'),
                    bancode=503,
                    maxbans=20,
                    download_timeout=1800,
                    ):
    crawler = self._mock_crawler(settings)
    mw = self.mwcls.from_crawler(crawler)
    mw.open_spider(spider)
    req = Request('http://www.scrapytest.org')
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta.get('proxy'), proxyurl)
    self.assertEqual(req.meta.get('download_timeout'), download_timeout)
    self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
    res = Response(req.url)
    assert mw.process_response(req, res, spider) is res

    if maxbans > 0:
        # assert ban count is reset after a successful response
        res = Response('http://ban.me', status=bancode)
        assert mw.process_response(req, res, spider) is res
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        res = Response('http://unban.me')
        assert mw.process_response(req, res, spider) is res
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)

    # check for not banning before maxbans for bancode
    for x in xrange(maxbans + 1):
        self.assertEqual(crawler.engine.fake_spider_closed_result, None)
        res = Response('http://ban.me/%d' % x, status=bancode)
        assert mw.process_response(req, res, spider) is res

    # max bans reached and close_spider called
    self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned'))
def process_feed_item(self, item, feed_params):
    # extract location name from the title
    re_match = (re.search(r"[\s]near[\s](.*)$", item['title'])
                or re.search(r"[\s]in[\s](.*)$", item['title']))
    if re_match:
        location_name = re_match.group(1)
    else:
        location_name = 'Unspecified'

    dt = datetime.strptime(item['incident_datetime'], "%Y-%m-%d %H:%M:%S")  # YYYY-MM-DD HH:MM:SS

    report = {
        'incident_title': item['title'],
        'incident_description': item['description'],
        'incident_date': dt.strftime('%m/%d/%Y'),    # 'MM/DD/YYYY'
        'incident_hour': dt.strftime('%I'),          # 01-12
        'incident_minute': dt.strftime('%M'),        # 00-59
        'incident_ampm': dt.strftime('%p').lower(),  # am | pm
        'incident_category': self.get_category(item, feed_params),
        'latitude': item['lat'],
        'longitude': item['lng'],
        'location_name': location_name,
        'person_first': 'SkyTruth Alerts',
        'person_last': '',
        'person_email': '*****@*****.**',
    }

    params = {}
    api_version = feed_params.get('api_version', 'default')
    if api_version in ('default', 'oilspill'):
        params['auth_token'] = 'JDxEF83bd'
        params['task'] = 'report'
        params['incident_active'] = '1'
        params['incident_alert_status'] = '1'
    elif api_version in ('LABB',):
        params['task'] = 'reports'
        # params['task'] = 'report'
        params['action'] = 'edit'
        params['incident_active'] = '1'
    else:
        self.log('Unknown API version specified: %s' % (api_version), log.ERROR)

    params = dict(params.items() + report.items())

    # retrieve task
    self.log('publishing item %s to Ushahidi API %s'
             % (item['id'], feed_params['api_url']), log.DEBUG)
    request = FormRequest(feed_params['api_url'],
                          formdata=params,
                          callback=self.submit_report_success,
                          errback=self.error_callback,
                          dont_filter=True)
    request.meta['report'] = report
    request.meta['item'] = item
    request.meta['feed_params'] = feed_params
    request.meta['dont_retry'] = True
    if feed_params.get('http_user'):
        self.log('Authenticating with user: %s' % (feed_params.get('http_user')), log.INFO)
        request.headers['Authorization'] = basic_auth_header(
            feed_params.get('http_user'), feed_params.get('http_password'))

    # yield request
    yield self.filter_request(request)
def get_proxyauth(self, spider):
    """Hook to compute Proxy-Authorization header by custom rules."""
    return basic_auth_header(self.user, getattr(self, 'pass'))
def get_proxyauth(self, spider):
    """Hook to compute Proxy-Authorization header by custom rules."""
    if self.apikey:
        return basic_auth_header(self.apikey, "")
    return basic_auth_header(self.user, getattr(self, "pass"))
def get_proxyauth(self, spider):
    """Hook to compute Proxy-Authorization header by custom rules."""
    return basic_auth_header(self.apikey, '')
def request_authenticate(request, username, password):
    """Authenticate the given request (in place) using the HTTP basic access
    authentication mechanism (RFC 2617) and the given username and password
    """
    request.headers['Authorization'] = basic_auth_header(username, password)
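A minimal usage sketch for the helper above, assuming a Scrapy Request object; the URL and credentials are placeholders:

from scrapy import Request

request = Request('https://example.com/protected')
request_authenticate(request, 'someuser', 'somepass')
# request.headers['Authorization'] now carries the Basic auth value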
def spider_opened(self, spider):
    usr = getattr(spider, 'http_user', '')
    pwd = getattr(spider, 'http_pass', '')
    if usr or pwd:
        self.auth = basic_auth_header(usr, pwd)
def test_basic_auth_header_encoding(self):
    self.assertEqual(b'Basic c29tw6Z1c8Oocjpzw7htZXDDpHNz',
                     basic_auth_header(u'somæusèr', u'sømepäss', encoding='utf8'))
    # default encoding (ISO-8859-1)
    self.assertEqual(b'Basic c29t5nVz6HI6c_htZXDkc3M=',
                     basic_auth_header(u'somæusèr', u'sømepäss'))
def test_basic_auth_header(self):
    self.assertEqual(b'Basic c29tZXVzZXI6c29tZXBhc3M=',
                     basic_auth_header('someuser', 'somepass'))
    # Check url unsafe encoded header
    self.assertEqual(b'Basic c29tZXVzZXI6QDx5dTk-Jm8_UQ==',
                     basic_auth_header('someuser', '@<yu9>&o?Q'))
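The assertions in the two tests above pin down the expected header format: "user:pass" is encoded (ISO-8859-1 by default) and URL-safe base64-encoded after a "Basic " prefix. A minimal sketch of that behaviour, consistent with those assertions but not the actual w3lib implementation:

from base64 import urlsafe_b64encode

def basic_auth_header_sketch(username, password, encoding='ISO-8859-1'):
    # 'user:pass' -> bytes in the given encoding -> URL-safe base64 with 'Basic ' prefix
    credentials = '{}:{}'.format(username, password).encode(encoding)
    return b'Basic ' + urlsafe_b64encode(credentials)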
def _authorization(self, spider):
    usr = getattr(spider, 'http_user', '')
    pwd = getattr(spider, 'http_pass', '')
    if usr or pwd:
        return basic_auth_header(usr, pwd)