def test_apikey(self):
        self.spider.crawlera_enabled = True
        self.settings['CRAWLERA_APIKEY'] = apikey = 'apikey'
        proxyauth = basic_auth_header(apikey, '')
        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)

        self.spider.crawlera_apikey = apikey = 'notfromsettings'
        proxyauth = basic_auth_header(apikey, '')
        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
Example #2
 def _add_auth_header(self, request):
     if self._user_name is not None and self._user_password is not None:
         request.add_header('Authorization', basic_auth_header(self._user_name, self._user_password))
     else:  # try netrc
         try:
             host = urlparse(self._get_server_url()).hostname
             a = netrc().authenticators(host)
             request.add_header('Authorization', basic_auth_header(a[0], a[2]))
         except (NetrcParseError, IOError, TypeError):
             pass
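
A hedged sketch of the netrc fallback used above, assuming an illustrative host and ~/.netrc entry (neither comes from the snippet). authenticators() returns a (login, account, password) tuple, so fields 0 and 2 supply the credentials:

from netrc import netrc, NetrcParseError
from w3lib.http import basic_auth_header

def auth_from_netrc(host='api.example.com'):
    # Assumed ~/.netrc entry for this sketch:
    #   machine api.example.com
    #   login someuser
    #   password somepass
    try:
        creds = netrc().authenticators(host)  # (login, account, password) or None
        return basic_auth_header(creds[0], creds[2]) if creds else None
    except (NetrcParseError, IOError):
        return None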
Example #3
    def test_userpass(self):
        self.spider.use_hubproxy = True
        self.settings['HUBPROXY_USER'] = user = '******'
        self.settings['HUBPROXY_PASS'] = pass_ = 'secret'
        proxyauth = basic_auth_header(user, pass_)
        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)

        self.spider.hubproxy_user = user = '******'
        self.spider.hubproxy_pass = pass_ = 'anothersecret'
        proxyauth = basic_auth_header(user, pass_)
        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
Example #4
    def test_userpass(self):
        self.spider.crawlera_enabled = True
        self.settings["CRAWLERA_USER"] = user = "******"
        self.settings["CRAWLERA_PASS"] = pass_ = "secret"
        proxyauth = basic_auth_header(user, pass_)
        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)

        self.spider.crawlera_user = user = "******"
        self.spider.crawlera_pass = pass_ = "anothersecret"
        proxyauth = basic_auth_header(user, pass_)
        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
Example #5
    def test_userpass(self):
        self.spider.crawlera_enabled = True
        self.settings['CRAWLERA_USER'] = user = '******'
        self.settings['CRAWLERA_PASS'] = pass_ = 'secret'
        proxyauth = basic_auth_header(user, pass_)
        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)

        self.spider.crawlera_user = user = '******'
        self.spider.crawlera_pass = pass_ = 'anothersecret'
        proxyauth = basic_auth_header(user, pass_)
        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
Example #6
    def test_apikey_assignment(self):
        self.spider.crawlera_enabled = True

        apikey = 'someapikey'
        self.settings['CRAWLERA_APIKEY'] = None
        self.settings['CRAWLERA_USER'] = apikey
        self.settings['CRAWLERA_PASS'] = ''
        proxyauth = basic_auth_header(apikey, '')
        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)

        self.settings['CRAWLERA_USER'] = None
        self.settings['CRAWLERA_APIKEY'] = apikey
        self.settings['CRAWLERA_PASS'] = ''
        proxyauth = basic_auth_header(apikey, '')
        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
Example #7
 def __init__(self, user, password, maxbans, url, crawler, enabled=False):
     self.url = url
     self.user = user
     self.auth = basic_auth_header(user, password)
     self.crawler = crawler
     self.enabled = enabled
     self.maxbans = maxbans
     self.bans = 0
Example #8
 def __init__(self, crawler, splash_base_url, slot_policy):
     self.crawler = crawler
     self.splash_base_url = splash_base_url
     self.slot_policy = slot_policy
     self.splash_auth = None
     user = crawler.settings.get('SPLASH_USER')
     passwd = crawler.settings.get('SPLASH_PASS', '')
     if user:
         self.splash_auth = basic_auth_header(user, passwd)
Example #9
    def open_spider(self, spider):
        try:
            self.enabled = spider.use_hubproxy
            self.user = spider.hubproxy_user
            self.auth = basic_auth_header(spider.hubproxy_user, spider.hubproxy_pass)
        except AttributeError:
            pass

        if self.enabled:
            log.msg("Using hubproxy at %s (user: %s)" % (self.url, self.user),
                spider=spider)
Example #10
 def test_get_basic_auth(self):
     curl_command = 'curl "https://api.test.com/" -u ' \
                    '"some_username:some_password"'
     expected_result = {
         "method":
         "GET",
         "url":
         "https://api.test.com/",
         "headers": [("Authorization",
                      basic_auth_header("some_username", "some_password"))],
     }
     self._test_command(curl_command, expected_result)
Example #11
    def _assert_enabled(self,
                        spider,
                        settings=None,
                        proxyurl='http://proxy.zyte.com:8011',
                        proxyauth=basic_auth_header('apikey', ''),
                        maxbans=400,
                        download_timeout=190):
        crawler = self._mock_crawler(spider, settings)
        mw = self.mwcls.from_crawler(crawler)
        mw.open_spider(spider)
        req = Request('http://www.scrapytest.org')
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta.get('proxy'), proxyurl)
        self.assertEqual(req.meta.get('download_timeout'), download_timeout)
        self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
        res = self._mock_zyte_smartproxy_response(req.url)
        assert mw.process_response(req, res, spider) is res

        # disabled if 'dont_proxy=True' is set
        req = Request('http://www.scrapytest.org')
        req.meta['dont_proxy'] = True
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta.get('proxy'), None)
        self.assertEqual(req.meta.get('download_timeout'), None)
        self.assertEqual(req.headers.get('Proxy-Authorization'), None)
        res = self._mock_zyte_smartproxy_response(req.url)
        assert mw.process_response(req, res, spider) is res
        del req.meta['dont_proxy']

        if maxbans > 0:
            # assert ban count is reset after a successful response
            res = self._mock_zyte_smartproxy_response('http://ban.me',
                                                      status=self.bancode)
            assert mw.process_response(req, res, spider) is res
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            res = self._mock_zyte_smartproxy_response('http://unban.me')
            assert mw.process_response(req, res, spider) is res
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            self.assertEqual(mw._bans[None], 0)

        # check for not banning before maxbans for bancode
        for x in range(maxbans + 1):
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            res = self._mock_zyte_smartproxy_response(
                'http://ban.me/%d' % x,
                status=self.bancode,
                headers={'X-Crawlera-Error': 'banned'},
            )
            assert mw.process_response(req, res, spider) is res

        # max bans reached and close_spider called
        self.assertEqual(crawler.engine.fake_spider_closed_result,
                         (spider, 'banned'))
Example #12
 def update_request(request: Request, spider: Spider) -> Request:
     proxy = spider.settings.get("PROXY")
     proxy_auth = spider.settings.get("PROXY_AUTH")
     if proxy:
         if proxy_auth:
             request.headers["Proxy-Authorization"] = basic_auth_header(*proxy_auth.split(":"))
         if "http" not in proxy:
             proxy = "http://{}".format(proxy)
         request.meta["proxy"] = proxy
     else:
         raise RuntimeError("Proxy url is empty! Proxy is not working!")
     return request
Example #13
    def process_request(self, request, spider):
        """
        The request will be passed to the AutoExtract server only if the request
        is explicitly enabled with `{'autoextract': {'enabled': True}}` meta.
        The page type value must be also present, either in the
        AUTOEXTRACT_PAGE_TYPE option, or in `{'autoextract': {'pageType': '...'}}` meta.
        """
        if not self._is_enabled_for_request(request):
            return

        # If the request was already processed by AutoExtract
        if request.meta.get(AUTOEXTRACT_META_KEY):
            return

        if request.method != 'GET':
            raise AutoExtractError('Only GET requests are supported by AutoExtract')

        request.meta[AUTOEXTRACT_META_KEY] = {
            'original_url': request.url,
            'timing': {
                'start_ts': time.time()
            },
        }

        # Maybe over-write the page type value from the request
        page_type = self._check_page_type(request)
        logger.debug('Process AutoExtract request for %s URL %s',
                     page_type,
                     request,
                     extra={'spider': spider})

        # Define request timeout
        request.meta['download_timeout'] = self.timeout

        # Define concurrency settings
        self._set_download_slot(request, request.meta)

        payload = [{'url': request.url, 'pageType': page_type}]
        headers = Headers({
            'Content-Type': 'application/json',
            'Authorization': basic_auth_header(self._api_user, self._api_pass)
        })

        new_request = request.replace(
            url=self._api_url,
            method='POST',
            headers=headers,
            body=json.dumps(payload, sort_keys=True),
        )

        self.inc_metric('autoextract/request_count')
        return new_request
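
As the docstring above notes, only requests that opt in through their meta are rewritten. A minimal sketch of such a request (the URL and pageType value are illustrative; pageType may also come from the AUTOEXTRACT_PAGE_TYPE setting):

import scrapy

request = scrapy.Request(
    'https://example.com/some-product',
    meta={'autoextract': {'enabled': True, 'pageType': 'product'}},
)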
Example #14
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        request.meta['proxy'] = Auth.PROXY
        request.headers['Proxy-Authorization'] = basic_auth_header(
            Auth.PROXY_USERNAME, Auth.PROXY_PASSWORD)
Example #15
 def _configure_js(self, spec, settings):
     self._job_id = settings.get('JOB', '')
     self.js_enabled = False
     self.SPLASH_HOST = None
     if settings.get('SPLASH_URL'):
         self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
         self.js_enabled = spec.get('js_enabled', False)
     if self.js_enabled and (settings.get('SPLASH_PASS') is not None
                             or settings.get('SPLASH_USER') is not None):
         self.splash_auth = basic_auth_header(
             settings.get('SPLASH_USER', ''),
             settings.get('SPLASH_PASS', ''))
     self._filter_js_urls = self._build_js_url_filter(spec)
Example #16
 def _configure_js(self, spec, settings):
     self._job_id = settings.get('JOB', '')
     self.js_enabled = False
     self.SPLASH_HOST = None
     if settings.get('SPLASH_URL'):
         self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
         self.js_enabled = spec.get('js_enabled', False)
     if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                             settings.get('SPLASH_USER') is not None):
         self.splash_auth = basic_auth_header(
             settings.get('SPLASH_USER', ''),
             settings.get('SPLASH_PASS', ''))
     self._filter_js_urls = self._build_js_url_filter(spec)
Example #17
    def __init__(self,
                 name,
                 spec,
                 item_schemas,
                 all_extractors,
                 settings=None,
                 **kw):
        super(IblSpider, self).__init__(name, **kw)
        self._job_id = settings.get('JOB', '')
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'),
            key=itemgetter(0))

        self._templates = [templ for _, templ in self._item_template_pages]

        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            self.js_enabled = spec.get('js_enabled', False)
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None
                                or settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self._filter_js_urls = self._build_js_url_filter(spec)
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get(
            'allowed_domains', self._get_allowed_domains(self._templates))
        self.page_actions = spec.get('page_actions', [])
        if not self.allowed_domains:
            self.allowed_domains = None
Example #18
    def update_request(request: Request, spider: Spider) -> Request:
        if 'proxy' not in request.meta.keys():
            proxy = spider.settings.get("PROXY")
            proxy_auth = spider.settings.get("PROXY_AUTH")

            if not proxy:
                raise Exception('Proxy enabled but not configured')

            if proxy_auth:
                request.headers["Proxy-Authorization"] = basic_auth_header(*proxy_auth.split(":"))
            if "http" not in proxy:
                proxy = "http://{}".format(proxy)
            request.meta["proxy"] = proxy
            return request
Example #19
 def start_requests(self):
     urls = ['https://www.whatismyip.com/']
     for url in urls:
         yield scrapy.Request(
             url=url,
             callback=self.parse,
             meta={'proxy': 'https://gate.smartproxy.com:7000'
                   },  ## Your desired Endpoint
             headers={
                 'Proxy-Authorization':
                 basic_auth_header(
                     'username', 'password'
                 )  ## Your username and password for the proxy user
             })
Example #20
 def start_requests(self):
     yield SplashRequest(
         url='http://quotes.toscrape.com/js/',
         endpoint='execute',
         splash_headers={
             'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
         },
         args={
             'lua_source': self.LUA_SOURCE,
             'crawlera_user': self.settings['CRAWLERA_APIKEY'],
         },
         # tell Splash to cache the lua script, to avoid sending it for every request
         cache_args=['lua_source'],
     )
Example #21
    def _assert_enabled(self, spider,
                        settings=None,
                        proxyurl='http://proxy.crawlera.com:8010',
                        proxyauth=basic_auth_header('apikey', ''),
                        maxbans=400,
                        download_timeout=190):
        crawler = self._mock_crawler(spider, settings)
        mw = self.mwcls.from_crawler(crawler)
        mw.open_spider(spider)
        req = Request('http://www.scrapytest.org')
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta.get('proxy'), proxyurl)
        self.assertEqual(req.meta.get('download_timeout'), download_timeout)
        self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
        res = Response(req.url)
        assert mw.process_response(req, res, spider) is res

        # disabled if 'dont_proxy=True' is set
        req = Request('http://www.scrapytest.org')
        req.meta['dont_proxy'] = True
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta.get('proxy'), None)
        self.assertEqual(req.meta.get('download_timeout'), None)
        self.assertEqual(req.headers.get('Proxy-Authorization'), None)
        res = Response(req.url)
        assert mw.process_response(req, res, spider) is res
        del req.meta['dont_proxy']

        if maxbans > 0:
            # assert ban count is reset after a successful response
            res = Response('http://ban.me', status=self.bancode)
            assert mw.process_response(req, res, spider) is res
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            res = Response('http://unban.me')
            assert mw.process_response(req, res, spider) is res
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            self.assertEqual(mw._bans[None], 0)

        # check for not banning before maxbans for bancode
        for x in range(maxbans + 1):
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            res = Response(
                'http://ban.me/%d' % x,
                status=self.bancode,
                headers={'X-Crawlera-Error': 'banned'},
            )
            assert mw.process_response(req, res, spider) is res

        # max bans reached and close_spider called
        self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned'))
Example #22
    def start_requests(self):

        self.initProxy()
        proxyIP = [
            "https://107.172.80.209:4444", "https://107.175.235.86:4444",
            "https://149.20.244.136:4444", "https://152.44.107.127:4444",
            "https://199.34.83.177:4444", "https://104.202.30.219:4444",
            "https://107.172.225.111:4444", "https://107.175.229.254:4444"
        ]
        maxpage = 10
        self.d = open('goru.txt', 'wb')
        cookies = ''

        self.maxPage = 30
        self.initCsvFile()

        # url = 'https://www.loopnet.com/for-sale/hospitality/{}/?sk=8166f58f12eb275a5d8c99813f65a9ea'.format(i)
        self.fh = open("hello.txt", "wb")

        i = 1
        while i < maxpage:
            # print(url.format(i))

            time.sleep(random.randint(1, 5))
            # url = 'https://www.loopnet.com/for-sale/san-francisco-ca/hospitality-properties/{}/?bb=nmlx09n8zOwp6_w7B'.format(i)
            url = 'https://www.loopnet.com/for-sale/hospitality/{}/?sk=8166f58f12eb275a5d8c99813f65a9ea'.format(
                i)
            headers = {
                "Host": "www.loopnet.com",
                "Connection": "keep-alive",
                "Cache-Control": "max-age=0",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "DNT": "1",
                "Accept-Encoding": "gzip, deflate, sdch",
                "Accept-Language": "en-US,en;q=0.8"
            }

            req = scrapy.Request(url, callback=self.parse, headers=headers)
            i += 1

            t = random.randint(0, len(proxyIP) - 1)
            req.meta['proxy'] = proxyIP[t]
            req.headers['Proxy-Authorization'] = basic_auth_header(
                '2b37ecba9f', '4ojgLl8h')
            yield req
Example #23
    def _assert_enabled(
        self,
        spider,
        settings=None,
        proxyurl='http://proxy.scrapinghub.com:8010',
        proxyauth=basic_auth_header('user', 'pass'),
        bancode=503,
        maxbans=20,
        download_timeout=1800,
    ):
        crawler = self._mock_crawler(settings)
        mw = self.mwcls.from_crawler(crawler)
        mw.open_spider(spider)
        req = Request('http://www.scrapytest.org')
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta.get('proxy'), proxyurl)
        self.assertEqual(req.meta.get('download_timeout'), download_timeout)
        self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
        res = Response(req.url)
        assert mw.process_response(req, res, spider) is res

        # disabled if 'dont_proxy' is set
        req = Request('http://www.scrapytest.org')
        req.meta['dont_proxy'] = True
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta.get('proxy'), None)
        self.assertEqual(req.meta.get('download_timeout'), None)
        self.assertEqual(req.headers.get('Proxy-Authorization'), None)
        res = Response(req.url)
        assert mw.process_response(req, res, spider) is res

        if maxbans > 0:
            # assert ban count is reset after a successful response
            res = Response('http://ban.me', status=bancode)
            assert mw.process_response(req, res, spider) is res
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            res = Response('http://unban.me')
            assert mw.process_response(req, res, spider) is res
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)

        # check for not banning before maxbans for bancode
        for x in xrange(maxbans + 1):
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            res = Response('http://ban.me/%d' % x, status=bancode)
            assert mw.process_response(req, res, spider) is res

        # max bans reached and close_spider called
        self.assertEqual(crawler.engine.fake_spider_closed_result,
                         (spider, 'banned'))
Example #24
    def _assert_enabled(
        self,
        spider,
        settings=None,
        proxyurl="http://proxy.crawlera.com:8010?noconnect",
        proxyauth=basic_auth_header("user", "pass"),
        bancode=503,
        maxbans=20,
        download_timeout=1800,
    ):
        crawler = self._mock_crawler(settings)
        mw = self.mwcls.from_crawler(crawler)
        mw.open_spider(spider)
        req = Request("http://www.scrapytest.org")
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta.get("proxy"), proxyurl)
        self.assertEqual(req.meta.get("download_timeout"), download_timeout)
        self.assertEqual(req.headers.get("Proxy-Authorization"), proxyauth)
        res = Response(req.url)
        assert mw.process_response(req, res, spider) is res

        # disabled if 'dont_proxy' is set
        req = Request("http://www.scrapytest.org")
        req.meta["dont_proxy"] = True
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta.get("proxy"), None)
        self.assertEqual(req.meta.get("download_timeout"), None)
        self.assertEqual(req.headers.get("Proxy-Authorization"), None)
        res = Response(req.url)
        assert mw.process_response(req, res, spider) is res

        if maxbans > 0:
            # assert ban count is reset after a successful response
            res = Response("http://ban.me", status=bancode)
            assert mw.process_response(req, res, spider) is res
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            res = Response("http://unban.me")
            assert mw.process_response(req, res, spider) is res
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            self.assertEqual(mw._bans[None], 0)

        # check for not banning before maxbans for bancode
        for x in xrange(maxbans + 1):
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            res = Response("http://ban.me/%d" % x, status=bancode)
            assert mw.process_response(req, res, spider) is res

        # max bans reached and close_spider called
        self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, "banned"))
Example #25
 def start_requests(self):
     urls = ['https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[1,2,2019]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
             'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[20,11,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
             'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[1,8,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
             'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[17,5,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
             'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[18,4,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
             'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[15,3,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
             'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[27,11,2018]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
             'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[15,9,2017]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
             'https://www.iqvia.com/newsroom#t=Corporate&f:pdate=[5,5,2017]&f:ptype=[Article,News,Press%20release,Media%20coverage]',
             ]
     for url in urls:
         url = 'https://www.willistowerswatson.com/en/press#first={}&sort=%40displayz45xdate%20descending'
          request = SplashRequest(
              url=url,
              splash_headers={
                  'Authorization': basic_auth_header('535209af07354fbbb4110611b27f7504', ''),
              },
              args={'wait': 0.5, 'timeout': 15},
              headers={
                  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/201,00101 Firefox/62.0',
              },
              callback=self.parse)
         yield request
Example #26
 def start_requests(self):
     url = 'https://investorrelations.discover.com/newsroom/press-releases/default.aspx'
     request = SplashRequest(
         url=url,
         splash_headers={
             'Authorization':
             basic_auth_header('535209af07354fbbb4110611b27f7504', '')
         },
         args={'wait': 0.5},
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/201,00101 Firefox/62.0'
         },
         callback=self.parse)
     yield request
Example #27
 def process_request(self, request, spider):
     # Called for each request that goes through the downloader
     # middleware.
     # request.meta['proxy'] = "https://5.79.73.131:13200"
     # request.headers['Proxy-Authorization'] = basic_auth_header('*****@*****.**', 'xbyte123')
     request.meta['proxy'] = "http://zproxy.lum-superproxy.io:22225"
     request.headers['Proxy-Authorization'] = basic_auth_header(
         'lum-customer-xbyte-zone-zone_us-country-us', '0gi0pioy3oey')
     # Must either:
     # - return None: continue processing this request
     # - or return a Response object
     # - or return a Request object
     # - or raise IgnoreRequest: process_exception() methods of
     #   installed downloader middleware will be called
     return None
Example #28
 def process_request(self, request, spider):
     # Called for each request that goes through the downloader
     # middleware.
     # Must either:
     # - return None: continue processing this request
     # - or return a Response object
     # - or return a Request object
     # - or raise IgnoreRequest: process_exception() methods of
     #   installed downloader middleware will be called
     #request.meta['proxy'] = "https://134.122.17.137:8080"
     #proxy_user_pass = "******"
     #encoded_user_pass = base64.encodestring(proxy_user_pass)
     #request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
     proxy = random.choice(proxy_list)
     request.meta['proxy'] = 'http://'+ proxy+':6060'
     request.headers['Proxy-Authorization'] = basic_auth_header('*****@*****.**','Hotthdrn591!')
Example #29
 def spider_opened(self, spider):
     usr = getattr(spider, 'http_user', '')
     pwd = getattr(spider, 'http_pass', '')
     if usr or pwd:
         self.auth = basic_auth_header(usr, pwd)
         if not hasattr(spider, 'http_auth_domain'):
             warnings.warn(
                 'Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security '
                 'problems if the spider makes requests to several different domains. http_auth_domain '
                 'will be set to the domain of the first request, please set it to the correct value '
                 'explicitly.',
                 category=ScrapyDeprecationWarning)
             self.domain_unset = True
         else:
             self.domain = spider.http_auth_domain
             self.domain_unset = False
Example #30
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield SplashRequest(
                response.urljoin(next_page),
                splash_headers={
                    'Authorization': basic_auth_header(self.settings['APIKEY'], ''),
                },
            )
Example #31
 def start_requests(self):
     url = 'https://shopee.co.id/%F0%9F%92%95Celana-Pendek-Pria-i.8497368.57564317'
     yield SplashRequest(url, self.extract_headers,
                         args={
                             'wait': 10,
                             'images_enabled': False,
                             "timeout": 60,
                             'lua_source': self.LUA_SOURCE,
                             'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                         },
                         endpoint='render.har',  # optional; default is render.html
                         splash_headers={
                             'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
                         },
                         cache_args=['lua_source'],
                         )
Example #32
    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        super(IblSpider, self).__init__(name, **kw)
        self._job_id = settings.get('JOB', '')
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'), key=itemgetter(0))

        self._templates = [templ for _, templ in self._item_template_pages]

        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            self.js_enabled = spec.get('js_enabled', False)
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                                settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self._filter_js_urls = self._build_js_url_filter(spec)
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get(
            'allowed_domains',
            self._get_allowed_domains(self._templates)
        )
        self.page_actions = spec.get('page_actions', [])
        if not self.allowed_domains:
            self.allowed_domains = None
Example #33
 def start_requests(self):
     print "start_requests"
     #yield SplashRequest(url=parameters.starturl, endpoint='execute', args={'html': 1, 'png': 1, 'wait': parameters.wait, 'timeout': 300, 'lua_source': self.script}, )
     yield SplashRequest(
         url=parameters.starturl,
         endpoint='execute',
         splash_headers={
             'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
         },
         args={
             'lua_source': self.LUA_SOURCE,
             'crawlera_user': self.settings['CRAWLERA_APIKEY'],
             'timeout': 60,
         },
         # tell Splash to cache the lua script, to avoid sending it for every request
         cache_args=['lua_source'],
     )
Example #34
 def _configure_js(self, spec, settings):
     self.js_enabled = False
     self.SPLASH_HOST = None
     if settings.get('SPLASH_URL'):
         self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
         self.js_enabled = spec.get('js_enabled', False)
     if self.js_enabled and (settings.get('SPLASH_PASS') is not None
                             or settings.get('SPLASH_USER') is not None):
         self.splash_auth = basic_auth_header(
             settings.get('SPLASH_USER', ''),
             settings.get('SPLASH_PASS', ''))
     self.splash_wait = settings.getint('SPLASH_WAIT', 5)
     self.splash_timeout = settings.getint('SPLASH_TIMEOUT', 30)
     self.splash_js_source = settings.get('SPLASH_JS_SOURCE',
                                          'function(){}')
     self.splash_lua_source = settings.get('SPLASH_LUA_SOURCE', '')
     self._filter_js_urls = self._build_js_url_filter(spec)
Example #35
def _assert_enabled(spider,
                    settings=None,
                    url='http://quotes.toscrape.com',
                    api_url='autoextract.scrapinghub.com',
                    api_auth=basic_auth_header('apikey', '')):
    mw = _mock_mw(spider, settings)

    req = Request(url, meta=AUTOX_META)
    out = mw.process_request(req, spider)
    assert api_url in out.url
    assert out.meta['autoextract'].get('enabled')
    assert out.headers.get('Authorization') == api_auth

    resp = Response(out.url, request=out, body=b'[{}]')
    proc = mw.process_response(out, resp, spider)
    assert proc.meta['autoextract'].get('original_url') == url
    assert isinstance(proc.meta['autoextract'].get('article'), dict)
Example #36
 def start_requests(self):
     for place_id in self.place_ids:
         yield SplashRequest(
             url='https://www.google.com/maps/place/?q=place_id:' +
             place_id,
             endpoint='execute',
             splash_headers={
                 'Authorization':
                 basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
             },
             args={
                 'wait': 0.5,
                 'timeout': 60,
                 'lua_source': self.LUA_SOURCE,
             },
             cache_args=['lua_source'],
         )
Example #37
 def _configure_js(self, spec, settings):
     self.js_enabled = False
     self.SPLASH_HOST = None
     if settings.get('SPLASH_URL'):
         self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
         self.js_enabled = spec.get('js_enabled', False)
     if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                             settings.get('SPLASH_USER') is not None):
         self.splash_auth = basic_auth_header(
             settings.get('SPLASH_USER', ''),
             settings.get('SPLASH_PASS', ''))
     self.splash_wait = settings.getint('SPLASH_WAIT', 5)
     self.splash_timeout = settings.getint('SPLASH_TIMEOUT', 30)
     self.splash_js_source = settings.get(
         'SPLASH_JS_SOURCE', 'function(){}')
     self.splash_lua_source = settings.get('SPLASH_LUA_SOURCE', '')
     self._filter_js_urls = self._build_js_url_filter(spec)
Example #38
def _parse_headers_and_cookies(parsed_args):
    headers = []
    cookies = {}
    for header in parsed_args.headers or ():
        name, val = header.split(':', 1)
        name = name.strip()
        val = val.strip()
        if name.title() == 'Cookie':
            for name, morsel in SimpleCookie(val).items():
                cookies[name] = morsel.value
        else:
            headers.append((name, val))

    if parsed_args.auth:
        user, password = parsed_args.auth.split(':', 1)
        headers.append(('Authorization', basic_auth_header(user, password)))

    return headers, cookies
Example #39
    def __init__(self, proxy_list, proxy_username, proxy_password,
                 logstats_interval, stop_if_no_proxies,
                 max_proxies_to_try, backoff_base, backoff_cap, crawler):

        backoff = partial(exp_backoff_full_jitter, base=backoff_base, cap=backoff_cap)
        self.proxies = Proxies(self.cleanup_proxy_list(proxy_list),
                               backoff=backoff)
        if proxy_username and proxy_password:
            self.auth_header = basic_auth_header(proxy_username, proxy_password)
        else:
            self.auth_header = None
        self.logstats_interval = logstats_interval
        self.reanimate_interval = 5
        self.stop_if_no_proxies = stop_if_no_proxies
        self.max_proxies_to_try = max_proxies_to_try
        self.stats = crawler.stats

        self.log_task = None
        self.reanimate_task = None
Example #40
 def parse(self, response):
     # print("ok")
     # print(response.body)
     # self.d.write(response.body)
     # self.d.close()
     sel = Selector(response)
     results = sel.xpath(
         "//article[contains(@class, 'placard') and contains(@class, 'tier')]"
     ).extract()
     # print(results)
     for result in results:
         href = ''
         try:
             # href1 = result.xpath(".//header//h4/a/@href").extract()[0]
             href = result[result.find("https:"):result.find("/\',$event")]
             # print("href>>>>>>>"+href)
         except:
             continue
         listingid = self.getListingID(result)
         # print("lid>>>>>>>"+listingid)
         city = self.getCity(result)
         state = self.getState(result)
         zip1 = self.getZip(result)
         propertyType = self.getProperty(result)
         image = self.getImage(result)
         time.sleep(random.randint(1, 5))
         print("href>>>>>>>" + href)
         req = scrapy.Request(href,
                              callback=self.parsePage,
                              meta={
                                  'listingid': listingid,
                                  'city': city,
                                  'state': state,
                                  'zip': zip1,
                                  'propertyType': propertyType,
                                  'image': image
                              })
         t = random.randint(0, len(self.proxyIP) - 1)
         req.meta['proxy'] = self.proxyIP[t]
         req.headers['Proxy-Authorization'] = basic_auth_header(
             '2b37ecba9f', '4ojgLl8h')
         yield req
Example #41
 def _build_request(self, rule, link):
     print link.url
     #r = SplashRequest(url=link.url, endpoint='execute', callback=self._response_downloaded, args={'html': 1, 'png': 1, 'wait': parameters.wait, 'timeout': 300, 'lua_source': self.script})
     r = SplashRequest(
         url=link.url,
         endpoint='execute', 
         callback=self._response_downloaded,
         dont_filter=True,
         splash_headers={
             'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
         },
         args={
             'lua_source': self.LUA_SOURCE,
             'crawlera_user': self.settings['CRAWLERA_APIKEY'],
             'timeout': 60,
         },
         cache_args=['lua_source'],
     )
     r.meta.update(rule=rule, link_text=link.text)
     return r
Example #42
 def __init__(self, candidate_id, user, password):
     self.candidate_id, self.username, self.password = candidate_id, user, password
     self.report_url = 'https://stage-sc.consumerdirect.com/member/credit-report/3b/'
     self.login_url = 'https://stage-sc.consumerdirect.com/login/'
     self.auth = basic_auth_header(
         current_app.config['SMART_CREDIT_HTTP_USER'],
         current_app.config['SMART_CREDIT_HTTP_PASS'])
     self.headers = {
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,'
         'image/webp,image/apng,*/*;q=0.8,'
         'application/signed-exchange;v=b3',
         'Accept-Encoding':
         'gzip, deflate, br',
         'Accept-Language':
         'en-US,en;q=0.9,hi;q=0.8,ru;q=0.7',
         'Authorization':
         self.auth,
         'Connection':
         'keep-alive',
         'Content-Type':
         'application/x-www-form-urlencoded',
         'Host':
         'stage-sc.consumerdirect.com',
         'Origin':
         'https://stage-sc.consumerdirect.com',
         'Referer':
         'https://stage-sc.consumerdirect.com/login/',
         'Sec-Fetch-Mode':
         'navigate',
         'Sec-Fetch-Site':
         'same-origin',
         'Sec-Fetch-User':
         '******',
         'Upgrade-Insecure-Requests':
         '1',
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) '
         'AppleWebKit/537.36 (KHTML, like Gecko) '
         'Chrome/78.0.3904.97 Safari/537.36'
     }
Example #43
 def test_patch(self):
     curl_command = (
         'curl "https://example.com/api/fake" -u "username:password" -H "Ac'
         'cept: application/vnd.go.cd.v4+json" -H "Content-Type: applicatio'
         'n/json" -X PATCH -d \'{"hostname": "agent02.example.com",  "agent'
         '_config_state": "Enabled", "resources": ["Java","Linux"], "enviro'
         'nments": ["Dev"]}\''
     )
     expected_result = {
         "method": "PATCH",
         "url": "https://example.com/api/fake",
         "headers": [
             ("Accept", "application/vnd.go.cd.v4+json"),
             ("Content-Type", "application/json"),
             ("Authorization", basic_auth_header("username", "password")),
         ],
         "body": '{"hostname": "agent02.example.com",  "agent_config_state"'
                 ': "Enabled", "resources": ["Java","Linux"], "environments'
                 '": ["Dev"]}',
     }
     self._test_command(curl_command, expected_result)
Example #44
    def _read_settings(self, spider: Spider) -> None:
        settings = spider.crawler.settings
        if not settings.get("CRAWLERA_FETCH_APIKEY"):
            self.enabled = False
            logger.info("Crawlera Fetch API cannot be used without an apikey")
            return

        self.apikey = settings["CRAWLERA_FETCH_APIKEY"]
        self.apipass = settings.get("CRAWLERA_FETCH_APIPASS", "")
        self.auth_header = basic_auth_header(self.apikey, self.apipass)

        if settings.get("CRAWLERA_FETCH_URL"):
            self.url = settings["CRAWLERA_FETCH_URL"]

        self.download_slot_policy = settings.get(
            "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY", DownloadSlotPolicy.Domain)

        self.raise_on_error = settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR",
                                               True)

        self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {})
Example #45
    def _assert_enabled(self, spider,
                        settings=None,
                        proxyurl='http://proxy.scrapinghub.com:8010',
                        proxyauth=basic_auth_header('user', 'pass'),
                        bancode=503,
                        maxbans=20,
                        download_timeout=1800,
                       ):
        crawler = self._mock_crawler(settings)
        mw = self.mwcls.from_crawler(crawler)
        mw.open_spider(spider)
        req = Request('http://www.scrapytest.org')
        assert mw.process_request(req, spider) is None
        self.assertEqual(req.meta.get('proxy'), proxyurl)
        self.assertEqual(req.meta.get('download_timeout'), download_timeout)
        self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
        res = Response(req.url)
        assert mw.process_response(req, res, spider) is res

        if maxbans > 0:
            # assert ban count is reset after a successful response
            res = Response('http://ban.me', status=bancode)
            assert mw.process_response(req, res, spider) is res
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            res = Response('http://unban.me')
            assert mw.process_response(req, res, spider) is res
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)

        # check for not banning before maxbans for bancode
        for x in xrange(maxbans + 1):
            self.assertEqual(crawler.engine.fake_spider_closed_result, None)
            res = Response('http://ban.me/%d' % x, status=bancode)
            assert mw.process_response(req, res, spider) is res

        # max bans reached and close_spider called
        self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned'))
Example #46
    def process_feed_item(self, item, feed_params):
        
        # extract location name from the title
        re_match = re.search(r"[\s]near[\s](.*)$", item['title']) or re.search(r"[\s]in[\s](.*)$", item['title'])
        if re_match:
            location_name = re_match.group(1)
        else: 
            location_name = 'Unspecified'
                        
        dt = datetime.strptime (item['incident_datetime'], "%Y-%m-%d %H:%M:%S") # YYYY-MM-DD HH:MM:SS
        report = {
            'incident_title': item['title'],
            'incident_description': item['description'],
            'incident_date' :  dt.strftime('%m/%d/%Y'),  #  'MM/DD/YYYY'
            'incident_hour' :  dt.strftime('%I'),        # 01-12
            'incident_minute' :  dt.strftime('%M'),      # 00-59   
            'incident_ampm' :  dt.strftime('%p').lower(),  # am | pm
            'incident_category' : self.get_category(item, feed_params),
            'latitude' : item['lat'],
            'longitude' : item['lng'],
            'location_name' : location_name,
            'person_first' : 'SkyTruth Alerts',
            'person_last' : '',
            'person_email' : '*****@*****.**',
            }            
        params = {}
        
        api_version = feed_params.get('api_version', 'default')
        if api_version in ('default', 'oilspill'):
            params['auth_token'] = 'JDxEF83bd'
            params['task'] = 'report'
            params['incident_active'] = '1' 
            params['incident_alert_status'] = '1' 
        elif api_version in ('LABB',):
            params['task'] = 'reports'
#            params['task'] = 'report'
            params['action'] = 'edit'
            params['incident_active'] = '1' 
        else:
            self.log('Unknown API version specified: %s' % (api_version), log.ERROR)
            
    
        params = dict(params.items() + report.items())
        
        # retrieve task
        
        self.log('publishing item %s to Ushahidi API %s' % (item['id'], feed_params['api_url']), log.DEBUG)
    
        request = FormRequest (feed_params['api_url'], formdata=params,
        callback=self.submit_report_success,
        errback=self.error_callback,
        dont_filter=True)

        request.meta['report'] = report
        request.meta['item'] = item
        request.meta['feed_params'] = feed_params
        request.meta['dont_retry'] = True
        
        if feed_params.get ('http_user'):
            self.log('Authenticating with user: %s' % (feed_params.get ('http_user')), log.INFO)
            request.headers['Authorization'] = basic_auth_header(feed_params.get ('http_user'), feed_params.get ('http_password'))
        
        #yield request
        yield self.filter_request(request)
Example #47
 def get_proxyauth(self, spider):
     """Hook to compute Proxy-Authorization header by custom rules."""
     return basic_auth_header(self.user, getattr(self, 'pass'))
Example #48
 def get_proxyauth(self, spider):
     """Hook to compute Proxy-Authorization header by custom rules."""
     if self.apikey:
         return basic_auth_header(self.apikey, "")
     return basic_auth_header(self.user, getattr(self, "pass"))
Example #49
 def get_proxyauth(self, spider):
     """Hook to compute Proxy-Authorization header by custom rules."""
     return basic_auth_header(self.apikey, '')
Example #50
def request_authenticate(request, username, password):
    """Autenticate the given request (in place) using the HTTP basic access
    authentication mechanism (RFC 2617) and the given username and password
    """
    request.headers['Authorization'] = basic_auth_header(username, password)
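
A short usage sketch for the helper above; the URL and credentials are placeholders:

from scrapy import Request

req = Request('https://example.com/protected')
request_authenticate(req, 'someuser', 'somepass')
# req.headers['Authorization'] now carries the value produced by basic_auth_header()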
Example #51
 def spider_opened(self, spider):
     usr = getattr(spider, 'http_user', '')
     pwd = getattr(spider, 'http_pass', '')
     if usr or pwd:
         self.auth = basic_auth_header(usr, pwd)
Example #52
 def test_basic_auth_header_encoding(self):
     self.assertEqual(b'Basic c29tw6Z1c8Oocjpzw7htZXDDpHNz',
             basic_auth_header(u'somæusèr', u'sømepäss', encoding='utf8'))
     # default encoding (ISO-8859-1)
     self.assertEqual(b'Basic c29t5nVz6HI6c_htZXDkc3M=',
             basic_auth_header(u'somæusèr', u'sømepäss'))
Example #53
 def test_basic_auth_header(self):
     self.assertEqual(b'Basic c29tZXVzZXI6c29tZXBhc3M=',
             basic_auth_header('someuser', 'somepass'))
     # Check url unsafe encoded header
     self.assertEqual(b'Basic c29tZXVzZXI6QDx5dTk-Jm8_UQ==',
         basic_auth_header('someuser', '@<yu9>&o?Q'))
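
The expected values in these tests follow from the Basic access authentication scheme itself: the header is b'Basic ' plus the Base64-encoded 'user:password' pair. A small sketch reproducing the first assertion by hand:

import base64

# Hand-rolled equivalent of basic_auth_header('someuser', 'somepass').
creds = base64.b64encode(b'someuser:somepass')
assert b'Basic ' + creds == b'Basic c29tZXVzZXI6c29tZXBhc3M='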
Example #54
 def _authorization(self, spider):
     usr = getattr(spider, 'http_user', '')
     pwd = getattr(spider, 'http_pass', '')
     if usr or pwd:
         return basic_auth_header(usr, pwd)