Example #1
    def parse(self, response):
        allSubpages = []
        for target in response.css('a.cvplbd'):
            href = target.css('a::attr("href")').extract_first()
            if href:
                allSubpages.append(href)
        subpages = list(set(allSubpages))

        for page in subpages:
            token, agent = cfscrape.get_tokens(
                page, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
            yield scrapy.Request(url=page,
                                 cookies=token,
                                 headers={'User-Agent': agent},
                                 callback=self.subParse,
                                 dont_filter=True)

        #for page in subpages:
        #	if page is not None:
        #		response.follow(page, self.subParse)

        yield {"spacing": 'spacing'}

        next_page = response.css(
            'ul.pt-cv-pagination li.active + li a::attr("href")'
        ).extract_first()
        if next_page is not None:
            token, agent = cfscrape.get_tokens(
                start_url + next_page, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
            yield scrapy.Request(url=start_url + next_page,
                                 cookies=token,
                                 headers={'User-Agent': agent},
                                 callback=self.parse,
                                 dont_filter=True)
Example #2
    def process_response(self, request, response, spider):
        """Handle the a Scrapy response"""

        if not self.is_cloudflare_challenge(response):
            return response

        logger = logging.getLogger("cloudflaremiddleware")

        logger.debug(
            "Cloudflare protection detected on %s, trying to bypass...", response.url
        )

        cloudflare_tokens, __ = get_tokens(
            request.url, user_agent=spider.settings.get("USER_AGENT")
        )

        logger.debug(
            "Successfully bypassed the protection for %s, re-scheduling the request",
            response.url,
        )

        request.cookies.update(cloudflare_tokens)
        request.priority = 99999

        return request
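
The middleware above (Example #2) calls self.is_cloudflare_challenge(response), which is not part of the snippet. A minimal sketch of what such a check could look like for a Scrapy response, modeled on the challenge detection used in Examples #11 and #35 (HTTP 503, a "cloudflare" Server header, and the jschl_* challenge fields in the body); this is an assumption, not the original project's code:

    def is_cloudflare_challenge(self, response):
        """Heuristically detect a Cloudflare IUAM challenge page (sketch)."""
        return (
            response.status == 503
            and response.headers.get('Server', b'').startswith(b'cloudflare')
            and b'jschl_vc' in response.body
            and b'jschl_answer' in response.body
        )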
Example #3
def checkHaveibeenpwned(emails):
	print('[INFO]: Using https://haveibeenpwned.com/ to verify the security of your email...')
	url = 'https://haveibeenpwned.com/api/breachedaccount/{}'
	pasteaccount_url = 'https://haveibeenpwned.com/api/v2/pasteaccount/{}'
	headers = {
				'User-Agent': 'PwnChecker-API-Python-Script'
			}
	cookies, user_agent = cfscrape.get_tokens("https://haveibeenpwned.com/api/breachedaccount/[email protected]", user_agent=headers.get('User-Agent'))
	print('[Results]:')
	results = []
	for idx, email in enumerate(emails):
		res = requests.get(url.format(email), headers=headers, cookies=cookies, verify=True)
		if str(res.status_code) == '404':
			result = 'Account is safe, no breach records found.'
		elif str(res.status_code) == '200':
			result = 'Account is at risk: breach records were found, please change your password as soon as possible. Details:\n'
			res = requests.get(pasteaccount_url.format(email), headers=headers, cookies=cookies, verify=True)
			if str(res.status_code) == '200':
				# the paste-account endpoint returns a JSON array of paste records
				for entry in res.json():
					for key, value in entry.items():
						result += str(key) + ':' + str(value) + ';'
			else:
				result += 'Failed to fetch the details...'
		elif str(res.status_code) == '429':
			raise RuntimeError('Too many verification requests, please retry after %s seconds...' % str(res.headers['Retry-After']))
		elif str(res.status_code) == '503':
			raise RuntimeError('The request was blocked by CloudFlare, please make sure the user agent and cookies you are using are correct...')
		else:
			raise RuntimeError('An unknown error occurred during verification, please try re-running the program...')
		results.append([email, result])
		print('--[%d]: %s → %s' % (idx+1, email, result))
		time.sleep(1 + random.random() * 2)
	return results
Example #4
 def get_labels(self, session_cookie=None):
     if os.path.isfile('./tracer/data/labeled_accounts.json'):
         with open('./tracer/data/labeled_accounts.json') as json_file:
             return json.load(json_file)
     scraper = cfscrape.create_scraper()
     content = scraper.get(
         'https://etherscan.io/labelcloud').content.decode('utf-8')
     labels = re.compile(
         '<div class="dropdown-menu list-unstyled py-2 mb-0 w-100 font-size-base" aria-labelledby="(.+?)"><a class="py-1 px-3 d-block" href="(.+?)">'
     ).findall(content)
     account_labels = []
     for label in labels:
         if 'accounts' in label[1]:
             account_labels.append(label)
     print("Found " + str(len(labels)) + " labels.")
     print(str(len(account_labels)) + " labels related to accounts.")
     categories = []
     labeled_accounts = {}
     for label in account_labels:
         url = 'https://etherscan.io/' + label[1]
         cookies, user_agent = cfscrape.get_tokens(url)
         cookies['ASP.NET_SessionId'] = session_cookie
         headers = {'User-Agent': user_agent}
         page_count = 1
         accounts = []
         accounts_extracted = []
         total = 0
         while accounts_extracted or page_count == 1:
             content = requests.get(url + '/' + str(page_count),
                                    cookies=cookies,
                                    headers=headers).text
             if total == 0:
                 total = int(
                     re.compile('A total of (.+?) account').findall(content)
                     [0].replace(',', ''))
             accounts_extracted = re.compile(
                 '<tr><td>.*?<a href=\'.+?\'>(.+?)</a>.*?</td><td>(.*?)</td><td>.+?</td><td>.+?</td></tr>'
             ).findall(content)
             accounts += accounts_extracted
             page_count += 1
         print("Extracted for '" + label[0] + "' " + str(len(accounts)) +
               " accounts out of " + str(total))
         for account in accounts:
             address = account[0]
             if address not in labeled_accounts:
                 labeled_accounts[address] = {"labels": [], "category": ""}
             account_label = account[1]
             if account_label != '' and account_label not in labeled_accounts[
                     address]["labels"]:
                 labeled_accounts[address]["labels"].append(account_label)
             category = label[0]
             if category and labeled_accounts[address]["category"] == "":
                 labeled_accounts[address]["category"] = category
             if category not in categories:
                 categories.append(category)
     with open('./tracer/data/labeled_accounts.json', 'w') as jsonfile:
         json.dump(labeled_accounts, jsonfile)
     with open('./tracer/data/categories.json', 'w') as jsonfile:
         json.dump(categories, jsonfile)
     return labeled_accounts
Example #5
def start_requests(self):
    for url in self.start_urls:
        token, agent = cfscrape.get_tokens(
            url,
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 RuxitSynthetic/1.0 v6870249674 t38550 ath9b965f92 altpub cvcv=2, _optional_'
        )
        yield Request(url=url, cookies=token, headers={'User-Agent': agent})
Example #6
def get_website(url, render=False):
    import asyncio
    import cfscrape
    from requests_html import HTMLSession

    session = HTMLSession(mock_browser=True)
    requests_ua = session.headers['User-Agent']
    cf_scraper = cfscrape.create_scraper()

    # Run a simple fetch
    response = session.get(url)
    if cf_scraper.is_cloudflare_challenge(response):
        # Re-fetch using cfscrape
        try:
            tokens, _ = cfscrape.get_tokens(url, user_agent=requests_ua)
        except ValueError:
            # Presumably occurs when the website does not have cloudflare enabled
            pass
        else:
            response = session.get(url, cookies=tokens)

    if render:
        response.html.render(sleep=8)
    return response.html.html
Example #7
def ekonga(query):
    url = "https://www.konga.com/search?search=" + query
    token, agent = cfscrape.get_tokens(
        url,
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 RuxitSynthetic/1.0 v6870249674 t38550 ath9b965f92 altpub cvcv=2, _optional_'
    )
    k_response = requests.get(url=url,
                              cookies=token,
                              headers={'User-Agent': agent})
    k_soup = BeautifulSoup(k_response.text, 'html.parser')
    script = k_soup.find_all("script", {"id": "__NEXT_DATA__"})
    needed = script[0]
    done = json.loads(needed.contents[0])
    konga_data = done["props"]["initialProps"]["pageProps"]["resultsState"][
        "content"]['_rawResults'][0]['hits']
    for x in konga_data:
        if 'Accessories'.lower() not in str(
                x['category'][1]['category_name'].lower()):
            title = x['name']
            img = "https://www-konga-com-res.cloudinary.com/w_auto,f_auto,fl_lossy,dpr_auto,q_auto/media/catalog/product" + x[
                "image_thumbnail_path"]
            link = "https://www.konga.com/product/" + x['url_key']
            price = x['price']

    return title, img, link, price
Example #8
 def start_requests(self):
     start_urls = [
         'https://www.pracuj.pl/praca/warszawa;wp/it%20-%20administracja;cc,5015?rd=30'
     ]
     for url in start_urls:
         token, agent = cfscrape.get_tokens(url, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36')
         yield scrapy.Request(url=url, cookies=token, headers={'User-Agent': agent})
Example #9
    def parse(self, response):
        items = response.xpath(
            '//tr[@class="odd"]//td[position() mod 2 = 1]/a/@href').extract()
        data = [{}]
        test = []
        # db.Chats_FWM.insert_one({
        #     "chat_name": update.message.chat_id,
        #     "message": message,
        #     "type": 'farewell'
        # })
        for item in items:
            # data.append({"title": item.split("/Manga/")[1],
            #             "url": item})
            db.mangas.insert_one({
                "title": item.split("/Manga/")[1],
                "url": item
            })

        # with open('mangas__1.json', 'a') as outfile:
        #     json.dump(data, outfile, indent=4)
        next_page = response.css('div.pagination a::attr(href)').extract()
        page_counter = int(re.search(r'\d+', next_page[-1]).group())
        for url in self.start_urls:
            token, agent = cfscrape.get_tokens(
                url, 'Your preferable user agent, _optional_')
            for page in range(2, page_counter):
                url_second = '%s/MangaList?page=%s' % (url, page)
                yield Request(url_second,
                              cookies=token,
                              headers={'User-Agent': agent})
Example #10
 def __init__(self,
              url,
              output_dir=None,
              output_format=None,
              username_format='full',
              domain='',
              gophish_url=None,
              gophish_api_key=None):
     self.url = url
     self.scraper = cfscrape.create_scraper(delay=10)
     try:
         self.tokens, self.user_agent = cfscrape.get_tokens(url,
                                                            proxies=proxies,
                                                            verify=False)
     except Exception as e:
         click.secho(
             f'[!] failed to retrieve the page to scrape ({str(e)})... exiting.',
             fg='red')
         sys.exit(-1)
     self.output_dir = output_dir
     self.output_format = output_format
     self.username_format = username_format
     self.domain = domain
     self.output_handler = OutputHandler(output_dir, domain,
                                         username_format, output_format,
                                         gophish_url, gophish_api_key)
Example #11
    def cloudflare(resp, **kwargs):
        """
        Bypass CloudFlare's anti-bot protection.
        """
        def is_cloudflare_challenge(resp):
            """Check if the response is a Cloudflare challange.
            Source: goo.gl/v8FvnD
            """
            return (resp.status_code == 503
                    and resp.headers.get('Server', '').startswith('cloudflare')
                    and b'jschl_vc' in resp.content
                    and b'jschl_answer' in resp.content)

        if is_cloudflare_challenge(resp):
            sickrage.app.log.debug(
                'CloudFlare protection detected, trying to bypass it')

            # Get the session used or create a new one
            session = getattr(resp, 'session', requests.Session())

            # Get the original request
            original_request = resp.request

            # Get the CloudFlare tokens and original user-agent
            tokens, user_agent = cfscrape.get_tokens(original_request.url)

            # Add CloudFlare tokens to the session cookies
            session.cookies.update(tokens)

            # Add CloudFlare Tokens to the original request
            original_cookies = dict_from_cookiejar(original_request._cookies)
            original_cookies.update(tokens)
            original_request.prepare_cookies(original_cookies)

            # The same User-Agent must be used for the retry
            # Update the session with the CloudFlare User-Agent
            session.headers['User-Agent'] = user_agent

            # Update the original request with the CloudFlare User-Agent
            original_request.headers['User-Agent'] = user_agent

            # Remove hooks from original request
            original_hooks = original_request.hooks
            original_request.hooks = session.hooks

            # Resend the request
            cf_resp = session.send(original_request,
                                   allow_redirects=True,
                                   **kwargs)

            if cf_resp.ok:
                sickrage.app.log.debug('CloudFlare successfully bypassed.')

            # Add original hooks back to original request
            cf_resp.hooks = original_hooks

            return cf_resp
        else:
            return resp
Example #12
 def start_requests(self):
     cf_requests = []
     for url in self.start_urls:
         token, agent = cfscrape.get_tokens(
             url, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
         cf_requests.append(
             Request(url=url, cookies=token, headers={'User-Agent': agent}))
     return cf_requests
Example #13
 def start_requests(self):
     cf_requests = []
     for url in self.start_urls:
         token, agent = cfscrape.get_tokens(
             url, 'Your preferable user agent, _optional_')
         cf_requests.append(
             Request(url=url, cookies=token, headers={'User-Agent': agent}))
     return cf_requests
Example #14
    def start_requests(self):
        """
        https://stackoverflow.com/questions/33247662/how-to-bypass-cloudflare-bot-ddos-protection-in-scrapy
        """

        for url in self.start_urls:
            token, agent = cfscrape.get_tokens(url, self.user_agent)
            yield Request(url, headers={'User-Agent': agent}, cookies=token)
Example #15
    def start_requests(self):
        DailyJobs.resetCollectionToStoreNewData()

        user_agent = UserAgent().random
        scraperSites = ScrapingStructure.getStructureJobs()

        for site in scraperSites:
            if site['enabled']:
                if site['needJs']:
                    if site['needIUAM']:
                        token, agent = cfscrape.get_tokens(
                            site['url'], user_agent)
                        yield SplashRequest(
                            url=site['url'],
                            callback=ScrapingSiteJobsHelper.parseDataBySite,
                            args={'lua_source': site["script"]},
                            endpoint='execute',
                            meta={"site": site},
                            cookies=token,
                            headers={'User-Agent': agent})
                    else:
                        yield SplashRequest(
                            url=site['url'],
                            callback=ScrapingSiteJobsHelper.parseDataBySite,
                            args={
                                'lua_source': site["script"],
                                'customData': site["customData"]
                            },
                            endpoint='execute',
                            meta={"site": site})
                else:
                    if site['needIUAM']:
                        token, agent = cfscrape.get_tokens(
                            site['url'], user_agent)
                        yield SplashRequest(
                            url=site['url'],
                            callback=ScrapingSiteJobsHelper.parseDataBySite,
                            meta={"site": site},
                            cookies=token,
                            headers={'User-Agent': agent})
                    else:
                        yield SplashRequest(
                            url=site['url'],
                            callback=ScrapingSiteJobsHelper.parseDataBySite,
                            meta={"site": site})
Example #16
 def start_requests(self):
     url = self.base_url + "browse?order=added"
     token, agent = cfscrape.get_tokens(url=url)
     self.token = token
     self.agent = agent
     yield scrapy.Request(url=url,
                          callback=self.parse,
                          cookies=token,
                          headers={'User-Agent': agent})
Example #17

 def start_requests(self):
   cf_requests = []
   for url in self.start_urls:
       token, agent = cfscrape.get_tokens(url, USER_AGENT)
       #token, agent = cfscrape.get_tokens(url)
       cf_requests.append(scrapy.Request(url=url, cookies={'__cfduid': token['__cfduid']}, headers={'User-Agent': agent}))
       print "useragent in cfrequest: " , agent
       print "token in cfrequest: ", token
   return cf_requests
Example #18
File: obb_cli.py  Project: vin01/tidbits
    def search(self, domains, raw=False, payload=False):
        """Return the search results from OBB for specified domain(s).

        domains : tuple
            Domain(s) to search. Either python list, tuple format or just comma separated values.
        raw: bool
            Print output in raw format with all fields.
        payload: bool
           Print payload info as well from the vulnerability report page(s) for unpatched vulnerabilities.
        """
        try:
            if isinstance(domains, str):
                domains=domains.split(',')
            for domain in domains:
                req = requests.get(
                    OBB_URL, params='search=%s&type=host' % (domain),
                    headers=headers
                )
                soup = BeautifulSoup(req.content, 'html.parser')
                data_table = soup.find(
                    'table', attrs={'class': 'latest-submissions-main-top'})
                if not data_table:
                    return "No results found."
                rows = data_table.find_all('tr')
                cookies = {}
                for row in rows:
                    cols = row.find_all('td')
                    link = cols[0].find('a')
                    if link:
                        href = "https://openbugbounty.org%s" % (
                            link.get('href'))
                    else:
                        href = "Report URL"
                    cols = [ele.text.strip() for ele in cols]
                    if raw:
                        print(cols)
                    else:
                        print('%-20s%-15s%-25s%-30s' %
                              (cols[0], cols[3], cols[4], href))
                    if cols[3] == "unpatched" and payload:
                        if not cookies:
                            tokens = cfscrape.get_tokens(href)
                            cookies = tokens[0]
                            headers['User-Agent']=tokens[1]
                        payload_req = requests.get(
                            href,
                            headers=headers,
                            cookies=cookies
                        )
                        payload_soup = BeautifulSoup(
                            payload_req.content, 'html.parser')
                        text_areas = payload_soup.find_all('textarea')
                        for text_area in text_areas:
                            print(text_area.text.strip(), end="\n\n")
        except requests.exceptions.RequestException as error:
            print(error)
Example #19
def use_cf(site_obj):
    try:
        import cfscrape
        url_info = urlparse(site_obj['checkin'])
        domain_url = url_info.scheme + '://' + url_info.netloc
        cookie_value, user_agent = cfscrape.get_tokens(domain_url, proxies=PROXIES)
        HEADERS['user-agent'] = user_agent
        COOKIES.update(cookie_value)
    finally:
        ...
Example #20
	def start_requests(self):
		cf_requests = []
		for url in self.start_urls:
			token, agent = cfscrape.get_tokens(url, "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36")
			
			
			cf_requests.append(scrapy.Request(url=url,
				cookies=token,
				headers={'User-Agent': agent}))
			return cf_requests
Example #21
    def start_requests(self):
        """Solve the "Cloudflare" challenge and start the crawling"""

        try:
            self.cloudflare_token, user_agent = cfscrape.get_tokens(
                'http://%s/' % self.allowed_domains[0],
                user_agent=self.settings.get('USER_AGENT'))
        except Exception:
            raise BookScrapeException(
                'Unable to bypass "cloudflare" antibot protection')
Example #22
    def get_html(self, url):
        """Make a request and get the html (text) from the response"""
        token, agent = cfscrape.get_tokens(url=url)
        response = requests.get(url,
                                headers={'User-Agent': agent},
                                cookies=token)

        if response.status_code != 200:
            raise requests.exceptions.HTTPError

        return response.text
Example #23
 def process_response(self, request, response, spider):
     if response.status == 503:
         try:
             token,user_agent = get_tokens(request.url,user_agent=request.headers['user-agent'])
         except Exception:
             raise IgnoreRequest
         return Request(url=request.url,cookies=token, dont_filter=True,
                         headers={'user-agent':user_agent}, 
                         meta={'ddos_token':token},
                         callback=request.callback)
     return response
Example #24
 def first_parse(self, response):
     next_url = response.xpath('//div[@class="pager"]//a[@title="Next"]/@href').extract()[0]
      print(next_url)
     yield scrapy.Request( next_url,
               cookies=self.token,
               headers={'User-Agent': self.agent}, callback=self.next_parse)
     for product in response.xpath('//a[contains(@class, "product-image")]/@href').extract():
         token, agent = cfscrape.get_tokens(product)
         yield scrapy.Request(product,
               cookies=token,
               headers={'User-Agent': agent},
               callback=self.parse_items)
Example #25

    def run(self):
        print("Collecting requested resources to run the Plugin Resolver by!")
        if not os.path.exists(os.path.expanduser(self.output_folder)):
            try:
                os.makedirs(os.path.expanduser(self.output_folder))
            except OSError:
                print("Unable to create directory: %s" % self.output_folder)
                return

        # Change the working directory to the requested
        # Folder to save plugins in.
        with ChangeDir(os.path.expanduser(self.output_folder)):
            print("Loading Resource information")
            tokens, user_agent = cfscrape.get_tokens('http://www.spigotmc.org')
            # First, iterate through all the bukkit plugins to resolve
            # and begin downloading them.
            print("Retrieving Bukkit Resources")
            for plugin, data in self.bukkit_resources.items():
                resource = data['resource']
                version = data['version']
                download_url = resource.get_download_link(version=version)
                file_name = resource.get_versioned_file_name(version=version)

                try:
                    download(file_name, download_url, tokens, user_agent)
                    print("Downloaded plugin %s to %s" % (resource.plugin_name, file_name))

                except FileNotFoundError:
                    print("Unable to download resource %s from %s" % (resource.plugin_name, download_url))

            print("Retrieving Spigot Resources")
            for plugin, data in self.spigot_resources.items():
                resource = data['resource']
                version = data['version']
                name = data['name']
                download_url = resource.get_download_link(version=version)
                requested_version = resource.version if version == "latest" else version

                file_name = "%s-%s%s" % (name, requested_version, resource.file_type)
                try:
                    download(file_name, download_url, tokens, user_agent)
                    print("Downloaded plugin %s to %s" % (resource.name, file_name))
                except FileNotFoundError:
                    print("Unable to download resource %s from %s" % (resource.name, download_url))

        print("Beginning configuration generation!")
        self.generate_plugin_configuration()
        # Cleanup the access data retrieved by the plugin!

        print("Cleaning the trash!")
        self.__cleanup()
        print("Finished Operations! Resolution complete!")
Example #26
    def start_requests(self):
        cf_requests = []
        user_agent = self.ua.random
        self.logger.info("RANDOM user_agent = %s", user_agent)
        for url in self.start_urls:
            token , agent = cfscrape.get_tokens(url,user_agent)
            self.logger.info("token = %s", token)
            self.logger.info("agent = %s", agent)

            cf_requests.append(scrapy.Request(url=url,
                                              cookies= token,
                                              headers={'User-Agent': agent}))
        return cf_requests
Example #27
	def start_requests(self):
		self.is_updated=False
		urls=[
			"http://nhadat24h.net/ban-bat-dong-san-viet-nam-nha-dat-viet-nam-s686599/",
			"http://nhadat24h.net/cho-thue-nha-dat-bat-dong-san-tai-viet-nam-nha-dat-tai-viet-nam-s686588/"
		]
		token, agent = cfscrape.get_tokens("http://nhadat24h.net")
		self.token=token
		self.agent=agent
		for url in urls:
			yield scrapy.Request(url=url,callback=self.parse,
				cookies=token,
				headers={'User-Agent':agent})
Example #28
 def __addDDOSBypass(self, exchangeName):
     """
     adding async cloudflare scrapper
     from aiocfscrape import CloudflareScraper
     exchange.session = CloudflareScraper(loop=asyncio.get_event_loop())
     """
     #bypassing cloudflare with cookies
     url = self.__exchanges[exchangeName].urls['www']
     tokens, user_agent = cfscrape.get_tokens(url)
     self.__exchanges[exchangeName].headers = {
         'cookie': '; '.join([key + '=' + tokens[key] for key in tokens]),
         'user-agent': user_agent,
     }
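
The docstring in Example #28 points at an async alternative based on aiocfscrape instead of cookie injection. A rough sketch of that approach, assuming aiocfscrape's CloudflareScraper can be used as a drop-in aiohttp ClientSession (check the library's README for the exact API):

    import asyncio
    from aiocfscrape import CloudflareScraper

    async def fetch(url):
        # CloudflareScraper solves the IUAM challenge transparently while fetching
        async with CloudflareScraper() as session:
            async with session.get(url) as resp:
                return await resp.text()

    # html = asyncio.get_event_loop().run_until_complete(fetch('https://example.com/'))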
Example #29
File: hooks.py  Project: steflavoie/Medusa
def cloudflare(resp, **kwargs):
    """
    Bypass CloudFlare's anti-bot protection.

    A response hook that retries a request after bypassing CloudFlare anti-bot
    protection.  Use the sessioned hook factory to attach the session to the
    response to persist CloudFlare authentication at the session level.
    """
    if all([resp.status_code == 503,  # Service unavailable
            resp.headers.get('server') == u'cloudflare-nginx', ]):

        log.debug(u'CloudFlare protection detected, trying to bypass it')

        # Get the session used or create a new one
        session = getattr(resp, 'session', requests.Session())

        # Get the original request
        original_request = resp.request

        # Avoid recursion by removing the hook from the original request
        original_request.hooks['response'].remove(cloudflare)

        # Get the CloudFlare tokens and original user-agent
        tokens, user_agent = cfscrape.get_tokens(original_request.url)

        # Add CloudFlare tokens to the session cookies
        session.cookies.update(tokens)
        # Add CloudFlare Tokens to the original request
        original_cookies = dict_from_cookiejar(original_request._cookies)
        original_cookies.update(tokens)
        original_request.prepare_cookies(original_cookies)

        # The same User-Agent must be used for the retry
        # Update the session with the CloudFlare User-Agent
        session.headers['User-Agent'] = user_agent
        # Update the original request with the CloudFlare User-Agent
        original_request.headers['User-Agent'] = user_agent

        # Resend the request
        cf_resp = session.send(
            original_request,
            allow_redirects=True,
            **kwargs
        )

        if cf_resp.ok:
            log.debug('CloudFlare successfully bypassed.')
        return cf_resp
    else:
        return resp
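
The docstring in Example #29 mentions a "sessioned hook factory" that attaches the session to each response so the cloudflare hook can reuse it; the factory itself is not shown. A minimal sketch of the idea (the factory name and wiring are illustrative, not Medusa's actual code):

    import requests

    def sessioned(session):
        """Response hook that exposes the owning session to later hooks."""
        def attach_session(resp, **kwargs):
            resp.session = session
            return resp
        return attach_session

    session = requests.Session()
    # attach_session must run before cloudflare so resp.session is available
    session.hooks['response'] = [sessioned(session), cloudflare]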
Example #30

    def start_requests(self):
        user_agent = UserAgent().random
        proxy = random.choice(self.proxy_list)
        url = "https://untappd.com/search?q=*&type=beer"
        token, agent = cfscrape.get_tokens(
            url,
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'
        )
        request = Request(url=url,
                          cookies={'__cfduid': token['__cfduid']},
                          headers={'User-Agent': agent},
                          callback=self.parse_beer,
                          meta={'proxy': proxy})

        yield request
Example #31
    def process_response(self, request, response, spider):
        """If we can identify a CloudFlare check on this page then use cfscrape to get the cookies"""

        # If this is not a CloudFlare page then no processing is needed
        if not self.is_cloudflare(response):
            return response

        # Otherwise try to retrieve the cookie using cfscrape
        spider.logger.info('Cloudflare protection detected on {}, trying to bypass...'.format(response.url))
        cloudflare_tokens, _ = get_tokens(request.url, user_agent=spider.settings.get('USER_AGENT'))
        spider.logger.info('Obtained CloudFlare tokens for {}, re-scheduling the request'.format(response.url))

        # Add the cookies to the request and reschedule this request for later
        request.cookies.update(cloudflare_tokens)
        request.priority = 99999
        return request
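
For a downloader middleware like the ones in Examples #2 and #31 to run at all, it has to be enabled in the project's Scrapy settings; the module path and priority below are placeholders, and USER_AGENT is the setting the middleware reads via spider.settings.get('USER_AGENT'):

    # settings.py (module path and priority are illustrative)
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'

    DOWNLOADER_MIDDLEWARES = {
        'myproject.middlewares.CloudflareMiddleware': 560,
    }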
Example #32
    def _setup_cookie(self, search_uri, webscraper):
        '''

        :param search_uri:
        :param webscraper:
        :return:
        '''
        cookie = {}
        headers = {'User-Agent': str(UserAgent().random)}
        try:
            if webscraper.cloudflare_cookie:
                # TODO: resolve the non-anonymous connection issue
                cookie, user_agent = cfscrape.get_tokens(
                    search_uri, headers['User-Agent'])
                self.logger.info(
                    '{0} Retrieving Cloudflare Cookie: \n{1}'.format(
                        webscraper.name, cookie))
                return cookie, headers

            elif webscraper.thread_defense_bypass_cookie:
                # TODO: resolve the non-anonymous connection issue
                response = requests.get(search_uri,
                                        verify=True,
                                        headers=headers)
                if response.history:
                    self.logger.debug0('{0} Request Was Redirected:'.format(
                        webscraper.name))
                    for resp in response.history:
                        self.logger.debug(
                            '{0} Response: [ Status Code: {1} ] from [ {2} ]'.
                            format(webscraper.name, resp.status_code,
                                   resp.url))

                    self.logger.debug0(
                        '{0} Final Destination [ Status Code: [ {1} ] from [ {2} ]'
                        .format(webscraper.name, response.status_code,
                                response.url))
                    # thread_defense_bypass = ThreatDefenceBypass()
                    # cookie =  thread_defense_bypass(url=response.url)
                return cookie, headers
            else:
                return cookie, headers
        except Exception as err:
            raise ScraperEngineCookieError(webscraper.name, err,
                                           traceback.format_exc())
Example #33
 def start_requests(self):
     urls = [
         # 'http://quotes.toscrape.com/',
         'https://www.directliquidation.com/electronics/?s=&idx=dl_prod_posts_product_end_date_ts_asc&page=10',
     ]
     cf_requests = []
     for url in urls:
         token, agent = cfscrape.get_tokens(
             url,
             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like asdlkjqwdj) Chrome/16.0.912.36 Safari/535.7'
         )
         print("The token is ", token)
         print("The user agent is", agent)
         cf_requests.append(
             scrapy.Request(url=url,
                            cookies=token,
                            headers={'User-Agent': agent}))
     return cf_requests
Example #34
def cloudflare(session, resp, **kwargs):
    """
    Bypass CloudFlare's anti-bot protection.

    A request handler that retries a request after bypassing CloudFlare anti-bot
    protection.
    """
    if is_cloudflare_challenge(resp):

        log.debug(u'CloudFlare protection detected, trying to bypass it')

        # Get the original request
        original_request = resp.request

        # Get the CloudFlare tokens and original user-agent
        tokens, user_agent = cfscrape.get_tokens(original_request.url)

        # Add CloudFlare tokens to the session cookies
        session.cookies.update(tokens)
        # Add CloudFlare Tokens to the original request
        original_cookies = dict_from_cookiejar(original_request._cookies)
        original_cookies.update(tokens)
        original_request.prepare_cookies(original_cookies)

        # The same User-Agent must be used for the retry
        # Update the session with the CloudFlare User-Agent
        session.headers['User-Agent'] = user_agent
        # Update the original request with the CloudFlare User-Agent
        original_request.headers['User-Agent'] = user_agent

        # Resend the request
        kwargs = filtered_kwargs(kwargs)
        kwargs['allow_redirects'] = True
        cf_resp = session.send(
            original_request,
            **kwargs
        )
        cf_resp.raise_for_status()

        if cf_resp.ok:
            log.debug('CloudFlare successfully bypassed.')
        return cf_resp
    else:
        return resp
Example #35
    def cloudflare(session, resp, **kwargs):
        """
        Bypass CloudFlare's anti-bot protection.
        """

        def filtered_kwargs(kwargs):
            """Filter kwargs to only contain arguments accepted by `requests.Session.send`."""
            return {
                k: v for k, v in kwargs.items()
                if k in ('stream', 'timeout', 'verify', 'cert', 'proxies', 'allow_redirects')
            }

        def is_cloudflare_challenge(resp):
            """Check if the response is a Cloudflare challange.
            Source: goo.gl/v8FvnD
            """
            return (
                    resp.status_code == 503
                    and resp.headers.get('Server', '').startswith('cloudflare')
                    and b'jschl_vc' in resp.content
                    and b'jschl_answer' in resp.content
            )

        if is_cloudflare_challenge(resp):
            sickrage.app.log.debug('CloudFlare protection detected, trying to bypass it')

            # Get the original request
            original_request = resp.request

            # Get the CloudFlare tokens and original user-agent
            tokens, user_agent = cfscrape.get_tokens(original_request.url)

            # Add CloudFlare tokens to the session cookies
            session.cookies.update(tokens)

            # Add CloudFlare Tokens to the original request
            original_cookies = dict_from_cookiejar(original_request._cookies)
            original_cookies.update(tokens)
            original_request.prepare_cookies(original_cookies)

            # The same User-Agent must be used for the retry
            # Update the session with the CloudFlare User-Agent
            session.headers['User-Agent'] = user_agent

            # Update the original request with the CloudFlare User-Agent
            original_request.headers['User-Agent'] = user_agent

            # Remove hooks from original request
            original_hooks = original_request.hooks
            original_request.hooks = []

            # Resend the request
            kwargs['allow_redirects'] = True
            cf_resp = session.send(original_request, **filtered_kwargs(kwargs))

            if cf_resp.ok:
                sickrage.app.log.debug('CloudFlare successfully bypassed.')

            # Add original hooks back to original request
            cf_resp.hooks = original_hooks

            return cf_resp
        else:
            return resp