def parse(self, response):
    allSubpages = []
    for target in response.css('a.cvplbd'):
        if target.css('a::attr("href")').extract_first():
            allSubpages.append(target.css('a::attr("href")').extract_first())
    subpages = list(set(allSubpages))

    for page in subpages:
        token, agent = cfscrape.get_tokens(
            page, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
        yield scrapy.Request(url=page,
                             cookies=token,
                             headers={'User-Agent': agent},
                             callback=self.subParse,
                             dont_filter=True)

    # for page in subpages:
    #     if page is not None:
    #         response.follow(page, self.subParse)

    yield {"spacing": 'spacing'}

    next_page = response.css(
        'ul.pt-cv-pagination li.active + li a::attr("href")'
    ).extract_first()
    if next_page is not None:
        # Solve the challenge for the paginated URL itself, not for the
        # leftover `page` loop variable.
        token, agent = cfscrape.get_tokens(
            start_url + next_page, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
        yield scrapy.Request(url=start_url + next_page,
                             cookies=token,
                             headers={'User-Agent': agent},
                             callback=self.parse,
                             dont_filter=True)
def process_response(self, request, response, spider):
    """Handle a Scrapy response."""
    if not self.is_cloudflare_challenge(response):
        return response

    logger = logging.getLogger("cloudflaremiddleware")
    logger.debug(
        "Cloudflare protection detected on %s, trying to bypass...",
        response.url
    )

    cloudflare_tokens, __ = get_tokens(
        request.url, user_agent=spider.settings.get("USER_AGENT")
    )

    logger.debug(
        "Successfully bypassed the protection for %s, re-scheduling the request",
        response.url,
    )

    # Attach the solved Cloudflare cookies and push the request to the front
    # of the scheduler queue so it is retried before anything else.
    request.cookies.update(cloudflare_tokens)
    request.priority = 99999

    return request
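The middleware above calls an `is_cloudflare_challenge` helper that is not shown. A minimal sketch of such a method, modelled on the detection logic used in the response-hook examples further down (HTTP 503, a Cloudflare `Server` header, and the `jschl` challenge markers in the body); the method body here is an assumption, not the original implementation:

def is_cloudflare_challenge(self, response):
    """Heuristic check for a Cloudflare IUAM challenge page (sketch)."""
    return (
        response.status == 503
        and response.headers.get('Server', b'').startswith(b'cloudflare')
        and b'jschl_vc' in response.body
        and b'jschl_answer' in response.body
    )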
def checkHaveibeenpwned(emails):
    print('[INFO]: Using https://haveibeenpwned.com/ to verify the security of your email...')
    url = 'https://haveibeenpwned.com/api/breachedaccount/{}'
    pasteaccount_url = 'https://haveibeenpwned.com/api/v2/pasteaccount/{}'
    headers = {'User-Agent': 'PwnChecker-API-Python-Script'}
    # Solve the Cloudflare challenge once and reuse the cookies for every lookup.
    cookies, user_agent = cfscrape.get_tokens(
        "https://haveibeenpwned.com/api/breachedaccount/[email protected]",
        user_agent=headers.get('User-Agent'))
    print('[Results]:')
    results = []
    for idx, email in enumerate(emails):
        res = requests.get(url.format(email), headers=headers, cookies=cookies, verify=True)
        if str(res.status_code) == '404':
            result = 'Account is safe, no breach records found.'
        elif str(res.status_code) == '200':
            result = 'Account is at risk, breach records were found, please change your password promptly. Details:\n'
            res = requests.get(pasteaccount_url.format(email), headers=headers, cookies=cookies, verify=True)
            if str(res.status_code) == '200':
                # The paste API returns a JSON array; parse it instead of re-serializing it.
                for paste in json.loads(res.content):
                    for key, value in paste.items():
                        result += str(key) + ':' + str(value) + '; '
            else:
                result += 'Failed to fetch the details...'
        elif str(res.status_code) == '429':
            raise RuntimeError('Too many verification requests, please retry after %s seconds...' % str(res.headers['Retry-After']))
        elif str(res.status_code) == '503':
            raise RuntimeError('Request was blocked by CloudFlare, make sure the user agent and cookies you are using are correct...')
        else:
            raise RuntimeError('Unknown error during verification, please try re-running the program...')
        results.append([email, result])
        print('--[%d]: %s → %s' % (idx + 1, email, result))
        time.sleep(1 + random.random() * 2)
    return results
def get_labels(self, session_cookie=None):
    if os.path.isfile('./tracer/data/labeled_accounts.json'):
        with open('./tracer/data/labeled_accounts.json') as json_file:
            return json.load(json_file)

    scraper = cfscrape.create_scraper()
    content = scraper.get('https://etherscan.io/labelcloud').content.decode('utf-8')
    labels = re.compile(
        '<div class="dropdown-menu list-unstyled py-2 mb-0 w-100 font-size-base" aria-labelledby="(.+?)"><a class="py-1 px-3 d-block" href="(.+?)">'
    ).findall(content)

    account_labels = []
    for label in labels:
        if 'accounts' in label[1]:
            account_labels.append(label)

    print("Found " + str(len(labels)) + " labels.")
    print(str(len(account_labels)) + " labels related to accounts.")

    categories = []
    labeled_accounts = {}
    for label in account_labels:
        url = 'https://etherscan.io/' + label[1]
        cookies, user_agent = cfscrape.get_tokens(url)
        cookies['ASP.NET_SessionId'] = session_cookie
        headers = {'User-Agent': user_agent}
        page_count = 1
        accounts = []
        accounts_extracted = []
        total = 0
        while accounts_extracted or page_count == 1:
            content = requests.get(url + '/' + str(page_count), cookies=cookies, headers=headers).text
            if total == 0:
                total = int(
                    re.compile('A total of (.+?) account').findall(content)[0].replace(',', ''))
            accounts_extracted = re.compile(
                '<tr><td>.*?<a href=\'.+?\'>(.+?)</a>.*?</td><td>(.*?)</td><td>.+?</td><td>.+?</td></tr>'
            ).findall(content)
            accounts += accounts_extracted
            page_count += 1
        print("Extracted for '" + label[0] + "' " + str(len(accounts)) + " accounts out of " + str(total))

        for account in accounts:
            address = account[0]
            if address not in labeled_accounts:
                labeled_accounts[address] = {"labels": [], "category": ""}
            account_label = account[1]
            if account_label != '' and account_label not in labeled_accounts[address]["labels"]:
                labeled_accounts[address]["labels"].append(account_label)
            category = label[0]
            if category and labeled_accounts[address]["category"] == "":
                labeled_accounts[address]["category"] = category
            if category not in categories:
                categories.append(category)

    with open('./tracer/data/labeled_accounts.json', 'w') as jsonfile:
        json.dump(labeled_accounts, jsonfile)
    with open('./tracer/data/categories.json', 'w') as jsonfile:
        json.dump(categories, jsonfile)

    return labeled_accounts
def start_requests(self):
    for url in self.start_urls:
        token, agent = cfscrape.get_tokens(
            url,
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 RuxitSynthetic/1.0 v6870249674 t38550 ath9b965f92 altpub cvcv=2, _optional_'
        )
        yield Request(url=url, cookies=token, headers={'User-Agent': agent})
def get_website(url, render=False):
    import asyncio
    import cfscrape
    from requests_html import HTMLSession

    session = HTMLSession(mock_browser=True)
    requests_ua = session.headers['User-Agent']
    cf_scraper = cfscrape.create_scraper()

    # Run a simple fetch
    response = session.get(url)

    if False and cf_scraper.is_cloudflare_challenge(response):
        # Re-fetch using cfscrape
        try:
            tokens, _ = cfscrape.get_tokens(url, user_agent=requests_ua)
        except ValueError:
            # Presumably occurs when the website does not have cloudflare enabled
            pass
        else:
            response = session.get(url, cookies=tokens)

    if False and render:
        response.html.render(sleep=8)
        return response.html.html
    else:
        return response.html.html
def ekonga(query):
    url = "https://www.konga.com/search?search=" + query
    token, agent = cfscrape.get_tokens(
        url,
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 RuxitSynthetic/1.0 v6870249674 t38550 ath9b965f92 altpub cvcv=2, _optional_'
    )
    k_response = requests.get(url=url, cookies=token, headers={'User-Agent': agent})
    k_soup = BeautifulSoup(k_response.text, 'html.parser')
    script = k_soup.find_all("script", {"id": "__NEXT_DATA__"})
    needed = script[0]
    done = json.loads(needed.contents[0])
    konga_data = done["props"]["initialProps"]["pageProps"]["resultsState"]["content"]['_rawResults'][0]['hits']
    for x in konga_data:
        if 'Accessories'.lower() not in str(x['category'][1]['category_name'].lower()):
            title = x['name']
            img = ("https://www-konga-com-res.cloudinary.com/w_auto,f_auto,fl_lossy,dpr_auto,q_auto/media/catalog/product"
                   + x["image_thumbnail_path"])
            link = "https://www.konga.com/product/" + x['url_key']
            price = x['price']
            return title, img, link, price
def start_requests(self):
    start_urls = [
        'https://www.pracuj.pl/praca/warszawa;wp/it%20-%20administracja;cc,5015?rd=30'
    ]
    for url in self.start_urls:
        token, agent = cfscrape.get_tokens(
            url,
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36')
        yield scrapy.Request(url=url, cookies=token, headers={'User-Agent': agent})
def parse(self, response):
    items = response.xpath(
        '//tr[@class="odd"]//td[position() mod 2 = 1]/a/@href').extract()
    data = [{}]
    test = []
    # db.Chats_FWM.insert_one({
    #     "chat_name": update.message.chat_id,
    #     "message": message,
    #     "type": 'farewell'
    # })
    for item in items:
        # data.append({"title": item.split("/Manga/")[1],
        #              "url": item})
        db.mangas.insert_one({
            "title": item.split("/Manga/")[1],
            "url": item
        })
    # with open('mangas__1.json', 'a') as outfile:
    #     json.dump(data, outfile, indent=4)

    next_page = response.css('div.pagination a::attr(href)').extract()
    page_counter = int(re.search(r'\d+', next_page[-1]).group())
    for url in self.start_urls:
        token, agent = cfscrape.get_tokens(
            url, 'Your preferable user agent, _optional_')
        for page in range(2, page_counter):
            url_second = '%s/MangaList?page=%s' % (url, page)
            yield Request(url_second, cookies=token, headers={'User-Agent': agent})
def __init__(self, url, output_dir=None, output_format=None, username_format='full',
             domain='', gophish_url=None, gophish_api_key=None):
    self.url = url
    self.scraper = cfscrape.create_scraper(delay=10)
    try:
        self.tokens, self.user_agent = cfscrape.get_tokens(url, proxies=proxies, verify=False)
    except Exception as e:
        click.secho(
            f'[!] failed to retrieve scrape page, received HTTP {str(e)}... exiting.',
            fg='red')
        sys.exit(-1)
    self.output_dir = output_dir
    self.output_format = output_format
    self.username_format = username_format
    self.domain = domain
    self.output_handler = OutputHandler(output_dir, domain, username_format,
                                        output_format, gophish_url, gophish_api_key)
def cloudflare(resp, **kwargs):
    """ Bypass CloudFlare's anti-bot protection. """

    def is_cloudflare_challenge(resp):
        """Check if the response is a Cloudflare challenge. Source: goo.gl/v8FvnD"""
        return (resp.status_code == 503
                and resp.headers.get('Server', '').startswith('cloudflare')
                and b'jschl_vc' in resp.content
                and b'jschl_answer' in resp.content)

    if is_cloudflare_challenge(resp):
        sickrage.app.log.debug('CloudFlare protection detected, trying to bypass it')

        # Get the session used or create a new one
        session = getattr(resp, 'session', requests.Session())

        # Get the original request
        original_request = resp.request

        # Get the CloudFlare tokens and original user-agent
        tokens, user_agent = cfscrape.get_tokens(original_request.url)

        # Add CloudFlare tokens to the session cookies
        session.cookies.update(tokens)

        # Add CloudFlare Tokens to the original request
        original_cookies = dict_from_cookiejar(original_request._cookies)
        original_cookies.update(tokens)
        original_request.prepare_cookies(original_cookies)

        # The same User-Agent must be used for the retry
        # Update the session with the CloudFlare User-Agent
        session.headers['User-Agent'] = user_agent

        # Update the original request with the CloudFlare User-Agent
        original_request.headers['User-Agent'] = user_agent

        # Remove hooks from original request
        original_hooks = original_request.hooks
        original_request.hooks = session.hooks

        # Resend the request
        cf_resp = session.send(original_request, allow_redirects=True, **kwargs)
        if cf_resp.ok:
            sickrage.app.log.debug('CloudFlare successfully bypassed.')

            # Add original hooks back to original request
            cf_resp.hooks = original_hooks

        return cf_resp
    else:
        return resp
def start_requests(self):
    cf_requests = []
    for url in self.start_urls:
        token, agent = cfscrape.get_tokens(
            url, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
        cf_requests.append(
            Request(url=url, cookies=token, headers={'User-Agent': agent}))
    return cf_requests
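The same `get_tokens` call also works outside Scrapy. A minimal sketch of reusing the returned cookies and user agent with plain `requests`; the URL is only a placeholder:

import cfscrape
import requests

# Placeholder target; substitute the Cloudflare-protected site you need.
url = "https://example.com/"

# get_tokens returns (cookie_dict, user_agent) after solving the IUAM challenge.
tokens, user_agent = cfscrape.get_tokens(url)

# The cookies are only honoured when paired with the exact same User-Agent.
response = requests.get(url, cookies=tokens, headers={"User-Agent": user_agent})
print(response.status_code)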
def start_requests(self):
    cf_requests = []
    for url in self.start_urls:
        token, agent = cfscrape.get_tokens(
            url, 'Your preferable user agent, _optional_')
        cf_requests.append(
            Request(url=url, cookies=token, headers={'User-Agent': agent}))
    return cf_requests
def start_requests(self): """ https://stackoverflow.com/questions/33247662/how-to-bypass-cloudflare-bot-ddos-protection-in-scrapy """ for url in self.start_urls: token, agent = cfscrape.get_tokens(url, self.user_agent) yield Request(url, headers={'User-Agent': agent}, cookies=token)
def start_requests(self):
    DailyJobs.resetCollectionToStoreNewData()
    user_agent = UserAgent().random
    scraperSites = ScrapingStructure.getStructureJobs()
    for site in scraperSites:
        if site['enabled']:
            if site['needJs']:
                if site['needIUAM']:
                    token, agent = cfscrape.get_tokens(site['url'], user_agent)
                    yield SplashRequest(
                        url=site['url'],
                        callback=ScrapingSiteJobsHelper.parseDataBySite,
                        args={'lua_source': site["script"]},
                        endpoint='execute',
                        meta={"site": site},
                        cookies=token,
                        headers={'User-Agent': agent})
                else:
                    yield SplashRequest(
                        url=site['url'],
                        callback=ScrapingSiteJobsHelper.parseDataBySite,
                        args={
                            'lua_source': site["script"],
                            'customData': site["customData"]
                        },
                        endpoint='execute',
                        meta={"site": site})
            else:
                if site['needIUAM']:
                    token, agent = cfscrape.get_tokens(site['url'], user_agent)
                    yield SplashRequest(
                        url=site['url'],
                        callback=ScrapingSiteJobsHelper.parseDataBySite,
                        meta={"site": site},
                        cookies=token,
                        headers={'User-Agent': agent})
                else:
                    yield SplashRequest(
                        url=site['url'],
                        callback=ScrapingSiteJobsHelper.parseDataBySite,
                        meta={"site": site})
def start_requests(self):
    url = self.base_url + "browse?order=added"
    token, agent = cfscrape.get_tokens(url=url)
    # Keep the solved tokens so later callbacks can reuse them.
    self.token = token
    self.agent = agent
    yield scrapy.Request(url=url, callback=self.parse, cookies=token,
                         headers={'User-Agent': agent})
def start_requests(self):
    cf_requests = []
    for url in self.start_urls:
        token, agent = cfscrape.get_tokens(url, USER_AGENT)
        # token, agent = cfscrape.get_tokens(url)
        cf_requests.append(scrapy.Request(url=url,
                                          cookies={'__cfduid': token['__cfduid']},
                                          headers={'User-Agent': agent}))
        print("useragent in cfrequest: ", agent)
        print("token in cfrequest: ", token)
    return cf_requests
def search(self, domains, raw=False, payload=False):
    """Return the search results from OBB for specified domain(s).

    domains : tuple
        Domain(s) to search. Either python list, tuple format or just
        comma separated values.
    raw : bool
        Print output in raw format with all fields.
    payload : bool
        Print payload info as well from the vulnerability report page(s)
        for unpatched vulnerabilities.
    """
    try:
        if isinstance(domains, str):
            domains = domains.split(',')
        for domain in domains:
            req = requests.get(
                OBB_URL,
                params='search=%s&type=host' % (domain),
                headers=headers
            )
            soup = BeautifulSoup(req.content, 'html.parser')
            data_table = soup.find(
                'table', attrs={'class': 'latest-submissions-main-top'})
            if not data_table:
                return "No results found."
            rows = data_table.find_all('tr')
            cookies = {}
            for row in rows:
                cols = row.find_all('td')
                link = cols[0].find('a')
                if link:
                    href = "https://openbugbounty.org%s" % (link.get('href'))
                else:
                    href = "Report URL"
                cols = [ele.text.strip() for ele in cols]
                if raw:
                    print(cols)
                else:
                    print('%-20s%-15s%-25s%-30s' % (cols[0], cols[3], cols[4], href))
                if cols[3] == "unpatched" and payload:
                    if not cookies:
                        # Solve the Cloudflare challenge once and reuse the
                        # cookies/user agent for every report page.
                        tokens = cfscrape.get_tokens(href)
                        cookies = tokens[0]
                        headers['User-Agent'] = tokens[1]
                    payload_req = requests.get(
                        href, headers=headers, cookies=cookies)
                    payload_soup = BeautifulSoup(payload_req.content, 'html.parser')
                    text_areas = payload_soup.find_all('textarea')
                    for text_area in text_areas:
                        print(text_area.text.strip(), end="\n\n")
    except requests.exceptions.RequestException as error:
        print(error)
def use_cf(site_obj):
    try:
        import cfscrape
        url_info = urlparse(site_obj['checkin'])
        domain_url = url_info.scheme + '://' + url_info.netloc
        cookie_value, user_agent = cfscrape.get_tokens(domain_url, proxies=PROXIES)
        HEADERS['user-agent'] = user_agent
        COOKIES.update(cookie_value)
    finally:
        ...
def start_requests(self):
    cf_requests = []
    for url in self.start_urls:
        token, agent = cfscrape.get_tokens(
            url,
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36")
        cf_requests.append(scrapy.Request(url=url, cookies=token,
                                          headers={'User-Agent': agent}))
    return cf_requests
def start_requests(self): """Solve the "Cloudflare" challenge and start the crawling""" try: self.cloudflare_token, user_agent = cfscrape.get_tokens( 'http://%s/' % self.allowed_domains[0], user_agent=self.settings.get('USER_AGENT')) except Exception: raise BookScrapeException( 'Unable to bypass "cloudflare" antibot protection')
def get_html(self, url):
    """Make a request and get the html (text) from the response"""
    token, agent = cfscrape.get_tokens(url=url)
    response = requests.get(url, headers={'User-Agent': agent}, cookies=token)
    if response.status_code != 200:
        raise requests.exceptions.HTTPError
    return response.text
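A minimal usage sketch for a helper like this; the class name and URL are placeholders, not part of the original example:

# Hypothetical caller; `MyScraper` and the URL are illustrative only.
scraper = MyScraper()
try:
    html = scraper.get_html("https://example.com/some-page")
except requests.exceptions.HTTPError:
    html = None  # the page answered with a non-200 status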
def process_response(self, request, response, spider):
    if response.status == 503:
        try:
            token, user_agent = get_tokens(request.url,
                                           user_agent=request.headers['user-agent'])
        except Exception:
            raise IgnoreRequest
        return Request(url=request.url,
                       cookies=token,
                       dont_filter=True,
                       headers={'user-agent': user_agent},
                       meta={'ddos_token': token},
                       callback=request.callback)
    return response
def first_parse(self, response):
    next_url = response.xpath('//div[@class="pager"]//a[@title="Next"]/@href').extract()[0]
    print(next_url)
    yield scrapy.Request(
        next_url,
        cookies=self.token,
        headers={'User-Agent': self.agent},
        callback=self.next_parse)
    for product in response.xpath('//a[contains(@class, "product-image")]/@href').extract():
        token, agent = cfscrape.get_tokens(product)
        yield scrapy.Request(product,
                             cookies=token,
                             headers={'User-Agent': agent},
                             callback=self.parse_items)
def run(self): print("Collecting requested resources to run the Plugin Resolver by!") if not os.path.exists(os.path.expanduser(self.output_folder)): try: os.makedirs(os.path.expanduser(self.output_folder)) except OSError: print("Unable to create directory: %s" % self.output_folder) return # Change the working directory to the requested # Folder to save plugins in. with ChangeDir(os.path.expanduser(self.output_folder)): print("Loading Resource information") tokens, user_agent = cfscrape.get_tokens('http://www.spigotmc.org') # First, iterate through all the bukkit plugins to resolve # and begin downloading them. print("Retrieving Bukkit Resources") for plugin, data in self.bukkit_resources.items(): resource = data['resource'] version = data['version'] download_url = resource.get_download_link(version=version) file_name = resource.get_versioned_file_name(version=version) try: download(file_name, download_url, tokens, user_agent) print("Downloaded plugin %s to %s" % (resource.plugin_name, file_name)) except FileNotFoundError: print("Unable to download resource %s from %s" % (resource.plugin_name, download_url)) print("Retrieving Spigot Resources") for plugin, data in self.spigot_resources.items(): resource = data['resource'] version = data['version'] name = data['name'] download_url = resource.get_download_link(version=version) requested_version = resource.version if version == "latest" else version file_name = "%s-%s%s" % (name, requested_version, resource.file_type) try: download(file_name, download_url, tokens, user_agent) print("Downloaded plugin %s to %s" % (resource.name, file_name)) except FileNotFoundError: print("Unable to download resource %s from %s" % (resource.name, download_url)) print("Beginning configuration generation!") self.generate_plugin_configuration() # Cleanup the access data retrieved by the plugin! print("Cleaning the trash!") self.__cleanup() print("Finished Operations! Resolution complete!")
def start_requests(self):
    cf_requests = []
    user_agent = self.ua.random
    self.logger.info("RANDOM user_agent = %s", user_agent)
    for url in self.start_urls:
        token, agent = cfscrape.get_tokens(url, user_agent)
        self.logger.info("token = %s", token)
        self.logger.info("agent = %s", agent)
        cf_requests.append(scrapy.Request(url=url,
                                          cookies=token,
                                          headers={'User-Agent': agent}))
    return cf_requests
def start_requests(self):
    self.is_updated = False
    urls = [
        "http://nhadat24h.net/ban-bat-dong-san-viet-nam-nha-dat-viet-nam-s686599/",
        "http://nhadat24h.net/cho-thue-nha-dat-bat-dong-san-tai-viet-nam-nha-dat-tai-viet-nam-s686588/"
    ]
    token, agent = cfscrape.get_tokens("http://nhadat24h.net")
    self.token = token
    self.agent = agent
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse, cookies=token,
                             headers={'User-Agent': agent})
def __addDDOSBypass(self, exchangeName):
    """
    Adding an async cloudflare scraper would look like:

        from aiocfscrape import CloudflareScraper
        exchange.session = CloudflareScraper(loop=asyncio.get_event_loop())
    """
    # Bypassing cloudflare with cookies
    url = self.__exchanges[exchangeName].urls['www']
    tokens, user_agent = cfscrape.get_tokens(url)
    self.__exchanges[exchangeName].headers = {
        'cookie': '; '.join([key + '=' + tokens[key] for key in tokens]),
        'user-agent': user_agent,
    }
def cloudflare(resp, **kwargs): """ Bypass CloudFlare's anti-bot protection. A response hook that retries a request after bypassing CloudFlare anti-bot protection. Use the sessioned hook factory to attach the session to the response to persist CloudFlare authentication at the session level. """ if all([resp.status_code == 503, # Service unavailable resp.headers.get('server') == u'cloudflare-nginx', ]): log.debug(u'CloudFlare protection detected, trying to bypass it') # Get the session used or create a new one session = getattr(resp, 'session', requests.Session()) # Get the original request original_request = resp.request # Avoid recursion by removing the hook from the original request original_request.hooks['response'].remove(cloudflare) # Get the CloudFlare tokens and original user-agent tokens, user_agent = cfscrape.get_tokens(original_request.url) # Add CloudFlare tokens to the session cookies session.cookies.update(tokens) # Add CloudFlare Tokens to the original request original_cookies = dict_from_cookiejar(original_request._cookies) original_cookies.update(tokens) original_request.prepare_cookies(original_cookies) # The same User-Agent must be used for the retry # Update the session with the CloudFlare User-Agent session.headers['User-Agent'] = user_agent # Update the original request with the CloudFlare User-Agent original_request.headers['User-Agent'] = user_agent # Resend the request cf_resp = session.send( original_request, allow_redirects=True, **kwargs ) if cf_resp.ok: log.debug('CloudFlare successfully bypassed.') return cf_resp else: return resp
def start_requests(self):
    user_agent = UserAgent().random
    proxy = random.choice(self.proxy_list)
    url = "https://untappd.com/search?q=*&type=beer"
    token, agent = cfscrape.get_tokens(
        url,
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'
    )
    request = Request(url=url,
                      cookies={'__cfduid': token['__cfduid']},
                      headers={'User-Agent': user_agent},
                      callback=self.parse_beer,
                      meta={'proxy': proxy})
    yield request
def process_response(self, request, response, spider): """If we can identify a CloudFlare check on this page then use cfscrape to get the cookies""" # If this is not a CloudFlare page then no processing is needed if not self.is_cloudflare(response): return response # Otherwise try to retrieve the cookie using cfscrape spider.logger.info('Cloudflare protection detected on {}, trying to bypass...'.format(response.url)) cloudflare_tokens, _ = get_tokens(request.url, user_agent=spider.settings.get('USER_AGENT')) spider.logger.info('Obtained CloudFlare tokens for {}, re-scheduling the request'.format(response.url)) # Add the cookies to the request and reschedule this request for later request.cookies.update(cloudflare_tokens) request.priority = 99999 return request
def _setup_cookie(self, search_uri, webscraper):
    '''
    :param search_uri:
    :param webscraper:
    :return:
    '''
    cookie = {}
    headers = {'User-Agent': str(UserAgent().random)}
    try:
        if webscraper.cloudflare_cookie:
            # TODO: fix the non-anonymous connection issue
            cookie, user_agent = cfscrape.get_tokens(search_uri, headers['User-Agent'])
            self.logger.info('{0} Retrieving Cloudflare Cookie: \n{1}'.format(
                webscraper.name, cookie))
            return cookie, headers
        elif webscraper.thread_defense_bypass_cookie:
            # TODO: fix the non-anonymous connection issue
            response = requests.get(search_uri, verify=True, headers=headers)
            if response.history:
                self.logger.debug0('{0} Request Was Redirected:'.format(webscraper.name))
                for resp in response.history:
                    self.logger.debug('{0} Response: [ Status Code: {1} ] from [ {2} ]'.format(
                        webscraper.name, resp.status_code, resp.url))
                self.logger.debug0('{0} Final Destination [ Status Code: [ {1} ] from [ {2} ]'.format(
                    webscraper.name, response.status_code, response.url))
                # thread_defense_bypass = ThreatDefenceBypass()
                # cookie = thread_defense_bypass(url=response.url)
            return cookie, headers
        else:
            return cookie, headers
    except Exception as err:
        raise ScraperEngineCookieError(webscraper.name, err, traceback.format_exc())
def start_requests(self):
    urls = [
        # 'http://quotes.toscrape.com/',
        'https://www.directliquidation.com/electronics/?s=&idx=dl_prod_posts_product_end_date_ts_asc&page=10',
    ]
    cf_requests = []
    for url in urls:
        token, agent = cfscrape.get_tokens(
            url,
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like asdlkjqwdj) Chrome/16.0.912.36 Safari/535.7'
        )
        print("The token is ", token)
        print("The user agent is", agent)
        cf_requests.append(
            scrapy.Request(url=url, cookies=token, headers={'User-Agent': agent}))
    return cf_requests
def cloudflare(session, resp, **kwargs):
    """
    Bypass CloudFlare's anti-bot protection.

    A request handler that retries a request after bypassing CloudFlare
    anti-bot protection.
    """
    if is_cloudflare_challenge(resp):
        log.debug(u'CloudFlare protection detected, trying to bypass it')

        # Get the original request
        original_request = resp.request

        # Get the CloudFlare tokens and original user-agent
        tokens, user_agent = cfscrape.get_tokens(original_request.url)

        # Add CloudFlare tokens to the session cookies
        session.cookies.update(tokens)

        # Add CloudFlare Tokens to the original request
        original_cookies = dict_from_cookiejar(original_request._cookies)
        original_cookies.update(tokens)
        original_request.prepare_cookies(original_cookies)

        # The same User-Agent must be used for the retry
        # Update the session with the CloudFlare User-Agent
        session.headers['User-Agent'] = user_agent

        # Update the original request with the CloudFlare User-Agent
        original_request.headers['User-Agent'] = user_agent

        # Resend the request
        kwargs = filtered_kwargs(kwargs)
        kwargs['allow_redirects'] = True
        cf_resp = session.send(
            original_request,
            **kwargs
        )
        cf_resp.raise_for_status()

        if cf_resp.ok:
            log.debug('CloudFlare successfully bypassed.')
        return cf_resp
    else:
        return resp
def cloudflare(session, resp, **kwargs):
    """ Bypass CloudFlare's anti-bot protection. """

    def filtered_kwargs(kwargs):
        """Filter kwargs to only contain arguments accepted by `requests.Session.send`."""
        return {
            k: v for k, v in kwargs.items()
            if k in ('stream', 'timeout', 'verify', 'cert', 'proxies', 'allow_redirects')
        }

    def is_cloudflare_challenge(resp):
        """Check if the response is a Cloudflare challenge. Source: goo.gl/v8FvnD"""
        return (
            resp.status_code == 503
            and resp.headers.get('Server', '').startswith('cloudflare')
            and b'jschl_vc' in resp.content
            and b'jschl_answer' in resp.content
        )

    if is_cloudflare_challenge(resp):
        sickrage.app.log.debug('CloudFlare protection detected, trying to bypass it')

        # Get the original request
        original_request = resp.request

        # Get the CloudFlare tokens and original user-agent
        tokens, user_agent = cfscrape.get_tokens(original_request.url)

        # Add CloudFlare tokens to the session cookies
        session.cookies.update(tokens)

        # Add CloudFlare Tokens to the original request
        original_cookies = dict_from_cookiejar(original_request._cookies)
        original_cookies.update(tokens)
        original_request.prepare_cookies(original_cookies)

        # The same User-Agent must be used for the retry
        # Update the session with the CloudFlare User-Agent
        session.headers['User-Agent'] = user_agent

        # Update the original request with the CloudFlare User-Agent
        original_request.headers['User-Agent'] = user_agent

        # Remove hooks from original request
        original_hooks = original_request.hooks
        original_request.hooks = []

        # Resend the request
        kwargs['allow_redirects'] = True
        cf_resp = session.send(original_request, **filtered_kwargs(kwargs))

        if cf_resp.ok:
            sickrage.app.log.debug('CloudFlare successfully bypassed.')

            # Add original hooks back to original request
            cf_resp.hooks = original_hooks

        return cf_resp
    else:
        return resp