def test_cf_recaptcha_15_04_2019(self, **kwargs):
    scraper = cfscrape.CloudflareScraper(**kwargs)
    message = re.compile(r'captcha challenge presented')
    scraper.get.when.called_with(url) \
        .should.have.raised(cfscrape.CloudflareCaptchaError, message)

    v = ssl.OPENSSL_VERSION_NUMBER
    # Temporarily report an OpenSSL version lower than 1.1.1
    ssl.OPENSSL_VERSION_NUMBER = 0x0090581f
    try:
        scraper = cfscrape.CloudflareScraper(**kwargs)
        message = re.compile(r'OpenSSL version is lower than 1.1.1')
        scraper.get.when.called_with(url) \
            .should.have.raised(cfscrape.CloudflareCaptchaError, message)
    finally:
        ssl.OPENSSL_VERSION_NUMBER = v
def __init__(self):
    coins = ['Bitcoin', 'Ethereum', 'Cash', 'Litecoin', 'Omise', 'TRON']
    self.coins = ['BTC', 'ETH', 'BCH', 'LTC', 'OMG', 'TRX']
    scraper = cfscrape.CloudflareScraper()  # Coinspot uses Cloudflare
    page = scraper.get("https://www.coinspot.com.au/tradecoins")
    soup = BeautifulSoup(page.content, 'html.parser')
    pricelist = soup.find_all('tr', class_="tradeitem showrow")
    foundcoin = [0] * len(coins)
    buyPrice = [0] * len(coins)
    sellPrice = [0] * len(coins)
    for i, element in enumerate(pricelist):
        for j, coin in enumerate(coins):
            # use ==, not "is": identity comparison with int literals is unreliable
            if coin in element.get_text() and foundcoin[j] == 0:
                print('\nFOUND: ' + coin)
                table_elements = element.find_all('td')
                buyPrice[j] = float(table_elements[1].attrs['data-value'])
                sellPrice[j] = float(table_elements[2].attrs['data-value'])
                foundcoin[j] = 1
    self.buyPrice = buyPrice
    self.sellPrice = sellPrice
    print(self.buyPrice)
def test(self, **kwargs):
    __Popen = subprocess.Popen

    # Temporarily mock subprocess.Popen to return a non-zero exit code
    def mock(*args, **kwargs):
        def node():
            pass
        node.communicate = lambda: ('stdout', 'Outdated Node.js detected')
        node.returncode = 1
        return node

    subprocess.Popen = mock
    try:
        scraper = cfscrape.CloudflareScraper(**kwargs)
        message = re.compile(r'non-zero exit status')
        scraper.get.when.called_with(url) \
            .should.have.raised(subprocess.CalledProcessError, message)
        caplog.text.should_not.match(
            re.compile(r'Error executing Cloudflare IUAM Javascript'))
        caplog.text.should.match(
            re.compile(r'Outdated Node.js detected'))
    finally:
        subprocess.Popen = __Popen
def __init__(self, proxy, loop, localsession, executor, id=0):
    self.proxy = proxy
    self.loop = loop
    self.session = cfscrape.CloudflareScraper(headers=genHeaders())
    self.localsession = localsession
    self.executor = executor
    self.id = id  # marks which vote attempt this is
    # if proxy:
    self.fingerprint = md5(
        (proxy + 'Hecate2' + str(time.time())).encode()).hexdigest()
def login(self): """Login user if needed""" LOGGER.info('"%s": start login, method: "%s"', self.username, self.login_method) cookie = self.get_cookie(self.username) if cookie is None: LOGGER.info('"%s": no cookie, new login, method "%s"', self.username, self.login_method) method_dict = { 'g': self.login_google, 'google': self.login_google, 'v': self.login_vk, 'vk': self.login_vk, 'f': self.login_facebook, 'facebook': self.login_facebook, } if type(self.login_method) is not str: raise RRClientException( f"{self.login_method} is not a valid login method.") if self.login_method.lower() not in method_dict.keys(): raise RRClientException( f"{self.login_method} is not a valid login method.") auth_text = requests.get("https://rivalregions.com").text browser = Browser(showWindow=self.show_window) browser = method_dict[self.login_method.lower()](browser, auth_text) LOGGER.info('"%s": Get cookie', self.username) phpsessid = browser.get_cookie('PHPSESSID') if phpsessid: cookie = self.create_cookie(phpsessid.get('expiry', None), phpsessid.get('value', None)) self.write_cookie(self.username, cookie) else: raise NoPHPsessidException() LOGGER.debug('"%s": closing login tab', self.username) browser.close_current_tab() self.session = cfscrape.CloudflareScraper() self.cookie = cookie self.session.cookies.set(**cookie) LOGGER.debug('"%s": set the var_c', self.username) response = self.session.get('https://rivalregions.com/#overview') lines = response.text.split("\n") for line in lines: if re.match("(.*)var c_html(.*)", line): var_c = line.split("'")[-2] LOGGER.debug('"%s": got var_c: %s', self.username, var_c) self.var_c = line.split("'")[-2]
def new_session(self):
    """ Launch a new session """
    self.session = cfscrape.CloudflareScraper()
    retries = Retry(total=5, backoff_factor=1)
    self.session.mount('https://', HTTPAdapter(max_retries=retries))
    self.pass_cloudflare()
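# The Retry/HTTPAdapter pair used by new_session() above is the standard
# requests/urllib3 retry API. The snippet's own imports are not shown, so the
# following is a minimal sketch of what it likely needs (an assumption, not
# taken from the source):
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry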
def test_js_challenge_environment_error(self, **kwargs):
    __path = os.environ['PATH']
    # Temporarily unset PATH to hide Node.js
    os.environ['PATH'] = ''
    try:
        scraper = cfscrape.CloudflareScraper(**kwargs)
        message = re.compile(r'Missing Node.js runtime')
        scraper.get.when.called_with(url) \
            .should.have.raised(EnvironmentError, message)
    finally:
        os.environ['PATH'] = __path
def login(self): """Login user if needed""" cookie = self.get_cookie(self.username) if cookie is None: LOGGER.info('Client login "%s" username "%s"', self.login_method, self.username) if self.login_method not in [ "g", "google", "v", "vk", "f", "facebook" ]: raise RRClientException("Not a valid login method.") auth_text = requests.get("https://rivalregions.com").text web = Browser(showWindow=self.show_window) method_dict = { 'g': self.login_google, 'google': self.login_google, 'v': self.login_vk, 'vk': self.login_vk, 'f': self.login_facebook, 'facebook': self.login_facebook, } if self.login_method in method_dict: web = method_dict[self.login_method](web, auth_text) else: LOGGER.info('Invallid loggin method "%s"', self.login_method) sys.exit() LOGGER.debug('Get cookie') phpsessid = web.get_cookie('PHPSESSID') if phpsessid: cookie = self.create_cookie(phpsessid.get('expiry', None), phpsessid.get('value', None)) self.write_cookie(self.username, cookie) else: raise NoPHPsessidException() LOGGER.debug('closing login tab') web.close_current_tab() self.session = cfscrape.CloudflareScraper() self.cookie = cookie self.session.cookies.set(**cookie) LOGGER.debug('set the var_c') response = self.session.get('https://rivalregions.com/#overview') lines = response.text.split("\n") for line in lines: if re.match("(.*)var c_html(.*)", line): var_c = line.split("'")[-2] LOGGER.debug('var_c: %s', var_c) self.var_c = line.split("'")[-2]
def test(self, **kwargs):
    __Popen = subprocess.Popen
    # Temporarily disable this method to generate an exception
    subprocess.Popen = None
    try:
        scraper = cfscrape.CloudflareScraper(**kwargs)
        scraper.get.when.called_with(url) \
            .should.have.raised(TypeError)
        caplog.text.should.match(
            re.compile(r'Error executing Cloudflare IUAM Javascript'))
    finally:
        subprocess.Popen = __Popen
def grab_password(email):
    # No docs (there is no public API); the endpoint was found by analyzing
    # network traffic, and the admin was told about it :D
    url = "https://ghostproject.fr/search.php"
    scraper = cfscrape.CloudflareScraper()
    cookie = {"cookieconsent_status": "dismiss"}
    data = {"param": email}
    req = scraper.post(url, cookies=cookie, data=data).text
    result = req.split("\\n")
    if "Error" in req or len(result) == 2:
        return False
    return result[1:-1]
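# A minimal usage sketch for grab_password() above. The address is a
# hypothetical placeholder, not taken from the source.
if __name__ == "__main__":
    leaked = grab_password("user@example.com")  # hypothetical input
    if leaked:
        for entry in leaked:
            print(entry)
    else:
        print("No results, or the service reported an error.")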
def get_tags(self): """Get tags.""" page = self.page url = self.url result = list(self.parse_page(page)) if not result: h1_tag_text = page.select_one('h1').text if h1_tag_text != '503 Service Temporarily Unavailable': log.error('Unexpected H1-tag text', text=h1_tag_text) if self.scraper is None: self.scraper = cfscrape.CloudflareScraper() resp = self.scraper.get(url, timeout=10) html_soup = bs4.BeautifulSoup(resp.text, 'lxml') return self.parse_page(html_soup) return result
def test(self, **kwargs):
    __Popen = subprocess.Popen

    # Temporarily mock subprocess.Popen to raise an OSError
    def mock(*args, **kwargs):
        raise OSError('System Error')

    subprocess.Popen = mock
    try:
        scraper = cfscrape.CloudflareScraper(**kwargs)
        scraper.get.when.called_with(url) \
            .should.have.raised(OSError, re.compile(r'System Error'))
        caplog.text.should.equal('')
    finally:
        subprocess.Popen = __Popen
def get_tags(self): """Get tags.""" classname_to_namespace_dict = { 'tag-type-artist': 'creator', 'tag-type-character': 'character', 'tag-type-copyright': 'series', 'tag-type-species': 'species', 'tag-type-general': '' } scraper = cfscrape.CloudflareScraper() resp = scraper.get(self.url, timeout=10) page = bs4.BeautifulSoup(resp.text, 'lxml') for key, namespace in classname_to_namespace_dict.items(): for item in page.select('li.{}'.format(key)): name = \ item.text \ .rsplit(' ', 1)[0].strip().split('? ', 1)[1].strip() yield (namespace, name)
def open(self, url):
    count = 0
    maxcount = 5
    scraper = cfscrape.CloudflareScraper()
    # Retry up to maxcount times before giving up
    while count < maxcount:
        try:
            response = scraper.get(url)
            return response
        except Exception:
            print("Could not open '{}', retrying... ({})".format(url, count))
            count = count + 1
            time.sleep(1)
            if count >= maxcount:
                raise  # out of retries: re-raise the last error
def scraper(self, url: str):
    """
    Make a request to the given URL and return the response.

    :param url: str = "hxxps://URL"
    :type url: str
    :return: A response object.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/89.0",
    }
    cloudflare_scraper = cfscrape.CloudflareScraper()
    # Send the headers with the request; the original mutated resp.headers
    # after the fact, which has no effect on the request.
    resp = cloudflare_scraper.get(url, headers=headers, stream=True, timeout=10)
    try:
        resp.raise_for_status()
    except (requests.HTTPError, requests.ReadTimeout) as err:
        status = err.response.status_code
        if status not in (403, 429):
            self.log.error(f"{str(err)}")
    except requests.exceptions.RequestException as err:
        self.log.error(f"{str(err)}")
    else:
        return resp
    return None
import cfscrape as cfs  # the "cfs" alias is implied by the usage below


def init_cfscraper():
    s = cfs.CloudflareScraper()
    print("CFScraper initialized")
    return s
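# A minimal usage sketch for init_cfscraper() above; the target URL is a
# placeholder assumption, not from the source. CloudflareScraper subclasses
# requests.Session, so the usual .get()/.post() API applies.
session = init_cfscraper()
resp = session.get("https://example.com")  # placeholder URL
print(resp.status_code, len(resp.content))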
floor=comment["floor"], time=datetime.strptime(comment["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"), content=comment["content"]) else: new_comt = dcard_comments( id=comment["postId"], owner="這則回應已被本人刪除", gender=0, floor=comment["floor"], time=datetime.strptime(comment["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"), content="已經刪除的內容就像 Dcard 一樣,錯過是無法再相見的!") db.session.add(new_comt) except Exception as e: local_var_key = locals() logging.error(traceback.format_exc()) logging.error({ key: local_var_key[key] for key in local_var_key if key != "article_comments" }) logging.error("") db.session.commit() crawler = Dcard_crawler(req_module=cfscrape.CloudflareScraper()) article_list_gen = crawler.get_article_list(cate="dressup", limit=100) list_gen_factory = generators_factory(article_list_gen) article_gen = crawler.get_article(list_generator=list_gen_factory()) comt_gen = crawler.get_comments(list_generator=list_gen_factory(), limit=100) crawler.save_data(article_gen, comt_gen)
import cfscrape
import fileinput
import random
import time
import os
import array as arr
from bs4 import BeautifulSoup  # needed for BeautifulSoup below

LinkFile = "link.txt"  # file containing the links to the story's chapters
filenameTXT = "Ketqua.txt"
chapter = 0
f = open(LinkFile, "r")
for x in f:
    if "http" in x:
        StrippedContent = ""
        ChapterURL = x
        scraper = cfscrape.CloudflareScraper()
        response = scraper.get(ChapterURL)
        filenameHTML = str(chapter) + ".html"
        bCheckLink = 0
        open(filenameHTML, 'wb').write(response.content)
        with open(filenameHTML, encoding="utf-8") as fp:
            soup = BeautifulSoup(fp, "lxml")
            try:
                # Save the title
                downloaded = soup.title.string
                StrippedContent = "\n" + downloaded + "\n"
                # Get the chapter content
                div = soup.find(id="bookContentBody")
                for elem in div.find_all("p"):
                    elem.replace_with(elem.text + "\n\n")
def main(prog_input=None, resize=False, size=None, db_path=None,
         place=DEFAULT_PLACE, match_filter='default', write_tags=False,
         input_mode='default', verbose=False, debug=False,
         abort_on_error=False):
    """Get similar image from iqdb."""
    assert prog_input is not None, "Input is not a valid path"
    # logging
    log_level = None
    if verbose:
        log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
    if log_level:
        logging.basicConfig(
            handlers=[
                logging.FileHandler(
                    os.path.join(user_data_dir, 'output.log'), 'w', 'utf-8')
            ],
            level=log_level)
    init_program(db_path)
    br = mechanicalsoup.StatefulBrowser(soup_config={'features': 'lxml'})
    br.raise_on_404 = True
    scraper = cfscrape.CloudflareScraper()
    # variable used in both input modes
    error_set = []
    if input_mode == 'folder':
        assert os.path.isdir(prog_input), 'Input is not a valid folder'
        files = [os.path.join(prog_input, x) for x in os.listdir(prog_input)]
        if not files:
            print('No files found.')
            return
        sorted_files = sorted(files, key=lambda x: os.path.splitext(x)[1])
        for idx, ff in enumerate(sorted_files):
            log.debug('file', f=os.path.basename(ff), idx=idx, total=len(files))
            result = {}
            try:
                result = run_program_for_single_img(
                    ff, resize, size, place, match_filter, write_tags,
                    browser=br, scraper=scraper, disable_tag_print=True)
            except Exception as e:  # pylint:disable=broad-except
                if abort_on_error:
                    raise e
                error_set.append((ff, e))
            if result is not None and result.get('error'):
                error_set.extend([(ff, x) for x in result['error']])
    else:
        image = prog_input
        result = run_program_for_single_img(
            image, resize, size, place, match_filter, write_tags,
            browser=br, scraper=scraper)
        if result is not None and result.get('error'):
            error_set.extend([(image, x) for x in result['error']])
    if error_set:
        log.error('Found error(s)')
        for x in error_set:
            log.error('path: ' + x[0] + '\nerror: ' + str(x[1]))
def login(self): """Login user if needed""" LOGGER.info('"%s": start login, method: "%s"', self.username, self.login_method) cookies = CookieHandler.get_cookies(self.username) if not cookies: cookies = [] LOGGER.info('"%s": no cookie, new login, method "%s"', self.username, self.login_method) login_method_dict = { 'g': login_methods.login_google, 'google': login_methods.login_google, 'v': login_methods.login_vk, 'vk': login_methods.login_vk, 'f': login_methods.login_facebook, 'facebook': login_methods.login_facebook, } auth_text = requests.get("https://rivalregions.com").text browser = Browser(showWindow=self.show_window) if self.login_method in login_method_dict: browser = login_method_dict[self.login_method](browser, auth_text, self.username, self.password) else: LOGGER.info('"%s": Invalid login method "%s"', self.username, self.login_method) sys.exit() LOGGER.info('"%s": Get PHPSESSID', self.username) browser_cookie = browser.get_cookie('PHPSESSID') if browser_cookie: expiry = browser_cookie.get('expiry', None) value = browser_cookie.get('value', None) LOGGER.info('"%s": "value": %s, "expiry": %s', self.username, value, expiry) cookie = CookieHandler.create_cookie('PHPSESSID', expiry, value) cookies.append(cookie) else: raise NoCookieException() cookie_names = ['rr_f'] for cookie_name in cookie_names: browser_cookie = browser.get_cookie(cookie_name) if browser_cookie: LOGGER.info('"%s": Get %s', self.username, cookie_name) expiry = browser_cookie.get('expiry', None) value = browser_cookie.get('value', None) cookies.append( CookieHandler.create_cookie(cookie_name, expiry, value)) LOGGER.info('"%s": "value": %s, "expiry": %s', self.username, value, expiry) else: raise NoCookieException() CookieHandler.write_cookies(self.username, cookies) LOGGER.debug('"%s": closing login tab', self.username) browser.close_current_tab() else: LOGGER.info('"%s": Cookies found', self.username) self.session = cfscrape.CloudflareScraper() for cookie in cookies: self.session.cookies.set(**cookie) LOGGER.debug('"%s": set the var_c', self.username) response = self.session.get('https://rivalregions.com/#overview') lines = response.text.split("\n") for line in lines: if re.match("(.*)var c_html(.*)", line): var_c = line.split("'")[-2] LOGGER.debug('"%s": got var_c: %s', self.username, var_c) self.var_c = line.split("'")[-2]
def test_js_challenge_21_05_2015(self, **kwargs):
    scraper = cfscrape.CloudflareScraper(**kwargs)
    expect(scraper.get(url).content).to.equal(requested_page)
def check_resp(self, u, **kwargs):
    scraper = cfscrape.CloudflareScraper(**kwargs)
    resp = scraper.get(u)
    self.assertEqual(resp and resp.content, requested_page)
import cfscrape
import requests
import os
from recaptcha import *
from bs4 import BeautifulSoup
import re
from time import time  # needed for time() below (may also come via recaptcha's star import)
from selenium import webdriver  # needed for webdriver.Chrome below

# Requests wrapper
#url = 'https://www.acgnx.se/'
url = 'https://www.acgnx.se/show-8A7C71BCBEB854DDF0880AF26FB4504A47F50B2D.html'
session = requests.session()
# session.headers = 'content-type'  # broken in the original: Session.headers must be a dict
session.mount("http://", cfscrape.CloudflareScraper())
scraper = cfscrape.create_scraper(sess=session)
req = scraper.get(url).content  # .content is already bytes
#print req

### Save request as HTML named 'Result.html'
f_name = '\\Result.html'
with open(f_name, 'wb') as f:  # binary mode: req is bytes, no .encode() needed
    f.write(f_name and req)

### Execute the JavaScript file
start = time()
driver = webdriver.Chrome(
    os.getcwd() + "\\chromedriver.exe"
)  # Optional argument; if not specified, the PATH is searched.
driver.get(os.getcwd() + f_name)
#print driver.page_source
    headers = {
        'user-agent': uaGen.random,
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'referer': "https://www.internationalsaimoe.com/voting/",
        'accept-encoding': "gzip, deflate",
        'accept-language': acceptLanguage[random.randint(0, indexLanguage)],
        'cache-control': "no-cache",
        'connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Upgrade-Insecure-Requests': '1',
    }
    return headers


localsession = cfscrape.CloudflareScraper(headers=genHeaders())

import numpy as np
from skimage.filters import gaussian
from skimage.exposure import equalize_hist
from skimage.morphology import opening, label
from skimage.measure import regionprops


def judge(img):  # decide whether the captcha can be recognized
    img = img[97:, :]
    img = gaussian(img, sigma=0.85)
    img = equalize_hist(img)
    img = (img > 0.7) * 1.0
    img = opening(img, selem=np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]]))
    image_region = img * 0