def open_url(url, method='get', headers=None, cookies=None, params=None,
             data=None, redirects=True, verify=True, timeout=None, files=None,
             auth=None, proxies=None, hooks=None, stream=None, cert=None,
             json=None):
    """Request *url* through the module-level ``scraper`` session.

    Transparently retries through a Sucuri "cloudproxy" cookie challenge,
    and hands Incapsula-blocked responses to ``crack()``.

    :param url: target URL.
    :param method: name of the session method to call ('get', 'post', ...).
    :param headers: optional header dict; ``User_Agent`` is always injected.
    :return: the final response object from the session.

    All remaining keyword arguments are forwarded verbatim to the session
    call.  ``params``/``data`` now default to ``None`` rather than shared
    mutable ``{}`` defaults (requests treats ``None`` the same as empty),
    fixing the classic mutable-default pitfall.
    """
    if headers is None:
        headers = {}
    headers['User-Agent'] = User_Agent
    request = getattr(scraper, method)
    # Build the pass-through kwargs once so the retry uses identical options.
    kwargs = dict(cookies=cookies, params=params, data=data,
                  allow_redirects=redirects, verify=verify, timeout=timeout,
                  files=files, auth=auth, proxies=proxies, hooks=hooks,
                  stream=stream, cert=cert, json=json)
    link = request(url, headers=headers, **kwargs)
    try:
        # Sucuri challenge: solving it yields a cookie we must resend.
        su = sucuri().get(link.content)
        if su:
            headers['Cookie'] = su
            # Retry against the canonical (trailing-slash) URL.
            if not url.endswith('/'):
                url = '%s/' % url
            link = request(url, headers=headers, **kwargs)
    except Exception:
        # Best-effort: on any solver failure fall through with the
        # original response instead of crashing the caller.
        pass
    if '_Incapsula_' in link.content:
        # Incapsula interstitial detected: let the cracker solve it.
        link = crack(scraper, link)
    return link
def unblock():
    """Unblock the target url/session

    Fetches ``target_url``, runs the response through the Incapsula
    cracker, saves the resulting page to ``unblocked.html`` and reports
    whether the response is still blocked.

    :return: result of ``incap_blocked`` on the cracked response.
    """
    response = session.get(target_url)
    response = crack(session, response)
    with open('unblocked.html', 'wb') as output:
        output.write(response.content)
    return incap_blocked(response)
def updatePreviews(previewsOnly):
    """Will update the global array that holds all preview instances (Predictions)"""
    # Fresh session with a fixed desktop Firefox User-Agent; presumably
    # required so whoscored.com serves the full desktop markup — TODO confirm.
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0'
    })
    # get full page
    # crack() solves the Incapsula challenge before the HTML is parsed.
    r = crack(session, session.get('https://www.whoscored.com'))
    soup = BeautifulSoup(r.content, "html.parser")
    # get tomorrow's matches
    r2 = crack(
        session,
        session.get(
            "https://www.whoscored.com/LiveScoresSummary?day=tomorrow"))
    soup2 = BeautifulSoup(r2.content, "html.parser")
    # find all preview a's
    # Index 0 = today's table, index 1 = tomorrow's; `t` below records
    # which day each prediction came from.
    tables = [
        soup.find("table", class_="detailed-tournaments"),
        soup2.find("table", class_="detailed-tournaments")
    ]
    for t, table in enumerate(tables):
        for i, row in enumerate(table.select("tr.match")):
            # create new prediction instance
            # hasPreview is True when the row has exactly one preview link.
            # NOTE: appends to the module-level `predictions` list without
            # clearing it first, so repeated calls accumulate entries.
            predictions.append(
                Prediction(
                    len(row.select("td.toolbar.right a.preview")) == 1, t,
                    row.select("td.time")[0].contents[0].strip(),
                    row.select("td.home a")[0].get_text().strip(),
                    row.select("td.away a")[0].get_text().strip()))
    # Print a summary; optionally restricted to matches with a preview.
    previewCounter = 0
    for p in predictions:
        if previewsOnly and not p.hasPreview:
            continue
        previewCounter += 1
        print "time: ", p.day + " " + p.time
        print "home: ", p.hometeam
        print "away: ", p.awayteam
        print "preview: ", "Yes" if p.hasPreview else "No"
    print(
        str(len(predictions)) + " matches in total, of which " +
        str(previewCounter) + " have a preview available.")
def open_url(url, method='get', headers=None, cookies=None, params=None,
             data=None, redirects=True, verify=True):
    """Request *url* through the module-level ``scraper`` session.

    Detects a Sucuri "cloudproxy" front-end via the ``Server`` response
    header and retries with the solved challenge cookie; hands
    Incapsula-blocked responses to ``crack()``.

    :param url: target URL.
    :param method: name of the session method to call ('get', 'post', ...).
    :param headers: optional header dict; ``User_Agent`` is always injected.
    :return: the final response object from the session.
    """
    if headers is None:
        headers = {}
    headers['User-Agent'] = User_Agent
    request = getattr(scraper, method)
    link = request(url, headers=headers, cookies=cookies, params=params,
                   data=data, allow_redirects=redirects, verify=verify)
    try:
        if link.headers['Server'] == 'Sucuri/Cloudproxy':
            su = sucuri().get(link.content)
            # Bug fix: only resend when the solver actually produced a
            # cookie; the original set ``Cookie`` unconditionally.
            if su:
                headers['Cookie'] = su
                # Sucuri expects the canonical (trailing-slash) URL.
                if not url.endswith('/'):
                    url = '%s/' % url
                link = request(url, headers=headers, cookies=cookies,
                               params=params, data=data,
                               allow_redirects=redirects, verify=verify)
    except Exception:
        # Best-effort: missing 'Server' header or solver failure falls
        # through with the original response.
        pass
    if '_Incapsula_' in link.content:
        # Re-fetch, then let the cracker solve the Incapsula challenge.
        link = request(url, headers=headers, cookies=cookies, params=params,
                       data=data, allow_redirects=redirects, verify=verify)
        link = crack(scraper, link)
    return link
# Demo script: fetch an Incapsula-protected page.
import requests
from incapsula import crack

session = requests.Session()
# crack() takes the session and the (possibly blocked) initial response
# and returns a response with the Incapsula challenge solved.
r = crack(session, session.get('http://www.bjs.com'))
print r.content
def OPEN_URL(url, method='get', headers=None, params=None, data=None,
             redirects=True, verify=True, mobile=False, timeout=None,
             output=None, XHR=False):
    """Fetch *url*, working around Cloudflare / Sucuri / Incapsula walls.

    :param url: target URL.
    :param headers: optional header dict; a User-Agent is injected when the
        caller supplies none.
    :param mobile: when True (and headers were supplied with a User-Agent),
        force an iPhone User-Agent.
    :param timeout: request timeout in seconds (default 30).
    :param output: 'geturl' -> return the final redirected URL as a string;
        'cookie' -> prime the session and return its cookies as a
        'k=v; k2=v2' header string; otherwise return the response object.
    :param XHR: when True, send an ``X-Requested-With: XMLHttpRequest`` header.
    :return: response object, or a string for the 'geturl'/'cookie' outputs.
    """
    if timeout is None:
        timeout = 30
    if headers is None:
        headers = {}
        headers['User-Agent'] = random_agent()
    # NOTE: this elif chains off the 'User-Agent' check below (preserving
    # the original structure): mobile only takes effect when the caller
    # supplied headers that already contain a 'User-Agent' key.
    if 'User-Agent' not in headers:
        if 'user-agent' not in headers:
            headers['User-Agent'] = random_agent()
    elif mobile:
        headers['User-Agent'] = 'Apple-iPhone/701.341'
    if output == 'geturl':
        # Resolve redirects only; return the final URL as a string.
        link = requests.head(url, allow_redirects=True)
        return str(link.url)
    if output == 'cookie':
        # Prime the shared session, then return ALL of its cookies.
        # Bug fix: the original loop rebound `cookie` each iteration and
        # returned only the last cookie (or an empty LIST when none).
        session.get(url, headers=headers)
        cookie_dict = session.cookies.get_dict()
        return '; '.join('%s=%s' % (k, v) for k, v in cookie_dict.items())
    if XHR:
        print("REQUESTING WITH XMLHttpRequest")
        headers['X-Requested-With'] = 'XMLHttpRequest'
    if 'Accept-Language' not in headers:
        headers['Accept-Language'] = 'en-US'
    if 'referer' not in headers and 'Referer' not in headers:
        # Default the Referer to the site root of the requested URL.
        headers['Referer'] = '%s://%s/' % (urlparse.urlparse(url).scheme,
                                           urlparse.urlparse(url).netloc)
    link = requests.get(url, headers=headers, params=params, data=data,
                        allow_redirects=redirects, verify=verify,
                        timeout=int(timeout))
    response_code = link.status_code
    print("RESPONSE CODE", response_code)
    resp_header = link.headers.get('Server', 'none')
    try:
        # Cloudflare interstitial: 503 from cloudflare-nginx without a
        # Sucuri marker -> retry through the cloudflare-capable scraper.
        if (resp_header.lower() == 'cloudflare-nginx'
                and response_code == 503
                and "sucuri_cloudproxy_js" not in link.content):
            print("DETECTED CLOUDFLARE", url)
            link = scraper.get(url)
    except Exception:
        pass
    try:
        # Sucuri challenge: solve it, resend with the obtained cookie.
        if "sucuri_cloudproxy_js" in link.content:
            print("DETECTED SUCURI", url)
            su = sucuri().get(link.content)
            headers['Cookie'] = su
            link = session.get(url, headers=headers, params=params, data=data,
                               allow_redirects=redirects, verify=verify,
                               timeout=int(timeout))
    except Exception:
        pass
    if '_Incapsula_' in link.content:
        print("DETECTED _Incapsula_", url)
        response = session.get(url)  # url is blocked by incapsula
        link = crack(session, response)
    return link
def OPEN_URL(url, method='get', headers=None, params=None, data=None,
             redirects=True, verify=True, mobile=False, timeout=None):
    """Fetch *url*, working around Cloudflare / Sucuri / Incapsula walls.

    :param url: target URL.
    :param headers: optional header dict; a random User-Agent is injected
        when the caller supplies none.
    :param mobile: when True (and headers were supplied), force an iPhone
        User-Agent.
    :param timeout: request timeout in seconds (default 30; fixed from the
        original string '30' — callers passing strings still work because
        the value is passed through ``int()``).
    :return: the final response object.
    """
    if timeout is None:
        timeout = 30
    if headers is None:
        headers = {}
        headers['User-Agent'] = random_agent()
    elif mobile:
        # mobile flag only takes effect when the caller supplied headers
        # (preserving the original elif structure); the original also had
        # a dead ``headers['User-Agent'] = ''`` assignment here, removed.
        headers['User-Agent'] = 'Apple-iPhone/701.341'
    link = requests.get(url, headers=headers, params=params, data=data,
                        allow_redirects=redirects, verify=verify,
                        timeout=int(timeout))
    response_code = link.status_code
    print("RESPONSE CODE", response_code)
    resp_header = link.headers.get('Server', 'none')
    try:
        # Cloudflare interstitial: 503 from cloudflare-nginx without a
        # Sucuri marker -> retry through the cloudflare-capable scraper.
        if (resp_header.lower() == 'cloudflare-nginx'
                and response_code == 503
                and "sucuri_cloudproxy_js" not in link.content):
            print("DETECTED CLOUDFLARE", url)
            link = scraper.get(url)
    except Exception:
        pass
    try:
        # Sucuri challenge: solve it, resend with the obtained cookie
        # against the canonical (trailing-slash) URL.
        if "sucuri_cloudproxy_js" in link.content:
            print("DETECTED SUCURI", url)
            su = sucuri().get(link.content)
            headers['Cookie'] = su
            if not url.endswith('/'):
                url = '%s/' % url
            link = requests.get(url, headers=headers, params=params,
                                data=data, allow_redirects=redirects,
                                verify=verify, timeout=int(timeout))
    except Exception:
        pass
    if '_Incapsula_' in link.content:
        print("DETECTED _Incapsula_", url)
        response = session.get(url)  # url is blocked by incapsula
        link = crack(session, response)
    return link