def get_tokens(cls, url, user_agent=None, **kwargs):
    scraper = cls.create_scraper()
    if user_agent:
        scraper.headers['User-Agent'] = user_agent

    try:
        resp = scraper.get(url, **kwargs)
        resp.raise_for_status()
    except (BaseException, Exception):
        logging.error('[%s] returned an error. Could not collect tokens.' % url)
        raise

    domain = urlparse(resp.url).netloc
    for d in scraper.cookies.list_domains():
        if d.startswith('.') and d in ('.' + domain):
            cookie_domain = d
            break
    else:
        raise ValueError(
            'Unable to find Cloudflare cookies.'
            ' Does the site actually have Cloudflare IUAM (\'I\'m Under Attack Mode\') enabled?')

    return (
        {'__cfduid': scraper.cookies.get('__cfduid', '', domain=cookie_domain),
         'cf_clearance': scraper.cookies.get('cf_clearance', '', domain=cookie_domain)},
        scraper.headers['User-Agent'])
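# A minimal usage sketch for get_tokens, assuming it is bound as a classmethod on the
# scraper class (as in upstream cfscrape); the url and variable names are illustrative only:
#
#   tokens, user_agent = CloudflareScraper.get_tokens('https://example-site.tld/')
#   # tokens -> {'__cfduid': '...', 'cf_clearance': '...'}
#   # the pair can be handed to an external client that needs to reuse the solved cookies,
#   # provided that client also sends the exact same User-Agent string.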
def proxy_setting(setting, request_url, force=False):
    """
    Returns a tuple of
    a) proxy_setting address value, or a PAC is fetched and parsed if proxy_setting
       starts with "PAC:" (case-insensitive), and
    b) True/False if "PAC" is found in the proxy_setting.

    The PAC data parser is crude, javascript is not eval'd. The first "PROXY URL" found is
    extracted with a list of "url_a_part.url_remaining", "url_b_part.url_remaining",
    "url_n_part.url_remaining" and so on. Also, PAC data items are escaped for matching,
    therefore regular expression items will not match a request_url.

    If force is True or request_url contains a PAC parsed data item, then the PAC proxy
    address is returned, else False. None is returned in the event of an error fetching
    PAC data.
    """
    # check for "PAC" usage
    match = re.search(r'^\s*PAC:\s*(.*)', setting, re.I)
    if not match:
        return setting, False
    pac_url = match.group(1)

    # prevent a recursive test with existing proxy setting when fetching PAC url
    global PROXY_SETTING
    proxy_setting_backup = PROXY_SETTING
    PROXY_SETTING = ''

    resp = ''
    try:
        resp = get_url(pac_url)
    except (BaseException, Exception):
        pass
    PROXY_SETTING = proxy_setting_backup

    if not resp:
        return None, False

    proxy_address = None
    request_url_match = False
    parsed_url = urlparse(request_url)
    netloc = parsed_url.netloc
    for pac_data in re.finditer(r"""(?:[^'"]*['"])([^.]+\.[^'"]*)(?:['"])""", resp, re.I):
        data = re.search(r"""PROXY\s+([^'"]+)""", pac_data.group(1), re.I)
        if data:
            if force:
                return data.group(1), True
            proxy_address = (proxy_address, data.group(1))[None is proxy_address]
        elif re.search(re.escape(pac_data.group(1)), netloc, re.I):
            request_url_match = True
        if None is not proxy_address:
            break

    if None is proxy_address:
        return None, True

    return (False, proxy_address)[request_url_match], True
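# Illustrative call shapes for proxy_setting, assuming PROXY_SETTING holds the raw user
# config value; the addresses and urls below are hypothetical:
#
#   proxy_setting('127.0.0.1:8118', 'https://example-site.tld/api')
#   # -> ('127.0.0.1:8118', False)    plain address, PAC not involved
#
#   proxy_setting('PAC:http://proxy.tld/proxy.pac', 'https://example-site.tld/api')
#   # -> ('proxy.tld:8080', True)     a PAC data item matched the request_url
#   # -> (False, True)                PAC parsed, but no data item matched request_url
#   # -> (None, False)                PAC data could not be fetched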
def solve_ddg_challenge(self, resp, **original_kwargs):
    parsed_url = urlparse(resp.url)
    try:
        submit_url = parsed_url.scheme + ':' + re.findall('"frm"[^>]+?action="([^"]+)"', resp.text)[0]
        kwargs = {k: v for k, v in original_kwargs.items() if k not in ['hooks']}
        kwargs.setdefault('headers', {})
        # the challenge form expects origin, path and port base64 encoded;
        # parsed_url.port is an int (or None), so coerce it to a string first
        kwargs.setdefault('data', dict(
            h=b64encodestring('%s://%s' % (parsed_url.scheme, parsed_url.hostname)),
            u=b64encodestring(parsed_url.path),
            p=b64encodestring('%s' % (parsed_url.port or ''))))
        self.wait()
        resp = self.request('POST', submit_url, **kwargs)
    except (BaseException, Exception):
        pass
    return resp
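# solve_ddg_challenge depends on a b64encodestring helper defined elsewhere in the
# codebase; a minimal sketch of an equivalent, assuming it takes text and returns text:

import base64

def b64encodestring_sketch(s):
    # hypothetical stand-in: base64 encode a text string, return the result as text
    return base64.b64encode(s.encode('utf-8')).decode('utf-8')

# b64encodestring_sketch('42') -> 'NDI='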
def solve_cf_challenge(self, resp, **original_kwargs):
    body = resp.text
    parsed_url = urlparse(resp.url)
    domain = parsed_url.netloc

    if '/cdn-cgi/l/chk_captcha' in body or 'cf_chl_captcha' in body:
        raise CloudflareError(
            'Cloudflare captcha presented for %s, please notify SickGear for an update, ua: %s'
            % (domain, self.cf_ua), response=resp)

    try:
        action, method = re.findall(
            r'(?sim)<form.*?id="challenge.*?action="/?([^?"]+).*?method="([^"]+)', body)[0]
    except (Exception, BaseException):
        action, method = 'cdn-cgi/l/chk_jschl', resp.request.method
    submit_url = '%s://%s/%s' % (parsed_url.scheme, domain, action)

    cloudflare_kwargs = {k: v for k, v in original_kwargs.items() if k not in ['hooks']}
    params = cloudflare_kwargs.setdefault(('data', 'params')['GET' == method.upper()], {})
    headers = cloudflare_kwargs.setdefault('headers', {})
    headers['Referer'] = resp.url

    try:
        token = re.findall(r'(?sim)__cf_chl_jschl_tk__=([^"]+)', body)[0]
        cloudflare_kwargs['params'] = dict(__cf_chl_jschl_tk__=token)
    except (Exception, BaseException):
        pass

    if self.delay == self.default_delay:
        try:
            # no instantiated delay, therefore check js for hard coded CF delay
            self.delay = float(re.search(r'submit\(\);[^0-9]*?([0-9]+)', body).group(1)) / float(1000)
        except (BaseException, Exception):
            pass

    # collect hidden form inputs (commented-out inputs are stripped first)
    for i in re.findall(r'(<input[^>]+?hidden[^>]+?>)',
                        re.sub(r'(?sim)<!--\s+<input.*?(?=<)', '', body)):
        value = re.findall(r'value="([^"\']+?)["\']', i)
        name = re.findall(r'name="([^"\']+?)["\']', i)
        if all([name, value]):
            params[name[0]] = value[0]

    js = self.extract_js(body, domain)
    atob = (lambda s: b64decodestring('%s' % s))
    try:
        # Eval the challenge algorithm
        params['jschl_answer'] = str(js2py.EvalJs({'atob': atob}).eval(js))
    except (BaseException, Exception):
        # retry once before concluding the page is unparseable
        try:
            params['jschl_answer'] = str(js2py.EvalJs({'atob': atob}).eval(js))
        except (BaseException, Exception) as e:
            # Something is wrong with the page.
            # This may indicate Cloudflare has changed their anti-bot technique.
            raise ValueError('Unable to parse Cloudflare anti-bot IUAM page: %r' % e)

    # Requests transforms any request into a GET after a redirect,
    # so the redirect has to be handled manually here to allow for
    # performing other types of requests even as the first request.
    cloudflare_kwargs['allow_redirects'] = False

    self.wait()
    response = self.request(method, submit_url, **cloudflare_kwargs)
    if response:
        if 200 == getattr(response, 'status_code'):
            return response

        # legacy redirection handler (pre 2019.11.xx)
        location = response.headers.get('Location')
        try:
            r = urlparse(location)
        except (Exception, BaseException):
            # Something is wrong with the page, perhaps CF changed their anti-bot technique
            raise ValueError('Unable to find a new location from Cloudflare anti-bot IUAM page')
        if not r.netloc or location.startswith('/'):
            location = urlunparse((parsed_url.scheme, domain, r.path, r.params, r.query, r.fragment))
        return self.request(resp.request.method, location, **original_kwargs)
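# A toy illustration of the js2py pattern used above (EvalJs with an injected atob);
# the challenge script varies per page, so this is not the real Cloudflare algorithm:

import js2py
from base64 import b64decode

toy_js = 'var answer = parseInt(atob("NDI=")); answer + 1'  # atob("NDI=") -> "42"
ctx = js2py.EvalJs({'atob': lambda s: b64decode(s).decode('utf-8')})
print(ctx.eval(toy_js))  # -> 43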
def _authorised(self, **kwargs):
    result = False
    if self.digest:
        digest = [x[::-1] for x in self.digest[::-1].rpartition('=')]
        self.digest = digest[2] + digest[1] + quote(unquote(digest[0]))
        params = dict(
            logged_in=(lambda y='': all(
                [self.url and self.session.cookies.get_dict(domain='.' + urlparse(self.url).netloc)
                 and self.session.cookies.clear('.' + urlparse(self.url).netloc) is None or True]
                + ['RSS' in y, 'type="password"' not in y, self.has_all_cookies(['speedian'], 'inSpeed_')]
                + [(self.session.cookies.get('inSpeed_' + c) or 'sg!no!pw') in self.digest
                   for c in ['speedian']])),
            failed_msg=(lambda y=None: None),
            post_params={'login': False})
        result = super(SpeedCDProvider, self)._authorised(**params)

    if not result and not self.failure_count:
        if self.url and self.digest:
            self.get_url('%slogout.php' % self.url, skip_auth=True,
                         post_data={'submit.x': 24, 'submit.y': 11})
        self.digest = ''
        params = dict(
            logged_in=(lambda y='': all(
                [self.session.cookies.get_dict(domain='.speed.cd')
                 and self.session.cookies.clear('.speed.cd') is None or True]
                + [bool(y), not re.search('(?i)type="password"', y)]
                + [re.search('(?i)Logout', y) or not self.digest
                   or (self.session.cookies.get('inSpeed_speedian') or 'sg!no!pw') in self.digest])),
            failed_msg=(lambda y='': (
                re.search(r'(?i)(username|password)((<[^>]+>)|\W)*'
                          + r'(or|and|/|\s)((<[^>]+>)|\W)*(password|incorrect)', y)
                and u'Invalid username or password for %s. Check settings'
                or u'Failed to authenticate or parse a response from %s, abort provider')),
            post_params={'form_tmpl': True})
        self.urls['login_action'] = self.urls.get('do_login')
        session = super(SpeedCDProvider, self)._authorised(session=None, resp_sess=True, **params)
        self.urls['login_action'] = ''
        if session:
            self.digest = 'inSpeed_speedian=%s' % session.cookies.get('inSpeed_speedian')
            sickbeard.save_config()
            result = True
            logger.log('Cookie details for %s updated.' % self.name, logger.DEBUG)
        elif not self.failure_count:
            logger.log('Invalid cookie details for %s and login failed. Check settings'
                       % self.name, logger.ERROR)
    return result
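# The reversed-rpartition dance above is equivalent to splitting the stored digest on its
# first '=' and re-quoting only the value; a plain sketch of the same transform, assuming
# a 'name=value' cookie string (the helper name is hypothetical):

from urllib.parse import quote, unquote

def normalise_digest_sketch(digest):
    # split on the first '=', keep the cookie name verbatim, normalise the value quoting
    name, sep, value = digest.partition('=')
    return name + sep + quote(unquote(value))

# normalise_digest_sketch('inSpeed_speedian=abc=def') -> 'inSpeed_speedian=abc%3Ddef'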
def get_url(url,  # type: AnyStr
            post_data=None,  # type: Optional
            params=None,  # type: Optional
            headers=None,  # type: Optional[Dict]
            timeout=30,  # type: int
            session=None,  # type: Optional[requests.Session]
            parse_json=False,  # type: bool
            raise_status_code=False,  # type: bool
            raise_exceptions=False,  # type: bool
            as_binary=False,  # type: bool
            encoding=None,  # type: Optional[AnyStr]
            **kwargs):
    # type: (...) -> Optional[Union[AnyStr, bool, bytes, Dict, Tuple[Union[Dict, List], requests.Session]]]
    """
    Either
    1) Returns a byte-string retrieved from the url provider.
    2) Returns True/False for success after using kwargs 'savename' set to file pathname.
    3) Returns Tuple response, session if success after setting kwargs 'resp_sess' True.
    4) JSON Dict if parse_json=True.

    :param url: url
    :param post_data: post data
    :param params: extra parameters to send with the request
    :param headers: headers to add
    :param timeout: timeout
    :param session: optional session object
    :param parse_json: return JSON Dict
    :param raise_status_code: raise exception for status codes
    :param raise_exceptions: raise exceptions
    :param as_binary: return bytes instead of text
    :param encoding: overwrite encoding return header if as_binary is False
    :param kwargs:
    :return:
    """
    response_attr = ('text', 'content')[as_binary]

    # selectively mute some errors
    mute = filter_list(lambda x: kwargs.pop(x, False), [
        'mute_connect_err', 'mute_read_timeout', 'mute_connect_timeout', 'mute_http_error'])

    # reuse or instantiate request session
    resp_sess = kwargs.pop('resp_sess', None)
    if None is session:
        session = CloudflareScraper.create_scraper()
        session.headers.update({'User-Agent': USER_AGENT})

    # download and save file or simply fetch url
    savename = kwargs.pop('savename', None)
    if savename:
        # session streaming
        session.stream = True

    if not kwargs.pop('nocache', False):
        cache_dir = CACHE_DIR or get_system_temp_dir()
        session = CacheControl(sess=session,
                               cache=caches.FileCache(ek.ek(os.path.join, cache_dir, 'sessions')))

    provider = kwargs.pop('provider', None)

    # handle legacy uses of `json` param
    if kwargs.get('json'):
        parse_json = kwargs.pop('json')

    # session master headers
    req_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate'}
    if headers:
        req_headers.update(headers)
    if hasattr(session, 'reserved') and 'headers' in session.reserved:
        req_headers.update(session.reserved['headers'] or {})
    session.headers.update(req_headers)

    # session parameters
    session.params = params

    # session ssl verify
    session.verify = False

    # don't trust os environments (auth, proxies, ...)
    session.trust_env = False

    response = None
    try:
        # sanitise url
        parsed = list(urlparse(url))
        parsed[2] = re.sub('/{2,}', '/', parsed[2])  # replace two or more / with one
        url = urlunparse(parsed)

        # session proxies
        if PROXY_SETTING:
            (proxy_address, pac_found) = proxy_setting(PROXY_SETTING, url)
            msg = '%sproxy for url: %s' % (('', 'PAC parsed ')[pac_found], url)
            if None is proxy_address:
                logger.debug('Proxy error, aborted the request using %s' % msg)
                return
            elif proxy_address:
                logger.debug('Using %s' % msg)
                session.proxies = {'http': proxy_address, 'https': proxy_address}

        # decide if we get or post data to server
        if post_data or 'post_json' in kwargs:
            if True is post_data:
                post_data = None

            if post_data:
                kwargs.setdefault('data', post_data)

            if 'post_json' in kwargs:
                kwargs.setdefault('json', kwargs.pop('post_json'))

            response = session.post(url, timeout=timeout, **kwargs)
        else:
            response = session.get(url, timeout=timeout, **kwargs)

            if response.ok and not response.content \
                    and 'url=' in response.headers.get('Refresh', '').lower():
                url = response.headers.get('Refresh').lower().split('url=')[1].strip('/')
                if not url.startswith('http'):
                    parsed[2] = '/%s' % url
                    url = urlunparse(parsed)
                response = session.get(url, timeout=timeout, **kwargs)

        # if encoding is not in header try to use best guess
        # ignore downloads with savename
        if not savename and not as_binary:
            if encoding:
                response.encoding = encoding
            elif not response.encoding or 'charset' not in response.headers.get('Content-Type', ''):
                response.encoding = response.apparent_encoding

        # noinspection PyProtectedMember
        if provider and provider._has_signature(response.text):
            return getattr(response, response_attr)

        if raise_status_code:
            response.raise_for_status()

        if not response.ok:
            http_err_text = 'CloudFlare Ray ID' in response.text and \
                            'CloudFlare reports, "Website is offline"; ' or ''
            if response.status_code in http_error_code:
                http_err_text += http_error_code[response.status_code]
            elif response.status_code in range(520, 527):
                http_err_text += 'Origin server connection failure'
            else:
                http_err_text = 'Custom HTTP error code'
            if 'mute_http_error' not in mute:
                logger.debug(u'Response not ok. %s: %s from requested url %s'
                             % (response.status_code, http_err_text, url))
            return

    except requests.exceptions.HTTPError as e:
        if raise_status_code:
            response.raise_for_status()
        logger.warning(u'HTTP error %s while loading URL%s' % (e.errno, _maybe_request_url(e)))
        return
    except requests.exceptions.ConnectionError as e:
        if 'mute_connect_err' not in mute:
            logger.warning(u'Connection error msg:%s while loading URL%s'
                           % (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except requests.exceptions.ReadTimeout as e:
        if 'mute_read_timeout' not in mute:
            logger.warning(u'Read timed out msg:%s while loading URL%s'
                           % (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except (requests.exceptions.Timeout, socket.timeout) as e:
        if 'mute_connect_timeout' not in mute:
            logger.warning(u'Connection timed out msg:%s while loading URL %s'
                           % (ex(e), _maybe_request_url(e, url)))
        if raise_exceptions:
            raise e
        return
    except (BaseException, Exception) as e:
        if ex(e):
            logger.warning(u'Exception caught while loading URL %s\r\nDetail... %s\r\n%s'
                           % (url, ex(e), traceback.format_exc()))
        else:
            logger.warning(u'Unknown exception while loading URL %s\r\nDetail... %s'
                           % (url, traceback.format_exc()))
        if raise_exceptions:
            raise e
        return

    if parse_json:
        try:
            data_json = response.json()
            if resp_sess:
                return ({}, data_json)[isinstance(data_json, (dict, list))], session
            return ({}, data_json)[isinstance(data_json, (dict, list))]
        except (TypeError, Exception) as e:
            logger.warning(u'JSON data issue from URL %s\r\nDetail... %s' % (url, ex(e)))
            if raise_exceptions:
                raise e
            return None

    if savename:
        try:
            write_file(savename, response, raw=True, raise_exceptions=raise_exceptions)
        except (BaseException, Exception) as e:
            if raise_exceptions:
                raise e
            return
        return True

    if resp_sess:
        return getattr(response, response_attr), session

    return getattr(response, response_attr)
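# Typical get_url call shapes, assuming module level defaults (USER_AGENT, CACHE_DIR,
# PROXY_SETTING) are configured; urls and paths are placeholders:
#
#   html = get_url('https://example-site.tld/page')                            # text or None
#   data = get_url('https://example-site.tld/api', parse_json=True)            # dict/list, {} or None
#   saved = get_url('https://example-site.tld/a.zip', savename='/tmp/a.zip')   # True, or None on error
#   text, sess = get_url('https://example-site.tld/', resp_sess=True)          # body plus session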