Example 1
    def get_tokens(cls, url, user_agent=None, **kwargs):
        scraper = cls.create_scraper()
        if user_agent:
            scraper.headers['User-Agent'] = user_agent

        try:
            resp = scraper.get(url, **kwargs)
            resp.raise_for_status()
        except (BaseException, Exception):
            logging.error('[%s] returned an error. Could not collect tokens.' %
                          url)
            raise

        domain = urlparse(resp.url).netloc

        for d in scraper.cookies.list_domains():
            if d.startswith('.') and d in ('.' + domain):
                cookie_domain = d
                break
        else:
            raise ValueError(
                'Unable to find Cloudflare cookies.'
                ' Does the site actually have Cloudflare IUAM (\'I\'m Under Attack Mode\') enabled?'
            )

        return ({
            '__cfduid':
            scraper.cookies.get('__cfduid', '', domain=cookie_domain),
            'cf_clearance':
            scraper.cookies.get('cf_clearance', '', domain=cookie_domain)
        }, scraper.headers['User-Agent'])
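
A minimal usage sketch for the classmethod above. It assumes get_tokens is exposed on the CloudflareScraper class referenced in Example 6 (the import path is not shown in these excerpts); the URLs are placeholders.

import requests

# CloudflareScraper is the scraper class from these examples; import it from its module
tokens, user_agent = CloudflareScraper.get_tokens('https://example.com/')

session = requests.Session()
session.headers['User-Agent'] = user_agent  # must match the UA that solved the challenge
session.cookies.update(tokens)              # '__cfduid' and 'cf_clearance'
resp = session.get('https://example.com/page')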
Example 2
def proxy_setting(setting, request_url, force=False):
    """
    Returns a tuple of a) the proxy_setting address value, or an address parsed from fetched PAC data if
    proxy_setting starts with "PAC:" (case-insensitive), and b) True/False whether "PAC" is found in the proxy_setting.

    The PAC data parser is crude; the javascript is not eval'd. The first "PROXY URL" found is extracted, along with a
    list of host items such as "url_a_part.url_remaining", "url_b_part.url_remaining", "url_n_part.url_remaining" and so on.
    Also, PAC data items are escaped for matching, therefore regular expression items will not match a request_url.

    If force is True, or request_url contains a PAC parsed data item, then the PAC proxy address is returned, else False.
    None is returned in the event of an error fetching PAC data.

    """

    # check for "PAC" usage
    match = re.search(r'^\s*PAC:\s*(.*)', setting, re.I)
    if not match:
        return setting, False
    pac_url = match.group(1)

    # prevent a recursive test with existing proxy setting when fetching PAC url
    global PROXY_SETTING
    proxy_setting_backup = PROXY_SETTING
    PROXY_SETTING = ''

    resp = ''
    try:
        resp = get_url(pac_url)
    except (BaseException, Exception):
        pass
    PROXY_SETTING = proxy_setting_backup

    if not resp:
        return None, False

    proxy_address = None
    request_url_match = False
    parsed_url = urlparse(request_url)
    netloc = parsed_url.netloc
    for pac_data in re.finditer(r"""(?:[^'"]*['"])([^.]+\.[^'"]*)(?:['"])""",
                                resp, re.I):
        data = re.search(r"""PROXY\s+([^'"]+)""", pac_data.group(1), re.I)
        if data:
            if force:
                return data.group(1), True
            proxy_address = (proxy_address,
                             data.group(1))[None is proxy_address]
        elif re.search(re.escape(pac_data.group(1)), netloc, re.I):
            request_url_match = True
            if None is not proxy_address:
                break

    if None is proxy_address:
        return None, True

    return (False, proxy_address)[request_url_match], True
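
A hedged illustration of consuming the two values returned above; the PAC setting and request URL are placeholders, not real endpoints.

setting = 'PAC:http://example.com/proxy.pac'
request_url = 'https://tracker.example.org/rss'

address, pac_found = proxy_setting(setting, request_url)
if None is address:
    print('error fetching PAC data, abort the request')
elif address:
    print('use proxy %s for http and https' % address)
else:
    print('request_url matched no PAC rule, connect directly')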
Example 3
    def solve_ddg_challenge(self, resp, **original_kwargs):
        parsed_url = urlparse(resp.url)
        try:
            # pull the hidden form action out of the DDoS-Guard challenge page
            submit_url = parsed_url.scheme + ':' + re.findall(
                '"frm"[^>]+?action="([^"]+)"', resp.text)[0]
            kwargs = {
                k: v
                for k, v in original_kwargs.items() if k not in ['hooks']
            }
            kwargs.setdefault('headers', {})
            # re-post base64-encoded origin details (scheme://host, path, port)
            kwargs.setdefault(
                'data',
                dict(h=b64encodestring(
                    '%s://%s' % (parsed_url.scheme, parsed_url.hostname)),
                     u=b64encodestring(parsed_url.path),
                     p=b64encodestring('%s' % (parsed_url.port or ''))))
            self.wait()
            resp = self.request('POST', submit_url, **kwargs)
        except (BaseException, Exception):
            pass
        return resp
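
The essence of the method above is pulling the hidden form action out of the challenge page and re-posting base64-encoded origin details. A small sketch of that extraction against a made-up fragment of challenge HTML (the markup and host are invented for illustration):

import re
from base64 import b64encode

html = '<form id="frm" method="post" action="//check.example-ddg.net/check"></form>'  # invented sample
action = re.findall('"frm"[^>]+?action="([^"]+)"', html)[0]
submit_url = 'https:' + action  # https://check.example-ddg.net/check

payload = dict(
    h=b64encode(b'https://example.com').decode(),  # scheme://hostname
    u=b64encode(b'/requested/path').decode(),      # path
    p=b64encode(b'').decode())                     # port, empty when default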
Example 4
    def solve_cf_challenge(self, resp, **original_kwargs):
        body = resp.text
        parsed_url = urlparse(resp.url)
        domain = parsed_url.netloc

        if '/cdn-cgi/l/chk_captcha' in body or 'cf_chl_captcha' in body:
            raise CloudflareError(
                'Cloudflare captcha presented for %s, please notify SickGear for an update, ua: %s'
                % (domain, self.cf_ua),
                response=resp)

        try:
            action, method = re.findall(
                r'(?sim)<form.*?id="challenge.*?action="/?([^?"]+).*?method="([^"]+)',
                body)[0]
        except (Exception, BaseException):
            action, method = 'cdn-cgi/l/chk_jschl', resp.request.method
        submit_url = '%s://%s/%s' % (parsed_url.scheme, domain, action)

        cloudflare_kwargs = {
            k: v
            for k, v in original_kwargs.items() if k not in ['hooks']
        }
        params = cloudflare_kwargs.setdefault(
            ('data', 'params')['GET' == method.upper()], {})
        headers = cloudflare_kwargs.setdefault('headers', {})
        headers['Referer'] = resp.url
        try:
            token = re.findall(r'(?sim)__cf_chl_jschl_tk__=([^"]+)', body)[0]
            cloudflare_kwargs['params'] = dict(__cf_chl_jschl_tk__=token)
        except (Exception, BaseException):
            pass

        if self.delay == self.default_delay:
            try:
                # no instantiated delay, therefore check js for hard coded CF delay
                self.delay = float(
                    re.search(r'submit\(\);[^0-9]*?([0-9]+)',
                              body).group(1)) / float(1000)
            except (BaseException, Exception):
                pass

        for i in re.findall(r'(<input[^>]+?hidden[^>]+?>)',
                            re.sub(r'(?sim)<!--\s+<input.*?(?=<)', '', body)):
            value = re.findall(r'value="([^"\']+?)["\']', i)
            name = re.findall(r'name="([^"\']+?)["\']', i)
            if all([name, value]):
                params[name[0]] = value[0]

        js = self.extract_js(body, domain)
        atob = (lambda s: b64decodestring('%s' % s))
        try:
            # Eval the challenge algorithm
            params['jschl_answer'] = str(js2py.EvalJs({'atob': atob}).eval(js))
        except (BaseException, Exception):
            try:
                params['jschl_answer'] = str(
                    js2py.EvalJs({
                        'atob': atob
                    }).eval(js))
            except (BaseException, Exception) as e:
                # Something is wrong with the page. This may indicate Cloudflare has changed their anti-bot technique.
                raise ValueError(
                    'Unable to parse Cloudflare anti-bot IUAM page: %r' % e)

        # Requests transforms any request into a GET after a redirect,
        # so the redirect has to be handled manually here to allow for
        # performing other types of requests even as the first request.
        cloudflare_kwargs['allow_redirects'] = False

        self.wait()
        response = self.request(method, submit_url, **cloudflare_kwargs)
        if response:
            if 200 == getattr(response, 'status_code'):
                return response

            # legacy redirection handler (pre 2019.11.xx)
            location = response.headers.get('Location')
            try:
                r = urlparse(location)
            except (Exception, BaseException):
                # Something is wrong with the page, perhaps CF changed their anti-bot technique
                raise ValueError(
                    'Unable to find a new location from Cloudflare anti-bot IUAM page'
                )

            if not r.netloc or location.startswith('/'):
                location = urlunparse((parsed_url.scheme, domain, r.path,
                                       r.params, r.query, r.fragment))
            return self.request(resp.request.method, location,
                                **original_kwargs)
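
The pivotal step in the method above is evaluating the extracted challenge JavaScript with js2py while shimming the browser's atob. A minimal standalone sketch of that idea, using a trivial stand-in script rather than real Cloudflare output:

import js2py
from base64 import b64decode

# stand-in for the b64decodestring helper used above
atob = lambda s: b64decode('%s' % s).decode()

# trivial stand-in challenge expression; real Cloudflare JS is heavily obfuscated
js = "atob('NDI=') * 1 + 0.5"
jschl_answer = str(js2py.EvalJs({'atob': atob}).eval(js))
print(jschl_answer)  # '42.5'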
Example 5
    def _authorised(self, **kwargs):
        result = False
        if self.digest:
            digest = [x[::-1] for x in self.digest[::-1].rpartition('=')]
            self.digest = digest[2] + digest[1] + quote(unquote(digest[0]))
            params = dict(logged_in=(lambda y='': all([
                self.url and self.session.cookies.get_dict(
                    domain='.' + urlparse(self.url).netloc) and self.session.
                cookies.clear('.' + urlparse(self.url).netloc) is None or True
            ] + [
                'RSS' in y, 'type="password"' not in y,
                self.has_all_cookies(['speedian'], 'inSpeed_')
            ] + [(self.session.cookies.get('inSpeed_' + c) or 'sg!no!pw') in
                 self.digest for c in ['speedian']])),
                          failed_msg=(lambda y=None: None),
                          post_params={'login': False})
            result = super(SpeedCDProvider, self)._authorised(**params)

        if not result and not self.failure_count:
            if self.url and self.digest:
                self.get_url('%slogout.php' % self.url,
                             skip_auth=True,
                             post_data={
                                 'submit.x': 24,
                                 'submit.y': 11
                             })
            self.digest = ''
            params = dict(
                logged_in=(lambda y='': all([
                    self.session.cookies.get_dict(domain='.speed.cd') and self.
                    session.cookies.clear('.speed.cd') is None or True
                ] + [bool(y), not re.search('(?i)type="password"', y)] + [
                    re.search('(?i)Logout', y) or not self.digest or
                    (self.session.cookies.get('inSpeed_speedian') or 'sg!no!pw'
                     ) in self.digest
                ])),
                failed_msg=
                (lambda y='':
                 (re.search(
                     r'(?i)(username|password)((<[^>]+>)|\W)*' +
                     r'(or|and|/|\s)((<[^>]+>)|\W)*(password|incorrect)', y
                 ) and u'Invalid username or password for %s. Check settings'
                  or
                  u'Failed to authenticate or parse a response from %s, abort provider'
                  )),
                post_params={'form_tmpl': True})
            self.urls['login_action'] = self.urls.get('do_login')
            session = super(SpeedCDProvider, self)._authorised(session=None,
                                                               resp_sess=True,
                                                               **params)
            self.urls['login_action'] = ''
            if session:
                self.digest = 'inSpeed_speedian=%s' % session.cookies.get(
                    'inSpeed_speedian')
                sickbeard.save_config()
                result = True
                logger.log('Cookie details for %s updated.' % self.name,
                           logger.DEBUG)
            elif not self.failure_count:
                logger.log(
                    'Invalid cookie details for %s and login failed. Check settings'
                    % self.name, logger.ERROR)
        return result
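
The logged_in and failed_msg lambdas above are callbacks the base provider class uses to judge a fetched page. A simplified, standalone sketch of the same page checks, with an invented page body and cookie value:

import re

def looks_logged_in(page_text, cookie_value, digest):
    # mirrors the lambda above: a non-empty page, no password form,
    # a Logout link, and the session cookie value present in the stored digest
    return all([
        bool(page_text),
        not re.search('(?i)type="password"', page_text),
        bool(re.search('(?i)Logout', page_text)),
        (cookie_value or 'sg!no!pw') in digest,
    ])

print(looks_logged_in('<a href="/logout.php">Logout</a>',
                      'abc123', 'inSpeed_speedian=abc123'))  # True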
Example 6
def get_url(
        url,  # type: AnyStr
        post_data=None,  # type: Optional
        params=None,  # type: Optional
        headers=None,  # type: Optional[Dict]
        timeout=30,  # type: int
        session=None,  # type: Optional[requests.Session]
        parse_json=False,  # type: bool
        raise_status_code=False,  # type: bool
        raise_exceptions=False,  # type: bool
        as_binary=False,  # type: bool
        encoding=None,  # type: Optional[AnyStr]
        **kwargs):
    # type: (...) -> Optional[Union[AnyStr, bool, bytes, Dict, Tuple[Union[Dict, List], requests.Session]]]
    """
    Either
    1) Returns the text (or byte-string if as_binary) retrieved from the url.
    2) Returns True/False for success/failure when kwarg 'savename' is set to a file pathname.
    3) Returns a tuple of (response, session) on success when kwarg 'resp_sess' is True.
    4) Returns a JSON dict (or list) if parse_json=True.

    :param url: url
    :param post_data: post data
    :param params: request parameters
    :param headers: headers to add
    :param timeout: timeout
    :param session: optional session object
    :param parse_json: return a JSON dict (or list)
    :param raise_status_code: raise an exception for error status codes
    :param raise_exceptions: raise exceptions
    :param as_binary: return bytes instead of text
    :param encoding: override the encoding from the response header if as_binary is False
    :param kwargs:
    :return:
    """

    response_attr = ('text', 'content')[as_binary]

    # selectively mute some errors
    mute = filter_list(lambda x: kwargs.pop(x, False), [
        'mute_connect_err', 'mute_read_timeout', 'mute_connect_timeout',
        'mute_http_error'
    ])

    # reuse or instantiate request session
    resp_sess = kwargs.pop('resp_sess', None)
    if None is session:
        session = CloudflareScraper.create_scraper()
        session.headers.update({'User-Agent': USER_AGENT})

    # download and save file or simply fetch url
    savename = kwargs.pop('savename', None)
    if savename:
        # session streaming
        session.stream = True

    if not kwargs.pop('nocache', False):
        cache_dir = CACHE_DIR or get_system_temp_dir()
        session = CacheControl(sess=session,
                               cache=caches.FileCache(
                                   ek.ek(os.path.join, cache_dir, 'sessions')))

    provider = kwargs.pop('provider', None)

    # handle legacy uses of `json` param
    if kwargs.get('json'):
        parse_json = kwargs.pop('json')

    # session master headers
    req_headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate'
    }
    if headers:
        req_headers.update(headers)
    if hasattr(session, 'reserved') and 'headers' in session.reserved:
        req_headers.update(session.reserved['headers'] or {})
    session.headers.update(req_headers)

    # session parameters
    session.params = params

    # session ssl verify
    session.verify = False

    # don't trust os environments (auth, proxies, ...)
    session.trust_env = False

    response = None
    try:
        # sanitise url
        parsed = list(urlparse(url))
        parsed[2] = re.sub('/{2,}', '/',
                           parsed[2])  # replace two or more / with one
        url = urlunparse(parsed)

        # session proxies
        if PROXY_SETTING:
            (proxy_address, pac_found) = proxy_setting(PROXY_SETTING, url)
            msg = '%sproxy for url: %s' % (('', 'PAC parsed ')[pac_found], url)
            if None is proxy_address:
                logger.debug('Proxy error, aborted the request using %s' % msg)
                return
            elif proxy_address:
                logger.debug('Using %s' % msg)
                session.proxies = {
                    'http': proxy_address,
                    'https': proxy_address
                }

        # decide if we get or post data to server
        if post_data or 'post_json' in kwargs:
            if True is post_data:
                post_data = None

            if post_data:
                kwargs.setdefault('data', post_data)

            if 'post_json' in kwargs:
                kwargs.setdefault('json', kwargs.pop('post_json'))

            response = session.post(url, timeout=timeout, **kwargs)
        else:
            response = session.get(url, timeout=timeout, **kwargs)
            if response.ok and not response.content and 'url=' in response.headers.get(
                    'Refresh', '').lower():
                url = response.headers.get('Refresh').lower().split(
                    'url=')[1].strip('/')
                if not url.startswith('http'):
                    parsed[2] = '/%s' % url
                    url = urlunparse(parsed)
                response = session.get(url, timeout=timeout, **kwargs)

        # if encoding is not in header try to use best guess
        # ignore downloads with savename
        if not savename and not as_binary:
            if encoding:
                response.encoding = encoding
            elif not response.encoding or 'charset' not in response.headers.get(
                    'Content-Type', ''):
                response.encoding = response.apparent_encoding

        # noinspection PyProtectedMember
        if provider and provider._has_signature(response.text):
            return getattr(response, response_attr)

        if raise_status_code:
            response.raise_for_status()

        if not response.ok:
            http_err_text = 'CloudFlare Ray ID' in response.text and \
                            'CloudFlare reports, "Website is offline"; ' or ''
            if response.status_code in http_error_code:
                http_err_text += http_error_code[response.status_code]
            elif response.status_code in range(520, 527):
                http_err_text += 'Origin server connection failure'
            else:
                http_err_text = 'Custom HTTP error code'
                if 'mute_http_error' not in mute:
                    logger.debug(
                        u'Response not ok. %s: %s from requested url %s' %
                        (response.status_code, http_err_text, url))
            return

    except requests.exceptions.HTTPError as e:
        if raise_status_code:
            response.raise_for_status()
        logger.warning(u'HTTP error %s while loading URL%s' %
                       (e.errno, _maybe_request_url(e)))
        return
    except requests.exceptions.ConnectionError as e:
        if 'mute_connect_err' not in mute:
            logger.warning(u'Connection error msg:%s while loading URL%s' %
                           (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except requests.exceptions.ReadTimeout as e:
        if 'mute_read_timeout' not in mute:
            logger.warning(u'Read timed out msg:%s while loading URL%s' %
                           (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except (requests.exceptions.Timeout, socket.timeout) as e:
        if 'mute_connect_timeout' not in mute:
            logger.warning(
                u'Connection timed out msg:%s while loading URL %s' %
                (ex(e), _maybe_request_url(e, url)))
        if raise_exceptions:
            raise e
        return
    except (BaseException, Exception) as e:
        if ex(e):
            logger.warning(
                u'Exception caught while loading URL %s\r\nDetail... %s\r\n%s'
                % (url, ex(e), traceback.format_exc()))
        else:
            logger.warning(
                u'Unknown exception while loading URL %s\r\nDetail... %s' %
                (url, traceback.format_exc()))
        if raise_exceptions:
            raise e
        return

    if parse_json:
        try:
            data_json = response.json()
            if resp_sess:
                return ({}, data_json)[isinstance(data_json,
                                                  (dict, list))], session
            return ({}, data_json)[isinstance(data_json, (dict, list))]
        except (TypeError, Exception) as e:
            logger.warning(u'JSON data issue from URL %s\r\nDetail... %s' %
                           (url, ex(e)))
            if raise_exceptions:
                raise e
            return None

    if savename:
        try:
            write_file(savename,
                       response,
                       raw=True,
                       raise_exceptions=raise_exceptions)
        except (BaseException, Exception) as e:
            if raise_exceptions:
                raise e
            return
        return True

    if resp_sess:
        return getattr(response, response_attr), session

    return getattr(response, response_attr)
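
Illustrative calls covering the four return modes described in the docstring; the URLs and file path are placeholders.

page_html = get_url('https://example.com/rss')                                         # text, or None on failure
data = get_url('https://example.com/api/shows', parse_json=True)                       # dict/list, {} if not JSON
saved_ok = get_url('https://example.com/file.torrent', savename='/tmp/file.torrent')   # True/False
text, sess = get_url('https://example.com/login', resp_sess=True)                      # text plus the session used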