Ejemplo n.º 1
0
    def IUAM_Challenge_Response(self, body, url, interpreter):
        """Build the submission for a Cloudflare IUAM (JavaScript) challenge.

        Extracts the challenge form's action and its hidden ``r``,
        ``jschl_vc`` and ``pass`` fields from ``body``, asks the selected
        JavaScript interpreter for the ``jschl_answer`` value and returns
        everything needed to post the answer.

        Args:
            body: HTML text of the challenge page.
            url: URL the challenge page was served from; its scheme and
                netloc are used to build the absolute submit URL and the
                netloc is handed to the interpreter.
            interpreter: Interpreter identifier passed to
                ``JavaScriptInterpreter.dynamicImport``.

        Returns:
            dict: ``{'url': <absolute submit URL>, 'data': <OrderedDict>}``
            where ``data`` holds the hidden form fields plus
            ``jschl_answer``.

        Raises:
            cloudflare_exceptions.Cloudflare_Error_IUAM: If the form action
                or any of the required hidden fields cannot be extracted,
                or if the interpreter fails to solve the challenge.
        """
        try:
            form_match = re.search(
                r'id="challenge-form" action="(?P<challengeUUID>\S+)"', body,
                re.M | re.DOTALL)
            # form_match is None when the form is absent; the attribute
            # access below then raises AttributeError, handled further down.
            challengeUUID = form_match.groupdict().get('challengeUUID', '')

            payload = OrderedDict(
                re.findall(r'name="(r|jschl_vc|pass)"\svalue="(.*?)"', body))

            # re.findall returns an empty list instead of raising when
            # nothing matches, so a missing field has to be detected
            # explicitly; otherwise an incomplete form would be submitted.
            if not all(field in payload
                       for field in ('r', 'jschl_vc', 'pass')):
                raise ValueError('challenge form is missing required fields')

        except (AttributeError, ValueError):
            # NOTE: this changes interpreter-wide state; tracebacks stay
            # suppressed for the rest of the process, not just this error.
            sys.tracebacklimit = 0
            raise cloudflare_exceptions.Cloudflare_Error_IUAM(
                "Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly."
            )

        hostParsed = urlparse(url)

        try:
            payload['jschl_answer'] = JavaScriptInterpreter.dynamicImport(
                interpreter).solveChallenge(body, hostParsed.netloc)
        except Exception as e:
            # Any interpreter failure is reported as an IUAM error; 'message'
            # is used when the exception has one, else the exception itself.
            raise cloudflare_exceptions.Cloudflare_Error_IUAM(
                'Unable to parse Cloudflare anti-bots page: {}'.format(
                    getattr(e, 'message', e)))

        # The form action is taken verbatim from the HTML, so it is passed
        # through self.unescape before being appended to the origin.
        return {
            'url':
            '{}://{}{}'.format(hostParsed.scheme, hostParsed.netloc,
                               self.unescape(challengeUUID)),
            'data':
            payload
        }
Ejemplo n.º 2
0
    def get_tokens(cls, url, **kwargs):
        """Request *url* with a new scraper and return its Cloudflare cookies.

        Scraper construction options found in ``kwargs`` are removed from it
        and passed to ``cls.create_scraper``; whatever remains is forwarded
        to the scraper's ``get`` call.

        Args:
            url: Address to request.
            **kwargs: May contain the scraper options ``allow_brotli``,
                ``browser``, ``debug``, ``delay``, ``interpreter`` and
                ``recaptcha``; all other entries go to ``scraper.get``.

        Returns:
            tuple: ``(tokens, user_agent)`` where ``tokens`` is a dict with
            the ``__cfduid`` and ``cf_clearance`` cookie values (``''`` when
            a cookie is absent) and ``user_agent`` is the scraper's
            ``User-Agent`` header.

        Raises:
            cloudflare_exceptions.Cloudflare_Error_IUAM: If no cookie domain
                matching the response host is found.
            Exception: Whatever the request or ``raise_for_status`` raised,
                re-raised after being logged.
        """
        scraper_options = {}
        for option in ('allow_brotli', 'browser', 'debug', 'delay',
                       'interpreter', 'recaptcha'):
            if option in kwargs:
                scraper_options[option] = kwargs.pop(option)
        scraper = cls.create_scraper(**scraper_options)

        try:
            response = scraper.get(url, **kwargs)
            response.raise_for_status()
        except Exception:
            message = '"{}" returned an error. Could not collect tokens.'
            logging.error(message.format(url))
            raise

        # Substring test on purpose: a cookie domain such as '.example.com'
        # is accepted for a response host like 'www.example.com'.
        dotted_host = '.{}'.format(urlparse(response.url).netloc)
        cookie_domain = next(
            (candidate for candidate in scraper.cookies.list_domains()
             if candidate.startswith('.') and candidate in dotted_host),
            None)

        if cookie_domain is None:
            sys.tracebacklimit = 0
            raise cloudflare_exceptions.Cloudflare_Error_IUAM(
                "Unable to find Cloudflare cookies. Does the site actually "
                "have Cloudflare IUAM (I'm Under Attack Mode) enabled?")

        tokens = {
            name: scraper.cookies.get(name, '', domain=cookie_domain)
            for name in ('__cfduid', 'cf_clearance')
        }
        return (tokens, scraper.headers['User-Agent'])
Ejemplo n.º 3
0
    def Challenge_Response(self, resp, **kwargs):
        """Answer the Cloudflare challenge in *resp* and return the follow-up.

        Two kinds of challenge are handled. A reCaptcha challenge is handed
        to the provider configured in ``self.recaptcha``; anything else is
        treated as an IUAM (JavaScript) challenge, solved after waiting
        ``self.delay`` seconds. The resulting form is POSTed back and, if
        that answer is a redirect, the redirect target is requested.

        Args:
            resp: Response holding the challenge page; its ``text``, ``url``
                and ``request.method`` are read.
            **kwargs: Keyword arguments of the original request. They are
                deep-copied before being modified for the follow-up requests.

        Returns:
            The response obtained after the challenge: the repeated request
            if it is no longer a reCaptcha challenge, the unmodified
            challenge response when the provider is ``'return_response'``,
            the POST response when it is not a redirect, or the response of
            the request to the redirect location. If no submission data was
            produced, the original request is simply issued again.

        Raises:
            cloudflare_exceptions.Cloudflare_reCaptcha_Provider: If a
                reCaptcha challenge is seen and ``self.recaptcha`` is not a
                dict with a ``'provider'`` entry.
            cloudflare_exceptions.Cloudflare_Error_IUAM: If ``self.delay`` is
                unset and no delay value can be extracted from the page.
        """
        if self.is_reCaptcha_Challenge(resp):
            # ------------------------------------------------------------------------------- #
            # double down on the request as some websites are only checking
            # if cfuid is populated before issuing reCaptcha.
            # ------------------------------------------------------------------------------- #

            # The parent class's request() is called directly here, not
            # self.request().
            resp = self.decodeBrotli(
                super(CloudScraper, self).request(resp.request.method,
                                                  resp.url, **kwargs))

            if not self.is_reCaptcha_Challenge(resp):
                return resp

            # ------------------------------------------------------------------------------- #
            # if no reCaptcha provider raise a runtime error.
            # ------------------------------------------------------------------------------- #

            if not self.recaptcha or not isinstance(
                    self.recaptcha,
                    dict) or not self.recaptcha.get('provider'):
                # Interpreter-wide setting: tracebacks stay suppressed after this.
                sys.tracebacklimit = 0
                raise cloudflare_exceptions.Cloudflare_reCaptcha_Provider(
                    "Cloudflare reCaptcha detected, unfortunately you haven't loaded an anti reCaptcha provider "
                    "correctly via the 'recaptcha' parameter.")

            # ------------------------------------------------------------------------------- #
            # if provider is return_response, return the response without doing anything.
            # ------------------------------------------------------------------------------- #

            if self.recaptcha.get('provider') == 'return_response':
                return resp

            # The session's proxies are stored in the recaptcha settings dict
            # (mutating it) before the dict is passed to the solver.
            self.recaptcha['proxies'] = self.proxies
            submit_url = self.reCaptcha_Challenge_Response(
                self.recaptcha.get('provider'), self.recaptcha, resp.text,
                resp.url)
        else:
            # ------------------------------------------------------------------------------- #
            # Cloudflare requires a delay before solving the challenge
            # ------------------------------------------------------------------------------- #

            if not self.delay:
                try:
                    # Take the number that follows "submit();" and the closing
                    # brace in the page script and divide it by 1000
                    # (presumably milliseconds -> seconds; confirm).
                    delay = float(
                        re.search(r'submit\(\);\r?\n\s*},\s*([0-9]+)',
                                  resp.text).group(1)) / float(1000)
                    # The division above always yields a float, so this check
                    # always passes; the value is kept on the instance and
                    # reused by later calls.
                    if isinstance(delay, (int, float)):
                        self.delay = delay
                except (AttributeError, ValueError):
                    # AttributeError: the pattern did not match (search
                    # returned None).
                    sys.tracebacklimit = 0
                    raise cloudflare_exceptions.Cloudflare_Error_IUAM(
                        "Cloudflare IUAM possibility malformed, issue extracing delay value."
                    )

            sleep(self.delay)

            # ------------------------------------------------------------------------------- #

            submit_url = self.IUAM_Challenge_Response(resp.text, resp.url,
                                                      self.interpreter)

        # ------------------------------------------------------------------------------- #
        # Send the Challenge Response back to Cloudflare
        # ------------------------------------------------------------------------------- #

        # Despite its name, submit_url is indexed below with 'url' and 'data',
        # i.e. it is a mapping carrying both the target and the form fields.
        if submit_url:

            def updateAttr(obj, name, newValue):
                # Merge newValue into obj[name] and return the merged value.
                # When the key is missing (KeyError) or the existing value has
                # no update() method (AttributeError), the entry is replaced
                # by a new dict first, discarding the previous value.
                try:
                    obj[name].update(newValue)
                    return obj[name]
                except (AttributeError, KeyError):
                    obj[name] = {}
                    obj[name].update(newValue)
                    return obj[name]

            # Work on a copy so the caller's kwargs are left untouched.
            cloudflare_kwargs = deepcopy(kwargs)
            # Redirects are handled manually further down.
            cloudflare_kwargs['allow_redirects'] = False
            cloudflare_kwargs['data'] = updateAttr(cloudflare_kwargs, 'data',
                                                   submit_url['data'])

            urlParsed = urlparse(resp.url)
            cloudflare_kwargs['headers'] = updateAttr(
                cloudflare_kwargs, 'headers', {
                    'Origin': '{}://{}'.format(urlParsed.scheme,
                                               urlParsed.netloc),
                    'Referer': resp.url
                })

            challengeSubmitResponse = self.request('POST', submit_url['url'],
                                                   **cloudflare_kwargs)

            # ------------------------------------------------------------------------------- #
            # Return response if Cloudflare is doing content pass through instead of 3xx
            # else request with redirect URL also handle protocol scheme change http -> https
            # ------------------------------------------------------------------------------- #

            if not challengeSubmitResponse.is_redirect:
                return challengeSubmitResponse
            else:
                # Start again from the original kwargs, so the challenge form
                # data and Origin header added above are not sent to the
                # redirect target; only a Referer is added.
                cloudflare_kwargs = deepcopy(kwargs)
                cloudflare_kwargs['headers'] = updateAttr(
                    cloudflare_kwargs, 'headers',
                    {'Referer': challengeSubmitResponse.url})

                # A Location without a netloc is relative and is resolved
                # against the URL of the POST response; otherwise it is used
                # as given.
                if not urlparse(
                        challengeSubmitResponse.headers['Location']).netloc:
                    redirect_location = urljoin(
                        challengeSubmitResponse.url,
                        challengeSubmitResponse.headers['Location'])
                else:
                    redirect_location = challengeSubmitResponse.headers[
                        'Location']

                # The redirect is followed with the original request's method.
                return self.request(resp.request.method, redirect_location,
                                    **cloudflare_kwargs)

        # ------------------------------------------------------------------------------- #
        # We shouldn't be here...
        # Re-request the original query and/or process again....
        # ------------------------------------------------------------------------------- #

        # Reached only when submit_url is falsy.
        return self.request(resp.request.method, resp.url, **kwargs)