Example #1
def download_file(self, target, referer='', post=None, dest_name=None):
    if not self.download_dir:
        return None
    Url.headers['Referer'] = referer
    if self.use_auth:
        if not self.auth_try():
            return None
    try:
        if not dest_name:
            dest_name = os.path.basename(target)
        url = self.url_opener.open(
            urllib2.Request(url=target, data=post, headers=Url.headers))
        fl = open(os.path.join(self.download_dir, dest_name), "wb")
        fl.write(url.read())
        fl.close()
        return os.path.join(self.download_dir, dest_name)
    except urllib2.HTTPError, e:
        if int(e.getcode()) == 503:
            # Cloudflare serves its anti-bot challenge as HTTP 503;
            # retry the download through cfscrape.
            try:
                from cfscrape import CloudflareScraper
                scraper = CloudflareScraper()
                self.log('Loading CF protected image %s > %s' %
                         (target, dest_name))
                fl = open(os.path.join(self.download_dir, dest_name), "wb")
                c = scraper.get(target).content
                fl.write(c)
                fl.close()
                return os.path.join(self.download_dir, dest_name)
            except Exception:
                pass
        if self.show_errors:
            xbmc.executebuiltin(
                'XBMC.Notification("HTTP_ERROR", "%s", 3000, "")' % e)
        self.log(target + ' ' + str(e))
        return None
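
The cfscrape fallback above is tied to a Kodi add-on class. A minimal standalone sketch of the same 503-retry pattern, using only requests and cfscrape (the function name and URL handling here are illustrative, not part of the original):

import cfscrape
import requests


def fetch_with_cf_fallback(target_url):
    # Plain request first; Cloudflare's anti-bot challenge answers with HTTP 503.
    resp = requests.get(target_url)
    if resp.status_code == 503:
        scraper = cfscrape.create_scraper()  # drop-in requests.Session subclass
        resp = scraper.get(target_url)
    resp.raise_for_status()
    return resp.content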
Example #2

from typing import Optional

from cfscrape import CloudflareScraper
from fake_useragent import UserAgent
from requests import Response


class Scraper:
    # One shared Cloudflare-aware session and a random User-Agent
    # "personality" that is rotated whenever a request is rejected.
    scraper = CloudflareScraper()
    ua = UserAgent()
    personality = ua.random

    @staticmethod
    def get_html(url):
        # scraper.get() returns a requests.Response, not a CloudflareScraper.
        request: Optional[Response] = None

        for i in range(120):
            try:
                request = Scraper.scraper.get(
                    url,
                    headers={'User-Agent': Scraper.personality},
                    timeout=0.7)
                if request.status_code == 200:
                    return request.content
                else:
                    Scraper.personality = Scraper.ua.random
                    continue
            except Exception:
                pass

        print("Scraper can't do request")
        if request is not None:
            return request.content
        return ''
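
A usage sketch for the class above (placeholder URL; requires the cfscrape and fake_useragent packages):

html = Scraper.get_html('https://example.com/')
if html:
    print('fetched %d bytes' % len(html))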
Example #3
def cloudflare(session, resp, **kwargs):
    """
    Bypass Cloudflare's anti-bot protection.

    A request handler that retries a request after bypassing Cloudflare anti-bot
    protection.
    """
    if CloudflareScraper.is_cloudflare_iuam_challenge(resp):
        log.debug('Cloudflare protection detected, trying to bypass it.')

        # Get the original request
        original_request = resp.request

        # Get the Cloudflare tokens and original user-agent
        tokens, user_agent = CloudflareScraper.get_tokens(original_request.url)

        # Add Cloudflare tokens to the session cookies
        session.cookies.update(tokens)
        # Add Cloudflare Tokens to the original request
        original_cookies = dict_from_cookiejar(original_request._cookies)
        original_cookies.update(tokens)
        original_request.prepare_cookies(original_cookies)

        # The same User-Agent must be used for the retry
        # Update the session with the Cloudflare User-Agent
        session.headers['User-Agent'] = user_agent
        # Update the original request with the Cloudflare User-Agent
        original_request.headers['User-Agent'] = user_agent

        # Resend the request
        kwargs = filtered_kwargs(kwargs)
        kwargs['allow_redirects'] = True
        cf_resp = session.send(
            original_request,
            **kwargs
        )
        cf_resp.raise_for_status()

        if cf_resp.ok:
            log.debug('Cloudflare successfully bypassed.')
        return cf_resp
    else:
        if CloudflareScraper.is_cloudflare_captcha_challenge(resp):
            log.warning("Cloudflare captcha challenge detected, it can't be bypassed.")

        return resp
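
The handler takes (session, resp) rather than the (response, **kwargs) signature requests passes to response hooks, so one plausible way to wire it up, not shown in the original, is to bind the session with functools.partial:

from functools import partial

import requests

session = requests.Session()
session.hooks['response'].append(partial(cloudflare, session))
resp = session.get('https://example.com/')  # placeholder URL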
Example #4
import time

from bs4 import BeautifulSoup
from cfscrape import CloudflareScraper


def _handle_submission_results(submission_id: str, session: CloudflareScraper):
    submission_update_url = get_submission_update_url(submission_id)

    finished_strings = [
        "Final score:", "An internal error occurred while grading.",
        "Submission Aborted!", "Compilation Error"
    ]

    failed_tries = 0
    max_fail_attempts = 5
    last_attempt_time = 0
    delay = 0.5

    batch_index = 0
    testcase_index = 0

    while True:
        # Throttle polling to roughly one request per `delay` seconds.
        elapsed = time.time() - last_attempt_time
        if elapsed < delay:
            time.sleep(delay - elapsed)

        last_attempt_time = time.time()
        # Tudor plz give better interface. Maybe JSON string?
        data = session.get(submission_update_url)
        if not data or data.status_code != 200:
            failed_tries += 1
            if failed_tries > max_fail_attempts:
                print("Something went wrong... Breaking!")
                break

            print("Failed attempt: re-attempt {} of {}".format(
                failed_tries, max_fail_attempts))

        soup = BeautifulSoup(data.text, "html.parser")
        batches = soup.find_all("table", "submissions-status-table")

        while batch_index < len(batches):
            testcases = list(batches[batch_index].find_all(
                "tr", {"class": "case-row"}))
            if testcase_index == len(testcases):
                if batch_index + 1 < len(batches):
                    batch_index += 1
                    testcase_index = 0  # restart the case counter for the new batch
                    print("Batch #{}".format(batch_index))
                else:
                    break

            else:
                print("\t{}".format(_format_case_row(
                    testcases[testcase_index])))
                testcase_index += 1

        if any(s in data.text for s in finished_strings):
            print("Finished")
            break
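
A usage sketch with a placeholder submission id; get_submission_update_url and _format_case_row are project helpers elided from this listing:

session = CloudflareScraper()
_handle_submission_results('12345', session)  # polls until a final verdict is seen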
Example #5

from http.client import IncompleteRead
from time import sleep
from typing import Optional, Union

from cfscrape import CloudflareScraper
from requests import Response


def get_binary_raw(session: CloudflareScraper,
                   url: str,
                   speed: Union[int, float] = 100) -> Optional[bytes]:
    assert isinstance(speed, (int, float))

    file_stream: Response = session.get(url, stream=True)

    file_stream.raise_for_status()

    file_binary: bytes = bytes()
    for chunk in file_stream.iter_content(chunk_size=1024):
        file_binary += chunk
        if speed > 0:
            # Crude throttle: pause between 1 KiB chunks.
            sleep(1 / speed)

    if (length := int(file_stream.headers.get(
            "Content-Length", 0))) > 0 and length != len(file_binary):
        received = len(file_binary)
        raise IncompleteRead(received, length - received)

    return file_binary
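
A usage sketch (placeholder URL; speed=50 sleeps 20 ms per 1 KiB chunk, throttling to roughly 50 KiB/s on a fast link):

scraper = CloudflareScraper()
data = get_binary_raw(scraper, 'https://example.com/file.bin', speed=50)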
Example #6
def get_url(
        url,  # type: AnyStr
        post_data=None,  # type: Optional
        params=None,  # type: Optional
        headers=None,  # type: Optional[Dict]
        timeout=30,  # type: int
        session=None,  # type: Optional[requests.Session]
        parse_json=False,  # type: bool
        raise_status_code=False,  # type: bool
        raise_exceptions=False,  # type: bool
        as_binary=False,  # type: bool
        encoding=None,  # type: Optional[AnyStr]
        **kwargs):
    # type: (...) -> Optional[Union[AnyStr, bool, bytes, Dict, Tuple[Union[Dict, List], requests.Session]]]
    """
    Either
    1) Returns a byte-string retrieved from the url provider.
    2) Returns True/False for success after using kwargs 'savename' set to a file pathname.
    3) Returns Tuple response, session if success after setting kwargs 'resp_sess' True.
    4) JSON Dict if parse_json=True.

    :param url: url
    :param post_data: post data
    :param params: request parameters
    :param headers: headers to add
    :param timeout: timeout
    :param session: optional session object
    :param parse_json: return JSON Dict
    :param raise_status_code: raise exception for status codes
    :param raise_exceptions: raise exceptions
    :param as_binary: return bytes instead of text
    :param encoding: overwrite encoding return header if as_binary is False
    :param kwargs:
    :return:
    """

    response_attr = ('text', 'content')[as_binary]

    # selectively mute some errors
    mute = filter_list(lambda x: kwargs.pop(x, False), [
        'mute_connect_err', 'mute_read_timeout', 'mute_connect_timeout',
        'mute_http_error'
    ])

    # reuse or instantiate request session
    resp_sess = kwargs.pop('resp_sess', None)
    if None is session:
        session = CloudflareScraper.create_scraper()
        session.headers.update({'User-Agent': USER_AGENT})

    # download and save file or simply fetch url
    savename = kwargs.pop('savename', None)
    if savename:
        # session streaming
        session.stream = True

    if not kwargs.pop('nocache', False):
        cache_dir = CACHE_DIR or get_system_temp_dir()
        session = CacheControl(sess=session,
                               cache=caches.FileCache(
                                   ek.ek(os.path.join, cache_dir, 'sessions')))

    provider = kwargs.pop('provider', None)

    # handle legacy uses of `json` param
    if kwargs.get('json'):
        parse_json = kwargs.pop('json')

    # session master headers
    req_headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate'
    }
    if headers:
        req_headers.update(headers)
    if hasattr(session, 'reserved') and 'headers' in session.reserved:
        req_headers.update(session.reserved['headers'] or {})
    session.headers.update(req_headers)

    # session parameters
    session.params = params

    # session ssl verify
    session.verify = False

    # don't trust os environments (auth, proxies, ...)
    session.trust_env = False

    response = None
    try:
        # sanitise url
        parsed = list(urlparse(url))
        parsed[2] = re.sub('/{2,}', '/',
                           parsed[2])  # replace two or more / with one
        url = urlunparse(parsed)

        # session proxies
        if PROXY_SETTING:
            (proxy_address, pac_found) = proxy_setting(PROXY_SETTING, url)
            msg = '%sproxy for url: %s' % (('', 'PAC parsed ')[pac_found], url)
            if None is proxy_address:
                logger.debug('Proxy error, aborted the request using %s' % msg)
                return
            elif proxy_address:
                logger.debug('Using %s' % msg)
                session.proxies = {
                    'http': proxy_address,
                    'https': proxy_address
                }

        # decide if we get or post data to server
        if post_data or 'post_json' in kwargs:
            if True is post_data:
                post_data = None

            if post_data:
                kwargs.setdefault('data', post_data)

            if 'post_json' in kwargs:
                kwargs.setdefault('json', kwargs.pop('post_json'))

            response = session.post(url, timeout=timeout, **kwargs)
        else:
            response = session.get(url, timeout=timeout, **kwargs)
            if response.ok and not response.content and 'url=' in response.headers.get(
                    'Refresh', '').lower():
                url = response.headers.get('Refresh').lower().split(
                    'url=')[1].strip('/')
                if not url.startswith('http'):
                    parsed[2] = '/%s' % url
                    url = urlunparse(parsed)
                response = session.get(url, timeout=timeout, **kwargs)

        # if encoding is not in header try to use best guess
        # ignore downloads with savename
        if not savename and not as_binary:
            if encoding:
                response.encoding = encoding
            elif not response.encoding or 'charset' not in response.headers.get(
                    'Content-Type', ''):
                response.encoding = response.apparent_encoding

        # noinspection PyProtectedMember
        if provider and provider._has_signature(response.text):
            return getattr(response, response_attr)

        if raise_status_code:
            response.raise_for_status()

        if not response.ok:
            http_err_text = 'CloudFlare Ray ID' in response.text and \
                            'CloudFlare reports, "Website is offline"; ' or ''
            if response.status_code in http_error_code:
                http_err_text += http_error_code[response.status_code]
            elif response.status_code in range(520, 527):
                http_err_text += 'Origin server connection failure'
            else:
                http_err_text = 'Custom HTTP error code'
                if 'mute_http_error' not in mute:
                    logger.debug(
                        u'Response not ok. %s: %s from requested url %s' %
                        (response.status_code, http_err_text, url))
            return

    except requests.exceptions.HTTPError as e:
        if raise_status_code:
            response.raise_for_status()
        logger.warning(u'HTTP error %s while loading URL%s' %
                       (e.errno, _maybe_request_url(e)))
        return
    except requests.exceptions.ConnectionError as e:
        if 'mute_connect_err' not in mute:
            logger.warning(u'Connection error msg:%s while loading URL%s' %
                           (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except requests.exceptions.ReadTimeout as e:
        if 'mute_read_timeout' not in mute:
            logger.warning(u'Read timed out msg:%s while loading URL%s' %
                           (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except (requests.exceptions.Timeout, socket.timeout) as e:
        if 'mute_connect_timeout' not in mute:
            logger.warning(
                u'Connection timed out msg:%s while loading URL %s' %
                (ex(e), _maybe_request_url(e, url)))
        if raise_exceptions:
            raise e
        return
    except (BaseException, Exception) as e:
        if ex(e):
            logger.warning(
                u'Exception caught while loading URL %s\r\nDetail... %s\r\n%s'
                % (url, ex(e), traceback.format_exc()))
        else:
            logger.warning(
                u'Unknown exception while loading URL %s\r\nDetail... %s' %
                (url, traceback.format_exc()))
        if raise_exceptions:
            raise e
        return

    if parse_json:
        try:
            data_json = response.json()
            if resp_sess:
                return ({}, data_json)[isinstance(data_json,
                                                  (dict, list))], session
            return ({}, data_json)[isinstance(data_json, (dict, list))]
        except (TypeError, Exception) as e:
            logger.warning(u'JSON data issue from URL %s\r\nDetail... %s' %
                           (url, ex(e)))
            if raise_exceptions:
                raise e
            return None

    if savename:
        try:
            write_file(savename,
                       response,
                       raw=True,
                       raise_exceptions=raise_exceptions)
        except (BaseException, Exception) as e:
            if raise_exceptions:
                raise e
            return
        return True

    if resp_sess:
        return getattr(response, response_attr), session

    return getattr(response, response_attr)
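
Usage sketches derived from the docstring's return modes (URLs and paths are placeholders):

html = get_url('https://example.com/page')                    # text, or None on error
data = get_url('https://example.com/api', parse_json=True)    # dict/list, else {}
saved = get_url('https://example.com/f.zip', savename='/tmp/f.zip')  # True on success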
Example #7

            request.url = request.url.replace(
                'https://' + result.hostname,
                'https://' + resolvedIP,
            )
            connection_pool_kwargs['server_hostname'] = result.hostname
            connection_pool_kwargs['assert_hostname'] = result.hostname
            request.headers['Host'] = result.hostname

        else:
            connection_pool_kwargs.pop('server_hostname', None)
            connection_pool_kwargs.pop('assert_hostname', None)

        return super(HostHeaderSSLAdapter, self).send(request, **kwargs)


cfs = CloudflareScraper()

cfs.mount('https://', HostHeaderSSLAdapter())

hParser = 'html.parser'

infoBanner = "[Marumaru-Downloader]"

header = {
    'User-agent': 'Mozilla/5.0',
    'Referer': baseURL,
}


def PrintBanner():
    print('''
Example #8

def get(session: CloudflareScraper, path: str, **params) -> Response:
    return session.get(join_url(root, path), params=params)
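
A usage sketch; root and join_url are project helpers elided from this listing, and the path and query parameter are placeholders:

session = CloudflareScraper()
resp = get(session, 'problems', page=2)
print(resp.status_code)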