def request(url,
            close=True,
            redirect=True,
            error=False,
            proxy=None,
            post=None,
            headers=None,
            mobile=False,
            XHR=False,
            limit=None,
            referer=None,
            cookie=None,
            compression=True,
            output='',
            timeout='30',
            verifySsl=True,
            flare=True,
            ignoreErrors=None,
            as_bytes=False):
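    """
    Fetch a URL with urllib2 and return the body (or cookies, headers, etc.,
    depending on `output`). A hedged summary inferred from the code below:

    - post: dict or str body; a dict is urlencoded, and both forms are
      encoded to bytes for Python 3.
    - limit: body size cap in KiB ('0' means a 224 KiB cap); when None the
      read is capped at 5 MiB.
    - output: '' (body), 'cookie', 'geturl', 'headers', 'chunk',
      'file_size', or 'extended' (body, code, response headers, request
      headers, cookie).
    - flare: attempt a cfscrape fallback on Cloudflare challenge responses.
    - ignoreErrors: an HTTP status code (or iterable of codes) to pass
      through without error handling.
    """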
    try:
        if not url: return None
        if url.startswith('//'): url = 'http:' + url
        try:
            url = py_tools.ensure_text(url, errors='ignore')
        except:
            pass

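        # urlopen() requires a bytes body on Python 3: urlencode dict posts
        # and encode str posts up front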
        if isinstance(post, dict):
            post = bytes(urlencode(post), encoding='utf-8')
        elif isinstance(post, str) and py_tools.isPY3:
            post = bytes(post, encoding='utf-8')

        handlers = []
        if proxy is not None:
            handlers += [
                urllib2.ProxyHandler({'http': '%s' % (proxy)}),
                urllib2.HTTPHandler
            ]
            opener = urllib2.build_opener(*handlers)
            urllib2.install_opener(opener)

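        # a cookie jar is only needed when cookies are returned to the caller
        # ('cookie'/'extended' output) or the connection is kept open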
        if output == 'cookie' or output == 'extended' or close is not True:
            cookies = cookielib.LWPCookieJar()
            handlers += [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            urllib2.install_opener(opener)

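        # newer interpreters verify HTTPS certificates by default; install an
        # unverified context when the caller opts out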
        if not verifySsl and version_info >= (2, 7, 12):
            try:
                import ssl
                ssl_context = ssl._create_unverified_context()
                handlers += [urllib2.HTTPSHandler(context=ssl_context)]
                opener = urllib2.build_opener(*handlers)
                urllib2.install_opener(opener)
            except:
                from fenomscrapers.modules import log_utils
                log_utils.error()

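        # 2.7.9 - 2.7.11 interpreters enabled default verification but are
        # problematic here, so verification is relaxed even when verifySsl is True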
        if verifySsl and ((2, 7, 8) < version_info < (2, 7, 12)):
            try:
                import ssl
                try:
                    import _ssl
                    CERT_NONE = _ssl.CERT_NONE
                except Exception:
                    CERT_NONE = ssl.CERT_NONE
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = CERT_NONE
                handlers += [urllib2.HTTPSHandler(context=ssl_context)]
                opener = urllib2.build_opener(*handlers)
                urllib2.install_opener(opener)
            except:
                from fenomscrapers.modules import log_utils
                log_utils.error()

        # make sure headers is a mutable dict; callers may pass None
        if not isinstance(headers, dict): headers = {}

        if 'User-Agent' in headers: pass
        elif mobile is not True:
            headers['User-Agent'] = cache.get(randomagent, 12)
        else:
            headers['User-Agent'] = 'Apple-iPhone/701.341'
        if 'Referer' in headers: pass
        elif referer is not None: headers['Referer'] = referer
        if 'Accept-Language' not in headers:
            headers['Accept-Language'] = 'en-US'
        if 'X-Requested-With' in headers: pass
        elif XHR: headers['X-Requested-With'] = 'XMLHttpRequest'
        if 'Cookie' in headers: pass
        elif cookie: headers['Cookie'] = cookie
        if 'Accept-Encoding' in headers: pass
        elif compression and limit is None: headers['Accept-Encoding'] = 'gzip'

        if redirect is False:

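            # swallow 3xx responses: return them as-is instead of letting
            # urllib2 follow the Location header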
            class NoRedirectHandler(urllib2.HTTPRedirectHandler):
                def http_error_302(self, reqst, fp, code, msg, head):
                    infourl = addinfourl(fp, head, reqst.get_full_url())
                    infourl.status = code
                    infourl.code = code
                    return infourl

                http_error_300 = http_error_302
                http_error_301 = http_error_302
                http_error_303 = http_error_302
                http_error_307 = http_error_302

            opener = urllib2.build_opener(NoRedirectHandler())
            urllib2.install_opener(opener)
            try:
                del headers['Referer']
            except:
                pass

        req = urllib2.Request(url, data=post)
        _add_request_header(req, headers)
        try:
            response = urllib2.urlopen(req, timeout=int(timeout))
        except HTTPError as error_response:
            # Py3 deletes the "as" target when the except block ends; bind it
            # to a longer-lived name so later code can still use "response"
            response = error_response
            try:
                code = int(response.code)
                ignore = ignoreErrors and (code == ignoreErrors or code in ignoreErrors)
            except:
                ignore = False

            if not ignore:
                if response.code in [301, 307, 308, 503, 403]:  # 403 added for Cloudflare challenges, which reject bad User-Agents
                    cf_result = response.read(5242880)
                    try:
                        encoding = response.headers["Content-Encoding"]
                    except:
                        encoding = None
                    if encoding == 'gzip':
                        cf_result = gzip.GzipFile(fileobj=StringIO(cf_result)).read()
                    cf_result = py_tools.ensure_text(cf_result, errors='ignore')  # read() returns bytes on Py3; decode before the substring check below

                    if flare and 'cloudflare' in str(response.info()).lower():
                        from fenomscrapers.modules import log_utils
                        log_utils.log(
                            'client module calling cfscrape: url=%s' % url,
                            level=log_utils.LOGDEBUG)
                        try:
                            from fenomscrapers.modules import cfscrape
                            if isinstance(post, dict): data = post
                            else:
                                try:
                                    data = parse_qs(post)
                                except:
                                    data = None
                            scraper = cfscrape.CloudScraper()
                            if response.code == 403:  # possible bad User-Agent in headers, let cfscrape assign
                                response = scraper.request(
                                    method='GET' if post is None else 'POST',
                                    url=url,
                                    data=data,
                                    timeout=int(timeout))
                            else:
                                response = scraper.request(
                                    method='GET' if post is None else 'POST',
                                    url=url,
                                    headers=headers,
                                    data=data,
                                    timeout=int(timeout))
                            result = response.content
                            flare = 'cloudflare'  # Used below
                            try:
                                cookies = response.request._cookies
                            except:
                                log_utils.error()
                            if response.status_code == 403:  # if cfscrape server still responds with 403
                                log_utils.log(
                                    'cfscrape-Error url=(%s): %s' %
                                    (url, 'HTTP Error 403: Forbidden'),
                                    __name__,
                                    level=log_utils.LOGDEBUG)
                                return None
                        except:
                            log_utils.error()
                    elif 'cf-browser-verification' in cf_result:
                        netloc = '%s://%s' % (urlparse(url).scheme,
                                              urlparse(url).netloc)
                        ua = headers['User-Agent']
                        cf = cache.get(cfcookie().get, 168, netloc, ua,
                                       timeout)
                        headers['Cookie'] = cf
                        req = urllib2.Request(url, data=post)
                        _add_request_header(req, headers)
                        response = urllib2.urlopen(req, timeout=int(timeout))
                    else:
                        if error is False:
                            from fenomscrapers.modules import log_utils
                            log_utils.error('Request-Error url=(%s)' % url)
                            return None
                else:
                    if error is False:
                        from fenomscrapers.modules import log_utils
                        log_utils.error('Request-Error url=(%s)' % url)
                        return None
                    elif error is True and response.code in [
                            401, 404, 405
                    ]:  # no point in continuing after this exception runs with these response.code's
                        try:
                            # Kodi 18 merged repeated "Set-Cookie" headers into one key; Kodi 19 keeps only the last value
                            response_headers = dict([(item[0].title(), item[1]) for item in list(response.info().items())])
                        except:
                            from fenomscrapers.modules import log_utils
                            log_utils.error()
                            response_headers = response.headers
                        return (str(response), str(response.code),
                                response_headers)

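        # success (or ignored-error) path: shape the return value according to `output`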
        if output == 'cookie':
            try:
                result = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass
            if close is True: response.close()
            return result
        elif output == 'geturl':
            result = response.geturl()
            if close is True: response.close()
            return result
        elif output == 'headers':
            result = response.headers
            if close is True: response.close()
            return result
        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024): return
            try:
                result = response.read(16 * 1024)
            except:
                result = response  # fallback to the response object itself (e.g. no read() on a cfscrape response)
            if close is True: response.close()
            return result
        elif output == 'file_size':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = 0
            if close is True: response.close()
            return content
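        # body read: `limit` is in KiB ('0' caps at 224 KiB, default cap 5 MiB);
        # skip when cfscrape already supplied `result` above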
        if flare != 'cloudflare':
            if limit == '0': result = response.read(224 * 1024)
            elif limit is not None: result = response.read(int(limit) * 1024)
            else: result = response.read(5242880)

        try:
            encoding = response.headers["Content-Encoding"]
        except:
            encoding = None

        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO(result)).read()
        if not as_bytes:
            result = py_tools.ensure_text(result, errors='ignore')

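        # Sucuri's cloudproxy serves a JS challenge page; solve it for the
        # cookie and replay the request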
        if not as_bytes and 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)
            headers['Cookie'] = su
            req = urllib2.Request(url, data=post)
            _add_request_header(req, headers)
            response = urllib2.urlopen(req, timeout=int(timeout))
            if limit == '0': result = response.read(224 * 1024)
            elif limit is not None: result = response.read(int(limit) * 1024)
            else: result = response.read(5242880)
            try:
                encoding = response.headers["Content-Encoding"]
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO(result)).read()
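        # Blazingfast serves a similar JS check; fetch its cookie and retry
        # with a basic request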
        if not as_bytes and 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse(url).scheme, urlparse(url).netloc)
            ua = headers['User-Agent']
            headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua,
                                          timeout)
            result = _basic_request(url,
                                    headers=headers,
                                    post=post,
                                    timeout=timeout,
                                    limit=limit)

        if output == 'extended':
            try:
                # Kodi 18 merged repeated "Set-Cookie" headers into one key; Kodi 19 keeps only the last value
                response_headers = dict([(item[0].title(), item[1]) for item in list(response.info().items())])
            except:
                from fenomscrapers.modules import log_utils
                log_utils.error()
                response_headers = response.headers
            try:
                response_code = str(response.code)
            except:
                response_code = str(response.status_code)  # cfscrape returns a Requests response object
            try:
                cookie = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                cookie = cf
            except:
                pass
            if close is True: response.close()
            return (result, response_code, response_headers, headers, cookie)
        else:
            if close is True: response.close()
            return result
    except:
        from fenomscrapers.modules import log_utils
        log_utils.error('Request-Error url=(%s)' % url)
        return None
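
# A minimal usage sketch for the function above (assuming this module is
# imported as `client`; the URL is a placeholder):
#
#   html = client.request('https://example.com/page')
#   cookie = client.request('https://example.com/page', output='cookie')
#   body, code, resp_headers, req_headers, cookie = client.request(
#       'https://example.com/page', output='extended')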
Example #2
def request(url,
            close=True,
            redirect=True,
            error=False,
            proxy=None,
            post=None,
            headers=None,
            mobile=False,
            XHR=False,
            limit=None,
            referer=None,
            cookie=None,
            compression=True,
            output='',
            timeout='30',
            ignoreSsl=False,
            flare=True,
            ignoreErrors=None):
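    """
    Older variant of request(): `ignoreSsl` replaces `verifySsl`, there is no
    `as_bytes` option, and logging goes through a module-level log_utils import.
    """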
    try:
        if not url: return None

        handlers = []
        if proxy is not None:
            handlers += [
                urllib2.ProxyHandler({'http': '%s' % (proxy)}),
                urllib2.HTTPHandler
            ]
            opener = urllib2.build_opener(*handlers)
            urllib2.install_opener(opener)

        if output == 'cookie' or output == 'extended' or close is not True:
            cookies = cookielib.LWPCookieJar()
            handlers += [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            urllib2.install_opener(opener)

        if ignoreSsl or ((2, 7, 8) < sys.version_info < (2, 7, 12)):
            try:
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                handlers += [urllib2.HTTPSHandler(context=ssl_context)]
                opener = urllib2.build_opener(*handlers)
                urllib2.install_opener(opener)
            except:
                pass

        if url.startswith('//'):
            url = 'http:' + url

        # make sure headers is a mutable dict; callers may pass None
        if not isinstance(headers, dict): headers = {}

        if 'User-Agent' in headers: pass
        elif mobile is not True:
            headers['User-Agent'] = cache.get(randomagent, 12)
        else:
            headers['User-Agent'] = 'Apple-iPhone/701.341'

        if 'Referer' in headers: pass
        elif referer: headers['Referer'] = referer

        if 'Accept-Language' not in headers:
            headers['Accept-Language'] = 'en-US'

        if 'X-Requested-With' in headers: pass
        elif XHR: headers['X-Requested-With'] = 'XMLHttpRequest'

        if 'Cookie' in headers: pass
        elif cookie: headers['Cookie'] = cookie

        if 'Accept-Encoding' in headers: pass
        elif compression and limit is None: headers['Accept-Encoding'] = 'gzip'

        if redirect is False:

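            # an HTTPErrorProcessor that returns the raw response stops
            # urllib2 from acting on 3xx redirects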
            class NoRedirection(urllib2.HTTPErrorProcessor):
                def http_response(self, request, response):
                    return response

            opener = urllib2.build_opener(NoRedirection)
            urllib2.install_opener(opener)
            try:
                del headers['Referer']
            except:
                pass

        if isinstance(post, dict):
            # Gets rid of the error: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
            try:
                iter_items = post.iteritems()
            except:
                iter_items = post.items()
            for key, value in iter_items:
                try:
                    post[key] = value.encode('utf-8')
                except:
                    pass
            post = urlencode(post)

        request = urllib2.Request(url, data=post)
        _add_request_header(request, headers)

        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except HTTPError as error_response:
            # Py3 deletes the "as" target after the except block; rebind it
            response = error_response
            try:
                code = int(response.code)
                ignore = ignoreErrors and (code == ignoreErrors or code in ignoreErrors)
            except:
                ignore = False

            if not ignore:
                if response.code in [301, 307, 308, 503]:
                    cf_result = response.read(5242880)
                    try:
                        encoding = response.info().get('Content-Encoding')  # .getheader() is Py2-only
                    except:
                        encoding = None

                    if encoding == 'gzip':
                        cf_result = gzip.GzipFile(
                            fileobj=StringIO(cf_result)).read()
                    if isinstance(cf_result, bytes):  # decode before the substring check below
                        cf_result = cf_result.decode('utf-8', errors='ignore')

                    if flare and 'cloudflare' in str(response.info()).lower():
                        log_utils.log(
                            'client module calling cfscrape: url=%s' % url,
                            level=log_utils.LOGNOTICE)
                        try:
                            from fenomscrapers.modules import cfscrape
                            if isinstance(post, dict): data = post
                            else:
                                try:
                                    data = parse_qs(post)
                                except:
                                    data = None

                            scraper = cfscrape.CloudScraper()
                            response = scraper.request(
                                method='GET' if post is None else 'POST',
                                url=url,
                                headers=headers,
                                data=data,
                                timeout=int(timeout))
                            result = response.content
                            flare = 'cloudflare'  # Used below
                            try:
                                cookies = response.request._cookies
                            except:
                                log_utils.error()
                        except:
                            log_utils.error()

                    elif 'cf-browser-verification' in cf_result:
                        netloc = '%s://%s' % (urlparse(url).scheme,
                                              urlparse(url).netloc)
                        ua = headers['User-Agent']
                        cf = cache.get(cfcookie().get, 168, netloc, ua,
                                       timeout)
                        headers['Cookie'] = cf
                        request = urllib2.Request(url, data=post)
                        _add_request_header(request, headers)
                        response = urllib2.urlopen(request,
                                                   timeout=int(timeout))
                    else:
                        log_utils.log('Request-Error (%s): %s' %
                                      (str(response.code), url),
                                      level=log_utils.LOGERROR)
                        if error is False: return None
                else:
                    log_utils.log('Request-Error (%s): %s' %
                                  (str(response.code), url),
                                  level=log_utils.LOGERROR)
                    if error is False: return None

        if output == 'cookie':
            try:
                result = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass
            if close is True: response.close()
            return result

        elif output == 'geturl':
            result = response.geturl()
            if close is True: response.close()
            return result

        elif output == 'headers':
            result = response.headers
            if close is True: response.close()
            return result

        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024): return
            result = response.read(16 * 1024)
            if close is True: response.close()
            return result

        if flare != 'cloudflare':
            if limit == '0': result = response.read(224 * 1024)
            elif limit is not None: result = response.read(int(limit) * 1024)
            else: result = response.read(5242880)

        try:
            encoding = response.info().get('Content-Encoding')  # .getheader() is Py2-only
        except:
            encoding = None

        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO(result)).read()

        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)
            headers['Cookie'] = su
            request = urllib2.Request(url, data=post)
            _add_request_header(request, headers)
            response = urllib2.urlopen(request, timeout=int(timeout))
            if limit == '0': result = response.read(224 * 1024)
            elif limit is not None: result = response.read(int(limit) * 1024)
            else: result = response.read(5242880)
            try:
                encoding = response.info().get('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO(result)).read()

        if 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse(url).scheme, urlparse(url).netloc)
            ua = headers['User-Agent']
            headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua,
                                          timeout)
            result = _basic_request(url,
                                    headers=headers,
                                    post=post,
                                    timeout=timeout,
                                    limit=limit)

        if output == 'extended':
            try:
                response_headers = dict([(item[0].title(), item[1])
                                         for item in response.info().items()])
            except:
                response_headers = response.headers
            try:
                response_code = str(response.code)
            except:
                response_code = str(response.status_code)  # cfscrape returns a Requests response object
            try:
                cookie = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                cookie = cf
            except:
                pass
            if close is True: response.close()
            return (result, response_code, response_headers, headers, cookie)
        else:
            if close is True: response.close()
            return result

    except Exception as e:
        log_utils.log('Request-Error: (%s) => %s' % (str(e), url),
                      level=log_utils.LOGERROR)
        return None
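
# Both examples call _add_request_header(), which is defined elsewhere in the
# module and not shown here. A hedged sketch of what such a helper might look
# like (an assumption, not the project's actual implementation): it applies
# the headers dict to the Request, pinning Host and Referer so redirect
# handling does not drop them.
def _add_request_header(req, headers):
    try:
        if py_tools.isPY3:
            scheme = req.type
            host = req.host
        else:  # Python 2 Request API
            scheme = req.get_type()
            host = req.get_host()
        referer = headers.get('Referer') or '%s://%s/' % (scheme, host)
        # unredirected headers are not copied onto follow-up redirect requests
        req.add_unredirected_header('Host', host)
        req.add_unredirected_header('Referer', referer)
        for key in headers:
            req.add_header(key, headers[key])
    except:
        pass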