def downloadpage(url, **opt):
    # logger.info()
    """
    Open a url and return the data obtained.

    NOTE(review): a second ``downloadpage`` definition appears later in this
    file and shadows this one at import time — confirm which one is intended
    to be live.

    @param url: url to open.
    @type url: str
    @param post: If it contains any value, it is sent by POST.
    @type post: str
    @param headers: Headers for the request; if empty the default headers are used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Indicates if redirects are to be followed.
    @type follow_redirects: bool
    @param cookies: Indicates whether cookies are to be used.
    @type cookies: bool
    @param replace_headers: If True, headers passed via "headers" completely
        replace the default headers; if False they only update them.
    @type replace_headers: bool
    @param add_referer: Indicates whether to add the "Referer" header using the
        domain of the url as value.
    @type add_referer: bool
    @param only_headers: If True, only headers are downloaded (HEAD request).
    @type only_headers: bool
    @param random_headers: If True, use a randomly selected User-Agent.
    @type random_headers: bool
    @param ignore_response_code: If True, do not abort on HTTP error codes
        (e.g. a 404 that still carries usable data).
    @type ignore_response_code: bool
    @param use_requests: Use requests.session().
    @type use_requests: bool
    @return: Result of the request.
    @rtype: HTTPResponse

    HTTPResponse.sucess:  bool   True: request OK | False: error making the request
                                 (NOTE(review): "sucess" is misspelled but it IS the
                                 real attribute name callers see — do not rename
                                 without auditing every caller)
    HTTPResponse.code:    int    Server response code, or error text on failure
    HTTPResponse.error:   str    Description of the error, if any
    HTTPResponse.headers: dict   Server response headers
    HTTPResponse.data:    str    Response body obtained from the server
    HTTPResponse.json:    dict   Response parsed as json (empty dict on failure)
    HTTPResponse.time:    float  Time taken to make the request
    """
    url = scrapertools.unescape(url)
    domain = urlparse.urlparse(url).netloc
    global CF_LIST
    CF = False
    # Domains known to need Cloudflare bypass get a cloudscraper session;
    # everything else uses a plain requests session.
    if domain in FORCE_CLOUDSCRAPER_LIST:
        from lib import cloudscraper
        session = cloudscraper.create_scraper()
        CF = True
    else:
        from lib import requests
        session = requests.session()
        # Known-CF domains (or explicit opt['CF']) are fetched through the
        # Wayback Machine "save" endpoint as a CF workaround.
        if domain in CF_LIST or opt.get('CF', False):
            url = 'https://web.archive.org/save/' + url
            CF = True
    if config.get_setting('resolver_dns') and not opt.get(
            'use_requests', False):
        from specials import resolverdns
        # Custom adapter: DNS resolution / cipher-suite tweaks for this domain.
        session.mount('https://', resolverdns.CipherSuiteAdapter(domain, CF))
    req_headers = default_headers.copy()

    # Headers passed as parameters: merge into the defaults, or replace them
    # entirely when replace_headers is set.
    if opt.get('headers', None) is not None:
        if not opt.get('replace_headers', False):
            req_headers.update(dict(opt['headers']))
        else:
            req_headers = dict(opt['headers'])

    if opt.get('random_headers', False) or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        req_headers['User-Agent'] = random_useragent()
    # Percent-encode anything unsafe while preserving URL structure chars.
    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    # Saved copies of the original request, used by fill_fields_pre/post and
    # by the CF retry recursion below.
    opt['url_save'] = url
    opt['post_save'] = opt.get('post', None)

    response = {}
    info_dict = []
    payload = dict()
    files = {}
    file_name = ''

    session.verify = opt.get('verify', True)

    if opt.get('cookies', True):
        # cj is the module-level shared cookie jar.
        session.cookies = cj
    session.headers.update(req_headers)

    proxy_data = {'dict': {}}

    inicio = time.time()  # request start time, used for the elapsed-time field

    if opt.get(
            'timeout',
            None) is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
        opt['timeout'] = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
    # NOTE(review): if no timeout was passed AND the module default is None,
    # opt['timeout'] is never set and this line raises KeyError — confirm the
    # module default is never None in practice.
    if opt['timeout'] == 0:
        opt['timeout'] = None

    if len(url) > 0:
        try:
            if opt.get('post', None) is not None or opt.get('file', None) is not None:
                if opt.get('post', None) is not None:
                    # Convert string post in dict; a JSON string is passed
                    # through verbatim as the request body.
                    try:
                        json.loads(opt['post'])
                        payload = opt['post']
                    except:
                        if not isinstance(opt['post'], dict):
                            # querystring-style post: parse into key/value dict
                            post = urlparse.parse_qs(opt['post'], keep_blank_values=1)
                            payload = dict()

                            for key, value in post.items():
                                try:
                                    payload[key] = value[0]
                                except:
                                    payload[key] = ''
                        else:
                            payload = opt['post']

                # Verify 'file' and 'file_name' options to upload a buffer or file
                if opt.get('file', None) is not None:
                    if os.path.isfile(opt['file']):
                        if opt.get('file_name', None) is None:
                            path_file, opt['file_name'] = os.path.split(
                                opt['file'])
                        # NOTE(review): this file handle is handed to requests
                        # and never explicitly closed here.
                        files = {
                            'file': (opt['file_name'], open(opt['file'], 'rb'))
                        }
                        file_name = opt['file']
                    else:
                        # 'file' is an in-memory buffer, not a path on disk.
                        files = {
                            'file': (opt.get('file_name', 'Default'), opt['file'])
                        }
                        file_name = opt.get('file_name', 'Default') + ', Buffer de memoria'

                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                if opt.get('only_headers', False):
                    # Makes the request with HEAD method
                    req = session.head(url, allow_redirects=opt.get(
                        'follow_redirects', True), timeout=opt['timeout'])
                else:
                    # Makes the request with POST method
                    req = session.post(url, data=payload, allow_redirects=opt.get(
                        'follow_redirects', True), files=files, timeout=opt['timeout'])

            elif opt.get('only_headers', False):
                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                # Makes the request with HEAD method
                req = session.head(url, allow_redirects=opt.get(
                    'follow_redirects', True), timeout=opt['timeout'])
            else:
                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                # Makes the request with GET method
                req = session.get(url, allow_redirects=opt.get(
                    'follow_redirects', True), timeout=opt['timeout'])
        except Exception as e:
            # Request failed: build a synthetic Response and either return an
            # error HTTPResponse or continue with the exception text as code.
            from lib import requests
            req = requests.Response()
            if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
                response['data'] = ''
                # NOTE(review): 'sucess' (sic) is the key callers read.
                response['sucess'] = False
                info_dict.append(('Success', 'False'))
                response['code'] = str(e)
                info_dict.append(('Response code', str(e)))
                info_dict.append(('Finalizado en', time.time() - inicio))
                if not opt.get('alfa_s', False):
                    show_infobox(info_dict)
                # Responses are returned as a dynamically-built class whose
                # attributes are the dict keys.
                return type('HTTPResponse', (), response)
            else:
                req.status_code = str(e)
    else:
        # Empty url: short-circuit with an empty, unsuccessful response.
        response['data'] = ''
        response['sucess'] = False
        response['code'] = ''
        return type('HTTPResponse', (), response)

    response_code = req.status_code

    # Cloudflare challenge detected for a domain not yet in CF_LIST:
    # remember the domain persistently and retry once through the CF path.
    if req.headers.get('Server', '').startswith('cloudflare') and response_code in [
            429, 503, 403] and not opt.get('CF', False):
        if domain not in CF_LIST:
            opt["CF"] = True
            with open(CF_LIST_PATH, "a") as CF_File:
                CF_File.write("%s\n" % domain)
            logger.debug("CF retry... for domain: %s" % domain)
            return downloadpage(url, **opt)

    response['data'] = req.content if req.content else ''
    if CF:
        import re
        # Strip the web.archive.org '/save/' prefix that the CF workaround
        # injected into absolute links in the page body.
        response['data'] = re.sub('["|\']/save/[^"]*(https?://[^"]+)', '"\\1', response['data'])
    response['url'] = req.url
    if type(response['data']) != str:
        response['data'] = response['data'].decode('UTF-8')
    if not response['data']:
        response['data'] = ''
    try:
        response['json'] = to_utf8(req.json())
    except:
        response['json'] = dict()
    response['code'] = response_code
    response['headers'] = req.headers
    response['cookies'] = req.cookies

    info_dict, response = fill_fields_post(info_dict, req, response, req_headers, inicio)

    if opt.get('cookies', True):
        # Persist the shared cookie jar to disk.
        save_cookies(alfa_s=opt.get('alfa_s', False))

    # is_channel = inspect.getmodule(inspect.currentframe().f_back)
    # is_channel = scrapertools.find_single_match(str(is_channel), "<module '(channels).*?'")
    # if is_channel and isinstance(response_code, int):
    #     if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
    #         if response_code > 399:
    #             show_infobox(info_dict)
    #             raise WebErrorException(urlparse.urlparse(url)[1])

    # Skip the debug infobox for TMDB calls and silent ('alfa_s') requests.
    if not 'api.themoviedb' in url and not opt.get('alfa_s', False):
        show_infobox(info_dict)

    return type('HTTPResponse', (), response)
def downloadpage(url, **opt):
    # logger.info()
    """
    Open a url and return the data obtained.

    This is the second ``downloadpage`` definition in the file; being defined
    later, it shadows the earlier one at import time.

    @param url: url to open.
    @type url: str
    @param post: If it contains any value, it is sent by POST.
    @type post: str
    @param headers: Headers for the request; if empty the default headers are used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Indicates if redirects are to be followed.
    @type follow_redirects: bool
    @param cookies: Indicates whether cookies are to be used.
    @type cookies: bool
    @param replace_headers: If True, headers passed via "headers" completely
        replace the default headers; if False they only update them.
    @type replace_headers: bool
    @param add_referer: Indicates whether to add the "Referer" header using the
        domain of the url as value.
    @type add_referer: bool
    @param only_headers: If True, only headers are downloaded (HEAD request).
    @type only_headers: bool
    @param random_headers: If True, use a randomly selected User-Agent.
    @type random_headers: bool
    @param ignore_response_code: If True, do not abort on HTTP error codes
        (e.g. a 404 that still carries usable data).
    @type ignore_response_code: bool
    @param use_requests: Use requests.session().
    @type use_requests: bool
    @return: Result of the request.
    @rtype: HTTPResponse

    HTTPResponse.success: bool   True: request OK | False: error making the request
    HTTPResponse.code:    int    Server response code, or error traceback on failure
    HTTPResponse.error:   str    Description of the error, if any
    HTTPResponse.headers: dict   Server response headers
    HTTPResponse.data:    str    Response body obtained from the server
    HTTPResponse.json:    dict   Response parsed as json (empty dict on failure)
    HTTPResponse.time:    float  Time taken to make the request
    """
    url = scrapertools.unescape(url)
    parse = urlparse.urlparse(url)
    domain = parse.netloc

    from lib import requests
    session = requests.session()

    if config.get_setting('resolver_dns') and not opt.get(
            'use_requests', False):
        from core import resolverdns
        # Custom adapter: DNS resolution / cipher-suite tweaks for this domain.
        session.mount('https://', resolverdns.CipherSuiteAdapter(domain))
    req_headers = default_headers.copy()

    # Headers passed as parameters: merge into the defaults, or replace them
    # entirely when replace_headers is set.
    if opt.get('headers', None) is not None:
        if not opt.get('replace_headers', False):
            req_headers.update(dict(opt['headers']))
        else:
            req_headers = dict(opt['headers'])

    # directIP maps domains to fixed IPs: keep the original Host header but
    # rewrite the netloc of the url to hit the IP directly.
    if domain in directIP.keys() and not opt.get('disable_directIP', False):
        req_headers['Host'] = domain
        url = urlparse.urlunparse(parse._replace(netloc=directIP.get(domain)))

    if opt.get('random_headers', False) or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        req_headers['User-Agent'] = random_useragent()
    # Percent-encode anything unsafe while preserving URL structure chars.
    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    # Saved copies of the original request, used by fill_fields_pre/post.
    opt['url_save'] = url
    opt['post_save'] = opt.get('post', None)

    response = {}
    info_dict = []
    payload = dict()
    files = {}
    file_name = ''

    session.verify = opt.get('verify', True)

    if opt.get('cookies', True):
        # cj is the module-level shared cookie jar.
        session.cookies = cj
    session.headers.update(req_headers)

    proxy_data = {'dict': {}}

    inicio = time.time()  # request start time, used for the elapsed-time field

    if opt.get(
            'timeout',
            None) is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
        opt['timeout'] = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
    # NOTE(review): if no timeout was passed AND the module default is None,
    # opt['timeout'] is never set and this line raises KeyError — confirm the
    # module default is never None in practice.
    if opt['timeout'] == 0:
        opt['timeout'] = None

    if len(url) > 0:
        try:
            if opt.get('post', None) is not None or opt.get('file', None) is not None:
                if opt.get('post', None) is not None:
                    # Convert string post in dict; a JSON string is passed
                    # through verbatim as the request body.
                    try:
                        json.loads(opt['post'])
                        payload = opt['post']
                    except:
                        if not isinstance(opt['post'], dict):
                            # querystring-style post: parse into key/value dict
                            post = urlparse.parse_qs(opt['post'], keep_blank_values=1)
                            payload = dict()

                            for key, value in post.items():
                                try:
                                    payload[key] = value[0]
                                except:
                                    payload[key] = ''
                        else:
                            payload = opt['post']

                # Verify 'file' and 'file_name' options to upload a buffer or file
                if opt.get('file', None) is not None:
                    if os.path.isfile(opt['file']):
                        if opt.get('file_name', None) is None:
                            path_file, opt['file_name'] = os.path.split(
                                opt['file'])
                        # NOTE(review): this file handle is handed to requests
                        # and never explicitly closed here.
                        files = {
                            'file': (opt['file_name'], open(opt['file'], 'rb'))
                        }
                        file_name = opt['file']
                    else:
                        # 'file' is an in-memory buffer, not a path on disk.
                        files = {
                            'file': (opt.get('file_name', 'Default'), opt['file'])
                        }
                        file_name = opt.get('file_name', 'Default') + ', Buffer de memoria'

                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                if opt.get('only_headers', False):
                    # Makes the request with HEAD method
                    req = session.head(url, allow_redirects=opt.get(
                        'follow_redirects', True), timeout=opt['timeout'])
                else:
                    # Makes the request with POST method
                    req = session.post(url, data=payload, allow_redirects=opt.get(
                        'follow_redirects', True), files=files, timeout=opt['timeout'])

            elif opt.get('only_headers', False):
                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                # Makes the request with HEAD method
                req = session.head(url, allow_redirects=opt.get(
                    'follow_redirects', True), timeout=opt['timeout'])
            else:
                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                # Makes the request with GET method
                req = session.get(url, allow_redirects=opt.get(
                    'follow_redirects', True), timeout=opt['timeout'])
        except Exception as e:
            # Request failed: build a synthetic Response and either return an
            # error HTTPResponse or continue with the exception text as code.
            from lib import requests
            req = requests.Response()
            if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
                response['data'] = ''
                response['success'] = False
                info_dict.append(('Success', 'False'))
                import traceback
                # Full traceback goes into 'code' for debugging purposes.
                response['code'] = traceback.format_exc()
                info_dict.append(('Response code', str(e)))
                info_dict.append(('Finished in', time.time() - inicio))
                if not opt.get('alfa_s', False):
                    show_infobox(info_dict)
                # Responses are returned as a dynamically-built class whose
                # attributes are the dict keys.
                return type('HTTPResponse', (), response)
            else:
                req.status_code = str(e)
    else:
        # Empty url: short-circuit with an empty, unsuccessful response.
        response['data'] = ''
        response['success'] = False
        response['code'] = ''
        return type('HTTPResponse', (), response)

    response_code = req.status_code
    response['url'] = req.url
    response['data'] = req.content if req.content else ''
    if type(response['data']) != str:
        try:
            response['data'] = response['data'].decode('utf-8')
        except:
            # Fallback for non-UTF-8 pages.
            response['data'] = response['data'].decode('ISO-8859-1')

    # Cloudflare challenge detected ('Ray ID' marker in the body) on a GET:
    # retry the request once through the translate-proxy workaround.
    if req.headers.get('Server', '').startswith('cloudflare') and response_code in [429, 503, 403]\
            and not opt.get('CF', False) and 'Ray ID' in response['data'] and not opt.get('post', None):
        logger.debug("CF retry... for domain: %s" % domain)
        from lib import proxytranslate
        gResp = proxytranslate.process_request_proxy(url)
        if gResp:
            req = gResp['result']
            response_code = req.status_code
            response['url'] = gResp['url']
            response['data'] = gResp['data']

    if not response['data']:
        response['data'] = ''
    try:
        response['json'] = to_utf8(req.json())
    except:
        response['json'] = dict()
    response['code'] = response_code
    response['headers'] = req.headers
    response['cookies'] = req.cookies

    info_dict, response = fill_fields_post(info_dict, req, response, req_headers, inicio)

    if opt.get('cookies', True):
        # Persist the shared cookie jar to disk.
        save_cookies(alfa_s=opt.get('alfa_s', False))

    # Skip the debug infobox for TMDB calls and silent ('alfa_s') requests.
    if not 'api.themoviedb' in url and not opt.get('alfa_s', False):
        show_infobox(info_dict)
        if not config.get_setting("debug"):
            # NOTE(review): two concerns here — (1) logging only when debug is
            # OFF looks inverted, and (2) stdlib logging would treat the second
            # positional arg as a %-format argument; presumably this project's
            # custom logger accepts multiple args — confirm against platformcode.
            logger.info('Page URL:', url)

    return type('HTTPResponse', (), response)