def downloadpage(url, **opt):
    logger.info()
    from . import scrapertools
    """
    Opens a URL and returns the data obtained.

    @param url: URL to open.
    @type url: str
    @param post: If it contains a value, it is sent via POST.
    @type post: str
    @param headers: Headers for the request; if empty, the default headers are used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Whether redirects should be followed.
    @type follow_redirects: bool
    @param cookies: Whether cookies should be used.
    @type cookies: bool
    @param replace_headers: If True, the headers passed in the "headers" parameter completely replace the
           default headers. If False, they only update the default headers.
    @type replace_headers: bool
    @param add_referer: Whether to add the "Referer" header using the domain of the URL as its value.
    @type add_referer: bool
    @param only_headers: If True, only the headers are downloaded, omitting the content of the URL.
    @type only_headers: bool
    @param random_headers: If True, a random set of headers is selected.
    @type random_headers: bool
    @param ignore_response_code: If True, skips the WebErrorException mechanism for errors such as 404 on
           veseriesonline, where the data is still usable.
    @type ignore_response_code: bool
    @return: Result of the request
    @rtype: HTTPResponse

    Parameter              Type    Description
    ----------------------------------------------------------------------------------------------------------------
    HTTPResponse.sucess:   bool    True: request completed successfully | False: error while making the request
    HTTPResponse.code:     int     Server response code, or error code if an error occurred
    HTTPResponse.error:    str     Description of the error, if one occurred
    HTTPResponse.headers:  dict    Dictionary with the server response headers
    HTTPResponse.data:     str     Response obtained from the server
    HTTPResponse.json:     dict    Response obtained from the server in json format
    HTTPResponse.time:     float   Time taken to make the request
    """
    load_cookies()
    import requests

    # Default headers, used when nothing is specified
    req_headers = default_headers.copy()

    # Headers passed as parameters
    if opt.get('headers', None) is not None:
        if not opt.get('replace_headers', False):
            req_headers.update(dict(opt['headers']))
        else:
            req_headers = dict(opt['headers'])

    if opt.get('random_headers', False) or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        req_headers['User-Agent'] = random_useragent()

    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    opt['proxy_retries_counter'] = 0
    opt['url_save'] = url
    opt['post_save'] = opt.get('post', None)

    while opt['proxy_retries_counter'] <= opt.get('proxy_retries', 1):
        response = {}
        info_dict = []
        payload = dict()
        files = {}
        file_name = ''
        opt['proxy_retries_counter'] += 1

        domain = urlparse.urlparse(url)[1]
        global CS_stat
        if domain in CF_LIST or opt.get('CF', False):  # It is in the CF list, or CF comes in the call
            from lib import cloudscraper
            session = cloudscraper.create_scraper()  # The domain needs CloudScraper
            session.verify = True
            CS_stat = True
        else:
            session = requests.session()
            session.verify = False
            CS_stat = False

        if opt.get('cookies', True):
            session.cookies = cj
        session.headers.update(req_headers)

        # Prepare the url in case a proxy is needed, or if "proxy_addr_forced" is sent from the channel
        url, proxy_data, opt = check_proxy(url, **opt)
        if opt.get('proxy_addr_forced', {}):
            session.proxies = opt['proxy_addr_forced']
        elif proxy_data.get('dict', {}):
            session.proxies = proxy_data['dict']

        inicio = time.time()

        if opt.get('timeout', None) is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
            opt['timeout'] = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
        if opt['timeout'] == 0:
            opt['timeout'] = None

        if len(url) > 0:
            try:
                if opt.get('post', None) is not None or opt.get('file', None) is not None:
                    if opt.get('post', None) is not None:

                        ### Convert string post in dict
                        try:
                            json.loads(opt['post'])
                            payload = opt['post']
                        except:
                            if not isinstance(opt['post'], dict):
                                post = urlparse.parse_qs(opt['post'], keep_blank_values=1)
                                payload = dict()
                                for key, value in list(post.items()):
                                    try:
                                        payload[key] = value[0]
                                    except:
                                        payload[key] = ''
                            else:
                                payload = opt['post']

                    ### Verifies 'file' and 'file_name' options to upload a buffer or a file
                    if opt.get('file', None) is not None:
                        if len(opt['file']) < 256 and os.path.isfile(opt['file']):
                            if opt.get('file_name', None) is None:
                                path_file, opt['file_name'] = os.path.split(opt['file'])
                            files = {'file': (opt['file_name'], open(opt['file'], 'rb'))}
                            file_name = opt['file']
                        else:
                            files = {'file': (opt.get('file_name', 'Default'), opt['file'])}
                            file_name = opt.get('file_name', 'Default') + ', Buffer de memoria'

                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    if opt.get('only_headers', False):
                        ### Makes the request with HEAD method
                        req = session.head(url, allow_redirects=opt.get('follow_redirects', True),
                                           timeout=opt['timeout'])
                    else:
                        ### Makes the request with POST method
                        req = session.post(url, data=payload, allow_redirects=opt.get('follow_redirects', True),
                                           files=files, timeout=opt['timeout'])

                elif opt.get('only_headers', False):
                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    ### Makes the request with HEAD method
                    req = session.head(url, allow_redirects=opt.get('follow_redirects', True),
                                       timeout=opt['timeout'])
                else:
                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    ### Makes the request with GET method
                    req = session.get(url, allow_redirects=opt.get('follow_redirects', True),
                                      timeout=opt['timeout'])

            except Exception as e:
                if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
                    req = requests.Response()
                    response['data'] = ''
                    response['sucess'] = False
                    info_dict.append(('Success', 'False'))
                    response['code'] = str(e)
                    info_dict.append(('Response code', str(e)))
                    info_dict.append(('Finalizado en', time.time() - inicio))
                    if not opt.get('alfa_s', False):
                        show_infobox(info_dict)
                    import traceback
                    logger.error(traceback.format_exc(1))
                    return type('HTTPResponse', (), response)
                else:
                    req = requests.Response()
                    req.status_code = str(e)

        else:
            response['data'] = ''
            response['sucess'] = False
            response['code'] = ''
            return type('HTTPResponse', (), response)

        response_code = req.status_code

        if req.headers.get('Server', '').startswith('cloudflare') and response_code in [429, 503, 403] \
                and not opt.get('CF', False):
            domain = urlparse.urlparse(url)[1]
            if domain not in CF_LIST:
                opt["CF"] = True
                with open(CF_LIST_PATH, "a") as CF_File:
                    CF_File.write("%s\n" % domain)
                logger.debug("CF retry... for domain: %s" % domain)
                return downloadpage(url, **opt)

        response['data'] = req.content

        try:
            encoding = req.encoding
            if not encoding:
                encoding = 'utf8'
            if PY3 and isinstance(response['data'], bytes) and 'Content-Type' in req.headers \
                    and ('text/' in req.headers['Content-Type'] or 'json' in req.headers['Content-Type']
                         or 'xml' in req.headers['Content-Type']):
                response['data'] = response['data'].decode(encoding)
        except:
            import traceback
            logger.error(traceback.format_exc(1))

        try:
            if PY3 and isinstance(response['data'], bytes) and 'Content-Type' in req.headers \
                    and (not 'application' in req.headers['Content-Type']
                         or 'javascript' in req.headers['Content-Type']):
                response['data'] = "".join(chr(x) for x in bytes(response['data']))
        except:
            import traceback
            logger.error(traceback.format_exc(1))

        try:
            if 'Content-Type' in req.headers and ('text/' in req.headers['Content-Type']
                    or 'json' in req.headers['Content-Type'] or 'xml' in req.headers['Content-Type']):
                # Replace HTML entities for accented characters with the characters themselves
                response['data'] = response['data'].replace('&Aacute;', 'Á').replace('&Eacute;', 'É')\
                        .replace('&Iacute;', 'Í').replace('&Oacute;', 'Ó').replace('&Uacute;', 'Ú')\
                        .replace('&Uuml;', 'Ü').replace('&iexcl;', '¡').replace('&iquest;', '¿')\
                        .replace('&Ntilde;', 'Ñ').replace('&ntilde;', 'n').replace('&uuml;', 'ü')\
                        .replace('&aacute;', 'á').replace('&eacute;', 'é').replace('&iacute;', 'í')\
                        .replace('&oacute;', 'ó').replace('&uacute;', 'ú').replace('&ordf;', 'ª')\
                        .replace('&ordm;', 'º')
        except:
            import traceback
            logger.error(traceback.format_exc(1))

        response['url'] = req.url
        if not response['data']:
            response['data'] = ''

        try:
            response['json'] = to_utf8(req.json())
        except:
            response['json'] = dict()

        response['code'] = response_code
        response['headers'] = req.headers
        response['cookies'] = req.cookies

        info_dict, response = fill_fields_post(info_dict, req, response, req_headers, inicio)

        if opt.get('cookies', True):
            save_cookies(alfa_s=opt.get('alfa_s', False))

        is_channel = inspect.getmodule(inspect.currentframe().f_back)
        is_channel = scrapertools.find_single_match(str(is_channel), "<module '(channels).*?'")
        if is_channel and isinstance(response_code, int):
            if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
                if response_code > 399:
                    show_infobox(info_dict)
                    raise WebErrorException(urlparse.urlparse(url)[1])

        if not 'api.themoviedb' in url and not opt.get('alfa_s', False):
            show_infobox(info_dict)

        # If there is a proxy error, refresh the list and retry as many times as proxy_retries indicates
        response['data'], response['sucess'], url, opt = proxy_post_processing(url, proxy_data, response, opt)
        if opt.get('out_break', False):
            break

    return type('HTTPResponse', (), response)
def downloadpage(url, **opt):
    logger.info()
    """
    Opens a URL and returns the data obtained.

    @param url: URL to open.
    @type url: str
    @param post: If it contains a value, it is sent via POST.
    @type post: str
    @param headers: Headers for the request; if empty, the default headers are used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Whether redirects should be followed.
    @type follow_redirects: bool
    @param cookies: Whether cookies should be used.
    @type cookies: bool
    @param replace_headers: If True, the headers passed in the "headers" parameter completely replace the
           default headers. If False, they only update the default headers.
    @type replace_headers: bool
    @param add_referer: Whether to add the "Referer" header using the domain of the URL as its value.
    @type add_referer: bool
    @param only_headers: If True, only the headers are downloaded, omitting the content of the URL.
    @type only_headers: bool
    @param random_headers: If True, a random set of headers is selected.
    @type random_headers: bool
    @param ignore_response_code: If True, skips the WebErrorException mechanism for errors such as 404 on
           veseriesonline, where the data is still usable.
    @type ignore_response_code: bool
    @return: Result of the request
    @rtype: HTTPResponse

    Parameter              Type    Description
    ----------------------------------------------------------------------------------------------------------------
    HTTPResponse.sucess:   bool    True: request completed successfully | False: error while making the request
    HTTPResponse.code:     int     Server response code, or error code if an error occurred
    HTTPResponse.error:    str     Description of the error, if one occurred
    HTTPResponse.headers:  dict    Dictionary with the server response headers
    HTTPResponse.data:     str     Response obtained from the server
    HTTPResponse.json:     dict    Response obtained from the server in json format
    HTTPResponse.time:     float   Time taken to make the request
    """
    load_cookies()
    import requests
    from lib import cloudscraper

    # Default headers, used when nothing is specified
    req_headers = default_headers.copy()

    # Headers passed as parameters
    if opt.get('headers', None) is not None:
        if not opt.get('replace_headers', False):
            req_headers.update(dict(opt['headers']))
        else:
            req_headers = dict(opt['headers'])

    if opt.get('random_headers', False) or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        req_headers['User-Agent'] = random_useragent()

    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    opt['proxy_retries_counter'] = 0
    opt['url_save'] = url
    opt['post_save'] = opt.get('post', None)

    while opt['proxy_retries_counter'] <= opt.get('proxy_retries', 1):
        response = {}
        info_dict = []
        payload = dict()
        files = {}
        file_name = ''
        opt['proxy_retries_counter'] += 1

        session = cloudscraper.create_scraper()
        session.verify = False

        if opt.get('cookies', True):
            session.cookies = cj
        session.headers.update(req_headers)

        # Prepare the url in case a proxy is needed, or if "proxies" is sent from the channel
        url, proxy_data, opt = check_proxy(url, **opt)
        if opt.get('proxies', None) is not None:
            session.proxies = opt['proxies']
        elif proxy_data.get('dict', {}):
            session.proxies = proxy_data['dict']

        inicio = time.time()

        if opt.get('timeout', None) is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
            opt['timeout'] = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
        if opt['timeout'] == 0:
            opt['timeout'] = None

        if len(url) > 0:
            try:
                if opt.get('post', None) is not None or opt.get('file', None) is not None:
                    if opt.get('post', None) is not None:

                        ### Convert string post in dict
                        try:
                            json.loads(opt['post'])
                            payload = opt['post']
                        except:
                            if not isinstance(opt['post'], dict):
                                post = urlparse.parse_qs(opt['post'], keep_blank_values=1)
                                payload = dict()
                                for key, value in post.items():
                                    try:
                                        payload[key] = value[0]
                                    except:
                                        payload[key] = ''
                            else:
                                payload = opt['post']

                    ### Verifies 'file' and 'file_name' options to upload a buffer or a file
                    if opt.get('file', None) is not None:
                        if os.path.isfile(opt['file']):
                            if opt.get('file_name', None) is None:
                                path_file, opt['file_name'] = os.path.split(opt['file'])
                            files = {'file': (opt['file_name'], open(opt['file'], 'rb'))}
                            file_name = opt['file']
                        else:
                            files = {'file': (opt.get('file_name', 'Default'), opt['file'])}
                            file_name = opt.get('file_name', 'Default') + ', Buffer de memoria'

                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    if opt.get('only_headers', False):
                        ### Makes the request with HEAD method
                        req = session.head(url, allow_redirects=opt.get('follow_redirects', True),
                                           timeout=opt['timeout'])
                    else:
                        ### Makes the request with POST method
                        req = session.post(url, data=payload, allow_redirects=opt.get('follow_redirects', True),
                                           files=files, timeout=opt['timeout'])

                elif opt.get('only_headers', False):
                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    ### Makes the request with HEAD method
                    req = session.head(url, allow_redirects=opt.get('follow_redirects', True),
                                       timeout=opt['timeout'])
                else:
                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    ### Makes the request with GET method
                    req = session.get(url, allow_redirects=opt.get('follow_redirects', True),
                                      timeout=opt['timeout'])

            except Exception as e:
                if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
                    req = requests.Response()
                    response['data'] = ''
                    response['sucess'] = False
                    info_dict.append(('Success', 'False'))
                    response['code'] = str(e)
                    info_dict.append(('Response code', str(e)))
                    info_dict.append(('Finalizado en', time.time() - inicio))
                    if not opt.get('alfa_s', False):
                        show_infobox(info_dict)
                    return type('HTTPResponse', (), response)
                else:
                    req = requests.Response()
                    req.status_code = str(e)

        else:
            response['data'] = ''
            response['sucess'] = False
            response['code'] = ''
            return type('HTTPResponse', (), response)

        response_code = req.status_code
        response['data'] = req.content
        response['url'] = req.url
        if not response['data']:
            response['data'] = ''

        try:
            response['json'] = to_utf8(req.json())
        except:
            response['json'] = dict()

        response['code'] = response_code
        response['headers'] = req.headers
        response['cookies'] = req.cookies

        info_dict, response = fill_fields_post(info_dict, req, response, req_headers, inicio)

        if opt.get('cookies', True):
            save_cookies(alfa_s=opt.get('alfa_s', False))

        is_channel = inspect.getmodule(inspect.currentframe().f_back)
        is_channel = scrapertools.find_single_match(str(is_channel), "<module '(channels).*?'")
        if is_channel and isinstance(response_code, int):
            if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
                if response_code > 399:
                    show_infobox(info_dict)
                    raise WebErrorException(urlparse.urlparse(url)[1])

        if not 'api.themoviedb' in url and not opt.get('alfa_s', False):
            show_infobox(info_dict)

        # If there is a proxy error, refresh the list and retry as many times as proxy_retries indicates
        response['data'], response['sucess'], url, opt = proxy_post_processing(url, proxy_data, response, opt)
        if opt.get('out_break', False):
            break

    return type('HTTPResponse', (), response)
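

# Illustrative sketch (not part of the module): when "post" is a non-JSON string, downloadpage()
# converts it into a dict with urlparse.parse_qs before handing it to requests. The helper below is
# hypothetical and mirrors that conversion step for a sample query string.
def _example_post_string_to_payload(post="user=alfa&password=&page=2"):
    try:
        import urlparse          # Python 2
    except ImportError:
        import urllib.parse as urlparse  # Python 3
    parsed = urlparse.parse_qs(post, keep_blank_values=1)
    # parse_qs returns {'user': ['alfa'], 'password': [''], 'page': ['2']}; keep the first value of each key
    payload = dict()
    for key, value in parsed.items():
        payload[key] = value[0] if value else ''
    return payload  # {'user': 'alfa', 'password': '', 'page': '2'}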
logger.info("Response code: %s" % (response["code"])) logger.info("Response error: %s" % (response["error"])) logger.info("Response data length: %s" % (len(response["data"]))) logger.info("Response headers:") server_cloudflare = "" for header in response["headers"]: logger.info("- %s: %s" % (header, response["headers"][header])) if "cloudflare" in response["headers"][header]: server_cloudflare = "cloudflare" is_channel = inspect.getmodule(inspect.currentframe().f_back) # error 4xx o 5xx se lanza excepcion # response["code"] = 400 if type(response["code"]) == int and "\\servers\\" not in str(is_channel): if response["code"] > 399 and (server_cloudflare == "cloudflare" and response["code"] != 503): raise WebErrorException(urlparse.urlparse(url)[1]) if cookies: save_cookies() logger.info("Encoding: %s" % (response["headers"].get('content-encoding'))) if response["headers"].get('content-encoding') == 'gzip': logger.info("Descomprimiendo...") try: response["data"] = gzip.GzipFile(fileobj=StringIO(response["data"])).read() logger.info("Descomprimido") except: logger.info("No se ha podido descomprimir") # Anti Cloudflare
def downloadpage(url, **opt):
    """
    Opens a URL and returns the data obtained.

    @param url: URL to open.
    @type url: str
    @param post: If it contains a value, it is sent via POST.
    @type post: str (json data), dict
    @param headers: Headers for the request; if empty, the default headers are used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Whether redirects should be followed.
    @type follow_redirects: bool
    @param cookies: Whether cookies should be used.
    @type cookies: bool
    @param replace_headers: If True, the headers passed in the "headers" parameter completely replace the
           default headers. If False, they only update the default headers.
    @type replace_headers: bool
    @param add_host: Whether to add the Host header first, as a regular browser would. Disabled by default;
           use it only with problematic sites (it causes problems with proxies).
    @type add_host: bool
    @param add_referer: Whether to add the "Referer" header using the domain of the URL as its value.
    @type add_referer: bool
    @param referer: If set, adds the "Referer" header using the given parameter as its value.
    @type referer: str
    @param only_headers: If True, only the headers are downloaded, omitting the content of the URL.
    @type only_headers: bool
    @param random_headers: If True, a random set of headers is selected.
    @type random_headers: bool
    @param ignore_response_code: If True, skips the WebErrorException mechanism for errors such as 404 on
           veseriesonline, where the data is still usable.
    @type ignore_response_code: bool
    @param hide_infobox: If True, the info box is not written to the log for a successful request
           (no error response_code).
    @type hide_infobox: bool
    @param soup: If True, a BeautifulSoup element is stored in the soup attribute of HTTPResponse.
    @type soup: bool
    @return: Result of the request
    @rtype: HTTPResponse

    Parameter             | Type     | Description
    ----------------------|----------|-------------------------------------------------------------------------------
    HTTPResponse.sucess:  | bool     | True: request completed successfully | False: error while making the request
    HTTPResponse.code:    | int      | Server response code, or error code if an error occurred
    HTTPResponse.error:   | str      | Description of the error, if one occurred
    HTTPResponse.headers: | dict     | Dictionary with the server response headers
    HTTPResponse.data:    | str      | Response obtained from the server
    HTTPResponse.json:    | dict     | Response obtained from the server in json format
    HTTPResponse.soup:    | bs4/None | BeautifulSoup object, if requested. None otherwise
    HTTPResponse.time:    | float    | Time taken to make the request
    """
    global CF_LIST

    if not opt.get('alfa_s', False):
        logger.info()
    from . import scrapertools

    load_cookies(opt.get('alfa_s', False))
    cf_ua = config.get_setting('cf_assistant_ua', None)

    url = url.strip()

    # Default headers, used when nothing is specified
    req_headers = OrderedDict()
    if opt.get('add_host', False):
        req_headers['Host'] = urlparse.urlparse(url).netloc
    req_headers.update(default_headers.copy())

    if opt.get('add_referer', False):
        req_headers['Referer'] = "/".join(url.split("/")[:3])

    if isinstance(opt.get('referer'), str) and '://' in opt.get('referer'):
        req_headers['Referer'] = opt.get('referer')

    # Headers passed as parameters
    if opt.get('headers', None) is not None:
        if not opt.get('replace_headers', False):
            req_headers.update(dict(opt['headers']))
        else:
            req_headers = dict(opt['headers'])

    if opt.get('random_headers', False) or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        req_headers['User-Agent'] = random_useragent()

    if not PY3:
        url = urllib.quote(url.encode('utf-8'), safe="%/:=&?~#+!$,;'@()*[]")
    else:
        url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    opt['proxy_retries_counter'] = 0
    opt['url_save'] = url
    opt['post_save'] = opt.get('post', None)

    if opt.get('forced_proxy_opt', None) and channel_proxy_list(url):
        if opt['forced_proxy_opt'] in ['ProxyCF', 'ProxyDirect']:
            if 'cliver' not in url:
                opt['forced_proxy_opt'] = 'ProxyJSON'
            else:
                opt['forced_proxy'] = opt['forced_proxy_opt']
        else:
            opt['forced_proxy'] = opt['forced_proxy_opt']

    while opt['proxy_retries_counter'] <= opt.get('proxy_retries', 1):
        response = {}
        info_dict = []
        payload = dict()
        files = {}
        file_name = ''
        opt['proxy_retries_counter'] += 1

        domain = urlparse.urlparse(url)[1]
        global CS_stat
        if (domain in CF_LIST or opt.get('CF', False)) and opt.get('CF_test', True):
            # It is in the CF list, or CF comes in the call
            from lib import cloudscraper
            session = cloudscraper.create_scraper()  # The domain needs CloudScraper
            session.verify = True
            CS_stat = True

            if cf_ua and cf_ua != 'Default' and get_cookie(url, 'cf_clearance'):
                req_headers['User-Agent'] = cf_ua
        else:
            session = requests.session()
            session.verify = False
            CS_stat = False

        if opt.get('cookies', True):
            session.cookies = cj

        if not opt.get('keep_alive', True):
            #session.keep_alive = opt['keep_alive']
            req_headers['Connection'] = "close"

        # Prepare the url in case a proxy is needed, or if "proxy_addr_forced" is sent from the channel
        url, proxy_data, opt = check_proxy(url, **opt)
        if opt.get('proxy_addr_forced', {}):
            session.proxies = opt['proxy_addr_forced']
        elif proxy_data.get('dict', {}):
            session.proxies = proxy_data['dict']
        if opt.get('headers_proxy', {}):
            req_headers.update(dict(opt['headers_proxy']))

        session.headers = req_headers.copy()

        inicio = time.time()

        if opt.get('timeout', None) is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
            opt['timeout'] = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
        if opt['timeout'] == 0:
            opt['timeout'] = None

        if len(url) > 0:
            try:
                if opt.get('post', None) is not None or opt.get('file', None) is not None or opt.get('files', {}):
                    if opt.get('post', None) is not None:

                        ### Convert string post in dict
                        try:
                            json.loads(opt['post'])
                            payload = opt['post']
                        except Exception:
                            if not isinstance(opt['post'], dict):
                                post = urlparse.parse_qs(opt['post'], keep_blank_values=1)
                                payload = dict()
                                for key, value in list(post.items()):
                                    try:
                                        payload[key] = value[0]
                                    except Exception:
                                        payload[key] = ''
                            else:
                                payload = opt['post']

                    ### Verifies 'file' and 'file_name' options to upload a buffer or a file
                    if opt.get('files', {}):
                        files = opt['files']
                        file_name = opt.get('file_name', 'File Object')
                    elif opt.get('file', None) is not None:
                        if len(opt['file']) < 256 and os.path.isfile(opt['file']):
                            if opt.get('file_name', None) is None:
                                path_file, opt['file_name'] = os.path.split(opt['file'])
                            files = {'file': (opt['file_name'], open(opt['file'], 'rb'))}
                            file_name = opt['file']
                        else:
                            files = {'file': (opt.get('file_name', 'Default'), opt['file'])}
                            file_name = opt.get('file_name', 'Default') + ', Buffer de memoria'

                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    if opt.get('only_headers', False):
                        ### Makes the request with HEAD method
                        req = session.head(url, allow_redirects=opt.get('follow_redirects', True),
                                           timeout=opt.get('timeout', None), params=opt.get('params', {}))
                    else:
                        ### Makes the request with POST method
                        req = session.post(url, data=payload, allow_redirects=opt.get('follow_redirects', True),
                                           files=files, timeout=opt.get('timeout', None),
                                           params=opt.get('params', {}))

                elif opt.get('only_headers', False):
                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    ### Makes the request with HEAD method
                    req = session.head(url, allow_redirects=opt.get('follow_redirects', True),
                                       timeout=opt.get('timeout', None), params=opt.get('params', {}))
                else:
                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    ### Makes the request with GET method
                    req = session.get(url, allow_redirects=opt.get('follow_redirects', True),
                                      timeout=opt.get('timeout', None), params=opt.get('params', {}))

            except Exception as e:
                if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
                    req = requests.Response()
                    response['data'] = ''
                    response['sucess'] = False
                    info_dict.append(('Success', 'False'))
                    response['code'] = str(e)
                    info_dict.append(('Response code', str(e)))
                    info_dict.append(('Finalizado en', time.time() - inicio))
                    if not opt.get('alfa_s', False):
                        show_infobox(info_dict)
                    import traceback
                    logger.error(traceback.format_exc(1))
                    return type('HTTPResponse', (), response)
                else:
                    req = requests.Response()
                    req.status_code = str(e)

        else:
            response['data'] = ''
            response['sucess'] = False
            response['code'] = ''
            response['soup'] = None
            return type('HTTPResponse', (), response)

        response_code = req.status_code

        if req.headers.get('Server', '').startswith('cloudflare') and response_code in [429, 503, 403] \
                and not opt.get('CF', False) and opt.get('CF_test', True):
            domain = urlparse.urlparse(url)[1]
            if domain not in CF_LIST:
                CF_LIST += [domain]
                opt["CF"] = True
                with open(CF_LIST_PATH, "a") as CF_File:
                    CF_File.write("%s\n" % domain)
                logger.debug("CF retry... for domain: %s" % domain)
                return downloadpage(url, **opt)

        if req.headers.get('Server', '') == 'Alfa' and response_code in [429, 503, 403] \
                and not opt.get('cf_v2', False) and opt.get('CF_test', True):
            opt["cf_v2"] = True
            logger.debug("CF Assistant retry... for domain: %s" % urlparse.urlparse(url)[1])
            return downloadpage(url, **opt)

        response['data'] = req.content

        try:
            response['encoding'] = str(req.encoding).lower() if req.encoding and req.encoding is not None else None

            if opt.get('encoding') and opt.get('encoding') is not None:
                encoding = opt["encoding"]
            else:
                encoding = response['encoding']
            if not encoding:
                encoding = 'utf-8'

            if PY3 and isinstance(response['data'], bytes) and encoding is not None \
                    and ('text/' in req.headers.get('Content-Type', '')
                         or 'json' in req.headers.get('Content-Type', '')
                         or 'xml' in req.headers.get('Content-Type', '')):
                response['data'] = response['data'].decode(encoding)
        except Exception:
            import traceback
            logger.error(traceback.format_exc(1))

        try:
            if PY3 and isinstance(response['data'], bytes) \
                    and not ('application' in req.headers.get('Content-Type', '')
                             or 'javascript' in req.headers.get('Content-Type', '')
                             or 'image' in req.headers.get('Content-Type', '')):
                response['data'] = "".join(chr(x) for x in bytes(response['data']))
        except Exception:
            import traceback
            logger.error(traceback.format_exc(1))

        try:
            if 'text/' in req.headers.get('Content-Type', '') \
                    or 'json' in req.headers.get('Content-Type', '') \
                    or 'xml' in req.headers.get('Content-Type', ''):
                # Replace HTML entities for accented characters with the characters themselves
                response['data'] = response['data'].replace('&Aacute;', 'Á').replace('&Eacute;', 'É')\
                        .replace('&Iacute;', 'Í').replace('&Oacute;', 'Ó').replace('&Uacute;', 'Ú')\
                        .replace('&Uuml;', 'Ü').replace('&iexcl;', '¡').replace('&iquest;', '¿')\
                        .replace('&Ntilde;', 'Ñ').replace('&ntilde;', 'n').replace('&uuml;', 'ü')\
                        .replace('&aacute;', 'á').replace('&eacute;', 'é').replace('&iacute;', 'í')\
                        .replace('&oacute;', 'ó').replace('&uacute;', 'ú').replace('&ordf;', 'ª')\
                        .replace('&ordm;', 'º')
        except Exception:
            import traceback
            logger.error(traceback.format_exc(1))

        response['url'] = req.url
        if not response['data']:
            response['data'] = ''

        response['soup'] = None
        if opt.get("soup", False):
            try:
                from bs4 import BeautifulSoup
                response["soup"] = BeautifulSoup(req.content, "html5lib",
                                                 from_encoding=opt.get('encoding', response['encoding']))
            except Exception:
                import traceback
                logger.error("Error creando sopa")
                logger.error(traceback.format_exc())

        try:
            if 'bittorrent' not in req.headers.get('Content-Type', '') \
                    and 'octet-stream' not in req.headers.get('Content-Type', '') \
                    and 'zip' not in req.headers.get('Content-Type', '') \
                    and opt.get('json_to_utf8', True):
                response['json'] = to_utf8(req.json())
            else:
                response['json'] = dict()
        except Exception:
            response['json'] = dict()

        response['code'] = response_code
        response['headers'] = req.headers
        response['cookies'] = req.cookies

        if response['code'] == 200:
            response['sucess'] = True
        else:
            response['sucess'] = False

        if opt.get('cookies', True):
            save_cookies(alfa_s=opt.get('alfa_s', False))

        is_channel = inspect.getmodule(inspect.currentframe().f_back)
        is_channel = scrapertools.find_single_match(str(is_channel), "<module '(channels).*?'")
        if is_channel and isinstance(response_code, int):
            if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
                if response_code > 399:
                    info_dict, response = fill_fields_post(info_dict, req, response, req_headers, inicio)
                    show_infobox(info_dict)
                    raise WebErrorException(urlparse.urlparse(url)[1])

        info_dict, response = fill_fields_post(info_dict, req, response, req_headers, inicio)

        if not 'api.themoviedb' in url and not 'api.trakt' in url and not opt.get('alfa_s', False) \
                and not opt.get("hide_infobox"):
            show_infobox(info_dict)

        # If there is a proxy error, refresh the list and retry as many times as proxy_retries indicates
        response, url, opt = proxy_post_processing(url, proxy_data, response, opt)

        # If the proxy layer orders to leave the loop, exit
        if opt.get('out_break', False):
            break

    return type('HTTPResponse', (), response)
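

# Illustrative usage sketch (not part of the module), assuming the module is imported as httptools
# from a channel; the URL, form fields and import path are hypothetical. It exercises the documented
# options: POST payload, extra headers, timeout, ignore_response_code and the optional soup object.
def _example_downloadpage_usage():
    from core import httptools  # hypothetical import path used by channels
    response = httptools.downloadpage("https://example.com/login",
                                      post="user=alfa&password=secret",
                                      headers={"Referer": "https://example.com/"},
                                      timeout=15,
                                      ignore_response_code=True,
                                      soup=True)
    if response.sucess and response.code == 200:
        data = response.data   # decoded body
        page = response.soup   # BeautifulSoup object, because soup=True was requested
        info = response.json   # dict, empty if the body was not json
        return data, page, info
    return None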