def _doLogin(self):
    # Log in to TorrentLeech with the credentials configured in sickbeard.
    # NOTE(review): this chunk ends inside the error path; the success
    # handling presumably continues beyond the lines visible here — confirm.
    login_params = {
        'username': sickbeard.TORRENTLEECH_USERNAME,
        'password': sickbeard.TORRENTLEECH_PASSWORD,
        'remember_me': 'on',
        'login': '******'
    }
    # Fresh cloudscraper session: transparently solves Cloudflare challenges.
    self.session = cloudscraper.create_scraper()
    logger.log("[" + self.name + "] Attempting to Login")
    try:
        response = self.session.post(
            "{0}user/account/login".format(self.url),
            data=login_params,
            timeout=30,
            verify=False
        )
    # Python 2 except-comma syntax, kept as-is (file-wide convention).
    except (exceptions.ConnectionError, exceptions.HTTPError), e:
        logger.log("[{0}] {1} Error: {2}".format(
            self.name, self.funcName(), e
        ), logger.ERROR
        )
        return False
def _doLogin(self):
    """Authenticate with TorrentDay.

    Tries, in order: an e-mail login link (if configured), stored
    uid/pass auth cookies, and finally a captcha-bypass login when an
    anti-captcha key plus username/password are configured.

    @return: True when logged in and the rss uid/passkey were extracted,
        False otherwise.
    @rtype: bool
    """
    if not self.session:
        self.session = cloudscraper.create_scraper()
    if sickbeard.TORRENTDAY_EMAIL_URL:
        self._handleEmailLink()
    if sickbeard.TORRENTDAY_UID and sickbeard.TORRENTDAY_PASS:
        self.checkAuthCookies()
    # If browsing still redirects to login.php, the cookies are not valid.
    response = self.session.get(self.url + '/browse.php')
    if 'login.php' in response.url:
        if sickbeard.TORRENTDAY_ANTICAPTCHA_KEY and sickbeard.TORRENTDAY_USERNAME and sickbeard.TORRENTDAY_PASSWORD:
            if not self._bypassCaptcha():
                return False
        else:
            # BUG FIX: log message typo "authenicate" -> "authenticate".
            logger.log(
                "[{0}] {1} Appears we cannot authenticate with TorrentDay.".
                format(self.name, self.funcName()), logger.ERROR)
            return False
    # The rss uid/passkey are required to build authenticated feed URLs.
    if not self._getPassKey() or not self.rss_uid or not self.rss_passkey:
        logger.log(
            "[{0}] {1} Could not extract rss uid/passkey... aborting.".
            format(self.name, self.funcName()), logger.ERROR)
        return False
    return True
def __init__(self):
    # Torrentz meta-search provider (public tracker, no login required).
    generic.TorrentProvider.__init__(self, "Torrentz")
    # NOTE(review): the cache is constructed before self.url/self.name are
    # assigned below — presumably TORRENTZCache only stores the provider
    # reference at construction time; verify before reordering.
    self.cache = TORRENTZCache(self)
    self.url = 'https://torrentz2.eu/'
    self.name = "Torrentz"
    self.supportsBacklog = True
    # cloudscraper session solves Cloudflare anti-bot challenges.
    self.session = cloudscraper.create_scraper()
    # Helper returning the caller's function name, used in log messages.
    self.funcName = lambda n=0: sys._getframe(n + 1).f_code.co_name + "()"
    logger.log("[" + self.name + "] initializing...")
def _doLogin(self):
    """Log in to IPTorrents with the configured username/password.

    Selects the working mirror first (switchURL), then POSTs the
    credentials through a fresh cloudscraper session.
    On a connection/HTTP error the session is reset and False is returned.
    """
    self.switchURL()
    self.session = cloudscraper.create_scraper()
    logger.log("[{}] Attempting to Login".format(self.name))
    try:
        response = self.session.post("{}/take_login.php".format(self.url),
                                     data={
                                         'username': sickbeard.IPTORRENTS_USERNAME,
                                         'password': sickbeard.IPTORRENTS_PASSWORD,
                                     },
                                     timeout=30,
                                     verify=False)
    except (exceptions.ConnectionError, exceptions.HTTPError) as e:
        self.session = None
        # BUG FIX: was "...".foramt(...) — an AttributeError raised here
        # masked the original connection error instead of logging it.
        logger.log(
            "[{}] {} Error: {}".format(self.name, self.funcName(), str(e)),
            logger.ERROR)
        return False
def downloadpage(url, **opt):
    # logger.info()
    """
    Open a url and return the data obtained

    @param url: url to open.
    @type url: str
    @param post: If it contains any value, it is sent by POST.
    @type post: str
    @param headers: Headers for the request, if it contains nothing the default headers will be used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Indicates if redirects are to be followed.
    @type follow_redirects: bool
    @param cookies: Indicates whether cookies are to be used.
    @type cookies: bool
    @param replace_headers: If True, headers passed by the "headers" parameter will completely replace
        the default headers. If False, the headers passed by the "headers" parameter will modify the
        headers by default.
    @type replace_headers: bool
    @param add_referer: Indicates whether to add the "Referer" header using the domain of the url as a value.
    @type add_referer: bool
    @param only_headers: If True, only headers will be downloaded, omitting the content of the url.
    @type only_headers: bool
    @param random_headers: If True, use the method of selecting random headers.
    @type random_headers: bool
    @param ignore_response_code: If True, ignore the method for WebErrorException for error like 404
        error in veseriesonline, but it is a functional data
    @type ignore_response_code: bool
    @param use_requests: Use requests.session()
    @type use_requests: bool
    @return: Result of the petition
    @rtype: HTTPResponse

    HTTPResponse.sucess:  bool   True: Request successful | False: Error when making the request
    HTTPResponse.code:    int    Server response code or error code if an error occurs
    HTTPResponse.error:   str    Description of the error in case of an error
    HTTPResponse.headers: dict   Dictionary with server response headers
    HTTPResponse.data:    str    Response obtained from server
    HTTPResponse.json:    dict   Response obtained from the server in json format
    HTTPResponse.time:    float  Time taken to make the request
    """
    url = scrapertools.unescape(url)
    domain = urlparse.urlparse(url).netloc
    global CF_LIST
    CF = False
    # Domains known to need Cloudflare handling get a cloudscraper session;
    # everything else uses a plain requests session.
    if domain in FORCE_CLOUDSCRAPER_LIST:
        from lib import cloudscraper
        session = cloudscraper.create_scraper()
        CF = True
    else:
        from lib import requests
        session = requests.session()
    # Cloudflare-listed domains are fetched through the web.archive.org
    # "save" endpoint instead of directly.
    if domain in CF_LIST or opt.get('CF', False):
        url = 'https://web.archive.org/save/' + url
        CF = True
    if config.get_setting('resolver_dns') and not opt.get(
            'use_requests', False):
        from specials import resolverdns
        session.mount('https://', resolverdns.CipherSuiteAdapter(domain, CF))
    req_headers = default_headers.copy()
    # Headers passed as parameters
    if opt.get('headers', None) is not None:
        if not opt.get('replace_headers', False):
            req_headers.update(dict(opt['headers']))
        else:
            req_headers = dict(opt['headers'])
    if opt.get('random_headers', False) or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        req_headers['User-Agent'] = random_useragent()
    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")
    opt['url_save'] = url
    opt['post_save'] = opt.get('post', None)
    response = {}
    info_dict = []
    payload = dict()
    files = {}
    file_name = ''
    session.verify = opt.get('verify', True)
    if opt.get('cookies', True):
        session.cookies = cj
    session.headers.update(req_headers)
    proxy_data = {'dict': {}}
    inicio = time.time()
    if opt.get(
            'timeout',
            None) is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
        opt['timeout'] = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
    if opt['timeout'] == 0:
        opt['timeout'] = None
    if len(url) > 0:
        try:
            if opt.get('post', None) is not None or opt.get('file',
                                                            None) is not None:
                if opt.get('post', None) is not None:
                    # Convert string post in dict
                    try:
                        json.loads(opt['post'])
                        payload = opt['post']
                    except:
                        if not isinstance(opt['post'], dict):
                            post = urlparse.parse_qs(opt['post'],
                                                     keep_blank_values=1)
                            payload = dict()
                            for key, value in post.items():
                                try:
                                    payload[key] = value[0]
                                except:
                                    payload[key] = ''
                        else:
                            payload = opt['post']
                # Verify 'file' and 'file_name' options to upload a buffer or file
                if opt.get('file', None) is not None:
                    if os.path.isfile(opt['file']):
                        if opt.get('file_name', None) is None:
                            path_file, opt['file_name'] = os.path.split(
                                opt['file'])
                        files = {
                            'file': (opt['file_name'], open(opt['file'], 'rb'))
                        }
                        file_name = opt['file']
                    else:
                        files = {
                            'file': (opt.get('file_name', 'Default'),
                                     opt['file'])
                        }
                        file_name = opt.get('file_name',
                                            'Default') + ', Buffer de memoria'
                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                if opt.get('only_headers', False):
                    # Makes the request with HEAD method
                    req = session.head(url,
                                       allow_redirects=opt.get(
                                           'follow_redirects', True),
                                       timeout=opt['timeout'])
                else:
                    # Makes the request with POST method
                    req = session.post(url,
                                       data=payload,
                                       allow_redirects=opt.get(
                                           'follow_redirects', True),
                                       files=files,
                                       timeout=opt['timeout'])
            elif opt.get('only_headers', False):
                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                # Makes the request with HEAD method
                req = session.head(url,
                                   allow_redirects=opt.get(
                                       'follow_redirects', True),
                                   timeout=opt['timeout'])
            else:
                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                # Makes the request with GET method
                req = session.get(url,
                                  allow_redirects=opt.get(
                                      'follow_redirects', True),
                                  timeout=opt['timeout'])
        except Exception as e:
            from lib import requests
            req = requests.Response()
            if not opt.get('ignore_response_code',
                           False) and not proxy_data.get('stat', ''):
                response['data'] = ''
                response['sucess'] = False
                info_dict.append(('Success', 'False'))
                response['code'] = str(e)
                info_dict.append(('Response code', str(e)))
                info_dict.append(('Finalizado en', time.time() - inicio))
                if not opt.get('alfa_s', False):
                    show_infobox(info_dict)
                return type('HTTPResponse', (), response)
            else:
                req.status_code = str(e)
    else:
        # Empty url: return an empty, unsuccessful response.
        response['data'] = ''
        response['sucess'] = False
        response['code'] = ''
        return type('HTTPResponse', (), response)
    response_code = req.status_code
    # Cloudflare blocked the plain request: remember the domain in the
    # persisted CF list and retry through the archive.org path.
    if req.headers.get('Server', '').startswith(
            'cloudflare') and response_code in [
                429, 503, 403
            ] and not opt.get('CF', False):
        if domain not in CF_LIST:
            opt["CF"] = True
            with open(CF_LIST_PATH, "a") as CF_File:
                CF_File.write("%s\n" % domain)
        logger.debug("CF retry... for domain: %s" % domain)
        return downloadpage(url, **opt)
    response['data'] = req.content if req.content else ''
    if CF:
        # Strip the web.archive.org "/save/" prefix from links in the page.
        import re
        response['data'] = re.sub('["|\']/save/[^"]*(https?://[^"]+)', '"\\1',
                                  response['data'])
    response['url'] = req.url
    if type(response['data']) != str:
        response['data'] = response['data'].decode('UTF-8')
    if not response['data']:
        response['data'] = ''
    try:
        response['json'] = to_utf8(req.json())
    except:
        response['json'] = dict()
    response['code'] = response_code
    response['headers'] = req.headers
    response['cookies'] = req.cookies
    info_dict, response = fill_fields_post(info_dict, req, response,
                                           req_headers, inicio)
    if opt.get('cookies', True):
        save_cookies(alfa_s=opt.get('alfa_s', False))
    # is_channel = inspect.getmodule(inspect.currentframe().f_back)
    # is_channel = scrapertools.find_single_match(str(is_channel), "<module '(channels).*?'")
    # if is_channel and isinstance(response_code, int):
    #     if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
    #         if response_code > 399:
    #             show_infobox(info_dict)
    #             raise WebErrorException(urlparse.urlparse(url)[1])
    if not 'api.themoviedb' in url and not opt.get('alfa_s', False):
        show_infobox(info_dict)
    return type('HTTPResponse', (), response)
# -*- coding: utf-8 -*- # ------------------------------------------------------------ # Canale per AnimeUnity # ------------------------------------------------------------ import requests, json, copy from core import support from platformcode import autorenumber try: from lib import cloudscraper except: from lib import cloudscraper host = support.config.get_channel_url() response = cloudscraper.create_scraper().get(host + '/archivio') csrf_token = support.match(response.text, patron='name="csrf-token" content="([^"]+)"').match headers = { 'content-type': 'application/json;charset=UTF-8', 'x-csrf-token': csrf_token, 'Cookie': '; '.join([x.name + '=' + x.value for x in response.cookies]) } @support.menu def mainlist(item): top = [('Ultimi Episodi', ['', 'news'])] menu = [('Anime {bullet bold}', ['', 'menu', {}, 'tvshow']), ('Film {submenu}', ['', 'menu', {
def downloadpage(url, **opt):
    """
    Open a url and return the data obtained.

    @param url: url to open.
    @type url: str
    @param post: If it contains any value, it is sent by POST.
    @type post: str
    @param headers: Headers for the request; if empty the default headers are used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Whether redirects are followed.
    @type follow_redirects: bool
    @param cookies: Whether cookies are used.
    @type cookies: bool
    @param replace_headers: If True, the headers passed via the "headers" parameter completely
        replace the default headers. If False, they only update the default headers.
    @type replace_headers: bool
    @param add_referer: Whether to add the "Referer" header using the url's domain as value.
    @type add_referer: bool
    @param only_headers: If True, only the headers are downloaded, omitting the content of the url.
    @type only_headers: bool
    @param random_headers: If True, use the random header selection method.
    @type random_headers: bool
    @param ignore_response_code: If True, skip the WebErrorException path for errors such as a 404
        that still carries functional data.
    @type ignore_response_code: bool
    @return: Result of the request
    @rtype: HTTPResponse

    HTTPResponse.sucess:  bool   True: request completed correctly | False: error
    HTTPResponse.code:    int    Server response code, or error code when an error occurs
    HTTPResponse.error:   str    Description of the error, if any
    HTTPResponse.headers: dict   Dictionary with the server response headers
    HTTPResponse.data:    str    Response obtained from the server
    HTTPResponse.json:    dict   Response obtained from the server in json format
    HTTPResponse.time:    float  Time taken to make the request
    """
    logger.info()
    from . import scrapertools
    load_cookies()
    import requests
    # Default headers, unless the caller overrides them
    req_headers = default_headers.copy()
    # Headers passed as parameters
    if opt.get('headers', None) is not None:
        if not opt.get('replace_headers', False):
            req_headers.update(dict(opt['headers']))
        else:
            # BUG FIX: was dict(opt('headers')) — calling the kwargs dict,
            # which raised TypeError whenever replace_headers=True.
            req_headers = dict(opt['headers'])
    if opt.get('random_headers', False) or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        req_headers['User-Agent'] = random_useragent()
    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")
    opt['proxy_retries_counter'] = 0
    opt['url_save'] = url
    opt['post_save'] = opt.get('post', None)
    # Retry loop: one pass per proxy retry allowed by 'proxy_retries'
    while opt['proxy_retries_counter'] <= opt.get('proxy_retries', 1):
        response = {}
        info_dict = []
        payload = dict()
        files = {}
        file_name = ''
        opt['proxy_retries_counter'] += 1
        domain = urlparse.urlparse(url)[1]
        global CS_stat
        if domain in CF_LIST or opt.get(
                'CF', False):  # In the CF list, or forced by the caller
            from lib import cloudscraper
            session = cloudscraper.create_scraper(
            )  # The domain needs CloudScraper
            session.verify = True
            CS_stat = True
        else:
            session = requests.session()
            session.verify = False
            CS_stat = False
        if opt.get('cookies', True):
            session.cookies = cj
        session.headers.update(req_headers)
        # Prepare the url in case a proxy is needed, or if
        # 'proxy_addr_forced' is sent by the channel
        url, proxy_data, opt = check_proxy(url, **opt)
        if opt.get('proxy_addr_forced', {}):
            session.proxies = opt['proxy_addr_forced']
        elif proxy_data.get('dict', {}):
            session.proxies = proxy_data['dict']
        inicio = time.time()
        if opt.get(
                'timeout', None
        ) is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
            opt['timeout'] = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
        if opt['timeout'] == 0:
            opt['timeout'] = None
        if len(url) > 0:
            try:
                if opt.get('post', None) is not None or opt.get(
                        'file', None) is not None:
                    if opt.get('post', None) is not None:
                        ### Convert string post in dict
                        try:
                            json.loads(opt['post'])
                            payload = opt['post']
                        except:
                            if not isinstance(opt['post'], dict):
                                post = urlparse.parse_qs(opt['post'],
                                                         keep_blank_values=1)
                                payload = dict()
                                for key, value in list(post.items()):
                                    try:
                                        payload[key] = value[0]
                                    except:
                                        payload[key] = ''
                            else:
                                payload = opt['post']
                    ### Verifies 'file' and 'file_name' options to upload a buffer or a file
                    if opt.get('file', None) is not None:
                        if len(opt['file']) < 256 and os.path.isfile(
                                opt['file']):
                            if opt.get('file_name', None) is None:
                                path_file, opt['file_name'] = os.path.split(
                                    opt['file'])
                            files = {
                                'file':
                                (opt['file_name'], open(opt['file'], 'rb'))
                            }
                            file_name = opt['file']
                        else:
                            files = {
                                'file': (opt.get('file_name', 'Default'),
                                         opt['file'])
                            }
                            file_name = opt.get(
                                'file_name',
                                'Default') + ', Buffer de memoria'
                    info_dict = fill_fields_pre(url, opt, proxy_data,
                                                file_name)
                    if opt.get('only_headers', False):
                        ### Makes the request with HEAD method
                        req = session.head(url,
                                           allow_redirects=opt.get(
                                               'follow_redirects', True),
                                           timeout=opt['timeout'])
                    else:
                        ### Makes the request with POST method
                        req = session.post(url,
                                           data=payload,
                                           allow_redirects=opt.get(
                                               'follow_redirects', True),
                                           files=files,
                                           timeout=opt['timeout'])
                elif opt.get('only_headers', False):
                    info_dict = fill_fields_pre(url, opt, proxy_data,
                                                file_name)
                    ### Makes the request with HEAD method
                    req = session.head(url,
                                       allow_redirects=opt.get(
                                           'follow_redirects', True),
                                       timeout=opt['timeout'])
                else:
                    info_dict = fill_fields_pre(url, opt, proxy_data,
                                                file_name)
                    ### Makes the request with GET method
                    req = session.get(url,
                                      allow_redirects=opt.get(
                                          'follow_redirects', True),
                                      timeout=opt['timeout'])
            except Exception as e:
                if not opt.get('ignore_response_code',
                               False) and not proxy_data.get('stat', ''):
                    req = requests.Response()
                    response['data'] = ''
                    response['sucess'] = False
                    info_dict.append(('Success', 'False'))
                    response['code'] = str(e)
                    info_dict.append(('Response code', str(e)))
                    info_dict.append(('Finalizado en', time.time() - inicio))
                    if not opt.get('alfa_s', False):
                        show_infobox(info_dict)
                    import traceback
                    logger.error(traceback.format_exc(1))
                    return type('HTTPResponse', (), response)
                else:
                    req = requests.Response()
                    req.status_code = str(e)
        else:
            # Empty url: return an empty, unsuccessful response.
            response['data'] = ''
            response['sucess'] = False
            response['code'] = ''
            return type('HTTPResponse', (), response)
        response_code = req.status_code
        # Cloudflare rejected a plain request: persist the domain in the CF
        # list and retry with CloudScraper.
        if req.headers.get('Server', '').startswith(
                'cloudflare') and response_code in [
                    429, 503, 403
                ] and not opt.get('CF', False):
            domain = urlparse.urlparse(url)[1]
            if domain not in CF_LIST:
                opt["CF"] = True
                with open(CF_LIST_PATH, "a") as CF_File:
                    CF_File.write("%s\n" % domain)
            logger.debug("CF retry... for domain: %s" % domain)
            return downloadpage(url, **opt)
        response['data'] = req.content
        try:
            encoding = req.encoding
            if not encoding:
                encoding = 'utf8'
            if PY3 and isinstance(response['data'], bytes) and 'Content-Type' in req.headers \
                    and ('text/' in req.headers['Content-Type'] or 'json' in req.headers['Content-Type'] \
                    or 'xml' in req.headers['Content-Type']):
                response['data'] = response['data'].decode(encoding)
        except:
            import traceback
            logger.error(traceback.format_exc(1))
        try:
            if PY3 and isinstance(response['data'], bytes) and 'Content-Type' in req.headers \
                    and (not 'application' in req.headers['Content-Type'] \
                    or 'javascript' in req.headers['Content-Type']):
                response['data'] = "".join(
                    chr(x) for x in bytes(response['data']))
        except:
            import traceback
            logger.error(traceback.format_exc(1))
        try:
            # Repair common mojibake sequences in text responses.
            if 'Content-Type' in req.headers and ('text/' in req.headers['Content-Type'] \
                    or 'json' in req.headers['Content-Type'] or 'xml' in req.headers['Content-Type']):
                response['data'] = response['data'].replace('Á', 'Á').replace('É', 'É')\
                        .replace('Í', 'Í').replace('Ó', 'Ó').replace('Ú', 'Ú')\
                        .replace('Ü', 'Ü').replace('¡', '¡').replace('¿', '¿')\
                        .replace('Ñ', 'Ñ').replace('ñ', 'n').replace('ü', 'ü')\
                        .replace('á', 'á').replace('é', 'é').replace('í', 'í')\
                        .replace('ó', 'ó').replace('ú', 'ú').replace('ª', 'ª')\
                        .replace('º', 'º')
        except:
            import traceback
            logger.error(traceback.format_exc(1))
        response['url'] = req.url
        if not response['data']:
            response['data'] = ''
        try:
            response['json'] = to_utf8(req.json())
        except:
            response['json'] = dict()
        response['code'] = response_code
        response['headers'] = req.headers
        response['cookies'] = req.cookies
        info_dict, response = fill_fields_post(info_dict, req, response,
                                               req_headers, inicio)
        if opt.get('cookies', True):
            save_cookies(alfa_s=opt.get('alfa_s', False))
        is_channel = inspect.getmodule(inspect.currentframe().f_back)
        is_channel = scrapertools.find_single_match(
            str(is_channel), "<module '(channels).*?'")
        if is_channel and isinstance(response_code, int):
            if not opt.get('ignore_response_code',
                           False) and not proxy_data.get('stat', ''):
                if response_code > 399:
                    show_infobox(info_dict)
                    raise WebErrorException(urlparse.urlparse(url)[1])
        if not 'api.themoviedb' in url and not opt.get('alfa_s', False):
            show_infobox(info_dict)
        # On a proxy error, refresh the proxy list and retry up to the
        # number given in proxy_retries
        response['data'], response['sucess'], url, opt = proxy_post_processing(
            url, proxy_data, response, opt)
        if opt.get('out_break', False):
            break
    return type('HTTPResponse', (), response)
def downloadpage(url, **opt):
    """
    Open a url and return the data obtained.

    @param url: url to open.
    @type url: str
    @param post: If it contains any value, it is sent by POST.
    @type post: str
    @param headers: Headers for the request; if empty the default headers are used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Whether redirects are followed.
    @type follow_redirects: bool
    @param cookies: Whether cookies are used.
    @type cookies: bool
    @param replace_headers: If True, the headers passed via the "headers" parameter completely
        replace the default headers. If False, they only update the default headers.
    @type replace_headers: bool
    @param add_referer: Whether to add the "Referer" header using the url's domain as value.
    @type add_referer: bool
    @param only_headers: If True, only the headers are downloaded, omitting the content of the url.
    @type only_headers: bool
    @param random_headers: If True, use the random header selection method.
    @type random_headers: bool
    @param ignore_response_code: If True, skip the WebErrorException path for errors such as a 404
        that still carries functional data.
    @type ignore_response_code: bool
    @return: Result of the request
    @rtype: HTTPResponse

    HTTPResponse.sucess:  bool   True: request completed correctly | False: error
    HTTPResponse.code:    int    Server response code, or error code when an error occurs
    HTTPResponse.error:   str    Description of the error, if any
    HTTPResponse.headers: dict   Dictionary with the server response headers
    HTTPResponse.data:    str    Response obtained from the server
    HTTPResponse.json:    dict   Response obtained from the server in json format
    HTTPResponse.time:    float  Time taken to make the request
    """
    logger.info()
    load_cookies()
    import requests
    from lib import cloudscraper
    # Default headers, unless the caller overrides them
    req_headers = default_headers.copy()
    # Headers passed as parameters
    if opt.get('headers', None) is not None:
        if not opt.get('replace_headers', False):
            req_headers.update(dict(opt['headers']))
        else:
            # BUG FIX: was dict(opt('headers')) — calling the kwargs dict,
            # which raised TypeError whenever replace_headers=True.
            req_headers = dict(opt['headers'])
    if opt.get('random_headers', False) or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        req_headers['User-Agent'] = random_useragent()
    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")
    opt['proxy_retries_counter'] = 0
    opt['url_save'] = url
    opt['post_save'] = opt.get('post', None)
    # Retry loop: one pass per proxy retry allowed by 'proxy_retries'
    while opt['proxy_retries_counter'] <= opt.get('proxy_retries', 1):
        response = {}
        info_dict = []
        payload = dict()
        files = {}
        file_name = ''
        opt['proxy_retries_counter'] += 1
        # This variant always uses CloudScraper for the session.
        session = cloudscraper.create_scraper()
        session.verify = False
        if opt.get('cookies', True):
            session.cookies = cj
        session.headers.update(req_headers)
        # Prepare the url in case a proxy is needed, or if 'proxies' is sent
        # by the channel
        url, proxy_data, opt = check_proxy(url, **opt)
        if opt.get('proxies', None) is not None:
            session.proxies = opt['proxies']
        elif proxy_data.get('dict', {}):
            session.proxies = proxy_data['dict']
        inicio = time.time()
        if opt.get(
                'timeout', None
        ) is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
            opt['timeout'] = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
        if opt['timeout'] == 0:
            opt['timeout'] = None
        if len(url) > 0:
            try:
                if opt.get('post', None) is not None or opt.get(
                        'file', None) is not None:
                    if opt.get('post', None) is not None:
                        ### Convert string post in dict
                        try:
                            json.loads(opt['post'])
                            payload = opt['post']
                        except:
                            if not isinstance(opt['post'], dict):
                                post = urlparse.parse_qs(opt['post'],
                                                         keep_blank_values=1)
                                payload = dict()
                                for key, value in post.items():
                                    try:
                                        payload[key] = value[0]
                                    except:
                                        payload[key] = ''
                            else:
                                payload = opt['post']
                    ### Verifies 'file' and 'file_name' options to upload a buffer or a file
                    if opt.get('file', None) is not None:
                        if os.path.isfile(opt['file']):
                            if opt.get('file_name', None) is None:
                                path_file, opt['file_name'] = os.path.split(
                                    opt['file'])
                            files = {
                                'file':
                                (opt['file_name'], open(opt['file'], 'rb'))
                            }
                            file_name = opt['file']
                        else:
                            files = {
                                'file': (opt.get('file_name', 'Default'),
                                         opt['file'])
                            }
                            file_name = opt.get(
                                'file_name',
                                'Default') + ', Buffer de memoria'
                    info_dict = fill_fields_pre(url, opt, proxy_data,
                                                file_name)
                    if opt.get('only_headers', False):
                        ### Makes the request with HEAD method
                        req = session.head(url,
                                           allow_redirects=opt.get(
                                               'follow_redirects', True),
                                           timeout=opt['timeout'])
                    else:
                        ### Makes the request with POST method
                        req = session.post(url,
                                           data=payload,
                                           allow_redirects=opt.get(
                                               'follow_redirects', True),
                                           files=files,
                                           timeout=opt['timeout'])
                elif opt.get('only_headers', False):
                    info_dict = fill_fields_pre(url, opt, proxy_data,
                                                file_name)
                    ### Makes the request with HEAD method
                    req = session.head(url,
                                       allow_redirects=opt.get(
                                           'follow_redirects', True),
                                       timeout=opt['timeout'])
                else:
                    info_dict = fill_fields_pre(url, opt, proxy_data,
                                                file_name)
                    ### Makes the request with GET method
                    req = session.get(url,
                                      allow_redirects=opt.get(
                                          'follow_redirects', True),
                                      timeout=opt['timeout'])
            # MODERNIZED: was the Python2-only "except Exception, e" form;
            # "as e" is valid on Python 2.6+ and Python 3.
            except Exception as e:
                if not opt.get('ignore_response_code',
                               False) and not proxy_data.get('stat', ''):
                    req = requests.Response()
                    response['data'] = ''
                    response['sucess'] = False
                    info_dict.append(('Success', 'False'))
                    response['code'] = str(e)
                    info_dict.append(('Response code', str(e)))
                    info_dict.append(('Finalizado en', time.time() - inicio))
                    if not opt.get('alfa_s', False):
                        show_infobox(info_dict)
                    return type('HTTPResponse', (), response)
                else:
                    req = requests.Response()
                    req.status_code = str(e)
        else:
            # Empty url: return an empty, unsuccessful response.
            response['data'] = ''
            response['sucess'] = False
            response['code'] = ''
            return type('HTTPResponse', (), response)
        response_code = req.status_code
        response['data'] = req.content
        response['url'] = req.url
        if not response['data']:
            response['data'] = ''
        try:
            response['json'] = to_utf8(req.json())
        except:
            response['json'] = dict()
        response['code'] = response_code
        response['headers'] = req.headers
        response['cookies'] = req.cookies
        info_dict, response = fill_fields_post(info_dict, req, response,
                                               req_headers, inicio)
        if opt.get('cookies', True):
            save_cookies(alfa_s=opt.get('alfa_s', False))
        is_channel = inspect.getmodule(inspect.currentframe().f_back)
        is_channel = scrapertools.find_single_match(
            str(is_channel), "<module '(channels).*?'")
        if is_channel and isinstance(response_code, int):
            if not opt.get('ignore_response_code',
                           False) and not proxy_data.get('stat', ''):
                if response_code > 399:
                    show_infobox(info_dict)
                    raise WebErrorException(urlparse.urlparse(url)[1])
        if not 'api.themoviedb' in url and not opt.get('alfa_s', False):
            show_infobox(info_dict)
        # On a proxy error, refresh the proxy list and retry up to the
        # number given in proxy_retries
        response['data'], response['sucess'], url, opt = proxy_post_processing(
            url, proxy_data, response, opt)
        if opt.get('out_break', False):
            break
def downloadpage(url, **opt):
    """
    Open a url and return the data obtained.

    @param url: url to open.
    @type url: str
    @param post: If it contains any value, it is sent via POST.
    @type post: str (json data), dict
    @param headers: Headers for the request; if empty, the default headers are used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Whether redirects should be followed.
    @type follow_redirects: bool
    @param cookies: Whether cookies should be used.
    @type cookies: bool
    @param replace_headers: If True, the headers passed in the "headers" parameter completely
        replace the default headers. If False, they only update the default headers.
    @type replace_headers: bool
    @param add_host: Whether to add the Host header first, like a regular browser would.
        Disabled by default; only use it with problematic sites (it causes trouble with proxies).
    @type add_host: bool
    @param add_referer: Whether to add the "Referer" header using the url's domain as value.
    @type add_referer: bool
    @param referer: If set, adds the "Referer" header using the given value.
    @type referer: str
    @param only_headers: If True, only the headers are downloaded, omitting the url content.
    @type only_headers: bool
    @param random_headers: If True, a random set of headers is selected.
    @type random_headers: bool
    @param ignore_response_code: If True, skips raising WebErrorException for errors such as
        a 404 that still carries usable data (e.g. veseriesonline).
    @type ignore_response_code: bool
    @param hide_infobox: If True, the info box is not logged on a successful request.
    @type hide_infobox: bool
    @param soup: If True, a BeautifulSoup element is set on the soup attribute of HTTPResponse.
    @type soup: bool
    @return: Result of the request.
    @rtype: HTTPResponse

    Parameter             | Type     | Description
    ----------------------|----------|-------------------------------------------------------------------------------
    HTTPResponse.sucess:  | bool     | True: request performed correctly | False: error performing the request
    HTTPResponse.code:    | int      | Server response code, or error code if an error occurred
    HTTPResponse.error:   | str      | Description of the error, if an error occurred
    HTTPResponse.headers: | dict     | Dictionary with the server response headers
    HTTPResponse.data:    | str      | Response obtained from the server
    HTTPResponse.json:    | dict     | Response obtained from the server in json format
    HTTPResponse.soup:    | bs4/None | BeautifulSoup object if requested, None otherwise
    HTTPResponse.time:    | float    | Time taken to perform the request
    """
    global CF_LIST
    if not opt.get('alfa_s', False):
        logger.info()
    from . import scrapertools
    load_cookies(opt.get('alfa_s', False))
    cf_ua = config.get_setting('cf_assistant_ua', None)
    url = url.strip()

    # Default headers, unless something else is specified
    req_headers = OrderedDict()
    if opt.get('add_host', False):
        req_headers['Host'] = urlparse.urlparse(url).netloc
    req_headers.update(default_headers.copy())
    if opt.get('add_referer', False):
        req_headers['Referer'] = "/".join(url.split("/")[:3])
    if isinstance(opt.get('referer'), str) and '://' in opt.get('referer'):
        req_headers['Referer'] = opt.get('referer')

    # Headers passed as parameters
    if opt.get('headers', None) is not None:
        if not opt.get('replace_headers', False):
            req_headers.update(dict(opt['headers']))
        else:
            # FIX: was dict(opt('headers')) — calling the opt dict raised
            # TypeError whenever replace_headers=True was passed.
            req_headers = dict(opt['headers'])

    if opt.get('random_headers', False) or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        req_headers['User-Agent'] = random_useragent()

    if not PY3:
        url = urllib.quote(url.encode('utf-8'), safe="%/:=&?~#+!$,;'@()*[]")
    else:
        url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    opt['proxy_retries_counter'] = 0
    opt['url_save'] = url
    opt['post_save'] = opt.get('post', None)

    if opt.get('forced_proxy_opt', None) and channel_proxy_list(url):
        if opt['forced_proxy_opt'] in ['ProxyCF', 'ProxyDirect']:
            if 'cliver' not in url:
                opt['forced_proxy_opt'] = 'ProxyJSON'
            else:
                opt['forced_proxy'] = opt['forced_proxy_opt']
        else:
            opt['forced_proxy'] = opt['forced_proxy_opt']

    while opt['proxy_retries_counter'] <= opt.get('proxy_retries', 1):
        response = {}
        info_dict = []
        payload = dict()
        files = {}
        file_name = ''
        opt['proxy_retries_counter'] += 1

        domain = urlparse.urlparse(url)[1]
        global CS_stat
        if (domain in CF_LIST or opt.get('CF', False)) and opt.get('CF_test', True):
            # Domain is on the CF list or flagged by the caller: it needs CloudScraper
            from lib import cloudscraper
            session = cloudscraper.create_scraper()
            session.verify = True
            CS_stat = True
            if cf_ua and cf_ua != 'Default' and get_cookie(url, 'cf_clearance'):
                req_headers['User-Agent'] = cf_ua
        else:
            session = requests.session()
            session.verify = False
            CS_stat = False

        if opt.get('cookies', True):
            session.cookies = cj
        if not opt.get('keep_alive', True):
            #session.keep_alive = opt['keep_alive']
            req_headers['Connection'] = "close"

        # Prepare the url in case a proxy is needed, or "proxy_addr_forced" is sent by the channel
        url, proxy_data, opt = check_proxy(url, **opt)
        if opt.get('proxy_addr_forced', {}):
            session.proxies = opt['proxy_addr_forced']
        elif proxy_data.get('dict', {}):
            session.proxies = proxy_data['dict']

        if opt.get('headers_proxy', {}):
            req_headers.update(dict(opt['headers_proxy']))

        session.headers = req_headers.copy()

        inicio = time.time()

        if opt.get('timeout', None) is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
            opt['timeout'] = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
        if opt['timeout'] == 0:
            opt['timeout'] = None

        if len(url) > 0:
            try:
                if opt.get('post', None) is not None or opt.get('file', None) is not None or opt.get('files', {}):
                    if opt.get('post', None) is not None:
                        ### Convert string post in dict
                        try:
                            json.loads(opt['post'])
                            payload = opt['post']
                        except Exception:
                            if not isinstance(opt['post'], dict):
                                post = urlparse.parse_qs(opt['post'], keep_blank_values=1)
                                payload = dict()
                                for key, value in list(post.items()):
                                    try:
                                        payload[key] = value[0]
                                    except Exception:
                                        payload[key] = ''
                            else:
                                payload = opt['post']

                    ### Verifies 'file' and 'file_name' options to upload a buffer or a file
                    if opt.get('files', {}):
                        files = opt['files']
                        file_name = opt.get('file_name', 'File Object')
                    elif opt.get('file', None) is not None:
                        if len(opt['file']) < 256 and os.path.isfile(opt['file']):
                            if opt.get('file_name', None) is None:
                                path_file, opt['file_name'] = os.path.split(opt['file'])
                            files = {'file': (opt['file_name'], open(opt['file'], 'rb'))}
                            file_name = opt['file']
                        else:
                            files = {'file': (opt.get('file_name', 'Default'), opt['file'])}
                            file_name = opt.get('file_name', 'Default') + ', Buffer de memoria'

                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    if opt.get('only_headers', False):
                        ### Makes the request with HEAD method
                        req = session.head(url,
                                           allow_redirects=opt.get('follow_redirects', True),
                                           timeout=opt.get('timeout', None),
                                           params=opt.get('params', {}))
                    else:
                        ### Makes the request with POST method
                        req = session.post(url,
                                           data=payload,
                                           allow_redirects=opt.get('follow_redirects', True),
                                           files=files,
                                           timeout=opt.get('timeout', None),
                                           params=opt.get('params', {}))
                elif opt.get('only_headers', False):
                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    ### Makes the request with HEAD method
                    req = session.head(url,
                                       allow_redirects=opt.get('follow_redirects', True),
                                       timeout=opt.get('timeout', None),
                                       params=opt.get('params', {}))
                else:
                    info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                    ### Makes the request with GET method
                    req = session.get(url,
                                      allow_redirects=opt.get('follow_redirects', True),
                                      timeout=opt.get('timeout', None),
                                      params=opt.get('params', {}))
            except Exception as e:
                if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
                    req = requests.Response()
                    response['data'] = ''
                    response['sucess'] = False
                    info_dict.append(('Success', 'False'))
                    response['code'] = str(e)
                    info_dict.append(('Response code', str(e)))
                    info_dict.append(('Finalizado en', time.time() - inicio))
                    if not opt.get('alfa_s', False):
                        show_infobox(info_dict)
                    import traceback
                    logger.error(traceback.format_exc(1))
                    return type('HTTPResponse', (), response)
                else:
                    # Best-effort mode: fake a Response carrying the error as status
                    req = requests.Response()
                    req.status_code = str(e)
        else:
            # Empty url: return an empty, unsuccessful response
            response['data'] = ''
            response['sucess'] = False
            response['code'] = ''
            response['soup'] = None
            return type('HTTPResponse', (), response)

        response_code = req.status_code

        # Cloudflare challenge detected on a domain not yet on the CF list:
        # persist the domain and retry once through CloudScraper
        if req.headers.get('Server', '').startswith('cloudflare') and response_code in [429, 503, 403] \
                and not opt.get('CF', False) and opt.get('CF_test', True):
            domain = urlparse.urlparse(url)[1]
            if domain not in CF_LIST:
                CF_LIST += [domain]
                opt["CF"] = True
                with open(CF_LIST_PATH, "a") as CF_File:
                    CF_File.write("%s\n" % domain)
                logger.debug("CF retry... for domain: %s" % domain)
                return downloadpage(url, **opt)

        # CF Assistant (server identifies itself as 'Alfa'): retry once with cf_v2
        if req.headers.get('Server', '') == 'Alfa' and response_code in [429, 503, 403] \
                and not opt.get('cf_v2', False) and opt.get('CF_test', True):
            opt["cf_v2"] = True
            logger.debug("CF Assistant retry... for domain: %s" % urlparse.urlparse(url)[1])
            return downloadpage(url, **opt)

        response['data'] = req.content

        # Decode textual payloads on PY3 using the explicit or detected encoding
        try:
            response['encoding'] = str(req.encoding).lower() if req.encoding and req.encoding is not None else None
            if opt.get('encoding') and opt.get('encoding') is not None:
                encoding = opt["encoding"]
            else:
                encoding = response['encoding']
            if not encoding:
                encoding = 'utf-8'
            if PY3 and isinstance(response['data'], bytes) and encoding is not None \
                    and ('text/' in req.headers.get('Content-Type', '')
                         or 'json' in req.headers.get('Content-Type', '')
                         or 'xml' in req.headers.get('Content-Type', '')):
                response['data'] = response['data'].decode(encoding)
        except Exception:
            import traceback
            logger.error(traceback.format_exc(1))

        # Fallback: map remaining bytes to a str one code point per byte
        try:
            if PY3 and isinstance(response['data'], bytes) \
                    and not ('application' in req.headers.get('Content-Type', '')
                             or 'javascript' in req.headers.get('Content-Type', '')
                             or 'image' in req.headers.get('Content-Type', '')):
                response['data'] = "".join(chr(x) for x in bytes(response['data']))
        except Exception:
            import traceback
            logger.error(traceback.format_exc(1))

        # Repair common mojibake sequences in textual responses
        try:
            if 'text/' in req.headers.get('Content-Type', '') \
                    or 'json' in req.headers.get('Content-Type', '') \
                    or 'xml' in req.headers.get('Content-Type', ''):
                response['data'] = response['data'].replace('Á', 'Á').replace('É', 'É')\
                    .replace('Í', 'Í').replace('Ó', 'Ó').replace('Ú', 'Ú')\
                    .replace('Ü', 'Ü').replace('¡', '¡').replace('¿', '¿')\
                    .replace('Ñ', 'Ñ').replace('ñ', 'n').replace('ü', 'ü')\
                    .replace('á', 'á').replace('é', 'é').replace('í', 'í')\
                    .replace('ó', 'ó').replace('ú', 'ú').replace('ª', 'ª')\
                    .replace('º', 'º')
        except Exception:
            import traceback
            logger.error(traceback.format_exc(1))

        response['url'] = req.url
        if not response['data']:
            response['data'] = ''

        response['soup'] = None
        if opt.get("soup", False):
            try:
                from bs4 import BeautifulSoup
                response["soup"] = BeautifulSoup(req.content, "html5lib",
                                                 from_encoding=opt.get('encoding', response['encoding']))
            except Exception:
                import traceback
                logger.error("Error creando sopa")
                logger.error(traceback.format_exc())

        # Parse json unless the payload is clearly binary or json_to_utf8 is disabled
        try:
            if 'bittorrent' not in req.headers.get('Content-Type', '') \
                    and 'octet-stream' not in req.headers.get('Content-Type', '') \
                    and 'zip' not in req.headers.get('Content-Type', '') \
                    and opt.get('json_to_utf8', True):
                response['json'] = to_utf8(req.json())
            else:
                response['json'] = dict()
        except Exception:
            response['json'] = dict()

        response['code'] = response_code
        response['headers'] = req.headers
        response['cookies'] = req.cookies

        if response['code'] == 200:
            response['sucess'] = True
        else:
            response['sucess'] = False

        if opt.get('cookies', True):
            save_cookies(alfa_s=opt.get('alfa_s', False))

        # Raise WebErrorException only when called from a channel module
        is_channel = inspect.getmodule(inspect.currentframe().f_back)
        is_channel = scrapertools.find_single_match(str(is_channel), "<module '(channels).*?'")
        if is_channel and isinstance(response_code, int):
            if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
                if response_code > 399:
                    info_dict, response = fill_fields_post(info_dict, req, response, req_headers, inicio)
                    show_infobox(info_dict)
                    raise WebErrorException(urlparse.urlparse(url)[1])

        info_dict, response = fill_fields_post(info_dict, req, response, req_headers, inicio)
        if not 'api.themoviedb' in url and not 'api.trakt' in url and not opt.get('alfa_s', False) \
                and not opt.get("hide_infobox"):
            show_infobox(info_dict)

        # On proxy error, refresh the proxy list and retry up to proxy_retries times
        response, url, opt = proxy_post_processing(url, proxy_data, response, opt)

        # If the proxy layer orders to leave the loop, break out
        if opt.get('out_break', False):
            break

    return type('HTTPResponse', (), response)
def downloadpage(url, **opt):
    # logger.info()
    """
    Open a url and return the data obtained

    @param url: url to open.
    @type url: str
    @param post: If it contains any value, it is sent by POST.
    @type post: str
    @param headers: Headers for the request, if it contains nothing the default headers will be used.
    @type headers: dict, list
    @param timeout: Timeout for the request.
    @type timeout: int
    @param follow_redirects: Indicates if redirects are to be followed.
    @type follow_redirects: bool
    @param cookies: Indicates whether cookies are to be used.
    @type cookies: bool
    @param replace_headers: If True, headers passed by the "headers" parameter will completely
        replace the default headers. If False, the headers passed by the "headers" parameter
        will modify the headers by default.
    @type replace_headers: bool
    @param add_referer: Indicates whether to add the "Referer" header using the domain of the
        url as a value.
    @type add_referer: bool
    @param only_headers: If True, only headers will be downloaded, omitting the content of the url.
    @type only_headers: bool
    @param random_headers: If True, use the method of selecting random headers.
    @type random_headers: bool
    @param ignore_response_code: If True, ignore the method for WebErrorException for error
        like 404 error in veseriesonline, but it is a functional data
    @type ignore_response_code: bool
    @param use_requests: Use requests.session()
    @type: bool
    @return: Result of the petition
    @rtype: HTTPResponse

    HTTPResponse.success: bool   True: Request successful | False: Error when making the request
    HTTPResponse.code:    int    Server response code or error code if an error occurs
    HTTPResponse.error:   str    Description of the error in case of an error
    HTTPResponse.headers: dict   Dictionary with server response headers
    HTTPResponse.data:    str    Response obtained from server
    HTTPResponse.json:    dict   Response obtained from the server in json format
    HTTPResponse.time:    float  Time taken to make the request
    """
    url = scrapertools.unescape(url)
    parse = urlparse.urlparse(url)
    domain = parse.netloc

    # Choose the HTTP client: cloudscraper (CF-aware) on request, plain requests otherwise
    if opt.get('cloudscraper'):
        from lib import cloudscraper
        session = cloudscraper.create_scraper()
    else:
        from lib import requests
        session = requests.session()
    # Optional custom DNS resolution / TLS cipher adapter for this domain
    if config.get_setting('resolver_dns') and not opt.get('use_requests', False):
        from core import resolverdns
        session.mount('https://', resolverdns.CipherSuiteAdapter(domain))

    req_headers = default_headers.copy()

    # Headers passed as parameters
    if opt.get('headers', None) is not None:
        if not opt.get('replace_headers', False):
            req_headers.update(dict(opt['headers']))
        else:
            req_headers = dict(opt['headers'])

    # Direct-IP override: keep the Host header but swap the netloc for a fixed IP
    if domain in directIP.keys() and not opt.get('disable_directIP', False):
        req_headers['Host'] = domain
        url = urlparse.urlunparse(parse._replace(netloc=directIP.get(domain)))

    if opt.get('random_headers', False) or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        req_headers['User-Agent'] = random_useragent()

    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    opt['url_save'] = url
    opt['post_save'] = opt.get('post', None)

    response = {}
    info_dict = []
    payload = dict()
    files = {}
    file_name = ''

    session.verify = opt.get('verify', True)

    if opt.get('cookies', True):
        session.cookies = cj
    session.headers.update(req_headers)

    # No proxy layer in this variant; kept for fill_fields_pre/post compatibility
    proxy_data = {'dict': {}}

    inicio = time.time()

    if opt.get('timeout', None) is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
        opt['timeout'] = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
    if opt['timeout'] == 0:
        opt['timeout'] = None

    if len(url) > 0:
        try:
            if opt.get('post', None) is not None or opt.get('file', None) is not None:
                if opt.get('post', None) is not None:
                    # Convert string post in dict (a valid json string is passed through as-is)
                    try:
                        json.loads(opt['post'])
                        payload = opt['post']
                    except:
                        if not isinstance(opt['post'], dict):
                            post = urlparse.parse_qs(opt['post'], keep_blank_values=1)
                            payload = dict()
                            for key, value in post.items():
                                try:
                                    payload[key] = value[0]
                                except:
                                    payload[key] = ''
                        else:
                            payload = opt['post']

                # Verify 'file' and 'file_name' options to upload a buffer or file
                if opt.get('file', None) is not None:
                    if os.path.isfile(opt['file']):
                        if opt.get('file_name', None) is None:
                            path_file, opt['file_name'] = os.path.split(opt['file'])
                        files = {'file': (opt['file_name'], open(opt['file'], 'rb'))}
                        file_name = opt['file']
                    else:
                        # Not a path on disk: treat opt['file'] as an in-memory buffer
                        files = {'file': (opt.get('file_name', 'Default'), opt['file'])}
                        file_name = opt.get('file_name', 'Default') + ', Buffer de memoria'

                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                if opt.get('only_headers', False):
                    # Makes the request with HEAD method
                    req = session.head(url,
                                       allow_redirects=opt.get('follow_redirects', True),
                                       timeout=opt['timeout'])
                else:
                    # Makes the request with POST method
                    req = session.post(url,
                                       data=payload,
                                       allow_redirects=opt.get('follow_redirects', True),
                                       files=files,
                                       timeout=opt['timeout'])
            elif opt.get('only_headers', False):
                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                # Makes the request with HEAD method
                req = session.head(url,
                                   allow_redirects=opt.get('follow_redirects', True),
                                   timeout=opt['timeout'])
            else:
                info_dict = fill_fields_pre(url, opt, proxy_data, file_name)
                # Makes the request with GET method
                req = session.get(url,
                                  allow_redirects=opt.get('follow_redirects', True),
                                  timeout=opt['timeout'])
        except Exception as e:
            from lib import requests
            req = requests.Response()
            if not opt.get('ignore_response_code', False) and not proxy_data.get('stat', ''):
                response['data'] = ''
                response['success'] = False
                info_dict.append(('Success', 'False'))
                import traceback
                # NOTE(review): the full traceback (not the status) is stored in 'code' here,
                # unlike the success path where 'code' is the numeric status — confirm callers expect this
                response['code'] = traceback.format_exc()
                info_dict.append(('Response code', str(e)))
                info_dict.append(('Finished in', time.time() - inicio))
                if not opt.get('alfa_s', False):
                    show_infobox(info_dict)
                return type('HTTPResponse', (), response)
            else:
                # Best-effort mode: fake a Response carrying the error text as its status
                req.status_code = str(e)
    else:
        # Empty url: return an empty, unsuccessful response
        response['data'] = ''
        response['success'] = False
        response['code'] = ''
        return type('HTTPResponse', (), response)

    response_code = req.status_code
    response['url'] = req.url
    response['data'] = req.content if req.content else ''

    # Decode bytes payloads; utf-8 first, ISO-8859-1 as fallback
    if type(response['data']) != str:
        try:
            response['data'] = response['data'].decode('utf-8')
        except:
            response['data'] = response['data'].decode('ISO-8859-1')

    # Cloudflare challenge page detected (GET only): retry through the proxytranslate service
    if req.headers.get('Server', '').startswith('cloudflare') and response_code in [429, 503, 403]\
            and not opt.get('CF', False) and 'Ray ID' in response['data'] and not opt.get('post', None):
        logger.debug("CF retry... for domain: %s" % domain)
        from lib import proxytranslate
        gResp = proxytranslate.process_request_proxy(url)
        if gResp:
            req = gResp['result']
            response_code = req.status_code
            response['url'] = gResp['url']
            response['data'] = gResp['data']

    if not response['data']:
        response['data'] = ''

    # NOTE(review): bare except — swallows every json parsing failure silently
    try:
        response['json'] = to_utf8(req.json())
    except:
        response['json'] = dict()

    response['code'] = response_code
    response['headers'] = req.headers
    response['cookies'] = req.cookies

    info_dict, response = fill_fields_post(info_dict, req, response, req_headers, inicio)

    if opt.get('cookies', True):
        save_cookies(alfa_s=opt.get('alfa_s', False))

    if not 'api.themoviedb' in url and not opt.get('alfa_s', False):
        show_infobox(info_dict)

    # NOTE(review): logs the page URL only when the global debug setting is OFF,
    # and passes url as a second positional arg to logger.info — confirm both are intended
    if not config.get_setting("debug"):
        logger.info('Page URL:', url)

    return type('HTTPResponse', (), response)