def initialize(self):
    self.session = Session()
    self.session.headers['User-Agent'] = 'Subliminal/%s' % subliminal.__short_version__

    from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
    logger.debug("Addic7ed: using random user agents")
    self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)]
    self.session.headers['Referer'] = self.server_url

    # login
    if self.username and self.password:
        def check_verification(cache_region):
            # re-use a cached login session if it is still valid
            try:
                rr = self.session.get(self.server_url + 'panel.php', allow_redirects=False, timeout=10,
                                      headers={"Referer": self.server_url})
                if rr.status_code == 302:
                    logger.info('Addic7ed: Login expired')
                    cache_region.delete("addic7ed_data")
                else:
                    logger.info('Addic7ed: Re-using old login')
                    self.logged_in = True
                    return True
            except ConnectionError as e:
                logger.debug("Addic7ed: There was a problem reaching the server: %s." % e)
                raise IPAddressBlocked("Addic7ed: Your IP is temporarily blocked.")

        if load_verification("addic7ed", self.session, callback=check_verification):
            return

        logger.info('Addic7ed: Logging in')
        data = {'username': self.username, 'password': self.password, 'Submit': 'Log in', 'url': '',
                'remember': 'true'}

        tries = 0
        while tries <= 3:
            tries += 1
            r = self.session.get(self.server_url + 'login.php', timeout=10, headers={"Referer": self.server_url})
            if "g-recaptcha" in r.text or "grecaptcha" in r.text:
                logger.info('Addic7ed: Solving captcha. This might take a couple of minutes, but should only '
                            'happen once every so often')

                # try both the v2 widget markup and the invisible
                # grecaptcha.execute() markup for the site key
                site_key = None
                for g, s in (("g-recaptcha-response", r'g-recaptcha.+?data-sitekey=\"(.+?)\"'),
                             ("recaptcha_response", r'grecaptcha.execute\(\'(.+?)\',')):
                    match = re.search(s, r.text)
                    if match:
                        site_key = match.group(1)
                        break

                if not site_key:
                    logger.error("Addic7ed: Captcha site-key not found!")
                    return

                pitcher = pitchers.get_pitcher()("Addic7ed", self.server_url + 'login.php', site_key,
                                                 user_agent=self.session.headers["User-Agent"],
                                                 cookies=self.session.cookies.get_dict(),
                                                 is_invisible=True)

                result = pitcher.throw()
                if not result:
                    if tries >= 3:
                        raise Exception("Addic7ed: Couldn't solve captcha!")
                    logger.info("Addic7ed: Couldn't solve captcha! Retrying")
                    continue

                data[g] = result

            r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10,
                                  headers={"Referer": self.server_url + "login.php"})

            if "relax, slow down" in r.text:
                raise TooManyRequests(self.username)

            if "Wrong password" in r.text or "doesn't exist" in r.text:
                raise AuthenticationError(self.username)

            if r.status_code != 302:
                if tries >= 3:
                    logger.error("Addic7ed: Something went wrong when logging in")
                    raise AuthenticationError(self.username)
                logger.info("Addic7ed: Something went wrong when logging in; retrying")
                continue
            break

        store_verification("addic7ed", self.session)

        logger.debug('Addic7ed: Logged in')
        self.logged_in = True
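# A minimal sketch of the "pitcher" anti-captcha contract that
# initialize() above relies on: pitchers.get_pitcher() is assumed to
# return a class taking a label, the page URL and the reCAPTCHA site
# key, whose throw() returns the solved token or a falsy value on
# failure. DummyPitcher is a hypothetical stand-in for illustration,
# not the real implementation.
class DummyPitcher(object):
    def __init__(self, source, website_url, website_key,
                 user_agent=None, cookies=None, is_invisible=False):
        self.source = source
        self.website_url = website_url
        self.website_key = website_key

    def throw(self):
        # A real pitcher would hand website_url/website_key to a solving
        # service and poll for a g-recaptcha-response token.
        return None  # placeholder: "could not solve"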
def initialize(self):
    self.session = Session()
    self.session.headers['User-Agent'] = 'Subliminal/%s' % subliminal.__short_version__

    from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
    logger.debug("Addic7ed: using random user agents")
    self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)]
    self.session.headers['Referer'] = self.server_url

    # login
    if self.username and self.password:
        def check_verification(cache_region):
            rr = self.session.get(self.server_url + 'panel.php', allow_redirects=False, timeout=10,
                                  headers={"Referer": self.server_url})
            if rr.status_code == 302:
                logger.info('Addic7ed: Login expired')
                cache_region.delete("addic7ed_data")
            else:
                logger.info('Addic7ed: Re-using old login')
                self.logged_in = True
                return True

        if load_verification("addic7ed", self.session, callback=check_verification):
            return

        logger.info('Addic7ed: Logging in')
        data = {'username': self.username, 'password': self.password, 'Submit': 'Log in', 'url': '',
                'remember': 'true'}

        tries = 0
        while tries < 3:
            r = self.session.get(self.server_url + 'login.php', timeout=10, headers={"Referer": self.server_url})
            if "grecaptcha" in r.text:
                logger.info('Addic7ed: Solving captcha. This might take a couple of minutes, but should only '
                            'happen once every so often')
                match = re.search(r'grecaptcha.execute\(\'(.+?)\',', r.text)
                site_key = match.group(1) if match else None
                if not site_key:
                    logger.error("Addic7ed: Captcha site-key not found!")
                    return

                pitcher = pitchers.get_pitcher()("Addic7ed", self.server_url + 'login.php', site_key,
                                                 user_agent=self.session.headers["User-Agent"],
                                                 cookies=self.session.cookies.get_dict(),
                                                 is_invisible=True)

                result = pitcher.throw()
                if not result:
                    raise Exception("Addic7ed: Couldn't solve captcha!")

                data["recaptcha_response"] = result

            r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10,
                                  headers={"Referer": self.server_url + "login.php"})

            if "relax, slow down" in r.text:
                raise TooManyRequests(self.username)

            if r.status_code != 302:
                if "User <b></b> doesn't exist" in r.text and tries <= 2:
                    logger.info("Addic7ed: Error, trying again. (%s/%s)", tries + 1, 3)
                    tries += 1
                    continue

                raise AuthenticationError(self.username)

            break

        store_verification("addic7ed", self.session)

        logger.debug('Addic7ed: Logged in')
        self.logged_in = True
def _request(self, method, url, *args, **kwargs):
    ourSuper = super(CloudScraper, self)
    resp = ourSuper.request(method, url, *args, **kwargs)

    if resp.headers.get('Content-Encoding') == 'br':
        if self.allow_brotli and resp._content:
            resp._content = brotli.decompress(resp.content)
        else:
            logging.warning('Brotli content detected, but the option is disabled; not continuing.')
            return resp

    # Debug request
    if self.debug:
        self.debugRequest(resp)

    # Check if Cloudflare anti-bot is on
    try:
        if self.isChallengeRequest(resp):
            if resp.request.method != 'GET':
                # Work around if the initial request is not a GET:
                # supersede with a GET, then re-request the original METHOD.
                CloudScraper.request(self, 'GET', resp.url)
                resp = ourSuper.request(method, url, *args, **kwargs)
            else:
                # Solve Challenge
                resp = self.sendChallengeResponse(resp, **kwargs)
    except ValueError as e:
        error = str(e) if PY3 else e.message
        if error == "Captcha":
            parsed_url = urlparse(url)
            domain = parsed_url.netloc

            # solve the captcha; guard each search so a missing token
            # reaches the all() check instead of raising AttributeError
            site_key_m = re.search(r'data-sitekey="(.+?)"', resp.text)
            challenge_s_m = re.search(r'type="hidden" name="s" value="(.+?)"', resp.text)
            challenge_ray_m = re.search(r'data-ray="(.+?)"', resp.text)
            if not all([site_key_m, challenge_s_m, challenge_ray_m]):
                raise Exception("cf: Captcha site-key not found!")
            site_key = site_key_m.group(1)
            challenge_s = challenge_s_m.group(1)
            challenge_ray = challenge_ray_m.group(1)

            pitcher = pitchers.get_pitcher()("cf: %s" % domain, resp.request.url, site_key,
                                             user_agent=self.headers["User-Agent"],
                                             cookies=self.cookies.get_dict(),
                                             is_invisible=True)

            parsed_url = urlparse(resp.url)
            logger.info("cf: %s: Solving captcha", domain)
            result = pitcher.throw()
            if not result:
                raise Exception("cf: Couldn't solve captcha!")

            submit_url = '{}://{}/cdn-cgi/l/chk_captcha'.format(parsed_url.scheme, domain)
            method = resp.request.method

            cloudflare_kwargs = {
                'allow_redirects': False,
                'headers': {'Referer': resp.url},
                'params': OrderedDict([
                    ('s', challenge_s),
                    ('g-recaptcha-response', result)
                ])
            }

            return CloudScraper.request(self, method, submit_url, **cloudflare_kwargs)

    return resp
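# Hedged usage sketch: assuming this _request is installed as the
# session's request method by a create_scraper() factory, as in the
# upstream cloudscraper project (the factory name is an assumption for
# this fork), callers never deal with challenges directly:
#
#     scraper = CloudScraper.create_scraper()
#     resp = scraper.get('https://example.com/')
#
# Challenge pages and captchas are resolved inside _request before the
# final response is returned.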
def query(self, languages, title, season=None, episode=None, year=None, video=None):
    items_per_page = 10
    current_page = 1

    used_languages = languages
    lang_strings = [str(lang) for lang in used_languages]

    # handle possible duplicate use of Serbian Latin
    if "sr" in lang_strings and "sr-Latn" in lang_strings:
        logger.info('Duplicate entries <Language [sr]> and <Language [sr-Latn]> found, filtering languages')
        # use a list, not a filter() iterator, so the log call below
        # doesn't exhaust it before the join
        used_languages = [l for l in used_languages if l != Language.fromietf('sr-Latn')]
        logger.info('Filtered language list %r', used_languages)

    # convert list of languages into search string
    langs = '|'.join(map(str, [l.titlovi for l in used_languages]))

    # set query params
    params = {'prijevod': title, 'jezik': langs}
    is_episode = False
    if season and episode:
        is_episode = True
        params['s'] = season
        params['e'] = episode
    if year:
        params['g'] = year

    # loop through paginated results
    logger.info('Searching subtitles %r', params)
    subtitles = []

    while True:
        # query the server
        try:
            r = self.session.get(self.search_url, params=params, timeout=10)
            r.raise_for_status()
        except RequestException as e:
            captcha_passed = False
            if e.response is not None and e.response.status_code == 403 and "data-sitekey" in e.response.text:
                logger.info('titlovi: Solving captcha. This might take a couple of minutes, but should only '
                            'happen once every so often')
                # guard each search so a missing token reaches the all()
                # check instead of raising AttributeError
                site_key_m = re.search(r'data-sitekey="(.+?)"', e.response.text)
                challenge_s_m = re.search(r'type="hidden" name="s" value="(.+?)"', e.response.text)
                challenge_ray_m = re.search(r'data-ray="(.+?)"', e.response.text)
                if not all([site_key_m, challenge_s_m, challenge_ray_m]):
                    raise Exception("titlovi: Captcha site-key not found!")
                site_key = site_key_m.group(1)
                challenge_s = challenge_s_m.group(1)
                challenge_ray = challenge_ray_m.group(1)

                pitcher = pitchers.get_pitcher()("titlovi", e.request.url, site_key,
                                                 user_agent=self.session.headers["User-Agent"],
                                                 cookies=self.session.cookies.get_dict(),
                                                 is_invisible=True)

                result = pitcher.throw()
                if not result:
                    raise Exception("titlovi: Couldn't solve captcha!")

                s_params = {
                    "s": challenge_s,
                    "id": challenge_ray,
                    "g-recaptcha-response": result,
                }
                r = self.session.get(self.server_url + "/cdn-cgi/l/chk_captcha", params=s_params, timeout=10,
                                     allow_redirects=False)
                r.raise_for_status()
                r = self.session.get(self.search_url, params=params, timeout=10)
                r.raise_for_status()
                store_verification("titlovi", self.session)
                captcha_passed = True

            if not captcha_passed:
                logger.exception('RequestException %s', e)
                break
        else:
            try:
                soup = BeautifulSoup(r.content, 'lxml')

                # number of results
                result_count = int(soup.select_one('.results_count b').string)
            except Exception:
                result_count = None

            # exit if no results
            if not result_count:
                if not subtitles:
                    logger.debug('No subtitles found')
                else:
                    logger.debug("No more subtitles found")
                break

            # number of pages with results
            pages = int(math.ceil(result_count / float(items_per_page)))

            # get current page
            if 'pg' in params:
                current_page = int(params['pg'])

            try:
                sublist = soup.select('section.titlovi > ul.titlovi > li.subtitleContainer.canEdit')
                for sub in sublist:
                    # subtitle id
                    sid = sub.find(attrs={'data-id': True}).attrs['data-id']
                    # get download link
                    download_link = self.download_url + sid
                    # title and alternate title
                    match = title_re.search(sub.a.string)
                    if match:
                        _title = match.group('title')
                        alt_title = match.group('altitle')
                    else:
                        continue

                    # page link
                    page_link = self.server_url + sub.a.attrs['href']

                    # subtitle language
                    match = lang_re.search(sub.select_one('.lang').attrs['src'])
                    if match:
                        try:
                            # decode language
                            lang = Language.fromtitlovi(match.group('lang') + match.group('script'))
                        except ValueError:
                            continue
                    else:
                        # skip entries whose language icon doesn't parse
                        continue

                    # release year or series start year
                    r_year = None
                    match = year_re.search(sub.find(attrs={'data-id': True}).parent.i.string)
                    if match:
                        r_year = int(match.group('year'))

                    # fps
                    fps = None
                    match = fps_re.search(sub.select_one('.fps').string)
                    if match:
                        fps = match.group('fps')

                    # releases
                    releases = str(sub.select_one('.fps').parent.contents[0].string)

                    # handle movies and series separately
                    if is_episode:
                        # season and episode info
                        sxe = sub.select_one('.s0xe0y').string
                        r_season = None
                        r_episode = None
                        if sxe:
                            match = season_re.search(sxe)
                            if match:
                                r_season = int(match.group('season'))

                            match = episode_re.search(sxe)
                            if match:
                                r_episode = int(match.group('episode'))

                        subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title,
                                                       alt_title=alt_title, season=r_season, episode=r_episode,
                                                       year=r_year, fps=fps,
                                                       asked_for_release_group=video.release_group,
                                                       asked_for_episode=episode)
                    else:
                        subtitle = self.subtitle_class(lang, page_link, download_link, sid, releases, _title,
                                                       alt_title=alt_title, year=r_year, fps=fps,
                                                       asked_for_release_group=video.release_group)
                    logger.debug('Found subtitle %r', subtitle)

                    # prime our matches so we can use the values later
                    subtitle.get_matches(video)

                    # add found subtitles
                    subtitles.append(subtitle)
            finally:
                soup.decompose()

            # stop on last page
            if current_page >= pages:
                break

            # increment current page
            params['pg'] = current_page + 1
            logger.debug('Getting page %d', params['pg'])

    return subtitles
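# Hedged usage sketch for query(); `provider` and `video` are assumed
# subliminal-style objects, not names defined in this file:
#
#     from babelfish import Language
#     languages = {Language.fromietf('sr-Latn'), Language('eng')}
#     subtitles = provider.query(languages, 'Some Show', season=1,
#                                episode=2, year=2018, video=video)
#
# query() drops a duplicate Serbian Latin entry, pages through results
# ten at a time, and returns the accumulated subtitle list.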
def request(self, method, url, *args, **kwargs):
    # reset headers to the session's stored defaults for every request
    self.headers = self._hdrs.copy()

    resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)

    if resp.headers.get('content-encoding') == 'br' and brotli_available:
        resp._content = brdec(resp._content)

    # Debug request
    if self.debug:
        self.debugRequest(resp)

    # Check if Cloudflare anti-bot is on
    try:
        if self.is_cloudflare_challenge(resp):
            self._was_cf = True
            # Work around if the initial request is not a GET:
            # supersede with a GET, then re-request the original METHOD.
            if resp.request.method != 'GET':
                self.request('GET', resp.url)
                resp = self.request(method, url, *args, **kwargs)
            else:
                resp = self.solve_cf_challenge(resp, **kwargs)
    except NeedsCaptchaException:
        # solve the captcha; guard each search so a missing token reaches
        # the all() check instead of raising AttributeError
        self._was_cf = True
        site_key_m = re.search(r'data-sitekey="(.+?)"', resp.text)
        challenge_s_m = re.search(r'type="hidden" name="s" value="(.+?)"', resp.text)
        challenge_ray_m = re.search(r'data-ray="(.+?)"', resp.text)
        if not all([site_key_m, challenge_s_m, challenge_ray_m]):
            raise Exception("cf: Captcha site-key not found!")
        site_key = site_key_m.group(1)
        challenge_s = challenge_s_m.group(1)
        challenge_ray = challenge_ray_m.group(1)

        pitcher = pitchers.get_pitcher()("cf", resp.request.url, site_key,
                                         user_agent=self.headers["User-Agent"],
                                         cookies=self.cookies.get_dict(),
                                         is_invisible=True)

        parsed_url = urlparse(resp.url)
        domain = parsed_url.netloc
        logger.info("cf: %s: Solving captcha", domain)
        result = pitcher.throw()
        if not result:
            raise Exception("cf: Couldn't solve captcha!")

        submit_url = '{}://{}/cdn-cgi/l/chk_captcha'.format(parsed_url.scheme, domain)
        method = resp.request.method

        cloudflare_kwargs = {
            'allow_redirects': False,
            'headers': {'Referer': resp.url},
            'params': OrderedDict([
                ('s', challenge_s),
                ('g-recaptcha-response', result)
            ])
        }

        return self.request(method, submit_url, **cloudflare_kwargs)

    return resp
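# Note on the 'br' branch above: older requests/urllib3 stacks do not
# transparently decode Brotli, so the compressed bytes sit in
# resp._content and are decompressed by hand. A minimal standalone
# equivalent, assuming the `brotli` package is installed:
#
#     import brotli
#     raw = brotli.compress(b'payload')  # stand-in for a br-encoded body
#     assert brotli.decompress(raw) == b'payload'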
def _request(self, method, url, *args, **kwargs):
    ourSuper = super(CloudScraper, self)
    resp = ourSuper.request(method, url, *args, **kwargs)

    if resp.headers.get('Content-Encoding') == 'br':
        if self.allow_brotli and resp._content:
            resp._content = brotli.decompress(resp.content)
        else:
            logging.warning('Brotli content detected, but the option is disabled; not continuing.')
            return resp

    # Debug request
    if self.debug:
        self.debugRequest(resp)

    # Check if Cloudflare anti-bot is on
    try:
        if self.isChallengeRequest(resp):
            if resp.request.method != 'GET':
                # Work around if the initial request is not a GET:
                # supersede with a GET, then re-request the original METHOD.
                CloudScraper.request(self, 'GET', resp.url)
                resp = ourSuper.request(method, url, *args, **kwargs)
            else:
                # Solve Challenge
                resp = self.sendChallengeResponse(resp, **kwargs)
    except ValueError as e:
        # str(e) works on both Python 2 and 3 (e.message is Python 2 only)
        if str(e) == "Captcha":
            parsed_url = urlparse(url)
            domain = parsed_url.netloc

            # solve the captcha; guard each search so a missing token
            # reaches the all() check instead of raising AttributeError
            site_key_m = re.search(r'data-sitekey="(.+?)"', resp.text)
            challenge_s_m = re.search(r'type="hidden" name="s" value="(.+?)"', resp.text)
            challenge_ray_m = re.search(r'data-ray="(.+?)"', resp.text)
            if not all([site_key_m, challenge_s_m, challenge_ray_m]):
                raise Exception("cf: Captcha site-key not found!")
            site_key = site_key_m.group(1)
            challenge_s = challenge_s_m.group(1)
            challenge_ray = challenge_ray_m.group(1)

            pitcher = pitchers.get_pitcher()("cf: %s" % domain, resp.request.url, site_key,
                                             user_agent=self.headers["User-Agent"],
                                             cookies=self.cookies.get_dict(),
                                             is_invisible=True)

            parsed_url = urlparse(resp.url)
            logger.info("cf: %s: Solving captcha", domain)
            result = pitcher.throw()
            if not result:
                raise Exception("cf: Couldn't solve captcha!")

            submit_url = '{}://{}/cdn-cgi/l/chk_captcha'.format(parsed_url.scheme, domain)
            method = resp.request.method

            cloudflare_kwargs = {
                'allow_redirects': False,
                'headers': {'Referer': resp.url},
                'params': OrderedDict([
                    ('s', challenge_s),
                    ('g-recaptcha-response', result)
                ])
            }

            return CloudScraper.request(self, method, submit_url, **cloudflare_kwargs)

    # fall through to returning the original response on the normal path
    return resp