def _get_data(self):
    # Extract the important bits from the embed page, with thanks to the
    # code I saw from github user py7hon in his/her mp4upload-direct
    # program as inspiration for this. Only with regex.
    source_parts_re = re.compile(
        r'.*?(www\d+).*?\|video\|(.*?)\|(\d+)\|.*?', re.DOTALL)

    mp4u_embed = session.get(self.url).text
    domain, video_id, protocol = source_parts_re.match(mp4u_embed).groups()
    logging.debug('Domain: %s, Video ID: %s, Protocol: %s' %
                  (domain, video_id, protocol))

    url = self.url.replace('embed-', '')

    # Return to non-embed page to collect title
    mp4u_page = BeautifulSoup(session.get(url).text, 'html.parser')
    title = mp4u_page.find('span', {'class': 'dfilename'}).text
    title = title[:title.rfind('_')][:title.rfind('.')].replace(' ', '_')
    logging.debug('Title is %s' % title)

    # Create the stream url
    stream_url = 'https://{}.mp4upload.com:{}/d/{}/{}.mp4'
    stream_url = stream_url.format(domain, protocol, video_id, title)
    logging.debug('Stream URL: %s' % stream_url)

    return {
        'stream_url': stream_url,
        'meta': {
            'title': title,
            'thumbnail': ''
        }
    }
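# A hypothetical illustration of what the regex above is matching on: the
# mp4upload embed page ships a packed-JS, pipe-delimited keyword table. The
# sample string and values below are made up, not real site output.
def _demo_mp4upload_regex():
    sample = '...|www05|...|video|abc123xyz|282|...'
    source_parts_re = re.compile(
        r'.*?(www\d+).*?\|video\|(.*?)\|(\d+)\|.*?', re.DOTALL)
    return source_parts_re.match(sample).groups()  # ('www05', 'abc123xyz', '282')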
def _ranged_download(self):
    http_chunksize = self.options['range_size']

    range_start = 0
    range_end = http_chunksize

    # Make a new file, maybe not the best way
    with open(self.path, 'w'):
        pass

    r = session.get(self.url, headers={'referer': self.referer}, stream=True)
    while self.downloaded < self.total_size:
        r = session.get(self.url,
                        headers=set_range(range_start, range_end, self.referer),
                        stream=True)
        if r.status_code == 206:
            with open(self.path, 'ab') as f:
                for chunk in r.iter_content(chunk_size=self.chunksize):
                    if chunk:
                        f.write(chunk)
                        self.report_chunk_downloaded()

        if range_end == '':
            break
        range_start = os.stat(self.path).st_size
        range_end += http_chunksize
        if range_end > self.total_size:
            range_end = ''
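# The set_range() helper used above is defined elsewhere in the codebase. A
# minimal sketch of what it presumably does, assuming the third argument is a
# referer string as in the call above (the other _ranged_download variant
# further below passes an existing headers dict instead, so the real helper is
# likely more flexible than this):
def set_range(start=0, end='', referer=None):
    # An empty `end` yields "bytes=N-", i.e. everything from byte N onwards.
    headers = {'Range': 'bytes={}-{}'.format(start, end)}
    if referer:
        headers['referer'] = referer
    return headers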
def search(cls, query):
    r = session.get('https://www4.9anime.is/search?',
                    params={'keyword': query}, headers=desktop_headers)
    logging.debug(r.url)

    soup = BeautifulSoup(r.text, 'html.parser')
    search_results = soup.find('div', {
        'class': 'film-list'
    }).find_all('div', {'class': 'item'})

    ret = []
    logging.debug('Search results')
    for item in search_results:
        s = SearchResult(
            title=item.find('a', {'class': 'name'}).contents[0],
            url=item.find('a')['href'],
            poster=item.find('img')['src'])

        # Collect the status block's children as metadata, keyed by their CSS
        # class. Use a separate loop variable so the outer `item` is not shadowed.
        meta = dict()
        status_div = item.find('div', {'class': 'status'})
        for div in status_div.find_all('div'):
            meta[div.attrs['class'][0]] = div.text.strip()
        s.meta = meta

        logging.debug(s)
        ret.append(s)
    return ret
def _scarpe_episodes(self, soup):
    ts = soup.find('html')['data-ts']
    self._episodeClass.ts = ts
    logging.debug('data-ts: {}'.format(ts))

    # TODO: !HACK!
    # The below code should be refactored whenever I'm not lazy.
    # This was done as a fix to 9anime's switch to lazy loading of
    # episodes. I'm busy and lazy now, so I'm writing bad code.
    # Gomen'nasai (sorry).
    api_url = "https://www8.9anime.is/ajax/film/servers/{}"
    api_url = api_url.format(
        self.url.rsplit('watch/', 1)[1].rsplit('.', 1)[1].split('/')[0])

    params = {}
    params['_'] = int(generate_(params))
    # NOTE: the hard-coded value below overrides the generated token above.
    params['_'] = 648

    soup = BeautifulSoup(
        session.get(api_url, params=params).json()['html'], 'html.parser')

    episodes = soup.find('div', {'class': 'server', 'data-name': 33})
    episodes = episodes.find_all('li')

    if episodes == []:
        err = 'No episodes found in url "{}"'.format(self.url)
        args = [self.url]
        raise NotFoundError(err, *args)

    episode_ids = []
    for x in episodes:
        for a in x.find_all('a'):
            ep_id = a.get('data-id')
            episode_ids.append(ep_id)

    return episode_ids
def _get_sources(self):
    params = {
        'v': '1.1',
        'episode_id': self.url.split('id=')[-1],
    }
    # Copy so the shared desktop_headers dict is not mutated.
    headers = desktop_headers.copy()
    headers['referer'] = self.url

    res = session.get(self._episode_list_url, params=params, headers=headers)
    url = res.json()['value']

    headers = desktop_headers.copy()
    headers['referer'] = self.url

    res = session.get('https:' + url, headers=headers)
    return [('no_extractor', res.json()['playlist'][0]['file'])]
def _get_sources(self):
    soup = BeautifulSoup(session.get(self.url).text, 'html.parser')
    url = 'https:' + soup.select_one('li.anime a').get('data-video')
    res = requests.get(url)

    ep_re = re.compile(r"file:.*?'(.*?)'")
    stream_urls = ep_re.findall(res.text)

    return [('no_extractor', stream_url) for stream_url in stream_urls]
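# A tiny, self-contained illustration of the ep_re pattern above (the sample
# markup is hypothetical, not taken from the real site): it pulls the quoted
# path out of a JWPlayer-style `file: '...'` entry.
def _demo_ep_re():
    sample = "playerInstance.setup({ file: 'https://example.com/ep-01.mp4' });"
    return re.compile(r"file:.*?'(.*?)'").findall(sample)
    # -> ['https://example.com/ep-01.mp4']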
def _non_range_download(self):
    r = session.get(self.url, stream=True)

    if r.status_code == 200:
        with open(self.path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=self.chunksize):
                if chunk:
                    f.write(chunk)
                    self.report_chunk_downloaded()
def search(self, query):
    r = session.get('https://twist.moe')
    soup = BeautifulSoup(r.text, 'html.parser')
    all_anime = soup.select_one('nav.series').select('li')

    animes = []
    for anime in all_anime:
        animes.append(SearchResult(
            title=anime.find('span').contents[0].strip(),
            url='https://twist.moe' + anime.find('a')['href'],
            poster='',
        ))

    animes = [ani[0] for ani in process.extract(query, animes)]
    return animes
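# For context: process.extract above is fuzzywuzzy's fuzzy matcher; it returns
# (match, score) pairs ranked by similarity to the query, which is why only
# ani[0] is kept. A hypothetical standalone example (scores are illustrative):
def _demo_fuzzy_ranking():
    from fuzzywuzzy import process
    return process.extract('naruto', ['Naruto', 'Boruto', 'Bleach'])
    # e.g. -> [('Naruto', 100), ('Boruto', 67), ('Bleach', 22)]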
def _ranged_download(self):
    http_chunksize = self.range_size

    range_start = 0
    range_end = http_chunksize

    url = self.source.stream_url
    headers = self.source.headers
    if 'user-agent' not in headers:
        headers['user-agent'] = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                 "Gecko/20100101 Firefox/56.0")
    if self.source.referer:
        headers['Referer'] = self.source.referer

    # Make a new file, maybe not the best way
    with open(self.path, 'w'):
        pass

    r = session.get(url, headers=headers, stream=True)
    while self.downloaded < self._total_size:
        r = session.get(url,
                        headers=set_range(range_start, range_end, headers),
                        stream=True)
        if r.status_code == 206:
            with open(self.path, 'ab') as f:
                for chunk in r.iter_content(chunk_size=self.chunksize):
                    if chunk:
                        f.write(chunk)
                        self.report_chunk_downloaded()

        if range_end == '':
            break
        range_start = os.stat(self.path).st_size
        range_end += http_chunksize
        if range_end > self._total_size:
            range_end = ''
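# self._total_size is set elsewhere; as a rough, hypothetical sketch (not the
# project's actual code), a downloader would typically derive it from the
# Content-Length header of an initial streaming request:
def _get_total_size(url, headers=None):
    r = session.get(url, headers=headers, stream=True)
    return int(r.headers.get('Content-Length', 0))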
def _get_data(self):
    url = self.url + '&q=' + self.quality
    logging.debug('Calling Rapid url: {}'.format(url))
    headers = self.headers
    headers['referer'] = url

    try:
        r = session.get(url, headers=headers)
        # Fix for the newer rapidvideo logic: a GET request returns 200 OK
        # even when the page only shows a "click here" button, so verify
        # that a source link is actually present before accepting it.
        soup = BeautifulSoup(r.text, 'html.parser')
        get_source(soup, self.quality)
    except Exception:
        r = session.post(url, {
            'confirm.x': 12,
            'confirm.y': 12,
            'block': 1,
        }, headers=headers)
        soup = BeautifulSoup(r.text, 'html.parser')

    # TODO: Move these into a separate function; they can be reused in
    # other classes too.
    title_re = re.compile(r'"og:title" content="(.*)"')
    image_re = re.compile(r'"og:image" content="(.*)"')

    try:
        stream_url = get_source(soup, self.quality)
    except IndexError:
        stream_url = None

    try:
        title = str(title_re.findall(r.text)[0])
        thumbnail = str(image_re.findall(r.text)[0])
    except Exception as e:
        title = ''
        thumbnail = ''
        logging.debug(e)

    return {
        'stream_url': stream_url,
        'meta': {
            'title': title,
            'thumbnail': thumbnail,
        },
    }
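# get_source() is defined elsewhere in the module. A hedged sketch of what it
# presumably does, inferred only from its call sites (not the real helper):
# pick the <source> tag matching the requested quality and fall back to the
# first one, raising IndexError when the page has no sources at all, which
# the caller above catches.
def get_source(soup, quality):
    sources = soup.find_all('source')
    for source in sources:
        if quality in (source.get('data-res') or ''):
            return source.get('src')
    return sources[0].get('src')  # IndexError if the page has no <source> tags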
def _non_range_download(self):
    url = self.source.stream_url
    headers = {
        'user-agent': ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "Gecko/20100101 Firefox/56.0")
    }
    if self.source.referer:
        headers['Referer'] = self.source.referer

    r = session.get(url, headers=headers, stream=True)

    if r.status_code == 200:
        with open(self.path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=self.chunksize):
                if chunk:
                    f.write(chunk)
                    self.report_chunk_downloaded()
def _get_sources(self):
    soup = BeautifulSoup(session.get(self.url).text, 'html.parser')

    extractors_url = []
    for element in soup.select('.anime_muti_link > ul > li'):
        extractor_class = element.get('class')[0]
        source_url = element.a.get('data-video')

        # only use mp4upload and rapidvideo as sources
        if extractor_class == 'mp4':
            extractor_class = 'mp4upload'
        elif extractor_class != 'rapidvideo':
            continue

        logging.debug('%s: %s' % (extractor_class, source_url))
        extractors_url.append((extractor_class, source_url,))

    return extractors_url
def _get_data(self):
    url = self.url + '&q=' + self.quality
    logging.debug('Calling Rapid url: {}'.format(url))
    headers = self.headers
    headers['referer'] = url

    try:
        r = session.get(url, headers=headers)
    except Exception:
        r = session.post(url, {
            'cursor.x': 12,
            'cursor.y': 12,
            'block': 1,
        }, headers=headers)

    soup = BeautifulSoup(r.text, 'html.parser')

    # TODO: Move these into a separate function; they can be reused in
    # other classes too.
    src_re = re.compile(r'src: "(.*)"')
    title_re = re.compile(r'"og:title" content="(.*)"')
    image_re = re.compile(r'"og:image" content="(.*)"')

    try:
        stream_url = soup.find_all('source')[0].get('src')
    except IndexError:
        try:
            stream_url = str(src_re.findall(r.text)[0])
        except IndexError:
            stream_url = None

    try:
        title = str(title_re.findall(r.text)[0])
        thumbnail = str(image_re.findall(r.text)[0])
    except Exception as e:
        title = ''
        thumbnail = ''
        logging.debug(e)

    return {
        'stream_url': stream_url,
        'meta': {
            'title': title,
            'thumbnail': thumbnail,
        },
    }
def _scarpe_episodes(self, soup):
    anime_id = soup.select_one('input#movie_id').attrs['value']
    params = {
        'default_ep': 0,
        'ep_start': 0,
        'ep_end': 999999,  # Using a very big number works :)
        'id': anime_id,
    }
    res = session.get(self._episode_list_url, params=params)
    soup = BeautifulSoup(res.text, 'html.parser')

    epurls = list(
        reversed(['https://www2.gogoanime.se' + a.get('href').strip()
                  for a in soup.select('li a')])
    )

    return epurls
def get_data(self):
    anime_name = self.url.split('/a/')[-1].split('/')[0]
    url = self._api_url.format(anime_name)
    episodes = session.get(
        url,
        headers={
            'x-access-token': '1rj2vRtegS8Y60B3w3qNZm5T2Q0TN2NR'
        }
    )
    episodes = episodes.json()

    self.title = anime_name

    episode_urls = [
        'https://eu1.twist.moe' +
        decrypt(episode['source'].encode('utf-8'), KEY).decode('utf-8')
        for episode in episodes
    ]
    self._episode_urls = [(i + 1, episode_url)
                          for i, episode_url in enumerate(episode_urls)]
    self._len = len(self._episode_urls)

    return self._episode_urls
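# The decrypt() helper and KEY used above live elsewhere in the codebase. As a
# hedged sketch only (an assumption about how twist.moe's API worked, not the
# project's actual implementation): the 'source' field is a base64 blob in
# OpenSSL's "Salted__" format, decrypted with AES-CBC after an
# EVP_BytesToKey-style MD5 key/IV derivation. Roughly:
import base64
from hashlib import md5
from Crypto.Cipher import AES  # pycryptodome

def decrypt(data, key):
    data = base64.b64decode(data)
    salt = data[8:16]            # bytes 0-7 are the literal b'Salted__'
    derived = b''
    block = b''
    while len(derived) < 48:     # 32-byte key + 16-byte IV
        block = md5(block + key + salt).digest()
        derived += block
    aes = AES.new(derived[:32], AES.MODE_CBC, derived[32:48])
    plaintext = aes.decrypt(data[16:])
    return plaintext[:-plaintext[-1]]   # strip PKCS#7 padding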
def _get_data(self):
    # We would need a JavaScript deobfuscation API/library in Python for the
    # general case; someone smarter than me can work on that. For now I will
    # use the pattern I observed. Alternatively you can pattern match on
    # `src` for the stream_url part.
    source_parts_re = re.compile(
        r'action=\"([^"]+)\".*value=\"([^"]+)\".*Click Here to Download',
        re.DOTALL)

    # Kwik servers don't allow direct link access; you need to be referred
    # from somewhere, so just use the url itself.
    download_url = self.url.replace('kwik.cx/e/', 'kwik.cx/f/')

    kwik_text = session.get(download_url, headers={
        'referer': download_url
    }).text
    post_url, token = source_parts_re.search(kwik_text).group(1, 2)

    stream_url = session.post(post_url, headers={
        'referer': download_url
    }, data={
        '_token': token
    }, allow_redirects=False).headers['Location']

    title = stream_url.rsplit('/', 1)[-1].rsplit('.', 1)[0]

    logging.debug('Stream URL: %s' % stream_url)

    return {
        'stream_url': stream_url,
        'meta': {
            'title': title,
            'thumbnail': ''
        },
        'referer': None
    }
def bypass_hcaptcha(url):
    """
    :param url: url to the page which serves the hCaptcha
    :return: Returns Response object (cookies stored for future use)
    """
    host = urlparse(url).netloc
    bypassed = False
    session = requests.session()
    headers = {
        'User-Agent': choice((
            'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko)',
            'Mozilla/5.0 (iPad; CPU OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13G36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
        ))
    }

    logger.info("Bypassing captcha...")

    # Retry until success
    while not bypassed:
        site_key = str(uuid4())
        response = session.post('https://hcaptcha.com/getcaptcha',
                                headers=headers,
                                data={
                                    'sitekey': site_key,
                                    'host': host
                                }).json()

        try:
            key = response['key']
            tasks = [row['task_key'] for row in response['tasklist']]
            job = response['request_type']
            timestamp = round(time()) + choice(range(30, 120))

            answers = dict(
                zip(tasks, [choice(['true', 'false']) for index in range(len(tasks))]))

            mouse_movements = []
            last_movement = timestamp
            for index in range(choice(range(1000, 10000))):
                last_movement += choice(range(10))
                mouse_movements.append(
                    [choice(range(500)), choice(range(500)), last_movement])

            json = {
                'job_mode': job,
                'answers': answers,
                'serverdomain': host,
                'sitekey': site_key,
                'motionData': {
                    'st': timestamp,
                    'dct': timestamp,
                    'mm': mouse_movements
                }
            }

            response = session.post(f'https://hcaptcha.com/checkcaptcha/{key}',
                                    json=json)
            response = response.json()
            bypassed = response['pass']
        except (TypeError, KeyError):
            pass

        if bypassed:
            token = response['generated_pass_UUID']
            resp = helpers.soupify(session.get(url))
            bypass_url = f'https://{host}{resp.form.get("action")}'
            data = dict((x.get('name'), x.get('value'))
                        for x in resp.select('form > input'))
            data.update({
                'id': resp.strong.text,
                'g-recaptcha-response': token,
                'h-captcha-response': token
            })

            resp = session.post(bypass_url, data=data)

            if resp.status_code == 200:
                pickle.dump(resp.cookies,
                            open(f'{tempfile.gettempdir()}/{host}', 'wb'))
                logger.info("Successfully bypassed captcha!")
                return resp
            else:
                bypassed = False
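# Hedged companion sketch (not part of the original function): since
# bypass_hcaptcha() pickles the cookies it obtains into the temp directory,
# later runs can reuse them instead of solving the captcha again. The helper
# name below is hypothetical.
import os

def get_hcaptcha_cookies(url):
    host = urlparse(url).netloc
    cookie_file = f'{tempfile.gettempdir()}/{host}'
    if os.path.isfile(cookie_file):
        with open(cookie_file, 'rb') as f:
            return pickle.load(f)
    # No cached cookies yet; solve the captcha once and cache the result.
    return bypass_hcaptcha(url).cookies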