def get_video(url, session):
    while url.strip().endswith('/'):
        url = url[:-1]
    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    params = re.findall('VodParameter *= *[\'"]([^\'"]+)[\'"]', html)[0]
    params += '&adultView=ADULT_VIEW&_={}'.format(int(time() * 1000))
    url_xml = 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php?' + params
    print(url_xml)
    html = downloader.read_html(url_xml, session=session, referer=url)
    soup = Soup(html)
    title = soup.find('title').string.strip()
    urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html)
    if not urls_m3u8:
        raise Exception('no m3u8')
    streams = []
    for url_m3u8 in urls_m3u8:
        try:
            stream = _get_stream(url_m3u8)
        except Exception as e:
            print(e)
            continue #2193
        streams.append(stream)
    for stream in streams[1:]:
        streams[0] += stream
    stream = streams[0]
    id = url.split('/')[-1].split('?')[0].split('#')[0]
    video = Video(stream, url, id, title, url_thumb)
    return video
def get_imgs(url, title=None, customWidget=None):
    username = re.findall('/member/([^/]+)', url)[0]
    url = 'https://worldcosplay.net/member/{}'.format(username)
    html = downloader.read_html(url)
    soup = Soup(html)
    userid = re.find('"member_id" *: *([0-9]+)', html)
    if userid is None:
        raise Exception('no userid')
    print('userid:', userid)
    p = 1
    imgs = []
    while True:
        url = 'http://worldcosplay.net/en/api/member/photos?member_id={}&page={}&limit=100000&rows=16&p3_photo_list=1'.format(userid, p)
        html = downloader.read_html(url)
        j = json.loads(html)
        list = j['list']
        print(len(list))
        if not list:
            break
        for img in list:
            photo = img['photo']
            id = photo['id']
            url_img = photo['sq300_url']
            sizes = re.findall('/max-([0-9]+)/', url_img)
            if sizes:
                size = sizes[0]
            else:
                size = 3000
            url_img = url_img.replace('-350x600', '-{}'.format(size))
            img = Image(url_img, id)
            imgs.append(img)
        p += 1
        if customWidget is not None:
            if not customWidget.alive:
                break
            customWidget.exec_queue.put(
                (customWidget, u"customWidget.setTitle(u'{} {} - {}')".format(
                    tr_(u'읽는 중...'), title, len(imgs))))
    return imgs
def get_id(url):
    url = url.lower()
    if '/prof-video-click/upload/' in url:
        return url.split('/prof-video-click/upload/')[1].split('/')[1]
    return re.findall(
        '[0-9]+',
        url.split('xvideos.')[1].split('/')[1].split('?')[0].split('#')[0])[0]
def get_pages(url, cw=None):
    print_ = get_print(cw)
    url = get_main(url).replace('comic.naver.', 'm.comic.naver.')
    id = get_id(url)
    print('id:', id)
    print(url)
    html = downloader.read_html(url)
    soup = Soup(html)
    try:
        info = soup.find('div', class_='area_info')
        artist = info.find('span', class_='author').text.strip()
    except Exception as e:
        print(e)
        try:
            title = ('\n').join(
                soup.find('div', class_='title').text.strip().split('\n')[:-1]).strip()
        except:
            title = 'artist not found'
        raise Exception(title)
    print('artist:', artist)
    title = soup.find('meta', {'property': 'og:title'}).attrs['content']
    pages = []
    nos = set()
    for p in range(1, 100):
        if p == 1:
            url_page = url
        else:
            url_page = set_page(url, p)
        html = downloader.read_html(url_page)
        print('read page:', url_page)
        soup = Soup(html)
        view = soup.findAll('ul', class_='section_episode_list')[-1]
        for lst in view.findAll('li'):
            url_page = urljoin(url, lst.find('a').attrs['href'])
            if 'detail.nhn' not in url_page.lower():
                continue
            print_('url_page: {}'.format(url_page))
            text = lst.find('strong', class_='title').find('span', class_='name').text.strip()
            no = int(re.findall('[?&]no=([0-9]+)', url_page)[0])
            if no in nos:
                print('duplicate no: {}'.format(no))
                continue
            nos.add(no)
            text = '{:04} - {}'.format(no, text)
            page = Page(url_page, text, p)
            pages.append(page)
        btn_next = soup.find('a', class_='btn_next')
        if btn_next is None or btn_next.attrs['href'] == '#':
            print('end of page')
            break
    info = Info(id, title, artist)
    return (info, pages)
def get_id(url, cw=None):
    for try_ in range(2):
        try:
            res = clf2.solve(url, cw=cw, f=_get_page_id)
            html = res['html']
            soup = Soup(html)
            if soup.find('div', class_='gn_login'):
                raise errors.LoginRequired()
            oid = _get_page_id(html)
            if not oid:
                raise Exception('no page_id')
            uids = re.findall(r'uid=([0-9]+)', html)
            uid = max(set(uids), key=uids.count)
            name = re.find(r"CONFIG\['onick'\]='(.+?)'", html) or soup.find(
                'div', class_=lambda c: c and c.startswith('ProfileHeader_name')).text.strip()
            if not name:
                raise Exception('no name')
            break
        except errors.LoginRequired as e:
            raise
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_
    return uid, oid, name
def get_video(url, session, cw):
    print_ = get_print(cw)
    html = downloader.read_html(url, session=session)
    if "document.location.href='https://login." in html:
        raise errors.LoginRequired()
    soup = Soup(html)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    print_('url_thumb: {}'.format(url_thumb))
    params = re.find('VodParameter *= *[\'"]([^\'"]+)[\'"]', html, err='No VodParameter')
    params += '&adultView=ADULT_VIEW&_={}'.format(int(time()*1000))
    url_xml = 'http://stbbs.afreecatv.com:8080/api/video/get_video_info.php?' + params
    print(url_xml)
    html = downloader.read_html(url_xml, session=session, referer=url)
    soup = Soup(html)
    if '<flag>PARTIAL_ADULT</flag>' in html:
        raise errors.LoginRequired()
    title = soup.find('title').string.strip()
    urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html)
    if not urls_m3u8:
        raise Exception('no m3u8')
    streams = []
    for url_m3u8 in urls_m3u8:
        try:
            stream = _get_stream(url_m3u8)
        except Exception as e:
            print(e)
            continue #2193
        streams.append(stream)
    for stream in streams[1:]:
        streams[0] += stream
    stream = streams[0]
    id = url.split('/')[-1].split('?')[0].split('#')[0]
    video = Video(stream, url, id, title, url_thumb)
    return video
def __init__(self, illust, url, ugoira_data=None, format_name=None):
    self.illust = illust
    self.id = illust.id
    self.type = illust.type
    self.title = illust.title
    self.artist = illust.user.name
    self.url = LazyUrl('https://app-api.pixiv.net/', lambda _: url, self)
    ps = re.findall('_p([0-9]+)', url)
    p = ps[-1] if ps else 0
    self.p = p
    self.ext = os.path.splitext(url.split('?')[0].split('#')[0])[1]
    if self.type == 'ugoira':
        self.ugoira_data = ugoira_data
    if format_name:
        name = format_name.replace('id', '###id*').replace(
            'page', '###page*').replace('artist', '###artist*').replace(
            'title', '###title*')
        name = name.replace('###id*', str(self.id)).replace(
            '###page*', str(self.p)).replace('###artist*', self.artist).replace(
            '###title*', self.title)
        self.filename = clean_title(
            name.strip(), allow_dot=True, n=-len(self.ext)) + self.ext
    else:
        self.filename = os.path.basename(url.split('?')[0].split('#')[0])
    self.utime = get_time(illust)
def id_(self):
    ids = re.findall('.com/([^/]+)', self.url)
    if ids:
        id = ids[0]
    else:
        id = self.url
    return id
def __init__(self, title, url):
    if title.startswith('NEW'):
        title = title.replace('NEW', '', 1).strip()
    title = fix_title_page(title)
    self.title = clean_title(title)
    self.url = url
    self.id = int(re.findall('wr_id=([0-9]+)', url)[0])
def fix_url(cls, url):
    url = url.replace('weibo.cn', 'weibo.com').split('?')[0]
    if 'weibo.com/p/' in url:
        id = re.findall('weibo.com/p/([^/]+)', url)[0]
        url = 'https://weibo.com/p/{}'.format(id)
    elif 'weibo.com/u/' in url:
        id = re.findall('weibo.com/u/([^/]+)', url)[0]
        url = 'https://weibo.com/u/{}'.format(id)
    elif 'weibo.com/' in url:
        id = re.findall('weibo.com/([^/]+)', url)[0]
        url = 'https://weibo.com/{}'.format(id)
    else:
        id = url
        url = 'https://weibo.com/u/{}'.format(id)
    url = fix_protocol(url)
    return url
def __init__(self, url, cw=None):
    self.cw = cw
    self.url = re.findall(r'archive.[^/]+/(?:cdx/search/cdx\?url=|(?:web/)?(?:[^/]+/))(.+)', url.lower())[0].strip('/')
    self.base_url = self.url.split('&')[0].strip('/')
    self.md5 = md5(self.url.encode('utf8')).hexdigest()[:8]
    self.mode = self.__get_mode()
    self.title = self.__get_title()
def get_imgs(page, cw=None):
    print_ = get_print(cw)
    html = downloader.read_html(page.url)
    soup = Soup(html)
    type_ = re.find('''webtoonType *: *['"](.+?)['"]''', html)
    print_('type: {}'.format(type_))

    imgs = []
    if type_ == 'DEFAULT': # https://m.comic.naver.com/webtoon/detail.nhn?titleId=715772
        view = soup.find('div', class_='toon_view_lst')
        for img in view.findAll('img'):
            img = img.attrs.get('data-src')
            if not img:
                continue
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    elif type_ == 'CUTTOON': # https://m.comic.naver.com/webtoon/detail.nhn?titleId=752803
        view = soup.find('div', class_='swiper-wrapper')
        for div in view.findAll('div', class_='swiper-slide'):
            if div.parent != view:
                continue
            if div.find('div', class_='cut_viewer_last'):
                print('cut_viewer_last')
                continue
            if div.find('div', class_='cut_viewer_recomm'):
                print('cut_viewer_recomm')
                continue
            img = div.find('img')
            img = img.attrs['data-src']
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    elif type_ == 'EFFECTTOON': #2313; https://m.comic.naver.com/webtoon/detail.nhn?titleId=670144
        img_base = re.find('''imageUrl *: *['"](.+?)['"]''', html) + '/'
        print('img_base:', img_base)
        url_api = re.find('''documentUrl *: *['"](.+?)['"]''', html)
        data_raw = downloader.read_html(url_api, page.url)
        data = json.loads(data_raw)
        for img in data['assets']['stillcut'].values(): # ordered in python3.7+
            img = urljoin(img_base, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    else:
        _imgs = re.findall('sImageUrl *: *[\'"](.+?)[\'"]', html)
        if not _imgs:
            raise Exception('no imgs')
        for img in _imgs:
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)

    return imgs
def following(self, p, r18=False):
    url = 'https://www.pixiv.net/bookmark_new_illust_r18.php' if r18 else 'https://www.pixiv.net/bookmark_new_illust.php'
    if p > 1:
        url += '?p={}'.format(p)
    html = downloader.read_html(url, session=self.session)
    ids = []
    ids_set = set()
    for id_ in re.findall('([0-9]+)_p0_master1200', html):
        if id_ in ids_set:
            continue
        ids_set.add(id_)
        ids.append(id_)
    return ids
def get_page(url):
    qs = query_url(url)
    page = qs.get('p')
    if page:
        page = int(page[0])
    else:
        page = re.findall('_p([0-9]+)', url)
        if page:
            page = int(page[0])
        else:
            page = None
    if page == 1:
        page = None
    return page
def setPage(url, page):
    # Always use HTTPS
    url = url.replace('http://', 'https://')

    # Main page
    if re.findall(r'https://[\w]*[.]?donmai.us/?$', url):
        url = 'https://{}donmai.us/posts?page=1'.format('danbooru.' if 'danbooru.' in url else '')

    # Change the page
    if 'page=' in url:
        url = re.sub('page=[0-9]*', 'page={}'.format(page), url)
    else:
        url += '&page={}'.format(page)

    return url
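# A minimal, self-contained sketch of the page-substitution behavior used by
# setPage() above: replace an existing `page=` parameter, otherwise append one.
# The example URLs are hypothetical; only the standard-library `re` is assumed.
import re

def _set_page_sketch(url, page):
    if 'page=' in url:
        return re.sub('page=[0-9]*', 'page={}'.format(page), url)
    return url + '&page={}'.format(page)

assert _set_page_sketch('https://danbooru.donmai.us/posts?page=1', 3) == \
    'https://danbooru.donmai.us/posts?page=3'
assert _set_page_sketch('https://danbooru.donmai.us/posts?tags=cat', 2) == \
    'https://danbooru.donmai.us/posts?tags=cat&page=2'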
def get_id(url, cw=None):
    for try_ in range(2):
        try:
            res = clf2.solve(url, cw=cw, f=_get_page_id)
            html = res['html']
            soup = Soup(html)
            if soup.find('div', class_='gn_login'):
                raise errors.LoginRequired()
            m = _get_page_id(html)
            if not m:
                raise Exception('no page_id')
            oid = m.groups()[0]
            uids = re.findall('uid=([0-9]+)', html)
            uid = max(set(uids), key=uids.count)
            name = re.findall("CONFIG\\['onick'\\]='(.+?)'", html)[0]
            break
        except errors.LoginRequired as e:
            raise
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_
    return uid, oid, name
def __init__(self, url):
    if 'fbid=' in url:
        id = int(re.findall('fbid=([0-9]+)', url)[0])
    elif 'photos/' in url:
        id = int(url.split('photos/')[1].split('/')[1])
    else:
        id = int(url)
    self.id = id

    def f(_):
        img = get_img(url)
        ext = os.path.splitext(img.split('?')[0])[1]
        self.filename = u'{}{}'.format(id, ext)
        return img
    self.url = LazyUrl(url, f, self)
def __init__(self, title, update, url, session, single):
    if single:
        self.p = None
        self.title = title
    else:
        self.p = int(re.findall('/([0-9]+)', url)[-1])
        title = (u'[{:04}] {}').format(self.p, title)
        title = clean_title(title, n=-4)
        self.title = title
    self.filename = (u'{}.txt').format(self.title)

    def f(url):
        text = get_text(url, self.title, update, session)
        f = BytesIO()
        f.write(text.encode('utf8'))
        f.seek(0)
        return f
    self.url = LazyUrl(url, f, self)
def get(self, url):
    ''' get '''
    cw = self.cw
    session = self.session
    print_ = get_print(cw)
    if self._url:
        return self._url

    id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
          re.find(r'/embed/(\w+)', url, re.IGNORECASE, err='no id')
    print_('id: {}'.format(id_))
    if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
        url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))

    url_test = url.replace('pornhubpremium.com', 'pornhub.com')
    try:
        html = downloader.read_html(url_test, session=session)
        soup = Soup(html)
        if soup.find('div', id='lockedPlayer'):
            print_('Locked player')
            raise Exception('Locked player')
        url = url_test
    except: #3511
        url = url.replace('pornhub.com', 'pornhubpremium.com')
        html = downloader.read_html(url, session=session)
        soup = Soup(html)

    soup = fix_soup(soup, url, session, cw)
    html = str(soup)

    # removed
    if soup.find('div', class_='removed'):
        raise Exception('removed')

    gif = soup.find('div', {'id': 'gifImageSection'})
    if gif:
        print_('GIF')
        id_ = url.split('/gif/')[1]
        id_ = re.findall('[0-9a-zA-Z]+', id_)[0]

        jss = list(gif.children)
        for js in jss:
            if 'data-mp4' in getattr(js, 'attrs', {}):
                break
        else:
            raise Exception('gif mp4 url not found')

        title = js['data-gif-title']
        url = js['data-mp4']
        url_thumb = re.find(r'https?://.+?.phncdn.com/pics/gifs/.+?\.jpg', html, err='no thumb')
        file = File('gif_{}'.format(id_), title, url, url_thumb)
    else:
        if id_ is None:
            raise Exception('no id')
        print_('Video')

        # 1968
        #title = j['video_title']
        title = soup.find('h1', class_='title').text.strip()

        ydl = ytdl.YoutubeDL(cw=self.cw)
        info = ydl.extract_info(url)
        url_thumb = info['thumbnail']
        videos = []
        for f in info['formats']:
            video = {}
            video['height'] = f['height']
            video['quality'] = f['height'] or 0
            video['protocol'] = f['protocol']
            video['videoUrl'] = f['url']
            if f['protocol'] == 'm3u8':
                video['quality'] -= 1
            print_('[{}p] {} {}'.format(video['height'], video['protocol'], video['videoUrl']))
            videos.append(video)

        if not videos:
            raise Exception('No videos')

        videos = sorted(videos, key=lambda video: video['quality'])
        res = get_resolution()
        videos_good = [video for video in videos if video['quality'] <= res]
        if videos_good:
            video = videos_good[-1]
        else:
            video = videos[0]
        print_('\n[{}p] {} {}'.format(video['height'], video['protocol'], video['videoUrl']))

        file = File(id_, title, video['videoUrl'].strip(), url_thumb)

    self._url = file.url
    self.title = file.title
    self.filename = file.filename
    self.thumb = file.thumb
    return self._url
def album(self):
    if 'album_id=' in self.url:
        album = re.findall('album_id=([0-9]+)', self.url)[0]
    else:
        album = None
    return album
def get_imgs(url, title=None, customWidget=None, d=None, types=['img', 'gif', 'video'], session=None):
    if False:#
        raise NotImplementedError('Not Implemented')
    print_ = get_print(customWidget)
    print_(u'types: {}'.format(', '.join(types)))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    local_ids = {}
    if customWidget is not None:
        dir = customWidget.downloader.dir
        try:
            names = os.listdir(dir)
        except Exception as e:
            print(e)
            names = []
        for name in names:
            id = os.path.splitext(name)[0]
            local_ids[id] = os.path.join(dir, name)

    imgs = []
    page = 1
    url_imgs = set()
    if 'chan.sankakucomplex' in url:
        type = 'chan'
    elif 'idol.sankakucomplex' in url:
        type = 'idol'
    else:
        raise Exception('Not supported subdomain')
    url_old = 'https://{}.sankakucomplex.com'.format(type)
    if customWidget is not None:
        customWidget.exec_queue.put((customWidget, u"customWidget.setTitle(u'{} {}')".format(tr_(u'읽는 중...'), title)))
    while len(imgs) < max_pid:
        #if page > 25: # Anonymous users can only view 25 pages of results
        #    break
        sleep(1)#
        #url = setPage(url, page)
        print_(url)
        html = downloader.read_html(url, referer=url_old, session=session)
        if '429 Too many requests'.lower() in html.lower():
            print_('429 Too many requests... wait 120 secs')
            for i in range(120):
                sleep(1)
                if customWidget and not customWidget.alive:
                    return []
            continue
        page += 1
        url_old = url
        soup = Soup(html)
        articles = soup.findAll('span', {'class': 'thumb'})
        if not articles:
            break
        for article in articles:
            # 1183
            tags = article.find('img', class_='preview').attrs['title'].split()
            if 'animated_gif' in tags:
                type_ = 'gif'
            elif 'animated' in tags or 'webm' in tags or 'video' in tags or 'mp4' in tags: # 1697
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            url_img = article.a.attrs['href']
            if not url_img.startswith('http'):
                url_img = urljoin('https://{}.sankakucomplex.com'.format(type), url_img)
            id = re.findall('show/([0-9]+)', url_img)[0]
            if id in local_ids:
                #print('skip', id)
                local = True
            else:
                local = False
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                if local:
                    url_img = local_ids[id]
                img = Image(type, id, url_img, url, local=local, cw=customWidget, d=d)
                imgs.append(img)
                if len(imgs) >= max_pid:
                    break
        if customWidget and not customWidget.alive:
            break
        try:
            # For page > 50
            pagination = soup.find('div', class_='pagination')
            url = urljoin('https://{}.sankakucomplex.com'.format(type), pagination.attrs['next-page-url'])
        except Exception as e:
            print_(print_error(e)[-1])
            #url = setPage(url, page)
            break
        if customWidget is not None:
            customWidget.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
        else:
            print(len(imgs), 'imgs')
    if not imgs:
        raise Exception('no images')

    return imgs
def find_url(html, url):
    href = re.findall('[\'"]([^\'"]+page.php[^\'"]+manga_detail[^\'"]+)[\'"]', html)[0]
    href = html_unescape(href)
    return urljoin(url, href)
def get_imgs_page_legacy(page, session, cw=None, depth=0):
    if cw is not None and not cw.alive:
        return
    print_ = get_print(cw)

    try:
        html = read_html(page.url, session)
    except Exception as e:
        print_('get_imgs_page_legacy error: {}'.format(e))
        if e.args and e.args[0] == 502:
            return []
        raise

    if isProtected(html):
        data = get_soup(page.url, cw=cw, session=session)
        page.url = data['url']
        html = data['html']
    soup = Soup(html, 'html5lib') # 1653

    # skip empty pages
    if not html:
        print_(u'empty page: {}'.format(page.title))
        return []

    # skip invalid pages
    err = soup.find('span', class_='cf-error-code')
    if err:
        print_(u'cf-error-code: {} ({})'.format(err.text.strip(), page.title))
        if depth > 0:
            return []
        else:
            return get_imgs_page_legacy(page, session, cw, depth + 1)

    #page.title = get_title_page(soup)
    matches = re.findall('var img_list *= *(.+?]);', html.replace('\n', ''))
    matches1 = re.findall('var img_list1 *= *(.+?]);', html.replace('\n', ''))
    img_list = json.loads(matches[0]) if matches else []
    img_list1 = json.loads(matches1[0]) if matches1 else []

    # 1780
    img_list = [img for img in img_list if img]
    img_list1 = [img for img in img_list1 if img]

    # 1589
    '''
    if not img_list and not img_list1:
        print_((u'no imgs; retry... {}').format(page.title))
        raise Exception('No images')
    '''

    for script in soup.findAll('script'):
        script = script.text
        if 'var img_list =' in script:
            break
    else:
        raise Exception('No script')

    seed = int(re.find('view_cnt *= *([0-9]+)', script))
    chapter = int(re.find('var +chapter *= *([0-9]+)', script))
    try:
        cdn_domains = cut_pair(re.find('var +cdn_domains *= *(.+)', script), '[]')
        cdn_domains = json.loads(cdn_domains)
    except Exception as e:
        print(e)
        cdn_domains = []

    n = max(len(img_list), len(img_list1))
    img_list += [''] * (n - len(img_list))
    img_list1 += [''] * (n - len(img_list1))

    print_(u'{} chapter:{} seed:{} domains:{}'.format(page.title, chapter, seed, len(cdn_domains)))
    if seed != 0:
        return 'seed'

    imgs = []
    for p, (img, img1) in enumerate(zip(img_list, img_list1)):
        # fix img url
        img = fix_img_url(img, cdn_domains, chapter, p)
        img1 = fix_img_url(img1, cdn_domains, chapter, p)

        img = urljoin(page.url, img) if img else ''
        img1 = urljoin(page.url, img1) if img1 else ''

        # most likely googledrive
        if img.strip('/').count('/') == 2: #1425
            continue
        img = Image(img, page, p, img1)
        imgs.append(img)

    return imgs
def get(self, url):
    ''' get '''
    cw = self.cw
    session = self.session
    print_ = get_print(cw)
    if self._url:
        return self._url

    id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
          re.find(r'/embed/(\w+)', url, re.IGNORECASE, err='no id')
    print_('id: {}'.format(id_))
    if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
        url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))

    url_test = url.replace('pornhubpremium.com', 'pornhub.com')
    try:
        html = downloader.read_html(url_test, session=session)
        soup = Soup(html)
        if soup.find('div', id='lockedPlayer'):
            print_('Locked player')
            raise Exception('Locked player')
        url = url_test
    except: #3511
        url = url.replace('pornhub.com', 'pornhubpremium.com')
        html = downloader.read_html(url, session=session)
        soup = Soup(html)

    soup = fix_soup(soup, url, session, cw)
    html = str(soup)

    # removed
    if soup.find('div', class_='removed'):
        raise Exception('removed')

    gif = soup.find('div', {'id': 'gifImageSection'})
    if gif:
        print_('GIF')
        id_ = url.split('/gif/')[1]
        id_ = re.findall('[0-9a-zA-Z]+', id_)[0]

        jss = list(gif.children)
        for js in jss:
            if 'data-mp4' in getattr(js, 'attrs', {}):
                break
        else:
            raise Exception('gif mp4 url not found')

        title = js['data-gif-title']
        url = js['data-mp4']
        url_thumb = re.find(r'https?://.+?.phncdn.com/pics/gifs/.+?\.jpg', html, err='no thumb')
        file = File('gif_{}'.format(id_), title, url, url_thumb)
    else:
        if id_ is None:
            raise Exception('no id')
        print_('Video')

        # 1968
        #title = j['video_title']
        title = soup.find('h1', class_='title').text.strip()

        video_urls = []
        video_urls_set = set()

        def int_or_none(s):
            try:
                return int(s)
            except:
                return None

        def url_or_none(url):
            if not url or not isinstance(url, str):
                return None
            url = url.strip()
            return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None

        flashvars = json.loads(re.find(r'var\s+flashvars_\d+\s*=\s*({.+?});', html, err='no flashvars'))

        url_thumb = flashvars.get('image_url')

        media_definitions = flashvars.get('mediaDefinitions')
        if isinstance(media_definitions, list):
            for definition in media_definitions:
                if not isinstance(definition, dict):
                    continue
                video_url = definition.get('videoUrl')
                if not video_url or not isinstance(video_url, str):
                    continue
                if video_url in video_urls_set:
                    continue
                video_urls_set.add(video_url)
                video_urls.append(
                    (video_url, int_or_none(definition.get('quality'))))

        def extract_js_vars(webpage, pattern, default=object()):
            assignments = re.find(pattern, webpage, default=default)
            if not assignments:
                return {}

            assignments = assignments.split(';')

            js_vars = {}

            def remove_quotes(s):
                if s is None or len(s) < 2:
                    return s
                for quote in ('"', "'", ):
                    if s[0] == quote and s[-1] == quote:
                        return s[1:-1]
                return s

            def parse_js_value(inp):
                inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
                if '+' in inp:
                    inps = inp.split('+')
                    return functools.reduce(
                        operator.concat, map(parse_js_value, inps))
                inp = inp.strip()
                if inp in js_vars:
                    return js_vars[inp]
                return remove_quotes(inp)

            for assn in assignments:
                assn = assn.strip()
                if not assn:
                    continue
                assn = re.sub(r'var\s+', '', assn)
                vname, value = assn.split('=', 1)
                js_vars[vname] = parse_js_value(value)
            return js_vars

        def add_video_url(video_url):
            v_url = url_or_none(video_url)
            if not v_url:
                return
            if v_url in video_urls_set:
                return
            video_urls.append((v_url, None))
            video_urls_set.add(v_url)

        def parse_quality_items(quality_items):
            q_items = json.loads(quality_items)
            if not isinstance(q_items, list):
                return
            for item in q_items:
                if isinstance(item, dict):
                    add_video_url(item.get('url'))

        if not video_urls:
            print_('# extract video_urls 2')
            FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
            js_vars = extract_js_vars(
                html, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
                default=None)
            if js_vars:
                for key, format_url in js_vars.items():
                    if key.startswith(FORMAT_PREFIXES[-1]):
                        parse_quality_items(format_url)
                    elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]):
                        add_video_url(format_url)
            if not video_urls and re.search(
                    r'<[^>]+\bid=["\']lockedPlayer', html):
                raise Exception('Video is locked')

##        if not video_urls:
##            print_('# extract video_urls 3')
##            js_vars = extract_js_vars(
##                dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
##            add_video_url(js_vars['mediastring'])

        for mobj in re.finditer(
                r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
                html):
            video_url = mobj.group('url')
            if video_url not in video_urls_set:
                video_urls.append((video_url, None))
                video_urls_set.add(video_url)

        video_urls_ = video_urls
        video_urls = []
        for video_url, height in video_urls_:
            if '/video/get_media' in video_url:
                print_(video_url)
                medias = downloader.read_json(video_url, session=session)
                if isinstance(medias, list):
                    for media in medias:
                        if not isinstance(media, dict):
                            continue
                        video_url = url_or_none(media.get('videoUrl'))
                        if not video_url:
                            continue
                        height = int_or_none(media.get('quality'))
                        video_urls.append((video_url, height))
                continue
            video_urls.append((video_url, height))

        videos = []
        for video_url, height in video_urls:
            video = {}
            video['height'] = height or int_or_none(re.find(r'(?P<height>\d+)[pP]?_\d+[kK]', video_url))
            video['quality'] = video['height'] or 0
            video['videoUrl'] = video_url
            ext = get_ext(video_url)
            video['ext'] = ext
            if ext.lower() == '.m3u8':
                video['quality'] -= 1
            print_('[{}p] {} {}'.format(video['height'], video['ext'], video['videoUrl']))
            videos.append(video)

        if not videos:
            raise Exception('No videos')

        videos = sorted(videos, key=lambda video: video['quality'])
        res = get_resolution()
        videos_good = [video for video in videos if video['quality'] <= res]
        if videos_good:
            video = videos_good[-1]
        else:
            video = videos[0]
        print_('\n[{}p] {} {}'.format(video['height'], video['ext'], video['videoUrl']))

        file = File(id_, title, video['videoUrl'].strip(), url_thumb)

    self._url = file.url
    self.title = file.title
    self.filename = file.filename
    self.thumb = file.thumb
    return self._url
def get(self, url):
    ''' get '''
    cw = self.cw
    session = self.session
    print_ = get_print(cw)
    if self._url:
        return self._url

    id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
          re.find(r'/embed/(\w+)', url, re.IGNORECASE)
    print('id: {}'.format(id_))
    if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
        url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))

    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    soup = fix_soup(soup, url, session, cw)
    html = str(soup)

    # removed
    if soup.find('div', class_='removed'):
        raise Exception('removed')

    gif = soup.find('div', {'id': 'gifImageSection'})
    if gif:
        print_('GIF')
        id_ = url.split('/gif/')[1]
        id_ = re.findall('[0-9a-zA-Z]+', id_)[0]

        jss = list(gif.children)
        for js in jss:
            if 'data-mp4' in getattr(js, 'attrs', {}):
                break
        else:
            raise Exception('gif mp4 url not found')

        title = js['data-gif-title']
        url = js['data-mp4']
        url_thumb = re.find(r'https?://.+?.phncdn.com/pics/gifs/.+?\.jpg', html, err='no thumb')
        file = File('gif_{}'.format(id_), title, url, url_thumb)
    else:
        if id_ is None:
            raise Exception('no id')
        print_('Video')

        j = decode(html, cw)

        # 1968
        #title = j['video_title']
        title = soup.find('h1', class_='title').text.strip()

        url_thumb = j['image_url']
        videos = []
        for video in j['mediaDefinitions']:
            url_ = video.get('videoUrl').strip()
            ext = get_ext(url_)
            if ext.lower() not in ['.mp4', '.m3u8']:
                print('not mp4: {}'.format(ext))
                continue
            quality = video.get('quality', 0)
            if isinstance(quality, list):
                quality = quality[0]
            video['quality'] = int(quality)
            print_('[{}p] {}'.format(quality, url_))
            videos.append(video)

        if not videos:
            raise Exception('No videos')

        videos = sorted(videos, key=lambda video: video['quality'])
        res = get_resolution()
        videos_good = [video for video in videos if video['quality'] <= res]
        if videos_good:
            video = videos_good[-1]
        else:
            video = videos[0]
        print_('\n[{}p] {}'.format(video['quality'], video['videoUrl']))

        file = File(id_, title, video['videoUrl'].strip(), url_thumb)

    self._url = file.url
    self.title = file.title
    self.filename = file.filename
    self.thumb = file.thumb
    return self._url
def get_videos(url, cw=None, depth=0):
    print_ = get_print(cw)

    if utils.ui_setting:
        res_text = compatstr(utils.ui_setting.youtubeCombo_res.currentText())
        res = {
            '720p': 720,
            '1080p': 1080,
            '2K': 1440,
            '4K': 2160,
            '8K': 4320
        }[res_text]
    else:
        res = 720

    mobj = re.match(_VALID_URL, url)
    video_id = mobj.group('id')
    anime_id = mobj.group('anime_id')
    print(video_id, anime_id)
    print_ = get_print(cw)
    html = downloader.read_html(url, methods={'requests'})
    soup = Soup(html)

    title = soup.find('h1').attrs['title'].strip()
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']

    p = get_page(url)
    if p is None:
        p = 1
    print('page:', p)
    if p > 1:
        pages = get_pages(html)
        cid = pages[p - 1]['cid']
    else:
        cid = re.findall('\\bcid(?:["\\\']:|=)(\\d+)', html)[0]
    print_('cid: {}'.format(cid))

    headers = {'Referer': url}
    entries = []

    RENDITIONS = [
        'qn={}&quality={}&type='.format(qlt, qlt) for qlt in RESOLS.keys()
    ]  # + ['quality=2&type=mp4']

    for num, rendition in enumerate(RENDITIONS, start=1):
        print('####', num, rendition)
        payload = 'appkey=%s&cid=%s&otype=json&%s' % (_APP_KEY, cid, rendition)
        sign = hashlib.md5((payload + _BILIBILI_KEY).encode('utf-8')).hexdigest()
        url_json = 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign)
        s_json = downloader.read_html(url_json)
        print(s_json[:1000])
        video_info = json.loads(s_json)
        if not video_info:
            continue
        if 'durl' not in video_info:
            print('#### error', num)
            if num < len(RENDITIONS):
                continue
            msg = video_info.get('message')
            if msg:
                raise Exception(msg)
        quality = video_info['quality']
        resolution = get_resolution(quality)
        s = (u'resolution: {}').format(resolution)
        print_(s)

        # 2184
        if int(re.find('([0-9]+)p', resolution)) > res:
            print_('skip resolution')
            continue

        for idx, durl in enumerate(video_info['durl']):
            # 1343
            if idx == 0:
                size = downloader.get_size(durl['url'], referer=url)
                if size < 1024 * 1024 and depth == 0:
                    print_('size is too small')
                    return get_videos(url, cw, depth + 1)

            formats = [{
                'url': durl['url'],
                'filesize': int_or_none(durl['size'])
            }]
            for backup_url in durl.get('backup_url', []):
                formats.append({
                    'url': backup_url,
                    'preference': -2 if 'hd.mp4' in backup_url else -3
                })

            for a_format in formats:
                a_format.setdefault('http_headers', {}).update({'Referer': url})

            entries.append({
                'id': '%s_part%s' % (video_id, idx),
                'duration': float_or_none(durl.get('length'), 1000),
                'formats': formats
            })
        break

    videos = []
    for entry in entries:
        url_video = entry['formats'][0]['url']
        video = Video(url_video, url, video_id, len(videos))
        videos.append(video)

    info = {'title': clean_title(title), 'url_thumb': url_thumb}
    return (videos, info)
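# A minimal, self-contained sketch of the playurl request signing used in
# get_videos() above: the sign is the MD5 hex digest of the unsigned query
# string concatenated with the app secret. The key/cid values below are
# placeholders for illustration, not real credentials.
import hashlib

def _sign_payload_sketch(payload, app_secret):
    # payload is the unsigned query string, e.g. 'appkey=...&cid=...&otype=json&qn=80&...'
    return hashlib.md5((payload + app_secret).encode('utf-8')).hexdigest()

_payload = 'appkey=DUMMY_APP_KEY&cid=123456&otype=json&qn=80&quality=80&type='
_sign = _sign_payload_sketch(_payload, 'DUMMY_SECRET')
_url_json = 'http://interface.bilibili.com/v2/playurl?{}&sign={}'.format(_payload, _sign)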
def twitter():
    return '@' + re.findall('twitter.[^/]+/([^/*?]+)', self.url)[0]
def get_id(url):
    if '/watch/' in url:
        id = re.findall('/watch/([a-zA-Z0-9]+)', url)[0]
    else:
        id = url
    return id
def get_imgs(url, filter_, directory, session=Session(), cw=None):
    print_ = get_print(cw)
    if not os.path.exists(directory):
        os.makedirs(directory)

    urls_path = os.path.join(directory, '{}.urls'.format(filter_.md5))
    bitmap_path = os.path.join(directory, '{}.bitmap'.format(filter_.md5))
    count_path = os.path.join(directory, '{}.count'.format(filter_.md5))

    for path in [urls_path, bitmap_path, count_path]:
        if not os.path.exists(path):
            open(path, 'x').close()

    with open(count_path) as file:
        num_complete = (lambda x: int(x) if x else 0)(file.read())

    snapshots = WaybackMachineAPI(session, cw).snapshots(url)
    bitmap = Bitmap(cw=cw).load(len(snapshots), bitmap_path) if num_complete else Bitmap(len(snapshots), cw=cw)

    base_url = 'https://web.archive.org/web/{}im_/{}'

    def get_imgs_snapshot(id_, snapshot):
        @sleep_and_retry
        @limits(1, 5)
        def get_soup():
            try:
                return downloader.read_soup(f'https://web.archive.org/web/{snapshot[0]}id_/{snapshot[1]}')
            except Exception as exception:
                print_(print_error(exception)[0])
                return None

        def get_imgs_soup(soup):
            if not soup:
                return []

            def default():
                return [base_url.format(snapshot[0], img['src']) for img in soup.find_all('img', src=True)]

            def twitter():
                return [base_url.format(snapshot[0], img['src']) for img in soup.find_all('img', src=True) if 'twimg.com/media/' in img['src']]

            return [
                default,
                twitter
            ][filter_.mode]()

        return id_, get_imgs_soup(get_soup())

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(get_imgs_snapshot, id_, snapshot) for id_, snapshot in enumerate(snapshots) if not bitmap.get(id_)]
        with open(urls_path, 'a') as urls_file:
            for future in concurrent.futures.as_completed(futures):
                id_, urls = future.result()
                urls_file.writelines([f'{url}\n' for url in urls])
                bitmap.update(id_, bitmap_path)
                num_complete += 1
                with open(count_path, 'w') as count_file:
                    count_file.write(str(num_complete))
                msg = f'{filter_.title} - {num_complete}'
                cw.setTitle(msg) if cw else print_(msg)

    with open(urls_path) as file:
        urls = set()
        for url in file.readlines():
            urls.update(re.findall(r'^\S+$', url))

    os.remove(urls_path)
    os.remove(bitmap_path)
    os.remove(count_path)

    return urls
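# The Bitmap class used in get_imgs() above is external to this snippet. A
# minimal sketch of the idea (one persistent "done" flag per snapshot index,
# so an interrupted run can resume without re-fetching completed snapshots)
# might look like the following; the on-disk format here is an assumption,
# not the project's actual Bitmap implementation.
class BitmapSketch:
    def __init__(self, size):
        self.bits = bytearray(size)

    def get(self, i):
        # True if snapshot i has already been processed.
        return bool(self.bits[i])

    def update(self, i, path):
        # Mark snapshot i as complete and persist the whole bitmap to disk.
        self.bits[i] = 1
        with open(path, 'wb') as f:
            f.write(bytes(self.bits))

    def load(self, size, path):
        # Restore flags from a previous run, padding if the file is short.
        with open(path, 'rb') as f:
            data = f.read()
        self.bits = bytearray(data[:size].ljust(size, b'\x00'))
        return self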