def get(self, _):
    if self._url_cache:
        return self._url_cache
    print_ = get_print(self.cw)
    for try_ in range(self.try_n):
        try:
            d = ytdl.YoutubeDL()
            info = d.extract_info(self._url)
            url = info['url']
            ext = get_ext(url)
            self.ext = ext
            print_('get_video: {} {}'.format(url, ext))
            if ext.lower() == '.m3u8':
                url = M3u8_stream(url, n_thread=self.n_thread, post_processing=True)
            self._url_cache = url
            return url
        except Exception as e:
            e_ = e
            msg = print_error(e)[-1]
            print_('\nTwitter video Error:\n{}'.format(msg))
            if try_ < self.try_n - 1:
                sleep(10, self.cw)
            else:
                raise e_
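# A minimal standalone sketch of the extraction step above, assuming the
# in-repo `ytdl` module exposes a yt-dlp/youtube-dl-style API (the helper
# name here is hypothetical, not part of this codebase):
def _sketch_resolve_direct_url(page_url):
    import yt_dlp  # assumption: a yt-dlp-compatible backend is available
    with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
        info = ydl.extract_info(page_url, download=False)
    return info['url']  # direct media URL; may point to an .m3u8 playlist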
def f(html, browser=None):
    # `sd`, `info`, `ids`, `max_pid`, `cw`, and `print_` are presumably captured
    # from the enclosing scope (this is a page callback used during scrolling).
    soup = Soup(html)
    if is_captcha(soup):
        print('captcha')
        browser.show()
        sd['shown'] = True
    elif sd['shown']:
        browser.hide()
        sd['shown'] = False
    try:
        info['uid'] = soup.find('h2', class_='share-title').text.strip()
        info['nickname'] = soup.find('h1', class_='share-sub-title').text.strip()
    except Exception as e:
        print_(print_error(e)[0])
    c = 0
    ids_now = set()
    for div in soup.findAll('div', class_='video-feed-item'):
        a = div.find('a')
        if a is None:
            continue
        href = a['href']
        if not href:
            continue
        m = re.search(PATTERN_VID, href)
        if m is None:
            continue
        id_video = int(m.group('id'))
        ids_now.add(id_video)
        if id_video in ids:
            continue
        ids.add(id_video)
        info['items'].append({'id': id_video})
        c += 1
    print_('items: {}'.format(len(info['items'])))
    if len(info['items']) >= max_pid:
        info['items'] = info['items'][:max_pid]
        return True
    browser.runJavaScript('window.scrollTo(0, document.body.scrollHeight);')
    sleep(15, cw)
    if c or (ids_now and min(ids_now) > min(ids)):
        sd['count_empty'] = 0
    else:
        print_('empty')
        sd['count_empty'] += 1
    msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info.get('nickname'), info.get('uid'), len(info['items']))
    if cw:
        if not cw.alive:
            raise Exception('cw dead')
        cw.setTitle(msg)
    else:
        print(msg)
    return sd['count_empty'] > 4
def get_imgs_single(url, session, types, format='[%y-%m-%d] id_ppage', cw=None):
    print_ = get_print(cw)
    id = re.find('/status/([0-9]+)', url)
    if id is None:
        raise Exception('no id')
    data = TwitterAPI(session, cw).tweet(id, url)
    tweets = data["globalObjects"]["tweets"]
    id = tweets[id].get('retweeted_status_id_str') or id
    tweet = tweets[id]
    time = get_time(tweet)
    img = Image(url, url, id, time, 0, format, cw, True, try_n=1, n_thread=4)
    try:
        img.url()
        return [img]
    except Exception as e:
        print(print_error(e)[-1])
    return get_imgs_from_tweet(tweet, session, types, format, cw)
def post_processing(self):
    cw = self.customWidget
    ui_setting = self.ui_setting
    format = self._format
    if cw is not None and format is not None:
        try:
            dither = ui_setting.checkDither.isChecked()
            quality = ui_setting.ugoira_quality.value()
        except Exception as e:
            print(e)
            dither = True
            quality = 90
        imgs_ugoira = []
        for img in self.imgs:
            if img.url not in cw.urls:
                continue
            if img.type == 'ugoira':
                if os.path.splitext(img.url)[1].lower() == '.zip':
                    imgs_ugoira.append(img)
        for j, img in enumerate(imgs_ugoira):
            if not cw.valid or not cw.alive:
                return
            self.exec_queue.put(
                (cw, (u'customWidget.pbar.setFormat(u"[%v/%m] {} [{}/{}]")')
                 .format(tr_(u'움짤 변환...'), j, len(imgs_ugoira))))
            filename = os.path.join(self.dir, img.filename)
            out = os.path.splitext(filename)[0] + '.' + format
            cw.print_((u'convert ugoira: {} --> {}').format(filename, out))
            try:
                duration = [frame.delay for frame in img.ugoira_data.frames]
                self.print_((u'Duration: {}').format(duration))
                ffmpeg.gif(filename, out, duration=duration, dither=dither,
                           quality=quality, cw=cw)
            except Exception as e:
                self.print_(print_error(e)[0])
                continue
            if not cw.valid or not cw.alive:
                return
            try:
                self.removeDirList.append((filename, False))
                cw.dones.add(out)
                i = cw.urls.index(img.url)
                cw.imgs[i] = out
                if i == 0:
                    cw.firstImg = out
                    cw.setIcon(out)
            except Exception as e:
                return self.Invalid(e=e)
        self.exec_queue.put((cw, u'customWidget.pbar.setFormat("[%v/%m]")'))
def process_ids(ids, info, imgs, cw, depth=0):
    print_ = get_print(cw)
    max_pid = get_max_range(cw)
    for i, id_illust in enumerate(ids):
        try:
            info_illust = get_info(
                'https://www.pixiv.net/en/artworks/{}'.format(id_illust),
                cw, depth=depth + 1)
        except Exception as e:
            if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました'
                               or type(e) == errors.LoginRequired):  # logout during extraction
                raise e
            print_('process_ids error ({}):\n{}'.format(depth, print_error(e)[0]))
            continue
        imgs += info_illust['imgs']
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(imgs) >= max_pid:
            break
        if depth == 0:
            check_alive(cw)
def get(self, url):
    cw = self.cw
    d = self.d
    print_ = get_print(cw)
    for try_ in range(4):
        wait(cw)
        html = ''
        try:
            html = downloader.read_html(url, referer=self.referer, session=self.session)
            #url = 'https:' + re.findall('[Oo]riginal:? ?<a href="(//[0-9a-zA-Z_-]{2,2}.sankakucomplex.com/data/.{0,320}?)"', html)[0]
            soup = Soup(html)
            highres = soup.find(id='highres')
            url = urljoin(url, highres['href'] if highres else soup.find(id='image')['src'])
            break
        except Exception as e:
            e_msg = print_error(e)[0]
            if '429 Too many requests'.lower() in html.lower():
                t_sleep = 120 * min(try_ + 1, 2)
                e = '429 Too many requests... wait {} secs'.format(t_sleep)
            elif 'post-content-notification' in html: # sankaku plus
                print_('Sankaku plus: {}'.format(self.id))
                return ''
            else:
                t_sleep = 5
            s = '[Sankaku] failed to read image (id:{}): {}'.format(self.id, e)
            print_(s)
            sleep(t_sleep, cw)
    else:
        raise Exception('can not find image (id:{})\n{}'.format(self.id, e_msg))
    soup = Soup('<p>{}</p>'.format(url))
    url = soup.string
    ext = os.path.splitext(url)[1].split('?')[0]
    self.filename = '{}{}'.format(self.id, ext)
    return url
def _pagination(self, url_api, params=None, entry_tweet="tweet-", entry_cursor="cursor-bottom-"): if params is None: params = self.params.copy() while True: cursor = None if params.get("cursor"): self.print_('cursor: {}'.format(params.get("cursor"))) # 2303 n_try = RETRY_PAGINATION for try_ in range(n_try): try: data = self._call(url_api, params=params) if 'globalObjects' not in data: try_ = n_try raise Exception(str(data['errors'])) tweets = data["globalObjects"]["tweets"] break except Exception as e: e_ = e e_msg = print_error(e)[0] if try_ < n_try - 1: self.print_('retry... _pagination ({})\n{}'.format( try_ + 1, e_msg)) sleep(30, self.cw) else: break #raise e_ #3392 users = data["globalObjects"]["users"] for instr in data["timeline"]["instructions"]: for entry in instr.get("addEntries", {}).get("entries", []): if entry["entryId"].startswith(entry_tweet): tid = entry["content"]["item"]["content"]["tweet"][ "id"] if tid not in tweets: self.print_( "Skipping unavailable Tweet {}".format(tid)) continue #print('tid:', tid)# tweet = tweets[tid] tweet["user"] = users[tweet["user_id_str"]] yield tweet elif entry["entryId"].startswith(entry_cursor): cursor = entry["content"]["operation"]["cursor"][ "value"] if not cursor or params.get('cursor') == cursor: print('same cursor') return params["cursor"] = cursor if params.get("cursor") is None: # nothing self.print_('no cursor') break
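# The loop above is cursor-based pagination: follow "cursor-bottom" entries
# until the cursor repeats or disappears. A minimal generic sketch of the same
# pattern (the `fetch_page` callable is hypothetical):
def _sketch_paginate(fetch_page):
    cursor = None
    while True:
        data = fetch_page(cursor)  # returns {'items': [...], 'cursor': ...}
        for item in data['items']:
            yield item
        new_cursor = data.get('cursor')
        if not new_cursor or new_cursor == cursor:
            break  # same or missing cursor -> done
        cursor = new_cursor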
def __init__(self, tweet, format, types, session, cw):
    print_ = get_print(cw)
    self.tweet = tweet
    self.session = session
    self.username = tweet.attrs['data-screen-name']
    self.id = int(tweet.attrs['data-tweet-id'])
    for span in tweet.findAll('span'):
        time = span.attrs.get('data-time')
        if time:
            break
    else:
        # Fall back to decoding the Snowflake ID. This was a bare `id`, which
        # shadows the builtin and breaks `>>`; it must be `self.id`.
        time_ms = (self.id >> 22) + 1288834974657
        time = time_ms / 1000
    self.time = time
    self.url = urljoin('https://twitter.com', tweet.attrs['data-permalink-path'])
    self.withheld = 'withheld-tweet' in tweet.attrs
    if self.withheld:
        print_(' withheld: {}'.format(self.id))
    urls = []
    if 'img' in types:
        for div in tweet.findAll('div'):
            url = div.attrs.get('data-image-url')
            if not url:
                continue
            if ':' not in os.path.basename(url):
                url += ':orig'
            urls.append(url)
    if 'img' in types:
        for a in tweet.findAll('a'):
            url = a.attrs.get('data-expanded-url', '')
            if '//twitpic.com/' not in url:
                continue
            print_('twitpic: {}, {}'.format(self.id, url))
            try:
                url = get_twitpic(url, session)
                if url in urls:
                    print('duplicate twitpic')
                    continue
                urls.append(url)
            except Exception as e:
                print_('Failed to read twitpic:\n{}'.format(print_error(e)[-1]))
        if 'grid-tweet' in tweet.attrs['class']:
            url = tweet.attrs['data-url'] + ':large'
            urls.append(url)
    self.imgs = []
    for page, url in enumerate(urls):
        img = Image(url, self.url, self.id, self.time, page, format, cw)
        self.imgs.append(img)
    if 'PlayableMedia-container' in str(tweet):
        self.isVideo = True
        if 'video' in types:
            img = Image(self.url, self.url, self.id, self.time, 0, format, cw, True)
            self.imgs.append(img)
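# The `(self.id >> 22) + 1288834974657` fallback above decodes a Twitter
# Snowflake ID: the top bits are milliseconds since Twitter's custom epoch
# (1288834974657 ms, i.e. 2010-11-04 UTC). A small illustrative helper:
def _sketch_snowflake_to_datetime(tweet_id):
    from datetime import datetime, timezone
    ms = (tweet_id >> 22) + 1288834974657
    return datetime.fromtimestamp(ms / 1000, tz=timezone.utc)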
def get_imgs_page(id_art, session, date=None, cw=None):
    print_ = get_print(cw)
    url_json = 'https://www.artstation.com/projects/{}.json'.format(id_art)
    post_url = 'https://www.artstation.com/artwork/{}'.format(id_art)
    try:
        html = downloader.read_html(url_json, session=session, referer=post_url)
        data = json.loads(html)
        imgs_ = data['assets']
    except Exception as e:
        print_(print_error(e)[-1])
        return []
    if date is None:
        date = data['created_at'][2:10]
    imgs = []
    for page, img in enumerate(imgs_):
        if not img['has_image']:
            print('no img')
            continue
        url = None
        video = None
        embed = img.get('player_embedded')
        if embed:
            soup = Soup(embed)
            url_embed = soup.find('iframe').attrs['src']
            print_('embed: {}'.format(url_embed))
            try:
                html = downloader.read_html(url_embed, session=session, referer=post_url)
                soup = Soup(html)
                url = soup.find('video').find('source').attrs['src']
            except Exception as e:
                pass
            if not url:
                try:
                    url = soup.find('link', {'rel': 'canonical'}).attrs['href']
                    print_('YouTube: {}'.format(url))
                    raise Exception('YouTube')
##                    from extractor import youtube_downloader
##                    video = youtube_downloader.Video(url, cw=cw)
                except Exception as e:
                    print(e)
                    url = None
        if not url:
            url = img['image_url']
        if video:
            img = video
        else:
            img = Image(post_url, date, url, page)
        img.data = data #
        imgs.append(img)
    return imgs
def get(self, _):
    print_ = get_print(self.cw)
    url = self._url
    ext = get_ext(url)
    if ext.lower() == '.gif':
        print_('get_ext: {}, {}'.format(self.id_, url))
        try:
            ext = downloader.get_ext(url)
        except Exception as e: #3235
            print_('Err: {}, {}\n'.format(self.id_, url) + print_error(e)[0])
    self.filename = '{}_p{}{}'.format(self.id_, self.p, ext)
    return url
def get_imgs_from_tweet(tweet, session, types, format, cw=None):
    print_ = get_print(cw)
    id = tweet['id_str']
    if 'extended_entities' not in tweet:
        tweet['extended_entities'] = {'media': []}
    for url_ in tweet['entities'].get('urls', []):
        url_ = url_['expanded_url']
        if '//twitpic.com/' in url_:
            print_('twitpic: {}'.format(url_))
            try:
                url_ = get_twitpic(url_, session)
                tweet['extended_entities']['media'].append(
                    {'type': 'photo', 'media_url': url_, 'expanded_url': 'https://twitter.com'})
            except Exception as e:
                print_('Invalid twitpic')
                print_(print_error(e)[-1])
    media = tweet['extended_entities']['media']
    time = get_time(tweet)
    imgs = []
    for m in media:
        type_ = m['type']
        if type_ == 'photo':
            type_ = 'img'
        elif type_ == 'animated_gif':
            type_ = 'video'
        if type_ not in types:
            continue
        if type_ == 'video':
            url_media = sorted(m['video_info']['variants'], key=lambda x: x.get('bitrate', 0))[-1]['url']
        elif type_ == 'img':
            url_media = m['media_url']
            if ':' not in os.path.basename(url_media):
                url_media += ':orig'
        else:
            raise NotImplementedError('unknown type')
        url = m['expanded_url']
        img = Image(url_media, url, id, time, len(imgs), format, cw, type_ == 'video')
        imgs.append(img)
    return imgs
def get(self, _):
    print_ = get_print(self.cw)
    url = self._url
    ext = get_ext(url)
    if ext.lower()[1:] not in ['jpg', 'png', 'mp4']: #4645
        print_('get_ext: {}, {}'.format(self.id_, url))
        try:
            ext = downloader.get_ext(url, referer=_)
        except Exception as e: #3235
            print_('Err: {}, {}\n'.format(self.id_, url) + print_error(e)[0])
    self.filename = '{}_p{}{}'.format(self.id_, self.p, ext)
    return url
def pp(self, filename):
    cw = self.cw
    print_ = get_print(cw)
    ui_setting = utils.ui_setting
    ext = os.path.splitext(filename)[1].lower()
    if not os.path.isfile(filename):
        print('no file: {}'.format(filename))
        return
    filename_new = None
    if self.type == 'video' and (self.audio is not None or ext != '.mp4') and not self.stream.live: # UHD or non-mp4
        if self.audio is not None: # merge
            print_('Download audio: {}'.format(self.audio))
            hash = uuid()
            path = os.path.join(os.path.dirname(filename), '{}_a.tmp'.format(hash))
            if cw is not None:
                cw.trash_can.append(path)
            if constants.FAST:
                downloader_v3.download(self.audio, chunk=1024*1024, n_threads=2,
                                       outdir=os.path.dirname(path),
                                       fileName=os.path.basename(path),
                                       customWidget=cw, overwrite=True)
            else:
                downloader.download(self.audio, outdir=os.path.dirname(path),
                                    fileName=os.path.basename(path),
                                    customWidget=cw, overwrite=True)
            ext, out = ffmpeg.merge(filename, path, cw=cw, vcodec=self.vcodec)
            #print(out)
            name, ext_old = os.path.splitext(filename)
            if ext_old.lower() != ext.lower():
                print_('rename ext {} --> {}'.format(ext_old, ext))
                filename_new = '{}{}'.format(name, ext)
                if os.path.isfile(filename_new):
                    os.remove(filename_new)
                os.rename(filename, filename_new)
        else: # convert non-mp4 video -> mp4
            name, ext_old = os.path.splitext(filename)
            filename_new = '{}.mp4'.format(name)
            print_('Convert video: {} -> {}'.format(filename, filename_new))
            ffmpeg.convert(filename, filename_new, cw=cw)
    elif self.type == 'audio' and ext != '.mp3': # convert non-mp3 audio -> mp3
        name, ext_old = os.path.splitext(filename)
        filename_new = '{}.mp3'.format(name)
        ffmpeg.convert(filename, filename_new,
                       '-shortest -preset ultrafast -b:a {}k'.format(get_abr()), cw=cw)
    if self.type == 'audio' and ui_setting.albumArt.isChecked():
        try:
            self.thumb.seek(0)#
            ffmpeg.add_cover(filename_new, self.thumb,
                             {'artist': self.username, 'title': self.title}, cw=cw)
        except Exception as e:
            s = print_error(e)[-1]
            print_(s)
    utils.pp_subtitle(self, filename, cw)
    return filename_new
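# `ffmpeg.merge` above muxes the separately downloaded audio track into the
# video file. A minimal command-line equivalent, assuming an `ffmpeg` binary
# on PATH (stream-copies both tracks, so no re-encode):
def _sketch_merge(video_path, audio_path, out_path):
    import subprocess
    subprocess.run(['ffmpeg', '-y', '-i', video_path, '-i', audio_path,
                    '-c', 'copy', out_path], check=True)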
def read(self):
    ui_setting = self.ui_setting
    cw = self.cw
    print_ = get_print(cw)
    if self.yt_type == 'video':
        res = get_resolution()
        info = get_videos(self.url, type=self.yt_type, max_res=res,
                          only_mp4=False, audio_included=False, cw=cw)
    else:
        abr = get_abr()
        info = get_videos(self.url, type=self.yt_type, max_abr=abr, cw=cw)
    videos = info['videos']
    if not videos:
        raise Exception('No videos')
    self.enableSegment(overwrite=True)
    # first video must be valid
    while videos:
        video = videos[0]
        try:
            video.url()
            break
        except Exception as e:
            e_ = e
            self.print_(print_error(e)[0])
            videos.remove(video)
    else:
        raise e_
    if info['type'] != 'single':
        video = self.process_playlist(info['title'], videos)
    else:
        self.urls.append(video.url)
        self.title = video.title
        if video.stream.live:
            self.lock = False
    self.artist = video.username
    self.setIcon(video.thumb)
def foo(url, soup, info, reblog=False):
    #print('foo', info['c'], len(info['ids']))
    for post in soup.findAll('div', class_='wrap-post'):
        try:
            id = int(re.find('[0-9]+', post.attrs['class'][1]))
        except Exception as e:
            print(print_error(e)[-1])
            continue
        if id in info['ids']:
            continue
        info['ids'].add(id)
        info['last'] = id
        if not reblog and post.find('div', class_='ogname'):
            continue
        for p, mag in enumerate(post.findAll(['a', 'div'], class_='magnify')):
            post = Post(mag.attrs['href'], url, id, p)
            info['posts'].append(post)
    info['c'] += 20 if info['c'] else 5
def get(self, _=None):
    if self._url_cache:
        return self._url_cache
    print_ = get_print(self.cw)
    for try_ in range(self.try_n):
        try:
            d = ytdl.YoutubeDL(cw=self.cw)
            info = d.extract_info(self.referer)
            fs = info['formats']
            for f in fs:
                print_('{} {} - {}'.format(f.get('height'), f['protocol'], f['url']))
            def key(f):
                h = f.get('height', 0)
                if not f['protocol'].startswith('http'):
                    h -= .1
                return h
            for f in sorted(fs, key=key, reverse=True):
                if downloader.ok_url(f['url'], self.referer): #4185
                    break
                else:
                    print_('invalid video: {}'.format(f['url']))
            else:
                raise Exception('no valid videos')
            url = f['url']
            ext = get_ext(url)
            self.ext = ext
            print_('get_video: {} {}'.format(url, ext))
            if ext.lower() == '.m3u8':
                url = ffmpeg.Stream(url)
                url._live = False
            self._url_cache = url
            return url
        except Exception as e:
            e_ = e
            msg = print_error(e)[0]
            print_('\nTwitter video Error:\n{}'.format(msg))
            if try_ < self.try_n - 1:
                sleep(10, self.cw)
            else:
                raise e_
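# The `key` above ranks formats by height and nudges non-HTTP protocols
# (e.g. m3u8) just below direct HTTP formats of the same height. An
# illustrative use on fake format dicts (the helper is hypothetical):
def _sketch_pick_format(formats):
    def key(f):
        h = f.get('height') or 0
        if not f['protocol'].startswith('http'):
            h -= .1  # prefer direct http(s) over streaming protocols
        return h
    return sorted(formats, key=key, reverse=True)[0]

# _sketch_pick_format([{'height': 720, 'protocol': 'm3u8'},
#                      {'height': 720, 'protocol': 'https'}])  # -> the https one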
def run(self):
    # `cw`, `depth`, and `print_` are presumably captured from the enclosing
    # process_ids scope in which this worker is defined.
    while self.alive:
        try:
            id_, res, i = self.queue.popleft()
        except Exception as e:
            sleep(.1)
            continue
        try:
            info_illust = get_info(
                'https://www.pixiv.net/en/artworks/{}'.format(id_),
                cw, depth=depth + 1)
            res[i] = info_illust['imgs']
        except Exception as e:
            if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました'
                               or type(e) == errors.LoginRequired):  # logout during extraction
                res[i] = e
            print_('process_ids error ({}):\n{}'.format(depth, print_error(e)[0]))
        finally:
            Thread.add_rem(-1)
def get_imgs(url, title=None, customWidget=None, d=None, types=['img', 'gif', 'video'], session=None):
    if False: #
        raise NotImplementedError('Not Implemented')
    print_ = get_print(customWidget)
    print_(u'types: {}'.format(', '.join(types)))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    local_ids = {}
    if customWidget is not None:
        dir = customWidget.downloader.dir
        try:
            names = os.listdir(dir)
        except Exception as e:
            print(e)
            names = []
        for name in names:
            id = os.path.splitext(name)[0]
            local_ids[id] = os.path.join(dir, name)

    imgs = []
    page = 1
    url_imgs = set()
    if 'chan.sankakucomplex' in url:
        type = 'chan'
    elif 'idol.sankakucomplex' in url:
        type = 'idol'
    else:
        raise Exception('Not supported subdomain')
    url_old = 'https://{}.sankakucomplex.com'.format(type)
    if customWidget is not None:
        customWidget.exec_queue.put(
            (customWidget, u"customWidget.setTitle(u'{} {}')".format(tr_(u'읽는 중...'), title)))
    while len(imgs) < max_pid:
        #if page > 25: # Anonymous users can only view 25 pages of results
        #    break
        sleep(1) #
        #url = setPage(url, page)
        print_(url)
        html = downloader.read_html(url, referer=url_old, session=session)
        if '429 Too many requests'.lower() in html.lower():
            print_('429 Too many requests... wait 120 secs')
            for i in range(120):
                sleep(1)
                if customWidget and not customWidget.alive:
                    return []
            continue
        page += 1
        url_old = url
        soup = Soup(html)
        articles = soup.findAll('span', {'class': 'thumb'})
        if not articles:
            break
        for article in articles:
            # 1183
            tags = article.find('img', class_='preview').attrs['title'].split()
            if 'animated_gif' in tags:
                type_ = 'gif'
            elif 'animated' in tags or 'webm' in tags or 'video' in tags or 'mp4' in tags: # 1697
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            url_img = article.a.attrs['href']
            if not url_img.startswith('http'):
                url_img = urljoin('https://{}.sankakucomplex.com'.format(type), url_img)
            id = re.find('show/([0-9]+)', url_img)
            print_(article)
            if id is None: # sankaku plus
                continue
            if id in local_ids:
                #print('skip', id)
                local = True
            else:
                local = False
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                if local:
                    url_img = local_ids[id]
                img = Image(type, id, url_img, url, local=local, cw=customWidget, d=d)
                imgs.append(img)
                if len(imgs) >= max_pid:
                    break
        if customWidget and not customWidget.alive:
            break
        try:
            # For page > 50
            pagination = soup.find('div', class_='pagination')
            url = urljoin('https://{}.sankakucomplex.com'.format(type),
                          pagination.attrs['next-page-url'])
        except Exception as e:
            print_(print_error(e)[-1])
            #url = setPage(url, page)
            break
        if customWidget is not None:
            customWidget.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
        else:
            print(len(imgs), 'imgs')
    if not imgs:
        raise Exception('no images')
    return imgs
def get(self, url, force=False):
    if self._url:
        return self._url
    type = self.type
    only_mp4 = self.only_mp4
    audio_included = self.audio_included
    max_res = self.max_res
    max_abr = self.max_abr
    cw = self.cw
    print_ = get_print(cw)
    if force:
        max_abr = 0
    print('max_res: {}'.format(max_res))
    for try_ in range(8):
        try:
            yt = ytdl.YouTube(url)
            break
        except Exception as e:
            e_ = e
            s = print_error(e)[-1]
            print_('### youtube retry...\n{}'.format(s))
            sleep(try_ / 2, cw)
    else:
        raise e_
    streams = yt.streams.all()
    print_streams(streams, cw) #3528
    time = datetime.strptime(yt.info['upload_date'], '%Y%m%d')
    self.utime = (time - datetime(1970, 1, 1)).total_seconds()
    print_('utime: {}'.format(self.utime))
    if type == 'video':
        streams[:] = [stream for stream in streams if stream.video_codec is not None]
        # Only mp4
        if only_mp4:
            streams_ = list(streams)
            streams[:] = []
            for stream in streams_:
                if stream.subtype == 'mp4':
                    streams.append(stream)
        # Audio included; Non-HD
        if audio_included:
            streams_ = list(streams)
            streams[:] = []
            for stream in streams_:
                if stream.audio_codec is not None:
                    streams.append(stream)
        # Maximum resolution
        streams_ = list(streams)
        streams[:] = []
        for stream in streams_:
            if stream.resolution is None:
                continue
            res = int(stream.resolution.replace('p', ''))
            if max_res is None or res <= max_res:
                streams.append(stream)
        print_('')
    elif type == 'audio':
        streams[:] = [stream for stream in streams if stream.abr]
        # Maximum abr
        abrs = [stream.abr for stream in streams]
        max_abr = min(max(abrs), max_abr)
        streams_ = list(streams)
        streams[:] = []
        for stream in streams_:
            if stream.abr is None:
                continue
            abr = stream.abr
            if max_abr is None or abr >= max_abr:
                streams.append(stream)
        #'''
    else:
        raise Exception(u'type "{}" is not supported'.format(type))
    # Pick the best
    while streams:
        if type == 'video':
            ress = [int_(stream.resolution.replace('p', '')) for stream in streams]
            m = max(ress)
            prefer_format = 'mp4'
        elif type == 'audio':
            ress = [stream.abr for stream in streams]
            m = min(ress)
            prefer_format = 'webm'
        print('Resolutions:', ress)
        stream_final = None
        for stream, res in zip(streams, ress):
            if res == m:
                if type == 'video':
                    foo = (stream_final is not None) and (
                        stream_final.audio_codec is None) and bool(stream.audio_codec)
                elif type == 'audio':
                    foo = False
                if stream_final is None or (
                        stream_final.fps <= stream.fps and
                        (foo or
                         (stream_final.subtype.lower() != prefer_format and
                          stream.subtype.lower() == prefer_format) or
                         stream_final.fps < stream.fps)):
                    #print(foo)
                    print_(u'# stream_final {} {} {} {} {} {}fps'.format(
                        stream, stream.format, stream.resolution, stream.subtype,
                        stream.audio_codec, stream.fps))
                    stream_final = stream
        ok = downloader.ok_url(stream_final.url, referer=url) if isinstance(
            stream_final.url, str) else True
        if ok:
            break
        else:
            print_('stream is not valid')
            streams.remove(stream_final)
    else:
        if type == 'audio' and not force:
            return self.get(url, force=True) # 1776
        raise Exception('No videos')
    stream = stream_final
##    if stream.video_codec and stream_final.video_codec.lower().startswith('av'):
##        self.vcodec = 'h264'
    self.yt = yt
    self.id = yt.video_id
    self.stream = stream
    self.username = yt.info['uploader']
    self.stream_audio = None
    self.audio = None
    self.thumb = None
    self.thumb_url = None
    self.subtitles = yt.subtitles
    if type == 'audio' and 'DASH' in self.stream.format:
        self.stream.setDashType('audio')
    # Audio
    if type == 'video' and stream.audio_codec is None:
        print('audio required')
        streams = [stream for stream in yt.streams.all() if stream.abr]
        print_streams(streams, cw)
        # only mp4; https://github.com/KurtBestor/Hitomi-Downloader/issues/480
        def isGood(stream):
            return stream.audio_codec.lower().startswith('mp4')
        streams_good = [stream for stream in streams if isGood(stream)]
        if streams_good:
            streams = streams_good
            print_streams(streams, cw)
        # only audio?
        if any(stream.resolution is None for stream in streams):
            streams = [stream for stream in streams if stream.resolution is None]
            print_streams(streams, cw)
        best_audio = None
        best_abr = 0
        for stream in streams:
            abr = stream.abr
            if abr > best_abr:
                best_abr = abr
                best_audio = stream
        if best_audio is None:
            raise Exception('No audio')
        print(best_audio)
        self.stream_audio = best_audio
        if 'DASH' in self.stream_audio.format:
            self.stream_audio.setDashType('audio')
        self.audio = best_audio.url
        if callable(self.audio):
            self.audio = self.audio()
    # Thumbnail
    for quality in ['sddefault', 'hqdefault', 'mqdefault', 'default']:
        print('####', yt.thumbnail_url)
        self.thumb_url = yt.thumbnail_url.replace('default', quality)
        f = BytesIO()
        try:
            downloader.download(self.thumb_url, buffer=f)
            data = f.read()
            if len(data) == 0:
                raise AssertionError('Zero thumbnail')
            if data == empty_thumbnail:
                raise AssertionError('Empty thumbnail')
            f.seek(0)
            break
        except Exception as e:
            print(print_error(e)[-1])
    self.thumb = f
    #
    _url = self.stream.url
    if callable(_url):
        _url = _url()
    self._url = _url
    title = yt.title
    #soup = Soup(yt.watch_html)
    #title = soup.title.text.replace('- YouTube', '').strip()
    self.title = title
    ext = u'.' + self.stream.subtype
    self.filename = format_filename(title, self.id, ext)
    print_(u'Resolution: {}'.format(stream.resolution))
    print_(u'Codec: {} / {}'.format(stream.video_codec, stream.audio_codec))
    print_(u'Abr: {}'.format(stream.abr))
    print_(u'Subtype: {}'.format(stream.subtype))
    print_(u'FPS: {}\n'.format(stream.fps))
    return self._url
def _pp(self, filename):
    cw = self.cw
    print_ = get_print(cw)
    ui_setting = utils.ui_setting
    ext = os.path.splitext(filename)[1].lower()
    if not os.path.isfile(filename):
        print(u'no file: {}'.format(filename))
        return
    filename_new = None
    if self.type == 'video' and (self.audio is not None or ext != '.mp4'): # UHD or non-mp4
        if self.audio is not None: # merge
            print_(u'Download audio: {}'.format(self.audio))
            hash = uuid()
            path = os.path.join(os.path.dirname(filename), '{}_a.tmp'.format(hash))
            if cw is not None:
                cw.trash_can.append(path)
            if constants.FAST:
                downloader_v3.download(self.audio, chunk=1024 * 1024, n_threads=2,
                                       outdir=os.path.dirname(path),
                                       fileName=os.path.basename(path),
                                       customWidget=cw, overwrite=True)
            else:
                downloader.download(self.audio, outdir=os.path.dirname(path),
                                    fileName=os.path.basename(path),
                                    customWidget=cw, overwrite=True)
            ext, out = ffmpeg.merge(filename, path, cw=cw, vcodec=self.vcodec)
            #print(out)
            name, ext_old = os.path.splitext(filename)
            if ext_old.lower() != ext.lower():
                print_(u'rename ext {} --> {}'.format(ext_old, ext))
                filename_new = u'{}{}'.format(name, ext)
                if os.path.isfile(filename_new):
                    os.remove(filename_new)
                os.rename(filename, filename_new)
        else: # convert non-mp4 video -> mp4
            name, ext_old = os.path.splitext(filename)
            filename_new = u'{}.mp4'.format(name)
            print_(u'Convert video: {} -> {}'.format(filename, filename_new))
            ffmpeg.convert(filename, filename_new, cw=cw)
    elif self.type == 'audio' and ext != '.mp3': # convert non-mp3 audio -> mp3
        name, ext_old = os.path.splitext(filename)
        filename_new = u'{}.mp3'.format(name)
        ffmpeg.convert(filename, filename_new,
                       '-shortest -preset ultrafast -b:a {}k'.format(get_abr()), cw=cw)
    if self.type == 'audio' and ui_setting.albumArt.isChecked():
        try:
            self.thumb.seek(0) #
            ffmpeg.add_cover(filename_new, self.thumb,
                             {'artist': self.username, 'title': self.title}, cw=cw)
        except Exception as e:
            s = print_error(e)[-1]
            print_(s)
    if ui_setting and ui_setting.subtitle.isChecked():
        lang = {'korean': 'ko', 'english': 'en', 'japanese': 'ja'}[
            compatstr(ui_setting.subtitleCombo.currentText()).lower()]
        if lang in self.subtitles:
            try:
                subtitle = self.subtitles[lang]
                filename_sub = u'{}.vtt'.format(os.path.splitext(filename)[0])
                downloader.download(subtitle, os.path.dirname(filename_sub),
                                    fileName=os.path.basename(filename_sub),
                                    overwrite=True)
                filename_sub_new = u'{}.srt'.format(os.path.splitext(filename_sub)[0])
                cw.imgs.append(filename_sub_new)
                cw.dones.add(os.path.realpath(filename_sub_new).replace('\\\\?\\', ''))
                srt_converter.convert(filename_sub, filename_sub_new)
                cw.setSubtitle(True)
            finally:
                try:
                    os.remove(filename_sub)
                except:
                    pass
    return filename_new
def init(self):
    self.url = clean_url(self.url)
    url = self.url

    # Determine the type
    if 'bookmark.php?type=user' in url or url.startswith(headers['following']):
        type = 'following'
    elif 'bookmark.php' in url or url.startswith(headers['bookmark']) or '/bookmarks/' in url:
        type = 'bookmark'
    elif 'illust_id=' in url or url.startswith(headers['illust']) or '/artworks/' in url:
        type = 'illust'
    elif 'search.php' in url or url.startswith(headers['search']):
        type = 'search'
        order = query_url(url).get('order', ['date_d'])[0] # date_d, date, popular_d, popular_male_d, popular_female_d
        scd = query_url(url).get('scd', [None])[0] # 2019-09-27
        ecd = query_url(url).get('ecd', [None])[0] # 2019-09-28
        blt = query_url(url).get('blt', [None])[0] # 5000
        bgt = query_url(url).get('bgt', [None])[0] # 9999
        type_ = query_url(url).get('type', [None])[0] # None (all), illust, manga, ugoira
        self.info = {'order': order, 'scd': scd, 'ecd': ecd, 'blt': blt, 'bgt': bgt, 'type': type_}
    elif '/tags/' in url:
        type = 'search'
        order = query_url(url).get('order', ['date_d'])[0]
        scd = query_url(url).get('scd', [None])[0]
        ecd = query_url(url).get('ecd', [None])[0]
        blt = query_url(url).get('blt', [None])[0]
        bgt = query_url(url).get('bgt', [None])[0]
        type_ = query_url(url).get('type', [None])[0] # None (all), illust, manga, ugoira
        if type_ is None:
            try:
                type_ = url.split('/tags/')[1].split('/')[1]
            except IndexError:
                type_ = None
            type_ = {'illustrations': 'illust'}.get(type_, type_)
        self.info = {'order': order, 'scd': scd, 'ecd': ecd, 'blt': blt, 'bgt': bgt, 'type': type_}
    elif 'id=' in url and 'mode=' not in url or url.startswith(headers['user']) or 'pixiv.me' in url or '/users/' in url:
        type = 'user'
    else:
        self.Invalid((u'[pixiv] Can not determine type: {}').format(url))
        return 'stop'
    header = headers[type]

    if 'pixiv.net' in url or 'pixiv.me' in url:
        if not url.startswith('http://') and not url.startswith('https://'):
            url = u'https://' + url
        self.url = url
    else:
        url = url.replace('bmk_', '').replace('illust_', '').replace('pixiv_', '').replace('search_', '')
        if type == 'user':
            url = 'https://www.pixiv.net/member_illust.php?id={}'.format(url)
        elif type == 'bookmark':
            url = 'https://www.pixiv.net/bookmark.php?id={}'.format(url)
        elif type == 'illust':
            url = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id={}'.format(url)
        elif type == 'search':
            url = 'https://www.pixiv.net/search.php?s_mode=s_tag&word={}'.format(url)
            url = clean_url(url)
        else:
            self.Invalid('{}{}: ???'.format(header, url))
            return 'stop'
        self.url = url
    self.print_('PIXIV_TYPE: {}'.format(type))
    self.pixiv_type = type
    try:
        self.api = pixiv_auth.get_api()
        if 'error' in self.api.user_detail(11):
            self.api = pixiv_auth.get_api(force=True)
    except Exception as e:
        self.print_(print_error(e)[0])
        self.Invalid(tr_('로그인 실패: {}{}\n[옵션 - 설정 - 픽시브 설정 - 로그인] 에서 설정해주세요.').format(header, url))
        return 'stop'
def get_imgs(url, n_max=2000, title=None, cw=None, session=None):
    print_ = get_print(cw)

    for try_ in range(4):
        try:
            html = read_html(url, session, cw)
            m = re.search('"edge_owner_to_timeline_media":{"count":([0-9]+)', html)
            if m is None:
                raise Exception('Invalid page')
            break
        except Exception as e:
            e_ = e
            print_(print_error(e)[0])
    else:
        raise e_

    n = int(m.groups()[0])
    n = min(n, n_max)

    data = get_sd(url, html=html, cw=cw)

    uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
    csrf_token = data['config']['csrf_token'] #
    session.cookies.set(name='ig_pr', value='1', path='/', domain='.instagram.com')

    cursor = ''
    edges = []
    bad = 0
    while True:
        check_alive(cw)

        variables = {
            'id': uploader_id,
            'first': 12,
        }
        if cursor:
            variables['after'] = cursor
        #print_(variables)#

        media = None
        try:
            j = get_query('003056d32c2554def87228bc3fd9668a', variables, session, cw)
            media = j['data']['user']['edge_owner_to_timeline_media']
            sleep(2) #
        except Exception as e:
            if bad > 10:
                raise Exception('no media')
            else:
                print_(u'no media.. retry... ({}) {}'.format(bad + 1, print_error(e)[0]))
                sleep(12 * bad, cw)
                bad += 1
                continue
        bad = 0

        edges_new = media.get('edges')
        if not edges_new or not isinstance(edges_new, list):
            print('no edges_new')
            break

        edges += edges_new

        s = u'{} {} ({}/{})'.format(tr_(u'읽는 중...'), title, len(edges), n)
        if cw is not None:
            cw.setTitle(s)
            if not cw.alive:
                return []
        else:
            print(s)

        if len(edges) >= n:
            break

        page_info = media.get('page_info')
        if not page_info:
            break
        if not page_info.get('has_next_page'):
            break
        cursor = page_info.get('end_cursor')
        if not cursor:
            break

    if len(edges) <= n / 2:
        raise Exception(u'Too short: {} / {}'.format(len(edges), n))

    imgs = []
    for edge in edges:
        node = edge['node']
        type = node['__typename']
        id = node['shortcode']
        url = u'https://www.instagram.com/p/{}/'.format(id)
##        if type in ['GraphVideo', 'GraphImage']:
##            single = True
##        else:
##            single = False
        for img in Node(url, session=session, cw=cw, media=node).imgs:
            imgs.append(img)
        if len(imgs) >= n_max:
            break

    return imgs
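# `get_query` above targets Instagram's (since-retired) web GraphQL endpoint.
# A sketch of how such a request URL was shaped -- the variables mirror the
# call above, while the endpoint details are an assumption of this example:
def _sketch_build_graphql_url(query_hash, uploader_id, cursor=''):
    import json
    from urllib.parse import quote
    variables = {'id': uploader_id, 'first': 12}
    if cursor:
        variables['after'] = cursor
    return ('https://www.instagram.com/graphql/query/?query_hash={}&variables={}'
            .format(query_hash, quote(json.dumps(variables))))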
def _pagination(self, url_api, params=None, entry_tweet="tweet-", entry_cursor="cursor-bottom-"):
    if params is None:
        params = self.params.copy()
    while True:
        cursor = None
        self.print_('cursor: {}'.format(params.get("cursor"))) # 2303
        n_try = 20
        for try_ in range(n_try):
            try:
                data = self._call(url_api, params=params)
                tweets = data["globalObjects"]["tweets"]
                break
            except Exception as e:
                e_ = e
                e_msg = print_error(e)[0]
                if try_ < n_try - 1:
                    self.print_('retry... _pagination ({})\n{}'.format(try_ + 1, e_msg))
                    sleep(30)
                else:
                    raise e_
        users = data["globalObjects"]["users"]
        for instr in data["timeline"]["instructions"]:
            for entry in instr.get("addEntries", {}).get("entries", []):
                if entry["entryId"].startswith(entry_tweet):
                    tid = entry["content"]["item"]["content"]["tweet"]["id"]
                    if tid not in tweets:
                        self.print_("Skipping unavailable Tweet {}".format(tid))
                        continue
                    tweet = tweets[tid]
                    tweet["user"] = users[tweet["user_id_str"]]
##                    if "quoted_status_id_str" in tweet:
##                        quoted = tweets[tweet["quoted_status_id_str"]]
##                        tweet["author"] = tweet["user"]
##                        if "extended_entities" in quoted:
##                            tweet["extended_entities"] = \
##                                quoted["extended_entities"]
##                    elif "retweeted_status_id_str" in tweet:
##                        retweet = tweets[tweet["retweeted_status_id_str"]]
##                        tweet["author"] = users[retweet["user_id_str"]]
##                    else:
##                        tweet["author"] = tweet["user"]
                    yield tweet
                elif entry["entryId"].startswith(entry_cursor):
                    cursor = entry["content"]["operation"]["cursor"]["value"]
        if not cursor or params.get('cursor') == cursor:
            print('same cursor')
            return
        params["cursor"] = cursor
        if params.get("cursor") is None: # nothing
            break
def get_imgs_legacy(username, session, title, types, n=None, format='[%y-%m-%d] id_ppage',
                    cw=None, mode='media', method='tab', imgs=None):
    print_ = get_print(cw)
    print_('types: {}'.format(', '.join(types)))

    artist, username = get_artist_username(username, session) #

    # Range
    n = max(n or 0, get_max_range(cw))  # `n` may be None; `max(None, ...)` would raise

    max_pos = None
    ids_set = set()
    if imgs:
        for img in imgs:
            ids_set.add(img.id)
    else:
        imgs = []
    fail_count = 0  # renamed from a censored identifier; counts empty responses
    min_position = None
    while len(imgs) < n:
        if mode == 'media':
            if method == 'tab':
                foo = '&max_position={}'.format(max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/profiles/show/{}/media_timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(username, foo)
                print_('max_pos={}, imgs={}'.format(max_pos, len(imgs)))
            elif method == 'search': # 1028
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(username)
                q = quote(q, '')
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1&reset_error_state=false'.format(q)
                print_('max_id={}, imgs={}'.format(max_id, len(imgs)))
            elif method == 'search2': # 1028
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(username)
                q = quote(q, '')
                foo = '&max_position={}'.format(max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1{}&reset_error_state=false'.format(q, foo)
                print_('max_pos={}, max_id={}, imgs={}'.format(max_pos, max_id, len(imgs)))
            else:
                raise Exception('Invalid method: {}'.format(method))
        elif mode == 'likes':
            foo = '&max_position={}'.format(max_pos) if max_pos is not None else ''
            url = 'https://twitter.com/{}/likes/timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(username, foo)
        print(url)
        hdr = {
            "X-Requested-With": "XMLHttpRequest",
            "X-Twitter-Active-User": "******",
        }
        for try_ in range(16):
            if cw and not cw.alive:
                return
            try:
                html = downloader.read_html(
                    url, session=session,
                    referer='https://twitter.com/{}'.format(username),
                    headers=hdr) #err
            except Exception as e:
                e_msg = print_error(e)[-1]
                print_('retry... ({}) {}\n{}'.format(try_, url, e_msg))
                change_ua(session)
                continue
            try:
                data = json.loads(html)
            except Exception as e:
                change_ua(session)
                soup = Soup(html)
                login = soup.find('div', class_='LoginForm-input')
                if login and method == 'tab':
                    raise Exception('Login required!')
                print_('can not load json: {}'.format(e))
                sleep(1)
                continue
            break
        else:
            print_('over try')
            if not imgs:
                raise Exception('No imgs')
            break

        if 'items_html' in data:
            html = data['items_html']
        else:
            print_('no items_html')
            session.cookies.clear() # ???
            #break

        soup = Soup(html)
        tweets = soup.findAll('div', class_='tweet') + soup.findAll('span', class_='grid-tweet')

        ids = []
        for tweet in tweets:
            id = int(tweet.attrs['data-tweet-id'])
            if id in ids_set:
                print('duplicate')
                continue
            ids.append(id)
            ids_set.add(id)
            tweet = Tweet(tweet, format, types, session, cw)
            for img in tweet.imgs:
                imgs.append(img)
            if n is not None and len(imgs) >= n:
                break

        if not ids:
            foo = 4 if method != 'search2' else 16
            if len(imgs) == 0:
                raise Exception('No Image')
            elif fail_count > foo:
                if method == 'tab': ### search
                    method = 'search'
                    fail_count = 0
                    continue
                elif method == 'search' and not ids and min_position is not None: ### search2
                    method = 'search2'
                    max_pos = min_position
                    #min_position = None
                    fail_count = 0
                    continue
                else:
                    print('too many failures')
                    break
            else:
                print('empty response; retrying...')
                change_ua(session)
                fail_count += 1
        elif fail_count:
            print('reset fail count')
            fail_count = 0

        max_pos_new = data.get('min_position') # 1028
        if max_pos_new is None:
            if ids:
                max_pos_new = min(ids)
            else:
                max_pos_new = max_pos #
        max_pos = max_pos_new

        if data.get('min_position'):
            min_position = data['min_position']
            print('min_position:', min_position)

        try:
            if cw is not None:
                if not cw.alive:
                    break
                cw.setTitle('{} {} (@{}) - {}'.format(tr_('읽는 중...'), artist, username, len(imgs)))
        except Exception as e:
            print(e)
            raise

    return imgs