def get(self, url, force=False):
    if self._url:
        return self._url
    type = self.type
    only_mp4 = self.only_mp4
    audio_included = self.audio_included
    max_res = self.max_res
    max_abr = self.max_abr
    cw = self.cw
    print_ = get_print(cw)
    if force:
        max_abr = 0
    print('max_res: {}'.format(max_res))
    for try_ in range(8):
        try:
            yt = ytdl.YouTube(url)
            break
        except Exception as e:
            e_ = e
            s = print_error(e)[-1]
            print_('### youtube retry...\n{}'.format(s))
            sleep(try_ / 2, cw)
    else:
        raise e_
    streams = yt.streams.all()
    print_streams(streams, cw)

    if type == 'video':
        streams[:] = [stream for stream in streams if stream.video_codec is not None]
        # Only mp4
        if only_mp4:
            streams_ = list(streams)
            streams[:] = []
            for stream in streams_:
                if stream.subtype == 'mp4':
                    streams.append(stream)
        # Audio included; Non-HD
        if audio_included:
            streams_ = list(streams)
            streams[:] = []
            for stream in streams_:
                if stream.audio_codec is not None:
                    streams.append(stream)
        # Maximum resolution
        streams_ = list(streams)
        streams[:] = []
        for stream in streams_:
            if stream.resolution is None:
                continue
            res = int(stream.resolution.replace('p', ''))
            if max_res is None or res <= max_res:
                streams.append(stream)
        print_('')
    elif type == 'audio':
        streams[:] = [stream for stream in streams if stream.abr]
        # Maximum abr
        abrs = [stream.abr for stream in streams]
        max_abr = min(max(abrs), max_abr)
        streams_ = list(streams)
        streams[:] = []
        for stream in streams_:
            if stream.abr is None:
                continue
            abr = stream.abr
            if max_abr is None or abr >= max_abr:
                streams.append(stream)
    else:
        raise Exception(u'type "{}" is not supported'.format(type))

    # Pick the best
    while streams:
        if type == 'video':
            ress = [int_(stream.resolution.replace('p', '')) for stream in streams]
            m = max(ress)
            prefer_format = 'mp4'
        elif type == 'audio':
            ress = [stream.abr for stream in streams]
            m = min(ress)
            prefer_format = 'webm'
        print('Resolutions:', ress)
        stream_final = None
        for stream, res in zip(streams, ress):
            if res == m:
                if type == 'video':
                    foo = (stream_final is not None) and (stream_final.audio_codec is None) and bool(stream.audio_codec)
                elif type == 'audio':
                    foo = False
                if stream_final is None or (stream_final.fps <= stream.fps and
                        (foo or
                         (stream_final.subtype.lower() != prefer_format and stream.subtype.lower() == prefer_format) or
                         stream_final.fps < stream.fps)):
                    #print(foo)
                    print_(u'# stream_final {} {} {} {} {} {}fps'.format(stream, stream.format, stream.resolution, stream.subtype, stream.audio_codec, stream.fps))
                    stream_final = stream
        ok = downloader.ok_url(stream_final.url, referer=url) if isinstance(stream_final.url, str) else True
        if ok:
            break
        else:
            print_('stream is not valid')
            streams.remove(stream_final)
    else:
        if type == 'audio' and not force:
            return self.get(url, force=True)  # 1776
        raise Exception('No videos')

    stream = stream_final

##    if stream.video_codec and stream_final.video_codec.lower().startswith('av'):
##        self.vcodec = 'h264'

    self.yt = yt
    self.id = yt.video_id
    self.stream = stream
    self.username = yt.info['uploader']
    self.stream_audio = None
    self.audio = None
    self.thumb = None
    self.thumb_url = None
    self.subtitles = yt.subtitles

    if type == 'audio' and 'DASH' in self.stream.format:
        self.stream.setDashType('audio')

    # Audio
    if type == 'video' and stream.audio_codec is None:
        print('audio required')
        streams = [stream for stream in yt.streams.all() if stream.abr]
        print_streams(streams, cw)

        # Only mp4; https://github.com/KurtBestor/Hitomi-Downloader-issues/issues/480
        def isGood(stream):
            return stream.audio_codec.lower().startswith('mp4')
        streams_good = [stream for stream in streams if isGood(stream)]
        if streams_good:
            streams = streams_good
            print_streams(streams, cw)

        # Only audio?
        if any(stream.resolution is None for stream in streams):
            streams = [stream for stream in streams if stream.resolution is None]
            print_streams(streams, cw)

        best_audio = None
        best_abr = 0
        for stream in streams:
            abr = stream.abr
            if abr > best_abr:
                best_abr = abr
                best_audio = stream
        if best_audio is None:
            raise Exception('No audio')
        print(best_audio)
        self.stream_audio = best_audio
        if 'DASH' in self.stream_audio.format:
            self.stream_audio.setDashType('audio')
        self.audio = best_audio.url
        if callable(self.audio):
            self.audio = self.audio()

    # Thumbnail
    for quality in ['sddefault', 'hqdefault', 'mqdefault', 'default']:
        print('####', yt.thumbnail_url)
        self.thumb_url = yt.thumbnail_url.replace('default', quality)
        f = BytesIO()
        try:
            downloader.download(self.thumb_url, buffer=f)
            data = f.read()
            if len(data) == 0:
                raise AssertionError('Zero thumbnail')
            if data == empty_thumbnail:
                raise AssertionError('Empty thumbnail')
            f.seek(0)
            break
        except Exception as e:
            print(print_error(e)[-1])
    self.thumb = f

    _url = self.stream.url
    if callable(_url):
        _url = _url()
    self._url = _url
    title = yt.title
    #soup = Soup(yt.watch_html)
    #title = soup.title.text.replace('- YouTube', '').strip()
    self.title = title
    ext = u'.' + self.stream.subtype
    self.filename = format_filename(title, self.id, ext)

    print_(u'Resolution: {}'.format(stream.resolution))
    print_(u'Codec: {} / {}'.format(stream.video_codec, stream.audio_codec))
    print_(u'Abr: {}'.format(stream.abr))
    print_(u'Subtype: {}'.format(stream.subtype))
    print_(u'FPS: {}\n'.format(stream.fps))

    return self._url
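# Illustrative sketch, not part of the original module: the "pick the best"
# loop above effectively ranks candidate streams by resolution, then fps,
# then a preference for the mp4 subtype. The namedtuple below is hypothetical
# and only mimics the fields consulted; real objects come from
# ytdl.YouTube(url).streams.all().
def _demo_pick_best_stream():
    from collections import namedtuple
    S = namedtuple('S', 'resolution subtype fps')  # hypothetical stand-in
    streams = [S('1080p', 'webm', 30), S('1080p', 'mp4', 30),
               S('1080p', 'mp4', 60), S('720p', 'mp4', 60)]
    best = max(streams, key=lambda s: (int(s.resolution.rstrip('p')),
                                       s.fps, s.subtype == 'mp4'))
    assert best == S('1080p', 'mp4', 60)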
def _pagination(self, url_api, params=None, entry_tweet="tweet-", entry_cursor="cursor-bottom-"):
    if params is None:
        params = self.params.copy()
    while True:
        cursor = None
        self.print_('cursor: {}'.format(params.get("cursor")))

        # 2303
        n_try = 20
        for try_ in range(n_try):
            try:
                data = self._call(url_api, params=params)
                tweets = data["globalObjects"]["tweets"]
                break
            except Exception as e:
                e_ = e
                e_msg = print_error(e)[0]
                if try_ < n_try - 1:
                    self.print_('retry... _pagination ({})\n{}'.format(try_ + 1, e_msg))
                    sleep(30)
                else:
                    raise e_

        users = data["globalObjects"]["users"]
        for instr in data["timeline"]["instructions"]:
            for entry in instr.get("addEntries", {}).get("entries", []):
                if entry["entryId"].startswith(entry_tweet):
                    tid = entry["content"]["item"]["content"]["tweet"]["id"]
                    if tid not in tweets:
                        self.print_("Skipping unavailable Tweet {}".format(tid))
                        continue
                    tweet = tweets[tid]
                    tweet["user"] = users[tweet["user_id_str"]]
##                    if "quoted_status_id_str" in tweet:
##                        quoted = tweets[tweet["quoted_status_id_str"]]
##                        tweet["author"] = tweet["user"]
##                        if "extended_entities" in quoted:
##                            tweet["extended_entities"] = \
##                                quoted["extended_entities"]
##                    elif "retweeted_status_id_str" in tweet:
##                        retweet = tweets[tweet["retweeted_status_id_str"]]
##                        tweet["author"] = users[retweet["user_id_str"]]
##                    else:
##                        tweet["author"] = tweet["user"]
                    yield tweet
                elif entry["entryId"].startswith(entry_cursor):
                    cursor = entry["content"]["operation"]["cursor"]["value"]

        if not cursor or params.get('cursor') == cursor:
            print('same cursor')
            return
        params["cursor"] = cursor
        if params.get("cursor") is None:  # nothing
            break
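# Illustrative sketch, not part of the original module: the generator above
# follows the usual cursor-pagination contract — request a page, yield its
# entries, stop when the API repeats or omits the bottom cursor. A generic,
# self-contained restatement; fetch_page is a hypothetical callable returning
# (entries, next_cursor).
def _demo_cursor_pagination(fetch_page):
    cursor = None
    while True:
        entries, new_cursor = fetch_page(cursor)
        for entry in entries:
            yield entry
        if not new_cursor or new_cursor == cursor:
            break  # a missing or repeated cursor means the timeline is exhausted
        cursor = new_cursor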
def get_imgs_legacy(username, session, title, types, n=None,
                    format='[%y-%m-%d] id_ppage', cw=None, mode='media',
                    method='tab', imgs=None):
    print_ = get_print(cw)
    print_('types: {}'.format(', '.join(types)))

    artist, username = get_artist_username(username, session)

    # Range
    n = max(n, get_max_range(cw))

    max_pos = None
    ids_set = set()
    if imgs:
        for img in imgs:
            ids_set.add(img.id)
    else:
        imgs = []
    fail_count = 0  # consecutive empty-page counter
    min_position = None
    while len(imgs) < n:
        if mode == 'media':
            if method == 'tab':
                foo = '&max_position={}'.format(max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/profiles/show/{}/media_timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(username, foo)
                print_('max_pos={}, imgs={}'.format(max_pos, len(imgs)))
            elif method == 'search':  # 1028
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(username)
                q = quote(q, '')
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1&reset_error_state=false'.format(q)
                print_('max_id={}, imgs={}'.format(max_id, len(imgs)))
            elif method == 'search2':  # 1028
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(username)
                q = quote(q, '')
                foo = '&max_position={}'.format(max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1{}&reset_error_state=false'.format(q, foo)
                print_('max_pos={}, max_id={}, imgs={}'.format(max_pos, max_id, len(imgs)))
            else:
                raise Exception('Invalid method: {}'.format(method))
        elif mode == 'likes':
            foo = '&max_position={}'.format(max_pos) if max_pos is not None else ''
            url = 'https://twitter.com/{}/likes/timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(username, foo)
        print(url)

        hdr = {
            "X-Requested-With": "XMLHttpRequest",
            "X-Twitter-Active-User": "******",
        }

        for try_ in range(16):
            if cw and not cw.alive:
                return
            try:
                html = downloader.read_html(url, session=session, referer='https://twitter.com/{}'.format(username), headers=hdr)  #err
            except Exception as e:
                e_msg = print_error(e)[-1]
                print_('retry... ({}) {}\n{}'.format(try_, url, e_msg))
                change_ua(session)
                continue
            try:
                data = json.loads(html)
            except Exception as e:
                change_ua(session)
                soup = Soup(html)
                login = soup.find('div', class_='LoginForm-input')
                if login and method == 'tab':
                    raise Exception('Login required!')
                print_('can not load json: {}'.format(e))
                sleep(1)
                continue
            break
        else:
            print_('over try')
            if not imgs:
                raise Exception('No imgs')
            break

        if 'items_html' in data:
            html = data['items_html']
        else:
            print_('no items_html')
            session.cookies.clear()  # ???
            #break

        soup = Soup(html)
        tweets = soup.findAll('div', class_='tweet') + soup.findAll('span', class_='grid-tweet')

        ids = []
        for tweet in tweets:
            id = int(tweet.attrs['data-tweet-id'])
            if id in ids_set:
                print('duplicate')
                continue
            ids.append(id)
            ids_set.add(id)
            tweet = Tweet(tweet, format, types, session, cw)
            for img in tweet.imgs:
                imgs.append(img)

        if n is not None and len(imgs) >= n:
            break

        if not ids:
            foo = 4 if method != 'search2' else 16
            if len(imgs) == 0:
                raise Exception('No Image')
            elif fail_count > foo:
                if method == 'tab':  # search
                    method = 'search'
                    fail_count = 0
                    continue
                elif method == 'search' and not ids and min_position is not None:  # search2
                    method = 'search2'
                    max_pos = min_position
                    #min_position = None
                    fail_count = 0
                    continue
                else:
                    print('too many failures')
                    break
            else:
                print('empty page; retry with a new UA')
                change_ua(session)
                fail_count += 1
        elif fail_count:
            print('reset fail_count')
            fail_count = 0

        max_pos_new = data.get('min_position')  # 1028
        if max_pos_new is None:
            if ids:
                max_pos_new = min(ids)
            else:
                max_pos_new = max_pos
        max_pos = max_pos_new

        if data.get('min_position'):
            min_position = data['min_position']
            print('min_position:', min_position)

        try:
            if cw is not None:
                if not cw.alive:
                    break
                cw.setTitle('{} {} (@{}) - {}'.format(tr_('읽는 중...'), artist, username, len(imgs)))
        except Exception as e:
            print(e)
            raise

    return imgs
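# Illustrative sketch, not part of the original module: when a page yields no
# new ids, the loop above retries with a new UA and, after enough consecutive
# failures, escalates tab -> search -> search2 before giving up. The
# escalation policy in isolation (limit is 4 for tab/search, 16 for search2,
# mirroring the code above):
def _demo_escalate(method, fail_count, limit):
    if fail_count <= limit:
        return method, False  # keep trying with the current strategy
    order = ['tab', 'search', 'search2']
    i = order.index(method)
    if i + 1 < len(order):
        return order[i + 1], True  # switch strategy and reset the counter
    return None, False  # nothing left to try; caller stops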
def get_imgs_from_illust(illust, api=None, types={'illust', 'manga', 'ugoira'},
                         format=None, format_name=None, dir='', print_=None, cw=None):
    print_ = get_print(cw)
    if api is None:
        api = pixiv_auth.get_api()
    if types is not None and illust.get('type', 'illust') not in types:
        return []
    imgs = []
    if illust.type == 'ugoira':
        sleep(0.2)
        for try_ in range(N_TRY):
            print_(('read ugoira... {}').format(illust.id))
            try:
                ugoira_data = api.ugoira_metadata(illust.id, req_auth=True)
                error = ugoira_data.get('error')
                if error:
                    raise PixivError(error)
                break
            except PixivError as e:
                api = e.api
                print_(e)
                msg = error.get('user_message', '')
                if u'公開制限エラー' in msg:
                    print_('invalid ugoira; ignore')
                    return []
                if u'該当作品の公開レベルにより閲覧できません' in msg:
                    print_('invalid ugoira (2); ignore')
                    return []
                if try_ < N_TRY - 1:
                    print_('retry...')
                    sleep(SLEEP)
                else:
                    raise
        ugoira_data = ugoira_data.ugoira_metadata
        url = ugoira_data.zip_urls.medium.replace('600x600', '1920x1080')
        img = Img(illust, url, ugoira_data=ugoira_data, format_name=format_name)
        if format is not None:
            filename = os.path.join(dir, img.filename)
            filename = os.path.splitext(filename)[0] + '.' + format
            filename_old = os.path.join(dir, ('{}_ugoira1920x1080.{}').format(img.id, format))
            if os.path.isfile(filename_old) and not os.path.isfile(filename):
                print_((u'rename: {} -> {}').format(os.path.basename(filename_old), os.path.basename(filename)))
                os.rename(filename_old, filename)
            if os.path.isfile(filename):
                print_((u'skip ugoira: {}').format(filename))
                img = Img(illust, filename, ugoira_data=ugoira_data, format_name=format_name)
        imgs.append(img)
    elif illust.page_count == 1:
        img = Img(illust, illust.meta_single_page.original_image_url, format_name=format_name)
        imgs.append(img)
    else:
        pages = illust.meta_pages
        for page in pages:
            img = Img(illust, page.image_urls.original, format_name=format_name)
            imgs.append(img)
    return imgs
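# Illustrative sketch, not part of the original module: the format branch
# above migrates an old-style converted-ugoira filename to the new scheme and
# skips re-downloading when the target already exists. The same decision in
# isolation; all paths here are hypothetical examples.
def _demo_ugoira_cache(dir, base, id, format):
    import os
    filename = os.path.join(dir, os.path.splitext(base)[0] + '.' + format)
    filename_old = os.path.join(dir, '{}_ugoira1920x1080.{}'.format(id, format))
    if os.path.isfile(filename_old) and not os.path.isfile(filename):
        os.rename(filename_old, filename)  # migrate the legacy name
    return os.path.isfile(filename)  # True -> reuse the local file, skip download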
def get_imgs(url, title=None, customWidget=None, d=None,
             types=['img', 'gif', 'video'], session=None):
    if False:  #
        raise NotImplementedError('Not Implemented')
    print_ = get_print(customWidget)
    print_(u'types: {}'.format(', '.join(types)))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    local_ids = {}
    if customWidget is not None:
        dir = customWidget.downloader.dir
        try:
            names = os.listdir(dir)
        except Exception as e:
            print(e)
            names = []
        for name in names:
            id = os.path.splitext(name)[0]
            local_ids[id] = os.path.join(dir, name)

    imgs = []
    page = 1
    url_imgs = set()
    if 'chan.sankakucomplex' in url:
        type = 'chan'
    elif 'idol.sankakucomplex' in url:
        type = 'idol'
    else:
        raise Exception('Not supported subdomain')
    url_old = 'https://{}.sankakucomplex.com'.format(type)
    if customWidget is not None:
        customWidget.exec_queue.put((customWidget, u"customWidget.setTitle(u'{} {}')".format(tr_(u'읽는 중...'), title)))
    while len(imgs) < max_pid:
        #if page > 25: # Anonymous users can only view 25 pages of results
        #    break
        sleep(1)
        #url = setPage(url, page)
        print_(url)
        html = downloader.read_html(url, referer=url_old, session=session)
        if '429 Too many requests'.lower() in html.lower():
            print_('429 Too many requests... wait 120 secs')
            for i in range(120):
                sleep(1)
                if customWidget and not customWidget.alive:
                    return []
            continue
        page += 1
        url_old = url
        soup = Soup(html)
        articles = soup.findAll('span', {'class': 'thumb'})
        if not articles:
            break
        for article in articles:
            # 1183
            tags = article.find('img', class_='preview').attrs['title'].split()
            if 'animated_gif' in tags:
                type_ = 'gif'
            elif 'animated' in tags or 'webm' in tags or 'video' in tags or 'mp4' in tags:  # 1697
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            url_img = article.a.attrs['href']
            if not url_img.startswith('http'):
                url_img = urljoin('https://{}.sankakucomplex.com'.format(type), url_img)
            id = re.findall('show/([0-9]+)', url_img)[0]
            if id in local_ids:
                #print('skip', id)
                local = True
            else:
                local = False
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                if local:
                    url_img = local_ids[id]
                img = Image(type, id, url_img, url, local=local, cw=customWidget, d=d)
                imgs.append(img)
            if len(imgs) >= max_pid:
                break
        if customWidget and not customWidget.alive:
            break
        try:
            # For page > 50
            pagination = soup.find('div', class_='pagination')
            url = urljoin('https://{}.sankakucomplex.com'.format(type), pagination.attrs['next-page-url'])
        except Exception as e:
            print_(print_error(e)[-1])
            #url = setPage(url, page)
            break
        if customWidget is not None:
            customWidget.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
        else:
            print(len(imgs), 'imgs')
    if not imgs:
        raise Exception('no images')
    return imgs
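# Illustrative sketch, not part of the original module: on a 429 page the
# loop above sleeps 120 seconds in one-second slices so the UI can cancel
# mid-wait. The same pattern made generic; is_alive is a hypothetical
# callable standing in for customWidget.alive.
def _demo_interruptible_wait(seconds, is_alive):
    import time
    for _ in range(seconds):
        time.sleep(1)
        if not is_alive():
            return False  # caller should abort instead of retrying
    return True  # full wait elapsed; safe to retry the request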
def read(self):
    type = self.pixiv_type
    cw = self.customWidget
    print_ = cw.print_
    ui_setting = self.ui_setting

    if type == 'following':
        raise NotImplementedError('following')

    self._format = [None, 'gif', 'webp', 'png'][ui_setting.ugoira_convert.currentIndex()]
    self._format_name = compatstr(ui_setting.pixivFormat.currentText())
    types = [t.lower() for t in query_url(self.url).get('type', [])]
    if types:
        s = (u', ').join(sorted(types))
        types = set(types)
    else:
        s = 'all'
        types = None
    print_((u'Type: {}').format(s))
    print_((u'info: {}').format(self.info))
    api = self.api
    query = self.id.replace('_bmk', '').replace('_illust', '').replace('pixiv_', '').replace('search_', '')
    if type != 'search':
        query = int(query)
    print('pixiv_query:', query)
    try:
        if type in ('user', 'bookmark', 'search'):
            max_pid = get_max_range(cw, 2000)
            if ui_setting.groupBox_tag.isChecked():
                tags = [compatstr(ui_setting.tagList.item(i).text()) for i in range(ui_setting.tagList.count())]
            else:
                tags = []
            if type == 'search':
                query = query.replace('+', ' ')
                name = query
            else:
                id = self.id.replace('_bmk', '').replace('pixiv_', '').replace('search_', '')
                print('name', id)
                name = get_name(id, self.api, cw=cw)
                cw.artist = name
            title = u'{} ({})'.format(name, self.id)
            print_(title)
            dir = os.path.join(get_outdir('pixiv'), clean_title(title))
            imgs = get_imgs(query, type=type, api=api, n=max_pid, tags=tags,
                            types=types, format=self._format,
                            format_name=self._format_name, dir=dir,
                            cw=cw, title=title, info=self.info)
        elif type == 'illust':
            for try_ in range(N_TRY):
                try:
                    detail = api.illust_detail(query, req_auth=True)
                    error = detail.get('error')
                    if error:
                        raise PixivError(error)
                    break
                except PixivError as e:
                    api = e.api
                    print_(e)
                    if try_ < N_TRY - 1:
                        print_('retry...')
                        sleep(SLEEP)
                    else:
                        raise
            illust = detail.illust
            name = illust.title
            title = (u'{} ({})').format(name, self.id)
            dir = os.path.join(get_outdir('pixiv'), clean_title(title))
            imgs = get_imgs_from_illust(illust, api=api, format=self._format,
                                        dir=dir, cw=cw, format_name=self._format_name)
    except PixivError as e:
        msg = (u'PixivError: {}').format(e.message)
        return self.Invalid(msg)

    self.imgs = imgs

    for img in imgs:
        self.urls.append(img.url)
        self.filenames[img.url] = img.filename

    self.title = clean_title(title)  # 1390
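# Illustrative sketch, not part of the original module: read() derives the
# API query by stripping the bookkeeping affixes from self.id
# ('pixiv_12345_bmk' -> 12345); only search queries stay strings.
def _demo_normalize_query(id, type):
    query = (id.replace('_bmk', '').replace('_illust', '')
               .replace('pixiv_', '').replace('search_', ''))
    return query if type == 'search' else int(query)
    # _demo_normalize_query('pixiv_12345_bmk', 'bookmark') -> 12345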
def get_imgs(user_id, type='user', n=None, api=None, tags=[],
             types={'illust', 'manga', 'ugoira'}, format=None,
             format_name=None, dir='', cw=None, title=None, info=None):
    print('get_imgs', user_id, type, dir)
    if api is None:
        api = pixiv_auth.get_api()
    print_ = get_print(cw)
    imgs = []
    offset = 0
    bad = 0
    error = None
    tags_ = tags
    tags = set()
    tags_ex = set()
    for tag in tags_:
        tag = tag.strip().replace(' ', '').lower()
        if tag.startswith('-'):
            tags_ex.add(tag[1:].strip())
        else:
            tags.add(tag)
    print_((u'tags: [{}]').format((u', ').join(tags)))
    print_((u'tags_ex: [{}]').format((u', ').join(tags_ex)))
    max_id = None
    while True:
        if bad >= N_TRY:
            raise PixivError(error)
        if type == 'user':
            json_result = api.user_illusts(user_id, type=None, req_auth=True, filter=None, offset=offset)
        elif type == 'search':
            order = info['order']
            sorts = {
                'date_d': 'date_desc',
                'date': 'date_asc',
                'popular_d': 'popular_desc',
                'popular': 'popular_asc',
                'popular_female_d': 'popular_female_desc',
                'popular_female': 'popular_female_asc',
                'popular_male_d': 'popular_male_desc',
                'popular_male': 'popular_male_asc',
            }
            sort = sorts.get(order, 'date_desc')
            params = {
                'word': user_id,
                'search_target': 'partial_match_for_tags',
                'sort': sort,
                'filter': 'for_ios',
            }
            if offset:
                params['offset'] = offset
            if info.get('blt') is not None:
                params['bookmark_num_min'] = info['blt']
            if info.get('bgt') is not None:
                params['bookmark_num_max'] = info['bgt']
            if info.get('scd') is not None:
                params['start_date'] = info['scd']
            if info.get('ecd') is not None:
                params['end_date'] = info['ecd']
            print(params)
            #r = api.no_auth_requests_call('GET', '%s/v1/search/illust' % api.hosts, params=params, req_auth=True)
            #json_result = api.parse_result(r)
            method, url = api.api.search_illust
            r = api.requests_(method, url, params=params, auth=True)
            json_result = api.parse_json(r)
        elif type == 'bookmark':
            print('max_id:', max_id)
            json_result = api.user_bookmarks_illust(user_id, filter=None, max_bookmark_id=max_id, req_auth=True)
        else:
            raise Exception(('type "{}" is not supported').format(type))
        error = json_result.get('error')
        if error:
            print_(error)
            message = error.get('message', '')
            if 'Offset must be no more than' in message:
                break
            print_('retry...')
            sleep(SLEEP)
            bad += 1
            continue
        bad = 0
        illusts = json_result.illusts
        if len(illusts) == 0:
            break
        for p, illust in enumerate(illusts):
            print('illust: {}'.format(illust.id))
            tags_illust = set(tag['name'].strip().replace(' ', '').lower() for tag in illust.tags)
            if not tags or tags & tags_illust:
                if tags_ex.isdisjoint(tags_illust):
                    imgs += get_imgs_from_illust(illust, api=api, types=types,
                                                 format=format, format_name=format_name,
                                                 dir=dir, cw=cw)
            if cw is not None and (illust.type == 'ugoira' or p == len(illusts) - 1):
                cw.setTitle((u'{} {} ({})').format(tr_(u'읽는 중...'), title, len(imgs)))
            offset += 1
            if n is not None and len(imgs) >= n:
                break
        if type == 'bookmark':
            if json_result.next_url is None:
                break
            else:
                max_id = api.parse_qs(json_result.next_url)['max_bookmark_id']
        if n is not None and len(imgs) >= n:
            break
        if cw is not None and not cw.alive:
            break
    if not imgs:
        raise Exception('no imgs')
    return imgs[:n]
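# Illustrative sketch, not part of the original module: the tag filter above
# normalizes tags (lowercase, spaces removed), treats a leading '-' as
# exclusion, and keeps an illust only if it matches any include tag and no
# exclude tag. The same predicate in isolation:
def _demo_tag_filter(user_tags, illust_tags):
    include, exclude = set(), set()
    for tag in user_tags:
        tag = tag.strip().replace(' ', '').lower()
        if tag.startswith('-'):
            exclude.add(tag[1:])
        else:
            include.add(tag)
    illust_tags = {t.strip().replace(' ', '').lower() for t in illust_tags}
    return ((not include or bool(include & illust_tags))
            and exclude.isdisjoint(illust_tags))
    # _demo_tag_filter(['cat', '-dog'], ['Cat', 'sky']) -> True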
def get_imgs(uid, oid, title, session, cw=None, d=None, parent=None):
    print_ = get_print(cw)
    print_('uid: {}, oid:{}'.format(uid, oid))

    max_pid = get_max_range(cw)

    @try_n(4)
    def get_album_imgs(album, page):
        url = 'https://photo.weibo.com/photos/get_all?uid={}&album_id={}&count=30&page={}&type={}&__rnd={}'.format(uid, album.id, page, album.type, int(time() * 1000))
        referer = 'https://photo.weibo.com/{}/talbum/index'.format(uid)
        html = downloader.read_html(url, referer, session=session, timeout=30)
        j = json.loads(html)
        data = j['data']
        imgs = []
        for photo in data['photo_list']:
            host = photo['pic_host']
            name = photo['pic_name']
            id = photo['photo_id']
            timestamp = photo['timestamp']
            date = datetime.fromtimestamp(timestamp)
            t = '{:02}-{:02}-{:02}'.format(date.year % 100, date.month, date.day)
            url = '{}/large/{}'.format(host, name)
            ext = os.path.splitext(name)[1]
            filename = '[{}] {}{}'.format(t, id, ext)
            img = Image(url, filename, timestamp)
            imgs.append(img)
        return imgs

    @try_n(2)
    def get_albums(page):
        url = 'https://photo.weibo.com/albums/get_all?uid={}&page={}&count=20&__rnd={}'.format(uid, page, int(time() * 1000))
        referer = 'https://photo.weibo.com/{}/albums?rd=1'.format(uid)
        html = downloader.read_html(url, referer, session=session)
        if '<title>新浪通行证</title>' in html:
            raise errors.LoginRequired()
        j = json.loads(html)
        data = j['data']
        albums = []
        for album in data['album_list']:
            id = album['album_id']
            type = album['type']
            album = Album(id, type)
            albums.append(album)
        return albums

    albums = []
    for p in range(1, 101):
        albums_new = get_albums(p)
        albums += albums_new
        print_('p:{}, albums:{}'.format(p, len(albums)))
        if not albums_new:
            break

    imgs = []
    for album in albums:
        print('Album:', album.id, album.type)
        imgs_album = []
        for p in range(1, 101):
            imgs_new = get_album_imgs(album, p)
            imgs_album += imgs_new
            s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
            if cw:
                cw.setTitle(s)
            else:
                print(s)
            if len(imgs_album) >= max_pid:
                break
            if not imgs_new:
                break
            sleep(1)
        imgs += imgs_album

    imgs = sorted(imgs, key=lambda img: img.timestamp, reverse=True)
    return imgs[:max_pid]
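# Illustrative sketch, not part of the original module: each photo filename
# above is prefixed with a sortable two-digit date derived from the upload
# timestamp. The same formatting, isolated:
def _demo_weibo_filename(name, photo_id, timestamp):
    import os
    from datetime import datetime
    date = datetime.fromtimestamp(timestamp)
    t = '{:02}-{:02}-{:02}'.format(date.year % 100, date.month, date.day)
    return '[{}] {}{}'.format(t, photo_id, os.path.splitext(name)[1])
    # e.g. '[24-05-03] 42.jpg' for a photo uploaded on May 3, 2024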
def process_ids(ids, info, imgs, cw, depth=0, tags_add=None):
    print_ = get_print(cw)
    max_pid = get_max_range(cw)

    class Thread(threading.Thread):
        alive = True
        rem = 0

        def __init__(self, queue):
            super().__init__(daemon=True)
            self.queue = queue

        @classmethod
        @lock
        def add_rem(cls, x):
            cls.rem += x

        def run(self):
            while self.alive:
                try:
                    id_, res, i = self.queue.popleft()
                except Exception as e:
                    sleep(.1)
                    continue
                try:
                    info_illust = get_info('https://www.pixiv.net/en/artworks/{}'.format(id_), cw, depth=depth + 1, tags_add=tags_add)
                    res[i] = info_illust['imgs']
                except Exception as e:
                    if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました' or type(e) == errors.LoginRequired):  # logout during extraction
                        res[i] = e
                    print_('process_ids error (id: {}, d:{}):\n{}'.format(id_, depth, print_error(e)[0]))
                finally:
                    Thread.add_rem(-1)

    queue = deque()
    n, step = Downloader_pixiv.STEP
    print_('{} / {}'.format(n, step))
    ts = []
    for i in range(n):
        t = Thread(queue)
        t.start()
        ts.append(t)
    for i in range(0, len(ids), step):
        res = [[]] * step
        for j, id_illust in enumerate(ids[i:i + step]):
            queue.append((id_illust, res, j))
            Thread.add_rem(1)
        while Thread.rem:
            sleep(.001, cw)
        for imgs_ in res:
            if isinstance(imgs_, Exception):
                raise imgs_
            imgs += imgs_
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(imgs) >= max_pid:
            break
        if depth == 0:
            check_alive(cw)
    for t in ts:
        t.alive = False
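# Illustrative sketch, not part of the original module: process_ids fans work
# out to daemon threads through a shared deque plus a lock-guarded
# outstanding-task counter, then waits for the counter to hit zero before
# reading the shared results list. The core hand-off pattern, reduced to
# standard-library pieces:
def _demo_fanout(work_items, handler, n_threads=4):
    import threading, time
    from collections import deque
    queue = deque(enumerate(work_items))
    results = [None] * len(work_items)
    rem = [len(work_items)]            # outstanding-task counter
    rem_lock = threading.Lock()

    def run():
        while True:
            try:
                i, item = queue.popleft()  # deque.popleft() is thread-safe
            except IndexError:
                return
            try:
                results[i] = handler(item)
            finally:
                with rem_lock:
                    rem[0] -= 1       # decrement even if handler raised

    for _ in range(n_threads):
        threading.Thread(target=run, daemon=True).start()
    while rem[0]:                      # same busy-wait as the code above
        time.sleep(.001)
    return results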
def f(html, browser=None):
    soup = Soup(html)
    if is_captcha(soup):
        print('captcha')
        browser.show()
        sd['shown'] = True
    elif sd['shown'] and not SHOW:
        browser.hide()
        sd['shown'] = False
    try:
        st = soup.find('h2', class_='share-title')
        if st is None:
            st = soup.find('h2', class_=lambda c: c and 'ShareTitle' in c)
        info['uid'] = st.text.strip()
        st = soup.find('h1', class_='share-sub-title')
        if st is None:
            st = soup.find('h1', class_=lambda c: c and 'ShareSubTitle' in c)
        info['nickname'] = st.text.strip()
    except Exception as e:
        print_(print_error(e)[0])
    c = 0
    ids_now = set()
    items = soup.findAll('div', class_='video-feed-item') + soup.findAll('div', class_=lambda c: c and 'DivItemContainer' in c)
    for div in items:
        a = div.find('a')
        if a is None:
            continue
        href = a['href']
        if not href:
            continue
        m = re.search(PATTERN_VID, href)
        if m is None:
            continue
        id_video = int(m.group('id'))
        ids_now.add(id_video)
        if id_video in ids:
            continue
        ids.add(id_video)
        info['items'].append({'id': id_video})
        c += 1
    print_('items: {}'.format(len(info['items'])))
    if len(info['items']) >= max_pid:
        info['items'] = info['items'][:max_pid]
        return True
    browser.runJavaScript('window.scrollTo(0, document.body.scrollHeight);')
    sleep(15, cw)
    if c or (ids_now and min(ids_now) > min(ids)):
        sd['count_empty'] = 0
    else:
        print_('empty')
        sd['count_empty'] += 1
    msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info.get('nickname'), info.get('uid'), len(info['items']))
    if cw:
        if not cw.alive:
            raise Exception('cw dead')
        cw.setTitle(msg)
    else:
        print(msg)
    return sd['count_empty'] > 4
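# Illustrative sketch, not part of the original module: the browser callback
# above returns True (stop scrolling) either once max_pid items are collected
# or after five consecutive scrolls that surface nothing new. The stop
# condition in isolation; sd is the same mutable state dict carried across
# callback invocations.
def _demo_scroll_state(sd, found_new, n_items, max_pid):
    if found_new:
        sd['count_empty'] = 0
    else:
        sd['count_empty'] += 1
    return n_items >= max_pid or sd['count_empty'] > 4  # True -> stop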
def get_imgs(url, title=None, cw=None, d=None, types=['img', 'gif', 'video'], session=None):
    if False:  #
        raise NotImplementedError('Not Implemented')
    print_ = get_print(cw)
    print_('types: {}'.format(', '.join(types)))
    if 'chan.sankakucomplex' in url:
        type = 'chan'
    elif 'idol.sankakucomplex' in url:
        type = 'idol'
    else:
        raise Exception('Not supported subdomain')

    info = {}
    info['single'] = False

    if '/post/show/' in url:
        info['single'] = True
        id = get_id(url)
        info['imgs'] = [Image(type, id, url, None, cw=cw, d=d)]
        return info

    # Range
    max_pid = get_max_range(cw)

    local_ids = {}
    if cw is not None:
        dir = cw.downloader.dir
        try:
            names = os.listdir(dir)
        except Exception as e:
            print(e)
            names = []
        for name in names:
            id = os.path.splitext(name)[0]
            local_ids[id] = os.path.join(dir, name)

    imgs = []
    page = 1
    url_imgs = set()
    url_old = 'https://{}.sankakucomplex.com'.format(type)
    if cw is not None:
        cw.setTitle('{} {}'.format(tr_('읽는 중...'), title))
    while len(imgs) < max_pid:
        #if page > 25: # Anonymous users can only view 25 pages of results
        #    break
        wait(cw)
        #url = setPage(url, page)
        print_(url)
        try:
            html = downloader.read_html(url, referer=url_old, session=session)
        except Exception as e:  # 3366
            print_(print_error(e)[0])
            break
        if '429 Too many requests'.lower() in html.lower():
            print_('429 Too many requests... wait 120 secs')
            sleep(120, cw)
            continue
        page += 1
        url_old = url
        soup = Soup(html)
        articles = soup.findAll('span', {'class': 'thumb'})
        if not articles:
            break
        for article in articles:
            # 1183
            tags = article.find('img', class_='preview').attrs['title'].split()
            if 'animated_gif' in tags:
                type_ = 'gif'
            elif 'animated' in tags or 'webm' in tags or 'video' in tags or 'mp4' in tags:  # 1697
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            url_img = article.a.attrs['href']
            if not url_img.startswith('http'):
                url_img = urljoin('https://{}.sankakucomplex.com'.format(type), url_img)
            id = get_id(url_img)
            #print_(article)
            if id is None:  # sankaku plus
                continue
            if id in local_ids:
                #print('skip', id)
                local = True
            else:
                local = False
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                if local:
                    url_img = local_ids[id]
                img = Image(type, id, url_img, url, local=local, cw=cw, d=d)
                imgs.append(img)
                if len(imgs) >= max_pid:
                    break
        try:
            # For page > 50
            pagination = soup.find('div', class_='pagination')
            url = urljoin('https://{}.sankakucomplex.com'.format(type), pagination.attrs['next-page-url'])
            # 3366
            p = int(re.find(r'[?&]page=([0-9]+)', url, default=1))
            if p > 100:
                url = setPage(url, 100)
        except Exception as e:
            print_(print_error(e)[-1])
            #url = setPage(url, page)
            break
        if cw is not None:
            cw.setTitle('{} {} - {}'.format(tr_('읽는 중...'), title, len(imgs)))
        else:
            print(len(imgs), 'imgs')
    if not imgs:
        raise Exception('no images')
    info['imgs'] = imgs
    return info
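# Illustrative sketch, not part of the original module: past page 100 the
# loop above clamps the next-page URL back to page 100 (issue 3366). The
# clamp restated with the standard re module; setPage is this module's own
# helper, as used above.
def _demo_clamp_page(url):
    import re as re_std
    m = re_std.search(r'[?&]page=([0-9]+)', url)
    p = int(m.group(1)) if m else 1
    return setPage(url, 100) if p > 100 else url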
def get_imgs(url, n_max=2000, title=None, cw=None, session=None):
    print_ = get_print(cw)

    for try_ in range(4):
        try:
            html = read_html(url, session, cw)
            m = re.search('"edge_owner_to_timeline_media":{"count":([0-9]+)', html)
            if m is None:
                raise Exception('Invalid page')
            break
        except Exception as e:
            e_ = e
            print_(print_error(e)[0])
    else:
        raise e_

    n = int(m.groups()[0])
    n = min(n, n_max)

    data = get_sd(url, html=html, cw=cw)

    uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
    csrf_token = data['config']['csrf_token']
    session.cookies.set(name='ig_pr', value='1', path='/', domain='.instagram.com')

    cursor = ''
    edges = []
    bad = 0
    while True:
        check_alive(cw)

        variables = {
            'id': uploader_id,
            'first': 12,
        }
        if cursor:
            variables['after'] = cursor
        #print_(variables)

        media = None
        try:
            j = get_query('003056d32c2554def87228bc3fd9668a', variables, session, cw)
            media = j['data']['user']['edge_owner_to_timeline_media']
            sleep(2)
        except Exception as e:
            if bad > 10:
                raise Exception('no media')
            else:
                print_(u'no media.. retry... ({}) {}'.format(bad + 1, print_error(e)[0]))
                sleep(12 * bad, cw)
                bad += 1
                continue
        bad = 0

        edges_new = media.get('edges')
        if not edges_new or not isinstance(edges_new, list):
            print('no edges_new')
            break

        edges += edges_new

        s = u'{} {} ({}/{})'.format(tr_(u'읽는 중...'), title, len(edges), n)
        if cw is not None:
            cw.setTitle(s)
            if not cw.alive:
                return []
        else:
            print(s)

        if len(edges) >= n:
            break

        page_info = media.get('page_info')
        if not page_info:
            break
        if not page_info.get('has_next_page'):
            break
        cursor = page_info.get('end_cursor')
        if not cursor:
            break

    if len(edges) <= n / 2:
        raise Exception(u'Too short: {} / {}'.format(len(edges), n))

    imgs = []
    for edge in edges:
        node = edge['node']
        type = node['__typename']
        id = node['shortcode']
        url = u'https://www.instagram.com/p/{}/'.format(id)
##        if type in ['GraphVideo', 'GraphImage']:
##            single = True
##        else:
##            single = False
        for img in Node(url, session=session, cw=cw, media=node).imgs:
            imgs.append(img)
        if len(imgs) >= n_max:
            break

    return imgs
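# Illustrative sketch, not part of the original module: the media loop above
# retries a failed GraphQL page with a linearly growing delay (0s, 12s,
# 24s, ...) and gives up after roughly ten consecutive failures. The backoff
# skeleton, with fetch as a hypothetical callable:
def _demo_backoff_fetch(fetch, n_max=10):
    import time
    bad = 0
    while True:
        try:
            return fetch()
        except Exception:
            if bad > n_max:
                raise
            time.sleep(12 * bad)  # delay grows with each consecutive failure
            bad += 1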