def process_ids(ids, info, imgs, cw, depth=0):
    print_ = get_print(cw)
    max_pid = get_max_range(cw)
    for i, id_illust in enumerate(ids):
        try:
            info_illust = get_info('https://www.pixiv.net/en/artworks/{}'.format(id_illust), cw, depth=depth + 1)
        except Exception as e:
            # '不明なエラーが発生しました' is Pixiv's "An unknown error occurred" message
            if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました' or type(e) == errors.LoginRequired):  # logout during extraction
                raise e
            print_('process_ids error ({}):\n{}'.format(depth, print_error(e)[0]))
            continue
        imgs += info_illust['imgs']
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))  # '읽는 중...' = 'Reading...'
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(imgs) >= max_pid:
            break
        if depth == 0:
            check_alive(cw)
def get_videos(url, type='video', only_mp4=False, audio_included=False, max_res=None, max_abr=None, cw=None):
    info = {}
    n = get_max_range(cw)
    if '/channel/' in url or '/user/' in url or '/c/' in url:
        info = read_channel(url, n=n, cw=cw)
        info['type'] = 'channel'
        info['title'] = u'[Channel] {}'.format(info['uploader'])
        if cw:
            info['urls'] = filter_range(info['urls'], cw.range)
    elif '/playlist' in url:
        info = read_playlist(url, n=n, cw=cw)
        info['type'] = 'playlist'
        info['title'] = u'[Playlist] {}'.format(info['title'])
        if cw:
            info['urls'] = filter_range(info['urls'], cw.range)
    else:
        info['type'] = 'single'
        info['urls'] = [url]
    info['videos'] = [Video(url, type, only_mp4, audio_included, max_res, max_abr, cw) for url in info['urls']]
    return info
def get_ids_multi(q, popular, cw=None):
    print_ = get_print(cw)
    max_pid = get_max_range(cw)
    qs = q.split(' ')
    qs_pos = [q for q in qs if not q.startswith('-')]
    qs_neg = [q[1:] for q in qs if q.startswith('-')]
    q = qs_pos[0] if qs_pos else None
    ids = get_ids(q, popular, cw)
    print_('{}: {}'.format(q, len(ids)))

    # Positive
    for q in qs_pos[1:]:
        ids_ = get_ids(q, popular, cw)
        set_ids_ = set(ids_)
        ids_old = ids
        ids = []
        for id in ids_old:
            if id in set_ids_:
                ids.append(id)
        print_('{}: {} ({})'.format(q, len(ids_), len(ids)))

    # Negative
    for q in qs_neg:
        ids_ = get_ids(q, popular, cw)
        set_ids_ = set(ids_)
        ids_old = ids
        ids = []
        for id in ids_old:
            if id not in set_ids_:
                ids.append(id)
        print_('-{}: {} ({})'.format(q, len(ids_), len(ids)))

    return ids[:max_pid]
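# A minimal, self-contained sketch of the include/exclude filtering that
# get_ids_multi performs above: each additional positive term intersects the
# id list and each negative term subtracts from it, preserving the original
# ordering. The ids and sets below are made up for illustration.
def filter_ids(ids, positive_sets, negative_sets):
    for s in positive_sets:
        ids = [i for i in ids if i in s]       # intersection, order kept
    for s in negative_sets:
        ids = [i for i in ids if i not in s]   # difference, order kept
    return ids

assert filter_ids([1, 2, 3, 4], [{2, 3, 4}], [{4}]) == [2, 3]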
def get_imgs(id, api, cw=None, title=None, type='board'):
    n = get_max_range(cw)
    imgs = []
    ids = set()
    print('get_imgs: type={}'.format(type))
    if type == 'board':
        gen = api.board_pins(id)
    elif type == 'section':
        gen = api.board_section_pins(id)
    elif type == 'pin':
        gen = [api.pin(id)]
    else:
        raise Exception(u'Type "{}" is not supported'.format(type))
    for img in gen:
        if 'images' not in img:
            print('skip img:', img['id'])
            continue
        img = Image(img)
        if img.id in ids:
            print('duplicate:', img.id)
            continue
        ids.add(img.id)
        print(img.url)
        print(img.filename)
        print()
        imgs.append(img)
        if len(imgs) >= n:
            break
        if cw is not None:
            if not cw.alive:
                return []
            cw.setTitle(u'{} {} ({})'.format(tr_(u'읽는 중...'), title, len(imgs)))
    return imgs
def get_imgs(url, title=None, cw=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')
    tags = get_tags(url)
    tags = quote(tags, safe='/')
    tags = tags.replace('%20', '+')
    url = 'https://gelbooru.com/index.php?page=post&s=list&tags={}'.format(tags)
    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw, 2000)

    imgs = []
    ids = set()
    count_no_imgs = 0
    for p in range(500):  # 1017
        url = setPage(url, len(ids))
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        posts = soup.findAll('div', class_='thumbnail-preview')
        imgs_new = []
        for post in posts:
            id_ = int(re.find('[0-9]+', post.find('a')['id'], err='no id'))
            if id_ in ids:
                print('duplicate:', id_)
                continue
            ids.add(id_)
            url_img = urljoin(url, post.find('a')['href'])
            img = Image(id_, url_img)
            imgs_new.append(img)
        if imgs_new:
            imgs += imgs_new
            count_no_imgs = 0
        else:
            print('no imgs')
            count_no_imgs += 1
            if count_no_imgs > 1:
                print('break')
                break
        if len(imgs) >= max_pid:
            break
        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
    if not imgs:
        raise Exception('no imgs')
    return imgs
def get_imgs_more(username, session, title, types, n=None, format='[%y-%m-%d] id_ppage', cw=None, mode='media', method='tab', imgs=None):
    print_ = get_print(cw)
    imgs = imgs or []
    print_('imgs: {}, types: {}'.format(len(imgs), ', '.join(types)))
    artist, username = get_artist_username(username, session)  #

    # Range
    n = max(n or 0, get_max_range(cw))

    ids_set = set(img.id for img in imgs)
    count_no_imgs = 0
    filter_ = '' if options.get('experimental') else ' filter:media'  # 2687
    while len(imgs) < n:
        if ids_set:
            max_id = min(ids_set) - 1
            q = 'from:{} max_id:{} exclude:retweets{} -filter:periscope'.format(username, max_id, filter_)
        else:
            q = 'from:{} exclude:retweets{} -filter:periscope'.format(username, filter_)
        print(q)
        tweets = []
        for tweet in list(TwitterAPI(session, cw).search(q)):
            id = int(tweet['id'])
            if id in ids_set:
                print_('duplicate: {}'.format(id))
                continue
            ids_set.add(id)
            tweets.append(tweet)
        if tweets:
            count_no_imgs = 0
        else:
            count_no_imgs += 1
            change_ua(session)
            if count_no_imgs >= 3:
                break
            print_('retry...')
            continue
        for tweet in tweets:
            imgs += get_imgs_from_tweet(tweet, session, types, format, cw)
        msg = '{} {} (@{}) - {}'.format(tr_('읽는 중...'), artist, username, len(imgs))
        if cw and not cw.alive:
            break
        if cw:
            cw.setTitle(msg)
        else:
            print(msg)
    return imgs
def get_imgs_all(url, title=None, cw=None, d=None, session=None, stories=True):
    max_pid = get_max_range(cw)
    url = clean_url(url)
    if stories:
        imgs_str = get_stories(url, title, cw=cw, session=session)
    else:
        imgs_str = []
    max_pid = max(0, max_pid - len(imgs_str))
    imgs = get_imgs(url, max_pid, title=title, cw=cw, session=session)
    return imgs_str + imgs[:max_pid]
def get_imgs(url, title=None, cw=None):
    print_ = get_print(cw)
    url = clean_url(url)
    id = get_id(url)
    url = u'https://nijie.info/members_illust.php?id={}'.format(id)

    # Range
    max_pid = get_max_range(cw)

    imgs = []
    url_imgs = set()
    for p in range(1, 1 + 100):
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        posts = soup.findAll('div', class_='nijie')
        if not posts:
            print('no posts')
            break
        c = 0
        for post in posts:
            url_img = urljoin(url, post.a.attrs['href'])
            if url_img in url_imgs:
                print('duplicate:', url_img)
                continue
            url_imgs.add(url_img)
            id = int(re.find('[?&]id=([0-9]+)', url_img))
            multi = post.find('div', class_='thumbnail-icon')
            if multi:
                imgs_ = get_imgs_post(id, url_img)  #
            else:
                imgs_ = [Image(id, url_img, 0)]
            imgs += imgs_
            c += 1
            if len(imgs) >= max_pid:
                break
        msg = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
        if cw:
            if not cw.alive:
                return
            cw.setTitle(msg)
        else:
            print(msg)
        if len(imgs) >= max_pid or c == 0:
            break
    return imgs
def get_imgs_channel(url, html=None, cw=None):
    print_ = get_print(cw)
    if html is None:
        html = downloader.read_html(url)
    info = get_info(url, html)

    # Range
    max_pid = get_max_range(cw)

    ids = set()
    imgs = []
    for p in range(1000):
        url_api = 'https://bcy.net/apiv3/user/selfPosts?uid={}'.format(info['uid'])
        if imgs:
            url_api += '&since={}'.format(imgs[-1].id)
        data_raw = downloader.read_html(url_api, url)
        data = json.loads(data_raw)['data']
        items = data['items']
        if not items:
            print('no items')
            break
        c = 0
        for item in items:
            check_alive(cw)
            id = item['item_detail']['item_id']
            if id in ids:
                print('duplicate')
                continue
            c += 1
            ids.add(id)
            url_single = u'https://bcy.net/item/detail/{}'.format(id)
            imgs_single = get_imgs(url_single, cw=cw)
            print_(str(id))
            for p, img in enumerate(imgs_single):
                img = Image(img._url, url_single, id, p)
                imgs.append(img)
            s = u'{} {} - {}'.format(tr_(u'읽는 중...'), info['artist'], min(len(imgs), max_pid))
            if cw:
                cw.setTitle(s)
            else:
                print(s)
            if len(imgs) >= max_pid:
                break
        if not c:
            print('not c')
            break
        if len(imgs) >= max_pid:
            print('over max_pid:', max_pid)
            break
    return imgs[:max_pid]
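# A minimal sketch of the cursor-style pagination get_imgs_channel uses above:
# each request passes the id of the last item already collected as a "since"
# cursor, and the loop stops on an empty page or once enough items are
# gathered. fetch_page() is a hypothetical stand-in for the API call.
def paginate_by_cursor(fetch_page, max_items):
    items, cursor = [], None
    while len(items) < max_items:
        page = fetch_page(cursor)
        if not page:
            break
        items += page
        cursor = page[-1]  # the last id seen becomes the next cursor
    return items[:max_items]

_pages = {None: [1, 2], 2: [3, 4], 4: []}
assert paginate_by_cursor(lambda c: _pages[c], 10) == [1, 2, 3, 4]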
def read_channel(url_page, cw=None):
    print_ = get_print(cw)
    res = re.find(CHANNEL_PATTERN, url_page)
    if res is None:
        raise Exception('Not channel')
    header, username = res
    print(header, username)
    max_pid = get_max_range(cw)
    info = {}
    info['header'] = header
    info['username'] = username
    session = Session()
    urls = []
    ids = set()
    for p in range(100):
        url_api = urljoin(url_page, '/{}/{}/videos/best/{}'.format(header, username, p))
        print_(url_api)
        r = session.post(url_api)
        data = json.loads(r.text)
        videos = data.get('videos')  # 4530
        if not videos:
            print_('empty')
            break
        for video in videos:
            id_ = video['id']
            if id_ in ids:
                print_('duplicate: {}'.format(id_))
                continue
            ids.add(id_)
            info['name'] = video['pn']
            urls.append(urljoin(url_page, video['u']))
        if len(urls) >= max_pid:
            break
        n = data['nb_videos']
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(ids) >= n:
            break
        sleep(1, cw)
    if not urls:
        raise Exception('no videos')
    info['urls'] = urls[:max_pid]
    return info
def get_imgs(url, title=None, customWidget=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')
    if 'page=dapi' not in url.lower():
        tags = get_tags(url).replace(' ', '+')
        print(tags)
        tags = urllib.quote(tags, safe='/+')
        url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(tags, 0, LIMIT)
    if customWidget is not None:
        print_ = customWidget.print_
    else:
        def print_(*values):
            sys.stdout.writelines(values + ('\n',))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    imgs = []
    url_imgs = set()
    for p in range(500):  # 1017
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        posts = soup.findAll('post')
        if not posts:
            break
        for post in posts:
            url_img = post.attrs['file_url']
            if url_img in url_imgs:
                print('already exists', url_img)
            else:
                url_imgs.add(url_img)
                id = post.attrs['id']
                img = Image(id, url_img)
                imgs.append(img)
        if len(imgs) >= max_pid:
            break
        if customWidget is not None:
            if not customWidget.alive:
                break
            customWidget.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
    return imgs
def read_channel(url_page, cw=None):
    print_ = get_print(cw)
    res = re.find(CHANNEL_PATTERN, url_page)
    if res is None:
        raise Exception('Not channel')
    header, username = res
    print(header, username)
    max_pid = get_max_range(cw, 2000)
    info = {}
    info['header'] = header
    info['username'] = username
    session = Session()
    urls = []
    urls_set = set()
    for p in range(100):
        url_api = urljoin(url_page, '/{}/{}/videos/best/{}'.format(header, username, p))
        print(url_api)
        r = session.post(url_api, data='main_cats=false')
        soup = Soup(r.text)
        thumbs = soup.findAll('div', class_='thumb-block')
        if not thumbs:
            print_('empty')
            break
        for thumb in thumbs:
            info['name'] = thumb.find('span', class_='name').text.strip()
            href = thumb.find('a')['href']
            href = urljoin(url_page, href)
            if href in urls_set:
                print_('duplicate: {}'.format(href))
                continue
            urls_set.add(href)
            urls.append(href)
        if len(urls) >= max_pid:
            break
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))
        if cw:
            if not cw.alive:
                return
            cw.setTitle(s)
        else:
            print(s)
    if not urls:
        raise Exception('no videos')
    info['urls'] = urls[:max_pid]
    return info
def get_imgs(url, title=None, cw=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')
    if 'page=dapi' not in url.lower():
        tags = get_tags(url)
        tags = quote(tags, safe='/')
        tags = tags.replace('%20', '+')
        url = "https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(tags, 0, LIMIT)
    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw)

    imgs = []
    ids = set()
    for p in range(500):  # 1017
        url = setPage(url, p)
        print_(url)
        html = try_n(4, sleep=30)(downloader.read_html)(url)  # 3340
        soup = Soup(html)
        posts = soup.findAll('post')
        if not posts:
            break
        for post in posts:
            id_ = post.attrs['id']
            if id_ in ids:
                print('duplicate:', id_)
                continue
            ids.add(id_)
            url_img = post.attrs['file_url']
            img = Image(id_, url_img)
            imgs.append(img)
        if len(imgs) >= max_pid:
            break
        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
    return imgs
def get_imgs(url, soup=None, cw=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)
    title = get_title(soup)
    n = get_max_range(cw)
    imgs = []
    p = 1
    while True:
        imgs_new = get_imgs_p(url, p)
        if not imgs_new:
            break
        imgs += imgs_new
        update(cw, title, imgs)
        p += 1
        if len(imgs) >= n:
            break
    return imgs[:n]
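# A compact, self-contained sketch of the page-number loop above (the
# counterpart to the cursor variant sketched earlier): fetch page 1, 2, 3, ...
# until a page comes back empty or the range limit n is reached, then truncate
# to exactly n items. get_page() is a hypothetical fetcher.
def collect_pages(get_page, n):
    items, p = [], 1
    while len(items) < n:
        page = get_page(p)
        if not page:
            break
        items += page
        p += 1
    return items[:n]

assert collect_pages(lambda p: [0, 1, 2] if p <= 2 else [], 5) == [0, 1, 2, 0, 1]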
def get_videos(url, cw=None):
    print_ = get_print(cw)
    print_(f'get_videos: {url}')
    info = {}
    options = {
        'extract_flat': True,
        'playlistend': get_max_range(cw),
    }
    videos = []
    ydl = ytdl.YoutubeDL(options, cw=cw)
    info = ydl.extract_info(url)
    for e in info['entries']:
        video = Video(e['url'], cw)
        video.id = int(e['id'])
        videos.append(video)
        if 'name' not in info:
            info['name'] = ydl.extract_info(e['url'])['creator']
    if not videos:
        raise Exception('no videos')
    info['videos'] = sorted(videos, key=lambda video: video.id, reverse=True)
    return info
def get_imgs(username, session, cw=None):
    print_ = get_print(cw)
    artist = get_name(username, session)
    imgs = []
    error_count = 0
    max_pid = get_max_range(cw)
    api = TumblrAPI(session, cw)
    for post in api.posts(username):
        imgs += post.imgs
        s = '{} {} (tumblr_{}) - {}'.format(tr_(u'읽는 중...'), artist, username, len(imgs))
        if cw:
            if not cw.alive:
                return
            cw.setTitle(s)
        else:
            print(s)
        if len(imgs) > max_pid:
            break
    return imgs[:max_pid]
def get_imgs(url, title=None, range_=None, cw=None):
    if 'donmai.us/artists' in url:
        raise NotImplementedError('Not Implemented')
    if 'donmai.us/posts/' in url:
        raise NotImplementedError('Not Implemented')
    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw)

    if range_ is None:
        range_ = range(1, 101)
    print(range_)
    imgs = []
    i = 0
    empty_count = 0
    empty_count_global = 0
    url_imgs = set()
    while i < len(range_):
        p = range_[i]
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        articles = soup.findAll('article')
        if articles:
            empty_count_global = 0
        else:
            empty_count += 1
            if empty_count < 4:
                s = u'empty page; retry... {}'.format(p)
                print_(s)
                continue
            else:
                empty_count = 0
                empty_count_global += 1
                if empty_count_global >= 6:
                    break
        for article in articles:
            id = article.attrs['data-id']
            url_img = article.attrs['data-file-url'].strip()
            if not url_img.startswith(('http://', 'https://')):
                url_img = 'https://{}donmai.us'.format('danbooru.' if 'danbooru.' in url else '') + url_img
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                img = Image(id, url_img)
                imgs.append(img)
        if len(imgs) >= max_pid:
            break
        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
        i += 1
    return imgs
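# Self-contained sketch of the retry bookkeeping in the loop above: an empty
# page is retried a few times before being counted as exhausted, and only a
# run of consecutive exhausted pages aborts the whole crawl. fetch() is a
# hypothetical stand-in; the thresholds mirror the 4/6 used above.
def crawl(fetch, pages, max_retry=4, max_empty_pages=6):
    out, empty, empty_global, i = [], 0, 0, 0
    while i < len(pages):
        items = fetch(pages[i])
        if items:
            empty_global = 0
            out += items
        else:
            empty += 1
            if empty < max_retry:
                continue        # retry the same page
            empty = 0
            empty_global += 1
            if empty_global >= max_empty_pages:
                break           # too many dead pages in a row
        i += 1
    return out

assert crawl(lambda p: [p] if p % 2 else [], [1, 2, 3]) == [1, 3]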
def get_imgs(url, title=None, customWidget=None, d=None, types=['img', 'gif', 'video'], session=None):
    if False:  #
        raise NotImplementedError('Not Implemented')
    print_ = get_print(customWidget)
    print_(u'types: {}'.format(', '.join(types)))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    local_ids = {}
    if customWidget is not None:
        dir = customWidget.downloader.dir
        try:
            names = os.listdir(dir)
        except Exception as e:
            print(e)
            names = []
        for name in names:
            id = os.path.splitext(name)[0]
            local_ids[id] = os.path.join(dir, name)

    imgs = []
    page = 1
    url_imgs = set()
    if 'chan.sankakucomplex' in url:
        type = 'chan'
    elif 'idol.sankakucomplex' in url:
        type = 'idol'
    else:
        raise Exception('Not supported subdomain')
    url_old = 'https://{}.sankakucomplex.com'.format(type)
    if customWidget is not None:
        customWidget.exec_queue.put((customWidget, u"customWidget.setTitle(u'{} {}')".format(tr_(u'읽는 중...'), title)))
    while len(imgs) < max_pid:
        #if page > 25: # Anonymous users can only view 25 pages of results
        #    break
        sleep(1)  #
        #url = setPage(url, page)
        print_(url)
        html = downloader.read_html(url, referer=url_old, session=session)
        if '429 Too many requests'.lower() in html.lower():
            print_('429 Too many requests... wait 120 secs')
            for i in range(120):
                sleep(1)
                if customWidget and not customWidget.alive:
                    return []
            continue
        page += 1
        url_old = url
        soup = Soup(html)
        articles = soup.findAll('span', {'class': 'thumb'})
        if not articles:
            break
        for article in articles:
            # 1183
            tags = article.find('img', class_='preview').attrs['title'].split()
            if 'animated_gif' in tags:
                type_ = 'gif'
            elif 'animated' in tags or 'webm' in tags or 'video' in tags or 'mp4' in tags:  # 1697
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            url_img = article.a.attrs['href']
            if not url_img.startswith('http'):
                url_img = urljoin('https://{}.sankakucomplex.com'.format(type), url_img)
            id = re.find('show/([0-9]+)', url_img)
            print_(article)
            if id is None:  # sankaku plus
                continue
            if id in local_ids:
                #print('skip', id)
                local = True
            else:
                local = False
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                if local:
                    url_img = local_ids[id]
                img = Image(type, id, url_img, url, local=local, cw=customWidget, d=d)
                imgs.append(img)
            if len(imgs) >= max_pid:
                break
        if customWidget and not customWidget.alive:
            break
        try:
            # For page > 50
            pagination = soup.find('div', class_='pagination')
            url = urljoin('https://{}.sankakucomplex.com'.format(type), pagination.attrs['next-page-url'])
        except Exception as e:
            print_(print_error(e)[-1])
            #url = setPage(url, page)
            break
        if customWidget is not None:
            customWidget.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
        else:
            print(len(imgs), 'imgs')
    if not imgs:
        raise Exception('no images')
    return imgs
def get_imgs(url, info=None, cw=None):
    print('get_imgs', url)
    if info is None:
        info = get_info(url)
    imgs = []

    # Range
    max_pid = get_max_range(cw)

    if info['type'] == 'a':
        if 'album_images' in info:  # legacy
            imgs_ = info['album_images']['images']
        elif 'media' in info:  # new
            imgs_ = info['media']
        else:  # legacy
            imgs_ = [info]
        for img in imgs_:
            img_url = img.get('url')  # new
            if not img_url:  # legacy
                hash = img['hash']
                ext = img['ext']
                img_url = 'https://i.imgur.com/{}{}'.format(hash, ext)
            if img_url in imgs:
                continue
            imgs.append(img_url)
    elif info['type'] == 'r':
        urls = set()
        for p in range(100):
            url_api = 'https://imgur.com/r/{}/new/page/{}/hit?scrolled'.format(info['title'], p)
            print(url_api)
            html = downloader.read_html(url_api, referer=url)
            soup = Soup(html)
            c = 0
            for post in soup.findAll('div', class_='post'):
                a = post.find('a', class_='image-list-link')
                url_post = urljoin(url, a.attrs['href'])
                if url_post in urls:
                    continue
                urls.add(url_post)
                c += 1
                try:  # for r18 images
                    imgs += get_imgs(url_post)
                except Exception as e:
                    print(e)
            s = u'{} {} ({})'.format(tr_(u'읽는 중...'), info['title'], len(imgs))
            if cw is not None:
                if cw.alive:
                    cw.setTitle(s)
                else:
                    return []
            else:
                print(s)
            if c == 0:
                print('same; break')
                break
    return imgs
def get_imgs(uid, oid, title, session, cw=None, d=None, parent=None):
    print_ = get_print(cw)
    print_('uid: {}, oid:{}'.format(uid, oid))
    max_pid = get_max_range(cw)

    @try_n(4)
    def get_album_imgs(album, page):
        url = 'https://photo.weibo.com/photos/get_all?uid={}&album_id={}&count=30&page={}&type={}&__rnd={}'.format(uid, album.id, page, album.type, int(time() * 1000))
        referer = 'https://photo.weibo.com/{}/talbum/index'.format(uid)
        html = downloader.read_html(url, referer, session=session, timeout=30)
        j = json.loads(html)
        data = j['data']
        imgs = []
        for photo in data['photo_list']:
            host = photo['pic_host']
            name = photo['pic_name']
            id = photo['photo_id']
            timestamp = photo['timestamp']
            date = datetime.fromtimestamp(timestamp)
            t = '{:02}-{:02}-{:02}'.format(date.year % 100, date.month, date.day)
            url = '{}/large/{}'.format(host, name)
            ext = os.path.splitext(name)[1]
            filename = '[{}] {}{}'.format(t, id, ext)
            img = Image(url, filename, timestamp)
            imgs.append(img)
        return imgs

    def get_albums(page):
        url = 'https://photo.weibo.com/albums/get_all?uid={}&page={}&count=20&__rnd={}'.format(uid, page, int(time() * 1000))
        referer = 'https://photo.weibo.com/{}/albums?rd=1'.format(uid)
        html = downloader.read_html(url, referer, session=session)
        j = json.loads(html)
        data = j['data']
        albums = []
        for album in data['album_list']:
            id = album['album_id']
            type = album['type']
            album = Album(id, type)
            albums.append(album)
        return albums

    albums = []
    for p in range(1, 101):
        albums_new = get_albums(p)
        albums += albums_new
        print_('p:{}, albums:{}'.format(p, len(albums)))
        if not albums_new:
            break

    imgs = []
    for album in albums:
        print('Album:', album.id, album.type)
        imgs_album = []
        for p in range(1, 101):
            imgs_new = get_album_imgs(album, p)
            imgs_album += imgs_new
            s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
            if cw:
                cw.setTitle(s)
            else:
                print(s)
            if len(imgs_album) >= max_pid:
                break
            if not imgs_new:
                break
            sleep(1)
        imgs += imgs_album

    imgs = sorted(imgs, key=lambda img: img.timestamp, reverse=True)
    return imgs[:max_pid]
def get_info(url, cw=None, depth=0):
    print_ = get_print(cw)
    api = PixivAPI()
    info = {}
    imgs = []

    if utils.ui_setting:
        ugoira_ext = [None, '.gif', '.webp', '.png'][utils.ui_setting.ugoira_convert.currentIndex()]
    else:
        ugoira_ext = None
    if utils.ui_setting:
        format_ = compatstr(utils.ui_setting.pixivFormat.currentText())
    else:
        format_ = 'id_ppage'

    max_pid = get_max_range(cw)

    if api.illust_id(url):  # Single post
        id_ = api.illust_id(url)
        data = api.illust(id_)
        login = '******' not in data  # the marker string checked here was redacted in this snippet
        if FORCE_LOGIN and not login:  #
            raise errors.LoginRequired()
        if data['xRestrict'] and not login:
            raise errors.LoginRequired('R-18')
        info['artist'] = data['userName']
        info['artist_id'] = data['userId']
        info['raw_title'] = data['illustTitle']
        info['title'] = '{} (pixiv_illust_{})'.format(info['raw_title'], id_)
        info['create_date'] = parse_time(data['createDate'])
        tags_illust = set(tag['tag'] for tag in data['tags']['tags'])
        if tags_matched(tags_illust, cw):
            if data['illustType'] == 2:  # ugoira
                data = api.ugoira_meta(id_)
                ugoira = {
                    'ext': ugoira_ext,
                    'delay': [frame['delay'] for frame in data['frames']],
                }
                img = Image(data['originalSrc'], url, id_, 0, format_, info, cw, ugoira=ugoira)
                imgs.append(img)
            else:
                data = api.pages(id_)
                for img in data:
                    img = Image(img['urls']['original'], url, id_, len(imgs), format_, info, cw)
                    imgs.append(img)
        else:
            print('tags mismatched')
    elif '/bookmarks/' in url or 'bookmark.php' in url:  # User bookmarks
        id_ = api.user_id(url)
        if id_ is None:  #
            id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_bmk_{})'.format(info['artist'], info['artist_id'])
        ids = []
        ids_set = set()
        offset = 0
        while len(ids) < max_pid:
            data = api.bookmarks(id_, offset)
            c = 0
            for id in [work['id'] for work in data['works']]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            offset += LIMIT
            if depth == 0:
                check_alive(cw)
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif '/tags/' in url or 'search.php' in url:  # Search
        q = unquote(re.find(r'/tags/([^/]+)', url) or re.find('[?&]word=([^&]*)', url, err='no tags'))
        info['title'] = '{} (pixiv_search_{})'.format(q, q.replace(' ', '+'))
        qs = query_url(url)
        order = qs.get('order', ['date_d'])[0]
        mode = qs.get('mode', ['all'])[0]
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.search(q, order, mode, p=p)
            c = 0
            for id in [illust['id'] for illust in data['illustManga']['data'] if 'id' in illust]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif 'bookmark_new_illust.php' in url or 'bookmark_new_illust_r18.php' in url:  # Newest works: Following
        r18 = 'bookmark_new_illust_r18.php' in url
        id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_following_{}{})'.format(info['artist'], 'r18_' if r18 else '', info['artist_id'])
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            c = 0
            for id in api.following(p, r18=r18):
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif api.user_id(url):  # User illusts
        id_ = api.user_id(url)
        process_user(id_, info, api)
        data = api.profile(id_)
        info['title'] = '{} (pixiv_{})'.format(info['artist'], info['artist_id'])
        ids = []
        for illusts in [data['illusts'], data['manga']]:
            if not illusts:
                continue
            ids += list(illusts.keys())
        ids = sorted(ids, key=int, reverse=True)
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    else:
        raise NotImplementedError()
    info['imgs'] = imgs[:max_pid]
    return info
def get_imgs(username, session, title, types, n=0, format='[%y-%m-%d] id_ppage', cw=None):
    print_ = get_print(cw)
##    try:
##        return get_imgs_legacy(username, session, title, types, n, format, cw)
##    except Exception as e:
##        print_(print_error(e)[-1])

    # Range
    n = max(n, get_max_range(cw))

    # 2303
    ids = set()
    names = dict()
    dir_ = os.path.join(get_outdir('twitter'), title)
    if os.path.isdir(dir_):
        for name in os.listdir(dir_):
            id_ = re.find('([0-9]+)_p', name)
            if id_ is None:
                continue
            if get_ext(name).lower() == '.mp4':
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            id_ = int(id_)
            ids.add(id_)
            if id_ in names:
                names[id_].append(name)
            else:
                names[id_] = [name]
    max_id = max(ids) if ids else 0

    imgs = []
    enough = False
    for tweet in TwitterAPI(session, cw).timeline_media(username):
        imgs += get_imgs_from_tweet(tweet, session, types, format, cw)
        if n is not None and len(imgs) >= n:
            break
        id_ = int(tweet['id_str'])
        if id_ < max_id:
            print_('enough')
            enough = True
            break
        msg = '{} {} - {}'.format(tr_('읽는 중...'), title, len(imgs))
        if cw:
            if not cw.alive:
                break
            cw.setTitle(msg)
        else:
            print(msg)
    if not imgs:
        raise Exception('no imgs')

    if not enough and len(imgs) < n:
        imgs = get_imgs_legacy(username, session, title, types, n, format, cw, method='search', imgs=imgs)

    # 2303
    ids_new = set()
    for img in imgs:
        ids_new.add(img.id)
    for id_ in sorted(ids, reverse=True):
        if id_ in ids_new:
            continue
        imgs += sorted(os.path.join(dir_, name) for name in names[id_])
    return imgs
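# Self-contained sketch of the local-cache merge performed above: filenames
# like '123_p0.jpg' are grouped by tweet id, and any cached id that the fresh
# fetch did not return is appended after the new results. The filename
# pattern matches the code above; the data here is made up for illustration.
import re

def merge_with_cache(new_ids, cached_names):
    names = {}
    for name in cached_names:
        m = re.search(r'([0-9]+)_p', name)
        if m:
            names.setdefault(int(m.group(1)), []).append(name)
    merged = list(new_ids)
    for id_ in sorted(names, reverse=True):
        if id_ not in new_ids:   # only ids missing from the fresh fetch
            merged += sorted(names[id_])
    return merged

assert merge_with_cache([300, 200], ['100_p0.jpg', '100_p1.jpg', '200_p0.jpg']) == [300, 200, '100_p0.jpg', '100_p1.jpg']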
def get_imgs_legacy(username, session, title, types, n=None, format='[%y-%m-%d] id_ppage', cw=None, mode='media', method='tab', imgs=None):
    print_ = get_print(cw)
    print_('types: {}'.format(', '.join(types)))
    artist, username = get_artist_username(username, session)  #

    # Range
    n = max(n, get_max_range(cw))

    max_pos = None
    ids_set = set()
    if imgs:
        for img in imgs:
            ids_set.add(img.id)
    else:
        imgs = []
    fail_count = 0  # retry/failure counter; the original variable name was censored in this snippet
    min_position = None
    while len(imgs) < n:
        if mode == 'media':
            if method == 'tab':
                foo = '&max_position={}'.format(max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/profiles/show/{}/media_timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(username, foo)
                print_('max_pos={}, imgs={}'.format(max_pos, len(imgs)))
            elif method == 'search':  # 1028
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(username)
                q = quote(q, '')
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1&reset_error_state=false'.format(q)
                print_('max_id={}, imgs={}'.format(max_id, len(imgs)))
            elif method == 'search2':  # 1028
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(username)
                q = quote(q, '')
                foo = '&max_position={}'.format(max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1{}&reset_error_state=false'.format(q, foo)
                print_('max_pos={}, max_id={}, imgs={}'.format(max_pos, max_id, len(imgs)))
            else:
                raise Exception('Invalid method: {}'.format(method))
        elif mode == 'likes':
            foo = '&max_position={}'.format(max_pos) if max_pos is not None else ''
            url = 'https://twitter.com/{}/likes/timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(username, foo)
        print(url)
        hdr = {
            "X-Requested-With": "XMLHttpRequest",
            "X-Twitter-Active-User": "******",  # header value redacted in this snippet
        }
        for try_ in range(16):
            if cw and not cw.alive:
                return
            try:
                html = downloader.read_html(url, session=session, referer='https://twitter.com/{}'.format(username), headers=hdr)  # err
            except Exception as e:
                e_msg = print_error(e)[-1]
                print_('retry... ({}) {}\n{}'.format(try_, url, e_msg))
                change_ua(session)
                continue
            try:
                data = json.loads(html)
            except Exception as e:
                change_ua(session)
                soup = Soup(html)
                login = soup.find('div', class_='LoginForm-input')
                if login and method == 'tab':
                    raise Exception('Login required!')
                print_('can not load json: {}'.format(e))
                sleep(1)
                continue
            break
        else:
            print_('over try')
            if not imgs:
                raise Exception('No imgs')
            break

        if 'items_html' in data:
            html = data['items_html']
        else:
            print_('no items_html')
            session.cookies.clear()  # ???
            #break

        soup = Soup(html)
        tweets = soup.findAll('div', class_='tweet') + soup.findAll('span', class_='grid-tweet')

        ids = []
        for tweet in tweets:
            id = int(tweet.attrs['data-tweet-id'])
            if id in ids_set:
                print('duplicate')
                continue
            ids.append(id)
            ids_set.add(id)
            tweet = Tweet(tweet, format, types, session, cw)
            for img in tweet.imgs:
                imgs.append(img)
            if n is not None and len(imgs) >= n:
                break

        if not ids:
            foo = 4 if method != 'search2' else 16
            if len(imgs) == 0:
                raise Exception('No Image')
            elif fail_count > foo:
                if method == 'tab':
                    ### search
                    method = 'search'
                    fail_count = 0
                    continue
                elif method == 'search' and not ids and min_position is not None:
                    ### search2
                    method = 'search2'
                    max_pos = min_position
                    #min_position = None
                    fail_count = 0
                    continue
                else:
                    print('too many failures')
                    break
            else:
                print('failed; retrying')
                change_ua(session)
                fail_count += 1
        elif fail_count:
            print('reset fail_count')
            fail_count = 0

        max_pos_new = data.get('min_position')  # 1028
        if max_pos_new is None:
            if ids:
                max_pos_new = min(ids)
            else:
                max_pos_new = max_pos  #
        max_pos = max_pos_new

        if data.get('min_position'):
            min_position = data['min_position']
            print('min_position:', min_position)

        try:
            if cw is not None:
                if not cw.alive:
                    break
                cw.setTitle('{} {} (@{}) - {}'.format(tr_('읽는 중...'), artist, username, len(imgs)))
        except Exception as e:
            print(e)
            raise
    return imgs
def read(self):
    type = self.pixiv_type
    cw = self.cw
    print_ = cw.print_
    ui_setting = self.ui_setting

    if type == 'following':
        raise NotImplementedError('following')

    self._format = [None, 'gif', 'webp', 'png'][ui_setting.ugoira_convert.currentIndex()]
    self._format_name = compatstr(ui_setting.pixivFormat.currentText())
    types = self.get_types()
    if types:
        s = ', '.join(sorted(types))
    else:
        s = 'all'
        types = None
    print_(u'Type: {}'.format(s))
    print_(u'info: {}'.format(self.info))
    api = self.api
    query = self.id.replace('_bmk', '').replace('_illust', '').replace('pixiv_', '').replace('search_', '')
    if type != 'search':
        query = int(query)
    print('pixiv_query:', query)
    try:
        if type in ('user', 'bookmark', 'search'):
            max_pid = get_max_range(cw)
            if ui_setting.groupBox_tag.isChecked():
                tags = [compatstr(ui_setting.tagList.item(i).text()) for i in range(ui_setting.tagList.count())]
            else:
                tags = []
            if type == 'search':
                query = query.replace('+', ' ')
                name = query
            else:
                id = self.id.replace('_bmk', '').replace('pixiv_', '').replace('search_', '')
                print('name', id)
                name = get_name(id, self.api, cw=cw)
                self.artist = name
            title = u'{} ({})'.format(name, self.id)
            print_(title)
            dir = os.path.join(get_outdir('pixiv'), clean_title(title))
            imgs = get_imgs(query, type=type, api=api, n=max_pid, tags=tags, types=types, format=self._format, format_name=self._format_name, dir=dir, cw=cw, title=title, info=self.info)
        elif type == 'illust':
            for try_ in range(N_TRY):
                try:
                    detail = api.illust_detail(query, req_auth=True)
                    error = detail.get('error')
                    if error:
                        raise PixivError(error)
                    break
                except PixivError as e:
                    api = e.api
                    print_(e)
                    if try_ < N_TRY - 1:
                        print_('retry...')
                        sleep(SLEEP, cw)
                    else:
                        raise
            illust = detail.illust
            name = illust.title
            title = u'{} ({})'.format(name, self.id)
            dir = os.path.join(get_outdir('pixiv'), clean_title(title))
            imgs = get_imgs_from_illust(illust, api=api, format=self._format, dir=dir, cw=cw, format_name=self._format_name)
    except PixivError as e:
        msg = u'PixivError: {}'.format(e.message)
        return self.Invalid(msg)

    self.imgs = []
    for img in imgs:
        d = {'type': img.type, 'url': img.url()}
        if img.type == 'ugoira':
            d['filename'] = img.filename
            d['frames'] = img.ugoira_data.frames
        self.imgs.append(d)
    for img in imgs:
        self.urls.append(img.url)
    self.title = clean_title(title)  # 1390
def process_ids(ids, info, imgs, cw, depth=0, tags_add=None):
    print_ = get_print(cw)
    max_pid = get_max_range(cw)

    class Thread(threading.Thread):
        alive = True
        rem = 0

        def __init__(self, queue):
            super().__init__(daemon=True)
            self.queue = queue

        @classmethod
        @lock
        def add_rem(cls, x):
            cls.rem += x

        def run(self):
            while self.alive:
                try:
                    id_, res, i = self.queue.popleft()
                except Exception as e:
                    sleep(.1)
                    continue
                try:
                    info_illust = get_info('https://www.pixiv.net/en/artworks/{}'.format(id_), cw, depth=depth + 1, tags_add=tags_add)
                    res[i] = info_illust['imgs']
                except Exception as e:
                    if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました' or type(e) == errors.LoginRequired):  # logout during extraction
                        res[i] = e
                    print_('process_ids error (id: {}, d:{}):\n{}'.format(id_, depth, print_error(e)[0]))
                finally:
                    Thread.add_rem(-1)

    queue = deque()
    n, step = Downloader_pixiv.STEP
    print_('{} / {}'.format(n, step))
    ts = []
    for i in range(n):
        t = Thread(queue)
        t.start()
        ts.append(t)
    for i in range(0, len(ids), step):
        res = [[]] * step
        for j, id_illust in enumerate(ids[i:i + step]):
            queue.append((id_illust, res, j))
            Thread.add_rem(1)
        while Thread.rem:
            sleep(.001, cw)
        for imgs_ in res:
            if isinstance(imgs_, Exception):
                raise imgs_
            imgs += imgs_
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(imgs) >= max_pid:
            break
        if depth == 0:
            check_alive(cw)
    for t in ts:
        t.alive = False
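# Stripped-down, runnable sketch of the worker pattern used above: N daemon
# threads pull (job, result-slot) pairs from a shared deque, a lock-guarded
# counter tracks jobs in flight, and the producer blocks until each batch
# drains. work() is a hypothetical placeholder for the real per-id fetch;
# the counter is incremented before enqueueing so it can never go negative.
import threading
import time
from collections import deque

queue, rem, rem_lock = deque(), [0], threading.Lock()

def work(x):  # hypothetical job body
    return x * x

def worker():
    while True:
        try:
            x, res, i = queue.popleft()
        except IndexError:
            time.sleep(.01)
            continue
        try:
            res[i] = work(x)
        finally:
            with rem_lock:
                rem[0] -= 1

for _ in range(4):
    threading.Thread(target=worker, daemon=True).start()

jobs = list(range(8))
res = [None] * len(jobs)
for i, x in enumerate(jobs):
    with rem_lock:
        rem[0] += 1      # count the job before it becomes visible
    queue.append((x, res, i))
while rem[0]:            # block until the batch drains
    time.sleep(.001)
assert res == [x * x for x in jobs]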
def get_info(url, cw=None, depth=0, tags_add=None):
    print_ = get_print(cw)
    api = PixivAPI()
    info = {}
    imgs = []

    ugoira_ext = [None, '.gif', '.webp', '.png'][utils.ui_setting.ugoira_convert.currentIndex()] if utils.ui_setting else None
    format_ = compatstr(utils.ui_setting.pixivFormat.currentText()) if utils.ui_setting else 'id_ppage'

    max_pid = get_max_range(cw)

    if api.illust_id(url):  # Single post
        id_ = api.illust_id(url)
        data = api.illust(id_)
        login = '******' not in data  # the marker string checked here was redacted in this snippet
        if FORCE_LOGIN and not login:  #
            raise errors.LoginRequired()
        if data['xRestrict'] and not login:
            raise errors.LoginRequired('R-18')
        info['artist'] = data['userName']
        info['artist_id'] = data['userId']
        info['raw_title'] = data['illustTitle']
        info['title'] = '{} (pixiv_illust_{})'.format(info['raw_title'], id_)
        info['create_date'] = parse_time(data['createDate'])
        tags_illust = set(tag['tag'] for tag in data['tags']['tags'])
        if tags_matched(tags_illust, tags_add, cw):
            if data['illustType'] == 2:  # ugoira
                data = api.ugoira_meta(id_)
                ugoira = {
                    'ext': ugoira_ext,
                    'delay': [frame['delay'] for frame in data['frames']],
                }
                img = Image(data['originalSrc'], url, id_, 0, format_, info, cw, ugoira=ugoira)
                imgs.append(img)
            else:
                data = api.pages(id_)
                for img in data:
                    img = Image(img['urls']['original'], url, id_, len(imgs), format_, info, cw)
                    imgs.append(img)
        else:
            print('tags mismatched')
    elif '/bookmarks/' in url or 'bookmark.php' in url:  # User bookmarks
        id_ = api.user_id(url)
        if id_ is None:  #
            id_ = my_id()
        if id_ == my_id():
            rests = ['show', 'hide']
        else:
            rests = ['show']
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_bmk_{})'.format(info['artist'], info['artist_id'])
        ids = []
        ids_set = set()
        for rest in rests:
            offset = 0
            while len(ids) < max_pid:
                data = api.bookmarks(id_, offset, rest=rest)
                c = 0
                for id in [work['id'] for work in data['works']]:
                    if id in ids_set:
                        continue
                    ids_set.add(id)
                    ids.append(id)
                    c += 1
                if not c:
                    break
                offset += LIMIT
                if depth == 0:
                    check_alive(cw)
        process_ids(ids, info, imgs, cw, depth)
    elif '/tags/' in url or 'search.php' in url:  # Search
        q = unquote(re.find(r'/tags/([^/]+)', url) or re.find('[?&]word=([^&]*)', url, err='no tags'))
        info['title'] = '{} (pixiv_search_{})'.format(q, q.replace(' ', '+'))
        qs = query_url(url)
        order = qs.get('order', ['date_d'])[0]
        mode = qs.get('mode', ['all'])[0]
        s_mode = qs.get('s_mode', ['s_tag_full'])[0]
        scd = qs.get('scd', [None])[0]
        ecd = qs.get('ecd', [None])[0]
        type_ = qs.get('type', ['all'])[0]
        wlt = qs.get('wlt', [None])[0]
        wgt = qs.get('wgt', [None])[0]
        hlt = qs.get('hlt', [None])[0]
        hgt = qs.get('hgt', [None])[0]
        blt = qs.get('blt', [None])[0]
        bgt = qs.get('bgt', [None])[0]
        ratio = qs.get('ratio', [None])[0]
        tool = qs.get('tool', [None])[0]
        logs = [
            'order: {}'.format(order),
            'mode: {}'.format(mode),
            's_mode: {}'.format(s_mode),
            'scd / ecd: {} / {}'.format(scd, ecd),
            'type: {}'.format(type_),
            'wlt / wgt: {} / {}'.format(wlt, wgt),
            'hlt / hgt: {} / {}'.format(hlt, hgt),
            'blt / bgt: {} / {}'.format(blt, bgt),
            'ratio: {}'.format(ratio),
            'tool: {}'.format(tool),
        ]
        print_('\n'.join(logs))
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.search(q, order, mode, p=p, s_mode=s_mode, scd=scd, ecd=ecd, type_=type_, wlt=wlt, wgt=wgt, hlt=hlt, hgt=hgt, blt=blt, bgt=bgt, ratio=ratio, tool=tool)
            c = 0
            for id in [illust['id'] for illust in data['illustManga']['data'] if 'id' in illust]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids, info, imgs, cw, depth)
    elif 'bookmark_new_illust.php' in url or 'bookmark_new_illust_r18.php' in url:  # Newest works: Following
        r18 = 'bookmark_new_illust_r18.php' in url
        id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_following_{}{})'.format(info['artist'], 'r18_' if r18 else '', info['artist_id'])
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.following(p, r18=r18)
            c = 0
            for id in data['page']['ids']:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids, info, imgs, cw, depth)
    elif api.user_id(url):  # User illusts
        m = re.search(r'/users/[0-9]+/([\w]+)/?([^\?#/]*)', url)
        type_ = {'illustrations': 'illusts', 'manga': 'manga'}.get(m and m.groups()[0])
        if type_:
            types = [type_]
        else:
            types = ['illusts', 'manga']
        if m:
            tag = unquote(m.groups()[1]) or None
        else:
            tag = None
        print_('types: {}, tag: {}'.format(types, tag))
        id_ = api.user_id(url)
        process_user(id_, info, api)
        data = api.profile(id_)
        info['title'] = '{} (pixiv_{})'.format(info['artist'], info['artist_id'])
        ids = []
        for type_ in types:
            illusts = data[type_]
            if not illusts:
                continue
            ids += list(illusts.keys())
        ids = sorted(ids, key=int, reverse=True)
        if not ids:
            raise Exception('no imgs')
        process_ids(ids, info, imgs, cw, depth, tags_add=[tag] if tag else None)
    else:
        raise NotImplementedError()
    info['imgs'] = imgs[:max_pid]
    return info
def get_imgs_more(username, session, title, types, n=None, format='[%y-%m-%d] id_ppage', cw=None, mode='media', method='tab', imgs=None):
    print_ = get_print(cw)
    imgs = imgs or []
    print_('imgs: {}, types: {}'.format(len(imgs), ', '.join(types)))
    artist, username = get_artist_username(username, session, cw)  #

    # Range
    n = max(n or 0, get_max_range(cw))

    ids_set = set(img.id for img in imgs)
    count_no_tweets = 0
    count_no_imgs = 0
    while check_alive(cw) or len(imgs) < n:
        if options.get('experimental') or count_no_tweets:  # 2687, 3392
            filter_ = ''
        else:
            filter_ = ' filter:media'
        cache_guest_token = bool(count_no_tweets)
        if ids_set:
            max_id = min(ids_set) - 1
            q = 'from:{} max_id:{} exclude:retweets{} -filter:periscope'.format(username, max_id, filter_)
        else:
            q = 'from:{} exclude:retweets{} -filter:periscope'.format(username, filter_)
        print(q)
        tweets = []
        for tweet in list(TwitterAPI(session, cw, cache_guest_token).search(q)):
            id = int(tweet['id'])
            if id in ids_set:
                print_('duplicate: {}'.format(id))
                continue
            ids_set.add(id)
            tweets.append(tweet)
        if tweets:
            exists_more_imgs = False
            for tweet in tweets:
                imgs_tweet = get_imgs_from_tweet(tweet, session, types, format, cw)
                if imgs_tweet:
                    imgs += imgs_tweet
                    exists_more_imgs = True
            if exists_more_imgs:
                count_no_imgs = 0
            else:
                count_no_imgs += 1
                if count_no_imgs >= RETRY_MORE_IMGS:  # 4130
                    break
            count_no_tweets = 0
        else:
            count_no_tweets += 1
            change_ua(session)
            if count_no_tweets >= RETRY_MORE:
                break
            print_('retry... {}'.format(count_no_tweets))
            continue
        msg = '{} {} (@{}) - {}'.format(tr_('읽는 중...'), artist, username, len(imgs))
        if cw:
            cw.setTitle(msg)
        else:
            print(msg)
    return imgs
def get_videos(url, cw=None):
    '''
    get_videos
    '''
    print_ = get_print(cw)
    if '/users/' in url:
        mode = 'users'
        username = url.split('/users/')[1].split('/')[0]
    elif '/pornstar/' in url:
        mode = 'pornstar'
        username = url.split('/pornstar/')[1].split('/')[0]
    elif '/model/' in url:
        mode = 'model'
        username = url.split('/model/')[1].split('/')[0]
    elif '/channels/' in url:
        mode = 'channels'
        username = url.split('/channels/')[1].split('/')[0]
    elif '/playlist/' in url:
        mode = 'playlist'
        username = url.split('/playlist/')[1].split('/')[0]
    else:
        raise Exception('Not supported url')
    username = username.split('?')[0].split('#')[0]

    session = Session()
    domain = utils.domain(url)

    if mode in ['pornstar']:
        url_main = 'https://{}/{}/{}'.format(domain, mode, username)
        html = downloader.read_html(url_main, session=session)
        soup = Soup(html)
        soup = fix_soup(soup, url_main, session, cw)
        for a in soup.findAll('a'):
            if '/{}/{}/videos/upload'.format(mode, username) in a.attrs.get('href', ''):
                free = True
                break
        else:
            free = False
        print_('free: {}'.format(free))

    # Range
    max_pid = get_max_range(cw, 500)
    max_pid = min(max_pid, 2000)  #

    html = downloader.read_html(url, session=session)
    soup = fix_soup(Soup(html), url, session, cw)

    info = {}

    # get title
    h1 = soup.find('h1')
    if h1:
        header = 'Playlist'
        title = h1.find(id='watchPlaylist')
    else:
        title = None
    if not title:
        header = 'Channel'
        profile = soup.find('div', class_='profileUserName')
        wrapper = soup.find('div', class_='titleWrapper')
        bio = soup.find('div', class_='withBio')
        title = soup.find('h1', {'itemprop': 'name'})
        if not title and profile:
            title = profile.a
        if not title and wrapper:
            title = wrapper.h1
        if not title and bio:
            title = bio.h1
    if not title:
        raise Exception('No title')
    #print(title)
    info['title'] = '[{}] {}'.format(header, title.text.strip())
    token = re.find('''token *= *['"](.*?)['"]''', html)
    print_('token: {}'.format(token))

    # get links
    hrefs = []
    fail = 0
    for p in range(1, 1 + 100):
        try:
            if mode in ['users', 'model']:
                if mode == 'users':
                    url_api = 'https://{}/users/{}/videos/public/ajax?o=mr&page={}'.format(domain, username, p)
                elif mode == 'model':
                    url_api = 'https://{}/model/{}/videos/upload/ajax?o=mr&page={}'.format(domain, username, p)
                r = session.post(url_api)
                soup = Soup(r.text)
                if soup.find('h1'):
                    print('break: h1')
                    break
            elif mode in ['pornstar']:
                if free:
                    url_api = 'https://{}/{}/{}/videos/upload?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('div', class_='videoUList')
                else:
                    url_api = 'https://{}/{}/{}?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('ul', class_='pornstarsVideos')
            elif mode in ['channels']:
                url_api = 'https://{}/{}/{}/videos?page={}'.format(domain, mode, username, p)
                soup = downloader.read_soup(url_api, session=session)
                soup = fix_soup(soup, url_api, session, cw)
                try:
                    soup = soup.find('div', {'id': 'channelsBody'}).find('div', class_='rightSide')
                except:
                    break
            elif mode in ['playlist']:
                #url_api = 'https://{}/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(domain, username, len(hrefs))
                if token is None:
                    raise Exception('no token')
                url_api = 'https://{}/playlist/viewChunked?id={}&token={}&page={}'.format(domain, username, token, p)
                soup = downloader.read_soup(url_api, session=session)
            else:
                raise NotImplementedError(mode)
            fail = 0
        except Exception as e:
            print_(e)
            fail += 1
            if fail < 2:
                continue
            else:
                break
        finally:
            print_('{} ({})'.format(url_api, len(hrefs)))

        if cw and not cw.alive:
            return

        lis = soup.findAll('li', class_='videoblock')
        if not lis:
            print_('break: no lis')
            break

        if getattr(soup.find('title'), 'text', '').strip() == 'Page Not Found':
            print_('Page Not Found')
            break

        c = 0
        for li in lis:
            a = li.find('a')
            href = a.attrs['href']
            href = urljoin(url, href)
            if href in hrefs:
                continue
            c += 1
            if href.startswith('javascript:'):  # Remove Pornhub Premium
                print(href)
                continue
            hrefs.append(href)
        if c == 0:
            print('c==0')
            break
        print(c)  # 1320

        if len(hrefs) >= max_pid:
            break

    if cw:
        hrefs = filter_range(hrefs, cw.range)

    info['hrefs'] = hrefs

    return info
def get_imgs(user_id, session, cw=None):
    print_ = get_print(cw)
    url = 'https://{}.bdsmlr.com/'.format(user_id)
    info = {'c': 0, 'posts': [], 'ids': set()}
    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    sorry = soup.find('div', class_='sorry')
    if sorry:
        raise Exception(sorry.text.strip())
    username = soup.find('title').text.strip()  ###
    print('username:', username)  # reconstructed; this statement was garbled in the snippet
    info['username'] = username
    token = soup.find('meta', {'name': 'csrf-token'}).attrs['content']
    print_(u'token: {}'.format(token))

    foo(url, soup, info)

    max_pid = get_max_range(cw)  #, 2000)

    n = len(info['ids'])
    for p in range(1000):
        url_api = 'https://{}.bdsmlr.com/infinitepb2/{}'.format(user_id, user_id)
        data = {
            'scroll': str(info['c']),
            'timenow': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'last': str(info['last']),
        }
        print_(u'n:{}, scroll:{}, last:{}'.format(len(info['posts']), data['scroll'], data['last']))
        headers = {
            'Referer': url,
            'X-CSRF-TOKEN': token,
        }
        for try_ in range(4):
            try:
                r = session.post(url_api, data=data, headers=headers)
                if p == 0:
                    r.raise_for_status()
                break
            except Exception as e:
                print(e)
        else:
            raise
        soup = Soup(r.text)
        foo(url, soup, info)
        if len(info['ids']) == n:
            print('same; break')
            break
        n = len(info['ids'])
        s = u'{} {} (tumblr_{}) - {}'.format(tr_(u'읽는 중...'), username, user_id, len(info['posts']))
        if cw is not None:
            if not cw.valid or not cw.alive:
                return []
            cw.setTitle(s)
        else:
            print(s)
        if len(info['posts']) > max_pid:
            break
    return info
def get_imgs(username, session, title, types, n=0, format='[%y-%m-%d] id_ppage', cw=None):
    print_ = get_print(cw)

    # Range
    n = max(n, get_max_range(cw))

    # 2303
    ids = set()
    names = dict()
    dir_ = os.path.join(get_outdir('twitter'), title)
    if os.path.isdir(dir_) and cw:
        for name in cw.names_old:
            name = os.path.basename(name)
            id_ = re.find('([0-9]+)_p', name)
            if id_ is None:
                continue
            if get_ext(name).lower() == '.mp4':
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            id_ = int(id_)
            ids.add(id_)
            if id_ in names:
                names[id_].append(name)
            else:
                names[id_] = [name]
    ids_sure = sorted(ids)[:-100]
    max_id = max(ids_sure) if ids_sure else 0  # 3201

    # 2303
    imgs_old = []
    for id_ in sorted(ids, reverse=True):
        for p, file in enumerate(sorted(os.path.join(dir_, name) for name in names[id_])):
            img = Image(file, '', id_, 0, p, format, cw, False)
            img.url = LazyUrl_twitter(None, lambda _: file, img)  # note: 'file' is captured by reference (late binding)
            img.filename = os.path.basename(file)
            imgs_old.append(img)

    imgs_new = []
    enough = False
    c_old = 0
    counter = SkipCounter(1)
    msg = None
    for tweet in check_alive_iter(cw, TwitterAPI(session, cw).timeline_media(username)):
        id_ = int(tweet['id_str'])
        if id_ < max_id:
            print_('enough')
            enough = True
            break
        if id_ in ids:
            print_('duplicate: {}'.format(id_))
            c_old += 1
            continue
        ids.add(id_)
        imgs_new += get_imgs_from_tweet(tweet, session, types, format, cw)
        if len(imgs_new) + c_old >= n:  # 3201
            break
        if counter.next():
            msg = '{} {} - {}'.format(tr_('읽는 중...'), title, len(imgs_new))
            if cw:
                cw.setTitle(msg)
            else:
                print(msg)
    if msg:
        if cw:
            cw.setTitle(msg)
        else:
            print(msg)

    if not enough and not imgs_new and c_old == 0:
        raise Exception('no imgs')

    imgs = sorted(imgs_old + imgs_new, key=lambda img: img.id, reverse=True)

    if len(imgs) < n:
        imgs = get_imgs_more(username, session, title, types, n, format, cw, imgs=imgs)

    return imgs[:n]