def fix_url(cls, url):
    if re.search(r'/(videos|clips)\?filter=', url):
        return url.strip('/')
    return url.split('?')[0].strip('/')
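# Illustrative sketch of the normalization above (the URLs are made up):
# list pages whose filter query matters keep the query string; everything
# else loses it, and trailing slashes are stripped either way.
#   fix_url(None, 'https://example.com/ch/videos?filter=archives')
#       -> 'https://example.com/ch/videos?filter=archives'
#   fix_url(None, 'https://example.com/ch/about?tab=1/')
#       -> 'https://example.com/ch/about'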
def get_info(url, cw=None, depth=0, tags_add=None):
    print_ = get_print(cw)
    api = PixivAPI()
    info = {}
    imgs = []

    ugoira_ext = [None, '.gif', '.webp', '.png'][utils.ui_setting.ugoira_convert.currentIndex()] if utils.ui_setting else None
    format_ = compatstr(utils.ui_setting.pixivFormat.currentText()) if utils.ui_setting else 'id_ppage'

    max_pid = get_max_range(cw)

    if api.illust_id(url): # Single post
        id_ = api.illust_id(url)
        data = api.illust(id_)
        login = '******' not in data
        if FORCE_LOGIN and not login: #
            raise errors.LoginRequired()
        if data['xRestrict'] and not login:
            raise errors.LoginRequired('R-18')
        info['artist'] = data['userName']
        info['artist_id'] = data['userId']
        info['raw_title'] = data['illustTitle']
        info['title'] = '{} (pixiv_illust_{})'.format(info['raw_title'], id_)
        info['create_date'] = parse_time(data['createDate'])
        tags_illust = set(tag['tag'] for tag in data['tags']['tags'])

        if tags_matched(tags_illust, tags_add, cw):
            if data['illustType'] == 2: # ugoira
                data = api.ugoira_meta(id_)
                ugoira = {
                    'ext': ugoira_ext,
                    'delay': [frame['delay'] for frame in data['frames']],
                }
                img = Image(data['originalSrc'], url, id_, 0, format_, info, cw, ugoira=ugoira)
                imgs.append(img)
            else:
                data = api.pages(id_)
                for img in data:
                    img = Image(img['urls']['original'], url, id_, len(imgs), format_, info, cw)
                    imgs.append(img)
        else:
            print('tags mismatched')
    elif '/bookmarks/' in url or 'bookmark.php' in url: # User bookmarks
        id_ = api.user_id(url)
        if id_ is None: #
            id_ = my_id()
        if id_ == my_id():
            rests = ['show', 'hide']
        else:
            rests = ['show']
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_bmk_{})'.format(info['artist'], info['artist_id'])
        ids = []
        ids_set = set()
        for rest in rests:
            offset = 0
            while len(ids) < max_pid:
                data = api.bookmarks(id_, offset, rest=rest)
                c = 0
                for id in [work['id'] for work in data['works']]:
                    if id in ids_set:
                        continue
                    ids_set.add(id)
                    ids.append(id)
                    c += 1
                if not c:
                    break
                offset += LIMIT
                if depth == 0:
                    check_alive(cw)
        process_ids(ids, info, imgs, cw, depth)
    elif '/tags/' in url or 'search.php' in url: # Search
        q = unquote(re.find(r'/tags/([^/]+)', url) or re.find('[?&]word=([^&]*)', url, err='no tags'))
        info['title'] = '{} (pixiv_search_{})'.format(q, q.replace(' ', '+'))
        qs = query_url(url)
        order = qs.get('order', ['date_d'])[0]
        mode = qs.get('mode', ['all'])[0]
        s_mode = qs.get('s_mode', ['s_tag_full'])[0]
        scd = qs.get('scd', [None])[0]
        ecd = qs.get('ecd', [None])[0]
        type_ = qs.get('type', ['all'])[0]
        wlt = qs.get('wlt', [None])[0]
        wgt = qs.get('wgt', [None])[0]
        hlt = qs.get('hlt', [None])[0]
        hgt = qs.get('hgt', [None])[0]
        blt = qs.get('blt', [None])[0]
        bgt = qs.get('bgt', [None])[0]
        ratio = qs.get('ratio', [None])[0]
        tool = qs.get('tool', [None])[0]
        logs = [
            'order: {}'.format(order),
            'mode: {}'.format(mode),
            's_mode: {}'.format(s_mode),
            'scd / ecd: {} / {}'.format(scd, ecd),
            'type: {}'.format(type_),
            'wlt / wgt: {} / {}'.format(wlt, wgt),
            'hlt / hgt: {} / {}'.format(hlt, hgt),
            'blt / bgt: {} / {}'.format(blt, bgt),
            'ratio: {}'.format(ratio),
            'tool: {}'.format(tool),
        ]
        print_('\n'.join(logs))
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.search(q, order, mode, p=p, s_mode=s_mode, scd=scd, ecd=ecd,
                              type_=type_, wlt=wlt, wgt=wgt, hlt=hlt, hgt=hgt,
                              blt=blt, bgt=bgt, ratio=ratio, tool=tool)
            c = 0
            for id in [illust['id'] for illust in data['illustManga']['data'] if 'id' in illust]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids, info, imgs, cw, depth)
    elif 'bookmark_new_illust.php' in url or 'bookmark_new_illust_r18.php' in url: # Newest works: Following
        r18 = 'bookmark_new_illust_r18.php' in url
        id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_following_{}{})'.format(info['artist'], 'r18_' if r18 else '', info['artist_id'])
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.following(p, r18=r18)
            c = 0
            for id in data['page']['ids']:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids, info, imgs, cw, depth)
    elif api.user_id(url): # User illusts
        m = re.search(r'/users/[0-9]+/([\w]+)/?([^\?#/]*)', url)
        type_ = {'illustrations': 'illusts', 'manga': 'manga'}.get(m and m.groups()[0])
        if type_:
            types = [type_]
        else:
            types = ['illusts', 'manga']
        if m:
            tag = unquote(m.groups()[1]) or None
        else:
            tag = None
        print_('types: {}, tag: {}'.format(types, tag))
        id_ = api.user_id(url)
        process_user(id_, info, api)
        data = api.profile(id_)
        info['title'] = '{} (pixiv_{})'.format(info['artist'], info['artist_id'])
        ids = []
        for type_ in types:
            illusts = data[type_]
            if not illusts:
                continue
            ids += list(illusts.keys())
        ids = sorted(ids, key=int, reverse=True)
        if not ids:
            raise Exception('no imgs')
        process_ids(ids, info, imgs, cw, depth, tags_add=[tag] if tag else None)
    else:
        raise NotImplementedError()
    info['imgs'] = imgs[:max_pid]
    return info
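# Dispatch summary for get_info above (taken from its branches; the exact URL
# shapes api.illust_id / api.user_id accept are defined elsewhere):
#   api.illust_id(url) matches                -> single post (possibly ugoira)
#   '/bookmarks/' or 'bookmark.php'           -> a user's bookmarks
#   '/tags/<q>' or 'search.php?word=<q>'      -> search results
#   'bookmark_new_illust(_r18).php'           -> newest works from followed users
#   api.user_id(url) matches                  -> a user's illusts and/or manga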
def isVisible(tag):
    while tag:
        if re.search('display: *none', tag.get('style', ''), re.IGNORECASE):
            return False
        tag = tag.parent
    return True
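# A minimal, self-contained check of isVisible (assumes Soup wraps
# BeautifulSoup, so bs4 is used directly here; the markup is hypothetical):
def _demo_isVisible():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(
        '<div style="DISPLAY:  none"><span id="x">hidden</span></div>'
        '<p id="y">shown</p>', 'html.parser')
    assert isVisible(soup.find('span', id='x')) is False # hidden via ancestor style
    assert isVisible(soup.find('p', id='y')) is True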
def fix_url(cls, url):
    m = re.search(r'xhamster(?P<number>[0-9]*)\.(?P<top>[a-z0-9]+)/', url)
    if m is None: # not an xhamster host; nothing to canonicalize
        return url
    number, top = m.groups()
    return url.replace('xhamster{}.{}/'.format(number, top), 'xhamster.com/')
def _get_page_id(html):
    m = re.search(r"CONFIG\['page_id'\]='([0-9]+?)'", html)
    return m.group(1) if m else None # return the captured id, not the match object
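# Quick sanity check of the extraction above, against a made-up fragment of
# the page's CONFIG script block:
def _demo_get_page_id():
    sample = "var CONFIG = {}; CONFIG['page_id']='1005051234567890';"
    assert _get_page_id(sample) == '1005051234567890'
    assert _get_page_id('<html>no config here</html>') is None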
def read(self):
    print_ = get_print(self.customWidget)
    for try_ in range(8):
        print_('get_session')
        try:
            session = get_session()
            html = downloader.read_html(self.url, session=session)
            soup = Soup(html)
            get_title_artist(soup)
            break
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_ # all retries failed; re-raise the last error

    title, self.artist = get_title_artist(soup)
    self.__title = title
    title_dir = clean_title('[{}] {}'.format(self.artist, title))
    ex = soup.find('div', id='novel_ex')
    self.novel_ex = ex.text.strip() if ex else None
    texts = []
    subtitles = soup.findAll('dd', class_='subtitle')
    if subtitles:
        for subtitle in subtitles:
            update = subtitle.parent.find('dt', class_='long_update')
            update2 = None
            if update:
                for span in update.findAll('span'):
                    update2 = span.attrs['title']
                    span.decompose()
                update = update.text.strip()
                if update2:
                    update += ' ({})'.format(update2)
            a = subtitle.find('a')
            subtitle = a.text.strip()
            href = urljoin(self.url, a.attrs['href'])
            if not re.search(r'ncode\.syosetu\.com/{}/[0-9]+'.format(self.id_), href):
                print_('skip: {}'.format(href))
                continue
            text = Text(subtitle, update, href, session, False)
            texts.append(text)
    else:
        self.single = True
        text = Text(title_dir, None, self.url, session, True)
        texts.append(text)
    self.print_('single: {}'.format(self.single))
    outdir = get_outdir('syosetu')
    for text in texts:
        if self.single:
            file = os.path.join(outdir, text.filename)
        else:
            file = os.path.join(outdir, title_dir, text.filename)
        if os.path.isfile(file):
            self.urls.append(file)
        else:
            self.urls.append(text.url)
    self.title = title_dir
def init(self):
    self.url = self.url.replace('xhamster_', '')
    if not re.search(r'xhamster[0-9]*\.', self.url):
        self.url = 'https://xhamster.com/videos/{}'.format(self.url)
def f(url):
    if re.search(PATTERN_ID, url):
        raise Exception(tr_('목록 주소를 입력해주세요')) # "Please enter a list URL"
    session = Session()
    pages = get_pages(url, session=session)
    return pages
def init(self):
    if re.search(r'xhamsterlive[0-9]*\.', self.url):
        raise Exception('xHamsterLive')
    if not re.search(r'xhamster[0-9]*\.', self.url):
        self.url = 'https://xhamster.com/videos/{}'.format(self.url)
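# Illustrative inputs for init above (hypothetical values): a bare slug is
# expanded into a full video URL, mirror-numbered hosts pass through for
# fix_url to canonicalize, and xhamsterlive hosts are rejected outright.
#   'my-clip-123'                 -> self.url = 'https://xhamster.com/videos/my-clip-123'
#   'https://xhamster2.com/...'   -> left unchanged here
#   'https://xhamsterlive.com/'   -> raises Exception('xHamsterLive')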
def f(html, browser=None):
    soup = Soup(html)
    if is_captcha(soup):
        print('captcha')
        browser.show()
        sd['shown'] = True
    elif sd['shown'] and not SHOW:
        browser.hide()
        sd['shown'] = False

    try:
        st = soup.find('h2', class_='share-title')
        if st is None:
            st = soup.find('h2', class_=lambda c: c and 'ShareTitle' in c)
        info['uid'] = st.text.strip()
        st = soup.find('h1', class_='share-sub-title')
        if st is None:
            st = soup.find('h1', class_=lambda c: c and 'ShareSubTitle' in c)
        info['nickname'] = st.text.strip()
    except Exception as e:
        print_(print_error(e)[0])

    c = 0
    ids_now = set()
    items = soup.findAll('div', class_='video-feed-item') + \
        soup.findAll('div', class_=lambda c: c and 'DivItemContainer' in c)
    for div in items:
        a = div.find('a')
        if a is None:
            continue
        href = a['href']
        if not href:
            continue
        m = re.search(PATTERN_VID, href)
        if m is None:
            continue
        id_video = int(m.group('id'))
        ids_now.add(id_video)
        if id_video in ids:
            continue
        ids.add(id_video)
        info['items'].append({'id': id_video})
        c += 1

    print_('items: {}'.format(len(info['items'])))
    if len(info['items']) >= max_pid:
        info['items'] = info['items'][:max_pid]
        return True

    browser.runJavaScript('window.scrollTo(0, document.body.scrollHeight);')
    sleep(15, cw)

    if c or (ids_now and min(ids_now) > min(ids)):
        sd['count_empty'] = 0
    else:
        print_('empty')
        sd['count_empty'] += 1
    msg = '{} {} (tiktok_{}) - {}'.format(
        tr_('읽는 중...'), info.get('nickname'), info.get('uid'), len(info['items'])) # '읽는 중...' = 'Reading...'
    if cw:
        if not cw.alive:
            raise Exception('cw dead')
        cw.setTitle(msg)
    else:
        print(msg)
    return sd['count_empty'] > 4 # stop after several consecutive empty scrolls
def get(self, url):
    ''' get '''
    cw = self.cw
    session = self.session
    print_ = get_print(cw)
    if self._url:
        return self._url

    id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
        re.find(r'/embed/(\w+)', url, re.IGNORECASE, err='no id')
    print_('id: {}'.format(id_))
    if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
        url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))

    url_test = url.replace('pornhubpremium.com', 'pornhub.com')
    try:
        html = downloader.read_html(url_test, session=session)
        soup = Soup(html)
        if soup.find('div', id='lockedPlayer'):
            print_('Locked player')
            raise Exception('Locked player')
        url = url_test
    except: #3511
        url = url.replace('pornhub.com', 'pornhubpremium.com')
        html = downloader.read_html(url, session=session)
        soup = Soup(html)
    soup = fix_soup(soup, url, session, cw)
    html = soup.html

    # removed
    if soup.find('div', class_='removed'):
        raise Exception('removed')

    gif = soup.find('div', {'id': 'gifImageSection'})
    if gif:
        print_('GIF')
        id_ = url.split('/gif/')[1]
        id_ = re.findall('[0-9a-zA-Z]+', id_)[0]

        jss = list(gif.children)
        for js in jss:
            if 'data-mp4' in getattr(js, 'attrs', {}):
                break
        else:
            raise Exception('gif mp4 url not found')

        title = js['data-gif-title']
        url = js['data-mp4']
        url_thumb = re.find(r'https?://.+?\.phncdn\.com/pics/gifs/.+?\.jpg', html, err='no thumb')
        file = File('gif_{}'.format(id_), title, url, url_thumb)
    else:
        if id_ is None:
            raise Exception('no id')
        print_('Video')

        # 1968
        #title = j['video_title']
        title = soup.find('h1', class_='title').text.strip()

        video_urls = []
        video_urls_set = set()

        def int_or_none(s):
            try:
                return int(s)
            except:
                return None

        def url_or_none(url):
            if not url or not isinstance(url, str):
                return None
            url = url.strip()
            return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None

        flashvars = json.loads(re.find(r'var\s+flashvars_\d+\s*=\s*({.+?});', html, err='no flashvars'))
        url_thumb = flashvars.get('image_url')
        media_definitions = flashvars.get('mediaDefinitions')
        if isinstance(media_definitions, list):
            for definition in media_definitions:
                if not isinstance(definition, dict):
                    continue
                video_url = definition.get('videoUrl')
                if not video_url or not isinstance(video_url, str):
                    continue
                if video_url in video_urls_set:
                    continue
                video_urls_set.add(video_url)
                video_urls.append((video_url, int_or_none(definition.get('quality'))))

        def extract_js_vars(webpage, pattern, default=object()):
            assignments = re.find(pattern, webpage, default=default)
            if not assignments:
                return {}
            assignments = assignments.split(';')

            js_vars = {}

            def remove_quotes(s):
                if s is None or len(s) < 2:
                    return s
                for quote in ('"', "'", ):
                    if s[0] == quote and s[-1] == quote:
                        return s[1:-1]
                return s

            def parse_js_value(inp):
                inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
                if '+' in inp:
                    inps = inp.split('+')
                    return functools.reduce(operator.concat, map(parse_js_value, inps))
                inp = inp.strip()
                if inp in js_vars:
                    return js_vars[inp]
                return remove_quotes(inp)

            for assn in assignments:
                assn = assn.strip()
                if not assn:
                    continue
                assn = re.sub(r'var\s+', '', assn)
                vname, value = assn.split('=', 1)
                js_vars[vname] = parse_js_value(value)
            return js_vars

        def add_video_url(video_url):
            v_url = url_or_none(video_url)
            if not v_url:
                return
            if v_url in video_urls_set:
                return
            video_urls.append((v_url, None))
            video_urls_set.add(v_url)

        def parse_quality_items(quality_items):
            q_items = json.loads(quality_items)
            if not isinstance(q_items, list):
                return
            for item in q_items:
                if isinstance(item, dict):
                    add_video_url(item.get('url'))

        if not video_urls:
            print_('# extract video_urls 2')
            FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
            js_vars = extract_js_vars(
                html, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), default=None)
            if js_vars:
                for key, format_url in js_vars.items():
                    if key.startswith(FORMAT_PREFIXES[-1]):
                        parse_quality_items(format_url)
                    elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]):
                        add_video_url(format_url)
            if not video_urls and re.search(r'<[^>]+\bid=["\']lockedPlayer', html):
                raise Exception('Video is locked')

##        if not video_urls:
##            print_('# extract video_urls 3')
##            js_vars = extract_js_vars(
##                dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
##            add_video_url(js_vars['mediastring'])

        for mobj in re.finditer(
                r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
                html):
            video_url = mobj.group('url')
            if video_url not in video_urls_set:
                video_urls.append((video_url, None))
                video_urls_set.add(video_url)

        video_urls_ = video_urls
        video_urls = []
        for video_url, height in video_urls_:
            if '/video/get_media' in video_url:
                print_(video_url)
                medias = downloader.read_json(video_url, session=session)
                if isinstance(medias, list):
                    for media in medias:
                        if not isinstance(media, dict):
                            continue
                        video_url = url_or_none(media.get('videoUrl'))
                        if not video_url:
                            continue
                        height = int_or_none(media.get('quality'))
                        video_urls.append((video_url, height))
                continue
            video_urls.append((video_url, height))

        videos = []
        for video_url, height in video_urls:
            video = {}
            video['height'] = height or int_or_none(re.find(r'(?P<height>\d+)[pP]?_\d+[kK]', video_url))
            video['quality'] = video['height'] or 0
            video['videoUrl'] = video_url
            ext = get_ext(video_url)
            video['ext'] = ext
            if ext.lower() == '.m3u8':
                video['quality'] -= 1 # prefer a direct file over HLS at the same height
            print_('[{}p] {} {}'.format(video['height'], video['ext'], video['videoUrl']))
            videos.append(video)

        if not videos:
            raise Exception('No videos')

        videos = sorted(videos, key=lambda video: video['quality'])

        res = get_resolution()

        videos_good = [video for video in videos if video['quality'] <= res]
        if videos_good:
            video = videos_good[-1]
        else:
            video = videos[0]
        print_('\n[{}p] {} {}'.format(video['height'], video['ext'], video['videoUrl']))

        file = File(id_, title, video['videoUrl'].strip(), url_thumb)

    self._url = file.url
    self.title = file.title
    self.filename = file.filename
    self.thumb = file.thumb
    return self._url
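# Selection rule used in get above: candidates are sorted by quality
# ascending, .m3u8 streams are nudged just below same-height direct files,
# and the best quality not exceeding get_resolution() wins (falling back to
# the lowest available if none qualify).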
def get_imgs(url, n_max=2000, title=None, cw=None, session=None):
    print_ = get_print(cw)

    for try_ in range(4):
        try:
            html = read_html(url, session, cw)
            m = re.search('"edge_owner_to_timeline_media":{"count":([0-9]+)', html)
            if m is None:
                raise Exception('Invalid page')
            break
        except Exception as e:
            e_ = e
            print_(print_error(e)[0])
    else:
        raise e_

    n = int(m.groups()[0])
    n = min(n, n_max)

    data = get_sd(url, html=html, cw=cw)

    uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
    csrf_token = data['config']['csrf_token']
    # session.cookies.set(name='ig_pr', value='1', path='/', domain='.instagram.com')

    cursor = ''
    edges = []
    bad = 0
    while True:
        check_alive(cw)

        variables = {
            'id': uploader_id,
            'first': 12,
        }
        if cursor:
            variables['after'] = cursor
        #print_(variables)#

        media = None
        try:
            j = get_query('003056d32c2554def87228bc3fd9668a', variables, session, cw)
            media = j['data']['user']['edge_owner_to_timeline_media']
            sleep(2) #
        except Exception as e:
            if bad > 10:
                raise Exception('no media')
            else:
                print_('no media.. retry... ({}) {}'.format(bad + 1, print_error(e)[0]))
                sleep(12 * bad, cw)
                bad += 1
                continue
        bad = 0

        edges_new = media.get('edges')
        if not edges_new or not isinstance(edges_new, list):
            print('no edges_new')
            break

        edges += edges_new

        s = '{} {} ({}/{})'.format(tr_('읽는 중...'), title, len(edges), n) # '읽는 중...' = 'Reading...'
        if cw is not None:
            cw.setTitle(s)
            if not cw.alive:
                return []
        else:
            print(s)

        if len(edges) >= n:
            break

        page_info = media.get('page_info')
        if not page_info:
            break
        if not page_info.get('has_next_page'):
            break
        cursor = page_info.get('end_cursor')
        if not cursor:
            break

    if len(edges) <= n / 2:
        raise Exception('Too short: {} / {}'.format(len(edges), n))

    imgs = []
    for edge in edges:
        node = edge['node']
        type = node['__typename']
        id = node['shortcode']
        url = 'https://www.instagram.com/p/{}/'.format(id)
##        if type in ['GraphVideo', 'GraphImage']:
##            single = True
##        else:
##            single = False
        for img in Node(url, session=session, cw=cw, media=node).imgs:
            imgs.append(img)
        if len(imgs) >= n_max:
            break

    return imgs
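# Pagination note for get_imgs above: each GraphQL call requests 12 posts
# ('first': 12) and resumes from page_info['end_cursor'], stopping when
# has_next_page is false, the cursor runs out, or the requested count n is
# reached; transient failures back off (sleep(12 * bad)) for up to 10 retries.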