def enter():
    print('enter')
    session = Session()
    r = session.get(URL_ENTER) # 862
    html = r.text
    soup = Soup(html)
    box = soup.find('aside', id='FilterBox')
    data = {}
    # take the last <option> of every <select> in the filter box
    for select in box.findAll('select'):
        name = select.attrs['name']
        value = select.findAll('option')[-1].attrs['value']
        print(name, value)
        data[name] = value
    # keep the rating checkboxes and the CSRF token from the <input> fields
    for input_ in box.findAll('input'):  # renamed from `input` to avoid shadowing the builtin
        name = input_.attrs['name']
        value = input_.attrs['value']
        if name.startswith('rating_') or 'CSRF_TOKEN' in name:
            print(name, value)
            data[name] = value
    data.update({
        'filter_media': 'A',
        'filter_order': 'date_new',
        'filter_type': '0',
    })
    r = session.post(URL_FILTER, data=data, headers={'Referer': r.url})
    print(r)
    return session
def get_videos(url, cw=None):
    print_ = get_print(cw)
    info = {}
    user_id = re.find(r'twitch.tv/([^/?]+)', url, err='no user_id')
    print(user_id)
    session = Session()
    r = session.get(url)
    # the page embeds the headers for the GQL endpoint as a JS object literal;
    # cut_pair() trims the match down to the balanced braces
    s = cut_pair(re.find(r'headers *: *({.*)', r.text, err='no headers'))
    print(s)
    headers = json_loads(s)
    payload = [{
        'operationName': 'ClipsCards__User',
        'variables': {
            'login': user_id,
            'limit': 20,
            'criteria': {
                'filter': 'ALL_TIME'
            }
        },
        'extensions': {
            'persistedQuery': {
                'version': 1,
                'sha256Hash': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777'
            }
        },
    }]
    videos = []
    cursor = None
    cursor_new = None
    # page through the clips by feeding the last edge's cursor back into the query
    while True:
        if cursor:
            payload[0]['variables']['cursor'] = cursor
        r = session.post('https://gql.twitch.tv/gql', json=payload, headers=headers)
        #print(r)
        data = r.json()
        for edge in data[0]['data']['user']['clips']['edges']:
            url_video = edge['node']['url']
            info['name'] = edge['node']['broadcaster']['displayName']
            video = Video(url_video)
            video.id = int(edge['node']['id'])
            videos.append(video)
            cursor_new = edge['cursor']
        print_('videos: {} / cursor: {}'.format(len(videos), cursor))
        # stop when the cursor no longer advances
        if cursor == cursor_new:
            print_('same cursor')
            break
        if cursor_new is None:
            break
        cursor = cursor_new
    if not videos:
        raise Exception('no videos')
    info['videos'] = sorted(videos, key=lambda video: video.id, reverse=True)
    return info
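# A minimal stand-in for the Video type the Twitch reader above assumes
# (hypothetical; the real project supplies its own class). All the crawler
# needs is the clip URL plus a numeric id it can sort on.
class Video:
    def __init__(self, url):
        self.url = url
        self.id = None  # filled in from the GQL edge by get_videos()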
def read_channel(url_page, cw=None):
    print_ = get_print(cw)
    res = re.find(CHANNEL_PATTERN, url_page)
    if res is None:
        raise Exception('Not a channel')
    header, username = res
    print(header, username)
    max_pid = get_max_range(cw)
    info = {}
    info['header'] = header
    info['username'] = username
    session = Session()
    urls = []
    ids = set()
    for p in range(100):
        url_api = urljoin(url_page, '/{}/{}/videos/best/{}'.format(header, username, p))
        print_(url_api)
        r = session.post(url_api)
        data = json.loads(r.text)
        videos = data.get('videos') #4530
        if not videos:
            print_('empty')
            break
        for video in videos:
            id_ = video['id']
            if id_ in ids:
                print_('duplicate: {}'.format(id_))
                continue
            ids.add(id_)
            info['name'] = video['pn']
            urls.append(urljoin(url_page, video['u']))
        if len(urls) >= max_pid:
            break
        n = data['nb_videos']
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))  # '읽는 중...' = 'Reading...'
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        # stop once every video the API reports has been seen
        if len(ids) >= n:
            break
        sleep(1, cw)
    if not urls:
        raise Exception('no videos')
    info['urls'] = urls[:max_pid]
    return info
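# Sketch of the shared `get_max_range` helper used by these readers (an
# assumption about its contract, not the real code): return the user-configured
# item limit from the control widget `cw`, falling back to / capped at
# `maximum`. The `max_range` attribute is hypothetical.
def get_max_range(cw, maximum=1000):
    limit = getattr(cw, 'max_range', None) if cw is not None else None  # hypothetical attribute
    if not limit:
        return maximum
    return min(limit, maximum)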
def get(self, url):
    if self._url_video:
        return self._url_video
    cw = self.cw
    print_ = get_print(cw)
    html = downloader.read_html(url)
    soup = Soup(html)
    embedUrl = extract('embedUrl', html, cw)
    if embedUrl:
        raise EmbedUrlError('[pandoratv] EmbedUrl: {}'.format(embedUrl))
    uid = extract('strLocalChUserId', html, cw)
    pid = extract('nLocalPrgId', html, cw)
    fid = extract('strFid', html, cw)
    resolType = extract('strResolType', html, cw)
    resolArr = extract('strResolArr', html, cw)
    vodSvr = extract('nVodSvr', html, cw)
    resols = extract('nInfo', html, cw)
    runtime = extract('runtime', html, cw)
    url_api = 'http://www.pandora.tv/external/getExternalApi/getVodUrl/'
    data = {
        'userId': uid,
        'prgId': pid,
        'fid': fid,
        'resolType': resolType,
        'resolArr': ','.join(map(str, resolArr)),
        'vodSvr': vodSvr,
        'resol': max(resols),
        'runtime': runtime,
        'tvbox': 'false',
        'defResol': 'true',
        'embed': 'false',
    }
    session = Session()
    r = session.post(url_api, headers={'Referer': url}, data=data)
    data = json.loads(r.text)
    self._url_video = data['src']
    self.title = soup.find('meta', {'property': 'og:description'})['content']
    ext = get_ext(self._url_video)
    self.filename = format_filename(self.title, pid, ext)
    self.url_thumb = soup.find('meta', {'property': 'og:image'})['content']
    self.thumb = BytesIO()
    downloader.download(self.url_thumb, buffer=self.thumb)
    return self._url_video
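# A sketch of the `extract` helper the pandora.tv reader above assumes (an
# assumption, not the project's actual implementation): pull a JS assignment
# such as `strFid : "...",` out of the page source and JSON-decode the value.
# Uses the standard-library re module (aliased, since the project wraps its
# own `re.find`).
import json
import re as re_

def extract(name, html, cw=None):
    # grab everything after `name :` or `name =` up to the end of the line
    m = re_.search(r'\b{} *[:=] *(.+)'.format(re_.escape(name)), html)
    if m is None:
        return None
    value = m.group(1).strip().rstrip(',;')
    try:
        return json.loads(value)   # numbers, strings, arrays
    except ValueError:
        return value.strip('\'"')  # fall back to a bare or quoted literal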
def read_channel(url_page, cw=None):
    print_ = get_print(cw)
    res = re.find(CHANNEL_PATTERN, url_page)
    if res is None:
        raise Exception('Not a channel')
    header, username = res
    print(header, username)
    max_pid = get_max_range(cw, 2000)
    info = {}
    info['header'] = header
    info['username'] = username
    session = Session()
    urls = []
    urls_set = set()
    for p in range(100):
        url_api = urljoin(url_page, '/{}/{}/videos/best/{}'.format(header, username, p))
        print(url_api)
        r = session.post(url_api, data='main_cats=false')
        soup = Soup(r.text)
        thumbs = soup.findAll('div', class_='thumb-block')
        if not thumbs:
            print_('empty')
            break
        for thumb in thumbs:
            info['name'] = thumb.find('span', class_='name').text.strip()
            href = thumb.find('a')['href']
            href = urljoin(url_page, href)
            if href in urls_set:
                print_('duplicate: {}'.format(href))
                continue
            urls_set.add(href)
            urls.append(href)
        if len(urls) >= max_pid:
            break
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))  # '읽는 중...' = 'Reading...'
        if cw:
            if not cw.alive:
                return
            cw.setTitle(s)
        else:
            print(s)
    if not urls:
        raise Exception('no videos')
    info['urls'] = urls[:max_pid]
    return info
def get_videos(url, cw=None):
    '''Collect video page URLs from a user/pornstar/model/channel/playlist page.'''
    print_ = get_print(cw)
    if '/users/' in url:
        mode = 'users'
        username = url.split('/users/')[1].split('/')[0]
    elif '/pornstar/' in url:
        mode = 'pornstar'
        username = url.split('/pornstar/')[1].split('/')[0]
    elif '/model/' in url:
        mode = 'model'
        username = url.split('/model/')[1].split('/')[0]
    elif '/channels/' in url:
        mode = 'channels'
        username = url.split('/channels/')[1].split('/')[0]
    elif '/playlist/' in url:
        mode = 'playlist'
        username = url.split('/playlist/')[1].split('/')[0]
    else:
        raise Exception('Not supported url')
    username = username.split('?')[0].split('#')[0]

    session = Session()
    domain = utils.domain(url)

    # a pornstar page is "free" if it links to a /videos/upload listing
    if mode in ['pornstar']:
        url_main = 'https://{}/{}/{}'.format(domain, mode, username)
        html = downloader.read_html(url_main, session=session)
        soup = Soup(html)
        soup = fix_soup(soup, url_main, session, cw)
        for a in soup.findAll('a'):
            if '/{}/{}/videos/upload'.format(mode, username) in a.attrs.get('href', ''):
                free = True
                break
        else:
            free = False
        print_('free: {}'.format(free))

    # Range
    max_pid = get_max_range(cw, 500)
    max_pid = min(max_pid, 2000)

    html = downloader.read_html(url, session=session)
    soup = fix_soup(Soup(html), url, session, cw)
    info = {}

    # get title
    h1 = soup.find('h1')
    if h1:
        header = 'Playlist'
        title = h1.find(id='watchPlaylist')
    else:
        title = None
    if not title:
        header = 'Channel'
        profile = soup.find('div', class_='profileUserName')
        wrapper = soup.find('div', class_='titleWrapper')
        bio = soup.find('div', class_='withBio')
        title = soup.find('h1', {'itemprop': 'name'})
        if not title and profile:
            title = profile.a
        if not title and wrapper:
            title = wrapper.h1
        if not title and bio:
            title = bio.h1
    if not title:
        raise Exception('No title')
    #print(title)
    info['title'] = '[{}] {}'.format(header, title.text.strip())
    token = re.find('''token *= *['"](.*?)['"]''', html)
    print_('token: {}'.format(token))

    # get links
    hrefs = []
    fail = 0
    for p in range(1, 1 + 100):
        try:
            if mode in ['users', 'model']:
                if mode == 'users':
                    url_api = 'https://{}/users/{}/videos/public/'\
                              'ajax?o=mr&page={}'.format(domain, username, p)
                elif mode == 'model':
                    url_api = 'https://{}/model/{}/videos/upload/'\
                              'ajax?o=mr&page={}'.format(domain, username, p)
                r = session.post(url_api)
                soup = Soup(r.text)
                if soup.find('h1'):
                    print('break: h1')
                    break
            elif mode in ['pornstar']:
                if free:
                    url_api = 'https://{}/{}/{}/videos/upload'\
                              '?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('div', class_='videoUList')
                else:
                    url_api = 'https://{}/{}/{}?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('ul', class_='pornstarsVideos')
            elif mode in ['channels']:
                url_api = 'https://{}/{}/{}/videos?page={}'.format(domain, mode, username, p)
                soup = downloader.read_soup(url_api, session=session)
                soup = fix_soup(soup, url_api, session, cw)
                try:
                    soup = soup.find('div', {'id': 'channelsBody'}).find('div', class_='rightSide')
                except:
                    break
            elif mode in ['playlist']:
                #url_api = 'https://{}/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(domain, username, len(hrefs))
                if token is None:
                    raise Exception('no token')
                url_api = 'https://{}/playlist/viewChunked?id={}&token={}&page={}'.format(domain, username, token, p)
                soup = downloader.read_soup(url_api, session=session)
            else:
                raise NotImplementedError(mode)
            fail = 0
        except Exception as e:
            print_(e)
            fail += 1
            if fail < 2:
                continue
            else:
                break
        finally:
            print_('{} ({})'.format(url_api, len(hrefs)))
        if cw and not cw.alive:
            return
        lis = soup.findAll('li', class_='videoblock')
        if not lis:
            print_('break: no lis')
            break
        if getattr(soup.find('title'), 'text', '').strip() == 'Page Not Found':
            print_('Page Not Found')
            break
        c = 0
        for li in lis:
            a = li.find('a')
            href = a.attrs['href']
            href = urljoin(url, href)
            if href in hrefs:
                continue
            c += 1
            if href.startswith('javascript:'): # Remove Pornhub Premium
                print(href)
                continue
            hrefs.append(href)
        if c == 0:
            print('c==0')
            break
        print(c) # 1320
        if len(hrefs) >= max_pid:
            break
    if cw:
        hrefs = filter_range(hrefs, cw.range)
    info['hrefs'] = hrefs
    return info
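# Sketch of the `filter_range` helper called at the end of get_videos (an
# assumption about its contract, not the real implementation): keep only the
# hrefs whose 1-based position falls inside the user-selected range, and pass
# everything through when no range is set.
def filter_range(items, range_):
    if not range_:
        return items
    return [item for i, item in enumerate(items, 1) if i in range_]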
def get_imgs(user_id, cw=None):
    print_ = get_print(cw)
    url = 'https://{}.bdsmlr.com/'.format(user_id)
    session = Session()
    info = {'c': 0, 'posts': [], 'ids': set()}
    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    sorry = soup.find('div', class_='sorry')
    if sorry:
        raise Exception(sorry.text.strip())
    username = soup.find('title').text.strip()
    print('username:', username)
    info['username'] = username
    token = soup.find('meta', {'name': 'csrf-token'}).attrs['content']
    print_(u'token: {}'.format(token))
    # foo() (name as in the source) parses the posts on a page into info;
    # it appears to also maintain info['c'], info['last'], and info['ids'].
    foo(url, soup, info)
    max_pid = get_max_range(cw) #, 2000)
    n = len(info['ids'])
    for p in range(1000):
        url_api = 'https://{}.bdsmlr.com/infinitepb2/{}'.format(user_id, user_id)
        data = {
            'scroll': str(info['c']),
            'timenow': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'last': str(info['last']),
        }
        print_(u'n:{}, scroll:{}, last:{}'.format(len(info['posts']), data['scroll'], data['last']))
        headers = {
            'Referer': url,
            'X-CSRF-TOKEN': token,
        }
        # retry the scroll API up to 4 times; raise the last error if all attempts fail
        err = None
        for try_ in range(4):
            try:
                r = session.post(url_api, data=data, headers=headers)
                if p == 0:
                    r.raise_for_status()
                break
            except Exception as e:
                print(e)
                err = e
        else:
            raise err
        soup = Soup(r.text)
        foo(url, soup, info)
        if len(info['ids']) == n:
            print('same; break')
            break
        n = len(info['ids'])
        s = u'{} {} (tumblr_{}) - {}'.format(tr_(u'읽는 중...'), username, user_id, len(info['posts']))  # '읽는 중...' = 'Reading...'
        if cw is not None:
            if not cw.valid or not cw.alive:
                return []
            cw.setTitle(s)
        else:
            print(s)
        if len(info['posts']) > max_pid:
            break
    return info
def _guest_token(headers):
    session = Session()
    r = session.post('https://api.twitter.com/1.1/guest/activate.json', headers=headers)
    data = json.loads(r.text)
    return data['guest_token']
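# Usage sketch (an assumption, not from the source): Twitter's guest endpoints
# expect the activated token back in an 'x-guest-token' header, sent alongside
# the same public client 'authorization' header that was used to request it.
def guest_headers(client_headers):
    headers = dict(client_headers)
    headers['x-guest-token'] = _guest_token(client_headers)
    return headers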