def get_videos(url, cw=None):
    print_ = get_print(cw)
    info = {}
    user_id = re.find(r'twitch.tv/([^/?]+)', url, err='no user_id')
    print(user_id)
    session = Session()

    # Pull the GQL request headers out of the channel page itself.
    r = session.get(url)
    s = cut_pair(re.find(r'headers *: *({.*)', r.text, err='no headers'))
    print(s)
    headers = json_loads(s)

    payload = [{
        'operationName': 'ClipsCards__User',
        'variables': {
            'login': user_id,
            'limit': 20,
            'criteria': {
                'filter': 'ALL_TIME'
                }
            },
        'extensions': {
            'persistedQuery': {
                'version': 1,
                'sha256Hash': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777'
                }
            },
        }]

    # Page through the clips with the GraphQL cursor until it stops advancing.
    videos = []
    cursor = None
    cursor_new = None
    while True:
        if cursor:
            payload[0]['variables']['cursor'] = cursor
        r = session.post('https://gql.twitch.tv/gql', json=payload, headers=headers)
        #print(r)
        data = r.json()
        for edge in data[0]['data']['user']['clips']['edges']:
            url_video = edge['node']['url']
            info['name'] = edge['node']['broadcaster']['displayName']
            video = Video(url_video)
            video.id = int(edge['node']['id'])
            videos.append(video)
            cursor_new = edge['cursor']
        print_('videos: {} / cursor: {}'.format(len(videos), cursor))
        if cursor == cursor_new:
            print_('same cursor')
            break
        if cursor_new is None:
            break
        cursor = cursor_new

    if not videos:
        raise Exception('no videos')
    info['videos'] = sorted(videos, key=lambda video: video.id, reverse=True)
    return info
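# Note: the `re.find(pattern, string, err=...)` calls in these functions are a
# project helper exposed through a patched `re`-like module, not the stdlib
# `re` (stdlib `re` has no `find` and no `err` keyword). A minimal standalone
# sketch of the assumed behavior; the name and exact signature are
# assumptions, not the project's real implementation:
import re

def find(pattern, string, err=None):
    # Return the first capture group (or the whole match) of the first hit.
    m = re.search(pattern, string)
    if m:
        return m.group(1) if m.groups() else m.group(0)
    # `err` turns a missing match into a hard failure with a readable message.
    if err is not None:
        raise Exception(err)
    return None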
def get_sd(url, session=None, html=None, cw=None, wait=True):
    print_ = get_print(cw)

    # Parse the given html if provided; otherwise fetch the page, retrying
    # up to 4 times until a script containing _sharedData is found.
    if html:
        soup = Soup(html)
        check_error(soup, cw, wait)
        for script in soup.findAll('script'):
            j = get_j(script)
            if j:
                break
        else:
            raise Exception('no _sharedData!!')
    else:
        for try_ in range(4):
            _wait(cw)
            html = read_html(url, session, cw)
            soup = Soup(html)
            check_error(soup, cw, wait)
            for script in soup.findAll('script'):
                j = get_j(script)
                if j:
                    break
            else:
                continue
            break
        else:
            raise Exception('no _sharedData')

    # Merge any lazily loaded payload into the page data.
    for script in soup.findAll('script'):
        s = script.string
        if s and 'window.__additionalDataLoaded(' in s:
            s = cut_pair(s)
            j_add = json.loads(s)
            try:
                j['entry_data']['PostPage'][0].update(j_add)
            except:
                j['entry_data']['ProfilePage'][0].update(j_add) #2900

    # Challenge
    challenge = j['entry_data'].get('Challenge')
    if challenge:
        try:
            for cont in challenge[0]['extraData']['content']:
                title = cont.get('title')
                if title:
                    break
            else:
                raise Exception('no title')
        except:
            title = 'Err'
        raise errors.LoginRequired(title)

    # LoginAndSignupPage
    login = j['entry_data'].get('LoginAndSignupPage')
    if login:
        raise errors.LoginRequired()

    return j
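# `get_j` above is assumed to extract Instagram's `window._sharedData = {...};`
# JSON from a <script> tag, returning None when the tag holds something else.
# A minimal sketch under that assumption (the real helper may validate more);
# it leans on the cut_pair helper sketched further below:
import json

def get_j(script):
    s = script.string
    if not s or 'window._sharedData' not in s:
        return None
    # Take everything after the assignment and balance the braces.
    return json.loads(cut_pair(s.split('window._sharedData')[1]))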
def get_info(url, html):
    soup = Soup(html)
    info = {}
    uname = soup.find('div', class_='user-name') or soup.find('p', class_='uname') or soup.find('div', class_='user-info-name')
    info['artist'] = uname.text.strip()
    s = cut_pair(html.split('window.__ssr_data = JSON.parse("')[1])
    j = json.loads(json.loads(u'"{}"'.format(s)))
    if '/detail/' in url:
        info['uid'] = j['detail']['detail_user']['uid']
        info['id'] = j['detail']['post_data']['item_id']
    else:
        info['uid'] = j['homeInfo']['uid']
    return info
def get_info(url):
    url = url.replace('/gallery/', '/a/')
    if '/r/' in url and url.split('/r/')[1].strip('/').count('/') == 0:
        title = re.find(r'/r/([^/]+)', url)
        info = {}
        info['title'] = title
        info['type'] = 'r'
    else:
        try: # legacy
            html = downloader.read_html(url, cookies={'over18': '1'})
            s = re.find('image *: *({.+)', html)
            info_raw = cut_pair(s)
        except Exception as e: # new
            print(e)
            id_ = re.find(r'/a/([0-9a-zA-Z_]+)', url) or re.find(r'/r/[0-9a-zA-Z_]+/([0-9a-zA-Z_]+)', url, err='no id')
            url_api = 'https://api.imgur.com/post/v1/albums/{}?client_id=546c25a59c58ad7&include=media%2Cadconfig%2Caccount'.format(id_)
            info_raw = downloader.read_html(url_api, cookies={'over18': '1'})
        info = json.loads(info_raw)
        info['type'] = 'a'
    return info
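# Usage sketch for the imgur get_info above; the album hash is illustrative,
# not a real post. Bare subreddit URLs take the 'r' branch; everything else is
# normalized to an album ('a') and parsed from the page markup or, failing
# that, from the api.imgur.com fallback.
info = get_info('https://imgur.com/a/XXXXXXX')
print(info['type'], info.get('title'))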
def get_imgs(url, html=None, cw=None):
    if '/detail/' not in url:
        return get_imgs_channel(url, html, cw)
    if html is None:
        html = downloader.read_html(url)
    s = cut_pair(html.split('window.__ssr_data = JSON.parse("')[1])
    s = json.loads(u'"{}"'.format(s))
    data = json.loads(s)
    multi = data['detail']['post_data']['multi']
    imgs = []
    for m in multi:
        path = m['original_path']
        img = json.loads(u'"{}"'.format(path))
        img = Image_single(img, url, len(imgs))
        imgs.append(img)
    return imgs
def get_data(html):
    data_raw = cut_pair(re.find('window.initials *= *(.+)', html))
    return json.loads(data_raw)
def get_pages(html):
    s = re.find(r'__INITIAL_STATE__=(.+)', html)
    data_raw = cut_pair(s)
    data = json.loads(data_raw)
    pages = data['videoData']['pages']
    return pages
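# `cut_pair` is used throughout these functions to isolate a balanced {...}
# or [...] literal from surrounding script text before handing it to
# json.loads. A minimal sketch of the assumed behavior (brackets inside
# string literals are not handled here; the project's real helper may be
# more careful):
def cut_pair(s, pair='{}'):
    open_, close = pair[0], pair[1]
    start = s.index(open_)
    depth = 0
    for i, c in enumerate(s[start:], start):
        if c == open_:
            depth += 1
        elif c == close:
            depth -= 1
            if depth == 0:
                # Return the balanced span, opening bracket included.
                return s[start:i + 1]
    raise ValueError('unbalanced {!r} pair'.format(pair))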
def get_imgs_page_legacy(page, session, cw=None, depth=0):
    if cw is not None and not cw.alive:
        return
    print_ = get_print(cw)
    try:
        html = read_html(page.url, session)
    except Exception as e:
        print_('get_imgs_page_legacy error: {}'.format(e))
        if e.args and e.args[0] == 502:
            return []
        raise
    if isProtected(html):
        data = get_soup(page.url, cw=cw, session=session)
        page.url = data['url']
        html = data['html']
    soup = Soup(html, 'html5lib') # 1653

    # skip empty pages
    if not html:
        print_(u'empty page: {}'.format(page.title))
        return []

    # skip invalid pages
    err = soup.find('span', class_='cf-error-code')
    if err:
        print_(u'cf-error-code: {} ({})'.format(err.text.strip(), page.title))
        if depth > 0:
            return []
        else:
            return get_imgs_page_legacy(page, session, cw, depth + 1)

    #page.title = get_title_page(soup)

    # Extract the two image lists embedded as JS arrays.
    matches = re.findall('var img_list *= *(.+?]);', html.replace('\n', ''))
    matches1 = re.findall('var img_list1 *= *(.+?]);', html.replace('\n', ''))
    img_list = json.loads(matches[0]) if matches else []
    img_list1 = json.loads(matches1[0]) if matches1 else []

    # 1780
    img_list = [img for img in img_list if img]
    img_list1 = [img for img in img_list1 if img]

    # 1589
    '''
    if not img_list and not img_list1:
        print_((u'no imgs; retry... {}').format(page.title))
        raise Exception('No images')
    '''

    # Locate the script that defines img_list to read its parameters.
    for script in soup.findAll('script'):
        script = script.text
        if 'var img_list =' in script:
            break
    else:
        raise Exception('No script')
    seed = int(re.find('view_cnt *= *([0-9]+)', script))
    chapter = int(re.find('var +chapter *= *([0-9]+)', script))
    try:
        cdn_domains = cut_pair(re.find('var +cdn_domains *= *(.+)', script), '[]')
        cdn_domains = json.loads(cdn_domains)
    except Exception as e:
        print(e)
        cdn_domains = []

    # Pad the shorter list so the two can be zipped pairwise.
    n = max(len(img_list), len(img_list1))
    img_list += [''] * (n - len(img_list))
    img_list1 += [''] * (n - len(img_list1))

    print_(u'{} chapter:{} seed:{} domains:{}'.format(page.title, chapter, seed, len(cdn_domains)))
    if seed != 0:
        return 'seed'

    imgs = []
    for p, (img, img1) in enumerate(zip(img_list, img_list1)):
        # fix img url
        img = fix_img_url(img, cdn_domains, chapter, p)
        img1 = fix_img_url(img1, cdn_domains, chapter, p)
        img = urljoin(page.url, img) if img else ''
        img1 = urljoin(page.url, img1) if img1 else ''

        # most likely googledrive
        if img.strip('/').count('/') == 2: #1425
            continue
        img = Image(img, page, p, img1)
        imgs.append(img)

    return imgs
def get_ssr_data(html):
    # Unescape the quotes so cut_pair can balance the braces, then re-escape
    # and peel the doubled encoding (a JSON object inside a JSON string).
    s = html.split('window.__ssr_data = JSON.parse("')[1].replace('\\"', '"')
    s = cut_pair(s).replace('"', '\\"')
    data = json.loads(json.loads('"{}"'.format(s)))
    return data
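# The page embeds its payload as window.__ssr_data = JSON.parse("{\"k\": ...}"),
# i.e. a JSON object serialized once more as a JSON string literal. The
# unescape/re-escape dance lets cut_pair balance the braces, and the double
# json.loads undoes both encoding layers. A self-contained illustration using
# the helpers above (the HTML snippet below is made up):
html = 'window.__ssr_data = JSON.parse("{\\"detail\\": {\\"post_data\\": {\\"multi\\": []}}}");'
data = get_ssr_data(html)
assert data['detail']['post_data']['multi'] == []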