def _real_extract(self, url):
    mobj = re.match(self._VALID_URL, url)
    host = mobj.group('host')
    video_id = mobj.group('id')

    api_json = self._call_api(host, video_id, '', note='Downloading video JSON')

    search_results = api_json.get('search-results', {})
    if 'result' not in search_results:
        raise ExtractorError('Playlist was not found')

    result_list = search_results.get('result', {})
    # A single-episode playlist is returned as a bare dict rather than a list.
    if isinstance(result_list, dict):
        result_list = [result_list]

    entries = []
    for episode in result_list:
        video = episode.get('mediapackage', {})
        entries.append(self._parse_mediapackage(video))

    if not entries:
        raise ExtractorError('Playlist has no entries')

    playlist_title = entries[0].get('series')
    return self.playlist_result(
        entries, playlist_id=video_id, playlist_title=playlist_title)

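# A minimal sketch of the search-results JSON shape the extractor above
# assumes (values are hypothetical; the real search API returns many more
# fields per mediapackage):
EXAMPLE_API_JSON = {
    'search-results': {
        'result': [  # may also be a single dict when only one episode matches
            {'mediapackage': {'title': 'Lecture 1'}},
            {'mediapackage': {'title': 'Lecture 2'}},
        ],
    },
}
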
def _verify_video_password(self, url, video_id, webpage):
    password = self._downloader.params.get('videopassword')
    if password is None:
        raise ExtractorError(
            'This video is protected by a password, use the --video-password option',
            expected=True)
    meetId = self._search_regex(
        r'<input[^>]+?id="meetId" value="([^"]+)"', webpage, 'meetId')
    data = urlencode_postdata({
        'id': meetId,
        'passwd': password,
        'action': 'viewdetailedpage',
        'recaptcha': '',
    })
    validation_url = url.split('zoom.us')[0] + 'zoom.us/rec/validate_meet_passwd'
    validation_response = self._download_json(
        validation_url, video_id,
        note='Validating Password...',
        errnote='Wrong password?', data=data)
    if validation_response['errorCode'] != 0:
        raise ExtractorError(
            'Login failed, %s said: %r'
            % (self.IE_NAME, validation_response['errorMessage']))

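# For illustration: urlencode_postdata produces a form-encoded byte string,
# roughly urllib.parse.urlencode(...).encode('ascii'). A standalone sketch of
# the body posted above (the meetId and password values are hypothetical):
from urllib.parse import urlencode

example_body = urlencode({
    'id': '1234567890',           # meetId scraped from the page
    'passwd': 'hunter2',          # value of --video-password
    'action': 'viewdetailedpage',
    'recaptcha': '',
}).encode('ascii')
# b'id=1234567890&passwd=hunter2&action=viewdetailedpage&recaptcha='
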
def _real_extract(self, url):
    # _get_object_from_url already unwraps the 'result' payload.
    obj = self._get_object_from_url(url)
    typ = obj['type']
    if typ == u't':
        return self._extract_track(obj)
    elif typ not in (u'a', u'p'):
        raise ExtractorError(u"Unknown object type: `{0}'".format(typ))

    # Deal with playlists and albums.
    tracks = obj.get('tracks', [])
    # XXX sometimes the result is a list, sometimes it's a dict with an
    # 'items' key.
    if isinstance(tracks, dict):
        tracks = tracks.get('items', [])

    entries = [
        self.url_result(t['shortUrl'], video_id=t['key'])
        for t in tracks
    ]
    return self.playlist_result(entries, obj['key'], obj['name'])

def _real_extract(self, url):
    mobj = re.match(self._VALID_URL, url)
    video_id = mobj.group('id')

    webpage = self._download_webpage(url, video_id, 'Downloading page')

    mobj = re.search(
        r'(?m)fo\.addVariable\("file",\s"(?P<fileid>[\da-z]+)"\);\n'
        r'\s+fo\.addVariable\("s",\s"(?P<serverid>\d+)"\);', webpage)
    if mobj is None:
        raise ExtractorError('Video %s does not exist' % video_id, expected=True)

    file_id = mobj.group('fileid')
    server_id = mobj.group('serverid')

    KEYWORDS_SUFFIX = ', Video, images, photos, videos, myspace, ebay, video hosting, photo hosting'
    keywords = self._html_search_meta('keywords', webpage, 'title')
    title = keywords[:-len(KEYWORDS_SUFFIX)] if keywords.endswith(KEYWORDS_SUFFIX) else ''

    video_url = 'http://v%s.tinypic.com/%s.flv' % (server_id, file_id)
    thumbnail = 'http://v%s.tinypic.com/%s_th.jpg' % (server_id, file_id)

    return {
        'id': file_id,
        'url': video_url,
        'thumbnail': thumbnail,
        'title': title,
    }

def _real_initialize(self):
    username = self._downloader.params.get('username')
    password = self._downloader.params.get('password')
    if not (username and password):
        raise ExtractorError(u"Please specify your Rdio credentials")

    storage = storage_load()
    user_state = storage.load(username)

    self.rdio = RdioSession()
    if user_state:
        self.rdio._authorization_key = user_state.get('authorization_key')
        cookies = user_state.get('cookies', {})
        self.rdio.cookies = requests.cookies.cookiejar_from_dict(cookies)

    if not self.rdio._authorization_key:
        self.rdio.sign_in(username, password)
        storage.save(username, {
            'cookies': dict(self.rdio.cookies),
            'authorization_key': self.rdio._authorization_key,
        })

def _real_extract(self, url):
    mobj = re.match(self._VALID_URL, url)
    host = mobj.group('host')
    video_id = mobj.group('id')

    api_json = self._call_api(host, video_id, '', note='Downloading video JSON')

    search_results = api_json.get('search-results', {})
    if 'result' not in search_results:
        raise ExtractorError('Video was not found')

    result_dict = search_results.get('result', {})
    if not isinstance(result_dict, dict):
        raise ExtractorError('More than one video was unexpectedly returned.')

    video = result_dict.get('mediapackage', {})
    return self._parse_mediapackage(video)

def _real_extract(self, url):
    video_id = self._match_id(url)
    clean_url = re.search(self._VALID_URL, url).group(0)
    webpage = self._download_webpage(clean_url, video_id)

    pin_info_json = self._search_regex(
        r'<script id="initial-state" type="application/json">(.+?)</script>',
        webpage, "Pin data JSON")
    pin_info_full = json.loads(pin_info_json)
    pin_info = next(
        (r for r in pin_info_full["resourceResponses"] if r["name"] == "PinResource"),
        None)
    if not pin_info:
        raise ExtractorError("Can't find Pin data")

    pin_data = pin_info["response"]["data"]
    video_urls = pin_data.get("videos", {}).get("video_list", {})
    video_data = video_urls.get("V_HLSV4") or {}
    video_url = video_data.get("url")
    video_thumb = video_data.get("thumbnail")
    if not video_url:
        raise ExtractorError("Can't find a video stream URL")

    title = (pin_data.get("title") or "").strip() or "pinterest_video"
    pinner = pin_data.get("pinner", {})
    uploader = pinner.get("full_name") or pinner.get("username")

    return {
        "id": video_id,
        "title": title,
        "description": self._og_search_description(webpage),
        "uploader": uploader,
        "url": video_url,
        "ext": "mp4",
        "manifest_url": video_url,
        "thumbnail": video_thumb,
    }

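# A minimal sketch of the initial-state JSON the extractor above walks
# (values are hypothetical; real responses carry many more resources and
# fields):
EXAMPLE_INITIAL_STATE = {
    "resourceResponses": [
        {
            "name": "PinResource",
            "response": {
                "data": {
                    "title": "Example pin",
                    "pinner": {"full_name": "Jane Doe", "username": "jdoe"},
                    "videos": {
                        "video_list": {
                            "V_HLSV4": {
                                "url": "https://example.invalid/video.m3u8",
                                "thumbnail": "https://example.invalid/thumb.jpg",
                            },
                        },
                    },
                },
            },
        },
    ],
}
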
def test_create__extract_failed__fail(self, podcast, episode_data, user, mocked_youtube, dbs):
    ydl_error = ExtractorError("Something went wrong here", video_id=episode_data["source_id"])
    mocked_youtube.extract_info.side_effect = ydl_error
    episode_creator = EpisodeCreator(
        dbs,
        podcast_id=podcast.id,
        source_url=episode_data["watch_url"],
        user_id=user.id,
    )
    with pytest.raises(YoutubeFetchError) as error:
        await_(episode_creator.create())

    assert error.value.details == f"Extracting data for new Episode failed: {ydl_error}"

def _get_object_from_url(self, url):
    """Get an object (track, album, playlist) from the given URL."""
    result = self.rdio.api_call(
        'getObjectFromUrl', url=url, extras=['tracks'], referer=url)
    result = result.json()
    if not result.get('result'):
        raise ExtractorError(result.get('message', u"Unknown error"))
    return result.get('result')

def test_create__same_episode__extract_failed__ok(
    self, podcast, episode, user, mocked_youtube, dbs
):
    mocked_youtube.extract_info.side_effect = ExtractorError("Something went wrong here")
    new_podcast = await_(Podcast.async_create(dbs, **get_podcast_data()))
    episode_creator = EpisodeCreator(
        dbs,
        podcast_id=new_podcast.id,
        source_url=episode.watch_url,
        user_id=user.id,
    )
    new_episode: Episode = await_(episode_creator.create())
    assert new_episode is not None
    assert new_episode.id != episode.id
    assert new_episode.source_id == episode.source_id
    assert new_episode.watch_url == episode.watch_url

def _real_extract(self, url):
    video_id = self._match_id(url)
    formats = [{
        'format_id': 'default',
        'url': 'url:',
    }]
    if video_id == '0':
        raise ExtractorError('foo')
    if video_id == '2':
        formats.append({
            'format_id': 'extra',
            'url': TEST_URL,
        })
    return {
        'id': video_id,
        'title': 'Video %s' % video_id,
        'formats': formats,
    }

def _get_playback_info_through_http(self, key, type=u'mp3-high'):
    player_name = '_web_{0}'.format(random_player_id())
    playback_info = self.rdio.api_call(
        'getPlaybackInfo',
        key=key,
        manualPlay=False,
        playerName=player_name,
        requiresUnlimited=False,
        finishedAd=False,
        type=type)
    playback_info = playback_info.json()
    if not playback_info.get('result'):
        reason = playback_info.get('message', u"Unknown error")
        raise ExtractorError(
            u"Failed to get playback information: `{0}'".format(reason))
    return dict(url=playback_info['result']['surl'])

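# For reference, a sketch of the envelope the Rdio helpers above expect:
# a truthy 'result' on success, a 'message' on failure (values hypothetical).
EXAMPLE_PLAYBACK_OK = {'result': {'surl': 'rtmp://example.invalid/stream'}}
EXAMPLE_PLAYBACK_ERR = {'message': u"Track not streamable"}
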
def _call_api(self, path, video_id, query={}):
    headers = {
        'Authorization': AUTH,
    }
    if 'auth_token' in self._downloader.cookiejar.get_dict('.twitter.com'):
        ct0 = self._downloader.cookiejar._cookies.get('.twitter.com', {}).get('/', {}).get('ct0')
        if ct0 is None or ct0.is_expired():
            # The csrf cookie is missing or stale; clear the jar and retry
            # as a guest.
            self.report_warning('Expired cookies')
            self._downloader.cookiejar.clear()
            return self._call_api(path, video_id, query)
        headers['x-twitter-auth-type'] = 'OAuth2Session'
        headers['x-csrf-token'] = ct0.value
    else:
        if not self._GUEST_TOKEN:
            self._GUEST_TOKEN = self._download_json(
                self._API_BASE + 'guest/activate.json', video_id,
                'Downloading guest token', data=b'',
                headers=headers)['guest_token']
        headers['x-guest-token'] = self._GUEST_TOKEN
    try:
        return self._download_json(
            self._API_BASE + path, video_id, headers=headers, query=query)
    except ExtractorError as e:
        if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
            raise ExtractorError(self._parse_json(
                e.cause.read().decode(), video_id)['errors'][0]['message'],
                expected=True)
        raise

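# For reference, minimal sketches of the two JSON shapes the method above
# relies on (values hypothetical):
EXAMPLE_GUEST_ACTIVATE = {'guest_token': '1234567890'}   # from guest/activate.json
EXAMPLE_403_PAYLOAD = {'errors': [{'message': 'Forbidden.'}]}
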
def error(msg):
    if "This video is no longer available" in msg:
        raise NotAvailableException("notavailable")
    raise ExtractorError("Video downloading failed.")

def _real_extract(self, url):
    video_id = self._match_id(url)
    webpage = self._download_webpage(
        'https://m.tiktok.com/v/%s.html' % video_id, video_id)

    # The webpage embeds a JSON blob in a <script id="__NEXT_DATA__"> tag.
    # The JSON holds all the metadata, so fetch that out.
    json_string = self._html_search_regex(
        r'<script\s+id="__NEXT_DATA__"[^>]*>(.*?)</script>',
        webpage, 'next_data')
    json_data = self._parse_json(json_string, video_id)
    video_data = try_get(json_data, lambda x: x['props']['pageProps'], expected_type=dict)

    # The watermarkless video ID is embedded in the first video file, so it
    # would have to be downloaded first to recover that ID.
    watermarked_url = video_data['videoData']['itemInfos']['video']['urls'][0]
    # watermarked_response = self._download_webpage(watermarked_url, video_id)
    # idpos = watermarked_response.index("vid:")
    # watermarkless_video_id = watermarked_response[idpos + 4:idpos + 36]
    # watermarkless_url = "https://api2-16-h2.musical.ly/aweme/v1/play/?video_id={}&vr_type=0&is_play_url=1&source=PackSourceEnum_PUBLISH&media_type=4".format(watermarkless_video_id)
    watermarkless_url = watermarked_url

    # Get extra metadata.
    video_info = try_get(video_data, lambda x: x['videoData']['itemInfos'], dict)
    author_info = try_get(video_data, lambda x: x['videoData']['authorInfos'], dict)
    share_info = try_get(video_data, lambda x: x['shareMeta'], dict)

    unique_id = str_or_none(author_info.get('uniqueId'))
    timestamp = try_get(video_info, lambda x: int(x['createTime']), int)
    height = try_get(video_info, lambda x: x['video']['videoMeta']['height'], int)
    width = try_get(video_info, lambda x: x['video']['videoMeta']['width'], int)

    thumbnails = [{
        'url': video_info.get('thumbnail') or self._og_search_thumbnail(webpage),
        'width': width,
        'height': height,
    }]

    formats = [{
        'url': watermarkless_url,
        'ext': 'mp4',
        'height': height,
        'width': width,
    }]

    if video_data.get('statusCode') != 0:
        raise ExtractorError('Video not available', video_id=video_id)

    return {
        'id': video_id,
        'title': self._og_search_title(webpage),
        'description': str_or_none(video_info.get('text')) or str_or_none(share_info.get('desc')),
        'comment_count': int_or_none(video_info.get('commentCount')),
        'duration': try_get(video_info, lambda x: x['video']['videoMeta']['duration'], int),
        'height': height,
        'like_count': int_or_none(video_info.get('diggCount')),
        'repost_count': int_or_none(video_info.get('shareCount')),
        'thumbnail': try_get(video_info, lambda x: x['covers'][0], str),
        'timestamp': timestamp,
        'width': width,
        'creator': str_or_none(author_info.get('nickName')),
        'uploader': unique_id,
        'uploader_id': str_or_none(author_info.get('userId')),
        'uploader_url': 'https://www.tiktok.com/@' + unique_id,
        'thumbnails': thumbnails,
        'webpage_url': self._og_search_url(webpage),
        'ext': 'mp4',
        'formats': formats,
        'http_headers': {
            'User-Agent': 'okhttp',
        },
    }

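# A minimal sketch of the __NEXT_DATA__ JSON paths the extractor above reads
# (values hypothetical; the real blob is much larger):
EXAMPLE_NEXT_DATA = {
    'props': {
        'pageProps': {
            'statusCode': 0,
            'videoData': {
                'itemInfos': {
                    'text': 'example caption',
                    'createTime': '1600000000',
                    'covers': ['https://example.invalid/cover.jpg'],
                    'video': {
                        'urls': ['https://example.invalid/video.mp4'],
                        'videoMeta': {'width': 576, 'height': 1024, 'duration': 15},
                    },
                },
                'authorInfos': {
                    'uniqueId': 'example',
                    'nickName': 'Example User',
                    'userId': '123456',
                },
            },
            'shareMeta': {'desc': 'example share text'},
        },
    },
}
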
def report_warning(self, message):
    # Don't accept warnings during tests
    raise ExtractorError(message)

ie = TestIE(FakeYDL({'verbose': False}))
script_id = 'mastodon'

results = set()


def sanitize_hostname(hostname):
    # trim trailing slashes
    hostname = re.sub(r'[/\\]+$', '', hostname)
    # trim port number
    hostname = re.sub(r':\d+$', '', hostname)
    return hostname


instance_social_api_key = os.environ.get('INSTANCE_SOCIAL_API_SECRET')
if not instance_social_api_key:
    raise ExtractorError('You must set INSTANCE_SOCIAL_API_SECRET for this script to work')

min_id = None
while True:
    url = 'https://instances.social/api/1.0/instances/list'
    if min_id:
        url = f'{url}?min_id={min_id}'
    data = ie._download_json(
        url, script_id,
        note=f'Paging {min_id}, len(results)={len(results)}',
        headers={'Authorization': f'Bearer {instance_social_api_key}'})
    for instance in data['instances']:
        results.add(sanitize_hostname(instance['name']))
    min_id = data['pagination'].get('next_id')
    if not min_id:
        break

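# A sketch of the paged response shape the loop above consumes
# (values hypothetical):
EXAMPLE_PAGE = {
    'instances': [
        {'name': 'mastodon.example:443/'},
    ],
    'pagination': {'next_id': 'abc123'},  # absent on the last page
}
# sanitize_hostname('mastodon.example:443/') == 'mastodon.example'
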
            script_id,
            note='Scraping https://the-federation.info/peertube, len(results)=%d' % len(results),
            headers={
                'content-type': 'application/json, application/graphql',
                'accept': 'application/json, application/graphql',
            })
        for instance in data['data']['nodes']:
            results.add(sanitize_hostname(instance['host']))
        break
    except BaseException:
        continue

if not results:
    raise ExtractorError('no instances found')

results = {x.encode('idna').decode('utf8') for x in results}
ie.to_screen('%s: converted domain names to punycode, len(results)=%d' % (script_id, len(results)))
results = {x for x in results if '.' in x}
ie.to_screen('%s: excluded domain names without dot, len(results)=%d' % (script_id, len(results)))
results = {
    x for x in results
    if not (x.endswith('.ngrok.io')
            or x.endswith('.localhost.run')
            or x.endswith('.serveo.net'))
}