def test_get_video_data(self):
    """
    Feeds the stored ``youtube/video_info.txt`` payload through the loader
    and checks both the set of returned keys and the exact parsed values.
    """
    response = self.get_response(
        self.get_data_file('youtube/video_info.txt').read())
    data = self.loader.get_video_data(response)
    # The loader must populate exactly the fields it advertises.
    self.assertEqual(set(data), self.loader.fields)

    # Every stream URL in the fixture carries the same expire value.
    expires = datetime.datetime(2012, 12, 25, 6, 46, 1)
    expected_files = [
        VideoFile(
            url='http://r10---sn-nx57yn7k.c.youtube.com/videoplayback?upn=p0pWpkfxwq4&sparams=cp%2Cgcr%2Cid%2Cip%2Cipbits%2Citag%2Cratebypass%2Csource%2Cupn%2Cexpire&fexp=919330%2C916611%2C920704%2C912806%2C928001%2C922403%2C922405%2C929901%2C913605%2C929104%2C913546%2C913556%2C908496%2C920201%2C913302%2C919009%2C911116%2C901451%2C902556&key=yt1&expire=1356417961&source=youtube&ipbits=8&itag=18&gcr=us&sver=3&signature=7D1D4A9CF0626C3B2A10F6567390165729FA00B8.89C64C08C8B1CA689E4CE21169531E34A56761C1&ratebypass=yes&mt=1356395169&mv=m&ms=au&ip=74.61.34.250&cp=U0hUS1RMVV9LS0NONF9MRllKOmZQN1JFU3lOX2Js&id=27f0d5f5bd31eefe',
            width=640,
            height=360,
            expires=expires,
            mime_type=u'video/mp4'),
        VideoFile(
            url='http://r15---sn-nx57yn7r.c.youtube.com/videoplayback?upn=p0pWpkfxwq4&sparams=algorithm%2Cburst%2Ccp%2Cfactor%2Cgcr%2Cid%2Cip%2Cipbits%2Citag%2Csource%2Cupn%2Cexpire&fexp=919330%2C916611%2C920704%2C912806%2C928001%2C922403%2C922405%2C929901%2C913605%2C929104%2C913546%2C913556%2C908496%2C920201%2C913302%2C919009%2C911116%2C901451%2C902556&key=yt1&algorithm=throttle-factor&itag=34&ipbits=8&burst=40&gcr=us&sver=3&signature=9DAA96CCF4CA172A92C78907B0ECE241BAFB07D6.09AF4B0F7C6558D1B9B184A5F60232EDC296FF95&mv=m&mt=1356395169&ip=74.61.34.250&expire=1356417961&source=youtube&ms=au&factor=1.25&cp=U0hUS1RMVV9LS0NONF9MRllKOmZQN1JFU3lOX2Js&id=27f0d5f5bd31eefe',
            width=640,
            height=360,
            expires=expires,
            mime_type=u'video/x-flv'),
        VideoFile(
            url='http://r2---sn-nx57yn7d.c.youtube.com/videoplayback?upn=p0pWpkfxwq4&sparams=algorithm%2Cburst%2Ccp%2Cfactor%2Cgcr%2Cid%2Cip%2Cipbits%2Citag%2Csource%2Cupn%2Cexpire&fexp=919330%2C916611%2C920704%2C912806%2C928001%2C922403%2C922405%2C929901%2C913605%2C929104%2C913546%2C913556%2C908496%2C920201%2C913302%2C919009%2C911116%2C901451%2C902556&key=yt1&algorithm=throttle-factor&itag=5&ipbits=8&burst=40&gcr=us&sver=3&signature=A909DD7537DD821A0F5AE14FF1E83C87DD7C4EDF.4FCE8516F456B96DD45DB35A7AB8624A1A45498F&mv=m&mt=1356395169&ip=74.61.34.250&expire=1356417961&source=youtube&ms=au&factor=1.25&cp=U0hUS1RMVV9LS0NONF9MRllKOmZQN1JFU3lOX2Js&id=27f0d5f5bd31eefe',
            width=400,
            height=240,
            expires=expires,
            mime_type=u'video/x-flv'),
        VideoFile(
            url='http://r11---sn-nx57yn76.c.youtube.com/videoplayback?upn=p0pWpkfxwq4&sparams=cp%2Cgcr%2Cid%2Cip%2Cipbits%2Citag%2Cratebypass%2Csource%2Cupn%2Cexpire&fexp=919330%2C916611%2C920704%2C912806%2C928001%2C922403%2C922405%2C929901%2C913605%2C929104%2C913546%2C913556%2C908496%2C920201%2C913302%2C919009%2C911116%2C901451%2C902556&key=yt1&expire=1356417961&source=youtube&ipbits=8&itag=43&gcr=us&sver=3&signature=7D0403EBD72E0A28695B35F7BD542A88BC9FE4FA.C4D2BFDD22BBFDC002F8C67CF66363BCC93501EC&ratebypass=yes&mt=1356395169&mv=m&ms=au&ip=74.61.34.250&cp=U0hUS1RMVV9LS0NONF9MRllKOmZQN1JFU3lOX2Js&id=27f0d5f5bd31eefe',
            width=640,
            height=360,
            expires=expires,
            mime_type=u'video/webm'),
    ]
    expected_data = {
        'title': u'CaramellDansen (Full Version + Lyrics)',
        'thumbnail_url': 'http://i3.ytimg.com/vi/J_DV9b0x7v4/hqdefault.jpg',
        'tags': [
            u'caramell', u'dance', u'dansen', u'hip', u'hop',
            u's\xfcchtig', u'geil', u'cool', u'lustig', u'manga',
            u'schweden', u'anime', u'musik', u'music', u'funny',
            u'caramelldansen', u'U-U-U-Aua', u'Dance',
        ],
        'files': expected_files,
    }
    self.assertDictEqual(data, expected_data)
def test_serialize__files(self):
    """
    A video that carries file records must survive a round trip through
    serialize() and deserialize().
    """
    video = Video("http://www.youtube.com/watch?v=J_DV9b0x7v4")
    timestamp = datetime.datetime.now()
    flv_file = VideoFile(
        url='http://google.com',
        expires=timestamp,
        length=100,
        width=50,
        height=50,
        mime_type="video/x-flv")
    flash_file = VideoFile(
        url='http://xkcd.com',
        expires=timestamp,
        length=75,
        width=80,
        height=80,
        mime_type="application/x-shockwave-flash")
    video.files = [flv_file, flash_file]

    data = video.serialize()
    serialized_files = data['files']

    # Spot-check the serialized form: file order is kept and the expiry
    # is emitted as an ISO-8601 string.
    self.assertEqual(serialized_files[0]['url'], "http://google.com")
    self.assertEqual(serialized_files[1]['mime_type'],
                     "application/x-shockwave-flash")
    self.assertEqual(serialized_files[0]['expires'],
                     timestamp.isoformat())

    # Rebuilding a video from the serialized data must give equal items.
    new_video = Video.deserialize(data)
    self.assertEqual(dict(video.items()), dict(new_video.items()))
def test_get_file__no_mimetypes(self):
    """
    When no file declares a mime type, get_file() should simply return
    whichever file comes first in the list.
    """
    video = Video("http://www.youtube.com/watch?v=J_DV9b0x7v4")
    candidates = [
        VideoFile(url='http://google.com'),
        VideoFile(url='http://xkcd.com'),
        VideoFile(url='http://example.com'),
    ]

    video.files = list(candidates)
    self.assertEqual(video.get_file(), candidates[0])

    # Reversing the order must change which file is picked.
    video.files = list(reversed(candidates))
    self.assertEqual(video.get_file(), candidates[2])
def test_get_file__open(self):
    """
    get_file() should pick an open video format ahead of proprietary
    ones, wherever it sits in the list.
    """
    video = Video("http://www.youtube.com/watch?v=J_DV9b0x7v4")
    ogg_file = VideoFile(url='http://google.com', mime_type="video/ogg")
    flash_file = VideoFile(url='http://xkcd.com',
                           mime_type="application/x-shockwave-flash")
    mp4_file = VideoFile(url='http://example.com', mime_type="video/mp4")

    # Open format listed first.
    video.files = [ogg_file, flash_file, mp4_file]
    self.assertEqual(video.get_file(), ogg_file)

    # Open format listed last: it must still be chosen.
    video.files = [mp4_file, flash_file, ogg_file]
    self.assertEqual(video.get_file(), ogg_file)
def test_parse_feed_entry_atom(self):
    """
    Parses the first entry of the ``generic/feed.atom`` fixture and
    compares the extracted video data against the expected values.
    """
    parsed_feed = feedparser.parse(
        self._data_file_path('generic/feed.atom'))
    data = self.feed.get_video_data(parsed_feed.entries[0])

    # The whitespace inside the description literal is part of the
    # expected value, so it is kept verbatim.
    expected = {
        'title': u'Atom 1.0',
        'description': u"""<h1>Show Notes</h1> <ul> <li>00:01:00 -- Introduction</li> <li>00:15:00 -- Talking about Atom 1.0</li> <li>00:30:00 -- Wrapping up</li> </ul>""",
        'tags': None,
        'link': u'http://www.example.org/entries/1',
        'guid': u'http://www.example.org/entries/1',
        'embed_code': None,
        'files': [
            VideoFile(url=u'http://www.example.org/myvideo.ogg',
                      length=u'1234',
                      mime_type=u'application/ogg'),
        ],
        'thumbnail_url': None,
        'publish_datetime': datetime.datetime(2005, 7, 15, 12, 0),
        'license': 'http://creativecommons.org/licenses/by/2.5/',
    }
    self.assertEqual(data, expected)
def parse_feed_entry(entry):
    """
    Turns one feedparser entry from a blip rss feed into a dictionary
    mapping :class:`.Video` fields to values.

    The same parser serves blip feeds and blip API requests, since the
    API responses can also be fetched as feeds.

    :param entry: a feedparser entry; it must provide the blip-specific
                  keys read below (``blip_puredescription``,
                  ``blip_datestamp``, ``blip_safeusername``,
                  ``blip_showpage``) as well as ``media_player`` and
                  ``tags``.
    :returns: a dict of video field values. ``license`` is included only
              when the entry has one.
    """
    files = []
    for enclosure in get_accepted_enclosures(entry):
        # Prefer the ``filesize`` value and fall back to ``length``.
        size = enclosure.get('filesize') or enclosure.get('length')
        files.append(VideoFile(url=enclosure.get('url'),
                               mime_type=enclosure.get('type'),
                               length=size))

    data = {}
    data['guid'] = entry['id']
    data['link'] = entry['link']
    data['title'] = entry['title']
    data['description'] = entry['blip_puredescription']
    data['files'] = files
    data['embed_code'] = entry['media_player']['content']
    data['publish_datetime'] = datetime.strptime(entry['blip_datestamp'],
                                                 "%Y-%m-%dT%H:%M:%SZ")
    data['thumbnail_url'] = get_entry_thumbnail_url(entry)

    # Keep only the terms of scheme-less tags, then drop the first one.
    plain_terms = [tag['term'] for tag in entry['tags']
                   if tag['scheme'] is None]
    data['tags'] = plain_terms[1:]

    data['user'] = entry['blip_safeusername']
    data['user_url'] = entry['blip_showpage']

    if 'license' in entry:
        data['license'] = entry['license']
    return data
def get_video_data(self, response):
    """
    Builds a dictionary of video data from a query-string encoded
    response body.

    :param response: an object exposing ``status_code`` and ``text``.
    :returns: ``{}`` for a 402 response or a failed status,
              ``{'is_embeddable': False}`` when the failure carries
              error code 150, and otherwise a dict with ``title``,
              ``thumbnail_url``, ``files`` and (when the response has
              keywords) ``tags``.
    """
    if response.status_code == 402:
        # 402: Payment required.
        # An older note said this could show up when too many requests
        # were made (per second?). It is unclear why, or why it is only
        # handled here.
        return {}

    params = urlparse.parse_qs(response.text.encode('utf-8'))
    if params['status'][0] == 'fail':
        if params['errorcode'][0] == '150':
            # Error code 150: the video cannot be embedded.
            return {'is_embeddable': False}
        return {}

    title = params['title'][0].decode('utf8')
    thumbnail_url = params['thumbnail_url'][0]
    if thumbnail_url.endswith('/default.jpg'):
        # Low-resolution thumbnail; switch to the high-quality variant.
        thumbnail_url = thumbnail_url.replace('/default.jpg',
                                              '/hqdefault.jpg')
    data = {'title': title, 'thumbnail_url': thumbnail_url}

    if 'keywords' in params:
        data['tags'] = params['keywords'][0].decode('utf8').split(',')

    # Each comma-separated entry of the stream map is itself a query
    # string; index the parsed entries by their itag value.
    streams_by_itag = {}
    for raw_stream in params["url_encoded_fmt_stream_map"][0].split(","):
        stream = urlparse.parse_qs(raw_stream)
        if 'itag' in stream:
            streams_by_itag[stream['itag'][0]] = stream

    files = []
    for code, mime_type, width, height in self.formats:
        if code not in streams_by_itag:
            continue
        stream = streams_by_itag[code]
        split_url = urlparse.urlsplit(stream['url'][0])
        query = dict(urlparse.parse_qsl(split_url.query))
        # The expiry is read from the stream URL's own ``expire`` value.
        expires = struct_time_to_datetime(
            time.gmtime(int(query['expire'])))
        # Add the entry's ``sig`` value to the URL as ``signature``.
        query['signature'] = stream['sig'][0]
        rebuilt_parts = (split_url[:3] +
                         (urllib.urlencode(query),) +
                         split_url[4:])
        url = urlparse.urlunsplit(rebuilt_parts)
        files.append(VideoFile(url=url,
                               expires=expires,
                               mime_type=mime_type,
                               width=width,
                               height=height))
    data['files'] = files
    return data
def get_video_data(self, item):
    """
    Extracts video data from a single feed item.

    :param item: a feed item exposing ``title``, ``description``,
                 ``media_thumbnail``, ``published_parsed`` and the
                 ``kaltura_userscreenname`` key.
    :returns: a dict of video field values; ``files`` is ``None`` when
              the item has no accepted enclosures.
    """
    files = []
    for enclosure in get_accepted_enclosures(item):
        # Prefer the ``filesize`` value and fall back to ``length``.
        size = enclosure.get('filesize') or enclosure.get('length')
        files.append(VideoFile(url=enclosure.get('url'),
                               mime_type=enclosure.get('type'),
                               length=size))

    data = {}
    data['title'] = item.title
    data['description'] = item.description
    data['thumbnail_url'] = item.media_thumbnail[0]['url']
    data['publish_datetime'] = struct_time_to_datetime(
        item.published_parsed)
    data['user'] = item['kaltura_userscreenname']
    # An empty file list is reported as None.
    data['files'] = files or None
    return data
from vidscraper.videos import VideoFile DISQUS_DATA = { 'guid': u'4809E60A-C2AB-11DF-BBAC-A6337D0214E0', 'link': "http://blip.tv/file/4135225", 'title': "Scaling the World's Largest Django Application", 'description': "Disqus, one of the largest Django applications in " "the world, will explain how they deal with scaling " "complexities in a small startup.", 'files': [ VideoFile(url=u'http://blip.tv/file/get/Robertlofthouse-' u'ScalingTheWorldsLargestDjangoApplication558.ogv', length=u'73533796', mime_type=u'video/ogg'), VideoFile(url=u'http://blip.tv/file/get/Robertlofthouse-' u'ScalingTheWorldsLargestDjangoApplication883.flv', length=u'418241604', mime_type=u'video/x-flv') ], 'embed_code': '<embed src="http://blip.tv/play/AYH9xikC" ' 'type="application/x-shockwave-flash" width="480" ' 'height="390" wmode="transparent" ' 'allowscriptaccess="always" allowfullscreen="true" >' '</embed>', 'publish_datetime': datetime.datetime(2010, 9, 17, 22, 31, 14), 'thumbnail_url':
def get_video_data(self, item):
    """
    Extracts video data from a generic feed item.

    :param item: a dict-like feed item (it is read through ``get``,
                 item access and the ``links`` attribute).
    :returns: a dict with the keys ``link``, ``title``, ``description``,
              ``thumbnail_url``, ``files``, ``publish_datetime``,
              ``guid``, ``embed_code``, ``tags`` and ``license``.
              ``files`` is ``None`` when nothing was found and ``tags``
              is ``None`` when the item has no tags.
    """
    # Use the published date when present, otherwise the updated date.
    best_date = None
    for date_key in ('published_parsed', 'updated_parsed'):
        if item.get(date_key):
            best_date = struct_time_to_datetime(item[date_key])
            break

    link = item.get('link')
    if 'links' in item:
        for candidate in item.links:
            if candidate.get('rel') == 'via':
                # A "via" link is the original URL.
                link = candidate['href']
                break

    # Atom entries carry their body in ``content``; otherwise fall back
    # to the summary.
    has_content_value = ('content' in item and item['content'] and
                         item['content'][0]['value'])
    if has_content_value:
        description = item['content'][0]['value']
    else:
        description = item.get('summary', '')

    files = []
    for enclosure in get_accepted_enclosures(item):
        # Prefer the ``filesize`` value and fall back to ``length``.
        size = enclosure.get('filesize') or enclosure.get('length')
        files.append(VideoFile(url=enclosure.get('url'),
                               mime_type=enclosure.get('type'),
                               length=size))

    embed_code = None
    if 'media_player' in item:
        player = item['media_player']
        if player.get('content'):
            embed_code = convert_entities(player['content'])
        elif 'url' in player:
            # No embed markup: record the player URL as a file instead.
            files.append(VideoFile(url=player['url'],
                                   mime_type=player.get('type')))

    if not files:
        files = None

    if 'media_license' in item:
        license_url = item['media_license']['href']
    else:
        license_url = item.get('license')

    result = {}
    result['link'] = link
    result['title'] = convert_entities(item.get('title', ''))
    result['description'] = description
    result['thumbnail_url'] = get_entry_thumbnail_url(item)
    result['files'] = files
    result['publish_datetime'] = best_date
    result['guid'] = item.get('id')
    result['embed_code'] = embed_code
    if 'tags' in item:
        # Only tags without a scheme are kept.
        result['tags'] = [tag['term'] for tag in item['tags']
                          if tag['scheme'] is None]
    else:
        result['tags'] = None
    result['license'] = license_url
    return result