def scrapevideo(video_url):
    """Scrape the url and fix the data.

    This is a thin wrapper around `vidscraper.auto_scrape`: it calls
    that, then transforms the result into a Python dict and adds some
    computed metadata.

    :arg video_url: Url of video to scrape.

    :returns: Python dict of metadata

    Example:

    >>> scrapevideo('http://www.youtube.com/watch?v=ywToByBkOTc')
    {'url': 'http://www.youtube.com/watch?v=ywToByBkOTc', ...}

    """
    video_data = vidscraper.auto_scrape(video_url)
    data = dict((field, getattr(video_data, field))
                for field in video_data.fields)

    # Datetimes aren't JSON-serializable, so convert them to ISO 8601
    # strings.
    for field in ('publish_datetime', 'file_url_expires'):
        dt = data.get(field, None)
        if isinstance(dt, datetime.datetime):
            data[field] = dt.isoformat()

    data['url'] = video_url

    # For YouTube videos, build embed codes from the video id at the
    # end of the guid.
    if 'youtube.com' in video_url and data.get('guid'):
        guid = data['guid'].split('/')[-1]
        data['object_embed_code'] = YOUTUBE_EMBED['object'].format(guid=guid)
        data['iframe_embed_code'] = YOUTUBE_EMBED['iframe'].format(guid=guid)

    return data
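The `YOUTUBE_EMBED` templates referenced above are not part of this snippet. A plausible sketch of their shape, assuming one Flash-era object embed and one iframe embed keyed by the YouTube video id; the exact markup and dimensions are assumptions, only the `{guid}` placeholder is confirmed by the `.format(guid=guid)` calls above:

# Assumed shape of YOUTUBE_EMBED; the real module's markup may differ.
YOUTUBE_EMBED = {
    'object': (
        '<object width="425" height="344">'
        '<param name="movie" value="http://www.youtube.com/v/{guid}">'
        '</param><embed src="http://www.youtube.com/v/{guid}"'
        ' type="application/x-shockwave-flash"'
        ' width="425" height="344"></embed></object>'),
    'iframe': (
        '<iframe width="425" height="344"'
        ' src="http://www.youtube.com/embed/{guid}"'
        ' frameborder="0" allowfullscreen></iframe>'),
}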
def test_auto_scrape(self):
    video = auto_scrape("http://www.youtube.com/watch?v=J_DV9b0x7v4")
    self.assertEqual(video.title,
                     u'CaramellDansen (Full Version + Lyrics)')
    self.assertNotEqual(video.file_url, None)
    self.assertEqual(video.file_url_mimetype, u'video/x-flv')
    self.assertTrue(
        video.file_url_expires - datetime.datetime.now() >
        datetime.timedelta(hours=1))
def test_auto_scrape(self):
    video = auto_scrape("http://www.youtube.com/watch?v=J_DV9b0x7v4")
    self.assertEqual(video.title,
                     u'CaramellDansen (Full Version + Lyrics)')
    self.assertGreater(len(video.files), 0)
    self.assertTrue(video.files[0].url)
    self.assertEqual(video.files[0].mime_type, u'video/mp4')
    self.assertTrue(
        video.files[0].expires - datetime.datetime.now() >
        datetime.timedelta(hours=1))
def handle_noargs(self, **options):
    if site_too_old():
        return
    # Backfill publish dates for videos that don't have one yet.
    for v in models.Video.objects.filter(when_published__isnull=True):
        try:
            d = vidscraper.auto_scrape(v.website_url,
                                       fields=["publish_date"])
        except Exception:
            # Scraping failed; leave the publish date unset.
            pass
        else:
            if d:
                v.when_published = d["publish_date"]
                v.save()
def handle_noargs(self, **options):
    if site_too_old():
        return
    for v in models.Video.objects.filter(when_published__isnull=True):
        try:
            video = vidscraper.auto_scrape(v.website_url,
                                           fields=['publish_datetime'],
                                           api_keys=API_KEYS)
        except Exception:
            # Scraping failed; leave the publish date unset.
            pass
        else:
            if video:
                v.when_published = video.publish_datetime
                v.save()
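For reference, a minimal standalone sketch of the `fields` restriction used in the commands above; the URL is illustrative, and restricting `fields` lets vidscraper skip loading metadata the caller does not need:

import vidscraper

# Only 'publish_datetime' is requested, so other metadata (files,
# thumbnails, and so on) need not be fetched.
video = vidscraper.auto_scrape(
    'http://www.youtube.com/watch?v=J_DV9b0x7v4',
    fields=['publish_datetime'])
print(video.publish_datetime)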
def clean_url(self):
    # Strip any fragment before checking uniqueness.
    url = urlparse.urldefrag(self.cleaned_data['url'])[0]
    self._validate_unique(url=url)
    self.video_cache = None
    try:
        self.video_cache = vidscraper.auto_scrape(url, api_keys=API_KEYS)
    except (UnhandledVideo, urllib2.URLError):
        pass
    else:
        if self.video_cache.link is not None and url != self.video_cache.link:
            # Prefer the canonical link reported by the scraper.
            url = self.video_cache.link
            self._validate_unique(url=url, guid=self.video_cache.guid)
        elif self.video_cache.guid is not None:
            self._validate_unique(guid=self.video_cache.guid)
    return url
def clean_url(self):
    url = urlparse.urldefrag(self.cleaned_data['url'])[0]
    self._validate_unique(url=url)
    self.video_cache = None
    try:
        self.video_cache = vidscraper.auto_scrape(url, api_keys=API_KEYS)
    except (CantIdentifyUrl, urllib2.URLError):
        pass
    else:
        if self.video_cache.link is not None and url != self.video_cache.link:
            url = self.video_cache.link
            self._validate_unique(url=url, guid=self.video_cache.guid)
        elif self.video_cache.guid is not None:
            self._validate_unique(guid=self.video_cache.guid)
    return url
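As a side note, `urlparse.urldefrag`, used in both versions of `clean_url` above, splits a URL into its fragment-free form and the fragment; the uniqueness checks then run on the fragment-free URL. A minimal Python 2 sketch with an illustrative URL:

import urlparse

url, fragment = urlparse.urldefrag(
    'http://www.youtube.com/watch?v=J_DV9b0x7v4#t=42')
# url == 'http://www.youtube.com/watch?v=J_DV9b0x7v4'
# fragment == 't=42'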
def handle_noargs(self, **options):
    if site_too_old():
        return
    for v in models.Video.objects.filter(when_published__isnull=True):
        try:
            video = vidscraper.auto_scrape(v.website_url,
                                           fields=['publish_datetime'],
                                           api_keys=API_KEYS)
        except Exception:
            pass
        else:
            if video:
                v.when_published = video.publish_datetime
                v.save()
    # Finally, at the end, if stamps are enabled, update them.
    if ENABLE_CHANGE_STAMPS:
        models.create_or_delete_video_needs_published_date_stamp()
def get_scraped_data(url):
    cache_key = 'vidscraper_data-' + url
    if len(cache_key) >= 250:
        # too long, use the hash
        cache_key = 'vidscraper_data-hash-' + hashlib.sha1(url).hexdigest()

    scraped_data = cache.get(cache_key)

    if not scraped_data:
        # try and scrape the url
        try:
            scraped_data = vidscraper.auto_scrape(url)
        except vidscraper.errors.Error:
            scraped_data = None

        cache.add(cache_key, scraped_data)

    return scraped_data
def handle_noargs(self, **options):
    if site_too_old():
        return
    for v in models.Video.objects.filter(when_published__isnull=True):
        try:
            d = vidscraper.auto_scrape(v.website_url,
                                       fields=['publish_date'])
        except Exception:
            pass
        else:
            if d:
                v.when_published = d['publish_date']
                v.save()
    # Finally, at the end, if stamps are enabled, update them.
    if models.ENABLE_CHANGE_STAMPS:
        models.create_or_delete_video_needs_published_date_stamp()
def get_vidscraper_video(url):
    cache_key = 'vidscraper_data-' + url
    if len(cache_key) >= 250:
        # too long, use the hash
        cache_key = 'vidscraper_data-hash-' + hashlib.sha1(url).hexdigest()

    vidscraper_video = cache.get(cache_key)

    if not vidscraper_video:
        # try and scrape the url
        try:
            vidscraper_video = vidscraper.auto_scrape(url)
        except vidscraper.errors.Error:
            vidscraper_video = None

        cache.add(cache_key, vidscraper_video)

    return vidscraper_video
def get_vidscraper_video(url):
    cache_key = 'vidscraper_data-' + url
    if len(cache_key) >= 250:
        # too long, use the hash
        cache_key = 'vidscraper_data-hash-' + hashlib.sha1(url).hexdigest()

    vidscraper_video = cache.get(cache_key)

    if not vidscraper_video:
        # try and scrape the url
        try:
            vidscraper_video = vidscraper.auto_scrape(url, api_keys=API_KEYS)
        except (vidscraper.errors.Error, urllib2.URLError):
            vidscraper_video = None

        cache.add(cache_key, vidscraper_video)

    return vidscraper_video
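The hashing fallback in these helpers works around memcached's 250-character key limit. A minimal sketch of the same logic in isolation; note this is Python 2 code, and under Python 3 the URL would need to be encoded to bytes before hashing:

import hashlib

def make_cache_key(url):
    cache_key = 'vidscraper_data-' + url
    if len(cache_key) >= 250:
        # Too long for memcached; fall back to a fixed-length SHA-1
        # digest of the URL.
        cache_key = 'vidscraper_data-hash-' + hashlib.sha1(url).hexdigest()
    return cache_key

make_cache_key('http://example.com/' + 'v' * 300)  # returns the hashed form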
def save(self, commit=True, request=None):
    kwargs = {
        'video': vidscraper.auto_scrape(self.cleaned_data['original_url']),
        'commit': False,
    }
    if request and request.user.is_authenticated():
        kwargs['owner'] = request.user
    instance = Video.from_vidscraper_video(**kwargs)

    def save_m2m():
        instance.save_m2m()

    if commit:
        instance.save()
        save_m2m()
    else:
        # Mirror Django's ModelForm contract: with commit=False, the
        # caller is responsible for calling save_m2m() later.
        self.save_m2m = save_m2m
    return instance
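A hypothetical call site for this save() override from a Django view; the form class name VideoForm is an assumption, since the snippet does not show it:

def submit_video(request):
    form = VideoForm(request.POST)  # hypothetical form class name
    if form.is_valid():
        # Passing the request lets save() record the logged-in user
        # as the video's owner.
        video = form.save(request=request)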
def get_data_from_youtube(url):
    video = vidscraper.auto_scrape(url)
    return {
        'thumbnail_url': video['thumbnail_url'],
        'embed': video['embed'],
    }
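A hypothetical caller for get_data_from_youtube, pulling the two scraped fields for display; the URL is illustrative. Note that this snippet indexes the scrape result like a dict, while the snippets above access attributes on a Video object, so this code targets a different (older) vidscraper API:

data = get_data_from_youtube('http://www.youtube.com/watch?v=J_DV9b0x7v4')
thumbnail_url = data['thumbnail_url']
embed_html = data['embed']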