def scrape(self):
    """Scrape a thumbnail and preview object for this link.

    Returns a 4-tuple (thumbnail, preview_object, media_object,
    secure_media_object); the media-object slots are always None for
    this scraper.
    """
    found_url, raw_bytes = self._find_thumbnail_image()
    if not found_url:
        return None, None, None, None

    # When isolated from the context of a webpage, protocol-relative
    # URLs are ambiguous, so pin them to this scraper's protocol.
    if found_url.startswith('//'):
        found_url = coerce_url_to_protocol(found_url, self.protocol)

    # The finder may have already downloaded the image bytes for us as
    # an optimization; only hit the network if it didn't.
    if not raw_bytes:
        _, raw_bytes = _fetch_url(found_url, referer=self.url)
    if not raw_bytes:
        return None, None, None, None

    uid = _filename_from_content(raw_bytes)
    image = str_to_image(raw_bytes)
    stored_at = upload_media(image, category='previews')
    width, height = image.size
    preview_object = dict(
        uid=uid,
        url=stored_at,
        width=width,
        height=height,
    )
    return _prepare_image(image), preview_object, None, None
def _find_thumbnail_image(self):
    """Find what we think is the best thumbnail image URL for a link.

    Returns the image URL, or None if no suitable image was found.
    """
    content_type, content = _fetch_url(self.url)

    # if it's an image. it's pretty easy to guess what we should thumbnail.
    if content_type and "image" in content_type and content:
        return self.url

    if content_type and "html" in content_type and content:
        soup = BeautifulSoup.BeautifulSoup(content)
    else:
        return None

    # allow the content author to specify the thumbnail:
    # <meta property="og:image" content="http://...">
    og_image = (soup.find('meta', property='og:image') or
                soup.find('meta', attrs={'name': 'og:image'}))
    # BUG FIX: use .get() -- subscripting (og_image['content']) raises
    # KeyError on a malformed tag that lacks the attribute.  The sibling
    # 2-tuple implementation in this file already does this.
    if og_image and og_image.get('content'):
        return og_image['content']

    # <link rel="image_src" href="http://...">
    thumbnail_spec = soup.find('link', rel='image_src')
    if thumbnail_spec and thumbnail_spec.get('href'):
        return thumbnail_spec['href']

    # ok, we have no guidance from the author. look for the largest
    # image on the page with a few caveats. (see below)
    max_area = 0
    max_url = None
    for image_url in self._extract_image_urls(soup):
        # When isolated from the context of a webpage, protocol-relative
        # URLs are ambiguous, so let's absolutify them now.
        if image_url.startswith('//'):
            image_url = coerce_url_to_protocol(image_url, self.protocol)

        size = _fetch_image_size(image_url, referer=self.url)
        if not size:
            continue
        area = size[0] * size[1]

        # ignore little images
        if area < 5000:
            g.log.debug('ignore little %s' % image_url)
            continue

        # ignore excessively long/wide images
        # NOTE(review): under Python 2 this is integer division for int
        # sizes, so the effective cutoff is ratio >= 2 -- confirm intent
        # before "fixing" to float division.
        if max(size) / min(size) > 1.5:
            g.log.debug('ignore dimensions %s' % image_url)
            continue

        # penalize images with "sprite" in their name
        if 'sprite' in image_url.lower():
            g.log.debug('penalizing sprite %s' % image_url)
            area /= 10

        if area > max_area:
            max_area = area
            max_url = image_url
    return max_url
def scrape(self):
    """Scrape a thumbnail for this link.

    Returns a 3-tuple (thumbnail, media_object, secure_media_object);
    the media-object slots are always None here.
    """
    candidate = self._find_thumbnail_image()
    # A protocol-relative URL ("//host/...") is ambiguous outside of a
    # page context; anchor it to this scraper's protocol first.
    if candidate and candidate.startswith("//"):
        candidate = coerce_url_to_protocol(candidate, self.protocol)
    return _make_thumbnail_from_url(candidate, referer=self.url), None, None
def scrape(self):
    """Return (thumbnail, media_object, secure_media_object) for this link."""
    url = self._find_thumbnail_image()
    if url is not None:
        # Protocol-relative URLs are ambiguous when detached from their
        # page; force them onto our configured protocol.
        if url.startswith('//'):
            url = coerce_url_to_protocol(url, self.protocol)
    thumb = _make_thumbnail_from_url(url, referer=self.url)
    return thumb, None, None
def scrape(self):
    """Scrape thumbnail, preview, and embed media object for a Vimeo link.

    Returns a 4-tuple (thumbnail, preview_object, media_object,
    secure_media_object); the same media object fills both slots.
    """
    thumbnail_url, image_data = self._find_thumbnail_image()
    if not thumbnail_url:
        return None, None, None, None

    # Protocol-relative URLs are ambiguous out of page context; pin them
    # to this scraper's protocol before fetching.
    if thumbnail_url.startswith('//'):
        thumbnail_url = coerce_url_to_protocol(thumbnail_url, self.protocol)

    # The finder may already have the bytes; only fetch if it doesn't.
    if not image_data:
        _, image_data = _fetch_url(thumbnail_url, referer=self.url)
    if not image_data:
        return None, None, None, None

    uid = _filename_from_content(image_data)
    image = str_to_image(image_data)
    storage_url = upload_media(image, category='previews')
    width, height = image.size
    preview_object = {
        'uid': uid,
        'url': storage_url,
        'width': width,
        'height': height,
    }
    thumbnail = _prepare_image(image)

    # Rewrite self.url to the embeddable player URL.
    # NOTE(review): presumably group(5) is the video id and group(6) an
    # optional suffix (e.g. unlisted hash) -- confirm against URL_MATCH.
    # This mutates self.url in place, which later code relies on.
    match = self.URL_MATCH.match(self.url)
    if match and match.group(5):
        self.url = 'https://player.vimeo.com/video/' + match.group(5)
        if match.group(6):
            self.url += match.group(6)

    oembed = {
        'html': '<iframe width="640" height="360" style="max-width: 100%;" src="' + self.url + '" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>',
        'width': 640,
        'height': 360,
        'thumbnail_url': thumbnail_url
    }
    media_object = self._make_media_object(oembed)
    return (
        thumbnail,
        preview_object,
        media_object,
        media_object,
    )
def scrape(self):
    """Scrape thumbnail, preview, and embed media object for an Imgur link.

    Returns a 4-tuple (thumbnail, preview_object, media_object,
    secure_media_object); the same media object fills both slots.
    """
    thumbnail_url, image_data = self._find_thumbnail_image()
    if not thumbnail_url:
        return None, None, None, None

    # Protocol-relative URLs are ambiguous out of page context; pin them
    # to this scraper's protocol before fetching.
    if thumbnail_url.startswith('//'):
        thumbnail_url = coerce_url_to_protocol(thumbnail_url, self.protocol)

    # The finder may already have the bytes; only fetch if it doesn't.
    if not image_data:
        _, image_data = _fetch_url(thumbnail_url, referer=self.url)
    if not image_data:
        return None, None, None, None

    uid = _filename_from_content(image_data)
    image = str_to_image(image_data)
    storage_url = upload_media(image, category='previews')
    width, height = image.size
    preview_object = {
        'uid': uid,
        'url': storage_url,
        'width': width,
        'height': height,
    }
    thumbnail = _prepare_image(image)

    # NOTE(review): self.url is replaced by match.group(3) -- presumably
    # the bare album/image id consumed by the blockquote below; confirm
    # against URL_MATCH.  This mutates self.url in place.
    match = self.URL_MATCH.match(self.url)
    if match and match.group(3):
        self.url = match.group(3)

    oembed = {
        'html': '<blockquote class="imgur-embed-pub" lang="en" data-id="a/' + self.url + '"><a href="//imgur.com/' + self.url + '"></a></blockquote><script async src="//s.imgur.com/min/embed.js" charset="utf-8"></script>',
        'thumbnail_url': thumbnail_url
    }
    media_object = self._make_media_object(oembed)
    return (
        thumbnail,
        preview_object,
        media_object,
        media_object,
    )
def scrape(self):
    """Scrape thumbnail, preview, and embed for a PeerTube-style video page.

    Returns a 4-tuple (thumbnail, preview_object, media_object,
    secure_media_object); the same media object is used for both the
    plain and secure slots.
    """
    found_url, raw_bytes = self._find_thumbnail_image()
    if not found_url:
        return None, None, None, None

    # Protocol-relative URLs are ambiguous outside a page context;
    # absolutify them before fetching.
    if found_url.startswith('//'):
        found_url = coerce_url_to_protocol(found_url, self.protocol)

    # The finder may have handed us the image bytes already; otherwise
    # download them now.
    if not raw_bytes:
        _, raw_bytes = _fetch_url(found_url, referer=self.url)
    if not raw_bytes:
        return None, None, None, None

    uid = _filename_from_content(raw_bytes)
    image = str_to_image(raw_bytes)
    stored_at = upload_media(image, category='previews')
    preview_object = dict(
        uid=uid,
        url=stored_at,
        width=image.size[0],
        height=image.size[1],
    )
    thumb = _prepare_image(image)

    # Swap the watch-page path for the embeddable player path (mutates
    # self.url in place, matching the original behavior).
    self.url = self.url.replace('/videos/watch/', '/videos/embed/')
    embed_html = ('<iframe width="560" height="315" sandbox="allow-same-origin allow-scripts" src="'
                  + self.url +
                  '" frameborder="0" allowfullscreen style="max-width: 100%;"></iframe>')
    media_object = self._make_media_object({
        'html': embed_html,
        'width': 560,
        'height': 315,
        'thumbnail_url': found_url,
    })
    return (
        thumb,
        preview_object,
        media_object,
        media_object,
    )
def test_coerce_url_to_protocol(self):
    """coerce_url_to_protocol rewrites http/https/protocol-relative URLs
    to the requested protocol (defaulting to http)."""
    # assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(
        utils.coerce_url_to_protocol('http://example.com/foo'),
        'http://example.com/foo')
    self.assertEqual(
        utils.coerce_url_to_protocol('https://example.com/foo'),
        'http://example.com/foo')
    self.assertEqual(
        utils.coerce_url_to_protocol('//example.com/foo'),
        'http://example.com/foo')
    self.assertEqual(
        utils.coerce_url_to_protocol('http://example.com/foo', 'https'),
        'https://example.com/foo')
    self.assertEqual(
        utils.coerce_url_to_protocol('https://example.com/foo', 'https'),
        'https://example.com/foo')
    self.assertEqual(
        utils.coerce_url_to_protocol('//example.com/foo', 'https'),
        'https://example.com/foo')
def test_coerce_url_to_protocol(self):
    """coerce_url_to_protocol rewrites http/https/protocol-relative URLs
    to the requested protocol (defaulting to http)."""
    # assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(
        utils.coerce_url_to_protocol('http://example.com/foo'),
        'http://example.com/foo')
    self.assertEqual(
        utils.coerce_url_to_protocol('https://example.com/foo'),
        'http://example.com/foo')
    self.assertEqual(
        utils.coerce_url_to_protocol('//example.com/foo'),
        'http://example.com/foo')
    self.assertEqual(
        utils.coerce_url_to_protocol('http://example.com/foo', 'https'),
        'https://example.com/foo')
    self.assertEqual(
        utils.coerce_url_to_protocol('https://example.com/foo', 'https'),
        'https://example.com/foo')
    self.assertEqual(
        utils.coerce_url_to_protocol('//example.com/foo', 'https'),
        'https://example.com/foo')
def _find_thumbnail_image(self):
    """Find what we think is the best thumbnail image for a link.

    Returns a 2-tuple of image url and, as an optimization, the raw
    image data.  A value of None for the former means we couldn't find
    an image; None for the latter just means we haven't already fetched
    the image.
    """
    content_type, content = _fetch_url(self.url)

    # if it's an image, it's pretty easy to guess what we should thumbnail.
    if content_type and "image" in content_type and content:
        return self.url, content

    if content_type and "html" in content_type and content:
        soup = BeautifulSoup.BeautifulSoup(content)
    else:
        return None, None

    # Allow the content author to specify the thumbnail using the Open
    # Graph protocol: http://ogp.me/
    # (.get() avoids KeyError on a tag missing the content attribute.)
    og_image = (soup.find('meta', property='og:image') or
                soup.find('meta', attrs={'name': 'og:image'}))
    if og_image and og_image.get('content'):
        return og_image['content'], None
    # Fall back to the og:image:url structured property.
    og_image = (soup.find('meta', property='og:image:url') or
                soup.find('meta', attrs={'name': 'og:image:url'}))
    if og_image and og_image.get('content'):
        return og_image['content'], None

    # <link rel="image_src" href="http://...">
    # NOTE(review): unlike the og:image checks above, this subscripts
    # directly -- a <link rel="image_src"> with no href would raise
    # KeyError; consider .get('href').
    thumbnail_spec = soup.find('link', rel='image_src')
    if thumbnail_spec and thumbnail_spec['href']:
        return thumbnail_spec['href'], None

    # ok, we have no guidance from the author. look for the largest
    # image on the page with a few caveats. (see below)
    max_area = 0
    max_url = None
    for image_url in self._extract_image_urls(soup):
        # When isolated from the context of a webpage, protocol-relative
        # URLs are ambiguous, so let's absolutify them now.
        if image_url.startswith('//'):
            image_url = coerce_url_to_protocol(image_url, self.protocol)

        size = _fetch_image_size(image_url, referer=self.url)
        if not size:
            continue
        area = size[0] * size[1]

        # ignore little images
        if area < 5000:
            g.log.debug('ignore little %s' % image_url)
            continue

        # ignore excessively long/wide images
        # NOTE(review): under Python 2 this is integer division for int
        # sizes, so the effective cutoff is ratio >= 2 -- confirm intent.
        if max(size) / min(size) > 1.5:
            g.log.debug('ignore dimensions %s' % image_url)
            continue

        # penalize images with "sprite" in their name
        if 'sprite' in image_url.lower():
            g.log.debug('penalizing sprite %s' % image_url)
            area /= 10

        if area > max_area:
            max_area = area
            max_url = image_url
    return max_url, None
def _find_thumbnail_image(self):
    """Find what we think is the best thumbnail image for a link.

    Returns a 2-tuple of image url and, as an optimization, the raw
    image data.  A value of None for the former means we couldn't find
    an image; None for the latter just means we haven't already fetched
    the image.
    """
    content_type, content = _fetch_url(self.url)

    # if it's an image, it's pretty easy to guess what we should thumbnail.
    if content_type and "image" in content_type and content:
        return self.url, content

    if content_type and "html" in content_type and content:
        soup = BeautifulSoup.BeautifulSoup(content)
    else:
        return None, None

    # Allow the content author to specify the thumbnail using the Open
    # Graph protocol: http://ogp.me/
    # BUG FIX: use .get() -- subscripting (og_image['content']) raises
    # KeyError on a tag missing the attribute.  Matches the sibling
    # implementation in this file.
    og_image = (soup.find('meta', property='og:image') or
                soup.find('meta', attrs={'name': 'og:image'}))
    if og_image and og_image.get('content'):
        return og_image['content'], None
    og_image = (soup.find('meta', property='og:image:url') or
                soup.find('meta', attrs={'name': 'og:image:url'}))
    if og_image and og_image.get('content'):
        return og_image['content'], None

    # <link rel="image_src" href="http://...">
    thumbnail_spec = soup.find('link', rel='image_src')
    if thumbnail_spec and thumbnail_spec.get('href'):
        return thumbnail_spec['href'], None

    # ok, we have no guidance from the author. look for the largest
    # image on the page with a few caveats. (see below)
    max_area = 0
    max_url = None
    for image_url in self._extract_image_urls(soup):
        # When isolated from the context of a webpage, protocol-relative
        # URLs are ambiguous, so let's absolutify them now.
        if image_url.startswith('//'):
            image_url = coerce_url_to_protocol(image_url, self.protocol)

        size = _fetch_image_size(image_url, referer=self.url)
        if not size:
            continue
        area = size[0] * size[1]

        # ignore little images
        if area < 5000:
            g.log.debug('ignore little %s' % image_url)
            continue

        # ignore excessively long/wide images
        if max(size) / min(size) > 1.5:
            g.log.debug('ignore dimensions %s' % image_url)
            continue

        # penalize images with "sprite" in their name
        if 'sprite' in image_url.lower():
            g.log.debug('penalizing sprite %s' % image_url)
            area /= 10

        if area > max_area:
            max_area = area
            max_url = image_url
    return max_url, None